path: root/third_party/aom/av1/encoder
author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
commit     26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree       f435a8308119effd964b339f76abb83a57c29483  /third_party/aom/av1/encoder
parent     Initial commit. (diff)
download   firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
           firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1. upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/aom/av1/encoder')
-rw-r--r--  third_party/aom/av1/encoder/allintra_vis.c  1055
-rw-r--r--  third_party/aom/av1/encoder/allintra_vis.h  46
-rw-r--r--  third_party/aom/av1/encoder/aq_complexity.c  175
-rw-r--r--  third_party/aom/av1/encoder/aq_complexity.h  37
-rw-r--r--  third_party/aom/av1/encoder/aq_cyclicrefresh.c  657
-rw-r--r--  third_party/aom/av1/encoder/aq_cyclicrefresh.h  332
-rw-r--r--  third_party/aom/av1/encoder/aq_variance.c  220
-rw-r--r--  third_party/aom/av1/encoder/aq_variance.h  35
-rw-r--r--  third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c  61
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/av1_error_neon.c  95
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/av1_error_sve.c  109
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c  3090
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c  146
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c  115
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c  360
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/cnn_neon.c  1144
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c  646
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c  2619
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c  1207
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c  49
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c  562
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c  73
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/ml_neon.c  339
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/pickrst_neon.c  1217
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/pickrst_neon.h  188
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/quantize_neon.c  928
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/rdopt_neon.c  459
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c  288
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/shift_neon.h  49
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c  548
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c  299
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/txfm_neon.h  26
-rw-r--r--  third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c  131
-rw-r--r--  third_party/aom/av1/encoder/av1_fwd_txfm1d.c  1885
-rw-r--r--  third_party/aom/av1/encoder/av1_fwd_txfm1d.h  49
-rw-r--r--  third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h  19
-rw-r--r--  third_party/aom/av1/encoder/av1_fwd_txfm2d.c  423
-rw-r--r--  third_party/aom/av1/encoder/av1_ml_partition_models.h  179
-rw-r--r--  third_party/aom/av1/encoder/av1_noise_estimate.c  296
-rw-r--r--  third_party/aom/av1/encoder/av1_noise_estimate.h  50
-rw-r--r--  third_party/aom/av1/encoder/av1_quantize.c  917
-rw-r--r--  third_party/aom/av1/encoder/av1_quantize.h  224
-rw-r--r--  third_party/aom/av1/encoder/av1_temporal_denoiser.c  805
-rw-r--r--  third_party/aom/av1/encoder/av1_temporal_denoiser.h  134
-rw-r--r--  third_party/aom/av1/encoder/bitstream.c  4248
-rw-r--r--  third_party/aom/av1/encoder/bitstream.h  137
-rw-r--r--  third_party/aom/av1/encoder/block.h  1515
-rw-r--r--  third_party/aom/av1/encoder/blockiness.c  140
-rw-r--r--  third_party/aom/av1/encoder/cnn.c  1189
-rw-r--r--  third_party/aom/av1/encoder/cnn.h  191
-rw-r--r--  third_party/aom/av1/encoder/compound_type.c  1678
-rw-r--r--  third_party/aom/av1/encoder/compound_type.h  52
-rw-r--r--  third_party/aom/av1/encoder/context_tree.c  311
-rw-r--r--  third_party/aom/av1/encoder/context_tree.h  142
-rw-r--r--  third_party/aom/av1/encoder/cost.c  46
-rw-r--r--  third_party/aom/av1/encoder/cost.h  51
-rw-r--r--  third_party/aom/av1/encoder/deltaq4_model.c  7776
-rw-r--r--  third_party/aom/av1/encoder/dwt.c  146
-rw-r--r--  third_party/aom/av1/encoder/dwt.h  27
-rw-r--r--  third_party/aom/av1/encoder/enc_enums.h  268
-rw-r--r--  third_party/aom/av1/encoder/encode_strategy.c  1767
-rw-r--r--  third_party/aom/av1/encoder/encode_strategy.h  138
-rw-r--r--  third_party/aom/av1/encoder/encodeframe.c  2408
-rw-r--r--  third_party/aom/av1/encoder/encodeframe.h  55
-rw-r--r--  third_party/aom/av1/encoder/encodeframe_utils.c  1775
-rw-r--r--  third_party/aom/av1/encoder/encodeframe_utils.h  595
-rw-r--r--  third_party/aom/av1/encoder/encodemb.c  866
-rw-r--r--  third_party/aom/av1/encoder/encodemb.h  180
-rw-r--r--  third_party/aom/av1/encoder/encodemv.c  345
-rw-r--r--  third_party/aom/av1/encoder/encodemv.h  110
-rw-r--r--  third_party/aom/av1/encoder/encoder.c  5409
-rw-r--r--  third_party/aom/av1/encoder/encoder.h  4512
-rw-r--r--  third_party/aom/av1/encoder/encoder_alloc.h  531
-rw-r--r--  third_party/aom/av1/encoder/encoder_utils.c  1503
-rw-r--r--  third_party/aom/av1/encoder/encoder_utils.h  1141
-rw-r--r--  third_party/aom/av1/encoder/encodetxb.c  886
-rw-r--r--  third_party/aom/av1/encoder/encodetxb.h  276
-rw-r--r--  third_party/aom/av1/encoder/ethread.c  3469
-rw-r--r--  third_party/aom/av1/encoder/ethread.h  133
-rw-r--r--  third_party/aom/av1/encoder/extend.c  163
-rw-r--r--  third_party/aom/av1/encoder/extend.h  29
-rw-r--r--  third_party/aom/av1/encoder/external_partition.c  98
-rw-r--r--  third_party/aom/av1/encoder/external_partition.h  58
-rw-r--r--  third_party/aom/av1/encoder/firstpass.c  1600
-rw-r--r--  third_party/aom/av1/encoder/firstpass.h  603
-rw-r--r--  third_party/aom/av1/encoder/global_motion.c  575
-rw-r--r--  third_party/aom/av1/encoder/global_motion.h  157
-rw-r--r--  third_party/aom/av1/encoder/global_motion_facade.c  450
-rw-r--r--  third_party/aom/av1/encoder/global_motion_facade.h  58
-rw-r--r--  third_party/aom/av1/encoder/gop_structure.c  867
-rw-r--r--  third_party/aom/av1/encoder/gop_structure.h  95
-rw-r--r--  third_party/aom/av1/encoder/grain_test_vectors.h  781
-rw-r--r--  third_party/aom/av1/encoder/hash.c  126
-rw-r--r--  third_party/aom/av1/encoder/hash.h  53
-rw-r--r--  third_party/aom/av1/encoder/hash_motion.c  503
-rw-r--r--  third_party/aom/av1/encoder/hash_motion.h  103
-rw-r--r--  third_party/aom/av1/encoder/hybrid_fwd_txfm.c  370
-rw-r--r--  third_party/aom/av1/encoder/hybrid_fwd_txfm.h  40
-rw-r--r--  third_party/aom/av1/encoder/interp_search.c  801
-rw-r--r--  third_party/aom/av1/encoder/interp_search.h  205
-rw-r--r--  third_party/aom/av1/encoder/intra_mode_search.c  1739
-rw-r--r--  third_party/aom/av1/encoder/intra_mode_search.h  329
-rw-r--r--  third_party/aom/av1/encoder/intra_mode_search_utils.h  690
-rw-r--r--  third_party/aom/av1/encoder/k_means_template.h  151
-rw-r--r--  third_party/aom/av1/encoder/level.c  1397
-rw-r--r--  third_party/aom/av1/encoder/level.h  221
-rw-r--r--  third_party/aom/av1/encoder/lookahead.c  222
-rw-r--r--  third_party/aom/av1/encoder/lookahead.h  138
-rw-r--r--  third_party/aom/av1/encoder/mcomp.c  3998
-rw-r--r--  third_party/aom/av1/encoder/mcomp.h  398
-rw-r--r--  third_party/aom/av1/encoder/mcomp_structs.h  109
-rw-r--r--  third_party/aom/av1/encoder/misc_model_weights.h  696
-rw-r--r--  third_party/aom/av1/encoder/ml.c  171
-rw-r--r--  third_party/aom/av1/encoder/ml.h  85
-rw-r--r--  third_party/aom/av1/encoder/mode_prune_model_weights.h  185
-rw-r--r--  third_party/aom/av1/encoder/model_rd.h  270
-rw-r--r--  third_party/aom/av1/encoder/motion_search_facade.c  1071
-rw-r--r--  third_party/aom/av1/encoder/motion_search_facade.h  145
-rw-r--r--  third_party/aom/av1/encoder/mv_prec.c  429
-rw-r--r--  third_party/aom/av1/encoder/mv_prec.h  52
-rw-r--r--  third_party/aom/av1/encoder/nonrd_opt.c  933
-rw-r--r--  third_party/aom/av1/encoder/nonrd_opt.h  575
-rw-r--r--  third_party/aom/av1/encoder/nonrd_pickmode.c  3537
-rw-r--r--  third_party/aom/av1/encoder/optical_flow.c  1113
-rw-r--r--  third_party/aom/av1/encoder/optical_flow.h  76
-rw-r--r--  third_party/aom/av1/encoder/palette.c  975
-rw-r--r--  third_party/aom/av1/encoder/palette.h  215
-rw-r--r--  third_party/aom/av1/encoder/partition_cnn_weights.h  2139
-rw-r--r--  third_party/aom/av1/encoder/partition_model_weights.h  5646
-rw-r--r--  third_party/aom/av1/encoder/partition_search.c  6263
-rw-r--r--  third_party/aom/av1/encoder/partition_search.h  81
-rw-r--r--  third_party/aom/av1/encoder/partition_strategy.c  2573
-rw-r--r--  third_party/aom/av1/encoder/partition_strategy.h  265
-rw-r--r--  third_party/aom/av1/encoder/pass2_strategy.c  4488
-rw-r--r--  third_party/aom/av1/encoder/pass2_strategy.h  149
-rw-r--r--  third_party/aom/av1/encoder/pickcdef.c  958
-rw-r--r--  third_party/aom/av1/encoder/pickcdef.h  261
-rw-r--r--  third_party/aom/av1/encoder/picklpf.c  339
-rw-r--r--  third_party/aom/av1/encoder/picklpf.h  165
-rw-r--r--  third_party/aom/av1/encoder/pickrst.c  2217
-rw-r--r--  third_party/aom/av1/encoder/pickrst.h  126
-rw-r--r--  third_party/aom/av1/encoder/pustats.h  198
-rw-r--r--  third_party/aom/av1/encoder/random.h  85
-rw-r--r--  third_party/aom/av1/encoder/ratectrl.c  3587
-rw-r--r--  third_party/aom/av1/encoder/ratectrl.h  864
-rw-r--r--  third_party/aom/av1/encoder/rc_utils.h  469
-rw-r--r--  third_party/aom/av1/encoder/rd.c  1580
-rw-r--r--  third_party/aom/av1/encoder/rd.h  390
-rw-r--r--  third_party/aom/av1/encoder/rdopt.c  6598
-rw-r--r--  third_party/aom/av1/encoder/rdopt.h  327
-rw-r--r--  third_party/aom/av1/encoder/rdopt_data_defs.h  294
-rw-r--r--  third_party/aom/av1/encoder/rdopt_utils.h  797
-rw-r--r--  third_party/aom/av1/encoder/reconinter_enc.c  701
-rw-r--r--  third_party/aom/av1/encoder/reconinter_enc.h  94
-rw-r--r--  third_party/aom/av1/encoder/saliency_map.c  1414
-rw-r--r--  third_party/aom/av1/encoder/saliency_map.h  28
-rw-r--r--  third_party/aom/av1/encoder/segmentation.c  54
-rw-r--r--  third_party/aom/av1/encoder/segmentation.h  38
-rw-r--r--  third_party/aom/av1/encoder/sorting_network.h  140
-rw-r--r--  third_party/aom/av1/encoder/sparse_linear_solver.c  472
-rw-r--r--  third_party/aom/av1/encoder/sparse_linear_solver.h  67
-rw-r--r--  third_party/aom/av1/encoder/speed_features.c  2715
-rw-r--r--  third_party/aom/av1/encoder/speed_features.h  2025
-rw-r--r--  third_party/aom/av1/encoder/superres_scale.c  423
-rw-r--r--  third_party/aom/av1/encoder/superres_scale.h  28
-rw-r--r--  third_party/aom/av1/encoder/svc_layercontext.c  701
-rw-r--r--  third_party/aom/av1/encoder/svc_layercontext.h  325
-rw-r--r--  third_party/aom/av1/encoder/temporal_filter.c  1520
-rw-r--r--  third_party/aom/av1/encoder/temporal_filter.h  458
-rw-r--r--  third_party/aom/av1/encoder/thirdpass.c  877
-rw-r--r--  third_party/aom/av1/encoder/thirdpass.h  197
-rw-r--r--  third_party/aom/av1/encoder/tokenize.c  396
-rw-r--r--  third_party/aom/av1/encoder/tokenize.h  159
-rw-r--r--  third_party/aom/av1/encoder/tpl_model.c  2511
-rw-r--r--  third_party/aom/av1/encoder/tpl_model.h  794
-rw-r--r--  third_party/aom/av1/encoder/tune_butteraugli.c  313
-rw-r--r--  third_party/aom/av1/encoder/tune_butteraugli.h  45
-rw-r--r--  third_party/aom/av1/encoder/tune_vmaf.c  1112
-rw-r--r--  third_party/aom/av1/encoder/tune_vmaf.h  63
-rw-r--r--  third_party/aom/av1/encoder/tx_prune_model_weights.h  3422
-rw-r--r--  third_party/aom/av1/encoder/tx_search.c  3830
-rw-r--r--  third_party/aom/av1/encoder/tx_search.h  226
-rw-r--r--  third_party/aom/av1/encoder/txb_rdopt.c  659
-rw-r--r--  third_party/aom/av1/encoder/txb_rdopt.h  160
-rw-r--r--  third_party/aom/av1/encoder/txb_rdopt_utils.h  236
-rw-r--r--  third_party/aom/av1/encoder/var_based_part.c  1914
-rw-r--r--  third_party/aom/av1/encoder/var_based_part.h  104
-rw-r--r--  third_party/aom/av1/encoder/wedge_utils.c  125
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c  1409
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c  3010
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c  336
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h  96
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c  2673
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h  253
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c  137
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c  195
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_k_means_avx2.c  132
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_k_means_sse2.c  124
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_quantize_avx2.c  414
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_quantize_sse2.c  289
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm  204
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm  222
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c  328
-rw-r--r--  third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h  144
-rw-r--r--  third_party/aom/av1/encoder/x86/cnn_avx2.c  532
-rw-r--r--  third_party/aom/av1/encoder/x86/dct_sse2.asm  82
-rw-r--r--  third_party/aom/av1/encoder/x86/encodetxb_avx2.c  122
-rw-r--r--  third_party/aom/av1/encoder/x86/encodetxb_sse2.c  505
-rw-r--r--  third_party/aom/av1/encoder/x86/encodetxb_sse4.c  84
-rw-r--r--  third_party/aom/av1/encoder/x86/error_intrin_avx2.c  210
-rw-r--r--  third_party/aom/av1/encoder/x86/error_intrin_sse2.c  75
-rw-r--r--  third_party/aom/av1/encoder/x86/error_sse2.asm  88
-rw-r--r--  third_party/aom/av1/encoder/x86/hash_sse42.c  53
-rw-r--r--  third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c  64
-rw-r--r--  third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c  74
-rw-r--r--  third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c  3132
-rw-r--r--  third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c  2629
-rw-r--r--  third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c  466
-rw-r--r--  third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c  341
-rw-r--r--  third_party/aom/av1/encoder/x86/ml_avx2.c  240
-rw-r--r--  third_party/aom/av1/encoder/x86/ml_sse3.c  336
-rw-r--r--  third_party/aom/av1/encoder/x86/ml_sse3.h  29
-rw-r--r--  third_party/aom/av1/encoder/x86/pickrst_avx2.c  2348
-rw-r--r--  third_party/aom/av1/encoder/x86/pickrst_sse4.c  1483
-rw-r--r--  third_party/aom/av1/encoder/x86/rdopt_avx2.c  254
-rw-r--r--  third_party/aom/av1/encoder/x86/rdopt_sse4.c  272
-rw-r--r--  third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c  347
-rw-r--r--  third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c  67
-rw-r--r--  third_party/aom/av1/encoder/x86/temporal_filter_avx2.c  647
-rw-r--r--  third_party/aom/av1/encoder/x86/temporal_filter_sse2.c  320
-rw-r--r--  third_party/aom/av1/encoder/x86/wedge_utils_avx2.c  215
-rw-r--r--  third_party/aom/av1/encoder/x86/wedge_utils_sse2.c  254
232 files changed, 188397 insertions, 0 deletions
diff --git a/third_party/aom/av1/encoder/allintra_vis.c b/third_party/aom/av1/encoder/allintra_vis.c
new file mode 100644
index 0000000000..8dcef5fc85
--- /dev/null
+++ b/third_party/aom/av1/encoder/allintra_vis.c
@@ -0,0 +1,1055 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#if CONFIG_TFLITE
+#include "tensorflow/lite/c/c_api.h"
+#include "av1/encoder/deltaq4_model.c"
+#endif
+
+#include "av1/common/common_data.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/allintra_vis.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/rdopt_utils.h"
+
+#define MB_WIENER_PRED_BLOCK_SIZE BLOCK_128X128
+#define MB_WIENER_PRED_BUF_STRIDE 128
+
+void av1_alloc_mb_wiener_var_pred_buf(AV1_COMMON *cm, ThreadData *td) {
+ const int is_high_bitdepth = is_cur_buf_hbd(&td->mb.e_mbd);
+ assert(MB_WIENER_PRED_BLOCK_SIZE < BLOCK_SIZES_ALL);
+ const int buf_width = block_size_wide[MB_WIENER_PRED_BLOCK_SIZE];
+ const int buf_height = block_size_high[MB_WIENER_PRED_BLOCK_SIZE];
+ assert(buf_width == MB_WIENER_PRED_BUF_STRIDE);
+ const size_t buf_size =
+ (buf_width * buf_height * sizeof(*td->wiener_tmp_pred_buf))
+ << is_high_bitdepth;
+ CHECK_MEM_ERROR(cm, td->wiener_tmp_pred_buf, aom_memalign(32, buf_size));
+}
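
Note on sizing: the shift by is_high_bitdepth doubles the byte count when samples are 16-bit. With a 128x128 prediction block and sizeof(*wiener_tmp_pred_buf) == 1, the allocation works out to 128 * 128 = 16384 bytes for 8-bit input and 32768 bytes for 10/12-bit input, returned 32-byte aligned by aom_memalign.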
+
+void av1_dealloc_mb_wiener_var_pred_buf(ThreadData *td) {
+ aom_free(td->wiener_tmp_pred_buf);
+ td->wiener_tmp_pred_buf = NULL;
+}
+
+void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+
+ // This block size is also used to determine number of workers in
+ // multi-threading. If it is changed, one needs to change it accordingly in
+ // "compute_num_ai_workers()".
+ cpi->weber_bsize = BLOCK_8X8;
+
+ if (cpi->oxcf.enable_rate_guide_deltaq) {
+ if (cpi->mb_weber_stats && cpi->prep_rate_estimates &&
+ cpi->ext_rate_distribution)
+ return;
+ } else {
+ if (cpi->mb_weber_stats) return;
+ }
+
+ CHECK_MEM_ERROR(cm, cpi->mb_weber_stats,
+ aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols,
+ sizeof(*cpi->mb_weber_stats)));
+
+ if (cpi->oxcf.enable_rate_guide_deltaq) {
+ CHECK_MEM_ERROR(
+ cm, cpi->prep_rate_estimates,
+ aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols,
+ sizeof(*cpi->prep_rate_estimates)));
+
+ CHECK_MEM_ERROR(
+ cm, cpi->ext_rate_distribution,
+ aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols,
+ sizeof(*cpi->ext_rate_distribution)));
+ }
+}
+
+static int64_t get_satd(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+
+ const int mi_step = mi_size_wide[cpi->weber_bsize];
+ int mb_stride = cpi->frame_info.mi_cols;
+ int mb_count = 0;
+ int64_t satd = 0;
+
+ for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+ for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+ if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+ continue;
+
+ satd += cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)]
+ .satd;
+ ++mb_count;
+ }
+ }
+
+ if (mb_count) satd = (int)(satd / mb_count);
+ satd = AOMMAX(1, satd);
+
+ return (int)satd;
+}
+
+static int64_t get_sse(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+
+ const int mi_step = mi_size_wide[cpi->weber_bsize];
+ int mb_stride = cpi->frame_info.mi_cols;
+ int mb_count = 0;
+ int64_t distortion = 0;
+
+ for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+ for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+ if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+ continue;
+
+ distortion +=
+ cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)]
+ .distortion;
+ ++mb_count;
+ }
+ }
+
+ if (mb_count) distortion = (int)(distortion / mb_count);
+ distortion = AOMMAX(1, distortion);
+
+ return (int)distortion;
+}
+
+static double get_max_scale(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+ const int mi_step = mi_size_wide[cpi->weber_bsize];
+ int mb_stride = cpi->frame_info.mi_cols;
+ double min_max_scale = 10.0;
+
+ for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+ for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+ if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+ continue;
+ WeberStats *weber_stats =
+ &cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)];
+ if (weber_stats->max_scale < 1.0) continue;
+ if (weber_stats->max_scale < min_max_scale)
+ min_max_scale = weber_stats->max_scale;
+ }
+ }
+ return min_max_scale;
+}
+
+static int get_window_wiener_var(AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+
+ const int mi_step = mi_size_wide[cpi->weber_bsize];
+ int sb_wiener_var = 0;
+ int mb_stride = cpi->frame_info.mi_cols;
+ int mb_count = 0;
+ double base_num = 1;
+ double base_den = 1;
+ double base_reg = 1;
+
+ for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+ for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+ if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+ continue;
+
+ WeberStats *weber_stats =
+ &cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)];
+
+ base_num += ((double)weber_stats->distortion) *
+ sqrt((double)weber_stats->src_variance) *
+ weber_stats->rec_pix_max;
+
+ base_den += fabs(
+ weber_stats->rec_pix_max * sqrt((double)weber_stats->src_variance) -
+ weber_stats->src_pix_max * sqrt((double)weber_stats->rec_variance));
+
+ base_reg += sqrt((double)weber_stats->distortion) *
+ sqrt((double)weber_stats->src_pix_max) * 0.1;
+ ++mb_count;
+ }
+ }
+
+ sb_wiener_var =
+ (int)(((base_num + base_reg) / (base_den + base_reg)) / mb_count);
+ sb_wiener_var = AOMMAX(1, sb_wiener_var);
+
+ return (int)sb_wiener_var;
+}
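
Spelled out, the loop above computes a regularized Weber-style ratio. With D_b the per-block distortion, sigma^2_{src,b} and sigma^2_{rec,b} the source and reconstruction variances, m_{src,b} and m_{rec,b} the pixel maxima, and N = mb_count:

\[
\mathrm{var}_{win} = \frac{1}{N} \cdot
  \frac{1 + r + \sum_b D_b \sqrt{\sigma^2_{src,b}} \, m_{rec,b}}
       {1 + r + \sum_b \bigl| m_{rec,b} \sqrt{\sigma^2_{src,b}} - m_{src,b} \sqrt{\sigma^2_{rec,b}} \bigr|},
\qquad r = 1 + 0.1 \sum_b \sqrt{D_b} \sqrt{m_{src,b}},
\]

where the shared regularizer r keeps the ratio near 1 when the window statistics are weak.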
+
+static int get_var_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+
+ int sb_wiener_var = get_window_wiener_var(cpi, bsize, mi_row, mi_col);
+
+ if (mi_row >= (mi_high / 2)) {
+ sb_wiener_var =
+ AOMMIN(sb_wiener_var,
+ get_window_wiener_var(cpi, bsize, mi_row - mi_high / 2, mi_col));
+ }
+ if (mi_row <= (cm->mi_params.mi_rows - mi_high - (mi_high / 2))) {
+ sb_wiener_var =
+ AOMMIN(sb_wiener_var,
+ get_window_wiener_var(cpi, bsize, mi_row + mi_high / 2, mi_col));
+ }
+ if (mi_col >= (mi_wide / 2)) {
+ sb_wiener_var =
+ AOMMIN(sb_wiener_var,
+ get_window_wiener_var(cpi, bsize, mi_row, mi_col - mi_wide / 2));
+ }
+ if (mi_col <= (cm->mi_params.mi_cols - mi_wide - (mi_wide / 2))) {
+ sb_wiener_var =
+ AOMMIN(sb_wiener_var,
+ get_window_wiener_var(cpi, bsize, mi_row, mi_col + mi_wide / 2));
+ }
+
+ return sb_wiener_var;
+}
+
+static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) {
+ const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+
+ assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob);
+ int rate_cost = 1;
+
+ for (int idx = 0; idx < eob; ++idx) {
+ int abs_level = abs(qcoeff[scan_order->scan[idx]]);
+ rate_cost += (int)(log1p(abs_level) / log(2.0)) + 1 + (abs_level > 0);
+ }
+
+ return (rate_cost << AV1_PROB_COST_SHIFT);
+}
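
The estimator charges roughly log2(1 + |level|) bits per scanned coefficient, plus one flat bit and one sign bit for nonzero levels, then converts to the encoder's fixed-point cost units (1 bit == 1 << AV1_PROB_COST_SHIFT). A self-contained sketch of the same model; the function and array names here are illustrative, not libaom API:

#include <math.h>
#include <stdlib.h>

#define PROB_COST_SHIFT 9 /* same scale as AV1_PROB_COST_SHIFT */

/* levels[] holds quantized coefficients already in scan order. */
static int estimate_block_bits(const int *levels, int eob) {
  int bits = 1; /* baseline, mirrors rate_cost = 1 above */
  for (int i = 0; i < eob; ++i) {
    const int mag = abs(levels[i]);
    /* ~log2(1 + |level|) magnitude bits, +1 flat, +1 sign if nonzero. */
    bits += (int)(log1p(mag) / log(2.0)) + 1 + (mag > 0);
  }
  return bits << PROB_COST_SHIFT;
}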
+
+void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
+ MACROBLOCKD *xd, const int mi_row,
+ int16_t *src_diff, tran_low_t *coeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ double *sum_rec_distortion,
+ double *sum_est_rate, uint8_t *pred_buffer) {
+ AV1_COMMON *const cm = &cpi->common;
+ uint8_t *buffer = cpi->source->y_buffer;
+ int buf_stride = cpi->source->y_stride;
+ MB_MODE_INFO mbmi;
+ memset(&mbmi, 0, sizeof(mbmi));
+ MB_MODE_INFO *mbmi_ptr = &mbmi;
+ xd->mi = &mbmi_ptr;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const TX_SIZE tx_size = max_txsize_lookup[bsize];
+ const int block_size = tx_size_wide[tx_size];
+ const int coeff_count = block_size * block_size;
+ const int mb_step = mi_size_wide[bsize];
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ const MultiThreadInfo *const mt_info = &cpi->mt_info;
+ const AV1EncAllIntraMultiThreadInfo *const intra_mt = &mt_info->intra_mt;
+ AV1EncRowMultiThreadSync *const intra_row_mt_sync =
+ &cpi->ppi->intra_row_mt_sync;
+ const int mi_cols = cm->mi_params.mi_cols;
+ const int mt_thread_id = mi_row / mb_step;
+ // TODO(chengchen): test different unit step size
+ const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE];
+ const int mt_unit_cols = (mi_cols + (mt_unit_step >> 1)) / mt_unit_step;
+ int mt_unit_col = 0;
+ const int is_high_bitdepth = is_cur_buf_hbd(xd);
+
+ uint8_t *dst_buffer = pred_buffer;
+ const int dst_buffer_stride = MB_WIENER_PRED_BUF_STRIDE;
+
+ if (is_high_bitdepth) {
+ uint16_t *pred_buffer_16 = (uint16_t *)pred_buffer;
+ dst_buffer = CONVERT_TO_BYTEPTR(pred_buffer_16);
+ }
+
+ for (int mi_col = 0; mi_col < mi_cols; mi_col += mb_step) {
+ if (mi_col % mt_unit_step == 0) {
+ intra_mt->intra_sync_read_ptr(intra_row_mt_sync, mt_thread_id,
+ mt_unit_col);
+#if CONFIG_MULTITHREAD
+ const int num_workers =
+ AOMMIN(mt_info->num_mod_workers[MOD_AI], mt_info->num_workers);
+ if (num_workers > 1) {
+ const AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ pthread_mutex_lock(enc_row_mt->mutex_);
+ const bool exit = enc_row_mt->mb_wiener_mt_exit;
+ pthread_mutex_unlock(enc_row_mt->mutex_);
+ // Stop further processing in case any worker has encountered an error.
+ if (exit) break;
+ }
+#endif
+ }
+
+ PREDICTION_MODE best_mode = DC_PRED;
+ int best_intra_cost = INT_MAX;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+ mi_row, mi_col);
+ set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width,
+ AOMMIN(mi_row + mi_height, cm->mi_params.mi_rows),
+ AOMMIN(mi_col + mi_width, cm->mi_params.mi_cols));
+ set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize],
+ av1_num_planes(cm));
+ xd->mi[0]->bsize = bsize;
+ xd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+ // Set above and left mbmi to NULL as they are not available in the
+ // preprocessing stage.
+    // They are used to determine intra edge filter types in intra prediction.
+ if (xd->up_available) {
+ xd->above_mbmi = NULL;
+ }
+ if (xd->left_available) {
+ xd->left_mbmi = NULL;
+ }
+ uint8_t *mb_buffer =
+ buffer + mi_row * MI_SIZE * buf_stride + mi_col * MI_SIZE;
+ for (PREDICTION_MODE mode = INTRA_MODE_START; mode < INTRA_MODE_END;
+ ++mode) {
+      // TODO(chengchen): Here we use the source instead of the reconstructed
+      // frame as the intra predictor so that the single-threaded and
+      // multithreaded versions match. Ideally we want the reconstructed frame.
+ av1_predict_intra_block(
+ xd, cm->seq_params->sb_size, cm->seq_params->enable_intra_edge_filter,
+ block_size, block_size, tx_size, mode, 0, 0, FILTER_INTRA_MODES,
+ mb_buffer, buf_stride, dst_buffer, dst_buffer_stride, 0, 0, 0);
+ av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size,
+ mb_buffer, buf_stride, dst_buffer, dst_buffer_stride);
+ av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff);
+ int intra_cost = aom_satd(coeff, coeff_count);
+ if (intra_cost < best_intra_cost) {
+ best_intra_cost = intra_cost;
+ best_mode = mode;
+ }
+ }
+
+ av1_predict_intra_block(
+ xd, cm->seq_params->sb_size, cm->seq_params->enable_intra_edge_filter,
+ block_size, block_size, tx_size, best_mode, 0, 0, FILTER_INTRA_MODES,
+ mb_buffer, buf_stride, dst_buffer, dst_buffer_stride, 0, 0, 0);
+ av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size,
+ mb_buffer, buf_stride, dst_buffer, dst_buffer_stride);
+ av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff);
+
+ const struct macroblock_plane *const p = &x->plane[0];
+ uint16_t eob;
+ const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+ QUANT_PARAM quant_param;
+ int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+ av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param);
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
+ scan_order, &quant_param);
+ } else {
+ av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
+ scan_order, &quant_param);
+ }
+#else
+ av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob, scan_order,
+ &quant_param);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+ if (cpi->oxcf.enable_rate_guide_deltaq) {
+ const int rate_cost = rate_estimator(qcoeff, eob, tx_size);
+ cpi->prep_rate_estimates[(mi_row / mb_step) * cpi->frame_info.mi_cols +
+ (mi_col / mb_step)] = rate_cost;
+ }
+
+ av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst_buffer,
+ dst_buffer_stride, eob, 0);
+ WeberStats *weber_stats =
+ &cpi->mb_weber_stats[(mi_row / mb_step) * cpi->frame_info.mi_cols +
+ (mi_col / mb_step)];
+
+ weber_stats->rec_pix_max = 1;
+ weber_stats->rec_variance = 0;
+ weber_stats->src_pix_max = 1;
+ weber_stats->src_variance = 0;
+ weber_stats->distortion = 0;
+
+ int64_t src_mean = 0;
+ int64_t rec_mean = 0;
+ int64_t dist_mean = 0;
+
+ for (int pix_row = 0; pix_row < block_size; ++pix_row) {
+ for (int pix_col = 0; pix_col < block_size; ++pix_col) {
+ int src_pix, rec_pix;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(mb_buffer);
+ uint16_t *rec = CONVERT_TO_SHORTPTR(dst_buffer);
+ src_pix = src[pix_row * buf_stride + pix_col];
+ rec_pix = rec[pix_row * dst_buffer_stride + pix_col];
+ } else {
+ src_pix = mb_buffer[pix_row * buf_stride + pix_col];
+ rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col];
+ }
+#else
+ src_pix = mb_buffer[pix_row * buf_stride + pix_col];
+ rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col];
+#endif
+ src_mean += src_pix;
+ rec_mean += rec_pix;
+ dist_mean += src_pix - rec_pix;
+ weber_stats->src_variance += src_pix * src_pix;
+ weber_stats->rec_variance += rec_pix * rec_pix;
+ weber_stats->src_pix_max = AOMMAX(weber_stats->src_pix_max, src_pix);
+ weber_stats->rec_pix_max = AOMMAX(weber_stats->rec_pix_max, rec_pix);
+ weber_stats->distortion += (src_pix - rec_pix) * (src_pix - rec_pix);
+ }
+ }
+
+ if (cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) {
+ *sum_rec_distortion += weber_stats->distortion;
+ int est_block_rate = 0;
+ int64_t est_block_dist = 0;
+ model_rd_sse_fn[MODELRD_LEGACY](cpi, x, bsize, 0, weber_stats->distortion,
+ pix_num, &est_block_rate,
+ &est_block_dist);
+ *sum_est_rate += est_block_rate;
+ }
+
+ weber_stats->src_variance -= (src_mean * src_mean) / pix_num;
+ weber_stats->rec_variance -= (rec_mean * rec_mean) / pix_num;
+ weber_stats->distortion -= (dist_mean * dist_mean) / pix_num;
+ weber_stats->satd = best_intra_cost;
+
+ qcoeff[0] = 0;
+ int max_scale = 0;
+ for (int idx = 1; idx < coeff_count; ++idx) {
+ const int abs_qcoeff = abs(qcoeff[idx]);
+ max_scale = AOMMAX(max_scale, abs_qcoeff);
+ }
+ weber_stats->max_scale = max_scale;
+
+ if ((mi_col + mb_step) % mt_unit_step == 0 ||
+ (mi_col + mb_step) >= mi_cols) {
+ intra_mt->intra_sync_write_ptr(intra_row_mt_sync, mt_thread_id,
+ mt_unit_col, mt_unit_cols);
+ ++mt_unit_col;
+ }
+ }
+ // Set the pointer to null since mbmi is only allocated inside this function.
+ xd->mi = NULL;
+}
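
The *_variance and distortion fields are finalized near the end of the loop with the one-pass identity N * Var(x) = sum(x^2) - (sum(x))^2 / N, applied to the source pixels, the reconstructed pixels, and their difference. A minimal self-contained sketch of that bookkeeping (the helper name is illustrative, not a libaom function):

#include <stdint.h>

// Returns N * variance of an n x n block: sum(x^2) - (sum(x))^2 / N.
static int64_t block_sum_sq_dev(const uint8_t *p, int stride, int n) {
  int64_t sum = 0, sum_sq = 0;
  for (int r = 0; r < n; ++r) {
    for (int c = 0; c < n; ++c) {
      const int v = p[r * stride + c];
      sum += v;
      sum_sq += (int64_t)v * v;
    }
  }
  return sum_sq - (sum * sum) / ((int64_t)n * n);
}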
+
+static void calc_mb_wiener_var(AV1_COMP *const cpi, double *sum_rec_distortion,
+ double *sum_est_rate) {
+ MACROBLOCK *x = &cpi->td.mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const int mb_step = mi_size_wide[bsize];
+ DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
+ for (int mi_row = 0; mi_row < cpi->frame_info.mi_rows; mi_row += mb_step) {
+ av1_calc_mb_wiener_var_row(cpi, x, xd, mi_row, src_diff, coeff, qcoeff,
+ dqcoeff, sum_rec_distortion, sum_est_rate,
+ cpi->td.wiener_tmp_pred_buf);
+ }
+}
+
+static int64_t estimate_wiener_var_norm(AV1_COMP *const cpi,
+ const BLOCK_SIZE norm_block_size) {
+ const AV1_COMMON *const cm = &cpi->common;
+ int64_t norm_factor = 1;
+ assert(norm_block_size >= BLOCK_16X16 && norm_block_size <= BLOCK_128X128);
+ const int norm_step = mi_size_wide[norm_block_size];
+ double sb_wiener_log = 0;
+ double sb_count = 0;
+ for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += norm_step) {
+ for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += norm_step) {
+ const int sb_wiener_var =
+ get_var_perceptual_ai(cpi, norm_block_size, mi_row, mi_col);
+ const int64_t satd = get_satd(cpi, norm_block_size, mi_row, mi_col);
+ const int64_t sse = get_sse(cpi, norm_block_size, mi_row, mi_col);
+ const double scaled_satd = (double)satd / sqrt((double)sse);
+ sb_wiener_log += scaled_satd * log(sb_wiener_var);
+ sb_count += scaled_satd;
+ }
+ }
+ if (sb_count > 0) norm_factor = (int64_t)(exp(sb_wiener_log / sb_count));
+ norm_factor = AOMMAX(1, norm_factor);
+
+ return norm_factor;
+}
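
The normalizer is a SATD-weighted geometric mean of the per-superblock Wiener variances:

\[
\mathrm{norm} = \exp\!\left( \frac{\sum_i w_i \log v_i}{\sum_i w_i} \right),
\qquad w_i = \frac{\mathrm{satd}_i}{\sqrt{\mathrm{sse}_i}},
\]

so superblocks whose SATD is large relative to their reconstruction error dominate the frame-level estimate.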
+
+static void automatic_intra_tools_off(AV1_COMP *cpi,
+ const double sum_rec_distortion,
+ const double sum_est_rate) {
+ if (!cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) return;
+
+ // Thresholds
+ const int high_quality_qindex = 128;
+ const double high_quality_bpp = 2.0;
+ const double high_quality_dist_per_pix = 4.0;
+
+ AV1_COMMON *const cm = &cpi->common;
+ const int qindex = cm->quant_params.base_qindex;
+ const double dist_per_pix =
+ (double)sum_rec_distortion / (cm->width * cm->height);
+  // The estimated bpp is not accurate; divide by an empirical constant of 100.
+ const double estimate_bpp = sum_est_rate / (cm->width * cm->height * 100);
+
+ if (qindex < high_quality_qindex && estimate_bpp > high_quality_bpp &&
+ dist_per_pix < high_quality_dist_per_pix) {
+ cpi->oxcf.intra_mode_cfg.enable_smooth_intra = 0;
+ cpi->oxcf.intra_mode_cfg.enable_paeth_intra = 0;
+ cpi->oxcf.intra_mode_cfg.enable_cfl_intra = 0;
+ cpi->oxcf.intra_mode_cfg.enable_diagonal_intra = 0;
+ }
+}
+
+static void ext_rate_guided_quantization(AV1_COMP *cpi) {
+ // Calculation uses 8x8.
+ const int mb_step = mi_size_wide[cpi->weber_bsize];
+ // Accumulate to 16x16, step size is in the unit of mi.
+ const int block_step = 4;
+
+ const char *filename = cpi->oxcf.rate_distribution_info;
+ FILE *pfile = fopen(filename, "r");
+ if (pfile == NULL) {
+ assert(pfile != NULL);
+ return;
+ }
+
+ double ext_rate_sum = 0.0;
+ for (int row = 0; row < cpi->frame_info.mi_rows; row += block_step) {
+ for (int col = 0; col < cpi->frame_info.mi_cols; col += block_step) {
+ float val;
+ const int fields_converted = fscanf(pfile, "%f", &val);
+ if (fields_converted != 1) {
+ assert(fields_converted == 1);
+ fclose(pfile);
+ return;
+ }
+ ext_rate_sum += val;
+ cpi->ext_rate_distribution[(row / mb_step) * cpi->frame_info.mi_cols +
+ (col / mb_step)] = val;
+ }
+ }
+ fclose(pfile);
+
+ int uniform_rate_sum = 0;
+ for (int row = 0; row < cpi->frame_info.mi_rows; row += block_step) {
+ for (int col = 0; col < cpi->frame_info.mi_cols; col += block_step) {
+ int rate_sum = 0;
+ for (int r = 0; r < block_step; r += mb_step) {
+ for (int c = 0; c < block_step; c += mb_step) {
+ const int mi_row = row + r;
+ const int mi_col = col + c;
+ rate_sum += cpi->prep_rate_estimates[(mi_row / mb_step) *
+ cpi->frame_info.mi_cols +
+ (mi_col / mb_step)];
+ }
+ }
+ uniform_rate_sum += rate_sum;
+ }
+ }
+
+ const double scale = uniform_rate_sum / ext_rate_sum;
+ cpi->ext_rate_scale = scale;
+}
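
The final scale aligns the two rate maps: multiplying every external value by scale = (sum of prep_rate_estimates) / (sum of ext_rate_distribution) makes both maps sum to the same total, so the quantizer adjustment in get_rate_guided_quantizer() below reacts only to how the external model redistributes rate across blocks, not to its absolute units.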
+
+void av1_set_mb_wiener_variance(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ if (aom_realloc_frame_buffer(
+ &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+ NULL, cpi->image_pyramid_levels, 0))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ av1_alloc_mb_wiener_var_pred_buf(&cpi->common, &cpi->td);
+ cpi->norm_wiener_variance = 0;
+
+ MACROBLOCK *x = &cpi->td.mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+  // xd->mi needs to be set up since it is used in av1_frame_init_quantizer.
+ MB_MODE_INFO mbmi;
+ memset(&mbmi, 0, sizeof(mbmi));
+ MB_MODE_INFO *mbmi_ptr = &mbmi;
+ xd->mi = &mbmi_ptr;
+ cm->quant_params.base_qindex = cpi->oxcf.rc_cfg.cq_level;
+ av1_frame_init_quantizer(cpi);
+
+ double sum_rec_distortion = 0.0;
+ double sum_est_rate = 0.0;
+
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ const int num_workers =
+ AOMMIN(mt_info->num_mod_workers[MOD_AI], mt_info->num_workers);
+ AV1EncAllIntraMultiThreadInfo *const intra_mt = &mt_info->intra_mt;
+ intra_mt->intra_sync_read_ptr = av1_row_mt_sync_read_dummy;
+ intra_mt->intra_sync_write_ptr = av1_row_mt_sync_write_dummy;
+ // Calculate differential contrast for each block for the entire image.
+ // TODO(chengchen): properly accumulate the distortion and rate in
+ // av1_calc_mb_wiener_var_mt(). Until then, call calc_mb_wiener_var() if
+ // auto_intra_tools_off is true.
+ if (num_workers > 1 && !cpi->oxcf.intra_mode_cfg.auto_intra_tools_off) {
+ intra_mt->intra_sync_read_ptr = av1_row_mt_sync_read;
+ intra_mt->intra_sync_write_ptr = av1_row_mt_sync_write;
+ av1_calc_mb_wiener_var_mt(cpi, num_workers, &sum_rec_distortion,
+ &sum_est_rate);
+ } else {
+ calc_mb_wiener_var(cpi, &sum_rec_distortion, &sum_est_rate);
+ }
+
+ // Determine whether to turn off several intra coding tools.
+ automatic_intra_tools_off(cpi, sum_rec_distortion, sum_est_rate);
+
+ // Read external rate distribution and use it to guide delta quantization
+ if (cpi->oxcf.enable_rate_guide_deltaq) ext_rate_guided_quantization(cpi);
+
+ const BLOCK_SIZE norm_block_size = cm->seq_params->sb_size;
+ cpi->norm_wiener_variance = estimate_wiener_var_norm(cpi, norm_block_size);
+ const int norm_step = mi_size_wide[norm_block_size];
+
+ double sb_wiener_log = 0;
+ double sb_count = 0;
+ for (int its_cnt = 0; its_cnt < 2; ++its_cnt) {
+ sb_wiener_log = 0;
+ sb_count = 0;
+ for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += norm_step) {
+ for (int mi_col = 0; mi_col < cm->mi_params.mi_cols;
+ mi_col += norm_step) {
+ int sb_wiener_var =
+ get_var_perceptual_ai(cpi, norm_block_size, mi_row, mi_col);
+
+ double beta = (double)cpi->norm_wiener_variance / sb_wiener_var;
+ double min_max_scale = AOMMAX(
+ 1.0, get_max_scale(cpi, cm->seq_params->sb_size, mi_row, mi_col));
+
+ beta = AOMMIN(beta, 4);
+ beta = AOMMAX(beta, 0.25);
+
+ if (beta < 1 / min_max_scale) continue;
+
+ sb_wiener_var = (int)(cpi->norm_wiener_variance / beta);
+
+ int64_t satd = get_satd(cpi, norm_block_size, mi_row, mi_col);
+ int64_t sse = get_sse(cpi, norm_block_size, mi_row, mi_col);
+ double scaled_satd = (double)satd / sqrt((double)sse);
+ sb_wiener_log += scaled_satd * log(sb_wiener_var);
+ sb_count += scaled_satd;
+ }
+ }
+
+ if (sb_count > 0)
+ cpi->norm_wiener_variance = (int64_t)(exp(sb_wiener_log / sb_count));
+ cpi->norm_wiener_variance = AOMMAX(1, cpi->norm_wiener_variance);
+ }
+
+ // Set the pointer to null since mbmi is only allocated inside this function.
+ xd->mi = NULL;
+ aom_free_frame_buffer(&cm->cur_frame->buf);
+ av1_dealloc_mb_wiener_var_pred_buf(&cpi->td);
+}
+
+static int get_rate_guided_quantizer(AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ // Calculation uses 8x8.
+ const int mb_step = mi_size_wide[cpi->weber_bsize];
+ // Accumulate to 16x16
+ const int block_step = mi_size_wide[BLOCK_16X16];
+ double sb_rate_hific = 0.0;
+ double sb_rate_uniform = 0.0;
+  for (int row = mi_row; row < mi_row + mi_size_high[bsize];
+       row += block_step) {
+    for (int col = mi_col; col < mi_col + mi_size_wide[bsize];
+         col += block_step) {
+ sb_rate_hific +=
+ cpi->ext_rate_distribution[(row / mb_step) * cpi->frame_info.mi_cols +
+ (col / mb_step)];
+
+ for (int r = 0; r < block_step; r += mb_step) {
+ for (int c = 0; c < block_step; c += mb_step) {
+ const int this_row = row + r;
+ const int this_col = col + c;
+ sb_rate_uniform +=
+ cpi->prep_rate_estimates[(this_row / mb_step) *
+ cpi->frame_info.mi_cols +
+ (this_col / mb_step)];
+ }
+ }
+ }
+ }
+ sb_rate_hific *= cpi->ext_rate_scale;
+
+ const double weight = 1.0;
+ const double rate_diff =
+ weight * (sb_rate_hific - sb_rate_uniform) / sb_rate_uniform;
+ double scale = pow(2, rate_diff);
+
+ scale = scale * scale;
+ double min_max_scale = AOMMAX(1.0, get_max_scale(cpi, bsize, mi_row, mi_col));
+ scale = 1.0 / AOMMIN(1.0 / scale, min_max_scale);
+
+ AV1_COMMON *const cm = &cpi->common;
+ const int base_qindex = cm->quant_params.base_qindex;
+ int offset =
+ av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, scale);
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ const int max_offset = delta_q_info->delta_q_res * 10;
+ offset = AOMMIN(offset, max_offset - 1);
+ offset = AOMMAX(offset, -max_offset + 1);
+ int qindex = cm->quant_params.base_qindex + offset;
+ qindex = AOMMIN(qindex, MAXQ);
+ qindex = AOMMAX(qindex, MINQ);
+ if (base_qindex > MINQ) qindex = AOMMAX(qindex, MINQ + 1);
+
+ return qindex;
+}
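
A worked example with illustrative numbers: if the external map assigns a region 20% more rate than the uniform estimate, rate_diff = 0.2, scale = 2^0.2 ≈ 1.15 and scale^2 ≈ 1.32; assuming av1_get_deltaq_offset() keeps its usual convention that a scale above 1 lowers q, the block receives a negative qindex offset (finer quantization), clamped here to within ±(10 * delta_q_res - 1) of the base q.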
+
+int av1_get_sbq_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ if (cpi->oxcf.enable_rate_guide_deltaq) {
+ return get_rate_guided_quantizer(cpi, bsize, mi_row, mi_col);
+ }
+
+ AV1_COMMON *const cm = &cpi->common;
+ const int base_qindex = cm->quant_params.base_qindex;
+ int sb_wiener_var = get_var_perceptual_ai(cpi, bsize, mi_row, mi_col);
+ int offset = 0;
+ double beta = (double)cpi->norm_wiener_variance / sb_wiener_var;
+ double min_max_scale = AOMMAX(1.0, get_max_scale(cpi, bsize, mi_row, mi_col));
+ beta = 1.0 / AOMMIN(1.0 / beta, min_max_scale);
+
+  // Cap beta such that the delta q value is not too far away from the base q.
+ beta = AOMMIN(beta, 4);
+ beta = AOMMAX(beta, 0.25);
+ offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta);
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ offset = AOMMIN(offset, delta_q_info->delta_q_res * 20 - 1);
+ offset = AOMMAX(offset, -delta_q_info->delta_q_res * 20 + 1);
+ int qindex = cm->quant_params.base_qindex + offset;
+ qindex = AOMMIN(qindex, MAXQ);
+ qindex = AOMMAX(qindex, MINQ);
+ if (base_qindex > MINQ) qindex = AOMMAX(qindex, MINQ + 1);
+
+ return qindex;
+}
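
Here beta = norm_wiener_variance / sb_wiener_var, so superblocks with Wiener variance below the frame norm (beta > 1) are quantized more finely and busier ones more coarsely. For example, with norm_wiener_variance = 4000 and sb_wiener_var = 1000, beta caps at 4.0 and, assuming the same beta > 1 implies lower q convention of av1_get_deltaq_offset(), the block gets a negative offset, subject to the ±(20 * delta_q_res - 1) clamp.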
+
+void av1_init_mb_ur_var_buffer(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+
+ if (cpi->mb_delta_q) return;
+
+ CHECK_MEM_ERROR(cm, cpi->mb_delta_q,
+ aom_calloc(cpi->frame_info.mb_rows * cpi->frame_info.mb_cols,
+ sizeof(*cpi->mb_delta_q)));
+}
+
+#if CONFIG_TFLITE
+static int model_predict(BLOCK_SIZE block_size, int num_cols, int num_rows,
+ int bit_depth, uint8_t *y_buffer, int y_stride,
+ float *predicts0, float *predicts1) {
+ // Create the model and interpreter options.
+ TfLiteModel *model =
+ TfLiteModelCreate(av1_deltaq4_model_file, av1_deltaq4_model_fsize);
+ if (model == NULL) return 1;
+
+ TfLiteInterpreterOptions *options = TfLiteInterpreterOptionsCreate();
+ TfLiteInterpreterOptionsSetNumThreads(options, 2);
+ if (options == NULL) {
+ TfLiteModelDelete(model);
+ return 1;
+ }
+
+ // Create the interpreter.
+ TfLiteInterpreter *interpreter = TfLiteInterpreterCreate(model, options);
+ if (interpreter == NULL) {
+ TfLiteInterpreterOptionsDelete(options);
+ TfLiteModelDelete(model);
+ return 1;
+ }
+
+ // Allocate tensors and populate the input tensor data.
+ TfLiteInterpreterAllocateTensors(interpreter);
+ TfLiteTensor *input_tensor = TfLiteInterpreterGetInputTensor(interpreter, 0);
+ if (input_tensor == NULL) {
+ TfLiteInterpreterDelete(interpreter);
+ TfLiteInterpreterOptionsDelete(options);
+ TfLiteModelDelete(model);
+ return 1;
+ }
+
+ size_t input_size = TfLiteTensorByteSize(input_tensor);
+ float *input_data = aom_calloc(input_size, 1);
+ if (input_data == NULL) {
+ TfLiteInterpreterDelete(interpreter);
+ TfLiteInterpreterOptionsDelete(options);
+ TfLiteModelDelete(model);
+ return 1;
+ }
+
+ const int num_mi_w = mi_size_wide[block_size];
+ const int num_mi_h = mi_size_high[block_size];
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int row_offset = (row * num_mi_h) << 2;
+ const int col_offset = (col * num_mi_w) << 2;
+
+ uint8_t *buf = y_buffer + row_offset * y_stride + col_offset;
+ int r = row_offset, pos = 0;
+ const float base = (float)((1 << bit_depth) - 1);
+ while (r < row_offset + (num_mi_h << 2)) {
+ for (int c = 0; c < (num_mi_w << 2); ++c) {
+ input_data[pos++] = bit_depth > 8
+ ? (float)*CONVERT_TO_SHORTPTR(buf + c) / base
+ : (float)*(buf + c) / base;
+ }
+ buf += y_stride;
+ ++r;
+ }
+ TfLiteTensorCopyFromBuffer(input_tensor, input_data, input_size);
+
+ // Execute inference.
+      if (TfLiteInterpreterInvoke(interpreter) != kTfLiteOk) {
+        TfLiteInterpreterDelete(interpreter);
+        TfLiteInterpreterOptionsDelete(options);
+        TfLiteModelDelete(model);
+        aom_free(input_data);  // Release the input buffer on this error path.
+        return 1;
+      }
+
+ // Extract the output tensor data.
+ const TfLiteTensor *output_tensor =
+ TfLiteInterpreterGetOutputTensor(interpreter, 0);
+      if (output_tensor == NULL) {
+        TfLiteInterpreterDelete(interpreter);
+        TfLiteInterpreterOptionsDelete(options);
+        TfLiteModelDelete(model);
+        aom_free(input_data);  // Release the input buffer on this error path.
+        return 1;
+      }
+
+ size_t output_size = TfLiteTensorByteSize(output_tensor);
+ float output_data[2];
+
+ TfLiteTensorCopyToBuffer(output_tensor, output_data, output_size);
+ predicts0[row * num_cols + col] = output_data[0];
+ predicts1[row * num_cols + col] = output_data[1];
+ }
+ }
+
+ // Dispose of the model and interpreter objects.
+ TfLiteInterpreterDelete(interpreter);
+ TfLiteInterpreterOptionsDelete(options);
+ TfLiteModelDelete(model);
+ aom_free(input_data);
+ return 0;
+}
+
+void av1_set_mb_ur_variance(AV1_COMP *cpi) {
+ const AV1_COMMON *cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ uint8_t *y_buffer = cpi->source->y_buffer;
+ const int y_stride = cpi->source->y_stride;
+ const int block_size = cpi->common.seq_params->sb_size;
+ const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+
+ const int num_mi_w = mi_size_wide[block_size];
+ const int num_mi_h = mi_size_high[block_size];
+ const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+
+ // TODO(sdeng): fit a better model_1; disable it at this time.
+ float *mb_delta_q0, *mb_delta_q1, delta_q_avg0 = 0.0f;
+ CHECK_MEM_ERROR(cm, mb_delta_q0,
+ aom_calloc(num_rows * num_cols, sizeof(float)));
+ CHECK_MEM_ERROR(cm, mb_delta_q1,
+ aom_calloc(num_rows * num_cols, sizeof(float)));
+
+ if (model_predict(block_size, num_cols, num_rows, bit_depth, y_buffer,
+ y_stride, mb_delta_q0, mb_delta_q1)) {
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "Failed to call TFlite functions.");
+ }
+
+ // Loop through each SB block.
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ delta_q_avg0 += mb_delta_q0[index];
+ }
+ }
+
+ delta_q_avg0 /= (float)(num_rows * num_cols);
+
+ float scaling_factor;
+ const float cq_level = (float)cpi->oxcf.rc_cfg.cq_level / (float)MAXQ;
+ if (cq_level < delta_q_avg0) {
+ scaling_factor = cq_level / delta_q_avg0;
+ } else {
+ scaling_factor = 1.0f - (cq_level - delta_q_avg0) / (1.0f - delta_q_avg0);
+ }
+
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ cpi->mb_delta_q[index] =
+ RINT((float)cpi->oxcf.q_cfg.deltaq_strength / 100.0f * (float)MAXQ *
+ scaling_factor * (mb_delta_q0[index] - delta_q_avg0));
+ }
+ }
+
+ aom_free(mb_delta_q0);
+ aom_free(mb_delta_q1);
+}
+#else // !CONFIG_TFLITE
+void av1_set_mb_ur_variance(AV1_COMP *cpi) {
+ const AV1_COMMON *cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ uint8_t *y_buffer = cpi->source->y_buffer;
+ const int y_stride = cpi->source->y_stride;
+ const int block_size = cpi->common.seq_params->sb_size;
+
+ const int num_mi_w = mi_size_wide[block_size];
+ const int num_mi_h = mi_size_high[block_size];
+ const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+
+ int *mb_delta_q[2];
+ CHECK_MEM_ERROR(cm, mb_delta_q[0],
+ aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[0])));
+ CHECK_MEM_ERROR(cm, mb_delta_q[1],
+ aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[1])));
+
+  // Approximates the model change between the current version (Sept 2021) and
+  // the baseline (July 2021).
+ const double model_change[] = { 3.0, 3.0 };
+ // The following parameters are fitted from user labeled data.
+ const double a[] = { -24.50 * 4.0, -17.20 * 4.0 };
+ const double b[] = { 0.004898, 0.003093 };
+ const double c[] = { (29.932 + model_change[0]) * 4.0,
+ (42.100 + model_change[1]) * 4.0 };
+ int delta_q_avg[2] = { 0, 0 };
+ // Loop through each SB block.
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ double var = 0.0, num_of_var = 0.0;
+ const int index = row * num_cols + col;
+
+ // Loop through each 8x8 block.
+ for (int mi_row = row * num_mi_h;
+ mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h;
+ mi_row += 2) {
+ for (int mi_col = col * num_mi_w;
+ mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w;
+ mi_col += 2) {
+ struct buf_2d buf;
+ const int row_offset_y = mi_row << 2;
+ const int col_offset_y = mi_col << 2;
+
+ buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
+ buf.stride = y_stride;
+
+ unsigned int block_variance;
+ block_variance = av1_get_perpixel_variance_facade(
+ cpi, xd, &buf, BLOCK_8X8, AOM_PLANE_Y);
+
+ block_variance = AOMMAX(block_variance, 1);
+ var += log((double)block_variance);
+ num_of_var += 1.0;
+ }
+ }
+ var = exp(var / num_of_var);
+ mb_delta_q[0][index] = RINT(a[0] * exp(-b[0] * var) + c[0]);
+ mb_delta_q[1][index] = RINT(a[1] * exp(-b[1] * var) + c[1]);
+ delta_q_avg[0] += mb_delta_q[0][index];
+ delta_q_avg[1] += mb_delta_q[1][index];
+ }
+ }
+
+ delta_q_avg[0] = RINT((double)delta_q_avg[0] / (num_rows * num_cols));
+ delta_q_avg[1] = RINT((double)delta_q_avg[1] / (num_rows * num_cols));
+
+ int model_idx;
+ double scaling_factor;
+ const int cq_level = cpi->oxcf.rc_cfg.cq_level;
+ if (cq_level < delta_q_avg[0]) {
+ model_idx = 0;
+ scaling_factor = (double)cq_level / delta_q_avg[0];
+ } else if (cq_level < delta_q_avg[1]) {
+ model_idx = 2;
+ scaling_factor =
+ (double)(cq_level - delta_q_avg[0]) / (delta_q_avg[1] - delta_q_avg[0]);
+ } else {
+ model_idx = 1;
+ scaling_factor = (double)(MAXQ - cq_level) / (MAXQ - delta_q_avg[1]);
+ }
+
+ const double new_delta_q_avg =
+ delta_q_avg[0] + scaling_factor * (delta_q_avg[1] - delta_q_avg[0]);
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ if (model_idx == 2) {
+ const double delta_q =
+ mb_delta_q[0][index] +
+ scaling_factor * (mb_delta_q[1][index] - mb_delta_q[0][index]);
+ cpi->mb_delta_q[index] = RINT((double)cpi->oxcf.q_cfg.deltaq_strength /
+ 100.0 * (delta_q - new_delta_q_avg));
+ } else {
+ cpi->mb_delta_q[index] = RINT(
+ (double)cpi->oxcf.q_cfg.deltaq_strength / 100.0 * scaling_factor *
+ (mb_delta_q[model_idx][index] - delta_q_avg[model_idx]));
+ }
+ }
+ }
+
+ aom_free(mb_delta_q[0]);
+ aom_free(mb_delta_q[1]);
+}
+#endif
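
To get a feel for the fitted curves, the standalone snippet below (illustrative values only; var stands for the geometric-mean 8x8 variance computed above) evaluates delta_q = a * exp(-b * var) + c for both models:

#include <math.h>
#include <stdio.h>

int main(void) {
  const double a[2] = { -24.50 * 4.0, -17.20 * 4.0 };
  const double b[2] = { 0.004898, 0.003093 };
  const double c[2] = { (29.932 + 3.0) * 4.0, (42.100 + 3.0) * 4.0 };
  const double var = 1000.0; /* hypothetical geometric-mean variance */
  for (int m = 0; m < 2; ++m)
    printf("model %d: delta_q = %.2f\n", m, a[m] * exp(-b[m] * var) + c[m]);
  return 0;
}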
+
+int av1_get_sbq_user_rating_based(AV1_COMP *const cpi, int mi_row, int mi_col) {
+ const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size;
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+ AV1_COMMON *const cm = &cpi->common;
+ const int base_qindex = cm->quant_params.base_qindex;
+ if (base_qindex == MINQ || base_qindex == MAXQ) return base_qindex;
+
+ const int num_mi_w = mi_size_wide[bsize];
+ const int num_mi_h = mi_size_high[bsize];
+ const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+ const int index = (mi_row / num_mi_h) * num_cols + (mi_col / num_mi_w);
+ const int delta_q = cpi->mb_delta_q[index];
+
+ int qindex = base_qindex + delta_q;
+ qindex = AOMMIN(qindex, MAXQ);
+ qindex = AOMMAX(qindex, MINQ + 1);
+
+ return qindex;
+}
diff --git a/third_party/aom/av1/encoder/allintra_vis.h b/third_party/aom/av1/encoder/allintra_vis.h
new file mode 100644
index 0000000000..0d34ce0841
--- /dev/null
+++ b/third_party/aom/av1/encoder/allintra_vis.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ALLINTRA_VIS_H_
+#define AOM_AV1_ENCODER_ALLINTRA_VIS_H_
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+
+#define MB_WIENER_MT_UNIT_SIZE BLOCK_64X64
+
+void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi);
+
+void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x,
+ MACROBLOCKD *xd, const int mi_row,
+ int16_t *src_diff, tran_low_t *coeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ double *sum_rec_distortion,
+ double *sum_est_rate, uint8_t *pred_buffer);
+
+void av1_set_mb_wiener_variance(AV1_COMP *cpi);
+
+int av1_get_sbq_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col);
+
+// User rating based mode
+void av1_init_mb_ur_var_buffer(AV1_COMP *cpi);
+
+void av1_set_mb_ur_variance(AV1_COMP *cpi);
+
+int av1_get_sbq_user_rating_based(AV1_COMP *const cpi, int mi_row, int mi_col);
+
+#endif // AOM_AV1_ENCODER_ALLINTRA_VIS_H_
diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c
new file mode 100644
index 0000000000..4cf6bd572d
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_complexity.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/segmentation.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#define AQ_C_SEGMENTS 5
+#define DEFAULT_AQ2_SEG 3 // Neutral Q segment
+#define AQ_C_STRENGTHS 3
+static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { 1.75, 1.25, 1.05, 1.00, 0.90 },
+ { 2.00, 1.50, 1.15, 1.00, 0.85 },
+ { 2.50, 1.75, 1.25, 1.00, 0.80 }
+};
+static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { 0.15, 0.30, 0.55, 2.00, 100.0 },
+ { 0.20, 0.40, 0.65, 2.00, 100.0 },
+ { 0.25, 0.50, 0.75, 2.00, 100.0 }
+};
+static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { -4.0, -3.0, -2.0, 100.00, 100.0 },
+ { -3.5, -2.5, -1.5, 100.00, 100.0 },
+ { -3.0, -2.0, -1.0, 100.00, 100.0 }
+};
+
+static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) {
+  // Approximate base quantizer (truncated to int)
+ const int base_quant = av1_ac_quant_QTX(q_index, 0, bit_depth) / 4;
+ return (base_quant > 10) + (base_quant > 25);
+}
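
In other words, the truncated base quantizer is bucketed by two comparisons: base_quant <= 10 selects strength 0, 11..25 selects strength 1, and anything above 25 selects strength 2, indexing the rows of the three tables above.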
+
+static bool is_frame_aq_enabled(const AV1_COMP *const cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+
+ return frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+ refresh_frame->alt_ref_frame ||
+ (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref);
+}
+
+// Segmentation only makes sense if the target bits per SB are above a
+// Below this the overheads will usually outweigh any benefit.
+static bool is_sb_aq_enabled(const AV1_COMP *const cpi) {
+ return cpi->rc.sb64_target_rate >= 256;
+}
+
+void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int base_qindex = cm->quant_params.base_qindex;
+ struct segmentation *const seg = &cm->seg;
+ const int resolution_change =
+ cm->prev_frame && (cm->width != cm->prev_frame->width ||
+ cm->height != cm->prev_frame->height);
+
+ // Make SURE use of floating point in this function is safe.
+
+ if (resolution_change) {
+ memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+ av1_clearall_segfeatures(seg);
+ av1_disable_segmentation(seg);
+ return;
+ }
+
+ if (is_frame_aq_enabled(cpi)) {
+ int segment;
+ const int aq_strength =
+ get_aq_c_strength(base_qindex, cm->seq_params->bit_depth);
+
+ // Clear down the segment map.
+ memset(cpi->enc_seg.map, DEFAULT_AQ2_SEG,
+ cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+
+ av1_clearall_segfeatures(seg);
+
+ if (!is_sb_aq_enabled(cpi)) {
+ av1_disable_segmentation(seg);
+ return;
+ }
+
+ av1_enable_segmentation(seg);
+
+ // Default segment "Q" feature is disabled so it defaults to the baseline Q.
+ av1_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q);
+
+ // Use some of the segments for in frame Q adjustment.
+ for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) {
+ int qindex_delta;
+
+ if (segment == DEFAULT_AQ2_SEG) continue;
+
+ qindex_delta = av1_compute_qdelta_by_rate(
+ cpi, cm->current_frame.frame_type, base_qindex,
+ aq_c_q_adj_factor[aq_strength][segment]);
+
+ // For AQ complexity mode, we don't allow Q0 in a segment if the base
+ // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
+ // Q delta is sometimes applied without going back around the rd loop.
+ // This could lead to an illegal combination of partition size and q.
+ if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -base_qindex + 1;
+ }
+ if ((base_qindex + qindex_delta) > 0) {
+ av1_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+ av1_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+ }
+ }
+ }
+}
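+// Illustrative outcome (assumed strength 1): segments 0..2 are given rate
+// factors 2.00, 1.50 and 1.15 (typically negative q deltas, i.e. more bits),
+// segment 3 (DEFAULT_AQ2_SEG) keeps the baseline Q, and segment 4 gets
+// factor 0.85 (typically a positive q delta, i.e. fewer bits).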
+
+#define DEFAULT_LV_THRESH 10.0
+#define MIN_DEFAULT_LV_THRESH 8.0
+// Select a segment for the current block.
+// The choice of segment for a block depends on the ratio of the block's
+// projected bits to a target average, and on its spatial complexity.
+void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
+ int mi_row, int mi_col, int projected_rate) {
+ if ((!is_frame_aq_enabled(cpi)) || (!is_sb_aq_enabled(cpi))) return;
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ const int mi_offset = mi_row * cm->mi_params.mi_cols + mi_col;
+ const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_size_wide[bs]);
+ const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_size_high[bs]);
+ int i;
+ unsigned char segment;
+
+ // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
+ // It is converted to bits << AV1_PROB_COST_SHIFT units.
+ const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis)
+ << AV1_PROB_COST_SHIFT;
+ const int denom = cm->seq_params->mib_size * cm->seq_params->mib_size;
+ const int target_rate = (int)(num / denom);
+ double logvar;
+ double low_var_thresh;
+ const int aq_strength = get_aq_c_strength(cm->quant_params.base_qindex,
+ cm->seq_params->bit_depth);
+
+ low_var_thresh =
+ (is_stat_consumption_stage_twopass(cpi))
+ ? AOMMAX(exp(cpi->twopass_frame.mb_av_energy), MIN_DEFAULT_LV_THRESH)
+ : DEFAULT_LV_THRESH;
+
+ av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes, bs);
+ logvar = av1_log_block_var(cpi, mb, bs);
+
+ segment = AQ_C_SEGMENTS - 1; // Just in case no break out below.
+ for (i = 0; i < AQ_C_SEGMENTS; ++i) {
+ // Test rate against a threshold value and variance against a threshold.
+ // Increasing segment number (higher variance and complexity) = higher Q.
+ if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) &&
+ (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
+ segment = i;
+ break;
+ }
+ }
+
+ // Fill in the entries in the segment map corresponding to this SB64.
+ const int mi_stride = cm->mi_params.mi_cols;
+ set_segment_id(cpi->enc_seg.map, mi_offset, xmis, ymis, mi_stride, segment);
+}
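+// Illustrative walk-through (assumed numbers, strength 0): a block whose
+// projected_rate is 0.25 * target_rate and whose logvar is more than 3.0
+// below low_var_thresh fails the segment 0 test (0.25 >= 0.15) but passes
+// segment 1 (0.25 < 0.30 and logvar < low_var_thresh - 3.0), so segment 1
+// is selected.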
diff --git a/third_party/aom/av1/encoder/aq_complexity.h b/third_party/aom/av1/encoder/aq_complexity.h
new file mode 100644
index 0000000000..3421d74c93
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_complexity.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
+#define AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/common/enums.h"
+
+struct AV1_COMP;
+struct macroblock;
+
+// Select a segment for the current Block.
+void av1_caq_select_segment(const struct AV1_COMP *cpi, struct macroblock *,
+ BLOCK_SIZE bs, int mi_row, int mi_col,
+ int projected_rate);
+
+// This function sets up a set of segments with delta Q values around
+// the baseline frame quantizer.
+void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
new file mode 100644
index 0000000000..f48ff11e51
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
@@ -0,0 +1,657 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "av1/common/pred_common.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
+ CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr));
+ if (cr == NULL) return NULL;
+
+ cr->map = aom_calloc(mi_rows * mi_cols, sizeof(*cr->map));
+ cr->counter_encode_maxq_scene_change = 0;
+ cr->percent_refresh_adjustment = 5;
+ cr->rate_ratio_qdelta_adjustment = 0.25;
+ if (cr->map == NULL) {
+ av1_cyclic_refresh_free(cr);
+ return NULL;
+ }
+ return cr;
+}
+
+void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
+ if (cr != NULL) {
+ aom_free(cr->map);
+ aom_free(cr);
+ }
+}
+
+// Check if this coding block, of size bsize, should be considered for refresh
+// (lower-qp coding). The decision can be based on various factors, such as
+// the size of the coding block (blocks below the minimum size are rejected),
+// the coding mode, and the rate/distortion.
+static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
+ const MB_MODE_INFO *mbmi, int64_t rate,
+ int64_t dist, BLOCK_SIZE bsize,
+ int noise_level) {
+ MV mv = mbmi->mv[0].as_mv;
+ int is_compound = has_second_ref(mbmi);
+ // Reject the block for lower-qp coding for non-compound mode if
+ // projected distortion is above the threshold, and any of the following
+ // is true:
+ // 1) mode uses large mv
+ // 2) mode is an intra-mode
+ // Otherwise accept for refresh.
+ if (!is_compound && dist > cr->thresh_dist_sb &&
+ (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
+ mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
+ !is_inter_block(mbmi)))
+ return CR_SEGMENT_ID_BASE;
+ else if ((is_compound && noise_level < kMedium) ||
+ (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb &&
+ is_inter_block(mbmi) && mbmi->mv[0].as_int == 0 &&
+ cr->rate_boost_fac > 10))
+ // More aggressive delta-q for bigger blocks with zero motion.
+ return CR_SEGMENT_ID_BOOST2;
+ else
+ return CR_SEGMENT_ID_BOOST1;
+}
+
+// Compute delta-q for the segment.
+static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) {
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int deltaq = av1_compute_qdelta_by_rate(
+ cpi, cpi->common.current_frame.frame_type, q, rate_factor);
+ if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
+ deltaq = -cr->max_qdelta_perc * q / 100;
+ }
+ return deltaq;
+}
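+// Illustrative example (assumed numbers): with q = 100, max_qdelta_perc = 60
+// and a computed delta of -80, the clamp above limits the delta to -60.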
+
+int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi,
+ double correction_factor) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int base_qindex = cm->quant_params.base_qindex;
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const int mbs = cm->mi_params.MBs;
+ const int num4x4bl = mbs << 4;
+ // Weight for non-base segments: use the actual number of blocks refreshed
+ // in the previous/just encoded frame. Note the block count here is in 4x4
+ // units.
+ double weight_segment1 = (double)cr->actual_num_seg1_blocks / num4x4bl;
+ double weight_segment2 = (double)cr->actual_num_seg2_blocks / num4x4bl;
+ if (cpi->rc.rtc_external_ratectrl) {
+ weight_segment1 = (double)(cr->percent_refresh * cm->mi_params.mi_rows *
+ cm->mi_params.mi_cols / 100) /
+ num4x4bl;
+ weight_segment2 = 0;
+ }
+ // Take segment weighted average for estimated bits.
+ const int estimated_bits =
+ (int)((1.0 - weight_segment1 - weight_segment2) *
+ av1_estimate_bits_at_q(cpi, base_qindex, correction_factor) +
+ weight_segment1 *
+ av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[1],
+ correction_factor) +
+ weight_segment2 *
+ av1_estimate_bits_at_q(cpi, base_qindex + cr->qindex_delta[2],
+ correction_factor));
+ return estimated_bits;
+}
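+// Illustrative reading of the weighted average above (assumed fractions): if
+// 10% of the 4x4 blocks carry the segment 1 delta and 5% the segment 2
+// delta, the estimate is 0.85 * bits(base_q) + 0.10 * bits(base_q + d1) +
+// 0.05 * bits(base_q + d2).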
+
+int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i,
+ double correction_factor) {
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int bits_per_mb;
+ int num4x4bl = cm->mi_params.MBs << 4;
+ // Weight for the segment prior to encoding: take the average of the target
+ // number for the frame to be encoded and the actual number from the
+ // previous frame.
+ double weight_segment =
+ (double)((cr->target_num_seg_blocks + cr->actual_num_seg1_blocks +
+ cr->actual_num_seg2_blocks) >>
+ 1) /
+ num4x4bl;
+ if (cpi->rc.rtc_external_ratectrl) {
+ weight_segment = (double)((cr->target_num_seg_blocks +
+ cr->percent_refresh * cm->mi_params.mi_rows *
+ cm->mi_params.mi_cols / 100) >>
+ 1) /
+ num4x4bl;
+ }
+ // Compute delta-q corresponding to qindex i.
+ int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
+ const int accurate_estimate = cpi->sf.hl_sf.accurate_bit_estimate;
+ // Take segment weighted average for bits per mb.
+ bits_per_mb =
+ (int)((1.0 - weight_segment) *
+ av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, i,
+ correction_factor, accurate_estimate) +
+ weight_segment * av1_rc_bits_per_mb(
+ cpi, cm->current_frame.frame_type, i + deltaq,
+ correction_factor, accurate_estimate));
+ return bits_per_mb;
+}
+
+void av1_cyclic_reset_segment_skip(const AV1_COMP *cpi, MACROBLOCK *const x,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run) {
+ int cdf_num;
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int prev_segment_id = mbmi->segment_id;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw);
+ const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);
+
+ assert(cm->seg.enabled);
+
+ if (!cr->skip_over4x4) {
+ mbmi->segment_id =
+ av1_get_spatial_seg_pred(cm, xd, &cdf_num, cr->skip_over4x4);
+ if (prev_segment_id != mbmi->segment_id) {
+ const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
+ const int mi_stride = cm->mi_params.mi_cols;
+ const uint8_t segment_id = mbmi->segment_id;
+ for (int mi_y = 0; mi_y < ymis; mi_y++) {
+ const int map_offset = block_index + mi_y * mi_stride;
+ memset(&cr->map[map_offset], 0, xmis);
+ memset(&cpi->enc_seg.map[map_offset], segment_id, xmis);
+ memset(&cm->cur_frame->seg_map[map_offset], segment_id, xmis);
+ }
+ }
+ }
+ if (!dry_run) {
+ if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST1)
+ x->actual_num_seg1_blocks -= xmis * ymis;
+ else if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST2)
+ x->actual_num_seg2_blocks -= xmis * ymis;
+ }
+}
+
+void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, MACROBLOCK *const x,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip,
+ RUN_TYPE dry_run) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw);
+ const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);
+ const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
+ int noise_level = 0;
+ if (cpi->noise_estimate.enabled) noise_level = cpi->noise_estimate.level;
+ const int refresh_this_block =
+ candidate_refresh_aq(cr, mbmi, rate, dist, bsize, noise_level);
+ int sh = cpi->cyclic_refresh->skip_over4x4 ? 2 : 1;
+ // Default is to not update the refresh map.
+ int new_map_value = cr->map[block_index];
+
+ // If this block is labeled for refresh, check if we should reset the
+ // segment_id.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+ mbmi->segment_id = refresh_this_block;
+ // Reset segment_id if the block will be skipped.
+ if (skip) mbmi->segment_id = CR_SEGMENT_ID_BASE;
+ }
+ const uint8_t segment_id = mbmi->segment_id;
+
+ // Update the cyclic refresh map, to be used for setting segmentation map
+ // for the next frame. If the block will be refreshed this frame, mark it
+ // as clean. The magnitude of the negative value influences how long it
+ // will be before we consider it for refresh again.
+ if (cyclic_refresh_segment_id_boosted(segment_id)) {
+ new_map_value = -cr->time_for_refresh;
+ } else if (refresh_this_block) {
+ // Else, if it is accepted as a candidate for refresh and is currently
+ // marked as not-a-candidate (1), mark it as a candidate for cleanup
+ // at a future time (0); otherwise don't update it.
+ if (cr->map[block_index] == 1) new_map_value = 0;
+ } else {
+ // Leave it marked as block that is not candidate for refresh.
+ new_map_value = 1;
+ }
+
+ // Update entries in the cyclic refresh map with new_map_value, and
+ // copy mbmi->segment_id into global segmentation map.
+ const int mi_stride = cm->mi_params.mi_cols;
+ for (int mi_y = 0; mi_y < ymis; mi_y += sh) {
+ const int map_offset = block_index + mi_y * mi_stride;
+ memset(&cr->map[map_offset], new_map_value, xmis);
+ memset(&cpi->enc_seg.map[map_offset], segment_id, xmis);
+ memset(&cm->cur_frame->seg_map[map_offset], segment_id, xmis);
+ }
+
+ // Accumulate cyclic refresh update counters.
+ if (!dry_run) {
+ if (cyclic_refresh_segment_id(segment_id) == CR_SEGMENT_ID_BOOST1)
+ x->actual_num_seg1_blocks += xmis * ymis;
+ else if (cyclic_refresh_segment_id(segment_id) == CR_SEGMENT_ID_BOOST2)
+ x->actual_num_seg2_blocks += xmis * ymis;
+ }
+}
+
+// Initializes counters used for cyclic refresh.
+void av1_init_cyclic_refresh_counters(MACROBLOCK *const x) {
+ x->actual_num_seg1_blocks = 0;
+ x->actual_num_seg2_blocks = 0;
+}
+
+// Accumulate cyclic refresh counters.
+void av1_accumulate_cyclic_refresh_counters(
+ CYCLIC_REFRESH *const cyclic_refresh, const MACROBLOCK *const x) {
+ cyclic_refresh->actual_num_seg1_blocks += x->actual_num_seg1_blocks;
+ cyclic_refresh->actual_num_seg2_blocks += x->actual_num_seg2_blocks;
+}
+
+void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ // Set minimum gf_interval for GF update to a multiple of the refresh period,
+ // with some max limit. Depending on past encoding stats, GF flag may be
+ // reset and update may not occur until next baseline_gf_interval.
+ const int gf_length_mult[2] = { 8, 4 };
+ if (cr->percent_refresh > 0)
+ p_rc->baseline_gf_interval =
+ AOMMIN(gf_length_mult[cpi->sf.rt_sf.gf_length_lvl] *
+ (100 / cr->percent_refresh),
+ MAX_GF_INTERVAL_RT);
+ else
+ p_rc->baseline_gf_interval = FIXED_GF_INTERVAL_RT;
+ if (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 40)
+ p_rc->baseline_gf_interval = 16;
+}
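+// Illustrative example (assumed settings): with percent_refresh = 10 and
+// gf_length_lvl = 0, the interval is AOMMIN(8 * (100 / 10),
+// MAX_GF_INTERVAL_RT) = AOMMIN(80, MAX_GF_INTERVAL_RT).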
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
+// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock.
+// Blocks labeled as BOOST1 may later get set to BOOST2 (during the
+// encoding of the superblock).
+static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ unsigned char *const seg_map = cpi->enc_seg.map;
+ int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
+ int xmis, ymis, x, y;
+ uint64_t sb_sad = 0;
+ uint64_t thresh_sad_low = 0;
+ uint64_t thresh_sad = INT64_MAX;
+ const int mi_rows = mi_params->mi_rows, mi_cols = mi_params->mi_cols;
+ const int mi_stride = mi_cols;
+ memset(seg_map, CR_SEGMENT_ID_BASE, mi_rows * mi_cols);
+ sb_cols = (mi_cols + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size;
+ sb_rows = (mi_rows + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size;
+ sbs_in_frame = sb_cols * sb_rows;
+ // Number of target blocks to get the q delta (segment 1).
+ block_count = cr->percent_refresh * mi_rows * mi_cols / 100;
+ // Set the segmentation map: cycle through the superblocks, starting at
+ // cr->sb_index, and stopping when either block_count blocks have been found
+ // to be refreshed, or we have passed through the whole frame.
+ if (cr->sb_index >= sbs_in_frame) cr->sb_index = 0;
+ assert(cr->sb_index < sbs_in_frame);
+ i = cr->sb_index;
+ cr->last_sb_index = cr->sb_index;
+ cr->target_num_seg_blocks = 0;
+ do {
+ int sum_map = 0;
+ // Get the mi_row/mi_col corresponding to superblock index i.
+ int sb_row_index = (i / sb_cols);
+ int sb_col_index = i - sb_row_index * sb_cols;
+ int mi_row = sb_row_index * cm->seq_params->mib_size;
+ int mi_col = sb_col_index * cm->seq_params->mib_size;
+ assert(mi_row >= 0 && mi_row < mi_rows);
+ assert(mi_col >= 0 && mi_col < mi_cols);
+ bl_index = mi_row * mi_stride + mi_col;
+ // Loop through all MI blocks in superblock and update map.
+ xmis = AOMMIN(mi_cols - mi_col, cm->seq_params->mib_size);
+ ymis = AOMMIN(mi_rows - mi_row, cm->seq_params->mib_size);
+ if (cr->use_block_sad_scene_det && cpi->rc.frames_since_key > 30 &&
+ cr->counter_encode_maxq_scene_change > 30 &&
+ cpi->src_sad_blk_64x64 != NULL &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+ sb_sad = cpi->src_sad_blk_64x64[sb_col_index + sb_cols * sb_row_index];
+ int scale = (cm->width * cm->height < 640 * 360) ? 6 : 8;
+ int scale_low = 2;
+ thresh_sad = (scale * 64 * 64);
+ thresh_sad_low = (scale_low * 64 * 64);
+ // For temporal layers: the base temporal layer (temporal_layer_id = 0)
+ // has larger frame separation (2 or 4 frames apart), so use larger sad
+ // thresholds to compensate for larger frame sad. The larger thresholds
+ // also increase the amount of refresh, which is needed for the base
+ // temporal layer.
+ if (cpi->svc.number_temporal_layers > 1 &&
+ cpi->svc.temporal_layer_id == 0) {
+ thresh_sad <<= 4;
+ thresh_sad_low <<= 2;
+ }
+ }
+ // cr_map only needed at 8x8 blocks.
+ for (y = 0; y < ymis; y += 2) {
+ for (x = 0; x < xmis; x += 2) {
+ const int bl_index2 = bl_index + y * mi_stride + x;
+ // If the block is a candidate for cleanup, then mark it
+ // for possible boost/refresh (segment 1). The segment id may get
+ // reset to 0 later if the block gets coded as anything other than
+ // low motion. If the block sad (sb_sad) is very low, label it for
+ // refresh anyway.
+ if (cr->map[bl_index2] == 0 || sb_sad < thresh_sad_low) {
+ sum_map += 4;
+ } else if (cr->map[bl_index2] < 0) {
+ cr->map[bl_index2]++;
+ }
+ }
+ }
+ // Enforce constant segment over superblock.
+ // If segment is at least half of superblock, set to 1.
+ // Enforce that block sad (sb_sad) is not too high.
+ if (sum_map >= (xmis * ymis) >> 1 && sb_sad < thresh_sad) {
+ set_segment_id(seg_map, bl_index, xmis, ymis, mi_stride,
+ CR_SEGMENT_ID_BOOST1);
+ cr->target_num_seg_blocks += xmis * ymis;
+ }
+ i++;
+ if (i == sbs_in_frame) {
+ i = 0;
+ }
+ } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
+ cr->sb_index = i;
+ if (cr->target_num_seg_blocks == 0) {
+ // Disable segmentation, seg_map is already set to 0 above.
+ av1_disable_segmentation(&cm->seg);
+ }
+}
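+// Roughly speaking (illustrative, ignoring the sum_map and sad gating), with
+// percent_refresh = 10 each pass of the loop above labels about 10% of the
+// frame, so a full refresh cycle takes on the order of
+// 100 / percent_refresh = 10 frames.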
+
+static int is_scene_change_detected(AV1_COMP *const cpi) {
+ return cpi->rc.high_source_sad;
+}
+
+// Set cyclic refresh parameters.
+void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
+ // TODO(marpan): Parameters need to be tuned.
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ SVC *const svc = &cpi->svc;
+ const int qp_thresh = AOMMAX(16, rc->best_quality + 4);
+ const int qp_max_thresh = 118 * MAXQ >> 7;
+ const int scene_change_detected = is_scene_change_detected(cpi);
+ const int is_screen_content =
+ (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN);
+
+ // A scene change or key frame marks the start of a cyclic refresh cycle.
+ const int frames_since_scene_change =
+ (cpi->ppi->use_svc || !is_screen_content)
+ ? cpi->rc.frames_since_key
+ : AOMMIN(cpi->rc.frames_since_key,
+ cr->counter_encode_maxq_scene_change);
+
+ // Cases to reset the cyclic refresh adjustment parameters.
+ if (frame_is_intra_only(cm) || scene_change_detected ||
+ cpi->ppi->rtc_ref.bias_recovery_frame) {
+ // Reset adaptive elements for intra only frames and scene changes.
+ cr->percent_refresh_adjustment = 5;
+ cr->rate_ratio_qdelta_adjustment = 0.25;
+ }
+
+ // Although this segment feature for RTC is only used for
+ // blocks >= 8X8, for more efficient coding of the seg map
+ // cur_frame->seg_map needs to be set at 4x4 along with the
+ // function av1_cyclic_reset_segment_skip(). Skipping over
+ // 4x4 therefore has a small bdrate loss (~0.2%), so
+ // we use it only for speed > 9 for now.
+ // Also, if loop-filter deltas are applied via a segment, then
+ // we need to set cr->skip_over4x4 = 1.
+ cr->skip_over4x4 = (cpi->oxcf.speed > 9) ? 1 : 0;
+
+ // Should we enable cyclic refresh on this frame?
+ cr->apply_cyclic_refresh = 1;
+ if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) ||
+ scene_change_detected || svc->temporal_layer_id > 0 ||
+ svc->prev_number_spatial_layers != svc->number_spatial_layers ||
+ p_rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
+ (svc->number_spatial_layers > 1 &&
+ svc->layer_context[svc->temporal_layer_id].is_key_frame) ||
+ (frames_since_scene_change > 20 &&
+ p_rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) ||
+ (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 30 &&
+ frames_since_scene_change > 40) ||
+ cpi->ppi->rtc_ref.bias_recovery_frame) {
+ cr->apply_cyclic_refresh = 0;
+ return;
+ }
+
+ // Increase the amount of refresh if the number of temporal layers is > 2.
+ if (svc->number_temporal_layers > 2)
+ cr->percent_refresh = 15;
+ else
+ cr->percent_refresh = 10 + cr->percent_refresh_adjustment;
+
+ cr->max_qdelta_perc = 60;
+ cr->time_for_refresh = 0;
+ cr->use_block_sad_scene_det =
+ (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
+ cm->seq_params->sb_size == BLOCK_64X64)
+ ? 1
+ : 0;
+ cr->motion_thresh = 32;
+ cr->rate_boost_fac =
+ (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) ? 10 : 15;
+
+ // Use a larger delta-qp (increase rate_ratio_qdelta) for the first few
+ // refresh cycles after a key frame (svc) or scene change (non-svc).
+ // For non-svc screen content, after a scene change gradually reduce
+ // this boost and suppress it further if either of the previous two
+ // frames overshot.
+ if (cr->percent_refresh > 0) {
+ if (cpi->ppi->use_svc || !is_screen_content) {
+ if (frames_since_scene_change <
+ ((4 * svc->number_temporal_layers) * (100 / cr->percent_refresh))) {
+ cr->rate_ratio_qdelta = 3.0 + cr->rate_ratio_qdelta_adjustment;
+ } else {
+ cr->rate_ratio_qdelta = 2.25 + cr->rate_ratio_qdelta_adjustment;
+ }
+ } else {
+ double distance_from_sc_factor =
+ AOMMIN(0.75, (int)(frames_since_scene_change / 10) * 0.1);
+ cr->rate_ratio_qdelta =
+ 3.0 + cr->rate_ratio_qdelta_adjustment - distance_from_sc_factor;
+ if ((frames_since_scene_change < 10) &&
+ ((cpi->rc.rc_1_frame < 0) || (cpi->rc.rc_2_frame < 0))) {
+ cr->rate_ratio_qdelta -= 0.25;
+ }
+ }
+ } else {
+ cr->rate_ratio_qdelta = 2.25 + cr->rate_ratio_qdelta_adjustment;
+ }
+ // Adjust some parameters for low resolutions.
+ if (cm->width * cm->height <= 352 * 288) {
+ if (cpi->svc.number_temporal_layers > 1) {
+ cr->motion_thresh = 32;
+ cr->rate_boost_fac = 13;
+ } else {
+ if (rc->avg_frame_bandwidth < 3000) {
+ cr->motion_thresh = 16;
+ cr->rate_boost_fac = 13;
+ } else {
+ cr->max_qdelta_perc = 50;
+ cr->rate_ratio_qdelta = AOMMAX(cr->rate_ratio_qdelta, 2.0);
+ }
+ }
+ }
+ if (cpi->oxcf.rc_cfg.mode == AOM_VBR) {
+ // To be adjusted for VBR mode, e.g., based on gf period and boost.
+ // For now use smaller qp-delta (than CBR), no second boosted seg, and
+ // turn-off (no refresh) on golden refresh (since it's already boosted).
+ cr->percent_refresh = 10;
+ cr->rate_ratio_qdelta = 1.5;
+ cr->rate_boost_fac = 10;
+ if (cpi->refresh_frame.golden_frame) {
+ cr->percent_refresh = 0;
+ cr->rate_ratio_qdelta = 1.0;
+ }
+ }
+ if (rc->rtc_external_ratectrl) {
+ cr->actual_num_seg1_blocks = cr->percent_refresh * cm->mi_params.mi_rows *
+ cm->mi_params.mi_cols / 100;
+ cr->actual_num_seg2_blocks = 0;
+ }
+}
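+// Illustrative timing (assumed settings): for non-screen content with three
+// temporal layers (percent_refresh = 15), the boosted rate ratio
+// 3.0 + adjustment applies while frames_since_scene_change <
+// 4 * 3 * (100 / 15) = 72, after which it drops to 2.25 + adjustment.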
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ struct segmentation *const seg = &cm->seg;
+ const int scene_change_detected = is_scene_change_detected(cpi);
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+ // Set resolution_change flag: for svc only set it when the
+ // number of spatial layers has not changed.
+ const int resolution_change =
+ cm->prev_frame &&
+ (cm->width != cm->prev_frame->width ||
+ cm->height != cm->prev_frame->height) &&
+ cpi->svc.prev_number_spatial_layers == cpi->svc.number_spatial_layers;
+
+ if (resolution_change) av1_cyclic_refresh_reset_resize(cpi);
+ if (!cr->apply_cyclic_refresh) {
+ // Set segmentation map to 0 and disable.
+ unsigned char *const seg_map = cpi->enc_seg.map;
+ memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+ av1_disable_segmentation(&cm->seg);
+ if (frame_is_intra_only(cm) || scene_change_detected ||
+ cpi->ppi->rtc_ref.bias_recovery_frame) {
+ cr->sb_index = 0;
+ cr->last_sb_index = 0;
+ cr->counter_encode_maxq_scene_change = 0;
+ cr->actual_num_seg1_blocks = 0;
+ cr->actual_num_seg2_blocks = 0;
+ }
+ return;
+ } else {
+ cr->counter_encode_maxq_scene_change++;
+ const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex,
+ cm->seq_params->bit_depth);
+ // Set rate threshold to some multiple (set to 2 for now) of the target
+ // rate (target is given by sb64_target_rate and scaled by 256).
+ cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
+ // Distortion threshold, quadratic in Q, scale factor to be adjusted.
+ // q will not exceed 457, so (q * q) is within 32bit; see:
+ // av1_convert_qindex_to_q(), av1_ac_quant(), ac_qlookup*[].
+ cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
+ // For low resolutions or lower speeds, the rate/dist thresholds need to be
+ // tuned/updated.
+ if (cpi->oxcf.speed <= 7 || (cm->width * cm->height < 640 * 360)) {
+ cr->thresh_dist_sb = 0;
+ cr->thresh_rate_sb = INT64_MAX;
+ }
+ // Set up segmentation.
+ // Clear down the segment map.
+ av1_enable_segmentation(&cm->seg);
+ av1_clearall_segfeatures(seg);
+
+ // Note: setting temporal_update has no effect, as the seg-map coding method
+ // (temporal or spatial) is determined in
+ // av1_choose_segmap_coding_method(),
+ // based on the coding cost of each method. In error_resilient mode the
+ // last_frame_seg_map is set to 0, so if temporal coding is used, it is
+ // relative to an all-zero previous map.
+ // seg->temporal_update = 0;
+
+ // Segment BASE "Q" feature is disabled so it defaults to the baseline Q.
+ av1_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q);
+ // Use segment BOOST1 for in-frame Q adjustment.
+ av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q);
+ // Use segment BOOST2 for more aggressive in-frame Q adjustment.
+ av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);
+
+ // Set the q delta for segment BOOST1.
+ const CommonQuantParams *const quant_params = &cm->quant_params;
+ int qindex_delta =
+ compute_deltaq(cpi, quant_params->base_qindex, cr->rate_ratio_qdelta);
+ cr->qindex_delta[1] = qindex_delta;
+
+ // Compute rd-mult for segment BOOST1.
+ const int qindex2 = clamp(
+ quant_params->base_qindex + quant_params->y_dc_delta_q + qindex_delta,
+ 0, MAXQ);
+ cr->rdmult = av1_compute_rd_mult(
+ qindex2, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
+ av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Set a more aggressive (higher) q delta for segment BOOST2.
+ qindex_delta = compute_deltaq(
+ cpi, quant_params->base_qindex,
+ AOMMIN(CR_MAX_RATE_TARGET_RATIO,
+ 0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
+ cr->qindex_delta[2] = qindex_delta;
+ av1_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Update the segmentation and refresh map.
+ cyclic_refresh_update_map(cpi);
+ }
+}
+
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
+ return cr->rdmult;
+}
+
+void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ memset(cr->map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+ cr->sb_index = 0;
+ cr->last_sb_index = 0;
+ cpi->refresh_frame.golden_frame = true;
+ cr->apply_cyclic_refresh = 0;
+ cr->counter_encode_maxq_scene_change = 0;
+ cr->percent_refresh_adjustment = 5;
+ cr->rate_ratio_qdelta_adjustment = 0.25;
+}
+
+int av1_cyclic_refresh_disable_lf_cdef(AV1_COMP *const cpi) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const int qindex = cpi->common.quant_params.base_qindex;
+ if (cpi->rc.frames_since_key > 30 && cr->percent_refresh > 0 &&
+ cr->counter_encode_maxq_scene_change > 300 / cr->percent_refresh &&
+ cpi->rc.frame_source_sad < 1000 &&
+ qindex < 7 * (cpi->rc.worst_quality >> 3))
+ return 1;
+ // More aggressive skip.
+ else if (cpi->sf.rt_sf.skip_lf_screen > 1 && !cpi->rc.high_source_sad &&
+ cpi->rc.frame_source_sad < 50000 && qindex < cpi->rc.worst_quality)
+ return 1;
+ return 0;
+}
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.h b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
new file mode 100644
index 0000000000..10974f018b
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#define AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/tokenize.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The segment ids used in cyclic refresh: from base (no boost) to increasing
+// boost (higher delta-qp).
+#define CR_SEGMENT_ID_BASE 0
+#define CR_SEGMENT_ID_BOOST1 1
+#define CR_SEGMENT_ID_BOOST2 2
+
+// Maximum rate target ratio for setting segment delta-qp.
+#define CR_MAX_RATE_TARGET_RATIO 4.0
+
+/*!
+ * \brief The structure of CYCLIC_REFRESH.
+ * \ingroup cyclic_refresh
+ */
+struct CYCLIC_REFRESH {
+ /*!
+ * Percentage of blocks per frame that are targeted as candidates
+ * for cyclic refresh.
+ */
+ int percent_refresh;
+
+ /*!
+ * Active adjustment delta for cyclic refresh for rate control.
+ */
+ int percent_refresh_adjustment;
+
+ /*!
+ * Maximum q-delta as percentage of base q.
+ */
+ int max_qdelta_perc;
+ /*!
+ * Superblock starting index for cycling through the frame.
+ */
+ int sb_index;
+ /*!
+ * Superblock refresh index from the last encoded frame.
+ */
+ int last_sb_index;
+ /*!
+ * Controls how long a block will need to wait to be refreshed again, in
+ * excess of the cycle time, i.e., in the case of all zero motion, a block
+ * will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+ */
+ int time_for_refresh;
+ /*!
+ * Target number of (4x4) blocks that are set for delta-q.
+ */
+ int target_num_seg_blocks;
+ /*!
+ * Actual number of (4x4) blocks that were applied delta-q,
+ * for segment 1.
+ */
+ int actual_num_seg1_blocks;
+ /*!
+ * Actual number of (4x4) blocks that were applied delta-q,
+ * for segment 2.
+ */
+ int actual_num_seg2_blocks;
+ /*!
+ * RD mult. parameters for segment 1.
+ */
+ int rdmult;
+ /*!
+ * Cyclic refresh map.
+ */
+ int8_t *map;
+ /*!
+ * Threshold applied to the projected rate of the coding block,
+ * when deciding whether the block should be refreshed.
+ */
+ int64_t thresh_rate_sb;
+ /*!
+ * Threshold applied to the projected distortion of the coding block,
+ * when deciding whether the block should be refreshed.
+ */
+ int64_t thresh_dist_sb;
+ /*!
+ * Threshold applied to the motion vector (in units of 1/8 pel) of the
+ * coding block, when deciding whether the block should be refreshed.
+ */
+ int16_t motion_thresh;
+ /*!
+ * Rate target ratio to set q delta.
+ */
+ double rate_ratio_qdelta;
+
+ /*!
+ * Active adjustment of qdelta rate ratio for enhanced rate control
+ */
+ double rate_ratio_qdelta_adjustment;
+
+ /*!
+ * Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+ */
+ int rate_boost_fac;
+
+ /*!\cond */
+ int qindex_delta[3];
+ int apply_cyclic_refresh;
+ int skip_over4x4;
+ int counter_encode_maxq_scene_change;
+ int use_block_sad_scene_det;
+ /*!\endcond */
+};
+
+struct AV1_COMP;
+
+typedef struct CYCLIC_REFRESH CYCLIC_REFRESH;
+
+CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols);
+
+void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr);
+
+/*!\brief Estimate the bits, incorporating the delta-q from the segments.
+ *
+ * For the just encoded frame, estimate the bits, incorporating the delta-q
+ * from non-base segment(s). Note this function is called in the postencode
+ * (called from rc_update_rate_correction_factors()).
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] correction_factor rate correction factor
+ *
+ * \return Return the estimated bits at given q.
+ */
+int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi,
+ double correction_factor);
+
+/*!\brief Estimate the bits per mb, for given q = i and delta-q.
+ *
+ * Prior to encoding the frame, estimate the bits per mb, for a given q = i and
+ * a corresponding delta-q (for segment 1). This function is called in the
+ * rc_regulate_q() to set the base qp index. Note: the segment map is set to
+ * either 0/CR_SEGMENT_ID_BASE (no refresh) or to 1/CR_SEGMENT_ID_BOOST1
+ * (refresh) for each superblock, prior to encoding.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] i q index
+ * \param[in] correction_factor rate correction factor
+ *
+ * \return Return the estimated bits for q = i and delta-q (segment 1).
+ */
+int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i,
+ double correction_factor);
+
+/*!\brief Update segment_id for blocks that are skipped.
+ *
+ * After encoding a given prediction block, of size bsize at (mi_row, mi_col),
+ * check if we should reset the segment_id based on skip_txfm,
+ * and update the cyclic_refresh map and segmentation counters.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] x Pointer to MACROBLOCK structure
+ * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE
+ * \param[in] mi_col Col coordinate of the block in a step size of MI_SIZE
+ * \param[in] bsize Block size
+ * \param[in] dry_run A code indicating whether it is part of the final
+ * pass for reconstructing the superblock
+ *
+ * \remark Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and
+ * the \c cpi->enc_seg.map.
+ */
+
+void av1_cyclic_reset_segment_skip(const struct AV1_COMP *cpi,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, RUN_TYPE dry_run);
+
+/*!\brief Update segment_id for block based on mode selected.
+ *
+ * Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+ * check if we should reset the segment_id (based on mode/motion/skip selected
+ * for that block) and update the cyclic_refresh map and segmentation map.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] x Pointer to MACROBLOCK structure
+ * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE
+ * \param[in] mi_col Col coordinate of the block in a step size of MI_SIZE
+ * \param[in] bsize Block size
+ * \param[in] rate Projected block rate from pickmode
+ * \param[in] dist Projected block dist from pickmode
+ * \param[in]   skip       Skip flag set from pickmode
+ * \param[in] dry_run A code indicating whether it is part of the final
+ * pass for reconstructing the superblock
+ *
+ * \remark Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and
+ * the \c cpi->enc_seg.map.
+ */
+void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi,
+ MACROBLOCK *const x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip,
+ RUN_TYPE dry_run);
+
+/*!\brief Initialize counters used for cyclic refresh.
+ *
+ * Initializes cyclic refresh counters actual_num_seg1_blocks and
+ * actual_num_seg2_blocks.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] x Pointer to MACROBLOCK structure
+ *
+ * \remark Update the \c x->actual_num_seg1_blocks and the
+ * \c x->actual_num_seg2_blocks.
+ */
+void av1_init_cyclic_refresh_counters(MACROBLOCK *const x);
+
+/*!\brief Accumulate cyclic refresh counters.
+ *
+ * Accumulates the cyclic refresh counters actual_num_seg1_blocks and
+ * actual_num_seg2_blocks from the MACROBLOCK structure into the
+ * CYCLIC_REFRESH structure.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cyclic_refresh Pointer to CYCLIC_REFRESH structure
+ * \param[in] x Pointer to MACROBLOCK structure
+ *
+ * \remark Update the \c cyclic_refresh->actual_num_seg1_blocks and the
+ * \c cyclic_refresh->actual_num_seg2_blocks.
+ */
+void av1_accumulate_cyclic_refresh_counters(
+ CYCLIC_REFRESH *const cyclic_refresh, const MACROBLOCK *const x);
+
+/*!\brief Set golden frame update interval based on cyclic refresh.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Returns the interval in \c cpi->ppi->p_rc.baseline_gf_interval.
+ */
+void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi);
+
+/*!\brief Set the global/frame level parameters for cyclic refresh.
+ *
+ * First call to the cyclic refresh, before encoding the frame.
+ * Sets the flag on whether cyclic refresh should be applied, sets
+ * the amount/percent of refresh, and the amount of boost applied to
+ * the two segments (set by rate_ratio_qdelta and rate_boost_fac).
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Updates the \c cpi->cyclic_refresh with the settings.
+ */
+void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi);
+
+/*!\brief Setup the cyclic background refresh.
+ *
+ * Set the delta q for the segment(s), and set the segmentation map.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Updates the \c cpi->cyclic_refresh with the cyclic refresh
+ * parameters and the \c cm->seg with the segmentation data.
+ */
+void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi);
+
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+
+void av1_cyclic_refresh_reset_resize(struct AV1_COMP *const cpi);
+
+int av1_cyclic_refresh_disable_lf_cdef(struct AV1_COMP *const cpi);
+
+static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
+ return segment_id == CR_SEGMENT_ID_BOOST1 ||
+ segment_id == CR_SEGMENT_ID_BOOST2;
+}
+
+static INLINE int cyclic_refresh_segment_id(int segment_id) {
+ if (segment_id == CR_SEGMENT_ID_BOOST1)
+ return CR_SEGMENT_ID_BOOST1;
+ else if (segment_id == CR_SEGMENT_ID_BOOST2)
+ return CR_SEGMENT_ID_BOOST2;
+ else
+ return CR_SEGMENT_ID_BASE;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c
new file mode 100644
index 0000000000..086928a118
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_variance.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "aom_ports/mem.h"
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/dwt.h"
+
+static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0,
+ 0.9, .8, .7, .6 };
+
+static const double deltaq_rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0,
+ 0.75, 1.0, 1.0, 1.0 };
+#define ENERGY_MIN (-4)
+#define ENERGY_MAX (1)
+#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
+#define ENERGY_IN_BOUNDS(energy) \
+ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
+
+DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+DECLARE_ALIGNED(16, static const uint16_t,
+ av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
+
+#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
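+// Illustrative mapping: energy levels -4..1 index segment_id[] as
+// { 0, 1, 1, 2, 3, 4 }, e.g. SEGMENT_ID(-4) == 0 and SEGMENT_ID(1) == 4.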
+
+void av1_vaq_frame_setup(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const int base_qindex = cm->quant_params.base_qindex;
+ struct segmentation *seg = &cm->seg;
+ int i;
+
+ int resolution_change =
+ cm->prev_frame && (cm->width != cm->prev_frame->width ||
+ cm->height != cm->prev_frame->height);
+ int avg_energy = (int)(cpi->twopass_frame.mb_av_energy - 2);
+ double avg_ratio;
+ if (avg_energy > 7) avg_energy = 7;
+ if (avg_energy < 0) avg_energy = 0;
+ avg_ratio = rate_ratio[avg_energy];
+
+ if (resolution_change) {
+ memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+ av1_clearall_segfeatures(seg);
+ av1_disable_segmentation(seg);
+ return;
+ }
+ if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+ refresh_frame->alt_ref_frame ||
+ (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+ cpi->vaq_refresh = 1;
+
+ av1_enable_segmentation(seg);
+ av1_clearall_segfeatures(seg);
+
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ // Set up avg segment id to be 1.0 and adjust the other segments around
+ // it.
+ int qindex_delta =
+ av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type,
+ base_qindex, rate_ratio[i] / avg_ratio);
+
+ // We don't allow qindex 0 in a segment if the base value is not 0.
+ // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
+ // Q delta is sometimes applied without going back around the rd loop.
+ // This could lead to an illegal combination of partition size and q.
+ if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -base_qindex + 1;
+ }
+
+ av1_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta);
+ av1_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+ }
+ }
+}
+
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+ // This function returns a score for the block's local variance, calculated
+ // as the sum of the log of the (4x4) variances of each subblock of the
+ // current block (x, bs), scaled by 32 / the number of pixels in the block.
+ // This is used for segmentation to avoid situations in which a large
+ // block with a gentle gradient gets marked as high variance even though
+ // each subblock has a low variance. This allows us to assign the same
+ // segment number to the same sorts of area regardless of how the
+ // partitioning goes.
+
+ MACROBLOCKD *xd = &x->e_mbd;
+ double var = 0;
+ unsigned int sse;
+ int i, j;
+
+ int right_overflow =
+ (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+ int bottom_overflow =
+ (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+
+ const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+ const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+
+ for (i = 0; i < bh; i += 4) {
+ for (j = 0; j < bw; j += 4) {
+ if (is_cur_buf_hbd(xd)) {
+ var += log1p(cpi->ppi->fn_ptr[BLOCK_4X4].vf(
+ x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+ x->plane[0].src.stride,
+ CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) /
+ 16.0);
+ } else {
+ var += log1p(cpi->ppi->fn_ptr[BLOCK_4X4].vf(
+ x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+ x->plane[0].src.stride, av1_all_zeros, 0, &sse) /
+ 16.0);
+ }
+ }
+ }
+ // Use the average of the 4x4 log variances. The range for 8-bit input is
+ // 0 - 9.704121561.
+ var /= (bw / 4 * bh / 4);
+ if (var > 7) var = 7;
+
+ return (int)(var);
+}
+
+int av1_log_block_avg(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+ int mi_row, int mi_col) {
+ // This function returns the average value of the luma block.
+ unsigned int sum, avg, num_pix;
+ int r, c;
+ const int pic_w = cpi->common.width;
+ const int pic_h = cpi->common.height;
+ const int bw = MI_SIZE * mi_size_wide[bs];
+ const int bh = MI_SIZE * mi_size_high[bs];
+ const uint16_t *x16 = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+ sum = 0;
+ num_pix = 0;
+ avg = 0;
+ int row = mi_row << MI_SIZE_LOG2;
+ int col = mi_col << MI_SIZE_LOG2;
+ for (r = row; (r < (row + bh)) && (r < pic_h); r++) {
+ for (c = col; (c < (col + bw)) && (c < pic_w); c++) {
+ sum += *(x16 + r * x->plane[0].src.stride + c);
+ num_pix++;
+ }
+ }
+ if (num_pix != 0) {
+ avg = sum / num_pix;
+ }
+ return avg;
+}
+
+#define DEFAULT_E_MIDPOINT 10.0
+
+static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ int stride = x->plane[0].src.stride;
+ uint8_t *buf = x->plane[0].src.buf;
+ const int num_8x8_cols = block_size_wide[bs] / 8;
+ const int num_8x8_rows = block_size_high[bs] / 8;
+ const int hbd = is_cur_buf_hbd(xd);
+
+ int64_t var = av1_haar_ac_sad_mxn_uint8_input(buf, stride, hbd, num_8x8_rows,
+ num_8x8_cols);
+
+ return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs];
+}
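+// The (var * 256) >> num_pels_log2_lookup[bs] step above normalizes the Haar
+// AC SAD to a per-pixel value in 1/256 units; e.g. for a 16x16 block
+// (256 pixels, log2 = 8) the result is simply var (illustrative).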
+
+static double log_block_wavelet_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+ unsigned int haar_sad = haar_ac_energy(x, bs);
+ return log1p(haar_sad);
+}
+
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bs) {
+ double energy, energy_midpoint;
+ energy_midpoint = (is_stat_consumption_stage_twopass(cpi))
+ ? cpi->twopass_frame.frame_avg_haar_energy
+ : DEFAULT_E_MIDPOINT;
+ energy = log_block_wavelet_energy(x, bs) - energy_midpoint;
+ return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
+}
+
+int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
+ int block_var_level) {
+ int rate_level;
+ const AV1_COMMON *const cm = &cpi->common;
+
+ if (DELTA_Q_PERCEPTUAL_MODULATION == 1) {
+ ENERGY_IN_BOUNDS(block_var_level);
+ rate_level = SEGMENT_ID(block_var_level);
+ } else {
+ rate_level = block_var_level;
+ }
+ const int base_qindex = cm->quant_params.base_qindex;
+ int qindex_delta =
+ av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type, base_qindex,
+ deltaq_rate_ratio[rate_level]);
+
+ if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -base_qindex + 1;
+ }
+ return base_qindex + qindex_delta;
+}
diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h
new file mode 100644
index 0000000000..aa0535ad72
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_variance.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_VARIANCE_H_
+#define AOM_AV1_ENCODER_AQ_VARIANCE_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_vaq_frame_setup(AV1_COMP *cpi);
+
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+int av1_log_block_avg(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+ int mi_row, int mi_col);
+int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
+ int block_var_level);
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bs);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AQ_VARIANCE_H_
diff --git a/third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c b/third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c
new file mode 100644
index 0000000000..91fc1e00a5
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/crc32/hash_arm_crc32.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#include <intrin.h>
+#else
+#include <arm_acle.h>
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "config/aom_config.h"
+
+#define CRC_LOOP(op, crc, type, buf, len) \
+ while ((len) >= sizeof(type)) { \
+ (crc) = op((crc), *(type *)(buf)); \
+ (len) -= sizeof(type); \
+ buf += sizeof(type); \
+ }
+
+#define CRC_SINGLE(op, crc, type, buf, len) \
+ if ((len) >= sizeof(type)) { \
+ (crc) = op((crc), *(type *)(buf)); \
+ (len) -= sizeof(type); \
+ buf += sizeof(type); \
+ }
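+
+// Taken together, these helpers consume the buffer greedily: 8 bytes at a
+// time, then at most one 4-byte, one 2-byte and one 1-byte tail step. For an
+// (illustrative) 15-byte buffer that is one of each step: 8 + 4 + 2 + 1.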
+
+/* Return 32-bit CRC for the input buffer.
+ * Polynomial is 0x1EDC6F41.
+ */
+
+uint32_t av1_get_crc32c_value_arm_crc32(void *crc_calculator, uint8_t *p,
+ size_t len) {
+ (void)crc_calculator;
+ const uint8_t *buf = p;
+ uint32_t crc = 0xFFFFFFFF;
+
+#if !AOM_ARCH_AARCH64
+ // Align input to an 8-byte boundary (only necessary for 32-bit builds).
+ while (len && ((uintptr_t)buf & 7)) {
+ crc = __crc32cb(crc, *buf++);
+ len--;
+ }
+#endif
+
+ CRC_LOOP(__crc32cd, crc, uint64_t, buf, len)
+ CRC_SINGLE(__crc32cw, crc, uint32_t, buf, len)
+ CRC_SINGLE(__crc32ch, crc, uint16_t, buf, len)
+ CRC_SINGLE(__crc32cb, crc, uint8_t, buf, len)
+
+ return ~crc;
+}
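+
+// Usage sketch (illustrative; the crc_calculator argument is unused by this
+// implementation and may be NULL):
+//   uint32_t crc = av1_get_crc32c_value_arm_crc32(NULL, buf, buf_len);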
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_error_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_error_neon.c
new file mode 100644
index 0000000000..26d06b46fe
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_error_neon.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ uint64x2_t err_u64 = vdupq_n_u64(0);
+ int64x2_t ssz_s64 = vdupq_n_s64(0);
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+ const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+ const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+ const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0));
+ const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1));
+
+ // By operating on unsigned integers we can accumulate up to four squared
+ // differences in a 32-bit element before having to widen to 64 bits.
+ uint32x4_t err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0));
+ err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0));
+ err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1));
+ err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1));
+ err_u64 = vpadalq_u32(err_u64, err);
+
+ // We can't do the same here as we're operating on signed integers, so we
+ // can only accumulate 2 squares.
+ int32x4_t ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0));
+ ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0));
+ ssz_s64 = vpadalq_s32(ssz_s64, ssz0);
+
+ int32x4_t ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1));
+ ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1));
+ ssz_s64 = vpadalq_s32(ssz_s64, ssz1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ *ssz = horizontal_add_s64x2(ssz_s64);
+ return (int64_t)horizontal_add_u64x2(err_u64);
+}
+
+int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff,
+ int block_size) {
+ uint64x2_t err_u64 = vdupq_n_u64(0);
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int16x8_t c0 = vld1q_s16(coeff);
+ const int16x8_t c1 = vld1q_s16(coeff + 8);
+ const int16x8_t d0 = vld1q_s16(dqcoeff);
+ const int16x8_t d1 = vld1q_s16(dqcoeff + 8);
+
+ const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0));
+ const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1));
+
+ // By operating on unsigned integers we can accumulate up to four squared
+ // differences in a 32-bit element before having to widen to 64 bits.
+ uint32x4_t err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0));
+ err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0));
+ err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1));
+ err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1));
+ err_u64 = vpadalq_u32(err_u64, err);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ return (int64_t)horizontal_add_u64x2(err_u64);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c b/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c
new file mode 100644
index 0000000000..63aad0b785
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_error_sve.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+
+int64_t av1_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+ int64x2_t sqcoeff[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+ const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+ const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+ const int16x8_t diff0 = vsubq_s16(c0, d0);
+ const int16x8_t diff1 = vsubq_s16(c1, d1);
+
+ error[0] = aom_sdotq_s16(error[0], diff0, diff0);
+ error[1] = aom_sdotq_s16(error[1], diff1, diff1);
+ sqcoeff[0] = aom_sdotq_s16(sqcoeff[0], c0, c0);
+ sqcoeff[1] = aom_sdotq_s16(sqcoeff[1], c1, c1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ *ssz = vaddvq_s64(vaddq_s64(sqcoeff[0], sqcoeff[1]));
+ return vaddvq_s64(vaddq_s64(error[0], error[1]));
+}
+
+int64_t av1_block_error_lp_sve(const int16_t *coeff, const int16_t *dqcoeff,
+ int block_size) {
+ if (block_size % 32 == 0) {
+ int64x2_t error[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+ vdupq_n_s64(0) };
+
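+    // Four independent accumulators are used so that consecutive dot products
+    // do not all serialize on a single register; they are reduced to one sum
+    // after the loop.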
+ do {
+ const int16x8_t c0 = vld1q_s16(coeff);
+ const int16x8_t c1 = vld1q_s16(coeff + 8);
+ const int16x8_t c2 = vld1q_s16(coeff + 16);
+ const int16x8_t c3 = vld1q_s16(coeff + 24);
+ const int16x8_t d0 = vld1q_s16(dqcoeff);
+ const int16x8_t d1 = vld1q_s16(dqcoeff + 8);
+ const int16x8_t d2 = vld1q_s16(dqcoeff + 16);
+ const int16x8_t d3 = vld1q_s16(dqcoeff + 24);
+
+ const int16x8_t diff0 = vsubq_s16(c0, d0);
+ const int16x8_t diff1 = vsubq_s16(c1, d1);
+ const int16x8_t diff2 = vsubq_s16(c2, d2);
+ const int16x8_t diff3 = vsubq_s16(c3, d3);
+
+ error[0] = aom_sdotq_s16(error[0], diff0, diff0);
+ error[1] = aom_sdotq_s16(error[1], diff1, diff1);
+ error[2] = aom_sdotq_s16(error[2], diff2, diff2);
+ error[3] = aom_sdotq_s16(error[3], diff3, diff3);
+
+ coeff += 32;
+ dqcoeff += 32;
+ block_size -= 32;
+ } while (block_size != 0);
+
+ error[0] = vaddq_s64(error[0], error[1]);
+ error[2] = vaddq_s64(error[2], error[3]);
+ error[0] = vaddq_s64(error[0], error[2]);
+ return vaddvq_s64(error[0]);
+ }
+ assert(block_size == 16);
+
+ int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ const int16x8_t c0 = vld1q_s16(coeff);
+ const int16x8_t c1 = vld1q_s16(coeff + 8);
+ const int16x8_t d0 = vld1q_s16(dqcoeff);
+ const int16x8_t d1 = vld1q_s16(dqcoeff + 8);
+
+ const int16x8_t diff0 = vsubq_s16(c0, d0);
+ const int16x8_t diff1 = vsubq_s16(c1, d1);
+
+ error[0] = aom_sdotq_s16(error[0], diff0, diff0);
+ error[1] = aom_sdotq_s16(error[1], diff1, diff1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ return vaddvq_s64(vaddq_s64(error[0], error[1]));
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
new file mode 100644
index 0000000000..5148ee74a9
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
@@ -0,0 +1,3090 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "shift_neon.h"
+#include "txfm_neon.h"
+
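+// All butterfly weights in this file are Q2.13 fixed-point values, i.e. they
+// carry at most 13 fractional bits.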
+#define TXFM_COS_BIT_MAX 13
+
+// A note on butterfly helper naming:
+//
+// butterfly_[input_ty]_[acc_ty]_[input_num]_[weight_num]_[weight_neg]_neon
+// e.g. butterfly_s32_s32_x4_0231_neon
+// | | | ^ Weights are applied as indices 0, 2, 3, 1
+// | | | (see more detail below)
+// | | ^ (int32)x4 input/output parameters
+// | ^ 32-bit accumulators internally
+// ^ 32-bit input/output parameters
+//
+// Weights are stored as 4-tuples in Q2.13 format as (w0, 1-w0, -w0, w0-1) to
+// avoid needing separate negation instructions. This is represented in the
+// helper naming by referring to the lane index in the loaded tuple that each
+// multiply is performed with:
+//
+// in0 in1
+// /----------
+// out0 | w0 w1 ==> out0 = in0 * w0 + in1 * w1
+// out1 | w2 w3 ==> out1 = in0 * w2 + in1 * w3
+//
+// So for indices 0231 from the earlier example, we end up with:
+//
+// in0 in1
+// /------------------
+// out0 | (lane 0) (lane 2) ==> out0 = in0 * w0 + in1 * -w0
+// out1 | (lane 3) (lane 1) ==> out1 = in0 * (w0-1) + in1 * (1-w0)
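+//
+// As a purely illustrative example of the Q2.13 encoding (where 1.0 is
+// represented as 1 << 13 = 8192): a weight w0 = cos(pi/4) ~= 0.7071 would be
+// stored as round(0.7071 * 8192) = 5793, and 1 - w0 as 8192 - 5793 = 2399.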
+
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0112_neon(
+ const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+ int32x4_t *out0, int32x4_t *out1) {
+ int32x4_t w0101 = vmovl_s16(w0101_s16);
+ int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
+ o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 1);
+ int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
+ o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
+ *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+ *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
+}
+
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0332_neon(
+ const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+ int32x4_t *out0, int32x4_t *out1) {
+ int32x4_t w0101 = vmovl_s16(w0101_s16);
+ int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
+ o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 1);
+ int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 1);
+ o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
+ *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+ *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
+}
+
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1003_neon(
+ const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+ int32x4_t *out0, int32x4_t *out1) {
+ int32x4_t w0101 = vmovl_s16(w0101_s16);
+ int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
+ o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 0);
+ int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
+ o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
+ *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+ *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
+}
+
+static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1223_neon(
+ const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
+ int32x4_t *out0, int32x4_t *out1) {
+ int32x4_t w0101 = vmovl_s16(w0101_s16);
+ int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
+ o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 0);
+ int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 0);
+ o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
+ *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
+ *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
+}
+
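+// Implemented as a macro rather than a function taking lane indices, since
+// the lane arguments to the NEON lane-indexed multiplies must be
+// compile-time constants.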
+#define butterfly_s16_s32_x4_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
+ out0, out1) \
+ do { \
+ int32x4_t u0 = vmull_lane_s16(in0, wvec, lane0); \
+ u0 = vmlal_lane_s16(u0, in1, wvec, lane1); \
+ int32x4_t v0 = vmull_lane_s16(in0, wvec, lane2); \
+ v0 = vmlal_lane_s16(v0, in1, wvec, lane3); \
+ *out0 = vqrshrn_n_s32(u0, TXFM_COS_BIT_MAX); \
+ *out1 = vqrshrn_n_s32(v0, TXFM_COS_BIT_MAX); \
+ } while (0)
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0112_neon(
+ const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+ int16x4_t *out0, int16x4_t *out1) {
+ butterfly_s16_s32_x4_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0332_neon(
+ const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+ int16x4_t *out0, int16x4_t *out1) {
+ butterfly_s16_s32_x4_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1003_neon(
+ const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+ int16x4_t *out0, int16x4_t *out1) {
+ butterfly_s16_s32_x4_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1223_neon(
+ const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
+ int16x4_t *out0, int16x4_t *out1) {
+ butterfly_s16_s32_x4_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
+}
+
+#define butterfly_s16_s32_x8_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
+ out0, out1) \
+ do { \
+ int32x4_t u0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane0); \
+ u0 = vmlal_lane_s16(u0, vget_low_s16(in1), wvec, lane1); \
+ int32x4_t u1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane0); \
+ u1 = vmlal_lane_s16(u1, vget_high_s16(in1), wvec, lane1); \
+ int32x4_t v0 = vmull_lane_s16(vget_low_s16(in0), wvec, lane2); \
+ v0 = vmlal_lane_s16(v0, vget_low_s16(in1), wvec, lane3); \
+ int32x4_t v1 = vmull_lane_s16(vget_high_s16(in0), wvec, lane2); \
+ v1 = vmlal_lane_s16(v1, vget_high_s16(in1), wvec, lane3); \
+ const int16x4_t c0 = vrshrn_n_s32(u0, TXFM_COS_BIT_MAX); \
+ const int16x4_t c1 = vrshrn_n_s32(u1, TXFM_COS_BIT_MAX); \
+ const int16x4_t d0 = vrshrn_n_s32(v0, TXFM_COS_BIT_MAX); \
+ const int16x4_t d1 = vrshrn_n_s32(v1, TXFM_COS_BIT_MAX); \
+ *out0 = vcombine_s16(c0, c1); \
+ *out1 = vcombine_s16(d0, d1); \
+ } while (0)
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0112_neon(
+ const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+ int16x8_t *out0, int16x8_t *out1) {
+ butterfly_s16_s32_x8_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0332_neon(
+ const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+ int16x8_t *out0, int16x8_t *out1) {
+ butterfly_s16_s32_x8_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1003_neon(
+ const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+ int16x8_t *out0, int16x8_t *out1) {
+ butterfly_s16_s32_x8_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1223_neon(
+ const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
+ int16x8_t *out0, int16x8_t *out1) {
+ butterfly_s16_s32_x8_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
+}
+
+static AOM_FORCE_INLINE void flip_buf_4_neon(int16x4_t *in, int16x4_t *out,
+ int size) {
+ for (int i = 0; i < size; ++i) {
+ out[size - i - 1] = in[i];
+ }
+}
+
+static AOM_FORCE_INLINE void flip_buf_8_neon(int16x8_t *in, int16x8_t *out,
+ int size) {
+ for (int i = 0; i < size; ++i) {
+ out[size - i - 1] = in[i];
+ }
+}
+
+static AOM_FORCE_INLINE void store_buffer_interleaved_s32_x8(
+ int32_t *const out, const int32x4_t *const in1, const int32x4_t *const in2,
+ const int stride, const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ vst1q_s32(out + stride * i, in1[i]);
+ vst1q_s32(out + stride * i + 4, in2[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void load_buffer_s16_x4(const int16_t *in,
+ const int stride,
+ int16x4_t *const out,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = vld1_s16(in);
+ in += stride;
+ }
+}
+
+static AOM_FORCE_INLINE void load_buffer_s16_x8(const int16_t *in, int stride,
+ int16x8_t *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = vld1q_s16(in + i * stride);
+ }
+}
+
+static AOM_FORCE_INLINE void store_buffer_s16_x4(const int16x4_t *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ vst1q_s32(out + i * stride, vmovl_s16(in[i]));
+ }
+}
+
+static AOM_FORCE_INLINE void store_buffer_s16_x8(const int16x8_t *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ vst1q_s32(out + i * stride + 0, vmovl_s16(vget_low_s16(in[i])));
+ vst1q_s32(out + i * stride + 4, vmovl_s16(vget_high_s16(in[i])));
+ }
+}
+
+// A note on naming:
+// round_shift_[sqrt2]_s16_s32_4x1_neon(...)
+// | | | ^ 1 => a single vector
+// | | | n => an array of vectors
+// | | | ^ input/output vector element count
+// | | ^ output type
+// | ^ input type
+// ^ multiplicand and shift identifier
+
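+// NewSqrt2 is sqrt(2) in fixed point, round(sqrt(2) * 2^NewSqrt2Bits) = 5793
+// with NewSqrt2Bits == 12, so the helpers below compute round(a * sqrt(2))
+// (or round(a * 2 * sqrt(2))) via a rounding right shift.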
+static AOM_FORCE_INLINE int16x4_t
+round_shift_sqrt2_s16_s16_4x1_neon(int16x4_t a) {
+ return vqrshrn_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
+}
+
+static AOM_FORCE_INLINE int16x8_t
+round_shift_sqrt2_s16_s16_8x1_neon(int16x8_t a) {
+ return vcombine_s16(round_shift_sqrt2_s16_s16_4x1_neon(vget_low_s16(a)),
+ round_shift_sqrt2_s16_s16_4x1_neon(vget_high_s16(a)));
+}
+
+static AOM_FORCE_INLINE int16x4_t
+round_shift_2sqrt2_s16_s16_4x1_neon(int16x4_t a) {
+ return vqrshrn_n_s32(vmull_n_s16(a, 2 * NewSqrt2), NewSqrt2Bits);
+}
+
+static AOM_FORCE_INLINE int16x8_t
+round_shift_2sqrt2_s16_s16_8x1_neon(int16x8_t a) {
+ return vcombine_s16(round_shift_2sqrt2_s16_s16_4x1_neon(vget_low_s16(a)),
+ round_shift_2sqrt2_s16_s16_4x1_neon(vget_high_s16(a)));
+}
+
+static AOM_FORCE_INLINE int32x4_t
+round_shift_sqrt2_s16_s32_4x1_neon(int16x4_t a) {
+ return vrshrq_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
+}
+
+static AOM_FORCE_INLINE int32x4_t
+round_shift_sqrt2_s32_s32_4x1_neon(int32x4_t a) {
+ return vrshrq_n_s32(vmulq_n_s32(a, NewSqrt2), NewSqrt2Bits);
+}
+
+#define ROUND_SHIFT_SQRT_LOOP_HELPER(name, type0, type1, fn) \
+ static AOM_FORCE_INLINE void name(const type0 *in, type1 *out, int size) { \
+ for (int i = 0; i < size; ++i) { \
+ out[i] = fn(in[i]); \
+ } \
+ }
+
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s32_s32_4xn_neon, int32x4_t,
+ int32x4_t, round_shift_sqrt2_s32_s32_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_4xn_neon, int16x4_t,
+ int16x4_t, round_shift_sqrt2_s16_s16_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_8xn_neon, int16x8_t,
+ int16x8_t, round_shift_sqrt2_s16_s16_8x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_4xn_neon, int16x4_t,
+ int16x4_t, round_shift_2sqrt2_s16_s16_4x1_neon)
+ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_8xn_neon, int16x8_t,
+ int16x8_t, round_shift_2sqrt2_s16_s16_8x1_neon)
+
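+// The "rect" stores below additionally scale each value by sqrt(2) on the
+// way out, the usual AV1 normalization for rectangular (2:1 aspect ratio)
+// transform sizes.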
+static AOM_FORCE_INLINE void store_rect_buffer_s16_x4(const int16x4_t *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ vst1q_s32(out + i * stride, round_shift_sqrt2_s16_s32_4x1_neon(in[i]));
+ }
+}
+
+static AOM_FORCE_INLINE void store_rect_buffer_s16_x8(const int16x8_t *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ vst1q_s32(out + i * stride + 0,
+ round_shift_sqrt2_s16_s32_4x1_neon(vget_low_s16(in[i])));
+ vst1q_s32(out + i * stride + 4,
+ round_shift_sqrt2_s16_s32_4x1_neon(vget_high_s16(in[i])));
+ }
+}
+
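+// The 4-point ADST is computed directly from the four Q2.13 sine constants
+// returned by sinpi_arr_q13 (scaled sin(k * pi / 9) values) rather than via
+// the butterfly helpers above.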
+static AOM_FORCE_INLINE void fadst4x4_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ int32x4_t u[6], v[6];
+ const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
+ const int16x4_t u01 = vqadd_s16(input[0], input[1]);
+
+ v[5] = vmull_lane_s16(input[2], sinpi, 2);
+ v[0] = vmull_lane_s16(input[1], sinpi, 1);
+ v[0] = vmlal_lane_s16(v[0], input[0], sinpi, 0);
+ v[1] = vmlal_lane_s16(v[5], input[3], sinpi, 3);
+ v[2] = vmull_lane_s16(u01, sinpi, 2);
+ v[3] = vmull_lane_s16(input[0], sinpi, 3);
+ v[3] = vmlsl_lane_s16(v[3], input[1], sinpi, 0);
+ v[4] = vmlsl_lane_s16(v[5], input[3], sinpi, 1);
+
+ u[0] = vaddq_s32(v[0], v[1]);
+ u[1] = vmlsl_lane_s16(v[2], input[3], sinpi, 2);
+ u[2] = vsubq_s32(v[3], v[4]);
+ u[3] = vsubq_s32(u[2], u[0]);
+ u[3] = vmlaq_n_s32(u[3], v[5], 3);
+
+ output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
+ output[1] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
+ output[2] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
+ output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
+}
+
+static AOM_FORCE_INLINE void fadst4x8_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+
+ // stage 1-2
+ int16x4_t x2[8];
+ butterfly_s16_s32_x4_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
+ butterfly_s16_s32_x4_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);
+
+ // stage 3
+ int16x4_t x3[8];
+ x3[0] = vqadd_s16(input[0], x2[2]);
+ x3[1] = vqsub_s16(x2[3], input[7]);
+ x3[2] = vqsub_s16(input[0], x2[2]);
+ x3[3] = vqadd_s16(input[7], x2[3]);
+ x3[4] = vqsub_s16(x2[6], input[1]);
+ x3[5] = vqadd_s16(input[6], x2[7]);
+ x3[6] = vqadd_s16(input[1], x2[6]);
+ x3[7] = vqsub_s16(input[6], x2[7]);
+
+ // stage 4
+ int16x4_t x4[8];
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x4[4], &x4[5]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x4[6], &x4[7]);
+
+ // stage 5
+ int16x4_t x5[8];
+ x5[0] = vqadd_s16(x3[0], x4[4]);
+ x5[1] = vqadd_s16(x3[1], x4[5]);
+ x5[2] = vqadd_s16(x3[2], x4[6]);
+ x5[3] = vqsub_s16(x4[7], x3[3]);
+ x5[4] = vqsub_s16(x3[0], x4[4]);
+ x5[5] = vqsub_s16(x3[1], x4[5]);
+ x5[6] = vqsub_s16(x3[2], x4[6]);
+ x5[7] = vqadd_s16(x3[3], x4[7]);
+
+ // stage 6-7
+ butterfly_s16_s32_x4_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
+ butterfly_s16_s32_x4_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
+ butterfly_s16_s32_x4_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
+ butterfly_s16_s32_x4_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
+}
+
+static AOM_FORCE_INLINE void fadst8x4_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ int32x4_t u_lo[4], u_hi[4];
+ const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
+ const int16x8_t u01 = vqaddq_s16(input[0], input[1]);
+
+ u_lo[0] = vmull_lane_s16(vget_low_s16(input[1]), sinpi, 1);
+ u_hi[0] = vmull_lane_s16(vget_high_s16(input[1]), sinpi, 1);
+
+ u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[0]), sinpi, 0);
+ u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[0]), sinpi, 0);
+
+ u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[3]), sinpi, 3);
+ u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[3]), sinpi, 3);
+
+ u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[2]), sinpi, 2);
+ u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[2]), sinpi, 2);
+
+ u_lo[1] = vmull_lane_s16(vget_low_s16(u01), sinpi, 2);
+ u_hi[1] = vmull_lane_s16(vget_high_s16(u01), sinpi, 2);
+
+ u_lo[2] = vmull_lane_s16(vget_low_s16(input[0]), sinpi, 3);
+ u_hi[2] = vmull_lane_s16(vget_high_s16(input[0]), sinpi, 3);
+
+ u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[1]), sinpi, 0);
+ u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[1]), sinpi, 0);
+
+ u_lo[2] = vmlal_lane_s16(u_lo[2], vget_low_s16(input[3]), sinpi, 1);
+ u_hi[2] = vmlal_lane_s16(u_hi[2], vget_high_s16(input[3]), sinpi, 1);
+
+ u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[2]), sinpi, 2);
+ u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[2]), sinpi, 2);
+
+ u_lo[1] = vmlsl_lane_s16(u_lo[1], vget_low_s16(input[3]), sinpi, 2);
+ u_hi[1] = vmlsl_lane_s16(u_hi[1], vget_high_s16(input[3]), sinpi, 2);
+
+ u_lo[3] = vsubq_s32(u_lo[2], u_lo[0]);
+ u_hi[3] = vsubq_s32(u_hi[2], u_hi[0]);
+
+ const int16x4_t sinpix3 = vmul_n_s16(sinpi, 3);
+ u_lo[3] = vmlal_lane_s16(u_lo[3], vget_low_s16(input[2]), sinpix3, 2);
+ u_hi[3] = vmlal_lane_s16(u_hi[3], vget_high_s16(input[2]), sinpix3, 2);
+
+ output[0] = vcombine_s16(vrshrn_n_s32(u_lo[0], TXFM_COS_BIT_MAX),
+ vrshrn_n_s32(u_hi[0], TXFM_COS_BIT_MAX));
+ output[1] = vcombine_s16(vrshrn_n_s32(u_lo[1], TXFM_COS_BIT_MAX),
+ vrshrn_n_s32(u_hi[1], TXFM_COS_BIT_MAX));
+ output[2] = vcombine_s16(vrshrn_n_s32(u_lo[2], TXFM_COS_BIT_MAX),
+ vrshrn_n_s32(u_hi[2], TXFM_COS_BIT_MAX));
+ output[3] = vcombine_s16(vrshrn_n_s32(u_lo[3], TXFM_COS_BIT_MAX),
+ vrshrn_n_s32(u_hi[3], TXFM_COS_BIT_MAX));
+}
+
+static AOM_FORCE_INLINE void fdct4x4_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+ const int16x4_t cospi16 = vld1_s16(&cospi[4 * 1]);
+
+ int16x4_t in12a = vadd_s16(input[1], input[2]);
+ int16x4_t in12s = vsub_s16(input[1], input[2]);
+ int16x4_t in03a = vadd_s16(input[0], input[3]);
+ int16x4_t in03s = vsub_s16(input[0], input[3]);
+
+ int32x4_t u0ad1 = vmull_n_s16(in12a, cospi[4 * 0]);
+ int32x4_t u0ad2 = vmull_n_s16(in03a, cospi[4 * 0]);
+
+ int32x4_t u[4];
+ u[0] = vaddq_s32(u0ad1, u0ad2);
+ u[1] = vsubq_s32(u0ad2, u0ad1);
+ u[2] = vmull_lane_s16(in12s, cospi16, 1);
+ u[2] = vmlal_lane_s16(u[2], in03s, cospi16, 0);
+ u[3] = vmull_lane_s16(in03s, cospi16, 1);
+ u[3] = vmlsl_lane_s16(u[3], in12s, cospi16, 0);
+
+ output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
+ output[1] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
+ output[2] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
+ output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
+}
+
+// Butterfly pre-processing:
+// e.g. n=4:
+// out[0] = in[0] + in[3]
+// out[1] = in[1] + in[2]
+// out[2] = in[1] - in[2]
+// out[3] = in[0] - in[3]
+
+static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x4(const int16x4_t *input,
+ int16x4_t *output,
+ int n) {
+ for (int i = 0; i < n / 2; ++i) {
+ output[i] = vqadd_s16(input[i], input[n - i - 1]);
+ }
+ for (int i = 0; i < n / 2; ++i) {
+ output[n / 2 + i] = vqsub_s16(input[n / 2 - i - 1], input[n / 2 + i]);
+ }
+}
+
+static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x8(const int16x8_t *input,
+ int16x8_t *output,
+ int n) {
+ for (int i = 0; i < n / 2; ++i) {
+ output[i] = vqaddq_s16(input[i], input[n - i - 1]);
+ }
+ for (int i = 0; i < n / 2; ++i) {
+ output[n / 2 + i] = vqsubq_s16(input[n / 2 - i - 1], input[n / 2 + i]);
+ }
+}
+
+static AOM_FORCE_INLINE void butterfly_dct_pre_s32_x4(const int32x4_t *input,
+ int32x4_t *output,
+ int n) {
+ for (int i = 0; i < n / 2; ++i) {
+ output[i] = vqaddq_s32(input[i], input[n - i - 1]);
+ }
+ for (int i = 0; i < n / 2; ++i) {
+ output[n / 2 + i] = vqsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]);
+ }
+}
+
+// Butterfly post-processing:
+// e.g. n=8:
+// out[0] = in0[0] + in1[3];
+// out[1] = in0[1] + in1[2];
+// out[2] = in0[1] - in1[2];
+// out[3] = in0[0] - in1[3];
+// out[4] = in0[7] - in1[4];
+// out[5] = in0[6] - in1[5];
+// out[6] = in0[6] + in1[5];
+// out[7] = in0[7] + in1[4];
+
+static AOM_FORCE_INLINE void butterfly_dct_post_s16_x4(const int16x4_t *in0,
+ const int16x4_t *in1,
+ int16x4_t *output,
+ int n) {
+ for (int i = 0; i < n / 4; ++i) {
+ output[i] = vqadd_s16(in0[i], in1[n / 2 - i - 1]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 4 + i] = vqsub_s16(in0[n / 4 - i - 1], in1[n / 4 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 2 + i] = vqsub_s16(in0[n - i - 1], in1[n / 2 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[(3 * n) / 4 + i] =
+ vqadd_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
+ }
+}
+
+static AOM_FORCE_INLINE void butterfly_dct_post_s16_x8(const int16x8_t *in0,
+ const int16x8_t *in1,
+ int16x8_t *output,
+ int n) {
+ for (int i = 0; i < n / 4; ++i) {
+ output[i] = vqaddq_s16(in0[i], in1[n / 2 - i - 1]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 4 + i] = vqsubq_s16(in0[n / 4 - i - 1], in1[n / 4 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 2 + i] = vqsubq_s16(in0[n - i - 1], in1[n / 2 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[(3 * n) / 4 + i] =
+ vqaddq_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
+ }
+}
+
+static AOM_FORCE_INLINE void butterfly_dct_post_s32_x4(const int32x4_t *in0,
+ const int32x4_t *in1,
+ int32x4_t *output,
+ int n) {
+ for (int i = 0; i < n / 4; ++i) {
+ output[i] = vqaddq_s32(in0[i], in1[n / 2 - i - 1]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 4 + i] = vqsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 2 + i] = vqsubq_s32(in0[n - i - 1], in1[n / 2 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[(3 * n) / 4 + i] =
+ vqaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
+ }
+}
+
+static AOM_FORCE_INLINE void fdct8x4_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+
+ // stage 1
+ int16x8_t x1[4];
+ butterfly_dct_pre_s16_x8(input, x1, 4);
+
+ // stage 2
+ int16x8_t x2[4];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[0], x1[1], &x2[0], &x2[1]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x1[3], x1[2], &x2[2], &x2[3]);
+
+ // stage 3
+ output[0] = x2[0];
+ output[1] = x2[2];
+ output[2] = x2[1];
+ output[3] = x2[3];
+}
+
+static AOM_FORCE_INLINE void fdct4x8_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+
+ // stage 1
+ int16x4_t x1[8];
+ butterfly_dct_pre_s16_x4(input, x1, 8);
+
+ // stage 2
+ int16x4_t x2[8];
+ butterfly_dct_pre_s16_x4(x1, x2, 4);
+ butterfly_s16_s32_x4_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]);
+
+ // stage 3
+ int16x4_t x3[8];
+ butterfly_s16_s32_x4_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]);
+ butterfly_dct_post_s16_x4(x1 + 4, x2 + 4, x3 + 4, 4);
+
+ // stage 4-5
+ butterfly_s16_s32_x4_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]);
+ butterfly_s16_s32_x4_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
+}
+
+static AOM_FORCE_INLINE void fdct8x8_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+
+ // stage 1
+ int16x8_t x1[8];
+ butterfly_dct_pre_s16_x8(input, x1, 8);
+
+ // stage 2
+ int16x8_t x2[8];
+ butterfly_dct_pre_s16_x8(x1, x2, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]);
+
+ // stage 3
+ int16x8_t x3[8];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]);
+ butterfly_dct_post_s16_x8(x1 + 4, x2 + 4, x3 + 4, 4);
+
+ // stage 4-5
+ butterfly_s16_s32_x8_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
+}
+
+static AOM_FORCE_INLINE void fdct4x16_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+
+ // stage 1
+ int16x4_t x1[16];
+ butterfly_dct_pre_s16_x4(input, x1, 16);
+
+ // stage 2
+ int16x4_t x2[16];
+ butterfly_dct_pre_s16_x4(x1, x2, 8);
+ butterfly_s16_s32_x4_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]);
+ butterfly_s16_s32_x4_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]);
+
+ // stage 3
+ int16x4_t x3[16];
+ butterfly_dct_pre_s16_x4(x2, x3, 4);
+ butterfly_s16_s32_x4_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]);
+ butterfly_dct_post_s16_x4(x1 + 8, x2 + 8, x3 + 8, 8);
+
+ // stage 4
+ int16x4_t x4[16];
+ butterfly_s16_s32_x4_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[3], x3[2], &output[4],
+ &output[12]);
+ butterfly_dct_post_s16_x4(x2 + 4, x3 + 4, x4 + 4, 4);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]);
+ butterfly_s16_s32_x4_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]);
+
+ // stage 5
+ int16x4_t x5[16];
+ butterfly_s16_s32_x4_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]);
+ butterfly_s16_s32_x4_1003_neon(cospi24, x4[6], x4[5], &output[10],
+ &output[6]);
+ butterfly_dct_post_s16_x4(x3 + 8, x4 + 8, x5 + 8, 4);
+ butterfly_dct_post_s16_x4(x3 + 12, x4 + 12, x5 + 12, 4);
+
+ // stage 6-7
+ butterfly_s16_s32_x4_0112_neon(cospi4, x5[15], x5[8], &output[1],
+ &output[15]);
+ butterfly_s16_s32_x4_1003_neon(cospi28, x5[14], x5[9], &output[9],
+ &output[7]);
+ butterfly_s16_s32_x4_0112_neon(cospi20, x5[13], x5[10], &output[5],
+ &output[11]);
+ butterfly_s16_s32_x4_1003_neon(cospi12, x5[12], x5[11], &output[13],
+ &output[3]);
+}
+
+static AOM_FORCE_INLINE void fdct8x16_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+
+ // stage 1
+ int16x8_t x1[16];
+ butterfly_dct_pre_s16_x8(input, x1, 16);
+
+ // stage 2
+ int16x8_t x2[16];
+ butterfly_dct_pre_s16_x8(x1, x2, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]);
+
+ // stage 3
+ int16x8_t x3[16];
+ butterfly_dct_pre_s16_x8(x2, x3, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]);
+ butterfly_dct_post_s16_x8(x1 + 8, x2 + 8, x3 + 8, 8);
+
+ // stage 4
+ int16x8_t x4[16];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[3], x3[2], &output[4],
+ &output[12]);
+ butterfly_dct_post_s16_x8(x2 + 4, x3 + 4, x4 + 4, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]);
+
+ // stage 5
+ int16x8_t x5[16];
+ butterfly_s16_s32_x8_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x4[6], x4[5], &output[10],
+ &output[6]);
+ butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 4);
+ butterfly_dct_post_s16_x8(x3 + 12, x4 + 12, x5 + 12, 4);
+
+ // stage 6-7
+ butterfly_s16_s32_x8_0112_neon(cospi4, x5[15], x5[8], &output[1],
+ &output[15]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x5[14], x5[9], &output[9],
+ &output[7]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x5[13], x5[10], &output[5],
+ &output[11]);
+ butterfly_s16_s32_x8_1003_neon(cospi12, x5[12], x5[11], &output[13],
+ &output[3]);
+}
+
+static AOM_FORCE_INLINE void fdct8x32_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+
+ // stage 1
+ int16x8_t x1[32];
+ butterfly_dct_pre_s16_x8(input, x1, 32);
+
+ // stage 2
+ int16x8_t x2[32];
+ butterfly_dct_pre_s16_x8(x1, x2, 16);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[27], x1[20], &x2[27], &x2[20]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[26], x1[21], &x2[26], &x2[21]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[25], x1[22], &x2[25], &x2[22]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[24], x1[23], &x2[24], &x2[23]);
+
+ // stage 3
+ int16x8_t x3[32];
+ butterfly_dct_pre_s16_x8(x2, x3, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[13], x2[10], &x3[13], &x3[10]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[12], x2[11], &x3[12], &x3[11]);
+ butterfly_dct_post_s16_x8(x1 + 16, x2 + 16, x3 + 16, 16);
+
+ // stage 4
+ int16x8_t x4[32];
+ butterfly_dct_pre_s16_x8(x3, x4, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x3[6], x3[5], &x4[6], &x4[5]);
+ butterfly_dct_post_s16_x8(x2 + 8, x3 + 8, x4 + 8, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[29], x3[18], &x4[29], &x4[18]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[28], x3[19], &x4[28], &x4[19]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[27], x3[20], &x4[27], &x4[20]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[26], x3[21], &x4[26], &x4[21]);
+
+ // stage 5
+ int16x8_t x5[32];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x4[0], x4[1], &output[0],
+ &output[16]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x4[3], x4[2], &output[8],
+ &output[24]);
+ butterfly_dct_post_s16_x8(x3 + 4, x4 + 4, x5 + 4, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x4[14], x4[9], &x5[14], &x5[9]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x4[13], x4[10], &x5[13], &x5[10]);
+ butterfly_dct_post_s16_x8(x3 + 16, x4 + 16, x5 + 16, 8);
+ butterfly_dct_post_s16_x8(x3 + 24, x4 + 24, x5 + 24, 8);
+
+ // stage 6
+ int16x8_t x6[32];
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[7], x5[4], &output[4], &output[28]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[6], x5[5], &output[20],
+ &output[12]);
+ butterfly_dct_post_s16_x8(x4 + 8, x5 + 8, x6 + 8, 4);
+ butterfly_dct_post_s16_x8(x4 + 12, x5 + 12, x6 + 12, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[30], x5[17], &x6[30], &x6[17]);
+ butterfly_s16_s32_x8_1223_neon(cospi8, x5[29], x5[18], &x6[29], &x6[18]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[26], x5[21], &x6[26], &x6[21]);
+ butterfly_s16_s32_x8_0332_neon(cospi24, x5[25], x5[22], &x6[25], &x6[22]);
+
+ // stage 7
+ int16x8_t x7[32];
+ butterfly_s16_s32_x8_0112_neon(cospi4, x6[15], x6[8], &output[2],
+ &output[30]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x6[14], x6[9], &output[18],
+ &output[14]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x6[13], x6[10], &output[10],
+ &output[22]);
+ butterfly_s16_s32_x8_1003_neon(cospi12, x6[12], x6[11], &output[26],
+ &output[6]);
+ butterfly_dct_post_s16_x8(x5 + 16, x6 + 16, x7 + 16, 4);
+ butterfly_dct_post_s16_x8(x5 + 20, x6 + 20, x7 + 20, 4);
+ butterfly_dct_post_s16_x8(x5 + 24, x6 + 24, x7 + 24, 4);
+ butterfly_dct_post_s16_x8(x5 + 28, x6 + 28, x7 + 28, 4);
+
+ butterfly_s16_s32_x8_0112_neon(cospi2, x7[31], x7[16], &output[1],
+ &output[31]);
+ butterfly_s16_s32_x8_1003_neon(cospi30, x7[30], x7[17], &output[17],
+ &output[15]);
+ butterfly_s16_s32_x8_0112_neon(cospi18, x7[29], x7[18], &output[9],
+ &output[23]);
+ butterfly_s16_s32_x8_1003_neon(cospi14, x7[28], x7[19], &output[25],
+ &output[7]);
+ butterfly_s16_s32_x8_0112_neon(cospi10, x7[27], x7[20], &output[5],
+ &output[27]);
+ butterfly_s16_s32_x8_1003_neon(cospi22, x7[26], x7[21], &output[21],
+ &output[11]);
+ butterfly_s16_s32_x8_0112_neon(cospi26, x7[25], x7[22], &output[13],
+ &output[19]);
+ butterfly_s16_s32_x8_1003_neon(cospi6, x7[24], x7[23], &output[29],
+ &output[3]);
+}
+
+static AOM_FORCE_INLINE void fdct8x64_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+ const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
+ const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
+ const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
+ const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
+ const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
+ const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
+ const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
+ const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+ const int16x4_t cospi1 = vget_low_s16(cospi1_3);
+ const int16x4_t cospi3 = vget_high_s16(cospi1_3);
+ const int16x4_t cospi5 = vget_low_s16(cospi5_7);
+ const int16x4_t cospi7 = vget_high_s16(cospi5_7);
+ const int16x4_t cospi9 = vget_low_s16(cospi9_11);
+ const int16x4_t cospi11 = vget_high_s16(cospi9_11);
+ const int16x4_t cospi13 = vget_low_s16(cospi13_15);
+ const int16x4_t cospi15 = vget_high_s16(cospi13_15);
+ const int16x4_t cospi17 = vget_low_s16(cospi17_19);
+ const int16x4_t cospi19 = vget_high_s16(cospi17_19);
+ const int16x4_t cospi21 = vget_low_s16(cospi21_23);
+ const int16x4_t cospi23 = vget_high_s16(cospi21_23);
+ const int16x4_t cospi25 = vget_low_s16(cospi25_27);
+ const int16x4_t cospi27 = vget_high_s16(cospi25_27);
+ const int16x4_t cospi29 = vget_low_s16(cospi29_31);
+ const int16x4_t cospi31 = vget_high_s16(cospi29_31);
+
+ // stage 1
+ int16x8_t x1[64];
+ butterfly_dct_pre_s16_x8(input, x1, 64);
+
+ // stage 2
+ int16x8_t x2[64];
+ butterfly_dct_pre_s16_x8(x1, x2, 32);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]);
+
+ // stage 3
+ int16x8_t x3[64];
+ butterfly_dct_pre_s16_x8(x2, x3, 16);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ butterfly_dct_post_s16_x8(x1 + 32, x2 + 32, x3 + 32, 32);
+
+ // stage 4
+ int16x8_t x4[64];
+ butterfly_dct_pre_s16_x8(x3, x4, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]);
+ butterfly_dct_post_s16_x8(x3 + 16, x3 + 16, x4 + 16, 16);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]);
+
+ // stage 5
+ int16x8_t x5[64];
+ butterfly_dct_pre_s16_x8(x4, x5, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]);
+ butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]);
+ butterfly_dct_post_s16_x8(x3 + 32, x4 + 32, x5 + 32, 16);
+ butterfly_dct_post_s16_x8(x3 + 48, x4 + 48, x5 + 48, 16);
+
+ // stage 6
+ int16x8_t x6[64];
+ butterfly_s16_s32_x8_0112_neon(cospi32, x5[1], x5[0], &x6[0], &x6[1]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]);
+ butterfly_dct_post_s16_x8(x4 + 4, x5 + 4, x6 + 4, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]);
+ butterfly_s16_s32_x8_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]);
+ butterfly_dct_post_s16_x8(x4 + 16, x5 + 16, x6 + 16, 8);
+ butterfly_dct_post_s16_x8(x4 + 24, x5 + 24, x6 + 24, 8);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]);
+ butterfly_s16_s32_x8_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]);
+ butterfly_s16_s32_x8_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]);
+ butterfly_s16_s32_x8_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]);
+ butterfly_s16_s32_x8_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]);
+
+ // stage 7
+ int16x8_t x7[64];
+ butterfly_s16_s32_x8_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]);
+ butterfly_dct_post_s16_x8(x5 + 8, x6 + 8, x7 + 8, 4);
+ butterfly_dct_post_s16_x8(x5 + 12, x6 + 12, x7 + 12, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]);
+ butterfly_s16_s32_x8_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]);
+ butterfly_s16_s32_x8_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]);
+ butterfly_dct_post_s16_x8(x5 + 32, x6 + 32, x7 + 32, 8);
+ butterfly_dct_post_s16_x8(x5 + 40, x6 + 40, x7 + 40, 8);
+ butterfly_dct_post_s16_x8(x5 + 48, x6 + 48, x7 + 48, 8);
+ butterfly_dct_post_s16_x8(x5 + 56, x6 + 56, x7 + 56, 8);
+
+ // stage 8
+ int16x8_t x8[64];
+ butterfly_s16_s32_x8_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]);
+ butterfly_s16_s32_x8_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]);
+ butterfly_dct_post_s16_x8(x6 + 16, x7 + 16, x8 + 16, 4);
+ butterfly_dct_post_s16_x8(x6 + 20, x7 + 20, x8 + 20, 4);
+ butterfly_dct_post_s16_x8(x6 + 24, x7 + 24, x8 + 24, 4);
+ butterfly_dct_post_s16_x8(x6 + 28, x7 + 28, x8 + 28, 4);
+ butterfly_s16_s32_x8_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]);
+ butterfly_s16_s32_x8_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]);
+ butterfly_s16_s32_x8_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]);
+ butterfly_s16_s32_x8_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]);
+ butterfly_s16_s32_x8_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]);
+ butterfly_s16_s32_x8_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]);
+
+ // stage 9
+ int16x8_t x9[64];
+ butterfly_s16_s32_x8_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]);
+ butterfly_s16_s32_x8_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]);
+ butterfly_s16_s32_x8_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]);
+ butterfly_s16_s32_x8_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]);
+ butterfly_s16_s32_x8_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]);
+ butterfly_s16_s32_x8_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]);
+ butterfly_s16_s32_x8_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]);
+ butterfly_s16_s32_x8_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]);
+ butterfly_dct_post_s16_x8(x7 + 32, x8 + 32, x9 + 32, 4);
+ butterfly_dct_post_s16_x8(x7 + 36, x8 + 36, x9 + 36, 4);
+ butterfly_dct_post_s16_x8(x7 + 40, x8 + 40, x9 + 40, 4);
+ butterfly_dct_post_s16_x8(x7 + 44, x8 + 44, x9 + 44, 4);
+ butterfly_dct_post_s16_x8(x7 + 48, x8 + 48, x9 + 48, 4);
+ butterfly_dct_post_s16_x8(x7 + 52, x8 + 52, x9 + 52, 4);
+ butterfly_dct_post_s16_x8(x7 + 56, x8 + 56, x9 + 56, 4);
+ butterfly_dct_post_s16_x8(x7 + 60, x8 + 60, x9 + 60, 4);
+
+ // stage 10
+ butterfly_s16_s32_x8_0112_neon(cospi1, x9[63], x9[32], &output[1],
+ &output[63]);
+ butterfly_s16_s32_x8_1003_neon(cospi31, x9[62], x9[33], &output[33],
+ &output[31]);
+ butterfly_s16_s32_x8_0112_neon(cospi17, x9[61], x9[34], &output[17],
+ &output[47]);
+ butterfly_s16_s32_x8_1003_neon(cospi15, x9[60], x9[35], &output[49],
+ &output[15]);
+ butterfly_s16_s32_x8_0112_neon(cospi9, x9[59], x9[36], &output[9],
+ &output[55]);
+ butterfly_s16_s32_x8_1003_neon(cospi23, x9[58], x9[37], &output[41],
+ &output[23]);
+ butterfly_s16_s32_x8_0112_neon(cospi25, x9[57], x9[38], &output[25],
+ &output[39]);
+ butterfly_s16_s32_x8_1003_neon(cospi7, x9[56], x9[39], &output[57],
+ &output[7]);
+ butterfly_s16_s32_x8_0112_neon(cospi5, x9[55], x9[40], &output[5],
+ &output[59]);
+ butterfly_s16_s32_x8_1003_neon(cospi27, x9[54], x9[41], &output[37],
+ &output[27]);
+ butterfly_s16_s32_x8_0112_neon(cospi21, x9[53], x9[42], &output[21],
+ &output[43]);
+ butterfly_s16_s32_x8_1003_neon(cospi11, x9[52], x9[43], &output[53],
+ &output[11]);
+ butterfly_s16_s32_x8_0112_neon(cospi13, x9[51], x9[44], &output[13],
+ &output[51]);
+ butterfly_s16_s32_x8_1003_neon(cospi19, x9[50], x9[45], &output[45],
+ &output[19]);
+ butterfly_s16_s32_x8_0112_neon(cospi29, x9[49], x9[46], &output[29],
+ &output[35]);
+ butterfly_s16_s32_x8_1003_neon(cospi3, x9[48], x9[47], &output[61],
+ &output[3]);
+
+ // stage 11
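+  // The even-indexed outputs are reordered copies of earlier stage values;
+  // the index pattern matches a 6-bit bit-reversal (e.g. output[4] = x8[8]:
+  // 4 = 0b000100 and 8 = 0b001000).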
+ output[0] = x6[0];
+ output[2] = x9[16];
+ output[4] = x8[8];
+ output[6] = x9[24];
+ output[8] = x7[4];
+ output[10] = x9[20];
+ output[12] = x8[12];
+ output[14] = x9[28];
+ output[16] = x6[2];
+ output[18] = x9[18];
+ output[20] = x8[10];
+ output[22] = x9[26];
+ output[24] = x7[6];
+ output[26] = x9[22];
+ output[28] = x8[14];
+ output[30] = x9[30];
+ output[32] = x6[1];
+ output[34] = x9[17];
+ output[36] = x8[9];
+ output[38] = x9[25];
+ output[40] = x7[5];
+ output[42] = x9[21];
+ output[44] = x8[13];
+ output[46] = x9[29];
+  output[48] = x6[3];
+  output[50] = x9[19];
+  output[52] = x8[11];
+ output[54] = x9[27];
+ output[56] = x7[7];
+ output[58] = x9[23];
+ output[60] = x8[15];
+ output[62] = x9[31];
+}
+
+static AOM_FORCE_INLINE void fadst8x8_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+
+ // stage 2
+ int16x8_t x2[8];
+ butterfly_s16_s32_x8_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);
+
+ // stage 3
+ int16x8_t x3[8];
+ x3[0] = vqaddq_s16(input[0], x2[2]);
+ x3[1] = vqsubq_s16(x2[3], input[7]);
+ x3[2] = vqsubq_s16(input[0], x2[2]);
+ x3[3] = vqaddq_s16(input[7], x2[3]);
+ x3[4] = vqsubq_s16(x2[6], input[1]);
+ x3[5] = vqaddq_s16(input[6], x2[7]);
+ x3[6] = vqaddq_s16(input[1], x2[6]);
+ x3[7] = vqsubq_s16(input[6], x2[7]);
+
+ // stage 4
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
+
+ // stage 5
+ int16x8_t x5[8];
+ x5[0] = vqaddq_s16(x3[0], x3[4]);
+ x5[1] = vqaddq_s16(x3[1], x3[5]);
+ x5[2] = vqaddq_s16(x3[2], x3[6]);
+ x5[3] = vqsubq_s16(x3[7], x3[3]);
+ x5[4] = vqsubq_s16(x3[0], x3[4]);
+ x5[5] = vqsubq_s16(x3[1], x3[5]);
+ x5[6] = vqsubq_s16(x3[2], x3[6]);
+ x5[7] = vqaddq_s16(x3[3], x3[7]);
+
+ // stage 6
+ butterfly_s16_s32_x8_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
+ butterfly_s16_s32_x8_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
+ butterfly_s16_s32_x8_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
+ butterfly_s16_s32_x8_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
+}
+
+static AOM_FORCE_INLINE void fadst4x16_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+
+ // stage 2
+ int16x4_t x2[8];
+ butterfly_s16_s32_x4_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
+ butterfly_s16_s32_x4_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
+ butterfly_s16_s32_x4_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
+ butterfly_s16_s32_x4_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);
+
+ // stage 3
+ int16x4_t x3[16];
+ x3[0] = vqadd_s16(input[0], x2[0]);
+ x3[1] = vqsub_s16(x2[1], input[15]);
+ x3[2] = vqsub_s16(input[0], x2[0]);
+ x3[3] = vqadd_s16(input[15], x2[1]);
+ x3[4] = vqsub_s16(x2[2], input[3]);
+ x3[5] = vqadd_s16(input[12], x2[3]);
+ x3[6] = vqadd_s16(input[3], x2[2]);
+ x3[7] = vqsub_s16(input[12], x2[3]);
+ x3[8] = vqsub_s16(x2[4], input[1]);
+ x3[9] = vqadd_s16(input[14], x2[5]);
+ x3[10] = vqadd_s16(input[1], x2[4]);
+ x3[11] = vqsub_s16(input[14], x2[5]);
+ x3[12] = vqadd_s16(input[2], x2[6]);
+ x3[13] = vqsub_s16(x2[7], input[13]);
+ x3[14] = vqsub_s16(input[2], x2[6]);
+ x3[15] = vqadd_s16(input[13], x2[7]);
+
+ // stage 4
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
+ butterfly_s16_s32_x4_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
+ butterfly_s16_s32_x4_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);
+
+ // stage 5
+ int16x4_t x5[16];
+ x5[0] = vqadd_s16(x3[0], x3[4]);
+ x5[1] = vqadd_s16(x3[1], x3[5]);
+ x5[2] = vqadd_s16(x3[2], x3[6]);
+ x5[3] = vqsub_s16(x3[7], x3[3]);
+ x5[4] = vqsub_s16(x3[0], x3[4]);
+ x5[5] = vqsub_s16(x3[1], x3[5]);
+ x5[6] = vqsub_s16(x3[2], x3[6]);
+ x5[7] = vqadd_s16(x3[3], x3[7]);
+ x5[8] = vqadd_s16(x3[8], x3[12]);
+ x5[9] = vqadd_s16(x3[9], x3[13]);
+ x5[10] = vqsub_s16(x3[14], x3[10]);
+ x5[11] = vqadd_s16(x3[11], x3[15]);
+ x5[12] = vqsub_s16(x3[8], x3[12]);
+ x5[13] = vqsub_s16(x3[9], x3[13]);
+ x5[14] = vqadd_s16(x3[10], x3[14]);
+ x5[15] = vqsub_s16(x3[11], x3[15]);
+
+ // stage 6
+ butterfly_s16_s32_x4_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
+ butterfly_s16_s32_x4_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
+ butterfly_s16_s32_x4_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
+ butterfly_s16_s32_x4_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);
+
+ // stage 7
+ int16x4_t x7[16];
+ x7[0] = vqadd_s16(x5[0], x5[8]);
+ x7[1] = vqadd_s16(x5[1], x5[9]);
+ x7[2] = vqadd_s16(x5[2], x5[10]);
+ x7[3] = vqadd_s16(x5[3], x5[11]);
+ x7[4] = vqadd_s16(x5[4], x5[12]);
+ x7[5] = vqadd_s16(x5[5], x5[13]);
+ x7[6] = vqadd_s16(x5[6], x5[14]);
+ x7[7] = vqsub_s16(x5[15], x5[7]);
+ x7[8] = vqsub_s16(x5[0], x5[8]);
+ x7[9] = vqsub_s16(x5[1], x5[9]);
+ x7[10] = vqsub_s16(x5[2], x5[10]);
+ x7[11] = vqsub_s16(x5[3], x5[11]);
+ x7[12] = vqsub_s16(x5[4], x5[12]);
+ x7[13] = vqsub_s16(x5[5], x5[13]);
+ x7[14] = vqsub_s16(x5[6], x5[14]);
+ x7[15] = vqadd_s16(x5[7], x5[15]);
+
+ // stage 8
+ butterfly_s16_s32_x4_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
+ butterfly_s16_s32_x4_0112_neon(cospi10, x7[2], x7[3], &output[13],
+ &output[2]);
+ butterfly_s16_s32_x4_0112_neon(cospi18, x7[4], x7[5], &output[11],
+ &output[4]);
+ butterfly_s16_s32_x4_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
+ butterfly_s16_s32_x4_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
+ butterfly_s16_s32_x4_1003_neon(cospi22, x7[10], x7[11], &output[5],
+ &output[10]);
+ butterfly_s16_s32_x4_1003_neon(cospi14, x7[12], x7[13], &output[3],
+ &output[12]);
+ butterfly_s16_s32_x4_0112_neon(cospi6, x7[14], x7[15], &output[14],
+ &output[1]);
+}
+
+static AOM_FORCE_INLINE void fadst8x16_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+
+ // stage 2
+ int16x8_t x2[8];
+ butterfly_s16_s32_x8_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
+ butterfly_s16_s32_x8_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
+ butterfly_s16_s32_x8_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);
+
+ // stage 3
+ int16x8_t x3[16];
+ x3[0] = vqaddq_s16(input[0], x2[0]);
+ x3[1] = vqsubq_s16(x2[1], input[15]);
+ x3[2] = vqsubq_s16(input[0], x2[0]);
+ x3[3] = vqaddq_s16(input[15], x2[1]);
+ x3[4] = vqsubq_s16(x2[2], input[3]);
+ x3[5] = vqaddq_s16(input[12], x2[3]);
+ x3[6] = vqaddq_s16(input[3], x2[2]);
+ x3[7] = vqsubq_s16(input[12], x2[3]);
+ x3[8] = vqsubq_s16(x2[4], input[1]);
+ x3[9] = vqaddq_s16(input[14], x2[5]);
+ x3[10] = vqaddq_s16(input[1], x2[4]);
+ x3[11] = vqsubq_s16(input[14], x2[5]);
+ x3[12] = vqaddq_s16(input[2], x2[6]);
+ x3[13] = vqsubq_s16(x2[7], input[13]);
+ x3[14] = vqsubq_s16(input[2], x2[6]);
+ x3[15] = vqaddq_s16(input[13], x2[7]);
+
+ // stage 4
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
+ butterfly_s16_s32_x8_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
+ butterfly_s16_s32_x8_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);
+
+ // stage 5
+ int16x8_t x5[16];
+ x5[0] = vqaddq_s16(x3[0], x3[4]);
+ x5[1] = vqaddq_s16(x3[1], x3[5]);
+ x5[2] = vqaddq_s16(x3[2], x3[6]);
+ x5[3] = vqsubq_s16(x3[7], x3[3]);
+ x5[4] = vqsubq_s16(x3[0], x3[4]);
+ x5[5] = vqsubq_s16(x3[1], x3[5]);
+ x5[6] = vqsubq_s16(x3[2], x3[6]);
+ x5[7] = vqaddq_s16(x3[3], x3[7]);
+ x5[8] = vqaddq_s16(x3[8], x3[12]);
+ x5[9] = vqaddq_s16(x3[9], x3[13]);
+ x5[10] = vqsubq_s16(x3[14], x3[10]);
+ x5[11] = vqaddq_s16(x3[11], x3[15]);
+ x5[12] = vqsubq_s16(x3[8], x3[12]);
+ x5[13] = vqsubq_s16(x3[9], x3[13]);
+ x5[14] = vqaddq_s16(x3[10], x3[14]);
+ x5[15] = vqsubq_s16(x3[11], x3[15]);
+
+ // stage 6
+ butterfly_s16_s32_x8_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
+ butterfly_s16_s32_x8_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
+ butterfly_s16_s32_x8_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);
+
+ // stage 7
+ int16x8_t x7[16];
+ x7[0] = vqaddq_s16(x5[0], x5[8]);
+ x7[1] = vqaddq_s16(x5[1], x5[9]);
+ x7[2] = vqaddq_s16(x5[2], x5[10]);
+ x7[3] = vqaddq_s16(x5[3], x5[11]);
+ x7[4] = vqaddq_s16(x5[4], x5[12]);
+ x7[5] = vqaddq_s16(x5[5], x5[13]);
+ x7[6] = vqaddq_s16(x5[6], x5[14]);
+ x7[7] = vqsubq_s16(x5[15], x5[7]);
+ x7[8] = vqsubq_s16(x5[0], x5[8]);
+ x7[9] = vqsubq_s16(x5[1], x5[9]);
+ x7[10] = vqsubq_s16(x5[2], x5[10]);
+ x7[11] = vqsubq_s16(x5[3], x5[11]);
+ x7[12] = vqsubq_s16(x5[4], x5[12]);
+ x7[13] = vqsubq_s16(x5[5], x5[13]);
+ x7[14] = vqsubq_s16(x5[6], x5[14]);
+ x7[15] = vqaddq_s16(x5[7], x5[15]);
+
+ // stage 8
+ butterfly_s16_s32_x8_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
+ butterfly_s16_s32_x8_0112_neon(cospi10, x7[2], x7[3], &output[13],
+ &output[2]);
+ butterfly_s16_s32_x8_0112_neon(cospi18, x7[4], x7[5], &output[11],
+ &output[4]);
+ butterfly_s16_s32_x8_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
+ butterfly_s16_s32_x8_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
+ butterfly_s16_s32_x8_1003_neon(cospi22, x7[10], x7[11], &output[5],
+ &output[10]);
+ butterfly_s16_s32_x8_1003_neon(cospi14, x7[12], x7[13], &output[3],
+ &output[12]);
+ butterfly_s16_s32_x8_0112_neon(cospi6, x7[14], x7[15], &output[14],
+ &output[1]);
+}
+
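+// Identity (IDTX) 1D kernels. AV1 scales the identity transform by sqrt(2),
+// 2, 2*sqrt(2) or 4 depending on the transform length, so each wrapper just
+// applies the matching rounded shift; cos_bit is unused.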
+static AOM_FORCE_INLINE void fidentity4x4_neon(const int16x4_t *const input,
+ int16x4_t *const output,
+ const int cos_bit) {
+ (void)cos_bit;
+ round_shift_sqrt2_s16_s16_4xn_neon(input, output, 4);
+}
+
+static AOM_FORCE_INLINE void fidentity8x4_neon(const int16x8_t *const input,
+ int16x8_t *const output,
+ const int cos_bit) {
+ (void)cos_bit;
+ round_shift_sqrt2_s16_s16_8xn_neon(input, output, 4);
+}
+
+static AOM_FORCE_INLINE void fidentity4x8_neon(const int16x4_t *input,
+ int16x4_t *output, int cos_bit) {
+ (void)cos_bit;
+ shift_left_1_s16_x4(input, output, 8);
+}
+
+static AOM_FORCE_INLINE void fidentity8x8_neon(const int16x8_t *input,
+ int16x8_t *output, int cos_bit) {
+ (void)cos_bit;
+ shift_left_1_s16_x8(input, output, 8);
+}
+
+static AOM_FORCE_INLINE void fidentity4x16_neon(const int16x4_t *input,
+ int16x4_t *output,
+ int cos_bit) {
+ (void)cos_bit;
+ round_shift_2sqrt2_s16_s16_4xn_neon(input, output, 16);
+}
+
+static AOM_FORCE_INLINE void fidentity8x16_neon(const int16x8_t *input,
+ int16x8_t *output,
+ int cos_bit) {
+ (void)cos_bit;
+ round_shift_2sqrt2_s16_s16_8xn_neon(input, output, 16);
+}
+
+static AOM_FORCE_INLINE void fidentity8x32_neon(const int16x8_t *input,
+ int16x8_t *output,
+ int cos_bit) {
+ (void)cos_bit;
+ shift_left_2_s16_x8(input, output, 32);
+}
+
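+// Generates the column-pass wrappers: load a block of residuals, apply the
+// initial <<2 scaling shift, then run the named 1D transform. `tw` is the
+// vector width in lanes (4 or 8) and `n` the transform length.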
+#define TRANSFORM_COL(name, tw, n) \
+ static void name##_col_neon(const int16_t *input, int16x##tw##_t *output, \
+ int stride, int cos_bit) { \
+ int16x##tw##_t buf0[n]; \
+ load_buffer_s16_x##tw(input, stride, buf0, n); \
+ shift_left_2_s16_x##tw(buf0, buf0, n); \
+ name##_neon(buf0, output, cos_bit); \
+ }
+
+TRANSFORM_COL(fadst4x4, 4, 4)
+TRANSFORM_COL(fadst4x8, 4, 8)
+TRANSFORM_COL(fadst4x16, 4, 16)
+TRANSFORM_COL(fadst8x4, 8, 4)
+TRANSFORM_COL(fadst8x8, 8, 8)
+TRANSFORM_COL(fadst8x16, 8, 16)
+TRANSFORM_COL(fdct4x4, 4, 4)
+TRANSFORM_COL(fdct4x8, 4, 8)
+TRANSFORM_COL(fdct4x16, 4, 16)
+TRANSFORM_COL(fdct8x4, 8, 4)
+TRANSFORM_COL(fdct8x8, 8, 8)
+TRANSFORM_COL(fdct8x16, 8, 16)
+TRANSFORM_COL(fdct8x32, 8, 32)
+TRANSFORM_COL(fidentity4x4, 4, 4)
+TRANSFORM_COL(fidentity4x8, 4, 8)
+TRANSFORM_COL(fidentity4x16, 4, 16)
+TRANSFORM_COL(fidentity8x4, 8, 4)
+TRANSFORM_COL(fidentity8x8, 8, 8)
+TRANSFORM_COL(fidentity8x16, 8, 16)
+TRANSFORM_COL(fidentity8x32, 8, 32)
+
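+// Generates the row-pass wrappers: run the named 1D transform on already
+// transposed columns and store the widened 32-bit coefficients. The _rect_
+// variants below additionally apply the 1/sqrt(2) scaling used for
+// rectangular blocks whose dimensions differ by a factor of two.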
+#define TRANSFORM_ROW(name, tw, n) \
+ static void name##_row_neon(const int16x##tw##_t *input, int32_t *output, \
+ int stride, int cos_bit) { \
+ int16x##tw##_t buf0[n]; \
+ name##_neon(input, buf0, cos_bit); \
+ store_buffer_s16_x##tw(buf0, output, stride, n); \
+ }
+
+#define TRANSFORM_ROW_RECT(name, tw, n) \
+ static void name##_row_rect_neon(const int16x##tw##_t *input, \
+ int32_t *output, int stride, int cos_bit) { \
+ int16x##tw##_t buf0[n]; \
+ name##_neon(input, buf0, cos_bit); \
+ store_rect_buffer_s16_x##tw(buf0, output, stride, n); \
+ }
+
+TRANSFORM_ROW(fadst4x4, 4, 4)
+TRANSFORM_ROW(fadst4x16, 4, 16)
+TRANSFORM_ROW(fadst8x4, 8, 4)
+TRANSFORM_ROW(fadst8x8, 8, 8)
+TRANSFORM_ROW(fadst8x16, 8, 16)
+TRANSFORM_ROW(fdct4x4, 4, 4)
+TRANSFORM_ROW(fdct4x16, 4, 16)
+TRANSFORM_ROW(fdct8x4, 8, 4)
+TRANSFORM_ROW(fdct8x8, 8, 8)
+TRANSFORM_ROW(fdct8x16, 8, 16)
+TRANSFORM_ROW(fdct8x32, 8, 32)
+TRANSFORM_ROW(fidentity4x4, 4, 4)
+TRANSFORM_ROW(fidentity4x16, 4, 16)
+TRANSFORM_ROW(fidentity8x4, 8, 4)
+TRANSFORM_ROW(fidentity8x8, 8, 8)
+TRANSFORM_ROW(fidentity8x16, 8, 16)
+TRANSFORM_ROW(fidentity8x32, 8, 32)
+
+TRANSFORM_ROW_RECT(fadst4x8, 4, 8)
+TRANSFORM_ROW_RECT(fadst8x4, 8, 4)
+TRANSFORM_ROW_RECT(fadst8x8, 8, 8)
+TRANSFORM_ROW_RECT(fadst8x16, 8, 16)
+TRANSFORM_ROW_RECT(fdct4x8, 4, 8)
+TRANSFORM_ROW_RECT(fdct8x4, 8, 4)
+TRANSFORM_ROW_RECT(fdct8x8, 8, 8)
+TRANSFORM_ROW_RECT(fdct8x16, 8, 16)
+TRANSFORM_ROW_RECT(fdct8x32, 8, 32)
+TRANSFORM_ROW_RECT(fidentity4x8, 4, 8)
+TRANSFORM_ROW_RECT(fidentity8x4, 8, 4)
+TRANSFORM_ROW_RECT(fidentity8x8, 8, 8)
+TRANSFORM_ROW_RECT(fidentity8x16, 8, 16)
+TRANSFORM_ROW_RECT(fidentity8x32, 8, 32)
+
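+// Function pointer types for the per-TX_TYPE dispatch tables below, split by
+// vector width (4 or 8 lanes) and by pass: plain 1D kernel, column pass
+// (load + transform) and row pass (transform + store).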
+typedef void (*transform_1d_lbd_4_neon)(const int16x4_t *input,
+ int16x4_t *output, int cos_bit);
+typedef void (*transform_1d_lbd_8_neon)(const int16x8_t *input,
+ int16x8_t *output, int cos_bit);
+
+typedef void (*col_transform_1d_lbd_4_neon)(const int16_t *input,
+ int16x4_t *output, int stride,
+ int cos_bit);
+typedef void (*col_transform_1d_lbd_8_neon)(const int16_t *input,
+ int16x8_t *output, int stride,
+ int cos_bit);
+
+typedef void (*row_transform_1d_lbd_4_neon)(const int16x4_t *input,
+ int32_t *output, int stride,
+ int cos_bit);
+typedef void (*row_transform_1d_lbd_8_neon)(const int16x8_t *input,
+ int32_t *output, int stride,
+ int cos_bit);
+
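+// Per-TX_TYPE kernel tables. FLIPADST entries reuse the ADST kernels; the
+// vertical and horizontal flips are performed by the 2D wrappers via
+// ud_adjust_input_and_stride() and flip_buf_*_neon(). NULL entries fall back
+// to the C implementation.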
+static const col_transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_col_neon, // DCT_DCT
+ fadst4x8_col_neon, // ADST_DCT
+ fdct4x8_col_neon, // DCT_ADST
+ fadst4x8_col_neon, // ADST_ADST
+ fadst4x8_col_neon, // FLIPADST_DCT
+ fdct4x8_col_neon, // DCT_FLIPADST
+ fadst4x8_col_neon, // FLIPADST_FLIPADST
+ fadst4x8_col_neon, // ADST_FLIPADST
+ fadst4x8_col_neon, // FLIPADST_ADST
+ fidentity4x8_col_neon, // IDTX
+ fdct4x8_col_neon, // V_DCT
+ fidentity4x8_col_neon, // H_DCT
+ fadst4x8_col_neon, // V_ADST
+ fidentity4x8_col_neon, // H_ADST
+ fadst4x8_col_neon, // V_FLIPADST
+ fidentity4x8_col_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_row_neon, // DCT_DCT
+ fdct8x4_row_neon, // ADST_DCT
+ fadst8x4_row_neon, // DCT_ADST
+ fadst8x4_row_neon, // ADST_ADST
+ fdct8x4_row_neon, // FLIPADST_DCT
+ fadst8x4_row_neon, // DCT_FLIPADST
+ fadst8x4_row_neon, // FLIPADST_FLIPADST
+ fadst8x4_row_neon, // ADST_FLIPADST
+ fadst8x4_row_neon, // FLIPADST_ADST
+ fidentity8x4_row_neon, // IDTX
+ fidentity8x4_row_neon, // V_DCT
+ fdct8x4_row_neon, // H_DCT
+ fidentity8x4_row_neon, // V_ADST
+ fadst8x4_row_neon, // H_ADST
+ fidentity8x4_row_neon, // V_FLIPADST
+ fadst8x4_row_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_row_rect_neon, // DCT_DCT
+ fdct8x4_row_rect_neon, // ADST_DCT
+ fadst8x4_row_rect_neon, // DCT_ADST
+ fadst8x4_row_rect_neon, // ADST_ADST
+ fdct8x4_row_rect_neon, // FLIPADST_DCT
+ fadst8x4_row_rect_neon, // DCT_FLIPADST
+ fadst8x4_row_rect_neon, // FLIPADST_FLIPADST
+ fadst8x4_row_rect_neon, // ADST_FLIPADST
+ fadst8x4_row_rect_neon, // FLIPADST_ADST
+ fidentity8x4_row_rect_neon, // IDTX
+ fidentity8x4_row_rect_neon, // V_DCT
+ fdct8x4_row_rect_neon, // H_DCT
+ fidentity8x4_row_rect_neon, // V_ADST
+ fadst8x4_row_rect_neon, // H_ADST
+ fidentity8x4_row_rect_neon, // V_FLIPADST
+ fadst8x4_row_rect_neon // H_FLIPADST
+};
+
+static const col_transform_1d_lbd_8_neon col_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_col_neon, // DCT_DCT
+ fadst8x4_col_neon, // ADST_DCT
+ fdct8x4_col_neon, // DCT_ADST
+ fadst8x4_col_neon, // ADST_ADST
+ fadst8x4_col_neon, // FLIPADST_DCT
+ fdct8x4_col_neon, // DCT_FLIPADST
+ fadst8x4_col_neon, // FLIPADST_FLIPADST
+ fadst8x4_col_neon, // ADST_FLIPADST
+ fadst8x4_col_neon, // FLIPADST_ADST
+ fidentity8x4_col_neon, // IDTX
+ fdct8x4_col_neon, // V_DCT
+ fidentity8x4_col_neon, // H_DCT
+ fadst8x4_col_neon, // V_ADST
+ fidentity8x4_col_neon, // H_ADST
+ fadst8x4_col_neon, // V_FLIPADST
+ fidentity8x4_col_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_4_neon row_rect_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_row_rect_neon, // DCT_DCT
+ fdct4x8_row_rect_neon, // ADST_DCT
+ fadst4x8_row_rect_neon, // DCT_ADST
+ fadst4x8_row_rect_neon, // ADST_ADST
+ fdct4x8_row_rect_neon, // FLIPADST_DCT
+ fadst4x8_row_rect_neon, // DCT_FLIPADST
+ fadst4x8_row_rect_neon, // FLIPADST_FLIPADST
+ fadst4x8_row_rect_neon, // ADST_FLIPADST
+ fadst4x8_row_rect_neon, // FLIPADST_ADST
+ fidentity4x8_row_rect_neon, // IDTX
+ fidentity4x8_row_rect_neon, // V_DCT
+ fdct4x8_row_rect_neon, // H_DCT
+ fidentity4x8_row_rect_neon, // V_ADST
+ fadst4x8_row_rect_neon, // H_ADST
+ fidentity4x8_row_rect_neon, // V_FLIPADST
+ fadst4x8_row_rect_neon // H_FLIPADST
+};
+
+static const col_transform_1d_lbd_8_neon col_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_col_neon, // DCT_DCT
+ fadst8x8_col_neon, // ADST_DCT
+ fdct8x8_col_neon, // DCT_ADST
+ fadst8x8_col_neon, // ADST_ADST
+ fadst8x8_col_neon, // FLIPADST_DCT
+ fdct8x8_col_neon, // DCT_FLIPADST
+ fadst8x8_col_neon, // FLIPADST_FLIPADST
+ fadst8x8_col_neon, // ADST_FLIPADST
+ fadst8x8_col_neon, // FLIPADST_ADST
+ fidentity8x8_col_neon, // IDTX
+ fdct8x8_col_neon, // V_DCT
+ fidentity8x8_col_neon, // H_DCT
+ fadst8x8_col_neon, // V_ADST
+ fidentity8x8_col_neon, // H_ADST
+ fadst8x8_col_neon, // V_FLIPADST
+  fidentity8x8_col_neon   // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_row_neon, // DCT_DCT
+ fdct8x8_row_neon, // ADST_DCT
+ fadst8x8_row_neon, // DCT_ADST
+ fadst8x8_row_neon, // ADST_ADST
+ fdct8x8_row_neon, // FLIPADST_DCT
+ fadst8x8_row_neon, // DCT_FLIPADST
+ fadst8x8_row_neon, // FLIPADST_FLIPADST
+ fadst8x8_row_neon, // ADST_FLIPADST
+ fadst8x8_row_neon, // FLIPADST_ADST
+ fidentity8x8_row_neon, // IDTX
+ fidentity8x8_row_neon, // V_DCT
+ fdct8x8_row_neon, // H_DCT
+ fidentity8x8_row_neon, // V_ADST
+ fadst8x8_row_neon, // H_ADST
+ fidentity8x8_row_neon, // V_FLIPADST
+ fadst8x8_row_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_row_rect_neon, // DCT_DCT
+ fdct8x8_row_rect_neon, // ADST_DCT
+ fadst8x8_row_rect_neon, // DCT_ADST
+ fadst8x8_row_rect_neon, // ADST_ADST
+ fdct8x8_row_rect_neon, // FLIPADST_DCT
+ fadst8x8_row_rect_neon, // DCT_FLIPADST
+ fadst8x8_row_rect_neon, // FLIPADST_FLIPADST
+ fadst8x8_row_rect_neon, // ADST_FLIPADST
+ fadst8x8_row_rect_neon, // FLIPADST_ADST
+ fidentity8x8_row_rect_neon, // IDTX
+ fidentity8x8_row_rect_neon, // V_DCT
+ fdct8x8_row_rect_neon, // H_DCT
+ fidentity8x8_row_rect_neon, // V_ADST
+ fadst8x8_row_rect_neon, // H_ADST
+ fidentity8x8_row_rect_neon, // V_FLIPADST
+ fadst8x8_row_rect_neon // H_FLIPADST
+};
+
+static const col_transform_1d_lbd_4_neon col_txfm4x16_arr[TX_TYPES] = {
+ fdct4x16_col_neon, // DCT_DCT
+ fadst4x16_col_neon, // ADST_DCT
+ fdct4x16_col_neon, // DCT_ADST
+ fadst4x16_col_neon, // ADST_ADST
+ fadst4x16_col_neon, // FLIPADST_DCT
+ fdct4x16_col_neon, // DCT_FLIPADST
+ fadst4x16_col_neon, // FLIPADST_FLIPADST
+ fadst4x16_col_neon, // ADST_FLIPADST
+ fadst4x16_col_neon, // FLIPADST_ADST
+ fidentity4x16_col_neon, // IDTX
+ fdct4x16_col_neon, // V_DCT
+ fidentity4x16_col_neon, // H_DCT
+ fadst4x16_col_neon, // V_ADST
+ fidentity4x16_col_neon, // H_ADST
+ fadst4x16_col_neon, // V_FLIPADST
+ fidentity4x16_col_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_4_neon row_txfm4x16_arr[TX_TYPES] = {
+ fdct4x16_row_neon, // DCT_DCT
+ fdct4x16_row_neon, // ADST_DCT
+ fadst4x16_row_neon, // DCT_ADST
+ fadst4x16_row_neon, // ADST_ADST
+ fdct4x16_row_neon, // FLIPADST_DCT
+ fadst4x16_row_neon, // DCT_FLIPADST
+ fadst4x16_row_neon, // FLIPADST_FLIPADST
+ fadst4x16_row_neon, // ADST_FLIPADST
+ fadst4x16_row_neon, // FLIPADST_ADST
+ fidentity4x16_row_neon, // IDTX
+ fidentity4x16_row_neon, // V_DCT
+ fdct4x16_row_neon, // H_DCT
+ fidentity4x16_row_neon, // V_ADST
+ fadst4x16_row_neon, // H_ADST
+ fidentity4x16_row_neon, // V_FLIPADST
+ fadst4x16_row_neon // H_FLIPADST
+};
+
+static const col_transform_1d_lbd_8_neon col_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_col_neon, // DCT_DCT
+ fadst8x16_col_neon, // ADST_DCT
+ fdct8x16_col_neon, // DCT_ADST
+ fadst8x16_col_neon, // ADST_ADST
+ fadst8x16_col_neon, // FLIPADST_DCT
+ fdct8x16_col_neon, // DCT_FLIPADST
+ fadst8x16_col_neon, // FLIPADST_FLIPADST
+ fadst8x16_col_neon, // ADST_FLIPADST
+ fadst8x16_col_neon, // FLIPADST_ADST
+ fidentity8x16_col_neon, // IDTX
+ fdct8x16_col_neon, // V_DCT
+ fidentity8x16_col_neon, // H_DCT
+ fadst8x16_col_neon, // V_ADST
+ fidentity8x16_col_neon, // H_ADST
+ fadst8x16_col_neon, // V_FLIPADST
+ fidentity8x16_col_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_row_neon, // DCT_DCT
+ fdct8x16_row_neon, // ADST_DCT
+ fadst8x16_row_neon, // DCT_ADST
+ fadst8x16_row_neon, // ADST_ADST
+ fdct8x16_row_neon, // FLIPADST_DCT
+ fadst8x16_row_neon, // DCT_FLIPADST
+ fadst8x16_row_neon, // FLIPADST_FLIPADST
+ fadst8x16_row_neon, // ADST_FLIPADST
+ fadst8x16_row_neon, // FLIPADST_ADST
+ fidentity8x16_row_neon, // IDTX
+ fidentity8x16_row_neon, // V_DCT
+ fdct8x16_row_neon, // H_DCT
+ fidentity8x16_row_neon, // V_ADST
+ fadst8x16_row_neon, // H_ADST
+ fidentity8x16_row_neon, // V_FLIPADST
+ fadst8x16_row_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_row_rect_neon, // DCT_DCT
+ fdct8x16_row_rect_neon, // ADST_DCT
+ fadst8x16_row_rect_neon, // DCT_ADST
+ fadst8x16_row_rect_neon, // ADST_ADST
+ fdct8x16_row_rect_neon, // FLIPADST_DCT
+ fadst8x16_row_rect_neon, // DCT_FLIPADST
+ fadst8x16_row_rect_neon, // FLIPADST_FLIPADST
+ fadst8x16_row_rect_neon, // ADST_FLIPADST
+ fadst8x16_row_rect_neon, // FLIPADST_ADST
+ fidentity8x16_row_rect_neon, // IDTX
+ fidentity8x16_row_rect_neon, // V_DCT
+ fdct8x16_row_rect_neon, // H_DCT
+ fidentity8x16_row_rect_neon, // V_ADST
+ fadst8x16_row_rect_neon, // H_ADST
+ fidentity8x16_row_rect_neon, // V_FLIPADST
+ fadst8x16_row_rect_neon // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_row_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_row_neon, // IDTX
+ fidentity8x32_row_neon, // V_DCT
+ fdct8x32_row_neon, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_row_rect_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_row_rect_neon, // IDTX
+ fidentity8x32_row_rect_neon, // V_DCT
+ fdct8x32_row_rect_neon, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const col_transform_1d_lbd_8_neon col_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_col_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_col_neon, // IDTX
+ fdct8x32_col_neon, // V_DCT
+ fidentity8x32_col_neon, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
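+// 2D forward transforms: a column pass over the input block, a transpose,
+// then a row pass that writes out 32-bit coefficients. The 4x4 case is fully
+// unrolled per TX_TYPE rather than table-driven.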
+static void lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+
+ int16x4_t buf0[4], buf1[4];
+ switch (tx_type) {
+ case DCT_DCT:
+ fdct4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fdct4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case ADST_DCT:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fdct4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case DCT_ADST:
+ fdct4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fadst4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case ADST_ADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fadst4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case FLIPADST_DCT:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fdct4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case DCT_FLIPADST:
+ fdct4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ flip_buf_4_neon(buf1, buf0, 4);
+ fadst4x4_row_neon(buf0, output, 4, 13);
+ break;
+ case FLIPADST_FLIPADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ flip_buf_4_neon(buf1, buf0, 4);
+ fadst4x4_row_neon(buf0, output, 4, 13);
+ break;
+ case ADST_FLIPADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ flip_buf_4_neon(buf1, buf0, 4);
+ fadst4x4_row_neon(buf0, output, 4, 13);
+ break;
+ case FLIPADST_ADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fadst4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case IDTX:
+ fidentity4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fidentity4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case V_DCT:
+ fdct4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fidentity4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case H_DCT:
+ fidentity4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fdct4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case V_ADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fidentity4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case H_ADST:
+ fidentity4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fadst4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case V_FLIPADST:
+ fadst4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ fidentity4x4_row_neon(buf1, output, 4, 13);
+ break;
+ case H_FLIPADST:
+ fidentity4x4_col_neon(input, buf0, stride, 13);
+ transpose_arrays_s16_4x4(buf0, buf1);
+ flip_buf_4_neon(buf1, buf0, 4);
+ fadst4x4_row_neon(buf0, output, 4, 13);
+ break;
+ }
+}
+
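+// The 2:1 rectangular sizes route their row pass through the _row_rect_
+// kernels so that the rectangular scaling is folded into the final store.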
+static void lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x4_t buf0[8];
+ int16x8_t buf1[8];
+ const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x8_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x4_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+ col_txfm(input, buf0, stride, 13);
+ shift_right_1_round_s16_x4(buf0, buf0, 8);
+ transpose_arrays_s16_4x8(buf0, buf1);
+
+ if (lr_flip) {
+ int16x8_t buf2[8];
+ flip_buf_8_neon(buf1, buf2, 4);
+ row_txfm(buf2, output, 8, 13);
+ } else {
+ row_txfm(buf1, output, 8, 13);
+ }
+}
+
+static void lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x4_t buf0[16];
+ int16x8_t buf1[16];
+ const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x16_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x4_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+ col_txfm(input, buf0, stride, 13);
+ shift_right_1_round_s16_x4(buf0, buf0, 16);
+ transpose_arrays_s16_4x8(buf0, buf1);
+ transpose_arrays_s16_4x8(buf0 + 8, buf1 + 8);
+
+ for (int i = 0; i < 2; i++) {
+ if (lr_flip) {
+ int16x8_t buf2[16];
+ flip_buf_8_neon(buf1 + 8 * i, buf2, 4);
+ row_txfm(buf2, output + 8 * i, 16, 12);
+ } else {
+ int16x8_t *buf = buf1 + 8 * i;
+ row_txfm(buf, output + 8 * i, 16, 12);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[8];
+ int16x4_t buf1[8];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
+ const row_transform_1d_lbd_4_neon row_txfm = row_rect_txfm4x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+ col_txfm(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 4);
+ transpose_arrays_s16_8x4(buf0, buf1);
+
+ if (lr_flip) {
+ int16x4_t buf2[8];
+ flip_buf_4_neon(buf1, buf2, 8);
+ row_txfm(buf2, output, 4, 13);
+ } else {
+ row_txfm(buf1, output, 4, 13);
+ }
+}
+
+static void lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+ int16x8_t buf0[8], buf1[8];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ fdct8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fdct8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case ADST_DCT:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fdct8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case DCT_ADST:
+ fdct8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fadst8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case ADST_ADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fadst8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case FLIPADST_DCT:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fdct8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case DCT_FLIPADST:
+ fdct8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ flip_buf_8_neon(buf1, buf0, 8);
+ fadst8x8_row_neon(buf0, output, 8, 13);
+ break;
+ case FLIPADST_FLIPADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ flip_buf_8_neon(buf1, buf0, 8);
+ fadst8x8_row_neon(buf0, output, 8, 13);
+ break;
+ case ADST_FLIPADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ flip_buf_8_neon(buf1, buf0, 8);
+ fadst8x8_row_neon(buf0, output, 8, 13);
+ break;
+ case FLIPADST_ADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fadst8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case IDTX:
+ fidentity8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fidentity8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case V_DCT:
+ fdct8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fidentity8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case H_DCT:
+ fidentity8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fdct8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case V_ADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fidentity8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case H_ADST:
+ fidentity8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fadst8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case V_FLIPADST:
+ fadst8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ fidentity8x8_row_neon(buf1, output, 8, 13);
+ break;
+ case H_FLIPADST:
+ fidentity8x8_col_neon(input, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ flip_buf_8_neon(buf1, buf0, 8);
+ fadst8x8_row_neon(buf0, output, 8, 13);
+ break;
+ }
+}
+
+static void lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[16], buf1[16];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+ col_txfm(input, buf0, stride, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 16);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
+
+ for (int i = 0; i < 2; i++) {
+ if (lr_flip) {
+ flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
+ row_txfm(buf0, output + 8 * i, 16, 13);
+ } else {
+ int16x8_t *buf = buf1 + 8 * i;
+ row_txfm(buf, output + 8 * i, 16, 13);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[32], buf1[32];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
+ col_txfm(input, buf0, stride, 12);
+ shift_right_2_round_s16_x8(buf0, buf0, 32);
+ transpose_arrays_s16_8x8(buf0, buf1);
+ transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
+ transpose_arrays_s16_8x8(buf0 + 16, buf1 + 16);
+ transpose_arrays_s16_8x8(buf0 + 24, buf1 + 24);
+
+ for (int i = 0; i < 4; i++) {
+ if (lr_flip) {
+ flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
+ row_txfm(buf0, output + 8 * i, 32, 12);
+ } else {
+ int16x8_t *buf = buf1 + 8 * i;
+ row_txfm(buf, output + 8 * i, 32, 12);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[16];
+ int16x4_t buf1[16];
+ int16x4_t buf2[16];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
+ const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+ for (int i = 0; i < 2; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 13);
+ shift_right_1_round_s16_x8(buf0, buf0, 4);
+ transpose_arrays_s16_8x4(buf0, buf1 + 8 * i);
+ }
+
+ if (lr_flip) {
+ flip_buf_4_neon(buf1, buf2, 16);
+ row_txfm(buf2, output, 4, 13);
+ } else {
+ row_txfm(buf1, output, 4, 13);
+ }
+}
+
+static void lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[16], buf1[16];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+ for (int i = 0; i < 2; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1 + 8 * i);
+ }
+
+ if (lr_flip) {
+ flip_buf_8_neon(buf1, buf0, 16);
+ row_txfm(buf0, output, 8, 13);
+ } else {
+ row_txfm(buf1, output, 8, 13);
+ }
+}
+
+static void lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[16], buf1[32];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+ for (int i = 0; i < 2; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 16);
+ transpose_arrays_s16_8x8(buf0, buf1 + 0 * 16 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 16 + 8 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ if (lr_flip) {
+ flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
+ row_txfm(buf0, output + 8 * i, 16, 12);
+ } else {
+ int16x8_t *buf = buf1 + 16 * i;
+ row_txfm(buf, output + 8 * i, 16, 12);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[32], buf1[64];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
+
+ if (col_txfm == NULL || row_txfm == NULL) {
+ av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
+ return;
+ }
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
+ for (int i = 0; i < 2; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 12);
+ shift_right_4_round_s16_x8(buf0, buf0, 32);
+ transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 16 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 16 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 16 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 16 + 8 * i);
+ }
+
+ for (int i = 0; i < 4; i++) {
+ if (lr_flip) {
+ flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
+ row_txfm(buf0, output + 8 * i, 32, 13);
+ } else {
+ int16x8_t *buf = buf1 + 16 * i;
+ row_txfm(buf, output + 8 * i, 32, 13);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[32], buf1[32];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm == NULL || row_txfm == NULL) {
+    av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
+ return;
+ }
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+ for (int i = 0; i < 4; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 8);
+ transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
+ }
+
+ if (lr_flip) {
+ flip_buf_8_neon(buf1, buf0, 32);
+ row_txfm(buf0, output, 8, 12);
+ } else {
+ row_txfm(buf1, output, 8, 12);
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[32], buf1[64];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x32_arr[tx_type];
+
+ if (col_txfm == NULL || row_txfm == NULL) {
+ av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+ return;
+ }
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+ for (int i = 0; i < 4; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 13);
+ shift_right_4_round_s16_x8(buf0, buf0, 16);
+ transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 32 + 8 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ if (lr_flip) {
+ flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
+ row_txfm(buf0, output + 8 * i, 16, 13);
+ } else {
+ int16x8_t *buf = buf1 + 32 * i;
+ row_txfm(buf, output + 8 * i, 16, 13);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[32], buf1[128];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+ const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm == NULL || row_txfm == NULL) {
+ av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
+ return;
+ }
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
+ for (int i = 0; i < 4; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 12);
+ shift_right_4_round_s16_x8(buf0, buf0, 32);
+ transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 32 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 32 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 32 + 8 * i);
+ transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 32 + 8 * i);
+ }
+
+ for (int i = 0; i < 4; i++) {
+ if (lr_flip) {
+ flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
+ row_txfm(buf0, output + 8 * i, 32, 12);
+ } else {
+ int16x8_t *buf = buf1 + 32 * i;
+ row_txfm(buf, output + 8 * i, 32, 12);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ int16x8_t buf0[64], buf1[128];
+ const transform_1d_lbd_8_neon col_txfm = fdct8x16_neon;
+ const transform_1d_lbd_8_neon row_txfm = fdct8x64_neon;
+
+ for (int i = 0; i < 8; i++) {
+ load_buffer_s16_x8(input + 8 * i, stride, buf0, 16);
+ shift_left_2_s16_x8(buf0, buf0, 16);
+ col_txfm(buf0, buf0, 13);
+ shift_right_4_round_s16_x8(buf0, buf0, 16);
+ for (int j = 0; j < 2; ++j) {
+ transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < 2; i++) {
+ int16x8_t *buf = buf1 + 64 * i;
+ row_txfm(buf, buf, 12);
+ store_buffer_s16_x8(buf, output + 8 * i, 16, 32);
+ }
+ // Zero out the bottom 16x32 area.
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
+static void lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ int16x8_t buf0[64], buf1[128];
+ const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
+ const transform_1d_lbd_8_neon row_txfm = fdct8x16_neon;
+
+ for (int i = 0; i < 2; i++) {
+ load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
+ col_txfm(buf0, buf0, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 64);
+ for (int j = 0; j < 8; ++j) {
+ transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 16 + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < 4; i++) {
+ int16x8_t *buf = buf1 + 16 * i;
+ row_txfm(buf, buf, 12);
+ store_buffer_s16_x8(buf, output + 8 * i, 32, 16);
+ }
+}
+
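+// 32-point DCT operating on 32-bit lanes, for second passes where 16-bit
+// intermediates would risk overflow: the 64-pixel wrappers below widen each
+// int16x8_t row into a pair of int32x4_t halves before the final pass.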
+static void fdct32_neon(const int32x4_t *input, int32x4_t *output,
+ int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+
+ int32x4_t buf0[32];
+ int32x4_t buf1[32];
+
+ // stage 1
+ butterfly_dct_pre_s32_x4(input, buf1, 32);
+
+ // stage 2
+ butterfly_dct_pre_s32_x4(buf1, buf0, 16);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[27], buf1[20], &buf0[27],
+ &buf0[20]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[26], buf1[21], &buf0[26],
+ &buf0[21]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[25], buf1[22], &buf0[25],
+ &buf0[22]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[24], buf1[23], &buf0[24],
+ &buf0[23]);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ butterfly_dct_pre_s32_x4(buf0, buf1, 8);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf0[13], buf0[10], &buf1[13],
+ &buf1[10]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf0[12], buf0[11], &buf1[12],
+ &buf1[11]);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 16);
+
+ // stage 4
+ butterfly_dct_pre_s32_x4(buf1, buf0, 4);
+ buf0[4] = buf1[4];
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf1[6], buf1[5], &buf0[6], &buf0[5]);
+ buf0[7] = buf1[7];
+ butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 8);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ butterfly_s32_s32_x4_0112_neon(cospi16, buf1[29], buf1[18], &buf0[29],
+ &buf0[18]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, buf1[28], buf1[19], &buf0[28],
+ &buf0[19]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, buf1[27], buf1[20], &buf0[27],
+ &buf0[20]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, buf1[26], buf1[21], &buf0[26],
+ &buf0[21]);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ butterfly_s32_s32_x4_0112_neon(cospi32, buf0[0], buf0[1], &buf1[0], &buf1[1]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, buf0[3], buf0[2], &buf1[2], &buf1[3]);
+ butterfly_dct_post_s32_x4(buf0 + 4, buf0 + 4, buf1 + 4, 4);
+ buf1[8] = buf0[8];
+ butterfly_s32_s32_x4_0112_neon(cospi16, buf0[14], buf0[9], &buf1[14],
+ &buf1[9]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, buf0[13], buf0[10], &buf1[13],
+ &buf1[10]);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 8);
+ butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 8);
+
+ // stage 6
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ butterfly_s32_s32_x4_0112_neon(cospi8, buf1[7], buf1[4], &buf0[4], &buf0[7]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, buf1[6], buf1[5], &buf0[5], &buf0[6]);
+ butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 4);
+ butterfly_dct_post_s32_x4(buf1 + 12, buf1 + 12, buf0 + 12, 4);
+ buf0[16] = buf1[16];
+ butterfly_s32_s32_x4_0112_neon(cospi8, buf1[30], buf1[17], &buf0[30],
+ &buf0[17]);
+ butterfly_s32_s32_x4_1223_neon(cospi8, buf1[29], buf1[18], &buf0[29],
+ &buf0[18]);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ butterfly_s32_s32_x4_1003_neon(cospi24, buf1[26], buf1[21], &buf0[26],
+ &buf0[21]);
+ butterfly_s32_s32_x4_0332_neon(cospi24, buf1[25], buf1[22], &buf0[25],
+ &buf0[22]);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ butterfly_s32_s32_x4_0112_neon(cospi4, buf0[15], buf0[8], &buf1[8],
+ &buf1[15]);
+ butterfly_s32_s32_x4_1003_neon(cospi28, buf0[14], buf0[9], &buf1[9],
+ &buf1[14]);
+ butterfly_s32_s32_x4_0112_neon(cospi20, buf0[13], buf0[10], &buf1[10],
+ &buf1[13]);
+ butterfly_s32_s32_x4_1003_neon(cospi12, buf0[12], buf0[11], &buf1[11],
+ &buf1[12]);
+ butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 4);
+ butterfly_dct_post_s32_x4(buf0 + 20, buf0 + 20, buf1 + 20, 4);
+ butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 4);
+ butterfly_dct_post_s32_x4(buf0 + 28, buf0 + 28, buf1 + 28, 4);
+
+ // stage 8
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ butterfly_s32_s32_x4_0112_neon(cospi2, buf1[31], buf1[16], &buf0[16],
+ &buf0[31]);
+ butterfly_s32_s32_x4_1003_neon(cospi30, buf1[30], buf1[17], &buf0[17],
+ &buf0[30]);
+ butterfly_s32_s32_x4_0112_neon(cospi18, buf1[29], buf1[18], &buf0[18],
+ &buf0[29]);
+ butterfly_s32_s32_x4_1003_neon(cospi14, buf1[28], buf1[19], &buf0[19],
+ &buf0[28]);
+ butterfly_s32_s32_x4_0112_neon(cospi10, buf1[27], buf1[20], &buf0[20],
+ &buf0[27]);
+ butterfly_s32_s32_x4_1003_neon(cospi22, buf1[26], buf1[21], &buf0[21],
+ &buf0[26]);
+ butterfly_s32_s32_x4_0112_neon(cospi26, buf1[25], buf1[22], &buf0[22],
+ &buf0[25]);
+ butterfly_s32_s32_x4_1003_neon(cospi6, buf1[24], buf1[23], &buf0[23],
+ &buf0[24]);
+
+ // stage 9
+ output[0] = buf0[0];
+ output[1] = buf0[16];
+ output[2] = buf0[8];
+ output[3] = buf0[24];
+ output[4] = buf0[4];
+ output[5] = buf0[20];
+ output[6] = buf0[12];
+ output[7] = buf0[28];
+ output[8] = buf0[2];
+ output[9] = buf0[18];
+ output[10] = buf0[10];
+ output[11] = buf0[26];
+ output[12] = buf0[6];
+ output[13] = buf0[22];
+ output[14] = buf0[14];
+ output[15] = buf0[30];
+ output[16] = buf0[1];
+ output[17] = buf0[17];
+ output[18] = buf0[9];
+ output[19] = buf0[25];
+ output[20] = buf0[5];
+ output[21] = buf0[21];
+ output[22] = buf0[13];
+ output[23] = buf0[29];
+ output[24] = buf0[3];
+ output[25] = buf0[19];
+ output[26] = buf0[11];
+ output[27] = buf0[27];
+ output[28] = buf0[7];
+ output[29] = buf0[23];
+ output[30] = buf0[15];
+ output[31] = buf0[31];
+}
+
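+// 64-point DCT on 32-bit lanes. Only the 32 low-frequency outputs are
+// produced, since AV1 discards the upper half of 64-point forward
+// transforms.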
+static void fdct64_neon(const int32x4_t *input, int32x4_t *output,
+ int cos_bit) {
+ const int16_t *cospi = cospi_arr_q13(cos_bit);
+
+ const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+ const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+ const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+ const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+ const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+ const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+ const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+ const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+ const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
+ const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
+ const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
+ const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
+ const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
+ const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
+ const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
+ const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);
+
+ const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+ const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+ const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+ const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+ const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+ const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+ const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+ const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+ const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+ const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+ const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+ const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+ const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+ const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+ const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+ const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+ const int16x4_t cospi1 = vget_low_s16(cospi1_3);
+ const int16x4_t cospi3 = vget_high_s16(cospi1_3);
+ const int16x4_t cospi5 = vget_low_s16(cospi5_7);
+ const int16x4_t cospi7 = vget_high_s16(cospi5_7);
+ const int16x4_t cospi9 = vget_low_s16(cospi9_11);
+ const int16x4_t cospi11 = vget_high_s16(cospi9_11);
+ const int16x4_t cospi13 = vget_low_s16(cospi13_15);
+ const int16x4_t cospi15 = vget_high_s16(cospi13_15);
+ const int16x4_t cospi17 = vget_low_s16(cospi17_19);
+ const int16x4_t cospi19 = vget_high_s16(cospi17_19);
+ const int16x4_t cospi21 = vget_low_s16(cospi21_23);
+ const int16x4_t cospi23 = vget_high_s16(cospi21_23);
+ const int16x4_t cospi25 = vget_low_s16(cospi25_27);
+ const int16x4_t cospi27 = vget_high_s16(cospi25_27);
+ const int16x4_t cospi29 = vget_low_s16(cospi29_31);
+ const int16x4_t cospi31 = vget_high_s16(cospi29_31);
+
+ // stage 1
+ int32x4_t x1[64];
+ butterfly_dct_pre_s32_x4(input, x1, 64);
+
+ // stage 2
+ int32x4_t x2[64];
+ butterfly_dct_pre_s32_x4(x1, x2, 32);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]);
+
+ // stage 3
+ int32x4_t x3[64];
+ butterfly_dct_pre_s32_x4(x2, x3, 16);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]);
+ butterfly_dct_post_s32_x4(x1 + 32, x2 + 32, x3 + 32, 32);
+
+ // stage 4
+ int32x4_t x4[64];
+ butterfly_dct_pre_s32_x4(x3, x4, 8);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]);
+ butterfly_dct_post_s32_x4(x2 + 16, x3 + 16, x4 + 16, 16);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]);
+
+ // stage 5
+ int32x4_t x5[64];
+ butterfly_dct_pre_s32_x4(x4, x5, 4);
+ butterfly_s32_s32_x4_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]);
+ butterfly_dct_post_s32_x4(x3 + 8, x4 + 8, x5 + 8, 8);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]);
+ butterfly_dct_post_s32_x4(x3 + 32, x4 + 32, x5 + 32, 16);
+ butterfly_dct_post_s32_x4(x3 + 48, x4 + 48, x5 + 48, 16);
+
+ // stage 6
+ int32x4_t x6[64];
+ butterfly_s32_s32_x4_0112_neon(cospi32, x5[0], x5[1], &x6[0], &x6[1]);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]);
+ butterfly_dct_post_s32_x4(x4 + 4, x5 + 4, x6 + 4, 4);
+ butterfly_s32_s32_x4_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]);
+ butterfly_s32_s32_x4_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]);
+ butterfly_dct_post_s32_x4(x4 + 16, x5 + 16, x6 + 16, 8);
+ butterfly_dct_post_s32_x4(x4 + 24, x5 + 24, x6 + 24, 8);
+ butterfly_s32_s32_x4_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]);
+ butterfly_s32_s32_x4_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]);
+ butterfly_s32_s32_x4_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]);
+ butterfly_s32_s32_x4_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]);
+ butterfly_s32_s32_x4_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]);
+ butterfly_s32_s32_x4_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]);
+
+ // stage 7
+ int32x4_t x7[64];
+ butterfly_s32_s32_x4_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]);
+ butterfly_dct_post_s32_x4(x5 + 8, x6 + 8, x7 + 8, 4);
+ butterfly_dct_post_s32_x4(x5 + 12, x6 + 12, x7 + 12, 4);
+ butterfly_s32_s32_x4_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]);
+ butterfly_s32_s32_x4_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]);
+ butterfly_s32_s32_x4_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]);
+ butterfly_s32_s32_x4_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]);
+ butterfly_dct_post_s32_x4(x5 + 32, x6 + 32, x7 + 32, 8);
+ butterfly_dct_post_s32_x4(x5 + 40, x6 + 40, x7 + 40, 8);
+ butterfly_dct_post_s32_x4(x5 + 48, x6 + 48, x7 + 48, 8);
+ butterfly_dct_post_s32_x4(x5 + 56, x6 + 56, x7 + 56, 8);
+
+ // stage 8
+ int32x4_t x8[64];
+ butterfly_s32_s32_x4_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]);
+ butterfly_s32_s32_x4_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]);
+ butterfly_s32_s32_x4_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]);
+ butterfly_s32_s32_x4_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]);
+ butterfly_dct_post_s32_x4(x6 + 16, x7 + 16, x8 + 16, 4);
+ butterfly_dct_post_s32_x4(x6 + 20, x7 + 20, x8 + 20, 4);
+ butterfly_dct_post_s32_x4(x6 + 24, x7 + 24, x8 + 24, 4);
+ butterfly_dct_post_s32_x4(x6 + 28, x7 + 28, x8 + 28, 4);
+ butterfly_s32_s32_x4_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]);
+ butterfly_s32_s32_x4_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]);
+ butterfly_s32_s32_x4_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]);
+ butterfly_s32_s32_x4_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]);
+ butterfly_s32_s32_x4_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]);
+ butterfly_s32_s32_x4_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]);
+ butterfly_s32_s32_x4_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]);
+ butterfly_s32_s32_x4_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]);
+
+ // stage 9
+ int32x4_t x9[64];
+ butterfly_s32_s32_x4_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]);
+ butterfly_s32_s32_x4_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]);
+ butterfly_s32_s32_x4_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]);
+ butterfly_s32_s32_x4_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]);
+ butterfly_s32_s32_x4_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]);
+ butterfly_s32_s32_x4_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]);
+ butterfly_s32_s32_x4_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]);
+ butterfly_s32_s32_x4_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]);
+ butterfly_dct_post_s32_x4(x7 + 32, x8 + 32, x9 + 32, 4);
+ butterfly_dct_post_s32_x4(x7 + 36, x8 + 36, x9 + 36, 4);
+ butterfly_dct_post_s32_x4(x7 + 40, x8 + 40, x9 + 40, 4);
+ butterfly_dct_post_s32_x4(x7 + 44, x8 + 44, x9 + 44, 4);
+ butterfly_dct_post_s32_x4(x7 + 48, x8 + 48, x9 + 48, 4);
+ butterfly_dct_post_s32_x4(x7 + 52, x8 + 52, x9 + 52, 4);
+ butterfly_dct_post_s32_x4(x7 + 56, x8 + 56, x9 + 56, 4);
+ butterfly_dct_post_s32_x4(x7 + 60, x8 + 60, x9 + 60, 4);
+
+ // stage 10
+ int32x4_t x10[64];
+ butterfly_s32_s32_x4_0112_neon(cospi1, x9[63], x9[32], &x10[32], &x10[63]);
+ butterfly_s32_s32_x4_1003_neon(cospi31, x9[62], x9[33], &x10[33], &x10[62]);
+ butterfly_s32_s32_x4_0112_neon(cospi17, x9[61], x9[34], &x10[34], &x10[61]);
+ butterfly_s32_s32_x4_1003_neon(cospi15, x9[60], x9[35], &x10[35], &x10[60]);
+ butterfly_s32_s32_x4_0112_neon(cospi9, x9[59], x9[36], &x10[36], &x10[59]);
+ butterfly_s32_s32_x4_1003_neon(cospi23, x9[58], x9[37], &x10[37], &x10[58]);
+ butterfly_s32_s32_x4_0112_neon(cospi25, x9[57], x9[38], &x10[38], &x10[57]);
+ butterfly_s32_s32_x4_1003_neon(cospi7, x9[56], x9[39], &x10[39], &x10[56]);
+ butterfly_s32_s32_x4_0112_neon(cospi5, x9[55], x9[40], &x10[40], &x10[55]);
+ butterfly_s32_s32_x4_1003_neon(cospi27, x9[54], x9[41], &x10[41], &x10[54]);
+ butterfly_s32_s32_x4_0112_neon(cospi21, x9[53], x9[42], &x10[42], &x10[53]);
+ butterfly_s32_s32_x4_1003_neon(cospi11, x9[52], x9[43], &x10[43], &x10[52]);
+ butterfly_s32_s32_x4_0112_neon(cospi13, x9[51], x9[44], &x10[44], &x10[51]);
+ butterfly_s32_s32_x4_1003_neon(cospi19, x9[50], x9[45], &x10[45], &x10[50]);
+ butterfly_s32_s32_x4_0112_neon(cospi29, x9[49], x9[46], &x10[46], &x10[49]);
+ butterfly_s32_s32_x4_1003_neon(cospi3, x9[48], x9[47], &x10[47], &x10[48]);
+
+ // stage 11, only store into the low 32 output indices.
+ output[0] = x6[0];
+ output[1] = x10[32];
+ output[2] = x9[16];
+ output[3] = x10[48];
+ output[4] = x8[8];
+ output[5] = x10[40];
+ output[6] = x9[24];
+ output[7] = x10[56];
+ output[8] = x7[4];
+ output[9] = x10[36];
+ output[10] = x9[20];
+ output[11] = x10[52];
+ output[12] = x8[12];
+ output[13] = x10[44];
+ output[14] = x9[28];
+ output[15] = x10[60];
+ output[16] = x6[2];
+ output[17] = x10[34];
+ output[18] = x9[18];
+ output[19] = x10[50];
+ output[20] = x8[10];
+ output[21] = x10[42];
+ output[22] = x9[26];
+ output[23] = x10[58];
+ output[24] = x7[6];
+ output[25] = x10[38];
+ output[26] = x9[22];
+ output[27] = x10[54];
+ output[28] = x8[14];
+ output[29] = x10[46];
+ output[30] = x9[30];
+ output[31] = x10[62];
+}
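+
+// A rough scalar model of the rotation the butterfly helpers above perform
+// (illustrative sketch only; `bit` stands for the stage cos_bit and
+// round_shift() for the usual add-half-then-shift rounding):
+//
+//   out0 = round_shift(in0 * c0 + in1 * c1, bit);
+//   out1 = round_shift(in0 * c1 - in1 * c0, bit);
+//
+// with c0/c1 drawn from the cospi table; the _0112/_1003/_1223/_0332
+// suffixes appear to encode the operand ordering and sign pattern each
+// variant applies to the two products.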
+
+static void lowbd_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ int16x8_t buf0[64], buf1[512];
+ const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
+
+ for (int i = 0; i < 8; i++) {
+ load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
+ col_txfm(buf0, buf0, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 64);
+ for (int j = 0; j < 4; ++j) {
+ transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
+ }
+ }
+ for (int i = 0; i < 4; i++) {
+ int32x4_t bufA[64];
+ int32x4_t bufB[64];
+ int16x8_t *buf = buf1 + 64 * i;
+ for (int j = 0; j < 64; ++j) {
+ bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
+ bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
+ }
+ fdct64_neon(bufA, bufA, 10);
+ fdct64_neon(bufB, bufB, 10);
+ shift_right_2_round_s32_x4(bufA, bufA, 32);
+ shift_right_2_round_s32_x4(bufB, bufB, 32);
+ store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
+ }
+}
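+
+// The 64x64 path above is the standard separable decomposition, roughly
+// (illustrative pseudo-C, ignoring the 8-lane tiling):
+//
+//   cols = fdct64(each input column);    // 16-bit, cos_bit 13
+//   cols = round_shift(cols, 2);
+//   rows = fdct64(each transposed row);  // widened to 32-bit, cos_bit 10
+//   out  = round_shift(rows, 2);
+//
+// Only the low-frequency 32x32 corner is stored, since AV1 zeroes out the
+// upper coefficients of 64-point transforms.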
+
+static void lowbd_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int16x8_t buf0[64], buf1[256];
+ const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+
+ for (int i = 0; i < 8; i++) {
+ col_txfm(input + 8 * i, buf0, stride, 12);
+ shift_right_4_round_s16_x8(buf0, buf0, 32);
+ for (int j = 0; j < 4; ++j) {
+ transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
+ }
+ }
+ assert(tx_type == DCT_DCT);
+ for (int i = 0; i < 4; i++) {
+ int32x4_t bufA[64];
+ int32x4_t bufB[64];
+ int16x8_t *buf = buf1 + 64 * i;
+ for (int j = 0; j < 64; ++j) {
+ bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
+ bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
+ }
+ fdct64_neon(bufA, bufA, 11);
+ fdct64_neon(bufB, bufB, 11);
+ shift_right_2_round_s32_x4(bufA, bufA, 32);
+ shift_right_2_round_s32_x4(bufB, bufB, 32);
+ round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
+ round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
+ store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ int16x8_t buf0[64], buf1[256];
+ const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
+
+ for (int i = 0; i < 4; i++) {
+ load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
+ col_txfm(buf0, buf0, 13);
+ shift_right_2_round_s16_x8(buf0, buf0, 64);
+ for (int j = 0; j < 4; ++j) {
+ transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 32 + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < 4; i++) {
+ int32x4_t bufA[32];
+ int32x4_t bufB[32];
+ int16x8_t *buf = buf1 + 32 * i;
+ for (int j = 0; j < 32; ++j) {
+ bufA[j] = vmovl_s16(vget_low_s16(buf[j]));
+ bufB[j] = vmovl_s16(vget_high_s16(buf[j]));
+ }
+ fdct32_neon(bufA, bufA, 11);
+ fdct32_neon(bufB, bufB, 11);
+ shift_right_2_round_s32_x4(bufA, bufA, 32);
+ shift_right_2_round_s32_x4(bufB, bufB, 32);
+ round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
+ round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
+ store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
+ }
+}
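+
+// Note: the 64x32 and 32x64 paths end with round_shift_sqrt2_s32_s32_4xn_neon
+// because AV1 applies a sqrt(2)-based normalization to 2:1 rectangular
+// transforms; the extra rescale keeps the output on the same scale as the
+// C reference implementation.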
+
+static FwdTxfm2dFunc lowbd_fwd_txfm_func_ls[TX_SIZES_ALL] = {
+ lowbd_fwd_txfm2d_4x4_neon, // 4x4 transform
+ lowbd_fwd_txfm2d_8x8_neon, // 8x8 transform
+ lowbd_fwd_txfm2d_16x16_neon, // 16x16 transform
+ lowbd_fwd_txfm2d_32x32_neon, // 32x32 transform
+ lowbd_fwd_txfm2d_64x64_neon, // 64x64 transform
+ lowbd_fwd_txfm2d_4x8_neon, // 4x8 transform
+ lowbd_fwd_txfm2d_8x4_neon, // 8x4 transform
+ lowbd_fwd_txfm2d_8x16_neon, // 8x16 transform
+ lowbd_fwd_txfm2d_16x8_neon, // 16x8 transform
+ lowbd_fwd_txfm2d_16x32_neon, // 16x32 transform
+ lowbd_fwd_txfm2d_32x16_neon, // 32x16 transform
+ lowbd_fwd_txfm2d_32x64_neon, // 32x64 transform
+ lowbd_fwd_txfm2d_64x32_neon, // 64x32 transform
+ lowbd_fwd_txfm2d_4x16_neon, // 4x16 transform
+ lowbd_fwd_txfm2d_16x4_neon, // 16x4 transform
+ lowbd_fwd_txfm2d_8x32_neon, // 8x32 transform
+ lowbd_fwd_txfm2d_32x8_neon, // 32x8 transform
+ lowbd_fwd_txfm2d_16x64_neon, // 16x64 transform
+ lowbd_fwd_txfm2d_64x16_neon, // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ FwdTxfm2dFunc fwd_txfm2d_func = lowbd_fwd_txfm_func_ls[txfm_param->tx_size];
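+  // Lossless coding uses the 4x4 Walsh-Hadamard transform rather than the
+  // DCT/ADST family handled by the table above, so defer to the C code.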
+ if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
+ av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+ } else {
+ fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+ }
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c
new file mode 100644
index 0000000000..11d3def16b
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_highbd_quantize_neon.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/encoder/av1_quantize.h"
+
+static INLINE uint16x4_t quantize_4(const tran_low_t *coeff_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ int32x4_t v_quant_s32,
+ int32x4_t v_dequant_s32,
+ int32x4_t v_round_s32, int log_scale) {
+ const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+ const int32x4_t v_coeff_sign =
+ vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+ const int32x4_t v_log_scale = vdupq_n_s32(log_scale);
+ const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+ // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01])
+ const int32x4_t v_abs_coeff_scaled =
+ vshlq_s32(v_abs_coeff, vdupq_n_s32(1 + log_scale));
+ const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32);
+  // const int64_t tmp = v_mask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32),
+ vreinterpretq_s32_u32(v_mask));
+ // const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale));
+ const int32x4_t v_abs_qcoeff =
+ vqdmulhq_s32(vshlq_s32(v_tmp, v_log_scale), v_quant_s32);
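+  // vqdmulhq_s32 computes (a * b * 2) >> 32; with quant pre-shifted left by
+  // 15 and tmp shifted left by log_scale, this evaluates exactly
+  // (tmp * quant) >> (16 - log_scale).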
+ // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_qcoeff =
+ vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+ // vshlq_s32 will shift right if shift value is negative.
+ const int32x4_t v_abs_dqcoeff =
+ vshlq_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), vnegq_s32(v_log_scale));
+ // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_dqcoeff =
+ vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+ vst1q_s32(qcoeff_ptr, v_qcoeff);
+ vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+ // Used to find eob.
+ const uint32x4_t nz_qcoeff_mask = vcgtq_s32(v_abs_qcoeff, vdupq_n_s32(0));
+ return vmovn_u32(nz_qcoeff_mask);
+}
+
+static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
+ int16x8_t v_eobmax,
+ uint16x8_t v_mask) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
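+  // Using iscan + 1 means a nonzero coefficient at scan position i yields
+  // eob = i + 1, while an all-zero block yields eob = 0 (v_eobmax starts at
+  // -1 and only zeros are merged in).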
+ const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
+ return vmaxq_s16(v_eobmax, v_nz_iscan);
+}
+
+static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#if AOM_ARCH_AARCH64
+ return (uint16_t)vmaxvq_s16(v_eobmax);
+#else
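+  // Armv7 lacks a cross-lane max, so fold the vector in half repeatedly:
+  // 8 -> 4 lanes with vmax_s16, then 64-bit shifts bring the upper lanes
+  // down for two further pairwise max steps until lane 0 holds the result.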
+ const int16x4_t v_eobmax_3210 =
+ vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+ return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+#endif
+}
+
+void av1_highbd_quantize_fp_neon(
+ const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale) {
+ (void)scan;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
+ const int16x4_t v_quant = vld1_s16(quant_ptr);
+ const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+ const int16x4_t v_zero = vdup_n_s16(0);
+ const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero);
+ const int16x4_t v_round_no_scale = vld1_s16(round_ptr);
+ const int16x4_t v_round_log_scale =
+ vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
+ const int16x4_t v_round =
+ vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale);
+ int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
+ int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
+ int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
+ uint16x4_t v_mask_lo, v_mask_hi;
+ int16x8_t v_eobmax = vdupq_n_s16(-1);
+
+ // DC and first 3 AC
+ v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32,
+ v_dequant_s32, v_round_s32, log_scale);
+
+  // Overwrite the DC constants with the AC constants.
+ v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+ v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+ v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+
+ // 4 more AC
+ v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32, log_scale);
+
+ // Find the max lane eob for the first 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+
+ count -= 8;
+ do {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32,
+ v_dequant_s32, v_round_s32, log_scale);
+ v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32, log_scale);
+ // Find the max lane eob for 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+ count -= 8;
+ } while (count);
+
+ *eob_ptr = get_max_eob(v_eobmax);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c
new file mode 100644
index 0000000000..d13cc65ae0
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_k_means_neon.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static int32x4_t k_means_multiply_add_neon(const int16x8_t a) {
+ const int32x4_t l = vmull_s16(vget_low_s16(a), vget_low_s16(a));
+ const int32x4_t h = vmull_s16(vget_high_s16(a), vget_high_s16(a));
+#if AOM_ARCH_AARCH64
+ return vpaddq_s32(l, h);
+#else
+ const int32x2_t dl = vpadd_s32(vget_low_s32(l), vget_high_s32(l));
+ const int32x2_t dh = vpadd_s32(vget_low_s32(h), vget_high_s32(h));
+ return vcombine_s32(dl, dh);
+#endif
+}
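+
+// For dim-2 palette data the input lanes hold interleaved (x, y) pairs, so
+// squaring each lane and pairwise-adding adjacent results collapses the
+// eight int16 lanes into four per-point squared distances dx*dx + dy*dy.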
+
+void av1_calc_indices_dim1_neon(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *total_dist, int n,
+ int k) {
+ int64x2_t sum = vdupq_n_s64(0);
+ int16x8_t cents[PALETTE_MAX_SIZE];
+ for (int j = 0; j < k; ++j) {
+ cents[j] = vdupq_n_s16(centroids[j]);
+ }
+
+ for (int i = 0; i < n; i += 8) {
+ const int16x8_t in = vld1q_s16(data);
+ uint16x8_t ind = vdupq_n_u16(0);
+ // Compute the distance to the first centroid.
+ int16x8_t dist_min = vabdq_s16(in, cents[0]);
+
+ for (int j = 1; j < k; ++j) {
+ // Compute the distance to the centroid.
+ const int16x8_t dist = vabdq_s16(in, cents[j]);
+ // Compare to the minimal one.
+ const uint16x8_t cmp = vcgtq_s16(dist_min, dist);
+ dist_min = vminq_s16(dist_min, dist);
+ const uint16x8_t ind1 = vdupq_n_u16(j);
+ ind = vbslq_u16(cmp, ind1, ind);
+ }
+ if (total_dist) {
+ // Square, convert to 32 bit and add together.
+ const int32x4_t l =
+ vmull_s16(vget_low_s16(dist_min), vget_low_s16(dist_min));
+ const int32x4_t sum32_tmp =
+ vmlal_s16(l, vget_high_s16(dist_min), vget_high_s16(dist_min));
+ // Pairwise sum, convert to 64 bit and add to sum.
+ sum = vpadalq_s32(sum, sum32_tmp);
+ }
+ vst1_u8(indices, vmovn_u16(ind));
+ indices += 8;
+ data += 8;
+ }
+ if (total_dist) {
+ *total_dist = horizontal_add_s64x2(sum);
+ }
+}
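+
+// Scalar reference for the assignment loop above (illustrative sketch,
+// omitting the optional total_dist accumulation):
+//
+//   for (int i = 0; i < n; ++i) {
+//     int best = 0;
+//     for (int j = 1; j < k; ++j)
+//       if (abs(data[i] - centroids[j]) < abs(data[i] - centroids[best]))
+//         best = j;
+//     indices[i] = best;
+//   }
+//
+// The strict vcgtq_s16 comparison means ties keep the lower centroid index,
+// matching the strict '<' above.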
+
+void av1_calc_indices_dim2_neon(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *total_dist, int n,
+ int k) {
+ int64x2_t sum = vdupq_n_s64(0);
+ uint32x4_t ind[2];
+ int16x8_t cents[PALETTE_MAX_SIZE];
+ for (int j = 0; j < k; ++j) {
+ const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1];
+ const int16_t cxcy[8] = { cx, cy, cx, cy, cx, cy, cx, cy };
+ cents[j] = vld1q_s16(cxcy);
+ }
+
+ for (int i = 0; i < n; i += 8) {
+ for (int l = 0; l < 2; ++l) {
+ const int16x8_t in = vld1q_s16(data);
+ ind[l] = vdupq_n_u32(0);
+ // Compute the distance to the first centroid.
+ int16x8_t d1 = vsubq_s16(in, cents[0]);
+ int32x4_t dist_min = k_means_multiply_add_neon(d1);
+
+ for (int j = 1; j < k; ++j) {
+ // Compute the distance to the centroid.
+ d1 = vsubq_s16(in, cents[j]);
+ const int32x4_t dist = k_means_multiply_add_neon(d1);
+ // Compare to the minimal one.
+ const uint32x4_t cmp = vcgtq_s32(dist_min, dist);
+ dist_min = vminq_s32(dist_min, dist);
+ const uint32x4_t ind1 = vdupq_n_u32(j);
+ ind[l] = vbslq_u32(cmp, ind1, ind[l]);
+ }
+ if (total_dist) {
+ // Pairwise sum, convert to 64 bit and add to sum.
+ sum = vpadalq_s32(sum, dist_min);
+ }
+ data += 8;
+ }
+ // Cast to 8 bit and store.
+ vst1_u8(indices,
+ vmovn_u16(vcombine_u16(vmovn_u32(ind[0]), vmovn_u32(ind[1]))));
+ indices += 8;
+ }
+ if (total_dist) {
+ *total_dist = horizontal_add_s64x2(sum);
+ }
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c b/third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
new file mode 100644
index 0000000000..18cd0ce4c0
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_temporal_denoiser.h"
+
+// Compute the sum of all pixel differences of this MB.
+static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
+#if AOM_ARCH_AARCH64
+ return vaddlvq_s8(v_sum_diff_total);
+#else
+ const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
+ const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+ const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210);
+ const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210),
+ vget_low_s64(fedcba98_76543210));
+ const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0);
+ return sum_diff;
+#endif
+}
+
+// Denoise a 16x1 vector.
+static INLINE int8x16_t denoiser_16x1_neon(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold,
+ const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment,
+ const uint8x16_t v_delta_level_1_and_2,
+ const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) {
+ const uint8x16_t v_sig = vld1q_u8(sig);
+ const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+ /* Calculate absolute difference and sign masks. */
+ const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+
+  /* Figure out which level the absolute difference puts us in. */
+ const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff);
+ const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff);
+ const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff);
+
+ /* Calculate absolute adjustments for level 1, 2 and 3. */
+ const uint8x16_t v_level2_adjustment =
+ vandq_u8(v_level2_mask, v_delta_level_1_and_2);
+ const uint8x16_t v_level3_adjustment =
+ vandq_u8(v_level3_mask, v_delta_level_2_and_3);
+ const uint8x16_t v_level1and2_adjustment =
+ vaddq_u8(v_level1_adjustment, v_level2_adjustment);
+ const uint8x16_t v_level1and2and3_adjustment =
+ vaddq_u8(v_level1and2_adjustment, v_level3_adjustment);
+
+  /* Select the adjustment magnitude: the raw absolute difference when in
+   * level 0, otherwise the combined adjustment for levels 1, 2 and 3.
+   */
+ const uint8x16_t v_abs_adjustment =
+ vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff);
+
+ /* Calculate positive and negative adjustments. Apply them to the signal
+ * and accumulate them. Adjustments are less than eight and the maximum
+ * sum of them (7 * 16) can fit in a signed char.
+ */
+ const uint8x16_t v_pos_adjustment =
+ vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+ const uint8x16_t v_neg_adjustment =
+ vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+ uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
+ v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
+
+ /* Store results. */
+ vst1q_u8(running_avg_y, v_running_avg_y);
+
+  /* Sum all the accumulators to obtain the total of all pixel differences
+   * for this macroblock.
+   */
+ {
+ const int8x16_t v_sum_diff =
+ vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
+ vreinterpretq_s8_u8(v_neg_adjustment));
+ v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
+ }
+ return v_sum_diff_total;
+}
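+
+// Per-pixel behaviour of the kernel above, in scalar form (illustrative
+// sketch; t = 4 + shift_inc is the level 1 threshold and a is the level 1
+// adjustment, 4 + shift_inc for low motion and 3 otherwise):
+//
+//   d   = mc_running_avg_y[i] - sig[i];
+//   adj = |d| <  t  ? |d|        // level 0: follow the difference exactly
+//       : |d| <  8  ? a          // level 1
+//       : |d| < 16  ? a + 1      // level 2
+//       :             a + 3;     // level 3
+//   running_avg_y[i] = saturate_u8(sig[i] + (d > 0 ? adj : -adj));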
+
+static INLINE int8x16_t denoiser_adjust_16x1_neon(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const uint8x16_t k_delta, int8x16_t v_sum_diff_total) {
+ uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
+ const uint8x16_t v_sig = vld1q_u8(sig);
+ const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+ /* Calculate absolute difference and sign masks. */
+ const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+ // Clamp absolute difference to delta to get the adjustment.
+ const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta));
+
+ const uint8x16_t v_pos_adjustment =
+ vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+ const uint8x16_t v_neg_adjustment =
+ vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+ v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
+ v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);
+
+ /* Store results. */
+ vst1q_u8(running_avg_y, v_running_avg_y);
+
+ {
+ const int8x16_t v_sum_diff =
+ vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
+ vreinterpretq_s8_u8(v_pos_adjustment));
+ v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
+ }
+ return v_sum_diff_total;
+}
+
+// Denoise 8x8 and 8x16 blocks.
+static int av1_denoiser_8xN_neon(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride, uint8_t *running_avg_y,
+ int avg_y_stride, int increase_denoising,
+ BLOCK_SIZE bs, int motion_magnitude,
+ int width) {
+ int sum_diff_thresh, r, sum_diff = 0;
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+
+ const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+ const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+ const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+ const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc);
+ const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+ const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+
+ const int b_height = block_size_high[bs] >> 1;
+
+ int8x16_t v_sum_diff_total = vdupq_n_s8(0);
+
+ for (r = 0; r < b_height; ++r) {
+ memcpy(sig_buffer[r], sig, width);
+ memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+ memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+ memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
+ width);
+ memcpy(running_buffer[r], running_avg_y, width);
+ memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+ v_sum_diff_total = denoiser_16x1_neon(
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r],
+ v_level1_threshold, v_level2_threshold, v_level3_threshold,
+ v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3,
+ v_sum_diff_total);
+ {
+ const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
+ const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer);
+ const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer);
+ vst1_u8(running_avg_y, v_running_buffer_low);
+ vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
+ }
+ // Update pointers for next iteration.
+ sig += (sig_stride << 1);
+ mc_running_avg_y += (mc_avg_y_stride << 1);
+ running_avg_y += (avg_y_stride << 1);
+ }
+
+ {
+ sum_diff = horizontal_add_s8x16(v_sum_diff_total);
+ sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., applying no denoising),
+      // check whether we can still apply some (weaker) temporal filtering
+      // to this block, which would otherwise not be denoised at all. The
+      // simplest option is an additional adjustment to running_avg_y that
+      // brings it closer to sig. The adjustment is capped by a maximum
+      // delta and chosen such that in most cases the resulting sum_diff
+      // falls within the acceptable range given by sum_diff_thresh.
+
+ // The delta is set by the excess of absolute pixel diff over the
+ // threshold.
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const uint8x16_t k_delta = vmovq_n_u8(delta);
+ running_avg_y -= avg_y_stride * (b_height << 1);
+ for (r = 0; r < b_height; ++r) {
+ v_sum_diff_total = denoiser_adjust_16x1_neon(
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta,
+ v_sum_diff_total);
+ {
+ const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
+ const uint8x8_t v_running_buffer_high =
+ vget_high_u8(v_running_buffer);
+ const uint8x8_t v_running_buffer_low =
+ vget_low_u8(v_running_buffer);
+ vst1_u8(running_avg_y, v_running_buffer_low);
+ vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
+ }
+ // Update pointers for next iteration.
+ running_avg_y += (avg_y_stride << 1);
+ }
+ sum_diff = horizontal_add_s8x16(v_sum_diff_total);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+
+ return FILTER_BLOCK;
+}
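+
+// Worked example of the fallback delta above (illustrative numbers): for an
+// 8x16 block, num_pels_log2_lookup[bs] is 7, so with abs(sum_diff) = 300 and
+// sum_diff_thresh = 200 the delta is ((300 - 200) >> 7) + 1 = 1, i.e. the
+// second pass nudges running_avg_y back toward sig by at most one per pixel.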
+
+// Denoise 16x16 to 128x128 blocks.
+static int av1_denoiser_NxM_neon(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride, uint8_t *running_avg_y,
+ int avg_y_stride, int increase_denoising,
+ BLOCK_SIZE bs, int motion_magnitude) {
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+ const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+ const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+ const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
+ const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+ const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+
+ const int b_width = block_size_wide[bs];
+ const int b_height = block_size_high[bs];
+ const int b_width_shift4 = b_width >> 4;
+
+ int8x16_t v_sum_diff_total[8][8];
+ int r, c, sum_diff = 0;
+
+ for (r = 0; r < 8; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ v_sum_diff_total[c][r] = vdupq_n_s8(0);
+ }
+ }
+
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon(
+ sig, mc_running_avg_y, running_avg_y, v_level1_threshold,
+ v_level2_threshold, v_level3_threshold, v_level1_adjustment,
+ v_delta_level_1_and_2, v_delta_level_2_and_3,
+ v_sum_diff_total[c][r >> 4]);
+
+ // Update pointers for next iteration.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
+ }
+ }
+
+ // Update pointers for next iteration.
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+
+ {
+ const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const uint8x16_t k_delta = vdupq_n_u8(delta);
+ sig -= sig_stride * b_height;
+ mc_running_avg_y -= mc_avg_y_stride * b_height;
+ running_avg_y -= avg_y_stride * b_height;
+ sum_diff = 0;
+
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ v_sum_diff_total[c][r >> 4] =
+ denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y,
+ k_delta, v_sum_diff_total[c][r >> 4]);
+
+ // Update pointers for next iteration.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
+ }
+ }
+
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+ return FILTER_BLOCK;
+}
+
+int av1_denoiser_filter_neon(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_avg, int mc_avg_stride,
+ uint8_t *avg, int avg_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude) {
+  // Check block sizes in rough order of frequency so the common cases
+  // terminate the chain of comparisons early.
+ if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
+ bs == BLOCK_128X128 || bs == BLOCK_128X64 || bs == BLOCK_64X128 ||
+ bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
+ bs == BLOCK_32X64 || bs == BLOCK_64X32) {
+ return av1_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
+ avg_stride, increase_denoising, bs,
+ motion_magnitude);
+ } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+ return av1_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
+ avg_stride, increase_denoising, bs,
+ motion_magnitude, 8);
+ }
+ return COPY_BLOCK;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/cnn_neon.c b/third_party/aom/av1/encoder/arm/neon/cnn_neon.c
new file mode 100644
index 0000000000..8e686260d0
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/cnn_neon.c
@@ -0,0 +1,1144 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/cnn.h"
+#include "av1/encoder/partition_cnn_weights.h"
+
+// The CNN weights used in av1_cnn_convolve_no_maxpool_padding_valid are
+// declared (av1_intra_mode_cnn_partition_cnn_layer_[01234]_kernel) in
+// partition_cnn_weights.h. However, to enable linear memory access, the
+// weight tables are rearranged here.
+static const float weights_layer_1[] = {
+ 0.228403f, 0.031690f, -0.251710f, -0.046230f, 0.413294f, -0.236732f,
+ -0.038291f, 0.210766f, 0.427196f, -0.384319f, -0.439463f, 0.366015f,
+ 0.112263f, -0.144168f, -0.075017f, 0.119629f, 0.325200f, -0.678246f,
+ -0.370826f, -0.341362f, -0.503392f, 0.400884f, 0.465214f, -0.360847f,
+ 0.187100f, -0.190757f, -0.131906f, 0.121492f, -0.303556f, -0.007658f,
+ 0.380077f, -0.066394f, -0.016043f, -1.490730f, -0.120682f, 0.132062f,
+ 0.086185f, -0.042766f, -0.087069f, 0.029426f, 0.309583f, -0.029985f,
+ -0.297429f, -0.018139f, -0.688828f, 0.756607f, 0.706410f, -0.696826f,
+ -0.087793f, -0.023304f, -0.012332f, -0.018043f, -0.410268f, 0.352143f,
+ 0.391284f, -0.363178f, -0.295034f, 0.160246f, -0.149446f, 0.260145f,
+ -0.252249f, 0.190826f, 0.251206f, -0.270796f, -0.979219f, 0.884880f,
+ 0.962057f, -0.847601f, -0.011053f, 0.118765f, -0.028428f, -0.020138f,
+ 0.400274f, -0.382845f, -0.462766f, 0.390654f, 0.361223f, -0.320068f,
+ -0.372084f, 0.313196f, 0.241933f, -0.416614f, -0.008722f, -0.255078f,
+ 0.078730f, -0.381935f, -0.204577f, 0.159768f, 0.071853f, -0.126294f,
+ -0.036186f, -0.007900f, 0.380071f, -0.298882f, 0.387941f, -0.267350f,
+ -0.586802f, 0.477785f, -0.000013f, 0.197296f, -0.079154f, -0.005811f,
+ -0.044300f, -0.021192f, -0.020879f, -0.005265f, 0.082277f, -0.139132f,
+ -0.239237f, 0.440234f, -0.542342f, 0.378360f, -0.070974f, 0.272702f,
+ -0.278939f, -0.044948f, -0.134197f, -0.007172f, -0.353628f, -0.128091f,
+ 0.357458f, -0.037614f, -0.144983f, 0.220623f, -0.003394f, -0.070166f,
+ 0.200370f, -0.166037f, 0.224448f, -0.012990f, -0.098853f, 0.008613f,
+ -0.017669f, 0.070641f, 0.174530f, -0.119822f, -0.065096f, 0.118487f,
+ -0.024764f, -0.050466f, 0.066631f, -0.075896f, -0.062363f, 0.212604f,
+ -0.377322f, 0.306306f, -0.399733f, 0.238624f, 0.233571f, -0.344080f,
+ 0.462491f, -0.565210f, -0.035074f, -0.010459f, 0.084382f, 0.052294f,
+ 0.065714f, 0.013716f, 0.135036f, 0.000588f, 0.181079f, -0.566344f,
+ 0.395561f, -0.398509f, 0.450017f, -1.462710f, 1.138280f, -0.447774f,
+ 0.247936f, -0.417067f, 0.165997f, -0.458632f, -0.018527f, 0.308461f,
+ 0.541266f, 0.162257f, 0.601786f, -1.275840f, -0.373404f, -0.589747f,
+ 0.026539f, -0.219327f, 0.142972f, -0.018496f, 0.075204f, -0.775190f,
+ 0.237307f, -0.348252f, 0.117792f, -0.094332f, 0.363101f, -0.065025f,
+ 0.816662f, 0.590110f, 0.752202f, -0.308599f, 0.258337f, -0.842085f,
+ 0.695788f, -0.205615f, 0.093930f, -0.392536f, 0.463093f, -0.432456f,
+ 0.041660f, -0.827264f, 0.309128f, -0.354658f, 0.451957f, -1.406640f,
+ 0.773192f, -0.892943f, 0.134856f, -0.467808f, 0.306003f, -0.226560f,
+ 0.086865f, -0.104102f, 0.148098f, -0.082658f, 0.316655f, -1.028310f,
+ 0.741566f, -0.345326f, 0.052379f, -0.275613f, 0.191765f, -0.162391f,
+ 0.000976f, 0.093061f, 0.068649f, 0.033582f, 0.239727f, -0.647769f,
+ 0.218493f, -0.397120f, 0.268229f, -0.303424f, 0.185393f, -0.314189f,
+ 0.101728f, -0.163083f, -0.084989f, 0.136783f, -0.264346f, 0.465914f,
+ 0.220395f, -0.252968f, -0.326661f, 0.271483f, 0.374717f, -0.311570f,
+ -0.082119f, 0.020870f, 0.091975f, -0.030582f, -0.487148f, 0.198912f,
+ 0.024554f, -0.749363f, -0.102267f, 0.097787f, 0.141459f, -0.110706f,
+ 0.079467f, -0.082570f, -0.347567f, 0.341043f, -0.137871f, 0.112319f,
+ 0.064733f, -0.082869f, 0.269999f, -0.408184f, -0.183443f, 0.180608f,
+ 0.223345f, -0.357376f, -0.244593f, 0.355348f, -0.072701f, -0.034311f,
+ 0.096544f, 0.016407f, 0.417550f, -0.367772f, -0.484535f, 0.405977f,
+ 0.314243f, -0.099622f, -0.192218f, -0.012780f, 0.434551f, -0.399047f,
+ -0.531499f, 0.484513f, -0.691352f, 0.872823f, 1.207720f, -1.377490f,
+ 0.006872f, -0.041453f, 0.007845f, 0.007463f, 0.467299f, -0.476372f,
+ -0.452606f, 0.452357f, 0.447332f, -0.365632f, -0.332435f, 0.300284f,
+ -0.290504f, 0.255410f, 0.310921f, -0.293717f, -0.616299f, 0.594207f,
+ 0.461347f, -0.449439f, 0.278455f, 0.285085f, -1.201340f, -0.016463f,
+ 0.549095f, 0.610375f, -4.608530f, -1.727390f, 0.150404f, -0.012846f,
+ -0.481148f, -0.182257f, 0.918796f, 0.213872f, 1.050410f, 0.681526f,
+ -0.458777f, -0.710395f, -2.347200f, -0.277197f, 0.213294f, 0.337551f,
+ -0.177710f, -0.152136f, 0.167666f, 0.308403f, -1.248500f, -0.565367f,
+ 0.122054f, 0.087874f, -0.476556f, -0.083548f, -0.358734f, -0.073131f,
+ -0.146320f, -2.241960f, 0.697639f, 0.545581f, -1.889700f, -0.267725f,
+ 0.433045f, 0.298224f, -0.338508f, 0.250226f, 0.405675f, 0.447201f,
+ -1.184690f, -0.473447f, 0.307403f, 0.711236f, -3.191560f, -1.663980f,
+ 0.165201f, 0.101360f, -0.624451f, -0.173269f, 0.089795f, 0.227478f,
+ -0.136664f, 0.007907f, 0.131079f, 0.605374f, -2.991620f, -1.723790f,
+ 0.082428f, 0.006781f, -0.348732f, -0.019271f, -0.032040f, -0.067078f,
+ -0.437166f, -0.144472f, 0.069844f, 0.194625f, -0.162284f, -0.374656f,
+ 0.056472f, -0.236524f, -0.114241f, -0.029161f, -0.222078f, -0.053435f,
+ -0.313938f, -0.555472f, 1.037550f, 0.689968f, 0.575694f, 0.065826f,
+ -0.659979f, -0.881351f, -0.626417f, -0.953975f, -0.576106f, -0.258708f,
+ 0.263004f, -0.229847f, 0.463835f, 1.390960f, -2.614480f, -1.272910f,
+ 0.065780f, -0.058603f, 0.015612f, 0.104703f, 0.198028f, 0.262792f,
+ 0.253616f, -0.079126f, -0.587381f, -0.739021f, -0.822676f, -0.795512f,
+ 0.193644f, 0.234643f, -0.034407f, 0.421478f, -0.572610f, -0.290714f,
+ -0.257803f, -0.644835f, -0.536938f, -0.375899f, -0.651077f, -0.522576f,
+ 0.562564f, 0.834616f, 0.513893f, 0.649689f, 0.356530f, 0.400716f,
+ 0.300606f, 0.290505f, 0.584608f, 0.671574f, 0.564584f, 0.419870f,
+ 0.062061f, 0.018263f, 0.009831f, 0.084103f, -0.128281f, -0.018818f,
+ -0.187244f, 0.067210f, 0.437147f, 0.442029f, 0.444939f, 0.226661f,
+ 0.541609f, 0.444280f, 0.302795f, 0.633026f, -0.180374f, 0.265197f,
+ 0.210404f, -0.118916f, -0.294013f, -0.692627f, -0.402347f, -0.356287f,
+ 0.387578f, 0.385496f, 0.789542f, 0.690396f, -0.203542f, -0.688546f,
+ 0.045319f, -0.448747f, -0.157148f, 0.152581f, 0.022360f, 0.058358f,
+ 0.593007f, 1.131860f, 0.289006f, 1.015560f, 0.144942f, -0.411577f,
+ 0.264794f, -0.085791f, 0.156996f, 0.200340f, 0.169264f, 0.267615f,
+ -0.361015f, -0.601842f, -0.442217f, -0.781086f, 0.112938f, 0.385305f,
+ 0.482454f, 0.470268f, 1.193390f, 0.589642f, 0.127638f, -0.640946f,
+ 0.540310f, 0.741498f, 0.686937f, 0.435879f, 0.534523f, 0.693119f,
+ 0.817577f, 0.783109f, 0.021681f, -0.004973f, 0.201236f, -0.086311f,
+ 0.028628f, 0.227871f, 0.462751f, 0.126832f, -0.389997f, -0.553965f,
+ -0.343953f, -0.448517f, 0.053129f, -0.115083f, 0.018138f, -0.067131f,
+ -0.293468f, -0.220700f, 0.074348f, -0.273153f, 0.263637f, 0.122049f,
+ 0.153025f, 0.076292f, 0.142320f, 0.286734f, 0.100542f, 0.308660f,
+ -0.759591f, -0.750938f, -0.788799f, -0.853076f, -0.588019f, -0.990063f,
+ -0.692327f, -0.722904f, 0.084736f, 0.151068f, 0.159606f, 0.147715f,
+ 1.610180f, 1.950330f, 1.765670f, 2.265110f, 0.008262f, 0.185584f,
+ 0.039337f, 0.164721f, 0.479446f, 0.314083f, 0.043969f, 0.291320f,
+ 0.003400f, -0.551190f, 0.060158f, -0.147591f, 0.089117f, 0.042994f,
+ 0.042802f, 0.127392f, -0.066172f, 0.078370f, 0.051408f, 0.014004f,
+ 0.086726f, 0.133334f, -0.046733f, 0.155100f, -0.118223f, -0.100778f,
+ -0.225245f, -0.460397f, 0.892644f, 1.003770f, 0.405155f, 0.517477f,
+ 0.184585f, 0.279090f, -0.036477f, 0.198703f, 0.027139f, -0.055728f,
+ -0.022396f, -0.147319f, 2.275540f, 2.014990f, 2.296800f, 2.081730f,
+ -0.088713f, 0.105729f, -0.027871f, -0.095047f, 0.012429f, 0.014244f,
+ -0.014755f, -0.003017f, 1.332700f, 1.300040f, 1.464250f, 1.305030f,
+ 0.032568f, 0.118042f, 0.079632f, -0.089405f, 0.163905f, 0.146608f,
+ 0.026502f, 0.065307f, -0.056909f, -0.065052f, 0.069851f, -0.082958f,
+ 0.023419f, -0.026293f, 0.037616f, -0.048096f, -0.073701f, -0.208295f,
+ -0.782095f, 0.000523f, 0.374131f, 0.420946f, 0.466151f, 0.349651f,
+ -0.679275f, -0.745827f, -0.379918f, -0.900107f, 0.044070f, -0.347536f,
+ -1.224390f, 0.740113f, -0.779966f, 0.510920f, -0.968597f, -0.095630f,
+ 0.120805f, 0.676803f, -0.164827f, 0.172996f, -0.106720f, 0.197527f,
+ 0.337561f, 0.571094f, -0.279090f, -0.396697f, -0.253083f, -0.690170f,
+ -0.363291f, 0.516921f, 0.489391f, -0.920628f, 0.497572f, 0.483864f,
+ -0.125696f, -0.338123f, -0.041517f, -0.534630f, -0.388465f, -0.784554f,
+ 0.215227f, 0.055088f, 0.179638f, 0.086997f, 0.569313f, 0.572926f,
+ 0.137182f, -0.045485f, 0.118087f, 0.210383f, 0.212664f, 0.482443f,
+ 0.151921f, 0.307947f, -0.084656f, -0.386206f, 0.542277f, -0.207005f,
+ 0.073792f, -1.013240f, 0.303581f, 0.270527f, 0.265985f, 0.332702f,
+ 0.848609f, 0.686757f, 0.767212f, 0.316901f, -0.502460f, -0.567092f,
+ -0.484799f, -0.173350f, -0.426863f, 0.222375f, -0.200267f, -0.523758f,
+ 0.265180f, -0.175648f, -0.229754f, 0.148740f, 0.402515f, 0.028243f,
+ -0.366109f, 0.157232f, -0.131564f, 0.055136f, 0.211046f, -0.115542f,
+ 0.322379f, -0.137768f, -0.247832f, 0.070394f, 0.058530f, -0.295023f,
+ -0.196022f, -0.109097f, 0.261285f, -0.273585f, -0.240632f, 0.258326f,
+ -0.077364f, 0.071405f, -0.014766f, -0.008751f, -0.203622f, 0.177818f,
+ 0.116726f, -0.116735f, -0.723616f, -0.700154f, 0.145082f, -0.184949f,
+ -0.287076f, 0.150405f, 0.258075f, -0.157764f, -0.120909f, 0.105459f,
+ 0.113288f, -0.092963f, 0.328183f, -0.300115f, -0.361289f, 0.319792f,
+ -0.048875f, 0.135673f, 0.132539f, -0.162481f, 0.002109f, 0.065048f,
+ -0.135969f, 0.061558f, 1.510670f, -0.884925f, -0.827022f, 0.190311f,
+ -0.060088f, -0.033362f, 0.013354f, 0.002847f, 0.353479f, -0.462538f,
+ -0.319638f, 0.424484f, 0.199540f, -0.073843f, -0.140621f, 0.072133f,
+ -0.098662f, 0.070613f, 0.031150f, -0.021869f, -0.511253f, 0.503412f,
+ 0.565963f, -0.576146f, -1.081700f, 0.047670f, 0.266687f, 0.524804f,
+ -2.361150f, 0.147823f, 0.594717f, 0.956842f, -1.048220f, 0.127083f,
+ 0.079581f, 0.065419f, 0.176783f, 0.653953f, 0.260967f, 0.537892f,
+ -1.207580f, 0.245983f, -0.727067f, 0.071755f, -0.343025f, -0.173435f,
+ 0.215289f, 0.268578f, -1.158560f, 0.039263f, -0.132888f, 0.217132f,
+ -0.622195f, -0.071256f, 0.317333f, 0.157614f, -1.588250f, 0.316432f,
+ -0.736720f, -0.041698f, -1.959280f, 0.083451f, 0.570584f, 0.327620f,
+ -1.262200f, -0.026738f, 0.231198f, 0.326861f, -1.644200f, -0.143833f,
+ -0.079495f, 0.493026f, -2.488090f, -0.034046f, 0.165884f, 1.074260f,
+ -1.076980f, 0.248198f, -0.017987f, 0.421900f, -0.105860f, 0.076710f,
+ 0.002072f, 0.070264f, -1.734750f, 0.227145f, 0.209220f, 0.851459f,
+ -0.142369f, 0.066502f, 0.027816f, 0.044321f, -0.186591f, -0.100340f,
+ 0.115580f, 0.192252f, -0.892114f, 0.209531f, -0.308243f, 0.367968f,
+ -0.721770f, 0.220224f, -0.062744f, 0.133754f, 0.040416f, 0.190428f,
+ -0.035428f, 0.162974f, 0.116427f, 0.669393f, 0.278891f, 0.856676f,
+ 1.060390f, 0.936983f, 0.863355f, 0.990560f, -0.147111f, -0.217883f,
+ 0.355794f, -0.186530f, -0.275614f, -0.095719f, 0.167346f, 0.359078f,
+ -0.079223f, -0.581596f, -0.213134f, -0.431123f, -0.516443f, -0.388628f,
+ -0.643821f, -0.202345f, 0.426230f, 0.516923f, 0.548131f, 0.555973f,
+ 0.022286f, 0.361170f, 0.980065f, 0.648400f, -0.056813f, -0.100310f,
+ -0.439481f, -0.166454f, 0.412449f, 0.509400f, 0.316208f, 0.470293f,
+ -0.827838f, -1.078380f, -1.047040f, -1.074560f, 0.274555f, -0.316736f,
+ 0.128818f, 0.228566f, -0.520967f, -0.731674f, -0.687887f, -0.536388f,
+ -0.031187f, 0.041404f, 0.047821f, 0.064397f, 0.054230f, 0.105059f,
+ -0.178671f, 0.176847f, -0.394797f, -0.260255f, -0.333734f, -0.162345f,
+ -0.444650f, -0.928438f, -0.705840f, -0.833162f, 0.306737f, 0.429699f,
+ 0.417298f, 0.478469f, 0.420903f, 0.676871f, 0.429677f, 0.616921f,
+ -0.805199f, -0.643391f, -0.304100f, 0.797599f, -0.172157f, 0.429085f,
+ -0.750676f, 0.149227f, -0.207898f, -0.022534f, -0.341448f, -0.247976f,
+ 0.095325f, -0.561120f, 0.599694f, -0.025236f, 0.292346f, -0.312001f,
+ 0.517478f, 0.301457f, -0.106415f, 0.226263f, -0.184163f, -0.114419f,
+ -0.322702f, 0.172541f, 0.445573f, 0.157213f, 0.670704f, 0.102174f,
+ -0.234667f, -0.293311f, 0.769852f, 0.038028f, -0.036741f, -0.228060f,
+ -0.253335f, 0.424054f, -0.597980f, 0.221007f, -0.114741f, -0.411557f,
+ -0.592201f, 0.442684f, 0.115491f, -0.106896f, -0.028110f, 0.354751f,
+ -0.248375f, 0.242570f, -0.155856f, 0.280528f, -0.198742f, 0.588725f,
+ 0.371065f, 0.078197f, 0.114706f, -0.448021f, 0.065255f, 0.133741f,
+ -0.227522f, -0.047339f, -0.052849f, 0.309480f, 0.597185f, 0.209182f,
+ 0.226108f, -0.601036f, -0.431672f, -0.172601f, -0.000174f, 0.194292f,
+ -0.133937f, 0.130676f, 0.059372f, 0.091381f, 0.098751f, -0.150996f,
+ 0.170514f, -0.085494f, 0.336576f, 0.484004f, 0.033862f, 0.277473f,
+ -0.231482f, -0.328385f, -0.332739f, -0.626957f, 0.510167f, 0.575861f,
+ 0.421494f, 0.482540f, -0.636377f, -0.864661f, -0.694180f, -0.420014f,
+ -0.132781f, 0.017599f, 0.003538f, 0.486934f, 0.133878f, -0.094622f,
+ 0.016132f, 0.010117f, 0.156680f, -0.022201f, -0.014621f, 0.228445f,
+ 0.190826f, 0.171580f, 0.579923f, 0.245428f, 0.322713f, 0.480101f,
+ 0.406320f, 0.412229f, 0.002334f, -0.022349f, 0.074571f, -0.043828f,
+ 0.290453f, 0.451749f, 0.530376f, 0.271879f, 0.095144f, 0.169450f,
+ 0.049482f, 0.114605f, -0.635634f, -0.700768f, -0.558538f, -0.537625f,
+ 0.190255f, -0.308237f, -0.053703f, 0.212489f, 0.056520f, -0.040019f,
+ 0.089822f, -0.014155f, -0.376004f, -0.448752f, -0.526717f, -0.571440f,
+ 0.116482f, 0.162321f, 0.147895f, 0.280527f, 0.159037f, -0.095958f,
+ 0.007931f, -0.086630f, 0.285625f, 0.514914f, 0.208908f, 0.519251f,
+ 0.309368f, 0.379777f, 0.350565f, 0.487487f, -0.541494f, -0.421836f,
+ -0.390001f, -0.500696f, -0.905736f, -0.150439f, -0.942304f, -0.566771f,
+ 0.484233f, 0.767417f, 0.410477f, 0.670196f, 0.070210f, 0.488836f,
+ 0.372805f, 0.197631f, 0.337892f, 0.524423f, 0.777219f, -0.260955f,
+ -0.112981f, -0.060088f, -0.200250f, -0.195671f, 0.007584f, 0.252096f,
+ 0.235511f, 0.366612f, -0.304979f, -0.211068f, -0.420683f, -0.085370f,
+ 0.085762f, -0.097549f, -0.802509f, -0.468079f, -0.192787f, -0.069670f,
+ -0.235162f, -0.077772f, -0.441671f, -0.348479f, -0.431434f, -0.108256f,
+ -0.133779f, 0.017032f, 0.001964f, -0.120647f, -0.187663f, -0.194985f,
+ -0.231742f, -0.175288f, -0.162639f, 0.245110f, 0.049951f, 0.104229f,
+ -0.159634f, -0.076545f, -0.022496f, -0.036532f, -0.147028f, -0.034215f,
+ 0.028213f, -0.059669f, -0.078259f, 0.062993f, -0.124066f, -0.137362f,
+ -0.129977f, -0.010532f, -0.049090f, -0.189401f, 0.495471f, 0.615778f,
+ 0.451437f, 0.803526f, 0.523532f, 0.841339f, 0.699528f, 0.745129f,
+ 0.246264f, -0.198290f, -0.283620f, 0.189917f, -0.018306f, -0.419097f,
+ 0.280363f, -0.098085f, 0.138972f, -0.140867f, -0.117025f, 0.098585f,
+ 0.130979f, 0.268133f, -0.161731f, -0.176629f, -0.357677f, -0.126379f,
+ 0.553128f, -0.126821f, -0.001511f, -0.010081f, -0.031162f, 0.079203f,
+ -0.157731f, 0.072865f, 0.535830f, -0.529989f, -0.570075f, 0.295795f,
+ 0.595613f, -0.449278f, -0.669756f, 0.941452f, 0.356897f, -0.723720f,
+ -0.115203f, -0.134479f, 0.133048f, 0.109860f, -0.024250f, -0.049732f,
+ 0.020098f, 0.048356f, -0.048293f, 0.108754f, 0.062548f, -0.238315f,
+ 0.182700f, 0.312011f, -0.244377f, -0.118012f, 0.012276f, 0.006089f,
+ 0.098068f, -0.079280f, -0.423987f, -0.411931f, -0.027425f, 0.870280f,
+ 0.022825f, -0.024481f, -0.036320f, -0.111189f, 0.364539f, -0.244896f,
+ -0.373060f, 0.266345f, -0.141778f, 0.277549f, 0.059834f, -0.178242f,
+ -0.686222f, 0.594535f, 0.354546f, -0.272516f, 1.060730f, -1.059810f,
+ -0.948126f, 0.993267f, 0.116597f, -0.227574f, -0.436144f, -0.333309f,
+ -0.575746f, -0.828102f, 0.284561f, 0.351668f, -0.080164f, -0.762518f,
+ -0.511108f, -0.212855f, 0.293892f, -0.548664f, 0.072057f, 0.006748f,
+ 1.485110f, 0.124687f, 0.727211f, 1.557560f, -0.064383f, -0.022242f,
+ 0.002921f, -0.151505f, 0.270926f, 0.173632f, -0.640644f, 0.422410f,
+ -0.240699f, -0.361980f, -0.279864f, -0.055165f, -1.084140f, 0.231705f,
+ 0.366172f, -0.347698f, -0.097565f, -0.747227f, -0.243033f, 0.941545f,
+ -0.207460f, -0.353913f, 0.104303f, -0.403151f, 0.203177f, 0.335893f,
+ -0.229033f, 0.029096f, -0.409634f, -0.179599f, -0.442397f, 0.649114f,
+ 0.460774f, 0.170906f, -0.043857f, 0.402066f, -0.226896f, -0.199624f,
+ 0.016650f, 0.207894f, 0.056954f, 0.220329f, 0.374060f, 0.130361f,
+ -0.303960f, -0.078863f, 0.195410f, 0.729438f, 0.246818f, 0.287730f,
+ 0.484876f, 0.111488f, -0.168647f, -0.087878f, -0.070089f, -0.341329f,
+ -0.330280f, 0.259943f, -0.364205f, 0.256555f, -0.756804f, -0.086915f,
+ 0.777351f, 0.006136f, 0.110348f, 0.248743f, 0.209326f, -0.362741f,
+ -0.184416f, 0.422446f, 0.565193f, 0.310072f, -0.011212f, -0.765226f,
+ 0.039466f, 0.301288f, 0.172907f, -1.539450f, 0.606202f, 0.477469f,
+ 0.045894f, -0.222180f, -0.013192f, -0.064077f, -0.241551f, 0.192914f,
+ 0.028004f, -0.540538f, 0.437440f, 0.179087f, -0.753204f, -0.001374f,
+ 1.185930f, -0.151182f, 1.238580f, -1.389900f, 0.277954f, 0.422208f,
+ 0.041553f, -0.542284f, 0.139019f, -0.148580f, -0.130705f, 0.361830f,
+ 0.322953f, -0.092371f, 0.120180f, -0.355299f, -0.028057f, 0.128114f,
+ 0.250947f, -0.349926f, -0.684633f, 0.246175f, 0.186731f, -0.676313f,
+ 0.060535f, 0.333371f, -0.021172f, -0.421266f, -0.079650f, 0.031359f,
+ -0.303658f, -0.298286f, 0.119016f, 0.655585f, 0.200175f, -0.887182f,
+ -0.197539f, -0.318883f, -0.130250f, 0.522487f, -0.092616f, 0.405930f,
+ -0.281678f, 0.089728f, 0.081814f, -0.781745f, 0.348878f, 0.082274f,
+ -0.914136f, 1.098810f, 0.855321f, -1.078170f, -0.268018f, 0.246440f,
+ 0.238347f, -0.027228f, 0.074111f, -0.061197f, -0.063582f, 0.089462f,
+ -0.040347f, 0.117082f, 0.122772f, -0.162816f, -0.148668f, -0.342856f,
+ -0.495604f, -1.453630f, -0.045273f, -0.030463f, 0.043766f, 0.047978f,
+ 0.016910f, -0.009700f, 0.006288f, -0.042556f, 0.632896f, -0.845744f,
+ -0.516844f, 0.709439f, 0.486166f, -1.203050f, -0.978381f, 0.631876f,
+ 0.000705f, 0.123858f, -0.001187f, -0.172312f, -0.422668f, 0.241838f,
+ 0.437400f, -0.268186f, -0.513259f, 0.450209f, 0.542629f, -0.453810f,
+ -0.207119f, 0.072598f, 0.085066f, -0.018986f, -0.149512f, 0.149521f,
+ 0.182105f, -0.227200f, -0.363240f, 0.172670f, -0.502932f, 0.689256f,
+ 0.093760f, -0.090207f, -0.066803f, 0.056759f, -0.002243f, -0.050662f,
+ -0.059324f, 0.152943f, -0.701150f, 0.712540f, 0.660349f, -0.654970f,
+ 0.351772f, -0.303383f, -0.311177f, 0.247653f, 0.013035f, 0.034648f,
+ -0.137832f, 0.041197f, 0.410265f, 0.345129f, 0.653338f, 0.047050f,
+ 0.140399f, 0.018613f, -0.012431f, -0.113632f, -0.029928f, 0.051564f,
+ -0.031349f, 0.151944f, -0.160340f, 0.326798f, -0.458067f, 0.636235f,
+ 0.243184f, 0.514072f, 2.414450f, 1.421980f, -0.001474f, -0.141389f,
+ -0.104817f, -0.141882f, -0.026395f, 0.053014f, 0.143885f, -0.207774f,
+ -0.563846f, -0.242514f, -0.436574f, -0.456796f, -0.520646f, 0.282550f,
+ -0.684924f, 0.061105f, -0.315884f, -0.392624f, 0.009805f, -0.256597f,
+ -0.146732f, 0.331039f, 0.362342f, 0.270851f, 0.067679f, -0.071331f,
+ -0.222423f, 0.081286f, -0.208192f, -0.193816f, -0.008201f, -0.309340f,
+ 0.167556f, 0.106071f, 0.172254f, -0.163790f, -0.142205f, -0.043182f,
+ 0.096145f, 0.145037f, -0.066015f, -0.073194f, 0.132237f, -0.088522f,
+ -0.044292f, -0.487128f, 0.033389f, -0.573548f, 0.185449f, 0.273593f,
+ 0.147503f, 0.457049f, -0.021539f, 0.090786f, 0.009147f, 0.000899f,
+ 0.018088f, 0.115791f, -0.079165f, 0.139388f,
+};
+
+static const float weights_layer_2[] = {
+ 0.153048f, 0.112901f, 0.136781f, 0.154580f, 0.091610f, 0.045165f,
+ 0.088490f, 0.116991f, -0.463766f, -0.596567f, -0.567008f, -0.630565f,
+ 0.141874f, 0.095726f, 0.175427f, 0.145027f, -0.969824f, -1.018190f,
+ -1.073300f, -1.041130f, -0.070545f, -0.123600f, -0.114967f, -0.169453f,
+ -0.267458f, -0.147730f, -0.161419f, -0.164894f, -0.117508f, -0.204389f,
+ -0.122695f, -0.163107f, -0.003903f, -0.030470f, -0.037433f, -0.059568f,
+ 0.138243f, 0.091019f, 0.160372f, 0.141650f, -0.544565f, -0.620004f,
+ -0.504503f, -0.429979f, -0.099491f, -0.096384f, -0.155265f, -0.188536f,
+ 0.084923f, 0.038345f, 0.066706f, 0.122083f, 0.267087f, 0.184419f,
+ 0.261478f, 0.255746f, -0.245894f, -0.114980f, -0.193880f, -0.227785f,
+ 0.087536f, 0.095712f, 0.106105f, 0.099353f, -0.059473f, -0.173247f,
+ -0.202386f, -0.076010f, 0.125928f, 0.100793f, 0.119638f, 0.129623f,
+ 0.136593f, 0.102984f, 0.156550f, 0.140558f, 0.122524f, 0.051596f,
+ 0.084164f, 0.123630f, 0.072542f, 0.096063f, 0.083236f, 0.087630f,
+ 0.025900f, 0.023738f, 0.036385f, 0.053077f, -0.029501f, 0.010544f,
+ -0.010026f, -0.051268f, 0.086302f, 0.109909f, 0.101385f, 0.127513f,
+ -0.031869f, 0.005340f, -0.056267f, -0.032955f, 0.032748f, 0.023162f,
+ 0.092118f, -0.001780f, -0.123612f, -0.183433f, -0.202377f, -0.317516f,
+ 0.129052f, 0.208112f, 0.145582f, 0.175502f, 0.018476f, 0.036349f,
+ 0.072417f, 0.061194f, 0.086985f, 0.117086f, 0.072465f, 0.129068f,
+ 0.020182f, 0.052114f, 0.017878f, 0.010478f, -0.001381f, -0.034644f,
+ 0.025135f, -0.037748f, 0.004973f, 0.024778f, 0.041816f, 0.032111f,
+ 0.080268f, 0.124998f, 0.105719f, 0.177047f, -0.072114f, -0.011864f,
+ -0.076846f, -0.089840f, 0.069993f, 0.089362f, 0.088035f, 0.120621f,
+ 0.065916f, 0.100946f, -0.006784f, -0.007751f, 0.122039f, 0.126482f,
+ 0.078629f, 0.140299f, 0.074034f, 0.092464f, 0.089798f, 0.108968f,
+ 0.075729f, 0.057128f, 0.013570f, 0.021195f, 0.068901f, 0.054022f,
+ 0.029781f, 0.031404f, -0.209998f, -0.208731f, -0.198310f, -0.212454f,
+ -0.579168f, -0.490190f, -0.607567f, -0.520541f, 0.083863f, 0.056612f,
+ 0.030366f, 0.061790f, -0.004874f, -0.057203f, -0.060429f, -0.049145f,
+ 0.080086f, 0.138602f, 0.223796f, 0.133279f, -0.495954f, -0.612093f,
+ -0.545393f, -0.562310f, 0.070672f, 0.037702f, 0.139013f, 0.080192f,
+ -0.111387f, -0.048165f, 0.074359f, -0.042125f, 0.113633f, 0.106579f,
+ 0.042633f, 0.102734f, -0.068220f, 0.128423f, -0.181821f, -0.013260f,
+ -0.108563f, -0.138667f, -0.109304f, -0.131909f, -0.168667f, -0.126870f,
+ -0.132533f, -0.167096f, -0.184741f, -0.140890f, -0.125361f, -0.150632f,
+ 0.309013f, 0.364376f, 0.361102f, 0.271566f, 0.116552f, 0.091160f,
+ 0.096846f, 0.095954f, 0.046972f, 0.080489f, 0.028766f, -0.012223f,
+ 0.071379f, 0.041535f, -0.000668f, 0.033698f, -0.013493f, -0.027535f,
+ -0.025804f, -0.012267f, -0.097465f, -0.099232f, -0.208863f, -0.225201f,
+ -0.475608f, 0.077358f, -0.002872f, 0.163890f, -0.420298f, 0.072114f,
+ 0.121601f, -0.016727f, 0.573853f, -0.080196f, 0.193053f, 0.053012f,
+ -0.454179f, 0.058563f, 0.067265f, 0.141154f, 0.412541f, 0.086933f,
+ 0.030407f, -0.030413f, 0.478757f, -0.097731f, 0.277072f, -0.086393f,
+ 0.552604f, -0.334201f, 0.091765f, -0.270262f, -1.395060f, 0.271837f,
+ -0.005335f, 0.240499f, 0.175442f, -0.326329f, -0.019353f, -0.270338f,
+ -0.459273f, 0.096183f, 0.153046f, 0.135818f, 0.759028f, -0.177673f,
+ -0.099966f, 0.103363f, 0.697289f, -0.234184f, -0.048706f, -0.116099f,
+ -0.282575f, 0.025655f, -0.184759f, 0.040658f, -0.558267f, 0.214087f,
+ -0.095620f, 0.200522f, 0.278996f, 0.031959f, 0.122936f, -0.209196f,
+ -0.308217f, 0.092917f, 0.113269f, 0.136274f, -0.037046f, 0.017263f,
+ -0.194183f, 0.089133f, -0.161244f, 0.042799f, 0.030557f, 0.153545f,
+ -0.355048f, 0.070928f, -0.152852f, 0.102875f, -0.193649f, 0.007916f,
+ -0.062952f, 0.050602f, 0.073671f, 0.143045f, -5.978970f, -7.013850f,
+ 0.058713f, 0.076116f, 0.026445f, -0.056599f, -0.005966f, 0.032234f,
+ 0.006753f, -0.024528f, 0.120308f, 0.179939f, -6.624630f, -7.638680f,
+ 0.026359f, 0.020758f, 0.194274f, 0.051489f, -0.008491f, -0.028248f,
+ -0.061328f, -0.134423f, -0.103951f, -0.110877f, 0.042263f, 0.127016f,
+ 0.012473f, -0.008595f, 0.031357f, 0.087476f, -0.084022f, -0.015590f,
+ -0.313546f, 0.120072f, 0.123880f, 0.162148f, -6.596560f, -7.358830f,
+ 0.004797f, -0.003415f, 0.048455f, 0.026737f, -0.103702f, 0.034416f,
+ -0.003475f, -0.236827f, 0.005378f, 0.048413f, 0.054612f, -0.079359f,
+ 0.043707f, 0.001085f, 0.023380f, 0.007785f, 0.025938f, -0.052856f,
+ -0.033421f, 0.022643f, 0.034161f, 0.127681f, -5.019490f, -5.233580f,
+ -0.128630f, 0.087741f, -0.239834f, -0.377876f, 0.128082f, 0.142730f,
+ -0.086819f, -0.350927f, 0.089849f, 0.155776f, -6.155120f, -5.721720f,
+ 0.056110f, 0.008761f, 0.045579f, 0.016762f, -0.134076f, -0.101551f,
+ -0.096058f, -0.117146f, 0.003527f, -0.056942f, -0.005578f, 0.071287f,
+ 0.023776f, -0.028003f, -0.075390f, -0.191160f, -0.089672f, -0.104372f,
+ -0.104750f, -0.080813f, -0.249824f, -0.124479f, -0.243593f, -0.244284f,
+ -0.554911f, -0.549095f, -0.564693f, -0.475107f, -0.121771f, -0.143441f,
+ -0.171170f, -0.120920f, 0.109831f, 0.079708f, 0.327295f, 0.308907f,
+ -0.178785f, -0.428316f, -0.418882f, -0.366750f, -0.139296f, -0.129645f,
+ -0.081237f, -0.101533f, -0.006256f, -0.146756f, -0.322110f, -0.338865f,
+ -0.306085f, -0.319592f, -0.454803f, -0.363560f, -0.018557f, 0.006605f,
+ -0.131198f, -0.077708f, 0.138160f, 0.119611f, 0.271098f, 0.232168f,
+ 0.027812f, 0.035390f, -0.202503f, -0.091172f, -0.142020f, -0.159929f,
+ -0.106404f, -0.107433f, -0.381743f, -0.353222f, -0.484159f, -0.469926f,
+ -0.234659f, -0.315674f, -0.178327f, -0.213485f, -0.096207f, -0.190944f,
+ -0.118917f, -0.161288f, 0.015996f, 0.060737f, 0.051390f, 0.060876f,
+ 0.229289f, 0.282418f, 0.250945f, 0.197273f, 0.045131f, -0.008305f,
+ 0.072024f, 0.044547f, -0.050010f, 0.055504f, 0.001343f, -0.014445f,
+ 0.254909f, 0.309091f, 0.228249f, 0.274843f, 0.089778f, -0.046581f,
+ 0.072714f, 0.126814f, -0.048931f, -0.045743f, -0.151333f, -0.004490f,
+ 0.179966f, 0.058150f, -0.178622f, -0.088159f, -0.074416f, -0.005821f,
+ -0.011799f, -0.002225f, -0.069361f, -0.098937f, -0.081575f, -0.034796f,
+ 0.253792f, 0.301039f, 0.219163f, 0.256027f, 0.058007f, -0.041431f,
+ 0.040674f, 0.009019f, -0.099670f, -0.099077f, -0.039437f, 0.017946f,
+ 0.060717f, 0.045796f, 0.109664f, 0.032138f, -0.071094f, 0.023697f,
+ 0.011335f, -0.030465f, 0.068677f, 0.039345f, -0.045078f, 0.084037f,
+ 0.135517f, 0.190417f, 0.175578f, 0.155286f, -0.044505f, 0.010826f,
+ 0.006717f, -0.134715f, 0.068022f, 0.110095f, 0.079966f, 0.034481f,
+ 0.185804f, 0.188273f, 0.227283f, 0.135935f, 0.033447f, 0.031571f,
+ -0.014766f, -0.024565f, 0.021792f, 0.017675f, -0.001333f, -0.040069f,
+ -0.049384f, -0.045256f, -0.014013f, -0.000107f, -0.096928f, -0.111495f,
+ -0.051225f, -0.060449f, 0.071446f, 0.017294f, -0.004822f, 0.006932f,
+ 0.020884f, 0.089425f, 0.061097f, -0.038708f, -0.184029f, -0.089541f,
+ -0.158035f, -0.214607f, -0.377947f, -0.318586f, -0.336977f, -0.323908f,
+ 0.181612f, 0.140018f, 0.233524f, 0.193366f, -0.254507f, -0.271902f,
+ -0.197144f, -0.119539f, 0.042162f, 0.000320f, 0.014708f, -0.014228f,
+ -0.081119f, -0.089326f, 0.001763f, 0.081009f, -0.142618f, -0.160650f,
+ -0.214597f, -0.202143f, -0.053495f, -0.012819f, -0.071468f, -0.010883f,
+ 0.072570f, 0.071507f, 0.091045f, 0.083155f, -0.271237f, -0.289211f,
+ -0.272345f, -0.299411f, 0.031697f, -0.029795f, -0.030045f, -0.013604f,
+ -0.106843f, -0.045212f, -0.122459f, -0.096936f, 0.059793f, 0.006157f,
+ 0.028092f, 0.040589f, -0.014560f, -0.008975f, -0.051404f, -0.014309f,
+ -0.016883f, 0.018332f, 0.040114f, 0.050348f, 0.044921f, -0.002445f,
+ -0.112396f, 0.014395f, 0.115160f, 0.145350f, -0.166814f, -0.121449f,
+ 0.155573f, -0.099446f, -0.161661f, 0.187251f, 0.004711f, 0.024318f,
+ -0.060871f, -0.028311f, -0.098274f, 0.322030f, -0.069242f, -0.153173f,
+ -0.227428f, -0.293965f, 0.228491f, 0.111413f, -1.354720f, -0.344235f,
+ 0.866715f, 0.872344f, 0.078789f, -0.384865f, 0.162388f, 0.109018f,
+ -0.191549f, -0.002638f, 0.305053f, 0.087337f, 0.066506f, -0.055810f,
+ -0.010984f, -0.056160f, -0.114617f, -0.058478f, 0.022059f, -0.124368f,
+ -0.130989f, 0.369432f, -0.248898f, -0.003955f, -0.021578f, 0.115991f,
+ -0.114163f, -0.065232f, 0.339857f, -0.225997f, 0.006282f, -0.125395f,
+ 0.235082f, -0.347785f, 0.662321f, -0.529182f, 0.153297f, -0.001326f,
+ -0.026725f, -0.024677f, -0.088065f, -0.116127f, 0.080896f, 0.212542f,
+ 0.208421f, 0.032047f, -0.211395f, 0.074997f, 0.096659f, 0.096423f,
+ -0.078643f, 0.106556f, -0.123860f, 0.075609f, 0.066008f, -0.097275f,
+ -1.000020f, -0.780154f, -0.856922f, -0.964007f, 0.083135f, -0.018922f,
+ -0.266214f, -0.151480f, 0.051538f, 0.017802f, 0.066774f, -0.021341f,
+ -0.869494f, -0.935252f, -0.895836f, -0.853871f, -0.160490f, 0.085850f,
+ -0.029670f, -0.056675f, 0.159989f, 0.166872f, 0.129970f, 0.194377f,
+ 0.153294f, 0.199593f, 0.037692f, 0.103391f, 0.029335f, -0.085324f,
+ -0.079326f, -0.077216f, 0.501561f, 0.366168f, 0.330196f, 0.296432f,
+ -0.977282f, -0.844295f, -1.014870f, -1.098990f, -0.099858f, -0.129552f,
+ 0.090051f, -0.013378f, 0.081330f, 0.194911f, 0.286501f, 0.177363f,
+ -0.148250f, -0.111700f, -0.243081f, -0.102918f, 0.161069f, -0.012655f,
+ -0.071722f, -0.020329f, -0.077828f, -0.041716f, 0.109247f, 0.062229f,
+ -0.759722f, -0.742756f, -0.563713f, -0.631187f, 0.005911f, 0.268154f,
+ -0.263769f, 0.087149f, -0.163623f, -0.359600f, -0.464577f, -0.369352f,
+ -0.515784f, -0.475822f, -0.523485f, -0.649813f, -0.112419f, -0.029285f,
+ 0.021061f, -0.041515f, 0.149133f, -0.254428f, 0.115776f, -0.061892f,
+ 0.103675f, -0.283363f, 0.005005f, 0.022034f, -0.178454f, 0.035836f,
+ -0.113702f, -0.217823f, 0.209407f, -0.296257f, 0.187976f, -0.157370f,
+ -0.127190f, 0.251780f, 0.055633f, 0.294111f, -0.067773f, 0.467190f,
+ -0.192625f, -0.071084f, -0.445284f, 0.511090f, -0.319728f, 0.267971f,
+ 0.494929f, -0.586727f, 0.454543f, -0.520675f, -0.085900f, 0.325989f,
+ -0.131006f, -0.069501f, 0.199927f, -0.218919f, 0.170055f, -0.106538f,
+ 0.133312f, 0.127629f, -0.561625f, 0.595666f, -0.090927f, 0.363348f,
+ -0.249246f, 0.063068f, -0.016458f, -0.291045f, -0.040509f, 0.017866f,
+ 0.304871f, -0.459214f, 0.214390f, -0.238740f, -0.456541f, 0.545848f,
+ -0.218026f, 0.202475f, 0.128490f, -0.036417f, 0.173885f, -0.049385f,
+ 0.235514f, -0.132587f, -0.015066f, 0.164638f, 0.196873f, -0.125330f,
+ 0.216912f, -0.109398f, 0.121602f, -0.209374f, 0.164400f, -0.123049f,
+ 0.195520f, -0.212932f, -0.015180f, -0.005784f, 0.049726f, -5.822150f,
+ 0.124536f, 0.040689f, -0.018560f, -3.155020f, 0.014690f, 0.076202f,
+ -0.154008f, 1.070630f, -0.071606f, 0.051026f, 0.138285f, -5.836340f,
+ 0.162173f, 0.085890f, -0.186166f, 0.093221f, 0.019240f, -0.017053f,
+ -0.090144f, 0.236254f, -0.125344f, 0.056235f, -0.089813f, -0.252281f,
+ -0.127406f, -0.155088f, 0.009972f, -0.066449f, 0.044222f, 0.025943f,
+ -0.164921f, 0.165463f, -0.001132f, -0.038386f, 0.115194f, -5.757100f,
+ 0.163386f, 0.061226f, 0.024626f, 0.132750f, 0.107279f, -0.001622f,
+ -0.107860f, -0.356009f, -0.138935f, -0.145173f, -0.061198f, -0.646138f,
+ 0.034279f, 0.078187f, 0.108138f, -0.490444f, 0.074719f, 0.034984f,
+ -0.109303f, 0.741785f, -0.066939f, 0.015558f, 0.114229f, -4.001080f,
+ 0.130772f, 0.044675f, -0.165162f, -0.274810f, -0.042987f, -0.048579f,
+ 0.156603f, -1.288370f, 0.076198f, 0.035065f, 0.032043f, -5.002520f,
+ 0.086900f, -0.010886f, 0.030850f, -0.782259f, 0.056211f, -0.097759f,
+ 0.118988f, 0.106638f, 0.091419f, 0.079920f, 0.062325f, 0.097116f,
+ 0.126035f, 0.122530f, -0.278299f, -0.083314f, -0.300563f, -0.197946f,
+ 0.081664f, 0.089925f, 0.074754f, 0.074628f, 0.102338f, 0.088845f,
+ 0.105841f, 0.102381f, 0.003087f, 0.061599f, 0.098326f, 0.040119f,
+ -0.005298f, -0.028834f, 0.059938f, -0.013668f, -0.585882f, -0.631436f,
+ -0.742673f, -0.736666f, 0.025071f, 0.066851f, 0.075046f, 0.091360f,
+ 0.099045f, 0.098261f, 0.106413f, 0.099487f, -0.016742f, -0.097334f,
+ -0.086152f, -0.212444f, -0.028043f, -0.007362f, 0.003914f, -0.055864f,
+ 0.034756f, 0.081361f, 0.080183f, 0.061319f, 0.193396f, 0.173716f,
+ 0.207765f, 0.231701f, -0.074565f, -0.073257f, -0.086470f, -0.083114f,
+ 0.081489f, 0.078477f, 0.033452f, 0.058835f, -0.069665f, -0.031691f,
+ -0.111255f, -0.167754f, 0.184179f, 0.174673f, 0.160288f, 0.190893f,
+ 0.110930f, 0.103495f, 0.098408f, 0.102918f, 0.053764f, 0.089994f,
+ 0.140308f, 0.124867f, 0.074176f, 0.117460f, -0.160775f, -0.144132f,
+ -0.099373f, -0.035913f, 0.081237f, 0.062247f, -0.166421f, 0.062125f,
+ 0.276479f, 0.060955f, 0.066627f, 0.455347f, 0.219953f, 0.109912f,
+ 0.273931f, 0.233153f, 0.102236f, 0.447606f, -0.352243f, 0.499236f,
+ -0.931206f, 0.248595f, 0.254047f, 0.061542f, 0.268804f, 0.309517f,
+ -0.084414f, -0.245828f, -0.144882f, -0.296579f, -0.091628f, -0.142202f,
+ -0.541764f, -0.407470f, 0.053481f, 0.238955f, 0.150188f, -0.060598f,
+ 0.196118f, -0.215617f, -0.086238f, -0.263420f, 0.206877f, 0.241788f,
+ -0.122544f, -0.448790f, 0.286917f, 0.112063f, -0.268408f, -0.041770f,
+ 0.089161f, 0.355811f, -0.078245f, -0.148490f, -0.407301f, -1.296870f,
+ -0.633421f, 0.124253f, 0.275402f, 0.223048f, 0.077016f, 0.160766f,
+ 0.115374f, 0.061053f, -0.231872f, -0.515052f, -0.278331f, -0.235912f,
+ -0.416372f, -0.284106f, -0.055942f, 0.110698f, -0.428288f, -0.298137f,
+ -0.018101f, 0.102677f, -0.019639f, 0.013479f, 0.038549f, 0.048682f,
+ 0.128684f, 0.116416f, 0.044852f, 0.008133f, 0.061597f, 0.083582f,
+ 0.014953f, 0.063716f, -0.155318f, -0.061732f, 0.084855f, 0.129505f,
+ 0.068249f, 0.193775f, -0.088631f, -0.446398f, -0.075710f, -0.061327f,
+ 0.278715f, 0.540366f, 0.618715f, 0.538374f, -0.037843f, 0.062370f,
+ -0.033184f, 0.119901f, -0.008641f, -0.064789f, 0.087498f, 0.043486f,
+ 0.247085f, 0.419992f, 0.299935f, 0.234276f, 0.089283f, 0.070357f,
+ 0.068888f, 0.134311f, 0.109823f, 0.072431f, 0.081676f, 0.091366f,
+ -1.707980f, -2.213110f, -2.149930f, -1.556870f, 0.226598f, 0.191675f,
+ 0.192207f, 0.159566f, -0.070194f, -0.136070f, -0.015172f, -0.204272f,
+ -0.162191f, -0.043313f, -0.158007f, -0.227210f, 0.040398f, 0.043014f,
+ 0.039439f, -0.035439f, 0.245558f, 0.439691f, 0.219659f, 0.138210f,
+ -0.048129f, 0.004954f, -0.102860f, -0.185376f, 0.035548f, 0.006821f,
+ 0.079199f, 0.032901f, 0.039218f, 0.068113f, 0.023075f, -0.037582f,
+ 0.225181f, 0.164562f, 0.106718f, 0.032684f, 0.013402f, 0.018797f,
+ 0.076606f, 0.046512f, -0.070024f, 0.099921f, -0.051231f, 0.074167f,
+ 0.173313f, 0.220212f, 0.142665f, 0.069809f, -0.195130f, -0.007912f,
+ -0.006764f, -0.063687f, 0.306374f, 0.402035f, 0.273759f, 0.449469f,
+ 0.114597f, 0.210745f, 0.355326f, 0.271307f, -0.109943f, -0.171912f,
+ -0.070726f, -0.128932f, 0.138770f, 0.164971f, 0.308516f, 0.332536f,
+ 0.081537f, 0.096939f, 0.054136f, 0.052226f, 0.109489f, 0.010223f,
+ 0.168072f, -0.106279f, 0.525568f, 0.704816f, 0.588942f, 0.473398f,
+ 0.149497f, 0.120835f, 0.080049f, 0.151340f, -0.182038f, -0.191091f,
+ -0.196505f, -0.198309f, -0.801819f, -1.441620f, -1.107780f, -1.025650f,
+ 0.035750f, 0.018049f, -0.029033f, -0.067255f, 0.192049f, 0.009664f,
+ -0.043741f, 0.051557f, 0.082815f, 0.069547f, -0.073379f, 0.010584f,
+ 0.192128f, 0.208586f, 0.141904f, 0.100763f, 0.046183f, 0.044776f,
+ -0.033611f, -0.005812f, 0.012966f, 0.030301f, 0.100665f, 0.103641f,
+ -0.294776f, -0.361573f, -0.420156f, -0.388743f, 0.239287f, 0.191975f,
+ 0.089644f, 0.117591f, 0.069563f, 0.021480f, 0.100287f, 0.174159f,
+ -0.013571f, 0.090960f, 0.010232f, -0.034760f, -0.077205f, 0.060632f,
+ -0.145527f, -0.391110f, -0.143052f, -0.236448f, -0.103902f, -0.188463f,
+ 0.071311f, -0.080171f, 0.021987f, 0.041767f, -0.419487f, -0.515479f,
+ -0.205470f, -0.732132f, 0.150901f, 0.107202f, 0.156307f, 0.143672f,
+ 0.474682f, 0.178137f, 0.150063f, 0.414515f, 0.559891f, 0.697019f,
+ 0.541231f, 0.505310f, -0.478101f, -0.444267f, -0.586539f, -0.445996f,
+ -0.451873f, -0.530085f, -0.447980f, -0.364955f, 0.372435f, 0.318894f,
+ 0.351211f, 0.193961f, 0.212295f, 0.212842f, 0.220003f, 0.243743f,
+ -0.388628f, -0.789620f, -0.536618f, -0.430691f, 0.247004f, 0.266489f,
+ 0.261033f, 0.263692f, 0.050089f, 0.048958f, 0.065207f, 0.120180f,
+ -0.526230f, -0.481969f, -0.422411f, -0.272292f, 0.155593f, 0.229614f,
+ 0.139579f, 0.171805f, -0.251924f, -0.302067f, -0.126157f, -0.346650f,
+ -1.195450f, -1.281100f, -0.987911f, -1.478440f, 0.285667f, 0.284802f,
+ 0.301887f, 0.259556f, -0.194127f, -0.090440f, -0.257959f, -0.259572f,
+ -0.012273f, -0.049993f, -0.099431f, 0.012506f, 0.081526f, 0.166279f,
+ 0.042594f, 0.185121f, 0.148830f, 0.073161f, 0.201728f, 0.125747f,
+ -0.295065f, -0.187585f, -0.333066f, -0.312291f, 0.253458f, 0.321585f,
+ 0.178844f, 0.219944f, -0.763475f, -0.943374f, -0.816825f, -0.709901f,
+ -0.166132f, 0.129186f, 0.015405f, -0.065623f, -0.246006f, -0.340385f,
+ -0.118155f, -0.384905f, -0.233883f, -0.400666f, -0.228597f, -0.228428f,
+ -0.559083f, -0.377784f, -0.541458f, -0.542870f, 0.067400f, 0.122987f,
+ 0.180901f, 0.186004f, -0.482910f, -0.424823f, -0.477831f, -0.394719f,
+ 0.091558f, 0.049248f, 0.049370f, 0.160429f, 0.133641f, 0.096625f,
+ 0.104429f, 0.100782f, -0.238252f, -0.221459f, -0.196974f, -0.250393f,
+ -3.071750f, -2.418450f, -0.861410f, -1.051580f, 0.071263f, 0.118014f,
+ -0.028430f, -0.072073f, -0.074463f, 0.034168f, 0.044089f, -0.091109f,
+ -3.153840f, -2.945850f, -1.977360f, -1.498850f, -0.083429f, 0.131835f,
+ -0.063865f, -0.065785f, -0.069346f, -0.015520f, -0.119551f, 0.044881f,
+ -0.105280f, 0.127516f, 0.005255f, -0.142777f, 0.061055f, -0.117250f,
+ 0.020454f, 0.157879f, -0.213812f, -0.151783f, 0.028583f, 0.137759f,
+ -3.248250f, -3.005940f, -1.510540f, -1.475390f, 0.081874f, -0.171465f,
+ -0.135690f, -0.001989f, -0.227574f, -0.132799f, -0.359742f, -0.137197f,
+ 0.066324f, 0.039194f, -0.050857f, 0.095166f, 0.044475f, 0.011221f,
+ 0.054904f, 0.061414f, -0.039189f, 0.123751f, -0.017171f, -0.008494f,
+ -2.598220f, -2.832670f, -1.622030f, -1.201990f, 0.154313f, -0.021436f,
+ 0.042190f, 0.143947f, -0.090623f, 0.086853f, 0.143137f, 0.099821f,
+ -1.732820f, -1.429730f, -0.775125f, -0.648036f, 0.082176f, 0.079448f,
+ -0.040575f, 0.024511f, -0.064105f, -0.117122f, -0.190323f, -0.182589f,
+ -0.076430f, -0.095615f, -0.112513f, -0.101581f, 0.143037f, 0.148180f,
+ 0.430958f, 0.359225f, 0.001403f, -0.080541f, -0.295001f, -0.156706f,
+ 0.426623f, 0.475597f, 0.455210f, 0.454352f, 0.074365f, 0.099440f,
+ 0.066348f, -0.007078f, 0.008335f, -0.097116f, -0.133687f, -0.110535f,
+ 0.204145f, 0.281478f, 0.078886f, 0.112857f, -0.103620f, -0.068247f,
+ 0.191147f, 0.227593f, -0.011816f, -0.058755f, -0.149477f, -0.101828f,
+ 0.079878f, 0.304949f, 0.557555f, 0.305288f, -0.150955f, -0.118610f,
+ 0.052073f, 0.064707f, -0.121728f, -0.151132f, -0.193987f, -0.175046f,
+ 0.043655f, 0.105270f, -0.120715f, -0.040976f, 0.047776f, -0.004443f,
+ 0.149606f, 0.111240f, -0.047502f, -0.064146f, -0.151858f, -0.151872f,
+ -0.160207f, -0.113846f, -0.081585f, -0.006708f, -0.203760f, -0.068597f,
+ -0.179979f, -0.127779f, -0.062460f, -0.064513f, -0.121479f, -0.111122f,
+ -0.212384f, -0.229157f, -0.283428f, -0.184891f,
+};
+
+static const float weights_layer_3[] = {
+ -0.039388f, 0.033048f, -0.113003f, -0.011642f, 0.170478f, 0.145713f,
+ 0.040189f, -0.280129f, -0.049050f, -0.043788f, -0.157425f, 0.323829f,
+ -0.250725f, -0.166349f, 0.101650f, -0.049690f, 0.205606f, 0.281131f,
+ 0.623204f, 0.993452f, -0.015115f, -0.138995f, 0.009473f, 0.157673f,
+ -0.024687f, -0.067214f, 0.125566f, -0.317619f, 0.057002f, 0.031202f,
+ -0.018167f, 0.068542f, 0.011609f, -0.020233f, -0.000428f, -0.035956f,
+ -0.843274f, -0.800587f, -0.214917f, -0.221250f, 0.031255f, -0.077330f,
+ -0.074902f, -0.063979f, -0.055562f, 0.679495f, 0.146609f, 1.315330f,
+ -0.118399f, -0.034539f, -0.050377f, 0.172867f, -0.204607f, -0.034930f,
+ 0.176014f, 0.089747f, -0.003889f, 0.044980f, 0.002386f, -0.141723f,
+ -0.035828f, -0.204701f, 0.099813f, 0.123580f, 0.209851f, -0.110989f,
+ -0.043655f, -0.461118f, -0.139664f, 0.026855f, -0.081714f, 0.207623f,
+ 0.089942f, 0.253082f, 0.680568f, 0.811360f, -0.090528f, -0.116818f,
+ -0.432361f, -0.075588f, -0.269924f, -0.276810f, -0.289192f, -0.282570f,
+ 0.245566f, 0.267216f, 0.238622f, 0.286528f, -0.157605f, -0.200401f,
+ -0.138924f, -0.185006f, 0.215203f, 0.203316f, 0.209532f, 0.293135f,
+ 0.928046f, 0.733323f, -0.094120f, 0.036918f, -0.126643f, -0.083371f,
+ -0.147530f, -0.153195f, 0.097097f, 0.101852f, 0.109160f, 0.105129f,
+ -0.051869f, -0.064359f, -0.073469f, -0.059591f, 0.102431f, 0.109444f,
+ 0.113614f, 0.105617f, 0.383311f, 0.325783f, 0.393234f, 0.382508f,
+ 0.194720f, 0.189672f, 0.217477f, 0.177786f, 0.326461f, 0.114789f,
+ 0.317061f, 0.048291f, -0.061143f, -0.134641f, -0.067895f, -0.108446f,
+ 0.082592f, 0.029918f, -0.006580f, 0.015533f, -0.053583f, -0.055540f,
+ -0.063395f, -0.023157f, -0.064955f, -0.073981f, -0.115452f, -0.086626f,
+ -0.036616f, 0.008454f, 0.012029f, -0.008039f, -0.207395f, -0.216419f,
+ -0.205363f, -0.249099f, 0.343308f, 0.413215f, -0.009918f, -0.109978f,
+ -0.059711f, -0.045089f, -0.029130f, -0.038483f, -0.070323f, -0.099409f,
+ -0.008849f, -0.063527f, 0.175963f, 0.185335f, 0.149151f, 0.199997f,
+ -0.027516f, -0.039812f, -0.027760f, -0.047910f, -0.007337f, 0.071065f,
+ 0.086225f, 0.125539f, 0.151390f, 0.215488f, 0.203450f, 0.045380f,
+ 0.095761f, 0.107809f, 0.103918f, 0.122383f, 0.116287f, 0.135455f,
+ 0.115446f, 0.155673f, -0.044648f, -0.027455f, -0.015473f, -0.026657f,
+ 0.089852f, 0.077459f, 0.077631f, 0.082507f, -0.102761f, -0.054669f,
+ -0.132223f, -0.024768f, 0.111573f, 0.060467f, 0.107883f, 0.056621f,
+ 0.219357f, -0.161153f, 0.074379f, -0.118743f, -0.169931f, -0.153995f,
+ -0.220003f, -0.200186f, 0.032318f, -0.060687f, -0.087550f, -0.038022f,
+ 0.026633f, -0.005534f, 0.029532f, 0.027081f, 0.011926f, 0.058412f,
+ 0.010631f, 0.003068f, -0.014911f, 0.063070f, 0.065271f, 0.089550f,
+ 0.012885f, 0.005320f, -0.037494f, -0.019849f, -0.009624f, -0.059090f,
+ -0.021222f, -0.088033f, -0.055261f, -0.055113f, -0.047598f, -0.055478f,
+ -0.023648f, -0.046827f, -0.036572f, -0.057655f, 0.104194f, 0.179800f,
+ 0.175751f, 0.192851f, -0.016950f, -0.073650f, -0.028592f, -0.088219f,
+ 0.011130f, 0.061825f, 0.025643f, 0.034183f, 0.095548f, 0.001457f,
+ -0.132869f, 0.032981f, -0.140178f, -0.105343f, -0.161799f, -0.161983f,
+ 0.177746f, 0.132903f, 0.135627f, 0.152489f, -0.012532f, -0.068747f,
+ -0.085849f, -0.095434f, 0.087037f, 0.139497f, 0.111899f, 0.100189f,
+ -0.024649f, -0.092003f, 0.020783f, -0.115807f, 0.092039f, 0.093943f,
+ 0.109466f, 0.049639f, -0.133727f, 0.128430f, -0.050546f, 0.190632f,
+ 0.123733f, 0.082305f, 0.114878f, 0.122572f, 0.201618f, 0.137588f,
+ 0.065582f, 0.125161f, -0.095179f, -0.120719f, -0.127126f, -0.101961f,
+ -0.118120f, -0.104833f, -0.179632f, -0.131764f, -0.138096f, -0.147861f,
+ -0.131512f, -0.153905f, -0.201816f, -0.206641f, -0.196707f, -0.160013f,
+ -0.212605f, -0.093998f, -0.186258f, -0.076137f, -0.065340f, -0.006969f,
+ -0.071383f, -0.075005f,
+};
+
+static const float weights_layer_4[] = {
+ -0.016102f, -0.022836f, 0.624049f, 0.273485f, 0.222800f, -0.290175f,
+ -0.518415f, 0.413484f, -0.264495f, 0.498083f, -0.450145f, -0.106419f,
+ 0.095103f, -0.187451f, 0.145933f, -0.371542f, -0.088871f, 0.184017f,
+ -0.429625f, -0.110882f, 0.292781f, 0.289588f, 0.185127f, 0.326017f,
+ -0.432009f, -0.342663f, -0.312206f, 0.004004f, -1.114290f, 0.028497f,
+ -0.264944f, -0.419611f, 0.046336f, 0.138232f, -0.869528f, 0.425557f,
+ -0.954838f, -0.186830f, -0.464622f, -0.757107f, -0.432686f, -0.125978f,
+ -0.402633f, -0.172266f, -0.041749f, -0.822238f, -0.118486f, 0.238617f,
+ -0.198037f, 0.146347f, 0.405257f, 0.513303f, -0.078876f, -0.300385f,
+ -0.010293f, -0.183962f, 0.155738f, 0.186797f, -0.086814f, 0.000179f,
+ 0.123467f, 0.362523f, 0.068805f, 0.371834f, 0.038122f, -0.117867f,
+ -0.120445f, -0.422322f, -0.131402f, 0.285449f, 0.038957f, 0.008844f,
+ -0.020197f, 0.187723f, 0.190433f, 0.146532f, -0.091068f, -0.270865f,
+ -0.194231f, -0.226777f, 0.013548f, 0.248351f, 0.537685f, 0.056316f,
+ -0.171540f, -0.003865f, 0.406439f, 0.126507f, 0.192780f, 0.149335f,
+ -0.149602f, 0.255202f, -0.015426f, 0.032335f, -1.791330f, -0.894602f,
+ -0.196641f, -0.282846f, -0.391100f, -0.040969f, 0.049934f, 0.056348f,
+ -0.041426f, -0.075159f, -0.658335f, -0.827270f, -0.175029f, -0.427235f,
+ 0.311201f, 0.560413f, 0.363408f, 0.374580f, -0.433531f, -0.180580f,
+ 0.142142f, 0.194768f, -0.054118f, -0.376541f, -0.366185f, -0.308782f,
+ -0.273143f, -0.074097f, 0.009000f, -0.182198f, -0.015616f, -0.003882f,
+ -0.174340f, -0.354866f, 0.527972f, 0.348355f, 0.091381f, -0.419828f,
+ -0.530529f, 0.159899f, -0.511867f, -0.104237f, -0.286079f, -0.659039f,
+ -0.266596f, -0.256557f, -0.600437f, -0.446333f, -0.229629f, 0.024931f,
+ -0.143716f, -0.415754f, -0.003760f, -0.107195f, -0.666165f, -0.697312f,
+ -0.650255f, -0.703877f, 0.243402f, 0.426710f, 0.217210f, 0.260255f,
+ 0.027416f, 0.163147f, 0.132188f, 0.142374f, 0.558627f, 0.065717f,
+ 0.382781f, -1.192240f, 0.195492f, 0.028439f, 0.278252f, -0.491806f,
+ 0.497701f, -0.448835f, -0.245079f, -0.014336f, -0.174907f, -0.409633f,
+ 0.207548f, 0.433813f, 0.459889f, 0.431728f, 0.605050f, 0.485520f,
+ 0.218548f, 0.437307f, 0.027023f, -0.204251f, 0.012100f, 0.150677f,
+ -1.097980f, 0.086866f, -1.293130f, -0.372575f, -0.876264f, -0.021818f,
+ 0.322864f, -0.231043f, -0.271608f, 0.132782f, -0.314895f, 0.396800f,
+ 0.262788f, -0.317212f, -0.666308f, 0.830742f, 0.319409f, -0.564373f,
+ -0.178656f, 0.306993f, 0.265634f, -0.332480f, -0.491514f, -0.186745f,
+ -0.063044f, -0.009321f, 0.074944f, -0.372082f, -0.029479f, 0.081548f,
+ 0.028172f, -0.233148f, -0.337938f, -0.087695f, 0.596556f, 0.559530f,
+ 0.139332f, 0.107223f, -0.190915f, 0.137401f, -0.150625f, -0.225484f,
+ -0.191344f, -0.232535f, 0.126510f, 0.296323f, -0.547901f, -0.653080f,
+ 0.358514f, 0.726289f, -0.421725f, -0.243620f, 0.236206f, 0.390823f,
+ -0.076560f, -0.282329f, -0.012460f, -0.428484f, 0.349469f, 0.394629f,
+ 0.421537f, 0.219632f, -0.117550f, -0.087894f, 0.077155f, 0.016000f,
+ -0.289137f, -0.092937f, -0.014518f, -0.027111f, 0.210329f, -0.159678f,
+ 0.013288f, -0.039268f, 0.008112f, 0.003152f, 0.030084f, -0.039859f,
+ 0.322028f, -0.407797f, 0.447087f, -0.381562f, 0.529297f, -0.520298f,
+ 0.562865f, -0.616878f, 0.689389f, 0.754262f, 0.138475f, 0.750697f,
+ -0.760157f, -0.383740f, 0.074219f, 0.556257f, 0.087827f, -0.511826f,
+ -0.305507f, -0.638214f, 0.114833f, -0.444022f, 0.526612f, -0.604984f,
+ -0.100415f, 0.037824f, -0.106264f, 0.337615f, 0.070743f, 0.031129f,
+ 0.281954f, 0.176144f, -0.032833f, -0.073902f, -0.285492f, -0.803803f,
+ -0.015589f, 0.186077f, -0.033351f, 0.517269f, -1.878800f, -1.685210f,
+ -0.416581f, 0.158476f, -0.071929f, -0.624353f, -0.122069f, -0.075065f,
+ 0.311816f, 0.506305f, 0.383896f, 0.259450f, -0.308232f, -0.094221f,
+ -0.421885f, -0.293573f,
+};
+
+static const float weights_layer_5[] = {
+ 0.131894f, 0.078431f, 0.323121f, -0.230680f, -0.684740f, 0.020895f,
+ 0.364983f, 0.121656f, 0.132448f, -0.731198f, 0.071148f, 0.739642f,
+ 0.318437f, -0.033021f, -1.037080f, 0.135335f, 0.383582f, 0.287332f,
+ 0.054042f, -0.825482f, 0.418533f, 0.305606f, 0.041549f, 0.432422f,
+ -0.826878f, -0.593536f, 0.105657f, 0.125357f, 0.408567f, -0.293338f,
+ 0.233905f, -0.039609f, 0.547727f, -0.435806f, 0.036160f, 0.220275f,
+ -0.020337f, -0.619403f, -0.455858f, 0.681455f, 0.543846f, -0.495084f,
+ 0.251496f, -0.085686f, 0.091395f, -0.476696f, 0.453628f, -0.109663f,
+ 0.383493f, -0.456563f, -0.212935f, 0.020567f, -0.719564f, -0.377813f,
+ -0.737511f, 0.765965f, 0.624309f, -0.063679f, -0.055681f, -0.475969f,
+ -0.069902f, 0.725690f, 0.641094f, 0.439922f, -0.111544f, -0.309061f,
+ 0.280091f, 0.381416f, 0.481168f, 0.483543f, -0.901267f, -0.499230f,
+ 0.043449f, -0.372395f, 0.021216f, -0.002200f, -0.524089f, -0.071485f,
+ -0.273974f, -0.462654f, 0.042369f, -0.138679f, -0.330060f, 0.021886f,
+ -0.306075f, -0.011130f, -0.260224f, -0.288435f, -0.104039f, -0.183563f,
+ 0.118990f, -0.531160f, 0.339632f, -0.028374f, 0.159084f, -0.008824f,
+ -0.791388f, 0.245242f, 0.356510f, 0.469867f, -0.396949f, -0.476146f,
+ -0.168472f, 1.068400f, 0.474629f, -0.117554f, -0.142453f, -0.306604f,
+ 0.348525f, -0.111929f, -0.435384f, 0.019952f, -0.260185f, 0.373376f,
+ 0.109729f, -0.639168f, 0.033392f, -0.082573f, -0.196018f, 0.301637f,
+ -0.124210f, -0.202515f, -1.221920f, -0.253690f, -0.144864f, 0.287753f,
+ -0.161206f, -0.213246f, 0.373968f, 0.141397f, -0.248237f, 0.283090f,
+ -0.008977f, -0.172960f, -0.234146f, -0.720014f, -0.322451f, 0.181083f,
+ 0.310659f, -0.422646f, -0.719994f, -0.354339f, 0.352739f, 0.230923f,
+ 0.427013f, -0.660316f, 0.232140f, 0.685896f, 0.660208f, 0.225748f,
+ -0.918750f, -0.650790f, -0.674525f, -0.450305f, -0.152529f, 0.498480f,
+ 0.895092f, 0.688242f, 0.669057f, 0.612669f, 0.593484f, 0.318204f,
+ -0.169294f, 0.388789f, -0.529777f, -0.219706f, -0.044916f, 0.161697f,
+ -0.145288f, 0.196153f, -0.022212f, -0.434209f, -0.208115f, -0.117745f,
+ -0.279029f, -0.009506f, 0.137474f, 0.330148f, 0.439258f, 0.345879f,
+ -0.845131f, -0.215713f, 0.094463f, 0.638604f, 0.882254f, -0.964082f,
+ -0.383920f, 0.292645f, 0.266341f, 0.747473f, -0.645631f, -0.538896f,
+ -0.319764f, 0.521880f, 0.460091f, -0.470898f, -0.778283f, -0.061622f,
+ -0.142433f, 0.210520f, 0.804197f, 0.285840f, -0.138414f, -0.381846f,
+ -0.499991f, 0.223648f, 0.439025f, 0.321508f, -0.099560f, -0.622893f,
+ 0.750925f, 0.740994f, 0.140405f, 0.074631f, -0.270223f, -0.829049f,
+ -0.753355f, -0.258015f, 0.006285f, -0.730573f, -1.107390f, -0.538015f,
+ -1.005520f, -0.724115f, -0.440183f, -0.395239f, 0.508768f, 0.204620f,
+ -0.267331f, 0.001740f, -0.838709f, 0.659333f, 0.043739f, -0.024099f,
+ 0.262431f, 0.252433f, -0.265215f, 0.057289f, -0.428192f, -0.114350f,
+ -0.011475f, 0.463995f, 0.668833f, -0.604556f, -0.122780f, -0.441645f,
+ 0.145769f, 0.310450f, -1.003500f, 0.936069f, 0.516604f, -0.643386f,
+ -0.518571f, 0.306130f, 0.337387f, 0.583400f, -0.366025f, -0.560035f,
+ -0.262332f, 0.465242f, 0.964332f, -0.545410f, -0.637428f, -0.202695f,
+ 0.378931f, 0.834604f, 0.000970f, -0.553303f, -0.562879f, 0.221665f,
+ 0.395160f, 0.446281f, -0.184394f, -0.591780f, 0.170595f, 1.164390f,
+ 0.227068f, -0.150910f, -0.393690f, -0.131151f, 0.309956f, -0.413518f,
+ -0.768334f, -0.548975f, 0.245384f, -0.256904f, -0.514790f, -0.102616f,
+ -0.347625f, 0.420456f, 0.037804f, -0.283200f, -0.578815f, 0.319282f,
+ 0.674622f, -0.011791f, -0.339329f, 0.466705f, 0.563444f, 0.409660f,
+ 0.445784f, -0.899507f, -0.605116f, 0.622438f, 0.427385f, -0.062509f,
+ 0.666570f, 0.057105f, 0.357894f, -0.811016f, -0.421715f, -0.458397f,
+ 0.288955f, 0.005857f, 0.236331f, 0.107957f, 0.587276f, -0.375800f,
+ 0.323799f, -0.623363f, 0.254122f, -0.198478f, -0.098436f, -0.282531f,
+ 0.452453f, -0.163349f, -0.413382f, -0.448732f, -0.528770f, -0.457449f,
+ -0.619619f, -0.265919f, -0.042760f, 0.438730f, 0.501798f, -0.403851f,
+ 0.519564f, 0.817314f, 0.366203f, 0.492610f, 0.546929f, 0.853094f,
+ 0.289000f, 0.453941f, -0.076152f, 0.007226f, -0.183717f, -0.506252f,
+ -0.599989f, -0.576006f, 0.746488f, 0.631466f, -0.475599f, -0.334991f,
+ -0.879614f, 0.918957f, 0.473471f, -0.043781f, -0.688234f, -0.925875f,
+ -0.188081f, 0.050918f, 0.116855f, 0.221413f, -0.066680f, -0.674395f,
+ -0.481985f, 0.247368f, 0.271129f, 0.637979f, -1.006970f, -0.855441f,
+ 0.144874f, 0.507424f, 1.506960f, -0.338910f, 0.398203f, 0.738000f,
+ 0.263193f, -0.425908f, 0.358271f, -1.072900f, -0.816209f, -0.425519f,
+ 0.264373f, 0.694014f, 0.036333f, 0.635532f, 0.518856f, 0.047585f,
+ -0.854817f, -0.138202f, 0.006811f, -0.052020f, -0.468498f, 0.489080f,
+ -0.105778f, 0.357038f, -0.782875f, 0.649049f, -0.562652f, -0.544392f,
+ -0.328526f, -0.402121f, -0.263172f, -0.668459f, -0.526702f, -0.395829f,
+ 0.190986f, 0.307766f, -1.001830f, -0.293051f, 0.283334f, 0.572450f,
+ 0.906095f, -1.144300f, 0.180989f, 0.421092f, 0.684571f, 0.527276f,
+ -0.122287f, 0.575067f, 0.675221f, 0.755029f, 0.094957f, 0.481403f,
+ 0.825155f, 0.755035f, 0.641420f, 0.034497f, 0.518783f, 0.283800f,
+ 0.293733f, -0.074778f, -0.268720f, 0.798921f, 0.317714f, -0.236391f,
+ -0.375071f, -0.414600f, 0.223413f, -0.349044f, -0.191033f, -0.391779f,
+ -0.596894f, -0.378608f, -0.185920f, -0.822171f, -0.754962f, -0.167706f,
+ 0.755378f, 0.671847f, 0.969414f, 0.793048f, 1.078610f, -0.418963f,
+ 0.367648f, 0.217645f, 0.294232f, 0.113027f, 0.060312f, -0.327488f,
+ -0.305035f, -0.243600f, -0.020588f, -0.326324f, -0.417534f, -0.425868f,
+ -0.404614f, -0.346750f, -0.339145f, -0.348094f, -0.527290f, -0.617825f,
+ -0.258342f, -0.200753f, -0.249779f, -0.321039f, -0.023117f, -0.004167f,
+ -0.206788f, -0.612420f, -0.646428f, -0.548969f, -0.158875f, 0.213814f,
+ -0.084040f, -0.217365f, -0.511895f, -0.653285f, 0.440971f, 0.455591f,
+ -0.123900f, 0.134097f, -0.251241f, 0.682463f, 0.740614f, 0.991212f,
+ 0.565984f, 0.592690f,
+};
+
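+// Horizontally reduce four float32x4_t partial sums into one vector with a
+// pairwise tree: (a[0] + a[1]) + (a[2] + a[3]).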
+static INLINE float32x4_t add_f32x4_x4(const float32x4_t a[4]) {
+ float32x4_t sum01 = vaddq_f32(a[0], a[1]);
+ float32x4_t sum23 = vaddq_f32(a[2], a[3]);
+ return vaddq_f32(sum01, sum23);
+}
+
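+// 2x2 filter with 2x2 skip and valid padding, specialized for in_width >= 16:
+// each pass of the inner loop consumes two input channels and accumulates two
+// groups of four horizontally adjacent outputs (sum0 and sum1), so eight
+// output values are written per iteration of the w loop.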
+static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon(
+ const float **input, int in_width, int in_height, int in_stride,
+ const float *bias, const int skip_width, const int skip_height,
+ const int filter_width, const int filter_height, const int in_channels,
+ const int out_channels, float **output, int out_stride, int start_idx,
+ const float *weights) {
+ assert(filter_height == 2 && filter_width == 2);
+ assert(skip_width == 2 && skip_height == 2);
+ assert(in_width >= 16);
+ const int in_size = in_height * in_width;
+
+ do {
+    const float32x4_t bias_v = vdupq_n_f32(*bias);
+ const float *weight_ptr0 = weights;
+ const float *in_ptr0 = *input;
+ float *out_ptr0 = *output;
+ int h = 0;
+
+ do {
+ const float *in_ptr1 = in_ptr0;
+ float *out_ptr1 = out_ptr0;
+ int w = 0;
+
+ do {
+ const float *weight_ptr1 = weight_ptr0;
+ const float *in_ptr2 = in_ptr1;
+ int k = 0;
+ float32x4_t sum0[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0),
+ vdupq_n_f32(0) };
+ float32x4_t sum1[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0),
+ vdupq_n_f32(0) };
+
+ do {
+ const float32x4_t weights0 = vld1q_f32(weight_ptr1);
+ const float32x4_t weights1 = vld1q_f32(weight_ptr1 + 4);
+ const float32x2_t weights0_lo = vget_low_f32(weights0);
+ const float32x2_t weights0_hi = vget_high_f32(weights0);
+ const float32x2_t weights1_lo = vget_low_f32(weights1);
+ const float32x2_t weights1_hi = vget_high_f32(weights1);
+
+ const float32x4x2_t in0_lo_0 = vld2q_f32(in_ptr2);
+ const float32x4x2_t in0_hi_0 = vld2q_f32(in_ptr2 + in_stride);
+ const float32x4x2_t in1_lo_0 = vld2q_f32(in_ptr2 + in_size);
+ const float32x4x2_t in1_hi_0 =
+ vld2q_f32(in_ptr2 + in_size + in_stride);
+
+ sum0[0] = vmlaq_lane_f32(sum0[0], in0_lo_0.val[0], weights0_lo, 0);
+ sum0[0] = vmlaq_lane_f32(sum0[0], in0_lo_0.val[1], weights0_lo, 1);
+
+ sum0[1] = vmlaq_lane_f32(sum0[1], in0_hi_0.val[0], weights0_hi, 0);
+ sum0[1] = vmlaq_lane_f32(sum0[1], in0_hi_0.val[1], weights0_hi, 1);
+
+ sum0[2] = vmlaq_lane_f32(sum0[2], in1_lo_0.val[0], weights1_lo, 0);
+ sum0[2] = vmlaq_lane_f32(sum0[2], in1_lo_0.val[1], weights1_lo, 1);
+
+ sum0[3] = vmlaq_lane_f32(sum0[3], in1_hi_0.val[0], weights1_hi, 0);
+ sum0[3] = vmlaq_lane_f32(sum0[3], in1_hi_0.val[1], weights1_hi, 1);
+
+ const float32x4x2_t in0_lo_1 = vld2q_f32(in_ptr2 + 8);
+ const float32x4x2_t in0_hi_1 = vld2q_f32(in_ptr2 + in_stride + 8);
+ const float32x4x2_t in1_lo_1 = vld2q_f32(in_ptr2 + in_size + 8);
+ const float32x4x2_t in1_hi_1 =
+ vld2q_f32(in_ptr2 + in_size + in_stride + 8);
+
+ sum1[0] = vmlaq_lane_f32(sum1[0], in0_lo_1.val[0], weights0_lo, 0);
+ sum1[0] = vmlaq_lane_f32(sum1[0], in0_lo_1.val[1], weights0_lo, 1);
+
+ sum1[1] = vmlaq_lane_f32(sum1[1], in0_hi_1.val[0], weights0_hi, 0);
+ sum1[1] = vmlaq_lane_f32(sum1[1], in0_hi_1.val[1], weights0_hi, 1);
+
+ sum1[2] = vmlaq_lane_f32(sum1[2], in1_lo_1.val[0], weights1_lo, 0);
+ sum1[2] = vmlaq_lane_f32(sum1[2], in1_lo_1.val[1], weights1_lo, 1);
+
+ sum1[3] = vmlaq_lane_f32(sum1[3], in1_hi_1.val[0], weights1_hi, 0);
+ sum1[3] = vmlaq_lane_f32(sum1[3], in1_hi_1.val[1], weights1_hi, 1);
+
+ weight_ptr1 += 8;
+ in_ptr2 += 2 * in_size;
+ k += 2;
+ } while (k < in_channels);
+
+ vst1q_f32(out_ptr1, add_f32x4_x4(sum0));
+ vst1q_f32(out_ptr1 + 4, add_f32x4_x4(sum1));
+
+ out_ptr1 += 8;
+ in_ptr1 += 8 * skip_width;
+ w += 8 * skip_width;
+ } while (w < in_width - filter_width + 1);
+
+ out_ptr0 += out_stride;
+ in_ptr0 += skip_height * in_stride;
+ h += skip_height;
+ } while (h < in_height - filter_height + 1);
+
+ ++bias;
+ ++output;
+ weights += in_channels * filter_height * filter_width;
+ } while (++start_idx < out_channels);
+}
+
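+// As above, but specialized for in_width == 8: only one group of four outputs
+// is live per iteration of the w loop.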
+static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon(
+ const float **input, int in_width, int in_height, int in_stride,
+ const float *bias, const int skip_width, const int skip_height,
+ const int filter_width, const int filter_height, const int in_channels,
+ const int out_channels, float **output, int out_stride, int start_idx,
+ const float *weights) {
+ assert(filter_height == 2 && filter_width == 2);
+ assert(skip_width == 2 && skip_height == 2);
+ assert(in_width == 8);
+ const int in_size = in_height * in_width;
+ do {
+ const float32x4_t bias_v = vdupq_n_f32(*bias);
+ const float *weight_ptr0 = weights;
+ const float *in_ptr0 = *input;
+ float *out_ptr0 = *output;
+ int h = 0;
+
+ do {
+ const float *in_ptr1 = in_ptr0;
+ float *out_ptr1 = out_ptr0;
+ int w = 0;
+
+ do {
+ const float *weight_ptr1 = weight_ptr0;
+ const float *in_ptr2 = in_ptr1;
+ int k = 0;
+ float32x4_t sum[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0),
+ vdupq_n_f32(0) };
+
+ do {
+ const float32x4_t weights0 = vld1q_f32(weight_ptr1);
+ const float32x4_t weights1 = vld1q_f32(weight_ptr1 + 4);
+ const float32x2_t weights0_lo = vget_low_f32(weights0);
+ const float32x2_t weights0_hi = vget_high_f32(weights0);
+ const float32x2_t weights1_lo = vget_low_f32(weights1);
+ const float32x2_t weights1_hi = vget_high_f32(weights1);
+
+ const float32x4x2_t in0_lo = vld2q_f32(in_ptr2);
+ const float32x4x2_t in0_hi = vld2q_f32(in_ptr2 + in_stride);
+ const float32x4x2_t in1_lo = vld2q_f32(in_ptr2 + in_size);
+ const float32x4x2_t in1_hi = vld2q_f32(in_ptr2 + in_size + in_stride);
+
+ sum[0] = vmlaq_lane_f32(sum[0], in0_lo.val[0], weights0_lo, 0);
+ sum[0] = vmlaq_lane_f32(sum[0], in0_lo.val[1], weights0_lo, 1);
+
+ sum[1] = vmlaq_lane_f32(sum[1], in0_hi.val[0], weights0_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in0_hi.val[1], weights0_hi, 1);
+
+ sum[2] = vmlaq_lane_f32(sum[2], in1_lo.val[0], weights1_lo, 0);
+ sum[2] = vmlaq_lane_f32(sum[2], in1_lo.val[1], weights1_lo, 1);
+
+ sum[3] = vmlaq_lane_f32(sum[3], in1_hi.val[0], weights1_hi, 0);
+ sum[3] = vmlaq_lane_f32(sum[3], in1_hi.val[1], weights1_hi, 1);
+
+ weight_ptr1 += 8;
+ in_ptr2 += 2 * in_size;
+ k += 2;
+ } while (k < in_channels);
+
+ vst1q_f32(out_ptr1, add_f32x4_x4(sum));
+
+ out_ptr1 += 4;
+ in_ptr1 += 4 * skip_width;
+ w += 4 * skip_width;
+ } while (w < in_width - filter_width + 1);
+
+ out_ptr0 += out_stride;
+ in_ptr0 += skip_height * in_stride;
+ h += skip_height;
+ } while (h < in_height - filter_height + 1);
+
+ ++bias;
+ ++output;
+ weights += in_channels * filter_height * filter_width;
+ } while (++start_idx < out_channels);
+}
+
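+// 5x5 filter with 4x4 skip and a single input channel (layer 0). vld4q_f32
+// de-interleaves each input row so that lane i of the accumulators tracks the
+// output whose window starts at column 4 * i; the fifth tap of each kernel
+// row comes from the vextq_f32-shifted in*_4 vectors.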
+static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon(
+ const float **input, int in_width, int in_height, int in_stride,
+ const float *bias, const int skip_width, const int skip_height,
+ const int filter_width, const int filter_height, const int in_channels,
+ const int out_channels, float **output, int out_stride, int start_idx,
+ const float *weights) {
+ assert(filter_height == 5 && filter_width == 5);
+ assert(skip_width == 4 && skip_height == 4);
+ assert(in_width >= 16);
+ assert(in_channels == 1);
+ (void)in_channels;
+
+ do {
+ const float32x4_t bias_v = vdupq_n_f32(*bias);
+ const float *in_ptr0 = *input;
+ const float *weights_ptr0 = weights;
+ float *out_ptr0 = *output;
+ int h = 0;
+
+ do {
+ const float *in_ptr1 = in_ptr0;
+ float *out_ptr1 = out_ptr0;
+ int w = 0;
+
+ do {
+ float32x4_t sum[2] = { bias_v, vdupq_n_f32(0) };
+
+ const float32x4_t weight_0_3 = vld1q_f32(weights_ptr0);
+ const float32x4_t weight_4_7 = vld1q_f32(weights_ptr0 + 4);
+ const float32x4_t weight_8_11 = vld1q_f32(weights_ptr0 + 8);
+ const float32x4_t weight_12_15 = vld1q_f32(weights_ptr0 + 12);
+ const float32x4_t weight_16_19 = vld1q_f32(weights_ptr0 + 16);
+ const float32x4_t weight_20_23 = vld1q_f32(weights_ptr0 + 20);
+
+ const float32x2_t weight_0_3_lo = vget_low_f32(weight_0_3);
+ const float32x2_t weight_0_3_hi = vget_high_f32(weight_0_3);
+ const float32x2_t weight_4_7_lo = vget_low_f32(weight_4_7);
+ const float32x2_t weight_4_7_hi = vget_high_f32(weight_4_7);
+ const float32x2_t weight_8_11_lo = vget_low_f32(weight_8_11);
+ const float32x2_t weight_8_11_hi = vget_high_f32(weight_8_11);
+ const float32x2_t weight_12_15_lo = vget_low_f32(weight_12_15);
+ const float32x2_t weight_12_15_hi = vget_high_f32(weight_12_15);
+ const float32x2_t weight_16_19_lo = vget_low_f32(weight_16_19);
+ const float32x2_t weight_16_19_hi = vget_high_f32(weight_16_19);
+ const float32x2_t weight_20_23_lo = vget_low_f32(weight_20_23);
+ const float32x2_t weight_20_23_hi = vget_high_f32(weight_20_23);
+
+ const float32x4x4_t in0 = vld4q_f32(in_ptr1 + 0 * in_stride);
+ const float32x4x4_t in1 = vld4q_f32(in_ptr1 + 1 * in_stride);
+ const float32x4x4_t in2 = vld4q_f32(in_ptr1 + 2 * in_stride);
+ const float32x4x4_t in3 = vld4q_f32(in_ptr1 + 3 * in_stride);
+ const float32x4x4_t in4 = vld4q_f32(in_ptr1 + 4 * in_stride);
+
+ const float32x4_t in0_4 = vextq_f32(
+ in0.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 0 * in_stride)), 1);
+ const float32x4_t in1_4 = vextq_f32(
+ in1.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 1 * in_stride)), 1);
+ const float32x4_t in2_4 = vextq_f32(
+ in2.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 2 * in_stride)), 1);
+ const float32x4_t in3_4 = vextq_f32(
+ in3.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 3 * in_stride)), 1);
+ const float32x4_t in4_4 = vextq_f32(
+ in4.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 4 * in_stride)), 1);
+
+ // Kernel row 0.
+ sum[0] = vmlaq_lane_f32(sum[0], in0.val[0], weight_0_3_lo, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in0.val[1], weight_0_3_lo, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in0.val[2], weight_0_3_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in0.val[3], weight_0_3_hi, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in0_4, weight_4_7_lo, 0);
+
+ // Kernel row 1.
+ sum[1] = vmlaq_lane_f32(sum[1], in1.val[0], weight_4_7_lo, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in1.val[1], weight_4_7_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in1.val[2], weight_4_7_hi, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in1.val[3], weight_8_11_lo, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in1_4, weight_8_11_lo, 1);
+
+ // Kernel row 2.
+ sum[0] = vmlaq_lane_f32(sum[0], in2.val[0], weight_8_11_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in2.val[1], weight_8_11_hi, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in2.val[2], weight_12_15_lo, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in2.val[3], weight_12_15_lo, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in2_4, weight_12_15_hi, 0);
+
+ // Kernel row 3.
+ sum[1] = vmlaq_lane_f32(sum[1], in3.val[0], weight_12_15_hi, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in3.val[1], weight_16_19_lo, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in3.val[2], weight_16_19_lo, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in3.val[3], weight_16_19_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in3_4, weight_16_19_hi, 1);
+
+ // Kernel row 4.
+ sum[0] = vmlaq_lane_f32(sum[0], in4.val[0], weight_20_23_lo, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in4.val[1], weight_20_23_lo, 1);
+ sum[0] = vmlaq_lane_f32(sum[0], in4.val[2], weight_20_23_hi, 0);
+ sum[1] = vmlaq_lane_f32(sum[1], in4.val[3], weight_20_23_hi, 1);
+ sum[0] = vmlaq_f32(sum[0], vdupq_n_f32(*(weights_ptr0 + 24)), in4_4);
+
+ vst1q_f32(out_ptr1, vaddq_f32(sum[0], sum[1]));
+
+ out_ptr1 += 4;
+ in_ptr1 += 4 * skip_width;
+ w += 4 * skip_width;
+ } while (w < in_width - filter_width + 1);
+
+ out_ptr0 += out_stride;
+ in_ptr0 += skip_height * in_stride;
+ h += skip_height;
+ } while (h < in_height - filter_height + 1);
+
+ ++output;
+ ++bias;
+ weights += 25;
+ } while (++start_idx < out_channels);
+}
+
+// Neon variant of av1_cnn_convolve_no_maxpool_padding_valid_c().
+// In the current encoder, av1_cnn_convolve() is called for a block size of
+// 64x64. It uses the layer config values set by
+// av1_intra_mode_cnn_partition_cnn_config. Each layer's config parameters
+// are summarized below.
+// Layer_Number in_size out_size filter_wd filter_ht skip_wd skip_ht
+// 0 64x64 16x16 5 5 4 4
+// 1 16x16 8x8 2 2 2 2
+// 2 8x8 4x4 2 2 2 2
+// 3 4x4 2x2 2 2 2 2
+// 4 2x2 1x1 2 2 2 2
+// Here,
+// filter_wd = filter_width and filter_ht = filter_height,
+// skip_wd = skip_width and skip_ht = skip_height.
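+//
+// For reference, each output element produced by the kernels above corresponds
+// to the scalar expression below (a sketch of a strided valid-padding
+// convolution; the index names are illustrative, not upstream identifiers):
+//   out[c][v][u] = bias[c]
+//       + sum over k, i, j of
+//             in[k][v * skip_height + i][u * skip_width + j] * w[c][k][i][j]
+// where k runs over in_channels, 0 <= i < filter_height and
+// 0 <= j < filter_width.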
+void av1_cnn_convolve_no_maxpool_padding_valid_neon(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
+ int start_idx, int cstep, int channel_step) {
+ assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
+ !layer_config->maxpool);
+ assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
+ assert(layer_config->pad == PADDING_VALID);
+ assert(channel_step == 1);
+ assert(cstep == layer_config->in_channels * layer_config->out_channels);
+
+ if (layer_config->filter_width == 5 && layer_config->filter_height == 5 &&
+ layer_config->skip_width == 4 && layer_config->skip_height == 4) {
+ av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon(
+ input, in_width, in_height, in_stride, layer_config->bias,
+ layer_config->skip_width, layer_config->skip_height,
+ layer_config->filter_width, layer_config->filter_height,
+ layer_config->in_channels, layer_config->out_channels, output,
+ out_stride, start_idx, weights_layer_5);
+ } else if (layer_config->filter_width == 2 &&
+ layer_config->filter_height == 2 &&
+ layer_config->skip_width == 2 && layer_config->skip_height == 2) {
+ const float *weights = weights_layer_1;
+ if (layer_config->output_num ==
+ av1_intra_mode_cnn_partition_cnn_config.layer_config[2].output_num) {
+ weights = weights_layer_2;
+  } else if (layer_config->output_num ==
+             av1_intra_mode_cnn_partition_cnn_config.layer_config[3]
+                 .output_num) {
+    weights = weights_layer_3;
+  } else if (layer_config->output_num ==
+             av1_intra_mode_cnn_partition_cnn_config.layer_config[4]
+                 .output_num) {
+ weights = weights_layer_4;
+ }
+ if (in_width >= 16) {
+ av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon(
+ input, in_width, in_height, in_stride, layer_config->bias,
+ layer_config->skip_width, layer_config->skip_height,
+ layer_config->filter_width, layer_config->filter_height,
+ layer_config->in_channels, layer_config->out_channels, output,
+ out_stride, start_idx, weights);
+ } else if (in_width == 8) {
+ av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon(
+ input, in_width, in_height, in_stride, layer_config->bias,
+ layer_config->skip_width, layer_config->skip_height,
+ layer_config->filter_width, layer_config->filter_height,
+ layer_config->in_channels, layer_config->out_channels, output,
+ out_stride, start_idx, weights);
+ } else {
+ av1_cnn_convolve_no_maxpool_padding_valid_c(
+ input, in_width, in_height, in_stride, layer_config, output,
+ out_stride, start_idx, cstep, channel_step);
+ }
+ } else {
+ av1_cnn_convolve_no_maxpool_padding_valid_c(
+ input, in_width, in_height, in_stride, layer_config, output, out_stride,
+ start_idx, cstep, channel_step);
+ }
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c b/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c
new file mode 100644
index 0000000000..582863a27c
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/encodetxb_neon.c
@@ -0,0 +1,646 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/encodetxb.h"
+
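+// Fill the padded levels[] buffer with the saturated absolute values of the
+// transform coefficients. The padding regions are zeroed first so that the
+// context-derivation kernels can read past the block edges without branching.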
+void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
+ const int stride = height + TX_PAD_HOR;
+ memset(levels - TX_PAD_TOP * stride, 0,
+ sizeof(*levels) * TX_PAD_TOP * stride);
+ memset(levels + stride * width, 0,
+ sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
+
+ const int32x4_t zeros = vdupq_n_s32(0);
+ int i = 0;
+ uint8_t *ls = levels;
+ const tran_low_t *cf = coeff;
+ if (height == 4) {
+ do {
+ const int32x4_t coeffA = vld1q_s32(cf);
+ const int32x4_t coeffB = vld1q_s32(cf + height);
+ const int16x8_t coeffAB =
+ vcombine_s16(vqmovn_s32(coeffA), vqmovn_s32(coeffB));
+ const int16x8_t absAB = vqabsq_s16(coeffAB);
+ const int8x8_t absABs = vqmovn_s16(absAB);
+#if AOM_ARCH_AARCH64
+ const int8x16_t absAB8 =
+ vcombine_s8(absABs, vreinterpret_s8_s32(vget_low_s32(zeros)));
+ const uint8x16_t lsAB =
+ vreinterpretq_u8_s32(vzip1q_s32(vreinterpretq_s32_s8(absAB8), zeros));
+#else
+ const int32x2x2_t absAB8 =
+ vzip_s32(vreinterpret_s32_s8(absABs), vget_low_s32(zeros));
+ const uint8x16_t lsAB =
+ vreinterpretq_u8_s32(vcombine_s32(absAB8.val[0], absAB8.val[1]));
+#endif
+ vst1q_u8(ls, lsAB);
+ ls += (stride << 1);
+ cf += (height << 1);
+ i += 2;
+ } while (i < width);
+ } else if (height == 8) {
+ do {
+ const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
+ const int16x8_t absAB = vqabsq_s16(coeffAB);
+ const uint8x16_t absAB8 = vreinterpretq_u8_s8(vcombine_s8(
+ vqmovn_s16(absAB), vreinterpret_s8_s32(vget_low_s32(zeros))));
+ vst1q_u8(ls, absAB8);
+ ls += stride;
+ cf += height;
+ i += 1;
+ } while (i < width);
+ } else {
+ do {
+ int j = 0;
+ do {
+ const int16x8_t coeffAB = load_tran_low_to_s16q(cf);
+ const int16x8_t coeffCD = load_tran_low_to_s16q(cf + 8);
+ const int16x8_t absAB = vqabsq_s16(coeffAB);
+ const int16x8_t absCD = vqabsq_s16(coeffCD);
+ const uint8x16_t absABCD = vreinterpretq_u8_s8(
+ vcombine_s8(vqmovn_s16(absAB), vqmovn_s16(absCD)));
+ vst1q_u8((ls + j), absABCD);
+ j += 16;
+ cf += 16;
+ } while (j < height);
+ *(int32_t *)(ls + height) = 0;
+ ls += stride;
+ i += 1;
+ } while (i < width);
+ }
+}
+
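+// The pos_to_offset tables below map each coefficient position within a
+// 16-element processing group to its base context offset, so that one vector
+// add replaces the per-position branching of the scalar context derivation.
+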
+// get_4_nz_map_contexts_2d coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_4_po_2d[2][16]) = {
+ { 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21 },
+ { 0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21, 21, 21 }
+};
+
+// get_4_nz_map_contexts_ver coefficients:
+/* clang-format off */
+#define SIG_COEF_CONTEXTS_2D_X4_051010 \
+ (SIG_COEF_CONTEXTS_2D + ((SIG_COEF_CONTEXTS_2D + 5) << 8) + \
+ ((SIG_COEF_CONTEXTS_2D + 10) << 16) + ((SIG_COEF_CONTEXTS_2D + 10) << 24))
+/* clang-format on */
+
+// get_4_nz_map_contexts_hor coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_4_po_hor[16]) = {
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// get_8_coeff_contexts_2d coefficients:
+// if (width == 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_8[2][16]) = {
+ { 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21 },
+ { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
+};
+// if (width < 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_l[2][16]) = {
+ { 0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21, 21, 21, 21, 21 },
+ { 11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21, 21, 21, 21, 21, 21 }
+};
+
+// if (width > 8)
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_2d_g[2][16]) = {
+ { 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 },
+ { 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// get_8_coeff_contexts_ver coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_8_po_ver[16]) = {
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// get_16n_coeff_contexts_2d coefficients:
+// real_width == real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_e[4][16]) = {
+ { 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// real_width < real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_g[3][16]) = {
+ { 0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// real_width > real_height
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_2d_l[3][16]) = {
+ { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 },
+ { 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 },
+ { 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21 }
+};
+
+// get_16n_coeff_contexts_ver coefficients:
+static const DECLARE_ALIGNED(16, uint8_t, c_16_po_ver[16]) = {
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10
+};
+
+// End of the coefficient table declarations.
+
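+// Gather four 32-bit chunks located byte_stride apart into one 128-bit
+// register.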
+static INLINE uint8x16_t load_8bit_4x4_to_1_reg(const uint8_t *const src,
+ const int byte_stride) {
+#if AOM_ARCH_AARCH64
+ uint32x4_t v_data = vld1q_u32((uint32_t *)src);
+ v_data = vld1q_lane_u32((uint32_t *)(src + 1 * byte_stride), v_data, 1);
+ v_data = vld1q_lane_u32((uint32_t *)(src + 2 * byte_stride), v_data, 2);
+ v_data = vld1q_lane_u32((uint32_t *)(src + 3 * byte_stride), v_data, 3);
+
+ return vreinterpretq_u8_u32(v_data);
+#else
+ return load_unaligned_u8q(src, byte_stride);
+#endif
+}
+
+static INLINE uint8x16_t load_8bit_8x2_to_1_reg(const uint8_t *const src,
+ const int byte_stride) {
+#if AOM_ARCH_AARCH64
+ uint64x2_t v_data = vld1q_u64((uint64_t *)src);
+ v_data = vld1q_lane_u64((uint64_t *)(src + 1 * byte_stride), v_data, 1);
+
+ return vreinterpretq_u8_u64(v_data);
+#else
+ uint8x8_t v_data_low = vld1_u8(src);
+ uint8x8_t v_data_high = vld1_u8(src + byte_stride);
+
+ return vcombine_u8(v_data_low, v_data_high);
+#endif
+}
+
+static INLINE uint8x16_t load_8bit_16x1_to_1_reg(const uint8_t *const src,
+ const int byte_stride) {
+ (void)byte_stride;
+ return vld1q_u8(src);
+}
+
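+// Load the five neighbouring level vectors needed for context derivation: the
+// two immediate neighbours at src[1] and src[stride], plus three further
+// neighbours addressed via offsets[].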
+static INLINE void load_levels_4x4x5(const uint8_t *const src, const int stride,
+ const ptrdiff_t *const offsets,
+ uint8x16_t *const level) {
+ level[0] = load_8bit_4x4_to_1_reg(&src[1], stride);
+ level[1] = load_8bit_4x4_to_1_reg(&src[stride], stride);
+ level[2] = load_8bit_4x4_to_1_reg(&src[offsets[0]], stride);
+ level[3] = load_8bit_4x4_to_1_reg(&src[offsets[1]], stride);
+ level[4] = load_8bit_4x4_to_1_reg(&src[offsets[2]], stride);
+}
+
+static INLINE void load_levels_8x2x5(const uint8_t *const src, const int stride,
+ const ptrdiff_t *const offsets,
+ uint8x16_t *const level) {
+ level[0] = load_8bit_8x2_to_1_reg(&src[1], stride);
+ level[1] = load_8bit_8x2_to_1_reg(&src[stride], stride);
+ level[2] = load_8bit_8x2_to_1_reg(&src[offsets[0]], stride);
+ level[3] = load_8bit_8x2_to_1_reg(&src[offsets[1]], stride);
+ level[4] = load_8bit_8x2_to_1_reg(&src[offsets[2]], stride);
+}
+
+static INLINE void load_levels_16x1x5(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ uint8x16_t *const level) {
+ level[0] = load_8bit_16x1_to_1_reg(&src[1], stride);
+ level[1] = load_8bit_16x1_to_1_reg(&src[stride], stride);
+ level[2] = load_8bit_16x1_to_1_reg(&src[offsets[0]], stride);
+ level[3] = load_8bit_16x1_to_1_reg(&src[offsets[1]], stride);
+ level[4] = load_8bit_16x1_to_1_reg(&src[offsets[2]], stride);
+}
+
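+// For 16 positions at once: clamp each of the five neighbouring levels to 3,
+// then return min(4, (sum + 1) >> 1), matching the statistic the scalar
+// context-derivation code computes per coefficient.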
+static INLINE uint8x16_t get_coeff_contexts_kernel(uint8x16_t *const level) {
+ const uint8x16_t const_3 = vdupq_n_u8(3);
+ const uint8x16_t const_4 = vdupq_n_u8(4);
+ uint8x16_t count;
+
+ count = vminq_u8(level[0], const_3);
+ level[1] = vminq_u8(level[1], const_3);
+ level[2] = vminq_u8(level[2], const_3);
+ level[3] = vminq_u8(level[3], const_3);
+ level[4] = vminq_u8(level[4], const_3);
+ count = vaddq_u8(count, level[1]);
+ count = vaddq_u8(count, level[2]);
+ count = vaddq_u8(count, level[3]);
+ count = vaddq_u8(count, level[4]);
+
+ count = vrshrq_n_u8(count, 1);
+ count = vminq_u8(count, const_4);
+ return count;
+}
+
+static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ uint8_t *const coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const uint8x16_t pos_to_offset_large = vdupq_n_u8(21);
+
+ uint8x16_t pos_to_offset =
+ (width == 4) ? vld1q_u8(c_4_po_2d[0]) : vld1q_u8(c_4_po_2d[1]);
+
+ uint8x16_t count;
+ uint8x16_t level[5];
+ uint8_t *cc = coeff_contexts;
+
+ assert(!(width % 4));
+
+ int col = width;
+ do {
+ load_levels_4x4x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset);
+ vst1q_u8(cc, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 4 * stride;
+ cc += 16;
+ col -= 4;
+ } while (col);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+
+ const uint8x16_t pos_to_offset =
+ vreinterpretq_u8_u32(vdupq_n_u32(SIG_COEF_CONTEXTS_2D_X4_051010));
+
+ uint8x16_t count;
+ uint8x16_t level[5];
+
+ assert(!(width % 4));
+
+ int col = width;
+ do {
+ load_levels_4x4x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset);
+ vst1q_u8(coeff_contexts, count);
+ levels += 4 * stride;
+ coeff_contexts += 16;
+ col -= 4;
+ } while (col);
+}
+
+static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+ uint8x16_t pos_to_offset = vld1q_u8(c_4_po_hor);
+
+ uint8x16_t count;
+ uint8x16_t level[5];
+
+ assert(!(width % 4));
+
+ int col = width;
+ do {
+ load_levels_4x4x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset);
+ vst1q_u8(coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 4 * stride;
+ coeff_contexts += 16;
+ col -= 4;
+ } while (col);
+}
+
+static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ uint8_t *cc = coeff_contexts;
+ uint8x16_t count;
+ uint8x16_t level[5];
+ uint8x16_t pos_to_offset[3];
+
+ assert(!(width % 2));
+
+ if (width == 8) {
+ pos_to_offset[0] = vld1q_u8(c_8_po_2d_8[0]);
+ pos_to_offset[1] = vld1q_u8(c_8_po_2d_8[1]);
+ } else if (width < 8) {
+ pos_to_offset[0] = vld1q_u8(c_8_po_2d_l[0]);
+ pos_to_offset[1] = vld1q_u8(c_8_po_2d_l[1]);
+ } else {
+ pos_to_offset[0] = vld1q_u8(c_8_po_2d_g[0]);
+ pos_to_offset[1] = vld1q_u8(c_8_po_2d_g[1]);
+ }
+ pos_to_offset[2] = vdupq_n_u8(21);
+
+ int col = width;
+ do {
+ load_levels_8x2x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset[0]);
+ vst1q_u8(cc, count);
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ levels += 2 * stride;
+ cc += 16;
+ col -= 2;
+ } while (col);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+
+ const uint8x16_t pos_to_offset = vld1q_u8(c_8_po_ver);
+
+ uint8x16_t count;
+ uint8x16_t level[5];
+
+ assert(!(width % 2));
+
+ int col = width;
+ do {
+ load_levels_8x2x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset);
+ vst1q_u8(coeff_contexts, count);
+ levels += 2 * stride;
+ coeff_contexts += 16;
+ col -= 2;
+ } while (col);
+}
+
+static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+ uint8x16_t pos_to_offset = vcombine_u8(vdup_n_u8(SIG_COEF_CONTEXTS_2D + 0),
+ vdup_n_u8(SIG_COEF_CONTEXTS_2D + 5));
+
+ uint8x16_t count;
+ uint8x16_t level[5];
+
+ assert(!(width % 2));
+
+ int col = width;
+ do {
+ load_levels_8x2x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset);
+ vst1q_u8(coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 2 * stride;
+ coeff_contexts += 16;
+ col -= 2;
+ } while (col);
+}
+
+static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
+ const int real_width,
+ const int real_height,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = height + TX_PAD_HOR;
+ uint8_t *cc = coeff_contexts;
+ int col = width;
+ uint8x16_t pos_to_offset[5];
+ uint8x16_t pos_to_offset_large[3];
+ uint8x16_t count;
+ uint8x16_t level[5];
+
+ assert(!(height % 16));
+
+ pos_to_offset_large[2] = vdupq_n_u8(21);
+ if (real_width == real_height) {
+ pos_to_offset[0] = vld1q_u8(c_16_po_2d_e[0]);
+ pos_to_offset[1] = vld1q_u8(c_16_po_2d_e[1]);
+ pos_to_offset[2] = vld1q_u8(c_16_po_2d_e[2]);
+ pos_to_offset[3] = vld1q_u8(c_16_po_2d_e[3]);
+ pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
+ pos_to_offset_large[2];
+ } else if (real_width < real_height) {
+ pos_to_offset[0] = vld1q_u8(c_16_po_2d_g[0]);
+ pos_to_offset[1] = vld1q_u8(c_16_po_2d_g[1]);
+ pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] =
+ vld1q_u8(c_16_po_2d_g[2]);
+ pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
+ } else { // real_width > real_height
+ pos_to_offset[0] = pos_to_offset[1] = vld1q_u8(c_16_po_2d_l[0]);
+ pos_to_offset[2] = vld1q_u8(c_16_po_2d_l[1]);
+ pos_to_offset[3] = vld1q_u8(c_16_po_2d_l[2]);
+ pos_to_offset[4] = pos_to_offset_large[2];
+ pos_to_offset_large[0] = pos_to_offset_large[1] = vdupq_n_u8(16);
+ }
+
+ do {
+ int h = height;
+
+ do {
+ load_levels_16x1x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset[0]);
+ vst1q_u8(cc, count);
+ levels += 16;
+ cc += 16;
+ h -= 16;
+ pos_to_offset[0] = pos_to_offset_large[0];
+ } while (h);
+
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ pos_to_offset[2] = pos_to_offset[3];
+ pos_to_offset[3] = pos_to_offset[4];
+ pos_to_offset_large[0] = pos_to_offset_large[1];
+ pos_to_offset_large[1] = pos_to_offset_large[2];
+ levels += TX_PAD_HOR;
+ } while (--col);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = height + TX_PAD_HOR;
+
+ const uint8x16_t pos_to_offset_large = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+ uint8x16_t count;
+ uint8x16_t level[5];
+
+ assert(!(height % 16));
+
+ int col = width;
+ do {
+ uint8x16_t pos_to_offset = vld1q_u8(c_16_po_ver);
+
+ int h = height;
+ do {
+ load_levels_16x1x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset);
+ vst1q_u8(coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 16;
+ coeff_contexts += 16;
+ h -= 16;
+ } while (h);
+
+ levels += TX_PAD_HOR;
+ } while (--col);
+}
+
+static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ uint8_t *coeff_contexts) {
+ const int stride = height + TX_PAD_HOR;
+
+ uint8x16_t pos_to_offset[3];
+ uint8x16_t count;
+ uint8x16_t level[5];
+
+ assert(!(height % 16));
+
+ pos_to_offset[0] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 0);
+ pos_to_offset[1] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 5);
+ pos_to_offset[2] = vdupq_n_u8(SIG_COEF_CONTEXTS_2D + 10);
+
+ int col = width;
+ do {
+ int h = height;
+ do {
+ load_levels_16x1x5(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel(level);
+ count = vaddq_u8(count, pos_to_offset[0]);
+ vst1q_u8(coeff_contexts, count);
+ levels += 16;
+ coeff_contexts += 16;
+ h -= 16;
+ } while (h);
+
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ levels += TX_PAD_HOR;
+ } while (--col);
+}
+
+// Note: levels[] must be in the range [0, 127], inclusive.
+void av1_get_nz_map_contexts_neon(const uint8_t *const levels,
+ const int16_t *const scan, const uint16_t eob,
+ const TX_SIZE tx_size,
+ const TX_CLASS tx_class,
+ int8_t *const coeff_contexts) {
+ const int last_idx = eob - 1;
+ if (!last_idx) {
+ coeff_contexts[0] = 0;
+ return;
+ }
+
+ uint8_t *const coefficients = (uint8_t *const)coeff_contexts;
+
+ const int real_width = tx_size_wide[tx_size];
+ const int real_height = tx_size_high[tx_size];
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const int stride = height + TX_PAD_HOR;
+ ptrdiff_t offsets[3];
+
+  /* coeff_contexts must be 16-byte aligned. */
+ assert(!((intptr_t)coeff_contexts & 0xf));
+
+ if (tx_class == TX_CLASS_2D) {
+ offsets[0] = 0 * stride + 2;
+ offsets[1] = 1 * stride + 1;
+ offsets[2] = 2 * stride + 0;
+
+ if (height == 4) {
+ get_4_nz_map_contexts_2d(levels, width, offsets, coefficients);
+ } else if (height == 8) {
+ get_8_coeff_contexts_2d(levels, width, offsets, coefficients);
+ } else {
+ get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+ offsets, coefficients);
+ }
+ } else if (tx_class == TX_CLASS_HORIZ) {
+ offsets[0] = 2 * stride;
+ offsets[1] = 3 * stride;
+ offsets[2] = 4 * stride;
+ if (height == 4) {
+ get_4_nz_map_contexts_hor(levels, width, offsets, coefficients);
+ } else if (height == 8) {
+ get_8_coeff_contexts_hor(levels, width, offsets, coefficients);
+ } else {
+ get_16n_coeff_contexts_hor(levels, width, height, offsets, coefficients);
+ }
+ } else { // TX_CLASS_VERT
+ offsets[0] = 2;
+ offsets[1] = 3;
+ offsets[2] = 4;
+ if (height == 4) {
+ get_4_nz_map_contexts_ver(levels, width, offsets, coefficients);
+ } else if (height == 8) {
+ get_8_coeff_contexts_ver(levels, width, offsets, coefficients);
+ } else {
+ get_16n_coeff_contexts_ver(levels, width, height, offsets, coefficients);
+ }
+ }
+
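+  // Finally, classify the last nonzero coefficient by how deep its scan
+  // position lies within the (width << bhl) block area: first eighth -> 1,
+  // first quarter -> 2, anything later -> 3.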
+ const int bhl = get_txb_bhl(tx_size);
+ const int pos = scan[last_idx];
+ if (last_idx <= (width << bhl) / 8)
+ coeff_contexts[pos] = 1;
+ else if (last_idx <= (width << bhl) / 4)
+ coeff_contexts[pos] = 2;
+ else
+ coeff_contexts[pos] = 3;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
new file mode 100644
index 0000000000..aa64a38902
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c
@@ -0,0 +1,2619 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "shift_neon.h"
+#include "txfm_neon.h"
+
+static AOM_FORCE_INLINE void transpose_arrays_s32_64x64(const int32x4_t *in,
+ int32x4_t *out) {
+ // This is not quite the same as the other transposes defined in
+ // transpose_neon.h: We only write the low 64x32 sub-matrix since the rest is
+ // unused by the following row transform.
+ for (int j = 0; j < 8; ++j) {
+ for (int i = 0; i < 16; ++i) {
+ transpose_arrays_s32_4x4(in + 64 * i + 4 * j, out + 64 * j + 4 * i);
+ }
+ }
+}
+
+// A note on butterfly helper naming:
+//
+// butterfly_[weight_indices]_neon
+// e.g. butterfly_0321_neon
+// ^ Weights are applied as indices 0, 3, 2, 1
+// (see more detail below)
+//
+// Weight indices are treated as an index into the 4-tuple of the weight
+// itself, plus related and negated constants: w=(w0, 1-w0, -w0, w0-1).
+// This is then represented in the helper naming by referring to the lane index
+// in the loaded tuple that each multiply is performed with:
+//
+// in0 in1
+// /------------
+// out0 | w[0] w[1] ==> out0 = in0 * w[0] + in1 * w[1]
+// out1 | w[2] w[3] ==> out1 = in0 * w[2] + in1 * w[3]
+//
+// So for indices 0321 from the earlier example, we end up with:
+//
+// in0 in1
+// /------------------
+// out0 | (lane 0) (lane 3) ==> out0 = in0 * w0 + in1 * (w0-1)
+// out1 | (lane 2) (lane 1) ==> out1 = in0 * -w0 + in1 * (1-w0)
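+//
+// Concretely (a hedged scalar sketch): the pair loaded from the interleaved
+// cosine table is (cospi(i), cospi(64 - i)) -- compare cospi16_48 further
+// below -- so butterfly_0112_neon computes, per 32-bit lane:
+//
+//   out0 = round_shift(in0 * cospi(i)      + in1 * cospi(64 - i), bit)
+//   out1 = round_shift(in0 * cospi(64 - i) - in1 * cospi(i),      bit)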
+
+#define butterfly_half_neon(wvec, lane0, lane1, in0, in1, out, v_bit) \
+ do { \
+ int32x2x2_t wvecs = { { wvec, vneg_s32(wvec) } }; \
+ int32x4_t x = vmulq_lane_s32(in0, wvecs.val[lane0 / 2], lane0 % 2); \
+ x = vmlaq_lane_s32(x, in1, wvecs.val[lane1 / 2], lane1 % 2); \
+ *out = vrshlq_s32(x, v_bit); \
+ } while (false)
+
+static AOM_FORCE_INLINE void butterfly_0112_neon(
+ const int32_t *cospi, const int widx0, const int32x4_t n0,
+ const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+ const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+ butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_2312_neon(
+ const int32_t *cospi, const int widx0, const int32x4_t n0,
+ const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+ const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+ butterfly_half_neon(w01, 2, 3, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_0332_neon(
+ const int32_t *cospi, const int widx0, const int32x4_t n0,
+ const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+ const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+ butterfly_half_neon(w01, 0, 3, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 3, 2, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_0130_neon(
+ const int32_t *cospi, const int widx0, const int32x4_t n0,
+ const int32x4_t n1, int32x4_t *out0, int32x4_t *out1,
+ const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * widx0);
+ butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 3, 0, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_cospi32_0002_neon(
+ const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
+ int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * 32);
+ butterfly_half_neon(w01, 0, 0, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 0, 2, n0, n1, out1, v_bit);
+}
+
+static AOM_FORCE_INLINE void butterfly_cospi32_0222_neon(
+ const int32_t *cospi, const int32x4_t n0, const int32x4_t n1,
+ int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) {
+ int32x2_t w01 = vld1_s32(cospi + 2 * 32);
+ butterfly_half_neon(w01, 0, 2, n0, n1, out0, v_bit);
+ butterfly_half_neon(w01, 2, 2, n0, n1, out1, v_bit);
+}
+
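+// Rectangular (non-square) transform sizes carry an extra sqrt(2) scale
+// factor, approximated as NewSqrt2 / 2^NewSqrt2Bits (5793 / 4096 with the
+// usual libaom constants): e.g. (1024 * 5793) >> 12 rounds to 1448, i.e.
+// 1024 * sqrt(2) to within a unit.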
+static AOM_FORCE_INLINE void round_rect_array_s32_neon(const int32x4_t *input,
+ int32x4_t *output,
+ const int size) {
+ const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2);
+ int i = 0;
+ do {
+ const int32x4_t r1 = vmulq_s32(input[i], sqrt2);
+ output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
+ } while (++i < size);
+}
+
+static AOM_FORCE_INLINE void round_shift2_rect_array_s32_neon(
+ const int32x4_t *input, int32x4_t *output, const int size) {
+ const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2);
+ int i = 0;
+ do {
+ const int32x4_t r0 = vrshrq_n_s32(input[i], 2);
+ const int32x4_t r1 = vmulq_s32(r0, sqrt2);
+ output[i] = vrshrq_n_s32(r1, NewSqrt2Bits);
+ } while (++i < size);
+}
+
+#define LOAD_BUFFER_4XH(h) \
+ static AOM_FORCE_INLINE void load_buffer_4x##h( \
+ const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
+ if (fliplr) { \
+ for (int i = 0; i < (h); ++i) { \
+ int16x4_t a = vld1_s16(input + i * stride); \
+ a = vrev64_s16(a); \
+ in[i] = vshll_n_s16(a, 2); \
+ } \
+ } else { \
+ for (int i = 0; i < (h); ++i) { \
+ int16x4_t a = vld1_s16(input + i * stride); \
+ in[i] = vshll_n_s16(a, 2); \
+ } \
+ } \
+ }
+
+// AArch32 does not permit the shift argument of vshll_n_s16 to be zero, so
+// the expression must be avoided entirely, even though the compiler can prove
+// that the offending code path is never taken when `shift == 0`.
+#define shift_left_long_s16(a, shift) \
+ ((shift) == 0 ? vmovl_s16(a) : vshll_n_s16((a), (shift) == 0 ? 1 : (shift)))
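+
+// For example, shift_left_long_s16(a, 2) widens each s16 lane to s32 and
+// shifts left by two, while shift_left_long_s16(a, 0) reduces to vmovl_s16(a);
+// the inner `(shift) == 0 ? 1 : (shift)` only exists to keep the dead
+// vshll_n_s16 immediate legal on AArch32.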
+
+#define LOAD_BUFFER_WXH(w, h, shift) \
+ static AOM_FORCE_INLINE void load_buffer_##w##x##h( \
+ const int16_t *input, int32x4_t *in, int stride, int fliplr) { \
+ assert(w >= 8); \
+ if (fliplr) { \
+ for (int i = 0; i < (h); ++i) { \
+ for (int j = 0; j < (w) / 8; ++j) { \
+ int16x8_t a = vld1q_s16(input + i * stride + j * 8); \
+ a = vrev64q_s16(a); \
+ int j2 = (w) / 8 - j - 1; \
+ in[i + (h) * (2 * j2 + 0)] = \
+ shift_left_long_s16(vget_high_s16(a), (shift)); \
+ in[i + (h) * (2 * j2 + 1)] = \
+ shift_left_long_s16(vget_low_s16(a), (shift)); \
+ } \
+ } \
+ } else { \
+ for (int i = 0; i < (h); ++i) { \
+ for (int j = 0; j < (w) / 8; ++j) { \
+ int16x8_t a = vld1q_s16(input + i * stride + j * 8); \
+ in[i + (h) * (2 * j + 0)] = \
+ shift_left_long_s16(vget_low_s16(a), (shift)); \
+ in[i + (h) * (2 * j + 1)] = \
+ shift_left_long_s16(vget_high_s16(a), (shift)); \
+ } \
+ } \
+ } \
+ }
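+
+// Resulting layout: in[] is grouped into panels of four columns; for panel g
+// (columns 4g..4g+3), vector in[g * h + i] holds the four samples of row i,
+// so each panel occupies h consecutive vectors.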
+
+LOAD_BUFFER_4XH(4)
+LOAD_BUFFER_4XH(8)
+LOAD_BUFFER_4XH(16)
+LOAD_BUFFER_4XH(32)
+LOAD_BUFFER_WXH(8, 8, 2)
+LOAD_BUFFER_WXH(16, 16, 2)
+LOAD_BUFFER_WXH(32, 64, 0)
+LOAD_BUFFER_WXH(64, 32, 2)
+LOAD_BUFFER_WXH(64, 64, 0)
+
+#if !CONFIG_REALTIME_ONLY
+LOAD_BUFFER_WXH(16, 64, 0)
+LOAD_BUFFER_WXH(64, 16, 2)
+#endif // !CONFIG_REALTIME_ONLY
+
+#define STORE_BUFFER_WXH(w, h) \
+ static AOM_FORCE_INLINE void store_buffer_##w##x##h( \
+ const int32x4_t *in, int32_t *out, int stride) { \
+ for (int i = 0; i < (w); ++i) { \
+ for (int j = 0; j < (h) / 4; ++j) { \
+ vst1q_s32(&out[i * stride + j * 4], in[i + j * (w)]); \
+ } \
+ } \
+ }
+
+STORE_BUFFER_WXH(4, 4)
+STORE_BUFFER_WXH(8, 4)
+STORE_BUFFER_WXH(8, 8)
+STORE_BUFFER_WXH(16, 4)
+STORE_BUFFER_WXH(16, 16)
+STORE_BUFFER_WXH(32, 4)
+STORE_BUFFER_WXH(32, 32)
+STORE_BUFFER_WXH(64, 32)
+
+#if !CONFIG_REALTIME_ONLY
+STORE_BUFFER_WXH(16, 32)
+STORE_BUFFER_WXH(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+static AOM_FORCE_INLINE void highbd_fdct4_x4_neon(const int32x4_t *in,
+ int32x4_t *out, int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t cospi32 = vdupq_n_s32(cospi[2 * 32]);
+ const int32x2_t cospi16_48 = vld1_s32(&cospi[2 * 16]);
+
+ const int32x4_t a0 = vaddq_s32(in[0], in[3]);
+ const int32x4_t a1 = vsubq_s32(in[0], in[3]);
+ const int32x4_t a2 = vaddq_s32(in[1], in[2]);
+ const int32x4_t a3 = vsubq_s32(in[1], in[2]);
+
+ const int32x4_t b0 = vmulq_s32(a0, cospi32);
+ const int32x4_t b1 = vmulq_lane_s32(a1, cospi16_48, 1);
+ const int32x4_t b2 = vmulq_s32(a2, cospi32);
+ const int32x4_t b3 = vmulq_lane_s32(a3, cospi16_48, 1);
+
+ const int32x4_t c0 = vaddq_s32(b0, b2);
+ const int32x4_t c1 = vsubq_s32(b0, b2);
+ const int32x4_t c2 = vmlaq_lane_s32(b3, a1, cospi16_48, 0);
+ const int32x4_t c3 = vmlsq_lane_s32(b1, a3, cospi16_48, 0);
+
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+ const int32x4_t d0 = vrshlq_s32(c0, v_bit);
+ const int32x4_t d1 = vrshlq_s32(c1, v_bit);
+ const int32x4_t d2 = vrshlq_s32(c2, v_bit);
+ const int32x4_t d3 = vrshlq_s32(c3, v_bit);
+
+ out[0] = d0;
+ out[1] = d2;
+ out[2] = d1;
+ out[3] = d3;
+}
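+
+// In scalar terms the above computes (with cospi16/32/48 the constants loaded
+// at the top of the function):
+//   out[0] = round_shift((in0 + in1 + in2 + in3) * cospi32, bit)
+//   out[1] = round_shift((in0 - in3) * cospi16 + (in1 - in2) * cospi48, bit)
+//   out[2] = round_shift(((in0 + in3) - (in1 + in2)) * cospi32, bit)
+//   out[3] = round_shift((in0 - in3) * cospi48 - (in1 - in2) * cospi16, bit)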
+
+static AOM_FORCE_INLINE void highbd_fadst4_x4_neon(const int32x4_t *in,
+ int32x4_t *out, int bit) {
+ const int32x4_t sinpi = vld1q_s32(sinpi_arr(bit) + 1);
+
+ const int32x4_t a0 = vaddq_s32(in[0], in[1]);
+ const int32x4_t a1 = vmulq_lane_s32(in[0], vget_low_s32(sinpi), 0);
+ const int32x4_t a2 = vmulq_lane_s32(in[0], vget_high_s32(sinpi), 1);
+ const int32x4_t a3 = vmulq_lane_s32(in[2], vget_high_s32(sinpi), 0);
+
+ const int32x4_t b0 = vmlaq_lane_s32(a1, in[1], vget_low_s32(sinpi), 1);
+ const int32x4_t b1 = vmlsq_lane_s32(a2, in[1], vget_low_s32(sinpi), 0);
+ const int32x4_t b2 = vsubq_s32(a0, in[3]);
+
+ const int32x4_t c0 = vmlaq_lane_s32(b0, in[3], vget_high_s32(sinpi), 1);
+ const int32x4_t c1 = vmlaq_lane_s32(b1, in[3], vget_low_s32(sinpi), 1);
+ const int32x4_t c2 = vmulq_lane_s32(b2, vget_high_s32(sinpi), 0);
+
+ const int32x4_t d0 = vaddq_s32(c0, a3);
+ const int32x4_t d1 = vsubq_s32(c1, a3);
+ const int32x4_t d2 = vsubq_s32(c1, c0);
+
+ const int32x4_t e0 = vaddq_s32(d2, a3);
+
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+ out[0] = vrshlq_s32(d0, v_bit);
+ out[1] = vrshlq_s32(c2, v_bit);
+ out[2] = vrshlq_s32(d1, v_bit);
+ out[3] = vrshlq_s32(e0, v_bit);
+}
+
+static AOM_FORCE_INLINE void highbd_fidentity4_x4_neon(const int32x4_t *in,
+ int32x4_t *out,
+ int bit) {
+ (void)bit;
+ int32x4_t fact = vdupq_n_s32(NewSqrt2);
+
+ for (int i = 0; i < 4; i++) {
+ const int32x4_t a_low = vmulq_s32(in[i], fact);
+ out[i] = vrshrq_n_s32(a_low, NewSqrt2Bits);
+ }
+}
+
+void av1_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *coeff,
+ int input_stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &input_stride, 4);
+
+ // Workspace for column/row-wise transforms.
+ int32x4_t buf[4];
+
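+  // Every case follows the same two-stage pipeline: 1-D column transform,
+  // transpose, 1-D row transform, store. The FLIPADST variants differ only in
+  // the ud/lr flips applied while loading the input.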
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case ADST_DCT:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case DCT_ADST:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case ADST_ADST:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_4x4(input, buf, input_stride, 1);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_4x4(input, buf, input_stride, 1);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_4x4(input, buf, input_stride, 1);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case IDTX:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case V_DCT:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case H_DCT:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case V_ADST:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case H_ADST:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case V_FLIPADST:
+ load_buffer_4x4(input, buf, input_stride, 0);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ case H_FLIPADST:
+ load_buffer_4x4(input, buf, input_stride, 1);
+ highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ transpose_arrays_s32_4x4(buf, buf);
+ highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]);
+ store_buffer_4x4(buf, coeff, /*stride=*/4);
+ break;
+ default: assert(0);
+ }
+}
+
+// Butterfly pre-processing:
+// e.g. n=4:
+// out[0] = in[0] + in[3]
+// out[1] = in[1] + in[2]
+// out[2] = in[1] - in[2]
+// out[3] = in[0] - in[3]
+
+static AOM_FORCE_INLINE void butterfly_dct_pre(const int32x4_t *input,
+ int32x4_t *output, int n) {
+ for (int i = 0; i < n / 2; ++i) {
+ output[i] = vaddq_s32(input[i], input[n - i - 1]);
+ }
+ for (int i = 0; i < n / 2; ++i) {
+ output[n / 2 + i] = vsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]);
+ }
+}
+
+// Butterfly post-processing:
+// e.g. n=8:
+// out[0] = in0[0] + in1[3];
+// out[1] = in0[1] + in1[2];
+// out[2] = in0[1] - in1[2];
+// out[3] = in0[0] - in1[3];
+// out[4] = in0[7] - in1[4];
+// out[5] = in0[6] - in1[5];
+// out[6] = in0[6] + in1[5];
+// out[7] = in0[7] + in1[4];
+
+static AOM_FORCE_INLINE void butterfly_dct_post(const int32x4_t *in0,
+ const int32x4_t *in1,
+ int32x4_t *output, int n) {
+ for (int i = 0; i < n / 4; ++i) {
+ output[i] = vaddq_s32(in0[i], in1[n / 2 - i - 1]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 4 + i] = vsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[n / 2 + i] = vsubq_s32(in0[n - i - 1], in1[n / 2 + i]);
+ }
+ for (int i = 0; i < n / 4; ++i) {
+ output[(3 * n) / 4 + i] =
+ vaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]);
+ }
+}
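+
+// Together, butterfly_dct_pre and butterfly_dct_post implement the standard
+// even/odd DCT decomposition: _pre forms the sums and differences feeding the
+// smaller sub-transforms, and _post recombines the rotated odd-half outputs.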
+
+static AOM_FORCE_INLINE void highbd_fdct8_x4_neon(const int32x4_t *in,
+ int32x4_t *out, int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+
+ // stage 1
+ int32x4_t a[8];
+ butterfly_dct_pre(in, a, 8);
+
+ // stage 2
+ int32x4_t b[8];
+ butterfly_dct_pre(a, b, 4);
+ butterfly_0130_neon(cospi, 32, a[5], a[6], &b[6], &b[5], v_bit);
+
+ // stage 3
+ int32x4_t c[8];
+ butterfly_0130_neon(cospi, 32, b[1], b[0], &c[0], &c[1], v_bit);
+ butterfly_0112_neon(cospi, 16, b[3], b[2], &c[2], &c[3], v_bit);
+ butterfly_dct_post(a + 4, b + 4, c + 4, 4);
+
+ // stage 4-5
+ butterfly_0112_neon(cospi, 8, c[7], c[4], &out[1], &out[7], v_bit);
+ butterfly_0130_neon(cospi, 24, c[5], c[6], &out[5], &out[3], v_bit);
+
+ out[0] = c[0];
+ out[2] = c[2];
+ out[4] = c[1];
+ out[6] = c[3];
+}
+
+static AOM_FORCE_INLINE void highbd_fadst8_x4_neon(const int32x4_t *in,
+ int32x4_t *out, int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+
+ int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
+ int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
+
+ // stage 0-1
+ u0 = in[0];
+ u1 = in[7];
+ u2 = in[3];
+ u3 = in[4];
+ u4 = in[1];
+ u5 = in[6];
+ u6 = in[2];
+ u7 = in[5];
+
+ // stage 2
+ v0 = u0;
+ v1 = u1;
+ butterfly_cospi32_0222_neon(cospi, u3, u2, &v2, &v3, v_bit);
+ v4 = u4;
+ v5 = u5;
+ butterfly_cospi32_0002_neon(cospi, u6, u7, &v7, &v6, v_bit);
+
+ // stage 3
+ u0 = vaddq_s32(v0, v2);
+ u1 = vsubq_s32(v3, v1);
+ u2 = vsubq_s32(v0, v2);
+ u3 = vaddq_s32(v1, v3);
+ u4 = vsubq_s32(v6, v4);
+ u5 = vaddq_s32(v5, v7);
+ u6 = vaddq_s32(v4, v6);
+ u7 = vsubq_s32(v5, v7);
+
+ // stage 4
+ v0 = u0;
+ v1 = u1;
+ v2 = u2;
+ v3 = u3;
+
+ butterfly_0112_neon(cospi, 16, u4, u5, &v4, &v5, v_bit);
+ butterfly_0112_neon(cospi, 16, u7, u6, &v6, &v7, v_bit);
+
+ // stage 5
+ u0 = vaddq_s32(v0, v4);
+ u1 = vaddq_s32(v1, v5);
+ u2 = vaddq_s32(v2, v6);
+ u3 = vsubq_s32(v7, v3);
+ u4 = vsubq_s32(v0, v4);
+ u5 = vsubq_s32(v1, v5);
+ u6 = vsubq_s32(v2, v6);
+ u7 = vaddq_s32(v3, v7);
+
+ // stage 6
+ butterfly_0112_neon(cospi, 4, u0, u1, &v0, &v1, v_bit);
+ butterfly_0112_neon(cospi, 20, u2, u3, &v2, &v3, v_bit);
+ butterfly_0130_neon(cospi, 28, u5, u4, &v4, &v5, v_bit);
+ butterfly_0112_neon(cospi, 12, u6, u7, &v7, &v6, v_bit);
+
+ // stage 7
+ out[0] = v1;
+ out[1] = v6;
+ out[2] = v3;
+ out[3] = v4;
+ out[4] = v5;
+ out[5] = v2;
+ out[6] = v7;
+ out[7] = v0;
+}
+
+static AOM_FORCE_INLINE void highbd_fidentity8_x4_neon(const int32x4_t *in,
+ int32x4_t *out,
+ int bit) {
+ (void)bit;
+ out[0] = vshlq_n_s32(in[0], 1);
+ out[1] = vshlq_n_s32(in[1], 1);
+ out[2] = vshlq_n_s32(in[2], 1);
+ out[3] = vshlq_n_s32(in[3], 1);
+ out[4] = vshlq_n_s32(in[4], 1);
+ out[5] = vshlq_n_s32(in[5], 1);
+ out[6] = vshlq_n_s32(in[6], 1);
+ out[7] = vshlq_n_s32(in[7], 1);
+}
+
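+// The *_xn wrappers below apply a 4-column kernel `howmany` times, once per
+// strip of four columns; `stride` is the number of int32x4_t vectors per
+// strip (the 1-D transform length).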
+static AOM_FORCE_INLINE void highbd_fdct8_xn_neon(const int32x4_t *in,
+ int32x4_t *out, int bit,
+ int howmany) {
+ const int stride = 8;
+ int i = 0;
+ do {
+ highbd_fdct8_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
+}
+
+static AOM_FORCE_INLINE void highbd_fadst8_xn_neon(const int32x4_t *in,
+ int32x4_t *out, int bit,
+ int howmany) {
+ const int stride = 8;
+ int i = 0;
+ do {
+ highbd_fadst8_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
+}
+
+static AOM_FORCE_INLINE void highbd_fidentity8_xn_neon(const int32x4_t *in,
+ int32x4_t *out, int bit,
+ int howmany) {
+ (void)bit;
+ const int stride = 8;
+ int i = 0;
+ do {
+ highbd_fidentity8_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
+}
+
+void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+ // Workspaces for column/row-wise transforms.
+ int32x4_t buf0[16], buf1[16];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case ADST_DCT:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case DCT_ADST:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case ADST_ADST:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x8(input, buf0, stride, 1);
+ highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x8(input, buf0, stride, 1);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x8(input, buf0, stride, 1);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case IDTX:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case V_DCT:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case H_DCT:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case V_ADST:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case H_ADST:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case V_FLIPADST:
+ load_buffer_8x8(input, buf0, stride, 0);
+ highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ case H_FLIPADST:
+ load_buffer_8x8(input, buf0, stride, 1);
+ highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2);
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_8x8(buf0, buf1);
+ highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2);
+ store_buffer_8x8(buf1, coeff, /*stride=*/8);
+ break;
+ default: assert(0);
+ }
+}
+
+static void highbd_fdct16_x4_neon(const int32x4_t *in, int32x4_t *out,
+ int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+
+ int32x4_t u[16], v[16];
+
+ // stage 1
+ butterfly_dct_pre(in, u, 16);
+
+ // stage 2
+ butterfly_dct_pre(u, v, 8);
+ v[8] = u[8];
+ v[9] = u[9];
+ butterfly_cospi32_0002_neon(cospi, u[13], u[10], &v[13], &v[10], v_bit);
+ butterfly_cospi32_0002_neon(cospi, u[12], u[11], &v[12], &v[11], v_bit);
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 3
+ butterfly_dct_pre(v, u, 4);
+ u[4] = v[4];
+ butterfly_cospi32_0002_neon(cospi, v[6], v[5], &u[6], &u[5], v_bit);
+ u[7] = v[7];
+ butterfly_dct_post(v + 8, v + 8, u + 8, 8);
+
+ // stage 4
+ butterfly_cospi32_0002_neon(cospi, u[0], u[1], &v[0], &v[1], v_bit);
+ butterfly_0112_neon(cospi, 16, u[3], u[2], &v[2], &v[3], v_bit);
+ butterfly_dct_post(u + 4, u + 4, v + 4, 4);
+ v[8] = u[8];
+ butterfly_0112_neon(cospi, 16, u[14], u[9], &v[14], &v[9], v_bit);
+ butterfly_2312_neon(cospi, 16, u[13], u[10], &v[10], &v[13], v_bit);
+ v[11] = u[11];
+ v[12] = u[12];
+ v[15] = u[15];
+
+ // stage 5
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+ butterfly_0112_neon(cospi, 8, v[7], v[4], &u[4], &u[7], v_bit);
+ butterfly_0130_neon(cospi, 24, v[5], v[6], &u[5], &u[6], v_bit);
+ butterfly_dct_post(v + 8, v + 8, u + 8, 4);
+ butterfly_dct_post(v + 12, v + 12, u + 12, 4);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ butterfly_0112_neon(cospi, 4, u[15], u[8], &v[8], &v[15], v_bit);
+ butterfly_0130_neon(cospi, 28, u[9], u[14], &v[9], &v[14], v_bit);
+ butterfly_0112_neon(cospi, 20, u[13], u[10], &v[10], &v[13], v_bit);
+ butterfly_0130_neon(cospi, 12, u[11], u[12], &v[11], &v[12], v_bit);
+
+ out[0] = v[0];
+ out[1] = v[8];
+ out[2] = v[4];
+ out[3] = v[12];
+ out[4] = v[2];
+ out[5] = v[10];
+ out[6] = v[6];
+ out[7] = v[14];
+ out[8] = v[1];
+ out[9] = v[9];
+ out[10] = v[5];
+ out[11] = v[13];
+ out[12] = v[3];
+ out[13] = v[11];
+ out[14] = v[7];
+ out[15] = v[15];
+}
+
+static void highbd_fadst16_x4_neon(const int32x4_t *in, int32x4_t *out,
+ int bit) {
+ const int32_t *const cospi = cospi_arr_s32(bit);
+ const int32x4_t v_bit = vdupq_n_s32(-bit);
+
+ int32x4_t u[16], v[16];
+
+ // stage 0-1
+ u[0] = in[0];
+ u[1] = in[15];
+ u[2] = in[7];
+ u[3] = in[8];
+ u[4] = in[3];
+ u[5] = in[12];
+ u[6] = in[4];
+ u[7] = in[11];
+ u[8] = in[1];
+ u[9] = in[14];
+ u[10] = in[6];
+ u[11] = in[9];
+ u[12] = in[2];
+ u[13] = in[13];
+ u[14] = in[5];
+ u[15] = in[10];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+ butterfly_cospi32_0222_neon(cospi, u[3], u[2], &v[2], &v[3], v_bit);
+ v[4] = u[4];
+ v[5] = u[5];
+ butterfly_cospi32_0002_neon(cospi, u[6], u[7], &v[7], &v[6], v_bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ butterfly_cospi32_0002_neon(cospi, u[10], u[11], &v[11], &v[10], v_bit);
+ v[12] = u[12];
+ v[13] = u[13];
+ butterfly_cospi32_0222_neon(cospi, u[15], u[14], &v[14], &v[15], v_bit);
+
+ // stage 3
+ u[0] = vaddq_s32(v[0], v[2]);
+ u[1] = vsubq_s32(v[3], v[1]);
+ u[2] = vsubq_s32(v[0], v[2]);
+ u[3] = vaddq_s32(v[1], v[3]);
+ u[4] = vsubq_s32(v[6], v[4]);
+ u[5] = vaddq_s32(v[5], v[7]);
+ u[6] = vaddq_s32(v[4], v[6]);
+ u[7] = vsubq_s32(v[5], v[7]);
+ u[8] = vsubq_s32(v[10], v[8]);
+ u[9] = vaddq_s32(v[9], v[11]);
+ u[10] = vaddq_s32(v[8], v[10]);
+ u[11] = vsubq_s32(v[9], v[11]);
+ u[12] = vaddq_s32(v[12], v[14]);
+ u[13] = vsubq_s32(v[15], v[13]);
+ u[14] = vsubq_s32(v[12], v[14]);
+ u[15] = vaddq_s32(v[13], v[15]);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ butterfly_0112_neon(cospi, 16, u[4], u[5], &v[4], &v[5], v_bit);
+ butterfly_0112_neon(cospi, 16, u[7], u[6], &v[6], &v[7], v_bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+
+ butterfly_0112_neon(cospi, 16, u[12], u[13], &v[12], &v[13], v_bit);
+ butterfly_0332_neon(cospi, 16, u[14], u[15], &v[15], &v[14], v_bit);
+
+ // stage 5
+ u[0] = vaddq_s32(v[0], v[4]);
+ u[1] = vaddq_s32(v[1], v[5]);
+ u[2] = vaddq_s32(v[2], v[6]);
+ u[3] = vsubq_s32(v[7], v[3]);
+ u[4] = vsubq_s32(v[0], v[4]);
+ u[5] = vsubq_s32(v[1], v[5]);
+ u[6] = vsubq_s32(v[2], v[6]);
+ u[7] = vaddq_s32(v[3], v[7]);
+ u[8] = vaddq_s32(v[8], v[12]);
+ u[9] = vaddq_s32(v[9], v[13]);
+ u[10] = vsubq_s32(v[14], v[10]);
+ u[11] = vaddq_s32(v[11], v[15]);
+ u[12] = vsubq_s32(v[8], v[12]);
+ u[13] = vsubq_s32(v[9], v[13]);
+ u[14] = vaddq_s32(v[10], v[14]);
+ u[15] = vsubq_s32(v[11], v[15]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ butterfly_0112_neon(cospi, 8, u[8], u[9], &v[8], &v[9], v_bit);
+ butterfly_0130_neon(cospi, 8, u[12], u[13], &v[13], &v[12], v_bit);
+ butterfly_0130_neon(cospi, 24, u[11], u[10], &v[10], &v[11], v_bit);
+ butterfly_0130_neon(cospi, 24, u[14], u[15], &v[14], &v[15], v_bit);
+
+ // stage 7
+ u[0] = vaddq_s32(v[0], v[8]);
+ u[1] = vaddq_s32(v[1], v[9]);
+ u[2] = vaddq_s32(v[2], v[10]);
+ u[3] = vaddq_s32(v[3], v[11]);
+ u[4] = vaddq_s32(v[4], v[12]);
+ u[5] = vaddq_s32(v[5], v[13]);
+ u[6] = vaddq_s32(v[6], v[14]);
+ u[7] = vsubq_s32(v[15], v[7]);
+ u[8] = vsubq_s32(v[0], v[8]);
+ u[9] = vsubq_s32(v[1], v[9]);
+ u[10] = vsubq_s32(v[2], v[10]);
+ u[11] = vsubq_s32(v[3], v[11]);
+ u[12] = vsubq_s32(v[4], v[12]);
+ u[13] = vsubq_s32(v[5], v[13]);
+ u[14] = vsubq_s32(v[6], v[14]);
+ u[15] = vaddq_s32(v[7], v[15]);
+
+ // stage 8
+ butterfly_0112_neon(cospi, 2, u[0], u[1], &v[0], &v[1], v_bit);
+ butterfly_0112_neon(cospi, 10, u[2], u[3], &v[2], &v[3], v_bit);
+ butterfly_0112_neon(cospi, 18, u[4], u[5], &v[4], &v[5], v_bit);
+ butterfly_0112_neon(cospi, 26, u[6], u[7], &v[6], &v[7], v_bit);
+ butterfly_0130_neon(cospi, 30, u[9], u[8], &v[8], &v[9], v_bit);
+ butterfly_0130_neon(cospi, 22, u[11], u[10], &v[10], &v[11], v_bit);
+ butterfly_0130_neon(cospi, 14, u[13], u[12], &v[12], &v[13], v_bit);
+ butterfly_0112_neon(cospi, 6, u[14], u[15], &v[15], &v[14], v_bit);
+
+ // stage 9
+ out[0] = v[1];
+ out[1] = v[14];
+ out[2] = v[3];
+ out[3] = v[12];
+ out[4] = v[5];
+ out[5] = v[10];
+ out[6] = v[7];
+ out[7] = v[8];
+ out[8] = v[9];
+ out[9] = v[6];
+ out[10] = v[11];
+ out[11] = v[4];
+ out[12] = v[13];
+ out[13] = v[2];
+ out[14] = v[15];
+ out[15] = v[0];
+}
+
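+// The 16-point identity transform scales by 2 * NewSqrt2 / 2^NewSqrt2Bits,
+// i.e. approximately 2 * sqrt(2); the multiply, add-offset and shift below
+// spell out a rounding right shift.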
+static void highbd_fidentity16_x4_neon(const int32x4_t *in, int32x4_t *out,
+ int bit) {
+ (void)bit;
+ const int32x4_t fact = vdupq_n_s32(2 * NewSqrt2);
+ const int32x4_t offset = vdupq_n_s32(1 << (NewSqrt2Bits - 1));
+
+ for (int i = 0; i < 16; i++) {
+ int32x4_t a = vmulq_s32(in[i], fact);
+ a = vaddq_s32(a, offset);
+ out[i] = vshrq_n_s32(a, NewSqrt2Bits);
+ }
+}
+
+static void highbd_fdct16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
+ const int howmany) {
+ const int stride = 16;
+ int i = 0;
+ do {
+ highbd_fdct16_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
+}
+
+static void highbd_fadst16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit,
+ int howmany) {
+ const int stride = 16;
+ int i = 0;
+ do {
+ highbd_fadst16_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
+}
+
+static void highbd_fidentity16_xn_neon(const int32x4_t *in, int32x4_t *out,
+ int bit, int howmany) {
+ const int stride = 16;
+ int i = 0;
+ do {
+ highbd_fidentity16_x4_neon(in + i * stride, out + i * stride, bit);
+ } while (++i < howmany);
+}
+
+void av1_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+ // Workspaces for column/row-wise transforms.
+ int32x4_t buf0[64], buf1[64];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case ADST_DCT:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case DCT_ADST:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case ADST_ADST:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x16(input, buf0, stride, 1);
+ highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x16(input, buf0, stride, 1);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x16(input, buf0, stride, 1);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case IDTX:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case V_DCT:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case H_DCT:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case V_ADST:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case H_ADST:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case V_FLIPADST:
+ load_buffer_16x16(input, buf0, stride, 0);
+ highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ case H_FLIPADST:
+ load_buffer_16x16(input, buf0, stride, 1);
+ highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+ transpose_arrays_s32_16x16(buf0, buf1);
+ highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4);
+ store_buffer_16x16(buf1, coeff, /*stride=*/16);
+ break;
+ default: assert(0);
+ }
+}
+
+typedef void (*fwd_transform_1d_col_neon)(const int16_t *in, int32x4_t *out,
+ int stride, int bit, int lr_flip);
+typedef void (*fwd_transform_1d_col_many_neon)(const int16_t *in,
+ int32x4_t *out, int stride,
+ int bit, int lr_flip,
+ int howmany, int hm_stride);
+
+typedef void (*fwd_transform_1d_row_neon)(const int32x4_t *in, int32_t *out,
+ int bit, int stride);
+typedef void (*fwd_transform_1d_row_many_neon)(const int32x4_t *in,
+ int32_t *out, int bit,
+ int howmany, int hm_stride,
+ int stride);
+
+// Construct component kernels that include the load_buffer and store_buffer
+// stages to avoid the need to spill loaded data to the stack between these and
+// the txfm kernel calls.
+// The TRANSFORM_*_ONE variants are only ever called in situations where the
+// howmany parameter would be one, so they omit the loop entirely.
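+//
+// Callers later in this file are then expected to compose these as: column
+// kernel (load + column transform), a rounding shift where needed, transpose,
+// then row kernel (row transform + store), mirroring the explicit
+// switch-based functions above.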
+
+#define TRANSFORM_COL_ONE(name, n) \
+ static void highbd_##name##_col_neon(const int16_t *input, \
+ int32x4_t *output, int stride, \
+ int cos_bit, int lr_flip) { \
+ int32x4_t buf0[n]; \
+ load_buffer_4x##n(input, buf0, stride, lr_flip); \
+ highbd_##name##_x4_neon(buf0, output, cos_bit); \
+ }
+
+#define TRANSFORM_COL_MANY(name, n) \
+ static void highbd_##name##_col_many_neon( \
+ const int16_t *input, int32x4_t *output, int stride, int cos_bit, \
+ int lr_flip, int howmany, int hm_stride) { \
+ int i = 0; \
+ do { \
+ int32x4_t buf0[n]; \
+ load_buffer_4x##n(input + 4 * i, buf0, stride, lr_flip); \
+ highbd_##name##_x4_neon(buf0, output + i * hm_stride, cos_bit); \
+ } while (++i < howmany); \
+ }
+
+#define TRANSFORM_ROW_ONE(name, n) \
+ static void highbd_##name##_row_neon( \
+ const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
+ int32x4_t buf0[n]; \
+ highbd_##name##_x4_neon(input, buf0, cos_bit); \
+ store_buffer_##n##x4(buf0, output, stride); \
+ }
+
+#define TRANSFORM_ROW_RECT_ONE(name, n) \
+ static void highbd_##name##_row_rect_neon( \
+ const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \
+ int32x4_t buf0[n]; \
+ highbd_##name##_x4_neon(input, buf0, cos_bit); \
+ round_rect_array_s32_neon(buf0, buf0, (n)); \
+ store_buffer_##n##x4(buf0, output, stride); \
+ }
+
+#define TRANSFORM_ROW_MANY(name, n) \
+ static void highbd_##name##_row_many_neon( \
+ const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
+ int hm_stride, int stride) { \
+ int i = 0; \
+ do { \
+ int32x4_t buf0[n]; \
+ highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit); \
+ store_buffer_##n##x4(buf0, output + 4 * i, stride); \
+ } while (++i < howmany); \
+ }
+
+#define TRANSFORM_ROW_RECT_MANY(name, n) \
+ static void highbd_##name##_row_rect_many_neon( \
+ const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \
+ int hm_stride, int stride) { \
+ int i = 0; \
+ do { \
+ int32x4_t buf0[n]; \
+ highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit); \
+ round_rect_array_s32_neon(buf0, buf0, (n)); \
+ store_buffer_##n##x4(buf0, output + 4 * i, stride); \
+ } while (++i < howmany); \
+ }
+
+TRANSFORM_COL_ONE(fdct8, 8)
+TRANSFORM_COL_ONE(fadst8, 8)
+TRANSFORM_COL_ONE(fidentity8, 8)
+
+TRANSFORM_COL_MANY(fdct4, 4)
+TRANSFORM_COL_MANY(fdct8, 8)
+TRANSFORM_COL_MANY(fdct16, 16)
+TRANSFORM_COL_MANY(fadst4, 4)
+TRANSFORM_COL_MANY(fadst8, 8)
+TRANSFORM_COL_MANY(fadst16, 16)
+TRANSFORM_COL_MANY(fidentity4, 4)
+TRANSFORM_COL_MANY(fidentity8, 8)
+TRANSFORM_COL_MANY(fidentity16, 16)
+
+TRANSFORM_ROW_ONE(fdct16, 16)
+TRANSFORM_ROW_ONE(fadst16, 16)
+TRANSFORM_ROW_ONE(fidentity16, 16)
+
+TRANSFORM_ROW_RECT_ONE(fdct8, 8)
+TRANSFORM_ROW_RECT_ONE(fadst8, 8)
+TRANSFORM_ROW_RECT_ONE(fidentity8, 8)
+
+#if !CONFIG_REALTIME_ONLY
+TRANSFORM_ROW_MANY(fdct4, 4)
+TRANSFORM_ROW_MANY(fdct8, 8)
+TRANSFORM_ROW_MANY(fadst4, 4)
+TRANSFORM_ROW_MANY(fadst8, 8)
+TRANSFORM_ROW_MANY(fidentity4, 4)
+TRANSFORM_ROW_MANY(fidentity8, 8)
+#endif  // !CONFIG_REALTIME_ONLY
+
+TRANSFORM_ROW_RECT_MANY(fdct4, 4)
+TRANSFORM_ROW_RECT_MANY(fdct8, 8)
+TRANSFORM_ROW_RECT_MANY(fdct16, 16)
+TRANSFORM_ROW_RECT_MANY(fadst4, 4)
+TRANSFORM_ROW_RECT_MANY(fadst8, 8)
+TRANSFORM_ROW_RECT_MANY(fadst16, 16)
+TRANSFORM_ROW_RECT_MANY(fidentity4, 4)
+TRANSFORM_ROW_RECT_MANY(fidentity8, 8)
+TRANSFORM_ROW_RECT_MANY(fidentity16, 16)
+
+static const fwd_transform_1d_col_many_neon
+ col_highbd_txfm8_xn_arr[TX_TYPES] = {
+ highbd_fdct8_col_many_neon, // DCT_DCT
+ highbd_fadst8_col_many_neon, // ADST_DCT
+ highbd_fdct8_col_many_neon, // DCT_ADST
+ highbd_fadst8_col_many_neon, // ADST_ADST
+ highbd_fadst8_col_many_neon, // FLIPADST_DCT
+ highbd_fdct8_col_many_neon, // DCT_FLIPADST
+ highbd_fadst8_col_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_col_many_neon, // ADST_FLIPADST
+ highbd_fadst8_col_many_neon, // FLIPADST_ADST
+ highbd_fidentity8_col_many_neon, // IDTX
+ highbd_fdct8_col_many_neon, // V_DCT
+ highbd_fidentity8_col_many_neon, // H_DCT
+ highbd_fadst8_col_many_neon, // V_ADST
+ highbd_fidentity8_col_many_neon, // H_ADST
+ highbd_fadst8_col_many_neon, // V_FLIPADST
+ highbd_fidentity8_col_many_neon // H_FLIPADST
+ };
+
+static const fwd_transform_1d_col_neon col_highbd_txfm8_x4_arr[TX_TYPES] = {
+ highbd_fdct8_col_neon, // DCT_DCT
+ highbd_fadst8_col_neon, // ADST_DCT
+ highbd_fdct8_col_neon, // DCT_ADST
+ highbd_fadst8_col_neon, // ADST_ADST
+ highbd_fadst8_col_neon, // FLIPADST_DCT
+ highbd_fdct8_col_neon, // DCT_FLIPADST
+ highbd_fadst8_col_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_col_neon, // ADST_FLIPADST
+ highbd_fadst8_col_neon, // FLIPADST_ADST
+ highbd_fidentity8_col_neon, // IDTX
+ highbd_fdct8_col_neon, // V_DCT
+ highbd_fidentity8_col_neon, // H_DCT
+ highbd_fadst8_col_neon, // V_ADST
+ highbd_fidentity8_col_neon, // H_ADST
+ highbd_fadst8_col_neon, // V_FLIPADST
+ highbd_fidentity8_col_neon // H_FLIPADST
+};
+
+static const fwd_transform_1d_col_many_neon
+ col_highbd_txfm16_xn_arr[TX_TYPES] = {
+ highbd_fdct16_col_many_neon, // DCT_DCT
+ highbd_fadst16_col_many_neon, // ADST_DCT
+ highbd_fdct16_col_many_neon, // DCT_ADST
+ highbd_fadst16_col_many_neon, // ADST_ADST
+ highbd_fadst16_col_many_neon, // FLIPADST_DCT
+ highbd_fdct16_col_many_neon, // DCT_FLIPADST
+ highbd_fadst16_col_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst16_col_many_neon, // ADST_FLIPADST
+ highbd_fadst16_col_many_neon, // FLIPADST_ADST
+ highbd_fidentity16_col_many_neon, // IDTX
+ highbd_fdct16_col_many_neon, // V_DCT
+ highbd_fidentity16_col_many_neon, // H_DCT
+ highbd_fadst16_col_many_neon, // V_ADST
+ highbd_fidentity16_col_many_neon, // H_ADST
+ highbd_fadst16_col_many_neon, // V_FLIPADST
+ highbd_fidentity16_col_many_neon // H_FLIPADST
+ };
+
+static const fwd_transform_1d_col_many_neon
+ col_highbd_txfm4_xn_arr[TX_TYPES] = {
+ highbd_fdct4_col_many_neon, // DCT_DCT
+ highbd_fadst4_col_many_neon, // ADST_DCT
+ highbd_fdct4_col_many_neon, // DCT_ADST
+ highbd_fadst4_col_many_neon, // ADST_ADST
+ highbd_fadst4_col_many_neon, // FLIPADST_DCT
+ highbd_fdct4_col_many_neon, // DCT_FLIPADST
+ highbd_fadst4_col_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst4_col_many_neon, // ADST_FLIPADST
+ highbd_fadst4_col_many_neon, // FLIPADST_ADST
+ highbd_fidentity4_col_many_neon, // IDTX
+ highbd_fdct4_col_many_neon, // V_DCT
+ highbd_fidentity4_col_many_neon, // H_DCT
+ highbd_fadst4_col_many_neon, // V_ADST
+ highbd_fidentity4_col_many_neon, // H_ADST
+ highbd_fadst4_col_many_neon, // V_FLIPADST
+ highbd_fidentity4_col_many_neon // H_FLIPADST
+ };
+
+static const fwd_transform_1d_row_neon row_highbd_txfm16_xn_arr[TX_TYPES] = {
+ highbd_fdct16_row_neon, // DCT_DCT
+ highbd_fdct16_row_neon, // ADST_DCT
+ highbd_fadst16_row_neon, // DCT_ADST
+ highbd_fadst16_row_neon, // ADST_ADST
+ highbd_fdct16_row_neon, // FLIPADST_DCT
+ highbd_fadst16_row_neon, // DCT_FLIPADST
+ highbd_fadst16_row_neon, // FLIPADST_FLIPADST
+ highbd_fadst16_row_neon, // ADST_FLIPADST
+ highbd_fadst16_row_neon, // FLIPADST_ADST
+ highbd_fidentity16_row_neon, // IDTX
+ highbd_fidentity16_row_neon, // V_DCT
+ highbd_fdct16_row_neon, // H_DCT
+ highbd_fidentity16_row_neon, // V_ADST
+ highbd_fadst16_row_neon, // H_ADST
+ highbd_fidentity16_row_neon, // V_FLIPADST
+ highbd_fadst16_row_neon // H_FLIPADST
+};
+
+static const fwd_transform_1d_row_many_neon
+ row_rect_highbd_txfm16_xn_arr[TX_TYPES] = {
+ highbd_fdct16_row_rect_many_neon, // DCT_DCT
+ highbd_fdct16_row_rect_many_neon, // ADST_DCT
+ highbd_fadst16_row_rect_many_neon, // DCT_ADST
+ highbd_fadst16_row_rect_many_neon, // ADST_ADST
+ highbd_fdct16_row_rect_many_neon, // FLIPADST_DCT
+ highbd_fadst16_row_rect_many_neon, // DCT_FLIPADST
+ highbd_fadst16_row_rect_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst16_row_rect_many_neon, // ADST_FLIPADST
+ highbd_fadst16_row_rect_many_neon, // FLIPADST_ADST
+ highbd_fidentity16_row_rect_many_neon, // IDTX
+ highbd_fidentity16_row_rect_many_neon, // V_DCT
+ highbd_fdct16_row_rect_many_neon, // H_DCT
+ highbd_fidentity16_row_rect_many_neon, // V_ADST
+ highbd_fadst16_row_rect_many_neon, // H_ADST
+ highbd_fidentity16_row_rect_many_neon, // V_FLIPADST
+ highbd_fadst16_row_rect_many_neon // H_FLIPADST
+ };
+
+#if !CONFIG_REALTIME_ONLY
+static const fwd_transform_1d_row_many_neon
+ row_highbd_txfm8_xn_arr[TX_TYPES] = {
+ highbd_fdct8_row_many_neon, // DCT_DCT
+ highbd_fdct8_row_many_neon, // ADST_DCT
+ highbd_fadst8_row_many_neon, // DCT_ADST
+ highbd_fadst8_row_many_neon, // ADST_ADST
+ highbd_fdct8_row_many_neon, // FLIPADST_DCT
+ highbd_fadst8_row_many_neon, // DCT_FLIPADST
+ highbd_fadst8_row_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_row_many_neon, // ADST_FLIPADST
+ highbd_fadst8_row_many_neon, // FLIPADST_ADST
+ highbd_fidentity8_row_many_neon, // IDTX
+ highbd_fidentity8_row_many_neon, // V_DCT
+ highbd_fdct8_row_many_neon, // H_DCT
+ highbd_fidentity8_row_many_neon, // V_ADST
+ highbd_fadst8_row_many_neon, // H_ADST
+ highbd_fidentity8_row_many_neon, // V_FLIPADST
+ highbd_fadst8_row_many_neon // H_FLIPADST
+ };
+#endif  // !CONFIG_REALTIME_ONLY
+
+static const fwd_transform_1d_row_many_neon
+ row_rect_highbd_txfm8_xn_arr[TX_TYPES] = {
+ highbd_fdct8_row_rect_many_neon, // DCT_DCT
+ highbd_fdct8_row_rect_many_neon, // ADST_DCT
+ highbd_fadst8_row_rect_many_neon, // DCT_ADST
+ highbd_fadst8_row_rect_many_neon, // ADST_ADST
+ highbd_fdct8_row_rect_many_neon, // FLIPADST_DCT
+ highbd_fadst8_row_rect_many_neon, // DCT_FLIPADST
+ highbd_fadst8_row_rect_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_row_rect_many_neon, // ADST_FLIPADST
+ highbd_fadst8_row_rect_many_neon, // FLIPADST_ADST
+ highbd_fidentity8_row_rect_many_neon, // IDTX
+ highbd_fidentity8_row_rect_many_neon, // V_DCT
+ highbd_fdct8_row_rect_many_neon, // H_DCT
+ highbd_fidentity8_row_rect_many_neon, // V_ADST
+ highbd_fadst8_row_rect_many_neon, // H_ADST
+ highbd_fidentity8_row_rect_many_neon, // V_FLIPADST
+ highbd_fadst8_row_rect_many_neon // H_FLIPADST
+ };
+
+static const fwd_transform_1d_row_neon row_highbd_txfm8_x4_arr[TX_TYPES] = {
+ highbd_fdct8_row_rect_neon, // DCT_DCT
+ highbd_fdct8_row_rect_neon, // ADST_DCT
+ highbd_fadst8_row_rect_neon, // DCT_ADST
+ highbd_fadst8_row_rect_neon, // ADST_ADST
+ highbd_fdct8_row_rect_neon, // FLIPADST_DCT
+ highbd_fadst8_row_rect_neon, // DCT_FLIPADST
+ highbd_fadst8_row_rect_neon, // FLIPADST_FLIPADST
+ highbd_fadst8_row_rect_neon, // ADST_FLIPADST
+ highbd_fadst8_row_rect_neon, // FLIPADST_ADST
+ highbd_fidentity8_row_rect_neon, // IDTX
+ highbd_fidentity8_row_rect_neon, // V_DCT
+ highbd_fdct8_row_rect_neon, // H_DCT
+ highbd_fidentity8_row_rect_neon, // V_ADST
+ highbd_fadst8_row_rect_neon, // H_ADST
+ highbd_fidentity8_row_rect_neon, // V_FLIPADST
+ highbd_fadst8_row_rect_neon // H_FLIPADST
+};
+
+#if !CONFIG_REALTIME_ONLY
+static const fwd_transform_1d_row_many_neon
+ row_highbd_txfm4_xn_arr[TX_TYPES] = {
+ highbd_fdct4_row_many_neon, // DCT_DCT
+ highbd_fdct4_row_many_neon, // ADST_DCT
+ highbd_fadst4_row_many_neon, // DCT_ADST
+ highbd_fadst4_row_many_neon, // ADST_ADST
+ highbd_fdct4_row_many_neon, // FLIPADST_DCT
+ highbd_fadst4_row_many_neon, // DCT_FLIPADST
+ highbd_fadst4_row_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst4_row_many_neon, // ADST_FLIPADST
+ highbd_fadst4_row_many_neon, // FLIPADST_ADST
+ highbd_fidentity4_row_many_neon, // IDTX
+ highbd_fidentity4_row_many_neon, // V_DCT
+ highbd_fdct4_row_many_neon, // H_DCT
+ highbd_fidentity4_row_many_neon, // V_ADST
+ highbd_fadst4_row_many_neon, // H_ADST
+ highbd_fidentity4_row_many_neon, // V_FLIPADST
+ highbd_fadst4_row_many_neon // H_FLIPADST
+ };
+#endif
+
+static const fwd_transform_1d_row_many_neon
+ row_rect_highbd_txfm4_xn_arr[TX_TYPES] = {
+ highbd_fdct4_row_rect_many_neon, // DCT_DCT
+ highbd_fdct4_row_rect_many_neon, // ADST_DCT
+ highbd_fadst4_row_rect_many_neon, // DCT_ADST
+ highbd_fadst4_row_rect_many_neon, // ADST_ADST
+ highbd_fdct4_row_rect_many_neon, // FLIPADST_DCT
+ highbd_fadst4_row_rect_many_neon, // DCT_FLIPADST
+ highbd_fadst4_row_rect_many_neon, // FLIPADST_FLIPADST
+ highbd_fadst4_row_rect_many_neon, // ADST_FLIPADST
+ highbd_fadst4_row_rect_many_neon, // FLIPADST_ADST
+ highbd_fidentity4_row_rect_many_neon, // IDTX
+ highbd_fidentity4_row_rect_many_neon, // V_DCT
+ highbd_fdct4_row_rect_many_neon, // H_DCT
+ highbd_fidentity4_row_rect_many_neon, // V_ADST
+ highbd_fadst4_row_rect_many_neon, // H_ADST
+ highbd_fidentity4_row_rect_many_neon, // V_FLIPADST
+ highbd_fadst4_row_rect_many_neon // H_FLIPADST
+ };
+
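+// Forward 32-point DCT over 4 lanes at a time: each int32x4_t holds one
+// element from each of 4 transforms computed in parallel.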
+static void highbd_fdct32_x4_neon(const int32x4_t *input, int32x4_t *output,
+ int cos_bit) {
+ const int32_t *const cospi = cospi_arr_s32(cos_bit);
+ const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+ // Workspaces for intermediate transform steps.
+ int32x4_t buf0[32];
+ int32x4_t buf1[32];
+
+ // stage 1
+ butterfly_dct_pre(input, buf1, 32);
+
+ // stage 2
+ butterfly_dct_pre(buf1, buf0, 16);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ butterfly_0112_neon(cospi, 32, buf1[27], buf1[20], &buf0[27], &buf0[20],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf1[26], buf1[21], &buf0[26], &buf0[21],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf1[25], buf1[22], &buf0[25], &buf0[22],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf1[24], buf1[23], &buf0[24], &buf0[23],
+ v_cos_bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ butterfly_dct_pre(buf0, buf1, 8);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ butterfly_0112_neon(cospi, 32, buf0[13], buf0[10], &buf1[13], &buf1[10],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 32, buf0[12], buf0[11], &buf1[12], &buf1[11],
+ v_cos_bit);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 16);
+
+ // stage 4
+ butterfly_dct_pre(buf1, buf0, 4);
+ buf0[4] = buf1[4];
+ butterfly_0112_neon(cospi, 32, buf1[6], buf1[5], &buf0[6], &buf0[5],
+ v_cos_bit);
+ buf0[7] = buf1[7];
+ butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 8);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ butterfly_0112_neon(cospi, 16, buf1[29], buf1[18], &buf0[29], &buf0[18],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 16, buf1[28], buf1[19], &buf0[28], &buf0[19],
+ v_cos_bit);
+ butterfly_2312_neon(cospi, 16, buf1[27], buf1[20], &buf0[20], &buf0[27],
+ v_cos_bit);
+ butterfly_2312_neon(cospi, 16, buf1[26], buf1[21], &buf0[21], &buf0[26],
+ v_cos_bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ butterfly_0112_neon(cospi, 32, buf0[0], buf0[1], &buf1[0], &buf1[1],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 16, buf0[3], buf0[2], &buf1[2], &buf1[3],
+ v_cos_bit);
+ butterfly_dct_post(buf0 + 4, buf0 + 4, buf1 + 4, 4);
+ buf1[8] = buf0[8];
+ butterfly_0112_neon(cospi, 16, buf0[14], buf0[9], &buf1[14], &buf1[9],
+ v_cos_bit);
+ butterfly_2312_neon(cospi, 16, buf0[13], buf0[10], &buf1[10], &buf1[13],
+ v_cos_bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 8);
+ butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 8);
+
+ // stage 6
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+
+ butterfly_0112_neon(cospi, 8, buf1[7], buf1[4], &buf0[4], &buf0[7],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 8, buf1[30], buf1[17], &buf0[30], &buf0[17],
+ v_cos_bit);
+ butterfly_2312_neon(cospi, 8, buf1[29], buf1[18], &buf0[18], &buf0[29],
+ v_cos_bit);
+ butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 4);
+ butterfly_dct_post(buf1 + 12, buf1 + 12, buf0 + 12, 4);
+ buf0[16] = buf1[16];
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+
+ butterfly_0130_neon(cospi, 24, buf1[5], buf1[6], &buf0[5], &buf0[6],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 24, buf1[21], buf1[26], &buf0[26], &buf0[21],
+ v_cos_bit);
+ butterfly_0332_neon(cospi, 24, buf1[25], buf1[22], &buf0[25], &buf0[22],
+ v_cos_bit);
+
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ butterfly_0112_neon(cospi, 4, buf0[15], buf0[8], &buf1[8], &buf1[15],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 28, buf0[9], buf0[14], &buf1[9], &buf1[14],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 20, buf0[13], buf0[10], &buf1[10], &buf1[13],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 12, buf0[11], buf0[12], &buf1[11], &buf1[12],
+ v_cos_bit);
+ butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 4);
+ butterfly_dct_post(buf0 + 20, buf0 + 20, buf1 + 20, 4);
+ butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 4);
+ butterfly_dct_post(buf0 + 28, buf0 + 28, buf1 + 28, 4);
+
+ // stage 8
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ butterfly_0112_neon(cospi, 2, buf1[31], buf1[16], &buf0[16], &buf0[31],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 30, buf1[17], buf1[30], &buf0[17], &buf0[30],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 18, buf1[29], buf1[18], &buf0[18], &buf0[29],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 14, buf1[19], buf1[28], &buf0[19], &buf0[28],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 10, buf1[27], buf1[20], &buf0[20], &buf0[27],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 22, buf1[21], buf1[26], &buf0[21], &buf0[26],
+ v_cos_bit);
+ butterfly_0112_neon(cospi, 26, buf1[25], buf1[22], &buf0[22], &buf0[25],
+ v_cos_bit);
+ butterfly_0130_neon(cospi, 6, buf1[23], buf1[24], &buf0[23], &buf0[24],
+ v_cos_bit);
+
+ // stage 9
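+  // Reorder the outputs: coefficient k is taken from buf0 at the 5-bit
+  // bit-reversal of k, the usual decimation-in-frequency permutation.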
+ output[0] = buf0[0];
+ output[1] = buf0[16];
+ output[2] = buf0[8];
+ output[3] = buf0[24];
+ output[4] = buf0[4];
+ output[5] = buf0[20];
+ output[6] = buf0[12];
+ output[7] = buf0[28];
+ output[8] = buf0[2];
+ output[9] = buf0[18];
+ output[10] = buf0[10];
+ output[11] = buf0[26];
+ output[12] = buf0[6];
+ output[13] = buf0[22];
+ output[14] = buf0[14];
+ output[15] = buf0[30];
+ output[16] = buf0[1];
+ output[17] = buf0[17];
+ output[18] = buf0[9];
+ output[19] = buf0[25];
+ output[20] = buf0[5];
+ output[21] = buf0[21];
+ output[22] = buf0[13];
+ output[23] = buf0[29];
+ output[24] = buf0[3];
+ output[25] = buf0[19];
+ output[26] = buf0[11];
+ output[27] = buf0[27];
+ output[28] = buf0[7];
+ output[29] = buf0[23];
+ output[30] = buf0[15];
+ output[31] = buf0[31];
+}
+
+static void highbd_fdct64_x4_neon(const int32x4_t *input, int32x4_t *output,
+ int8_t cos_bit) {
+ const int32_t *const cospi = cospi_arr_s32(cos_bit);
+ const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);
+
+ // stage 1
+ int32x4_t x1[64];
+ butterfly_dct_pre(input, x1, 64);
+
+ // stage 2
+ int32x4_t x2[64];
+ butterfly_dct_pre(x1, x2, 32);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
+ butterfly_0112_neon(cospi, 32, x1[55], x1[40], &x2[55], &x2[40], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[54], x1[41], &x2[54], &x2[41], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[53], x1[42], &x2[53], &x2[42], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[52], x1[43], &x2[52], &x2[43], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[51], x1[44], &x2[51], &x2[44], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[50], x1[45], &x2[50], &x2[45], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[49], x1[46], &x2[49], &x2[46], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x1[48], x1[47], &x2[48], &x2[47], v_cos_bit);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+
+ // stage 3
+ int32x4_t x3[64];
+ butterfly_dct_pre(x2, x3, 16);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ butterfly_0112_neon(cospi, 32, x2[27], x2[20], &x3[27], &x3[20], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x2[26], x2[21], &x3[26], &x3[21], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x2[25], x2[22], &x3[25], &x3[22], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x2[24], x2[23], &x3[24], &x3[23], v_cos_bit);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ butterfly_dct_post(x2 + 32, x2 + 32, x3 + 32, 32);
+
+ // stage 4
+ int32x4_t x4[64];
+ butterfly_dct_pre(x3, x4, 8);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ butterfly_0112_neon(cospi, 32, x3[13], x3[10], &x4[13], &x4[10], v_cos_bit);
+ butterfly_0112_neon(cospi, 32, x3[12], x3[11], &x4[12], &x4[11], v_cos_bit);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ butterfly_dct_post(x3 + 16, x3 + 16, x4 + 16, 16);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ butterfly_0112_neon(cospi, 16, x3[59], x3[36], &x4[59], &x4[36], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x3[58], x3[37], &x4[58], &x4[37], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x3[57], x3[38], &x4[57], &x4[38], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x3[56], x3[39], &x4[56], &x4[39], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x3[55], x3[40], &x4[40], &x4[55], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x3[54], x3[41], &x4[41], &x4[54], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x3[53], x3[42], &x4[42], &x4[53], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x3[52], x3[43], &x4[43], &x4[52], v_cos_bit);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+
+ // stage 5
+ int32x4_t x5[64];
+ butterfly_dct_pre(x4, x5, 4);
+ x5[4] = x4[4];
+ butterfly_0112_neon(cospi, 32, x4[6], x4[5], &x5[6], &x5[5], v_cos_bit);
+ x5[7] = x4[7];
+ butterfly_dct_post(x4 + 8, x4 + 8, x5 + 8, 8);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ butterfly_0112_neon(cospi, 16, x4[29], x4[18], &x5[29], &x5[18], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x4[28], x4[19], &x5[28], &x5[19], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x4[27], x4[20], &x5[20], &x5[27], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x4[26], x4[21], &x5[21], &x5[26], v_cos_bit);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ butterfly_dct_post(x4 + 32, x4 + 32, x5 + 32, 16);
+ butterfly_dct_post(x4 + 48, x4 + 48, x5 + 48, 16);
+
+ // stage 6
+ int32x4_t x6[64];
+ butterfly_0112_neon(cospi, 32, x5[0], x5[1], &x6[0], &x6[1], v_cos_bit);
+ butterfly_0112_neon(cospi, 16, x5[3], x5[2], &x6[2], &x6[3], v_cos_bit);
+ butterfly_dct_post(x5 + 4, x5 + 4, x6 + 4, 4);
+ x6[8] = x5[8];
+ butterfly_0112_neon(cospi, 16, x5[14], x5[9], &x6[14], &x6[9], v_cos_bit);
+ butterfly_2312_neon(cospi, 16, x5[13], x5[10], &x6[10], &x6[13], v_cos_bit);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ butterfly_dct_post(x5 + 16, x5 + 16, x6 + 16, 8);
+ butterfly_dct_post(x5 + 24, x5 + 24, x6 + 24, 8);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ butterfly_0112_neon(cospi, 8, x5[61], x5[34], &x6[61], &x6[34], v_cos_bit);
+ butterfly_0112_neon(cospi, 8, x5[60], x5[35], &x6[60], &x6[35], v_cos_bit);
+ butterfly_2312_neon(cospi, 8, x5[59], x5[36], &x6[36], &x6[59], v_cos_bit);
+ butterfly_2312_neon(cospi, 8, x5[58], x5[37], &x6[37], &x6[58], v_cos_bit);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ butterfly_0130_neon(cospi, 24, x5[42], x5[53], &x6[53], &x6[42], v_cos_bit);
+ butterfly_0130_neon(cospi, 24, x5[43], x5[52], &x6[52], &x6[43], v_cos_bit);
+ butterfly_0332_neon(cospi, 24, x5[51], x5[44], &x6[51], &x6[44], v_cos_bit);
+ butterfly_0332_neon(cospi, 24, x5[50], x5[45], &x6[50], &x6[45], v_cos_bit);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+
+ // stage 7
+ int32x4_t x7[64];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ butterfly_0112_neon(cospi, 8, x6[7], x6[4], &x7[4], &x7[7], v_cos_bit);
+ butterfly_0130_neon(cospi, 24, x6[5], x6[6], &x7[5], &x7[6], v_cos_bit);
+ butterfly_dct_post(x6 + 8, x6 + 8, x7 + 8, 4);
+ butterfly_dct_post(x6 + 12, x6 + 12, x7 + 12, 4);
+ x7[16] = x6[16];
+ butterfly_0112_neon(cospi, 8, x6[30], x6[17], &x7[30], &x7[17], v_cos_bit);
+ butterfly_2312_neon(cospi, 8, x6[29], x6[18], &x7[18], &x7[29], v_cos_bit);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ butterfly_0130_neon(cospi, 24, x6[21], x6[26], &x7[26], &x7[21], v_cos_bit);
+ butterfly_0332_neon(cospi, 24, x6[25], x6[22], &x7[25], &x7[22], v_cos_bit);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ butterfly_dct_post(x6 + 32, x6 + 32, x7 + 32, 8);
+ butterfly_dct_post(x6 + 40, x6 + 40, x7 + 40, 8);
+ butterfly_dct_post(x6 + 48, x6 + 48, x7 + 48, 8);
+ butterfly_dct_post(x6 + 56, x6 + 56, x7 + 56, 8);
+
+ // stage 8
+ int32x4_t x8[64];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+
+ butterfly_0112_neon(cospi, 4, x7[15], x7[8], &x8[8], &x8[15], v_cos_bit);
+ butterfly_0130_neon(cospi, 28, x7[9], x7[14], &x8[9], &x8[14], v_cos_bit);
+ butterfly_0112_neon(cospi, 20, x7[13], x7[10], &x8[10], &x8[13], v_cos_bit);
+ butterfly_0130_neon(cospi, 12, x7[11], x7[12], &x8[11], &x8[12], v_cos_bit);
+ butterfly_dct_post(x7 + 16, x7 + 16, x8 + 16, 4);
+ butterfly_dct_post(x7 + 20, x7 + 20, x8 + 20, 4);
+ butterfly_dct_post(x7 + 24, x7 + 24, x8 + 24, 4);
+ butterfly_dct_post(x7 + 28, x7 + 28, x8 + 28, 4);
+ x8[32] = x7[32];
+ butterfly_0112_neon(cospi, 4, x7[62], x7[33], &x8[62], &x8[33], v_cos_bit);
+ butterfly_2312_neon(cospi, 4, x7[61], x7[34], &x8[34], &x8[61], v_cos_bit);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ butterfly_0130_neon(cospi, 28, x7[37], x7[58], &x8[58], &x8[37], v_cos_bit);
+ butterfly_0332_neon(cospi, 28, x7[57], x7[38], &x8[57], &x8[38], v_cos_bit);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ butterfly_0112_neon(cospi, 20, x7[54], x7[41], &x8[54], &x8[41], v_cos_bit);
+ butterfly_2312_neon(cospi, 20, x7[53], x7[42], &x8[42], &x8[53], v_cos_bit);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ butterfly_0130_neon(cospi, 12, x7[45], x7[50], &x8[50], &x8[45], v_cos_bit);
+ butterfly_0332_neon(cospi, 12, x7[49], x7[46], &x8[49], &x8[46], v_cos_bit);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+
+ // stage 9
+ int32x4_t x9[64];
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ butterfly_0112_neon(cospi, 2, x8[31], x8[16], &x9[16], &x9[31], v_cos_bit);
+ butterfly_0130_neon(cospi, 30, x8[17], x8[30], &x9[17], &x9[30], v_cos_bit);
+ butterfly_0112_neon(cospi, 18, x8[29], x8[18], &x9[18], &x9[29], v_cos_bit);
+ butterfly_0130_neon(cospi, 14, x8[19], x8[28], &x9[19], &x9[28], v_cos_bit);
+ butterfly_0112_neon(cospi, 10, x8[27], x8[20], &x9[20], &x9[27], v_cos_bit);
+ butterfly_0130_neon(cospi, 22, x8[21], x8[26], &x9[21], &x9[26], v_cos_bit);
+ butterfly_0112_neon(cospi, 26, x8[25], x8[22], &x9[22], &x9[25], v_cos_bit);
+ butterfly_0130_neon(cospi, 6, x8[23], x8[24], &x9[23], &x9[24], v_cos_bit);
+ butterfly_dct_post(x8 + 32, x8 + 32, x9 + 32, 4);
+ butterfly_dct_post(x8 + 36, x8 + 36, x9 + 36, 4);
+ butterfly_dct_post(x8 + 40, x8 + 40, x9 + 40, 4);
+ butterfly_dct_post(x8 + 44, x8 + 44, x9 + 44, 4);
+ butterfly_dct_post(x8 + 48, x8 + 48, x9 + 48, 4);
+ butterfly_dct_post(x8 + 52, x8 + 52, x9 + 52, 4);
+ butterfly_dct_post(x8 + 56, x8 + 56, x9 + 56, 4);
+ butterfly_dct_post(x8 + 60, x8 + 60, x9 + 60, 4);
+
+ // stage 10
+ int32x4_t x10[64];
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ butterfly_0112_neon(cospi, 1, x9[63], x9[32], &x10[32], &x10[63], v_cos_bit);
+ butterfly_0130_neon(cospi, 31, x9[33], x9[62], &x10[33], &x10[62], v_cos_bit);
+ butterfly_0112_neon(cospi, 17, x9[61], x9[34], &x10[34], &x10[61], v_cos_bit);
+ butterfly_0130_neon(cospi, 15, x9[35], x9[60], &x10[35], &x10[60], v_cos_bit);
+ butterfly_0112_neon(cospi, 9, x9[59], x9[36], &x10[36], &x10[59], v_cos_bit);
+ butterfly_0130_neon(cospi, 23, x9[37], x9[58], &x10[37], &x10[58], v_cos_bit);
+ butterfly_0112_neon(cospi, 25, x9[57], x9[38], &x10[38], &x10[57], v_cos_bit);
+ butterfly_0130_neon(cospi, 7, x9[39], x9[56], &x10[39], &x10[56], v_cos_bit);
+ butterfly_0112_neon(cospi, 5, x9[55], x9[40], &x10[40], &x10[55], v_cos_bit);
+ butterfly_0130_neon(cospi, 27, x9[41], x9[54], &x10[41], &x10[54], v_cos_bit);
+ butterfly_0112_neon(cospi, 21, x9[53], x9[42], &x10[42], &x10[53], v_cos_bit);
+ butterfly_0130_neon(cospi, 11, x9[43], x9[52], &x10[43], &x10[52], v_cos_bit);
+ butterfly_0112_neon(cospi, 13, x9[51], x9[44], &x10[44], &x10[51], v_cos_bit);
+ butterfly_0130_neon(cospi, 19, x9[45], x9[50], &x10[45], &x10[50], v_cos_bit);
+ butterfly_0112_neon(cospi, 29, x9[49], x9[46], &x10[46], &x10[49], v_cos_bit);
+ butterfly_0130_neon(cospi, 3, x9[47], x9[48], &x10[47], &x10[48], v_cos_bit);
+
+ // stage 11
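+  // Reorder the outputs: coefficient k is taken from x10 at the 6-bit
+  // bit-reversal of k, the usual decimation-in-frequency permutation.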
+ output[0] = x10[0];
+ output[1] = x10[32];
+ output[2] = x10[16];
+ output[3] = x10[48];
+ output[4] = x10[8];
+ output[5] = x10[40];
+ output[6] = x10[24];
+ output[7] = x10[56];
+ output[8] = x10[4];
+ output[9] = x10[36];
+ output[10] = x10[20];
+ output[11] = x10[52];
+ output[12] = x10[12];
+ output[13] = x10[44];
+ output[14] = x10[28];
+ output[15] = x10[60];
+ output[16] = x10[2];
+ output[17] = x10[34];
+ output[18] = x10[18];
+ output[19] = x10[50];
+ output[20] = x10[10];
+ output[21] = x10[42];
+ output[22] = x10[26];
+ output[23] = x10[58];
+ output[24] = x10[6];
+ output[25] = x10[38];
+ output[26] = x10[22];
+ output[27] = x10[54];
+ output[28] = x10[14];
+ output[29] = x10[46];
+ output[30] = x10[30];
+ output[31] = x10[62];
+ output[32] = x10[1];
+ output[33] = x10[33];
+ output[34] = x10[17];
+ output[35] = x10[49];
+ output[36] = x10[9];
+ output[37] = x10[41];
+ output[38] = x10[25];
+ output[39] = x10[57];
+ output[40] = x10[5];
+ output[41] = x10[37];
+ output[42] = x10[21];
+ output[43] = x10[53];
+ output[44] = x10[13];
+ output[45] = x10[45];
+ output[46] = x10[29];
+ output[47] = x10[61];
+ output[48] = x10[3];
+ output[49] = x10[35];
+ output[50] = x10[19];
+ output[51] = x10[51];
+ output[52] = x10[11];
+ output[53] = x10[43];
+ output[54] = x10[27];
+ output[55] = x10[59];
+ output[56] = x10[7];
+ output[57] = x10[39];
+ output[58] = x10[23];
+ output[59] = x10[55];
+ output[60] = x10[15];
+ output[61] = x10[47];
+ output[62] = x10[31];
+ output[63] = x10[63];
+}
+
+static void highbd_fidentity32_x4_neon(const int32x4_t *input,
+ int32x4_t *output, int cos_bit) {
+ (void)cos_bit;
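+  // The 32-point identity transform scales by 4, i.e. a left shift by 2.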
+ for (int i = 0; i < 32; i++) {
+ output[i] = vshlq_n_s32(input[i], 2);
+ }
+}
+
+TRANSFORM_COL_MANY(fdct32, 32)
+TRANSFORM_COL_MANY(fidentity32, 32)
+
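+// AV1 only allows DCT and identity for 32-point transforms, so all other
+// tx_type entries are NULL.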
+static const fwd_transform_1d_col_many_neon
+ col_highbd_txfm32_x4_arr[TX_TYPES] = {
+ highbd_fdct32_col_many_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ highbd_fidentity32_col_many_neon, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+ };
+
+TRANSFORM_ROW_MANY(fdct32, 32)
+TRANSFORM_ROW_MANY(fidentity32, 32)
+
+static const fwd_transform_1d_row_many_neon
+ row_highbd_txfm32_x4_arr[TX_TYPES] = {
+ highbd_fdct32_row_many_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ highbd_fidentity32_row_many_neon, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+ };
+
+TRANSFORM_ROW_RECT_MANY(fdct32, 32)
+TRANSFORM_ROW_RECT_MANY(fidentity32, 32)
+
+static const fwd_transform_1d_row_many_neon
+ row_rect_highbd_txfm32_x4_arr[TX_TYPES] = {
+ highbd_fdct32_row_rect_many_neon, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ highbd_fidentity32_row_rect_many_neon, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+ };
+
+void av1_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm8_xn_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_rect_highbd_txfm16_xn_arr[tx_type];
+ int bit = av1_fwd_cos_bit_col[2][1];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+ // Column-wise transform.
+ int32x4_t buf0[32];
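+  // For lr_flip the column results are written starting at the last group of
+  // 4 columns and stepping backwards via a negative hm_stride.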
+ if (lr_flip) {
+ col_txfm(input, buf0 + 3 * 8, stride, bit, /*lr_flip=*/1, /*howmany=*/4,
+ /*hm_stride=*/-8);
+ } else {
+ col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/4,
+ /*hm_stride=*/8);
+ }
+ shift_right_2_round_s32_x4(buf0, buf0, 32);
+
+ int32x4_t buf1[32];
+ transpose_arrays_s32_16x8(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bit, /*howmany=*/2, /*hm_stride=*/16, /*stride=*/8);
+}
+
+void av1_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm16_xn_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_rect_highbd_txfm8_xn_arr[tx_type];
+ int bit = av1_fwd_cos_bit_col[1][2];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+ // Column-wise transform.
+ int32x4_t buf0[32];
+ if (lr_flip) {
+ col_txfm(input, buf0 + 16, stride, bit, /*lr_flip=*/1, /*howmany=*/2,
+ /*hm_stride=*/-16);
+ } else {
+ col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/2,
+ /*hm_stride=*/16);
+ }
+ shift_right_2_round_s32_x4(buf0, buf0, 32);
+
+ int32x4_t buf1[32];
+ transpose_arrays_s32_8x16(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bit, /*howmany=*/4, /*hm_stride=*/8, /*stride=*/16);
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int bitcol = av1_fwd_cos_bit_col[0][2];
+ int bitrow = av1_fwd_cos_bit_row[0][2];
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm16_xn_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_highbd_txfm4_xn_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+ // Column-wise transform.
+ int32x4_t buf0[16];
+ if (lr_flip) {
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/1, /*howmany=*/1,
+ /*hm_stride=*/0);
+ } else {
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/1,
+ /*hm_stride=*/0);
+ }
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+
+ int32x4_t buf1[16];
+ transpose_arrays_s32_4x16(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/4, /*stride=*/16);
+}
+#endif
+
+void av1_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int bitcol = av1_fwd_cos_bit_col[2][0];
+ int bitrow = av1_fwd_cos_bit_row[2][0];
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm4_xn_arr[tx_type];
+ const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm16_xn_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+
+ // Column-wise transform.
+ int32x4_t buf0[16];
+ if (lr_flip) {
+ col_txfm(input, buf0 + 3 * 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/4,
+ /*hm_stride=*/-4);
+ } else {
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4,
+ /*hm_stride=*/4);
+ }
+
+ shift_right_1_round_s32_x4(buf0, buf0, 16);
+ transpose_arrays_s32_4x16(buf0, buf0);
+
+ // Row-wise transform.
+ row_txfm(buf0, coeff, bitrow, /*stride=*/4);
+}
+
+void av1_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm32_x4_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_rect_highbd_txfm16_xn_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[2][3];
+ int bitrow = av1_fwd_cos_bit_row[2][3];
+
+ // Column-wise transform.
+ int32x4_t buf0[128];
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4,
+ /*hm_stride=*/32);
+ shift_right_4_round_s32_x4(buf0, buf0, 128);
+
+ int32x4_t buf1[128];
+ transpose_arrays_s32_16x32(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/16, /*stride=*/32);
+}
+
+void av1_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ int bitcol = av1_fwd_cos_bit_col[3][4];
+ int bitrow = av1_fwd_cos_bit_row[3][4];
+
+ // Column-wise transform.
+ int32x4_t buf0[512];
+ load_buffer_32x64(input, buf0, stride, 0);
+ for (int i = 0; i < 8; i++) {
+ highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol);
+ }
+ shift_right_2_round_s32_x4(buf0, buf0, 512);
+
+ int32x4_t buf1[512];
+ transpose_arrays_s32_32x64(buf0, buf1);
+
+ // Row-wise transform.
+ for (int i = 0; i < 16; i++) {
+ highbd_fdct32_x4_neon(buf1 + i * 32, buf1 + i * 32, bitrow);
+ }
+ round_shift2_rect_array_s32_neon(buf1, buf1, 512);
+ store_buffer_32x32(buf1, coeff, /*stride=*/32);
+}
+
+void av1_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ int bitcol = av1_fwd_cos_bit_col[4][3];
+ int bitrow = av1_fwd_cos_bit_row[4][3];
+
+ // Column-wise transform.
+ int32x4_t buf0[512];
+ load_buffer_64x32(input, buf0, stride, 0);
+ for (int i = 0; i < 16; i++) {
+ highbd_fdct32_x4_neon(buf0 + i * 32, buf0 + i * 32, bitcol);
+ }
+ shift_right_4_round_s32_x4(buf0, buf0, 512);
+
+ int32x4_t buf1[512];
+ transpose_arrays_s32_64x32(buf0, buf1);
+
+ // Row-wise transform.
+ for (int i = 0; i < 8; i++) {
+ highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow);
+ }
+ round_shift2_rect_array_s32_neon(buf1, buf1, 512);
+ store_buffer_64x32(buf1, coeff, /*stride=*/32);
+}
+
+void av1_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm16_xn_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_rect_highbd_txfm32_x4_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[3][2];
+ int bitrow = av1_fwd_cos_bit_row[3][2];
+
+ // Column-wise transform.
+ int32x4_t buf0[128];
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8,
+ /*hm_stride=*/16);
+ shift_right_4_round_s32_x4(buf0, buf0, 128);
+
+ int32x4_t buf1[128];
+ transpose_arrays_s32_32x16(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/32, /*stride=*/16);
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm32_x4_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_highbd_txfm8_xn_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[1][3];
+ int bitrow = av1_fwd_cos_bit_row[1][3];
+
+ // Column-wise transform.
+ int32x4_t buf0[64];
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2,
+ /*hm_stride=*/32);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+
+ int32x4_t buf1[64];
+ transpose_arrays_s32_8x32(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/8, /*stride=*/32);
+}
+
+void av1_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm8_xn_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_highbd_txfm32_x4_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[3][1];
+ int bitrow = av1_fwd_cos_bit_row[3][1];
+
+ // Column-wise transform.
+ int32x4_t buf0[64];
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8,
+ /*hm_stride=*/8);
+ shift_right_2_round_s32_x4(buf0, buf0, 64);
+
+ int32x4_t buf1[64];
+ transpose_arrays_s32_32x8(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/32, /*stride=*/8);
+}
+#endif
+
+void av1_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ int bitcol = av1_fwd_cos_bit_col[0][1];
+ int bitrow = av1_fwd_cos_bit_row[0][1];
+ const fwd_transform_1d_col_neon col_txfm = col_highbd_txfm8_x4_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_rect_highbd_txfm4_xn_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
+
+ // Column-wise transform.
+ int32x4_t buf0[8];
+ col_txfm(input, buf0, stride, bitcol, lr_flip);
+ shift_right_1_round_s32_x4(buf0, buf0, 8);
+
+ int32x4_t buf1[8];
+ transpose_arrays_s32_4x8(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/4, /*stride=*/8);
+}
+
+void av1_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const int bitcol = av1_fwd_cos_bit_col[1][0];
+ const int bitrow = av1_fwd_cos_bit_row[1][0];
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm4_xn_arr[tx_type];
+ const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm8_x4_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
+
+ // Column-wise transform.
+ int32x4_t buf0[8];
+ if (lr_flip) {
+ col_txfm(input, buf0 + 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/2,
+ /*hm_stride=*/-4);
+ } else {
+ col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2,
+ /*hm_stride=*/4);
+ }
+
+ shift_right_1_round_s32_x4(buf0, buf0, 8);
+
+ int32x4_t buf1[8];
+ transpose_arrays_s32_8x4(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, coeff, bitrow, /*stride=*/4);
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const int bitcol = av1_fwd_cos_bit_col[2][4];
+ const int bitrow = av1_fwd_cos_bit_row[2][4];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 64);
+
+ // Column-wise transform.
+ int32x4_t buf0[256];
+ load_buffer_16x64(input, buf0, stride, lr_flip);
+ for (int i = 0; i < 4; i++) {
+ highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol);
+ }
+ shift_right_2_round_s32_x4(buf0, buf0, 256);
+
+ int32x4_t buf1[256];
+ transpose_arrays_s32_16x64(buf0, buf1);
+
+ // Row-wise transform.
+ highbd_fdct16_xn_neon(buf1, buf1, bitrow, 8);
+ store_buffer_16x32(buf1, coeff, /*stride=*/32);
+}
+
+void av1_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const int bitcol = av1_fwd_cos_bit_col[4][2];
+ const int bitrow = av1_fwd_cos_bit_row[4][2];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
+
+ // Column-wise transform.
+ int32x4_t buf0[256];
+ load_buffer_64x16(input, buf0, stride, lr_flip);
+ highbd_fdct16_xn_neon(buf0, buf0, bitcol, 16);
+ shift_right_4_round_s32_x4(buf0, buf0, 256);
+
+ int32x4_t buf1[256];
+ transpose_arrays_s32_64x16(buf0, buf1);
+
+ // Row-wise transform.
+ for (int i = 0; i < 4; i++) {
+ highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow);
+ }
+ store_buffer_64x16(buf1, coeff, /*stride=*/16);
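+  // 64-point coefficients beyond the first 32 are defined to be zero, so
+  // clear the second half of the output buffer.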
+ memset(coeff + 16 * 32, 0, 16 * 32 * sizeof(*coeff));
+}
+#endif
+
+void av1_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const fwd_transform_1d_col_many_neon col_txfm =
+ col_highbd_txfm32_x4_arr[tx_type];
+ const fwd_transform_1d_row_many_neon row_txfm =
+ row_highbd_txfm32_x4_arr[tx_type];
+
+ // Column-wise transform.
+ int32x4_t buf0[256];
+ col_txfm(input, buf0, stride, /*cos_bit=*/12, /*lr_flip=*/0, /*howmany=*/8,
+ /*hm_stride=*/32);
+ shift_right_4_round_s32_x4(buf0, buf0, 256);
+
+ int32x4_t buf1[256];
+ transpose_arrays_s32_32x32(buf0, buf1);
+
+ // Row-wise transform.
+ row_txfm(buf1, output, /*cos_bit=*/12, /*howmany=*/8, /*hm_stride=*/32,
+ /*stride=*/32);
+}
+
+void av1_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+
+ // Column-wise transform.
+ int32x4_t buf0[1024];
+ load_buffer_64x64(input, buf0, stride, 0);
+ for (int col = 0; col < 16; col++) {
+ highbd_fdct64_x4_neon(buf0 + col * 64, buf0 + col * 64, 13);
+ }
+ shift_right_2_round_s32_x4(buf0, buf0, 1024);
+
+ int32x4_t buf1[1024];
+ transpose_arrays_s32_64x64(buf0, buf1);
+
+ // Row-wise transform.
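+  // Only 32 of the 64 rows are transformed: AV1 discards 64-point
+  // coefficients beyond the first 32 in each dimension.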
+ for (int col = 0; col < 8; col++) {
+ highbd_fdct64_x4_neon(buf1 + col * 64, buf1 + col * 64, 10);
+ }
+ shift_right_2_round_s32_x4(buf1, buf1, 512);
+ store_buffer_64x32(buf1, output, /*stride=*/32);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c
new file mode 100644
index 0000000000..47b5f5cfb7
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/highbd_pickrst_neon.c
@@ -0,0 +1,1207 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdint.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/encoder/arm/neon/pickrst_neon.h"
+#include "av1/encoder/pickrst.h"
+
+static INLINE void highbd_calc_proj_params_r0_r1_neon(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
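+  // Separate 64-bit accumulators for the low and high halves of each 8-wide
+  // block of pixels; they are combined at the end.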
+ int64x2_t h00_lo = vdupq_n_s64(0);
+ int64x2_t h00_hi = vdupq_n_s64(0);
+ int64x2_t h11_lo = vdupq_n_s64(0);
+ int64x2_t h11_hi = vdupq_n_s64(0);
+ int64x2_t h01_lo = vdupq_n_s64(0);
+ int64x2_t h01_hi = vdupq_n_s64(0);
+ int64x2_t c0_lo = vdupq_n_s64(0);
+ int64x2_t c0_hi = vdupq_n_s64(0);
+ int64x2_t c1_lo = vdupq_n_s64(0);
+ int64x2_t c1_hi = vdupq_n_s64(0);
+
+ do {
+ const uint16_t *src_ptr = src;
+ const uint16_t *dat_ptr = dat;
+ int32_t *flt0_ptr = flt0;
+ int32_t *flt1_ptr = flt1;
+ int w = width;
+
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr);
+ uint16x8_t d = vld1q_u16(dat_ptr);
+ int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+ int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+ int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+ int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+ int32x4_t u_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t u_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t s_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
+ int32x4_t s_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
+ s_lo = vsubq_s32(s_lo, u_lo);
+ s_hi = vsubq_s32(s_hi, u_hi);
+
+ f0_lo = vsubq_s32(f0_lo, u_lo);
+ f0_hi = vsubq_s32(f0_hi, u_hi);
+ f1_lo = vsubq_s32(f1_lo, u_lo);
+ f1_hi = vsubq_s32(f1_hi, u_hi);
+
+ h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
+ h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+ h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+ h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+ h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+ h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+ h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+ h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+ h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo));
+ h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo));
+ h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi));
+ h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi));
+
+ c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
+ c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+ c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+ c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+ c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+ c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+ c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+ c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt0_ptr += 8;
+ flt1_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src += src_stride;
+ dat += dat_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+
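+  // Reduce the vector accumulators and normalize by the number of pixels.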
+ H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
+ H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size;
+ H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+ H[1][0] = H[0][1];
+ C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+ C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+static INLINE void highbd_calc_proj_params_r0_neon(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
+ int64x2_t h00_lo = vdupq_n_s64(0);
+ int64x2_t h00_hi = vdupq_n_s64(0);
+ int64x2_t c0_lo = vdupq_n_s64(0);
+ int64x2_t c0_hi = vdupq_n_s64(0);
+
+ do {
+ const uint16_t *src_ptr = src;
+ const uint16_t *dat_ptr = dat;
+ int32_t *flt0_ptr = flt0;
+ int w = width;
+
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr);
+ uint16x8_t d = vld1q_u16(dat_ptr);
+ int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+ int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+
+ int32x4_t u_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t u_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t s_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
+ int32x4_t s_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
+ s_lo = vsubq_s32(s_lo, u_lo);
+ s_hi = vsubq_s32(s_hi, u_hi);
+
+ f0_lo = vsubq_s32(f0_lo, u_lo);
+ f0_hi = vsubq_s32(f0_hi, u_hi);
+
+ h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
+ h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+ h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+ h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+ c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
+ c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+ c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+ c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt0_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src += src_stride;
+ dat += dat_stride;
+ flt0 += flt0_stride;
+ } while (--height != 0);
+
+ H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
+ C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+}
+
+static INLINE void highbd_calc_proj_params_r1_neon(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+
+ int64x2_t h11_lo = vdupq_n_s64(0);
+ int64x2_t h11_hi = vdupq_n_s64(0);
+ int64x2_t c1_lo = vdupq_n_s64(0);
+ int64x2_t c1_hi = vdupq_n_s64(0);
+
+ do {
+ const uint16_t *src_ptr = src;
+ const uint16_t *dat_ptr = dat;
+ int32_t *flt1_ptr = flt1;
+ int w = width;
+
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr);
+ uint16x8_t d = vld1q_u16(dat_ptr);
+ int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+ int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+ int32x4_t u_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t u_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(d), SGRPROJ_RST_BITS));
+ int32x4_t s_lo =
+ vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(s), SGRPROJ_RST_BITS));
+ int32x4_t s_hi = vreinterpretq_s32_u32(
+ vshll_n_u16(vget_high_u16(s), SGRPROJ_RST_BITS));
+ s_lo = vsubq_s32(s_lo, u_lo);
+ s_hi = vsubq_s32(s_hi, u_hi);
+
+ f1_lo = vsubq_s32(f1_lo, u_lo);
+ f1_hi = vsubq_s32(f1_hi, u_hi);
+
+ h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+ h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+ h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+ h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+ c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+ c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+ c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+ c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt1_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src += src_stride;
+ dat += dat_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+
+ H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+ C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+// The function calls 3 subfunctions for the following cases:
+// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
+// of C and H need to be computed.
+// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
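+//
+// H is the auto-covariance matrix and C the cross-correlation vector of the
+// least-squares normal equations H * x = C, whose solution gives the
+// self-guided projection coefficients.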
+void av1_calc_proj_params_high_bd_neon(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2],
+ const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ highbd_calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ highbd_calc_proj_params_r0_neon(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ highbd_calc_proj_params_r1_neon(src8, width, height, src_stride, dat8,
+ dat_stride, flt1, flt1_stride, H, C);
+ }
+}
+
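+// Byte-wise table lookup across two 128-bit vectors: gathers the 16-bit
+// elements of { a, b } selected by the byte indices in idx.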
+static INLINE int16x8_t tbl2q(int16x8_t a, int16x8_t b, uint8x16_t idx) {
+#if AOM_ARCH_AARCH64
+ uint8x16x2_t table = { { vreinterpretq_u8_s16(a), vreinterpretq_u8_s16(b) } };
+ return vreinterpretq_s16_u8(vqtbl2q_u8(table, idx));
+#else
+ uint8x8x4_t table = { { vreinterpret_u8_s16(vget_low_s16(a)),
+ vreinterpret_u8_s16(vget_high_s16(a)),
+ vreinterpret_u8_s16(vget_low_s16(b)),
+ vreinterpret_u8_s16(vget_high_s16(b)) } };
+ return vreinterpretq_s16_u8(vcombine_u8(vtbl4_u8(table, vget_low_u8(idx)),
+ vtbl4_u8(table, vget_high_u8(idx))));
+#endif
+}
+
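+// As tbl2q, but gathers across three source vectors.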
+static INLINE int16x8_t tbl3q(int16x8_t a, int16x8_t b, int16x8_t c,
+ uint8x16_t idx) {
+#if AOM_ARCH_AARCH64
+ uint8x16x3_t table = { { vreinterpretq_u8_s16(a), vreinterpretq_u8_s16(b),
+ vreinterpretq_u8_s16(c) } };
+ return vreinterpretq_s16_u8(vqtbl3q_u8(table, idx));
+#else
+  // This specific implementation only works for compute stats with
+  // wiener_win == 5.
+ uint8x8x3_t table_lo = { { vreinterpret_u8_s16(vget_low_s16(a)),
+ vreinterpret_u8_s16(vget_high_s16(a)),
+ vreinterpret_u8_s16(vget_low_s16(b)) } };
+ uint8x8x3_t table_hi = { { vreinterpret_u8_s16(vget_low_s16(b)),
+ vreinterpret_u8_s16(vget_high_s16(b)),
+ vreinterpret_u8_s16(vget_low_s16(c)) } };
+ return vreinterpretq_s16_u8(vcombine_u8(
+ vtbl3_u8(table_lo, vget_low_u8(idx)),
+ vtbl3_u8(table_hi, vsub_u8(vget_high_u8(idx), vdup_n_u8(16)))));
+#endif
+}
+
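+// Divide by 2^power, rounding towards zero: the bias added for negative
+// inputs makes the arithmetic right shift match C integer division.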
+static INLINE int64_t div_shift_s64(int64_t x, int power) {
+ return (x < 0 ? x + (1ll << power) - 1 : x) >> power;
+}
+
+// The M matrix is accumulated in a bitdepth-dependent number of steps to
+// speed up the computation. This function computes the final M from the
+// accumulated part (src_s64) and the residual part (src_s32). It also
+// transposes the result, as the output needs to be column-major.
+static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64,
+ const int32_t *src_s32, const int wiener_win,
+ int shift) {
+ for (int i = 0; i < wiener_win; ++i) {
+ for (int j = 0; j < wiener_win; ++j) {
+ int tr_idx = j * wiener_win + i;
+ *dst++ = div_shift_s64(src_s64[tr_idx] + src_s32[tr_idx], shift);
+ }
+ }
+}
+
+// The resulting H is a column-major matrix accumulated from the transposed
+// (column-major) samples of the filter kernel (5x5 or 7x7) viewed as a single
+// vector. For the 7x7 filter case: H(49x49) = [49 x 1] x [1 x 49]. This
+// function transforms back to the originally expected format (double
+// transpose). The H matrix is accumulated in a bitdepth-dependent number of
+// steps to speed up the computation. This function computes the final H from
+// the accumulated part (src_s64) and the residual part (src_s32). The computed
+// H is only an upper-triangle matrix; this function also fills in the lower
+// triangle of the resulting matrix.
+static INLINE void update_H(int64_t *dst, const int64_t *src_s64,
+ const int32_t *src_s32, const int wiener_win,
+ int stride, int shift) {
+ // For a simplified theoretical 3x3 case where `wiener_win` is 3 and
+ // `wiener_win2` is 9, the M matrix is 3x3:
+ // 0, 3, 6
+ // 1, 4, 7
+ // 2, 5, 8
+ //
+ // This is viewed as a vector to compute H (9x9) by vector outer product:
+ // 0, 3, 6, 1, 4, 7, 2, 5, 8
+ //
+ // Double transpose and upper triangle remapping for 3x3 -> 9x9 case:
+ // 0, 3, 6, 1, 4, 7, 2, 5, 8,
+ // 3, 30, 33, 12, 31, 34, 21, 32, 35,
+ // 6, 33, 60, 15, 42, 61, 24, 51, 62,
+ // 1, 12, 15, 10, 13, 16, 11, 14, 17,
+ // 4, 31, 42, 13, 40, 43, 22, 41, 44,
+ // 7, 34, 61, 16, 43, 70, 25, 52, 71,
+ // 2, 21, 24, 11, 22, 25, 20, 23, 26,
+ // 5, 32, 51, 14, 41, 52, 23, 50, 53,
+ // 8, 35, 62, 17, 44, 71, 26, 53, 80,
+ const int wiener_win2 = wiener_win * wiener_win;
+
+ // Loop through the indices according to the remapping above, along the
+ // columns:
+ // 0, wiener_win, 2 * wiener_win, ..., 1, 1 + 2 * wiener_win, ...,
+ // wiener_win - 1, wiener_win - 1 + wiener_win, ...
+ // For the 3x3 case `j` will be: 0, 3, 6, 1, 4, 7, 2, 5, 8.
+ for (int i = 0; i < wiener_win; ++i) {
+ for (int j = i; j < wiener_win2; j += wiener_win) {
+ // These two inner loops are the same as the two outer loops, but running
+ // along rows instead of columns. For the 3x3 case `l` will be:
+ // 0, 3, 6, 1, 4, 7, 2, 5, 8.
+ for (int k = 0; k < wiener_win; ++k) {
+ for (int l = k; l < wiener_win2; l += wiener_win) {
+          // The nominal double transpose indexing would be:
+          //   int idx = stride * j + l;
+          // However, we need the upper-right triangle, which is easy to
+          // obtain with some min/max operations.
+ int tr_idx = stride * AOMMIN(j, l) + AOMMAX(j, l);
+
+ // Resulting matrix is filled by combining the 64-bit and the residual
+ // 32-bit matrices together with scaling.
+ *dst++ = div_shift_s64(src_s64[tr_idx] + src_s32[tr_idx], shift);
+ }
+ }
+ }
+ }
+}
+
+// Load a 7x7 matrix into 7 128-bit vectors from consecutive rows; the last
+// load address is offset back by one element to prevent out-of-bounds access.
+static INLINE void load_and_pack_s16_8x7(int16x8_t dst[7], const int16_t *src,
+ ptrdiff_t stride) {
+ dst[0] = vld1q_s16(src);
+ src += stride;
+ dst[1] = vld1q_s16(src);
+ src += stride;
+ dst[2] = vld1q_s16(src);
+ src += stride;
+ dst[3] = vld1q_s16(src);
+ src += stride;
+ dst[4] = vld1q_s16(src);
+ src += stride;
+ dst[5] = vld1q_s16(src);
+ src += stride;
+ dst[6] = vld1q_s16(src - 1);
+}
+
+static INLINE void highbd_compute_stats_win7_neon(
+ const uint16_t *dgd, const uint16_t *src, int avg, int width, int height,
+ int dgd_stride, int src_stride, int64_t *M, int64_t *H,
+ aom_bit_depth_t bit_depth) {
+ // Matrix names are capitalized to help readability.
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
+ DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
+
+ memset(M_s32, 0, sizeof(M_s32));
+ memset(M_s64, 0, sizeof(M_s64));
+ memset(H_s32, 0, sizeof(H_s32));
+ memset(H_s64, 0, sizeof(H_s64));
+
+  // Look-up tables to create an 8x6 matrix with consecutive elements from two
+  // 7x7 matrices.
+ // clang-format off
+ DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7_highbd[192]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19,
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21,
+ 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23,
+ 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 10, 11, 12, 13, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19,
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23,
+ 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25,
+ 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
+ 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ };
+ // clang-format on
+
+ const uint8x16_t lut0 = vld1q_u8(shuffle_stats7_highbd + 0);
+ const uint8x16_t lut1 = vld1q_u8(shuffle_stats7_highbd + 16);
+ const uint8x16_t lut2 = vld1q_u8(shuffle_stats7_highbd + 32);
+ const uint8x16_t lut3 = vld1q_u8(shuffle_stats7_highbd + 48);
+ const uint8x16_t lut4 = vld1q_u8(shuffle_stats7_highbd + 64);
+ const uint8x16_t lut5 = vld1q_u8(shuffle_stats7_highbd + 80);
+ const uint8x16_t lut6 = vld1q_u8(shuffle_stats7_highbd + 96);
+ const uint8x16_t lut7 = vld1q_u8(shuffle_stats7_highbd + 112);
+ const uint8x16_t lut8 = vld1q_u8(shuffle_stats7_highbd + 128);
+ const uint8x16_t lut9 = vld1q_u8(shuffle_stats7_highbd + 144);
+ const uint8x16_t lut10 = vld1q_u8(shuffle_stats7_highbd + 160);
+ const uint8x16_t lut11 = vld1q_u8(shuffle_stats7_highbd + 176);
+
+  // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results
+  // in a 32-bit accumulator. Since 2 pixels are processed at a time, the
+  // iteration limit is halved to 32768/2048/128 for the stats computation.
+ const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1;
+ int acc_cnt = acc_cnt_max;
+ const int src_next = src_stride - width;
+ const int dgd_next = dgd_stride - width;
+ const int16x8_t avg_s16 = vdupq_n_s16(avg);
+
+ do {
+ int j = width;
+ while (j >= 2) {
+      // Load two adjacent, overlapping 7x7 matrices: an 8x7 matrix with the
+      // middle 6x7 elements shared.
+ int16x8_t dgd_rows[7];
+ load_and_pack_s16_8x7(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+ const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 6;
+ dgd += 2;
+
+ dgd_rows[0] = vsubq_s16(dgd_rows[0], avg_s16);
+ dgd_rows[1] = vsubq_s16(dgd_rows[1], avg_s16);
+ dgd_rows[2] = vsubq_s16(dgd_rows[2], avg_s16);
+ dgd_rows[3] = vsubq_s16(dgd_rows[3], avg_s16);
+ dgd_rows[4] = vsubq_s16(dgd_rows[4], avg_s16);
+ dgd_rows[5] = vsubq_s16(dgd_rows[5], avg_s16);
+ dgd_rows[6] = vsubq_s16(dgd_rows[6], avg_s16);
+
+      // Re-arrange the combined 8x7 matrix so that the 2 whole 7x7 matrices
+      // (1 for each of the 2 pixels) are separated into distinct
+      // int16x8_t[6] arrays holding 48 of the 49 (7x7) `dgd - avg` elements.
+      // Each DGD_AVG buffer stores 49 consecutive elements.
+ int16x8_t dgd_avg0[6];
+ int16x8_t dgd_avg1[6];
+
+ dgd_avg0[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ dgd_avg1[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut6);
+ dgd_avg0[1] = tbl2q(dgd_rows[1], dgd_rows[2], lut1);
+ dgd_avg1[1] = tbl2q(dgd_rows[1], dgd_rows[2], lut7);
+ dgd_avg0[2] = tbl2q(dgd_rows[2], dgd_rows[3], lut2);
+ dgd_avg1[2] = tbl2q(dgd_rows[2], dgd_rows[3], lut8);
+ dgd_avg0[3] = tbl2q(dgd_rows[3], dgd_rows[4], lut3);
+ dgd_avg1[3] = tbl2q(dgd_rows[3], dgd_rows[4], lut9);
+ dgd_avg0[4] = tbl2q(dgd_rows[4], dgd_rows[5], lut4);
+ dgd_avg1[4] = tbl2q(dgd_rows[4], dgd_rows[5], lut10);
+ dgd_avg0[5] = tbl2q(dgd_rows[5], dgd_rows[6], lut5);
+ dgd_avg1[5] = tbl2q(dgd_rows[5], dgd_rows[6], lut11);
+
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG1, dgd_avg1[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);
+ vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);
+ vst1q_s16(DGD_AVG1 + 24, dgd_avg1[3]);
+ vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
+ vst1q_s16(DGD_AVG1 + 32, dgd_avg1[4]);
+ vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);
+ vst1q_s16(DGD_AVG1 + 40, dgd_avg1[5]);
+
+ // The remaining last (49th) elements of `dgd - avg`.
+ DGD_AVG0[48] = dgd_ptr[6] - avg;
+ DGD_AVG1[48] = dgd_ptr[7] - avg;
+
+ // Accumulate into row-major variant of matrix M (cross-correlation) for 2
+ // output pixels at a time. M is of size 7 * 7. It needs to be filled such
+ // that multiplying one element from src with each element of a row of the
+ // wiener window will fill one column of M. However this is not very
+ // convenient in terms of memory access, as it means we do contiguous
+ // loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int src_avg1 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
+ update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
+ dgd_avg1[0]);
+ update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
+ dgd_avg1[1]);
+ update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
+ dgd_avg1[2]);
+ update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3],
+ dgd_avg1[3]);
+ update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4],
+ dgd_avg1[4]);
+ update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5],
+ dgd_avg1[5]);
+
+      // Last (49th) element of M_s32 can be computed as a scalar more
+      // efficiently for 2 output pixels.
+ M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1;
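+      // Taken together, the update_M_2pixels calls and the scalar update
+      // above are equivalent to (illustration only):
+      //   for (int k = 0; k < 49; ++k)
+      //     M_s32[k] += src_avg0 * DGD_AVG0[k] + src_avg1 * DGD_AVG1[k];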
+
+ // Start accumulating into row-major version of matrix H
+ // (auto-covariance), it expects the DGD_AVG[01] matrices to also be
+ // row-major. H is of size 49 * 49. It is filled by multiplying every pair
+ // of elements of the wiener window together (vector outer product). Since
+ // it is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices,
+ // so we accumulate into a row-major matrix H_s32. At the end of the
+ // algorithm a double transpose transformation will convert H_s32 back to
+ // the expected output layout.
+ update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[48 * WIENER_WIN2_ALIGN2 + 48] +=
+ DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48];
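+      // Taken together, the helper call and the scalar update above are
+      // equivalent to the following upper-triangle accumulation (illustration
+      // only; the helper works on 4x4 tiles, so each row's accumulation
+      // actually starts at a 4-element-aligned column):
+      //   for (int i = 0; i < 49; ++i)
+      //     for (int j = i; j < 49; ++j)
+      //       H_s32[i * WIENER_WIN2_ALIGN2 + j] +=
+      //           DGD_AVG0[i] * DGD_AVG0[j] + DGD_AVG1[i] * DGD_AVG1[j];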
+
+ // Accumulate into 64-bit after a bit depth dependent number of iterations
+ // to prevent overflow.
+ if (--acc_cnt == 0) {
+ acc_cnt = acc_cnt_max;
+
+ accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2);
+
+ // The widening accumulation is only needed for the upper triangle part
+ // of the matrix.
+ int64_t *lh = H_s64;
+ int32_t *lh32 = H_s32;
+ for (int k = 0; k < WIENER_WIN2; ++k) {
+        // The widening accumulation is only run for the relevant part of
+        // each row (the upper-right triangle), starting at a 4-element
+        // aligned column.
+ int k4 = k / 4 * 4;
+ accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4);
+
+ // Last element of the row is computed separately.
+ lh[48] += lh32[48];
+ lh32[48] = 0;
+
+ lh += WIENER_WIN2_ALIGN2;
+ lh32 += WIENER_WIN2_ALIGN2;
+ }
+ }
+
+ j -= 2;
+ }
+
+    // Computations for the remaining odd pixel in the row.
+ if (width & 1) {
+      // Load two adjacent, overlapping 7x7 matrices: an 8x7 matrix with the
+ // middle 6x7 elements being shared.
+ int16x8_t dgd_rows[7];
+ load_and_pack_s16_8x7(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+ const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 6;
+ ++dgd;
+
+ // Re-arrange the combined 8x7 matrix to have a whole 7x7 matrix tightly
+      // packed into an int16x8_t[6] array. This array contains 48 elements of
+ // the 49 (7x7). Compute `dgd - avg` for the whole buffer. The DGD_AVG
+ // buffer contains 49 consecutive elements.
+ int16x8_t dgd_avg0[6];
+
+ dgd_avg0[0] = vsubq_s16(tbl2q(dgd_rows[0], dgd_rows[1], lut0), avg_s16);
+ dgd_avg0[1] = vsubq_s16(tbl2q(dgd_rows[1], dgd_rows[2], lut1), avg_s16);
+ dgd_avg0[2] = vsubq_s16(tbl2q(dgd_rows[2], dgd_rows[3], lut2), avg_s16);
+ dgd_avg0[3] = vsubq_s16(tbl2q(dgd_rows[3], dgd_rows[4], lut3), avg_s16);
+ dgd_avg0[4] = vsubq_s16(tbl2q(dgd_rows[4], dgd_rows[5], lut4), avg_s16);
+ dgd_avg0[5] = vsubq_s16(tbl2q(dgd_rows[5], dgd_rows[6], lut5), avg_s16);
+
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);
+ vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
+ vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);
+
+ // The remaining last (49th) element of `dgd - avg`.
+ DGD_AVG0[48] = dgd_ptr[6] - avg;
+
+ // Accumulate into row-major order variant of matrix M (cross-correlation)
+ // for 1 output pixel at a time. M is of size 7 * 7. It needs to be filled
+ // such that multiplying one element from src with each element of a row
+ // of the wiener window will fill one column of M. However this is not
+ // very convenient in terms of memory access, as it means we do
+ // contiguous loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+ update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+ update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+ update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]);
+ update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]);
+ update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]);
+
+      // Last (49th) element of M_s32 can be computed as a scalar more
+      // efficiently for 1 output pixel.
+ M_s32[48] += DGD_AVG0[48] * src_avg0;
+
+ // Start accumulating into row-major order version of matrix H
+ // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
+ // H is of size 49 * 49. It is filled by multiplying every pair of
+ // elements of the wiener window together (vector outer product). Since it
+ // is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+      // column-major. It is not efficient to work with column-major matrices,
+      // so we accumulate into a row-major matrix H_s32. At the end of the
+      // algorithm a double transpose transformation will convert H_s32 back
+      // to the expected output layout.
+ update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48);
+
+ // The last element of the triangle of H_s32 matrix can be computed as
+      // a scalar more efficiently.
+ H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48];
+ }
+
+ src += src_next;
+ dgd += dgd_next;
+ } while (--height != 0);
+
+ int bit_depth_shift = bit_depth - AOM_BITS_8;
+
+ acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, bit_depth_shift);
+
+ update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, bit_depth_shift);
+}
+
+// Load a 6x5 matrix into 5 128-bit vectors from consecutive rows; the last
+// load address is offset backwards to prevent an out-of-bounds access.
+static INLINE void load_and_pack_s16_6x5(int16x8_t dst[5], const int16_t *src,
+ ptrdiff_t stride) {
+ dst[0] = vld1q_s16(src);
+ src += stride;
+ dst[1] = vld1q_s16(src);
+ src += stride;
+ dst[2] = vld1q_s16(src);
+ src += stride;
+ dst[3] = vld1q_s16(src);
+ src += stride;
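+  // The last row is loaded 3 elements back so that the 8-element load cannot
+  // read past the end of the buffer; the shuffle tables compensate by
+  // indexing this row's elements 3 positions later.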
+ dst[4] = vld1q_s16(src - 3);
+}
+
+static void highbd_compute_stats_win5_neon(const uint16_t *dgd,
+ const uint16_t *src, int avg,
+ int width, int height,
+ int dgd_stride, int src_stride,
+ int64_t *M, int64_t *H,
+ aom_bit_depth_t bit_depth) {
+ // Matrix names are capitalized to help readability.
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t,
+ H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
+ DECLARE_ALIGNED(64, int64_t,
+ H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
+
+ memset(M_s32, 0, sizeof(M_s32));
+ memset(M_s64, 0, sizeof(M_s64));
+ memset(H_s32, 0, sizeof(H_s32));
+ memset(H_s64, 0, sizeof(H_s64));
+
+  // Look-up tables to create an 8x3 matrix with consecutive elements from the
+  // 5x5 matrix.
+  // clang-format off
+  DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5_highbd[96]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 18, 19, 20, 21,
+ 6, 7, 8, 9, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 32, 33,
+ 2, 3, 4, 5, 6, 7, 8, 9, 22, 23, 24, 25, 26, 27, 28, 29,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23,
+ 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 34, 35,
+ 4, 5, 6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31,
+  };
+  // clang-format on
+
+ const uint8x16_t lut0 = vld1q_u8(shuffle_stats5_highbd + 0);
+ const uint8x16_t lut1 = vld1q_u8(shuffle_stats5_highbd + 16);
+ const uint8x16_t lut2 = vld1q_u8(shuffle_stats5_highbd + 32);
+ const uint8x16_t lut3 = vld1q_u8(shuffle_stats5_highbd + 48);
+ const uint8x16_t lut4 = vld1q_u8(shuffle_stats5_highbd + 64);
+ const uint8x16_t lut5 = vld1q_u8(shuffle_stats5_highbd + 80);
+
+ // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results
+ // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can
+ // be as high as 32768/2048/128 for the compute stats.
+ const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1;
+ int acc_cnt = acc_cnt_max;
+ const int src_next = src_stride - width;
+ const int dgd_next = dgd_stride - width;
+ const int16x8_t avg_s16 = vdupq_n_s16(avg);
+
+ do {
+ int j = width;
+ while (j >= 2) {
+ // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
+ // middle 4x5 elements being shared.
+ int16x8_t dgd_rows[5];
+ load_and_pack_s16_6x5(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+ const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 4;
+ dgd += 2;
+
+ dgd_rows[0] = vsubq_s16(dgd_rows[0], avg_s16);
+ dgd_rows[1] = vsubq_s16(dgd_rows[1], avg_s16);
+ dgd_rows[2] = vsubq_s16(dgd_rows[2], avg_s16);
+ dgd_rows[3] = vsubq_s16(dgd_rows[3], avg_s16);
+ dgd_rows[4] = vsubq_s16(dgd_rows[4], avg_s16);
+
+ // Re-arrange the combined 6x5 matrix to have the 2 whole 5x5 matrices (1
+ // for each of the 2 pixels) separated into distinct int16x8_t[3] arrays.
+ // These arrays contain 24 elements of the 25 (5x5). Compute `dgd - avg`
+ // for both buffers. Each DGD_AVG buffer contains 25 consecutive elements.
+ int16x8_t dgd_avg0[3];
+ int16x8_t dgd_avg1[3];
+
+ dgd_avg0[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ dgd_avg1[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut3);
+ dgd_avg0[1] = tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut1);
+ dgd_avg1[1] = tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut4);
+ dgd_avg0[2] = tbl2q(dgd_rows[3], dgd_rows[4], lut2);
+ dgd_avg1[2] = tbl2q(dgd_rows[3], dgd_rows[4], lut5);
+
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG1, dgd_avg1[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);
+
+ // The remaining last (25th) elements of `dgd - avg`.
+ DGD_AVG0[24] = dgd_ptr[4] - avg;
+ DGD_AVG1[24] = dgd_ptr[5] - avg;
+
+ // Accumulate into row-major variant of matrix M (cross-correlation) for 2
+ // output pixels at a time. M is of size 5 * 5. It needs to be filled such
+ // that multiplying one element from src with each element of a row of the
+ // wiener window will fill one column of M. However this is not very
+ // convenient in terms of memory access, as it means we do contiguous
+ // loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int src_avg1 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
+ update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
+ dgd_avg1[0]);
+ update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
+ dgd_avg1[1]);
+ update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
+ dgd_avg1[2]);
+
+      // Last (25th) element of M_s32 can be computed as a scalar more
+      // efficiently for 2 output pixels.
+ M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1;
+
+ // Start accumulating into row-major version of matrix H
+ // (auto-covariance), it expects the DGD_AVG[01] matrices to also be
+ // row-major. H is of size 25 * 25. It is filled by multiplying every pair
+ // of elements of the wiener window together (vector outer product). Since
+ // it is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices,
+ // so we accumulate into a row-major matrix H_s32. At the end of the
+ // algorithm a double transpose transformation will convert H_s32 back to
+ // the expected output layout.
+ update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+ DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24];
+
+ // Accumulate into 64-bit after a bit depth dependent number of iterations
+ // to prevent overflow.
+ if (--acc_cnt == 0) {
+ acc_cnt = acc_cnt_max;
+
+ accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2);
+
+ // The widening accumulation is only needed for the upper triangle part
+ // of the matrix.
+ int64_t *lh = H_s64;
+ int32_t *lh32 = H_s32;
+ for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) {
+        // The widening accumulation is only run for the relevant part of
+        // each row (the upper-right triangle), starting at a 4-element
+        // aligned column.
+ int k4 = k / 4 * 4;
+ accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4);
+
+ // Last element of the row is computed separately.
+ lh[24] += lh32[24];
+ lh32[24] = 0;
+
+ lh += WIENER_WIN2_REDUCED_ALIGN2;
+ lh32 += WIENER_WIN2_REDUCED_ALIGN2;
+ }
+ }
+
+ j -= 2;
+ }
+
+    // Computations for the remaining odd pixel in the row.
+ if (width & 1) {
+ // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
+ // middle 4x5 elements being shared.
+ int16x8_t dgd_rows[5];
+ load_and_pack_s16_6x5(dgd_rows, (const int16_t *)dgd, dgd_stride);
+
+ const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 4;
+ ++dgd;
+
+      // Re-arrange the combined 6x5 matrix to have a whole 5x5 matrix tightly
+      // packed into an int16x8_t[3] array. This array contains 24 elements of
+      // the 25 (5x5). Compute `dgd - avg` for the whole buffer. The DGD_AVG
+      // buffer contains 25 consecutive elements.
+ int16x8_t dgd_avg0[3];
+
+ dgd_avg0[0] = vsubq_s16(tbl2q(dgd_rows[0], dgd_rows[1], lut0), avg_s16);
+ dgd_avg0[1] = vsubq_s16(
+ tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut1), avg_s16);
+ dgd_avg0[2] = vsubq_s16(tbl2q(dgd_rows[3], dgd_rows[4], lut2), avg_s16);
+
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+
+ // The remaining last (25th) element of `dgd - avg`.
+ DGD_AVG0[24] = dgd_ptr[4] - avg;
+
+ // Accumulate into row-major order variant of matrix M (cross-correlation)
+ // for 1 output pixel at a time. M is of size 5 * 5. It needs to be filled
+ // such that multiplying one element from src with each element of a row
+ // of the wiener window will fill one column of M. However this is not
+ // very convenient in terms of memory access, as it means we do
+ // contiguous loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+ update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+ update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+
+      // Last (25th) element of M_s32 can be computed as a scalar more
+      // efficiently for 1 output pixel.
+ M_s32[24] += DGD_AVG0[24] * src_avg0;
+
+ // Start accumulating into row-major order version of matrix H
+ // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
+ // H is of size 25 * 25. It is filled by multiplying every pair of
+ // elements of the wiener window together (vector outer product). Since it
+ // is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices,
+ // so we accumulate into a row-major matrix H_s32. At the end of the
+ // algorithm a double transpose transformation will convert H_s32 back to
+ // the expected output layout.
+ update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+ DGD_AVG0[24] * DGD_AVG0[24];
+ }
+
+ src += src_next;
+ dgd += dgd_next;
+ } while (--height != 0);
+
+ int bit_depth_shift = bit_depth - AOM_BITS_8;
+
+ acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, bit_depth_shift);
+
+ update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2,
+ bit_depth_shift);
+}
+
+static uint16_t highbd_find_average_neon(const uint16_t *src, int src_stride,
+ int width, int height) {
+ assert(width > 0);
+ assert(height > 0);
+
+ uint64x2_t sum_u64 = vdupq_n_u64(0);
+ uint64_t sum = 0;
+
+ int h = height;
+ do {
+ uint32x4_t sum_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int w = width;
+ const uint16_t *row = src;
+ while (w >= 32) {
+ uint16x8_t s0 = vld1q_u16(row + 0);
+ uint16x8_t s1 = vld1q_u16(row + 8);
+ uint16x8_t s2 = vld1q_u16(row + 16);
+ uint16x8_t s3 = vld1q_u16(row + 24);
+
+ s0 = vaddq_u16(s0, s1);
+ s2 = vaddq_u16(s2, s3);
+ sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
+ sum_u32[1] = vpadalq_u16(sum_u32[1], s2);
+
+ row += 32;
+ w -= 32;
+ }
+
+ if (w >= 16) {
+ uint16x8_t s0 = vld1q_u16(row + 0);
+ uint16x8_t s1 = vld1q_u16(row + 8);
+
+ s0 = vaddq_u16(s0, s1);
+ sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
+
+ row += 16;
+ w -= 16;
+ }
+
+ if (w >= 8) {
+ uint16x8_t s0 = vld1q_u16(row);
+ sum_u32[1] = vpadalq_u16(sum_u32[1], s0);
+
+ row += 8;
+ w -= 8;
+ }
+
+ if (w >= 4) {
+ uint16x8_t s0 = vcombine_u16(vld1_u16(row), vdup_n_u16(0));
+ sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
+
+ row += 4;
+ w -= 4;
+ }
+
+ while (w-- > 0) {
+ sum += *row++;
+ }
+
+ sum_u64 = vpadalq_u32(sum_u64, vaddq_u32(sum_u32[0], sum_u32[1]));
+
+ src += src_stride;
+ } while (--h != 0);
+
+ return (uint16_t)((horizontal_add_u64x2(sum_u64) + sum) / (height * width));
+}
+
+void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8,
+ const uint8_t *src8, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ aom_bit_depth_t bit_depth) {
+ assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_REDUCED);
+
+ const int wiener_halfwin = wiener_win >> 1;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ const int height = v_end - v_start;
+ const int width = h_end - h_start;
+
+ const uint16_t *dgd_start = dgd + h_start + v_start * dgd_stride;
+ const uint16_t *src_start = src + h_start + v_start * src_stride;
+
+ // The wiener window will slide along the dgd frame, centered on each pixel.
+ // For the top left pixel and all the pixels on the side of the frame this
+ // means half of the window will be outside of the frame. As such the actual
+ // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
+ // wider and 2 * wiener_halfwin higher than the original dgd buffer.
+ const int vert_offset = v_start - wiener_halfwin;
+ const int horiz_offset = h_start - wiener_halfwin;
+ const uint16_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
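+  // e.g. for wiener_win == 7, wiener_halfwin == 3, so dgd_win starts 3 rows
+  // above and 3 columns to the left of dgd_start.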
+
+ uint16_t avg = highbd_find_average_neon(dgd_start, dgd_stride, width, height);
+
+ if (wiener_win == WIENER_WIN) {
+ highbd_compute_stats_win7_neon(dgd_win, src_start, avg, width, height,
+ dgd_stride, src_stride, M, H, bit_depth);
+ } else {
+ highbd_compute_stats_win5_neon(dgd_win, src_start, avg, width, height,
+ dgd_stride, src_stride, M, H, bit_depth);
+ }
+}
+
+int64_t av1_highbd_pixel_proj_error_neon(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ int64_t sse = 0;
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ int32x2_t xq_v = vld1_s32(xq);
+ int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), 4);
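+    // Each output pixel is reconstructed as
+    //   dat + ((xq[0] * (flt0 - (dat << SGRPROJ_RST_BITS)) +
+    //           xq[1] * (flt1 - (dat << SGRPROJ_RST_BITS)) + rounding) >>
+    //          (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS))
+    // The two (dat << SGRPROJ_RST_BITS) terms are folded into a single
+    // multiply by 16 * (xq[0] + xq[1]), which is what xq_sum_v holds.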
+
+ do {
+ int j = 0;
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+
+ do {
+ const uint16x8_t d = vld1q_u16(&dat[j]);
+ const uint16x8_t s = vld1q_u16(&src[j]);
+ int32x4_t flt0_0 = vld1q_s32(&flt0[j]);
+ int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]);
+ int32x4_t flt1_0 = vld1q_s32(&flt1[j]);
+ int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]);
+
+ int32x4_t d_s32_lo = vreinterpretq_s32_u32(
+ vmull_lane_u16(vget_low_u16(d), vreinterpret_u16_s32(xq_sum_v), 0));
+ int32x4_t d_s32_hi = vreinterpretq_s32_u32(vmull_lane_u16(
+ vget_high_u16(d), vreinterpret_u16_s32(xq_sum_v), 0));
+
+ int32x4_t v0 = vsubq_s32(
+ vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)),
+ d_s32_lo);
+ int32x4_t v1 = vsubq_s32(
+ vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)),
+ d_s32_hi);
+
+ v0 = vmlaq_lane_s32(v0, flt0_0, xq_v, 0);
+ v1 = vmlaq_lane_s32(v1, flt0_1, xq_v, 0);
+ v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1);
+ v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1);
+
+ int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+ int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+ int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1),
+ vreinterpretq_s16_u16(vsubq_u16(d, s)));
+ int16x4_t e_lo = vget_low_s16(e);
+ int16x4_t e_hi = vget_high_s16(e);
+
+ sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+ sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+
+ j += 8;
+ } while (j <= width - 8);
+
+ for (int k = j; k < width; ++k) {
+ int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
+ v += xq[0] * (flt0[k]) + xq[1] * (flt1[k]);
+ v -= (xq[1] + xq[0]) * (int32_t)(dat[k] << 4);
+ int32_t e =
+ (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
+ sse += ((int64_t)e * e);
+ }
+
+ sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+ } else if (params->r[0] > 0 || params->r[1] > 0) {
+ int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+ int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+ int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+ int32x4_t xq_v = vdupq_n_s32(xq_active);
+
+ do {
+ int j = 0;
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+ do {
+ const uint16x8_t d0 = vld1q_u16(&dat[j]);
+ const uint16x8_t s0 = vld1q_u16(&src[j]);
+ int32x4_t flt0_0 = vld1q_s32(&flt[j]);
+ int32x4_t flt0_1 = vld1q_s32(&flt[j + 4]);
+
+ uint16x8_t d_u16 = vshlq_n_u16(d0, 4);
+ int32x4_t sub0 = vreinterpretq_s32_u32(
+ vsubw_u16(vreinterpretq_u32_s32(flt0_0), vget_low_u16(d_u16)));
+ int32x4_t sub1 = vreinterpretq_s32_u32(
+ vsubw_u16(vreinterpretq_u32_s32(flt0_1), vget_high_u16(d_u16)));
+
+ int32x4_t v0 = vmlaq_s32(
+ vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub0,
+ xq_v);
+ int32x4_t v1 = vmlaq_s32(
+ vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub1,
+ xq_v);
+
+ int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+ int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+ int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1),
+ vreinterpretq_s16_u16(vsubq_u16(d0, s0)));
+ int16x4_t e_lo = vget_low_s16(e);
+ int16x4_t e_hi = vget_high_s16(e);
+
+ sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+ sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+
+ j += 8;
+ } while (j <= width - 8);
+
+ for (int k = j; k < width; ++k) {
+ int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
+        v += xq_active * (int32_t)((uint32_t)flt[k] - (uint16_t)(dat[k] << 4));
+ const int32_t e =
+ (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
+ sse += ((int64_t)e * e);
+ }
+
+ sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+ dat += dat_stride;
+ flt += flt_stride;
+ src += src_stride;
+ } while (--height != 0);
+ } else {
+ do {
+ int j = 0;
+
+ do {
+ const uint16x8_t d = vld1q_u16(&dat[j]);
+ const uint16x8_t s = vld1q_u16(&src[j]);
+
+ uint16x8_t diff = vabdq_u16(d, s);
+ uint16x4_t diff_lo = vget_low_u16(diff);
+ uint16x4_t diff_hi = vget_high_u16(diff);
+
+ uint32x4_t sqr_lo = vmull_u16(diff_lo, diff_lo);
+ uint32x4_t sqr_hi = vmull_u16(diff_hi, diff_hi);
+
+ sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_lo));
+ sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_hi));
+
+ j += 8;
+ } while (j <= width - 8);
+
+ for (int k = j; k < width; ++k) {
+ int32_t e = dat[k] - src[k];
+ sse += e * e;
+ }
+
+ dat += dat_stride;
+ src += src_stride;
+ } while (--height != 0);
+ }
+
+ sse += horizontal_add_s64x2(sse_s64);
+ return sse;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c
new file mode 100644
index 0000000000..4bf7ae6ce4
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/highbd_rdopt_neon.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+int64_t av1_highbd_block_error_neon(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz, int bd) {
+ uint64x2_t err_u64 = vdupq_n_u64(0);
+ int64x2_t ssz_s64 = vdupq_n_s64(0);
+
+ const int shift = 2 * (bd - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
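+  // e.g. bd == 10 gives shift == 4, so the final sums are rounded and scaled
+  // back to 8-bit-equivalent units: (x + 8) >> 4.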
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int32x4_t c = vld1q_s32(coeff);
+ const int32x4_t d = vld1q_s32(dqcoeff);
+
+ const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d));
+
+ err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff));
+ err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff));
+
+ ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c));
+ ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c));
+
+ coeff += 4;
+ dqcoeff += 4;
+ block_size -= 4;
+ } while (block_size != 0);
+
+ *ssz = (horizontal_add_s64x2(ssz_s64) + rounding) >> shift;
+ return ((int64_t)horizontal_add_u64x2(err_u64) + rounding) >> shift;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c b/third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
new file mode 100644
index 0000000000..88e176f56c
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/highbd_temporal_filter_neon.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void get_squared_error(
+ const uint16_t *frame1, const uint32_t stride1, const uint16_t *frame2,
+ const uint32_t stride2, const uint32_t block_width,
+ const uint32_t block_height, uint32_t *frame_sse,
+ const unsigned int dst_stride) {
+ uint32_t *dst = frame_sse;
+
+ uint32_t i = 0;
+ do {
+ uint32_t j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(frame1 + i * stride1 + j);
+ uint16x8_t r = vld1q_u16(frame2 + i * stride2 + j);
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ uint32x4_t sse_lo =
+ vmull_u16(vget_low_u16(abs_diff), vget_low_u16(abs_diff));
+ uint32x4_t sse_hi =
+ vmull_u16(vget_high_u16(abs_diff), vget_high_u16(abs_diff));
+
+ vst1q_u32(dst + j, sse_lo);
+ vst1q_u32(dst + j + 4, sse_hi);
+
+ j += 8;
+ } while (j < block_width);
+
+ dst += dst_stride;
+ i++;
+ } while (i < block_height);
+}
+
+static uint32_t sum_kernel5x5_mask_single(const uint32x4_t vsrc[5][2],
+ const uint32x4_t mask_single) {
+ uint32x4_t vsums = vmulq_u32(vsrc[0][0], mask_single);
+ vsums = vmlaq_u32(vsums, vsrc[1][0], mask_single);
+ vsums = vmlaq_u32(vsums, vsrc[2][0], mask_single);
+ vsums = vmlaq_u32(vsums, vsrc[3][0], mask_single);
+ vsums = vmlaq_u32(vsums, vsrc[4][0], mask_single);
+ return horizontal_add_u32x4(vsums);
+}
+
+static uint32x4_t sum_kernel5x5_mask_double(const uint32x4_t vsrc[5][2],
+ const uint32x4_t mask1,
+ const uint32x4_t mask2) {
+ uint32x4_t vsums = vmulq_u32(vsrc[0][0], mask1);
+ vsums = vmlaq_u32(vsums, vsrc[1][0], mask1);
+ vsums = vmlaq_u32(vsums, vsrc[2][0], mask1);
+ vsums = vmlaq_u32(vsums, vsrc[3][0], mask1);
+ vsums = vmlaq_u32(vsums, vsrc[4][0], mask1);
+ vsums = vmlaq_u32(vsums, vsrc[0][1], mask2);
+ vsums = vmlaq_u32(vsums, vsrc[1][1], mask2);
+ vsums = vmlaq_u32(vsums, vsrc[2][1], mask2);
+ vsums = vmlaq_u32(vsums, vsrc[3][1], mask2);
+ vsums = vmlaq_u32(vsums, vsrc[4][1], mask2);
+ return vsums;
+}
+
+static void highbd_apply_temporal_filter(
+ const uint16_t *frame, const unsigned int stride,
+ const uint32_t block_width, const uint32_t block_height,
+ const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+ const uint32_t *frame_sse, const uint32_t frame_sse_stride,
+ const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
+ const double decay_factor, const double inv_factor,
+ const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl,
+ int bd) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_neon[BH][BW] = { 0 };
+ const int half_window = TF_WINDOW_LENGTH >> 1;
+
+ uint32x4_t vsrc[5][2] = { 0 };
+ const uint32x4_t k0000 = vdupq_n_u32(0);
+ const uint32x4_t k1111 = vdupq_n_u32(1);
+ const uint32_t k3110_u32[4] = { 0, 1, 1, 3 };
+ const uint32_t k2111_u32[4] = { 1, 1, 1, 2 };
+ const uint32_t k1112_u32[4] = { 2, 1, 1, 1 };
+ const uint32_t k0113_u32[4] = { 3, 1, 1, 0 };
+ const uint32x4_t k3110 = vld1q_u32(k3110_u32);
+ const uint32x4_t k2111 = vld1q_u32(k2111_u32);
+ const uint32x4_t k1112 = vld1q_u32(k1112_u32);
+ const uint32x4_t k0113 = vld1q_u32(k0113_u32);
+
+ uint32x4_t vmask1[4], vmask2[4];
+ vmask1[0] = k1111;
+ vmask2[0] = vextq_u32(k1111, k0000, 3);
+ vmask1[1] = vextq_u32(k0000, k1111, 3);
+ vmask2[1] = vextq_u32(k1111, k0000, 2);
+ vmask1[2] = vextq_u32(k0000, k1111, 2);
+ vmask2[2] = vextq_u32(k1111, k0000, 1);
+ vmask1[3] = vextq_u32(k0000, k1111, 1);
+ vmask2[3] = k1111;
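+  // The k0113/k1112 and k2111/k3110 kernels fold the clamped out-of-range
+  // columns of the 5-tap window into the edge columns: e.g. for column 0 the
+  // window covers columns { -2, -1, 0, 1, 2 }, which clamp to
+  // { 0, 0, 0, 1, 2 }, hence the lane weights { 3, 1, 1, 0 }. The
+  // vmask1/vmask2 pairs are the all-ones interior window shifted across the
+  // two 4-lane halves.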
+
+ uint32_t row = 0;
+ do {
+ uint32_t col = 0;
+ const uint32_t *src = frame_sse + row * frame_sse_stride;
+ if (row == 0) {
+ vsrc[2][0] = vld1q_u32(src);
+ vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+
+ // First 2 rows of the 5x5 matrix are padded from the 1st.
+ vsrc[0][0] = vsrc[2][0];
+ vsrc[1][0] = vsrc[2][0];
+ } else if (row == 1) {
+ vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][0] = vld1q_u32(src);
+ vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+
+      // First row of the 5x5 matrix is padded from the 1st.
+ vsrc[0][0] = vsrc[1][0];
+ } else if (row == block_height - 2) {
+ vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][0] = vld1q_u32(src);
+ vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+
+      // Last row of the 5x5 matrix is padded from the one before.
+ vsrc[4][0] = vsrc[3][0];
+ } else if (row == block_height - 1) {
+ vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][0] = vld1q_u32(src);
+
+ // Last 2 rows of the 5x5 matrix are padded from the 3rd.
+ vsrc[3][0] = vsrc[2][0];
+ vsrc[4][0] = vsrc[2][0];
+ } else {
+ vsrc[0][0] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][0] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][0] = vld1q_u32(src);
+ vsrc[3][0] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][0] = vld1q_u32(src + 2 * frame_sse_stride);
+ }
+
+ acc_5x5_neon[row][0] = sum_kernel5x5_mask_single(vsrc, k0113);
+ acc_5x5_neon[row][1] = sum_kernel5x5_mask_single(vsrc, k1112);
+
+ col += 4;
+ src += 4;
+ // Traverse 4 columns at a time
+ do {
+ if (row == 0) {
+ vsrc[2][1] = vld1q_u32(src);
+ vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+
+ // First 2 rows of the 5x5 matrix are padded from the 1st.
+ vsrc[0][1] = vsrc[2][1];
+ vsrc[1][1] = vsrc[2][1];
+ } else if (row == 1) {
+ vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][1] = vld1q_u32(src);
+ vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+
+        // First row of the 5x5 matrix is padded from the 1st.
+ vsrc[0][1] = vsrc[1][1];
+ } else if (row == block_height - 2) {
+ vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][1] = vld1q_u32(src);
+ vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+
+        // Last row of the 5x5 matrix is padded from the one before.
+ vsrc[4][1] = vsrc[3][1];
+ } else if (row == block_height - 1) {
+ vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][1] = vld1q_u32(src);
+
+ // Last 2 rows of the 5x5 matrix are padded from the 3rd.
+ vsrc[3][1] = vsrc[2][1];
+ vsrc[4][1] = vsrc[2][1];
+ } else {
+ vsrc[0][1] = vld1q_u32(src - 2 * frame_sse_stride);
+ vsrc[1][1] = vld1q_u32(src - frame_sse_stride);
+ vsrc[2][1] = vld1q_u32(src);
+ vsrc[3][1] = vld1q_u32(src + frame_sse_stride);
+ vsrc[4][1] = vld1q_u32(src + 2 * frame_sse_stride);
+ }
+
+ uint32x4_t sums[4];
+ sums[0] = sum_kernel5x5_mask_double(vsrc, vmask1[0], vmask2[0]);
+ sums[1] = sum_kernel5x5_mask_double(vsrc, vmask1[1], vmask2[1]);
+ sums[2] = sum_kernel5x5_mask_double(vsrc, vmask1[2], vmask2[2]);
+ sums[3] = sum_kernel5x5_mask_double(vsrc, vmask1[3], vmask2[3]);
+ vst1q_u32(&acc_5x5_neon[row][col - half_window],
+ horizontal_add_4d_u32x4(sums));
+
+ vsrc[0][0] = vsrc[0][1];
+ vsrc[1][0] = vsrc[1][1];
+ vsrc[2][0] = vsrc[2][1];
+ vsrc[3][0] = vsrc[3][1];
+ vsrc[4][0] = vsrc[4][1];
+
+ src += 4;
+ col += 4;
+ } while (col <= block_width - 4);
+
+ acc_5x5_neon[row][col - half_window] =
+ sum_kernel5x5_mask_single(vsrc, k2111);
+ acc_5x5_neon[row][col - half_window + 1] =
+ sum_kernel5x5_mask_single(vsrc, k3110);
+
+ row++;
+ } while (row < block_height);
+
+ // Perform filtering.
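+  // The two branches below are identical except for the filter weight
+  // computation: tf_wgt_calc_lvl == 0 uses the exact exp(), otherwise the
+  // faster approx_exp() approximation is used.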
+ if (tf_wgt_calc_lvl == 0) {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ // Scale down the difference for high bit depth input.
+ const uint32_t diff_sse =
+ (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ } else {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ // Scale down the difference for high bit depth input.
+ const uint32_t diff_sse =
+ (acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j]) >> ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ }
+}
+
+void av1_highbd_apply_temporal_filter_neon(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred8, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+ assert(is_high_bitdepth);
+
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ // Frame information.
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint32_t frame_sse[BW * BH] = { 0 };
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride =
+ frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+ const uint32_t frame_sse_stride = plane_w;
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint16_t *ref =
+ CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+ // Filter U-plane and V-plane using Y-plane. This is because motion
+ // search is only done on Y-plane, so the information from Y-plane
+ // will be more accurate. The luma sse sum is reused in both chroma
+ // planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ const int ww = frame_sse_stride
+ << ss_x_shift; // Width of Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * ww + xx];
+ }
+ }
+ }
+ }
+ }
+ get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
+ plane_h, frame_sse, frame_sse_stride);
+
+ highbd_apply_temporal_filter(
+ pred + plane_offset, plane_w, plane_w, plane_h, subblock_mses,
+ accum + plane_offset, count + plane_offset, frame_sse, frame_sse_stride,
+ luma_sse_sum, inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl, mbd->bd);
+
+ plane_offset += plane_h * plane_w;
+ }
+}
+
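+// This appears to follow Immerkaer's fast noise variance estimate: pixels
+// with a small Sobel gradient magnitude are treated as flat, and the mean
+// absolute response of the 3x3 Laplacian kernel
+//   1 -2  1
+//  -2  4 -2
+//   1 -2  1
+// over those pixels is scaled by sqrt(pi / 2) / 6 to estimate the noise
+// standard deviation.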
+double av1_highbd_estimate_noise_from_single_plane_neon(const uint16_t *src,
+ int height, int width,
+ int stride,
+ int bitdepth,
+ int edge_thresh) {
+ uint16x8_t thresh = vdupq_n_u16(edge_thresh);
+ uint64x2_t acc = vdupq_n_u64(0);
+ // Count is in theory positive as it counts the number of times we're under
+ // the threshold, but it will be counted negatively in order to make best use
+ // of the vclt instruction, which sets every bit of a lane to 1 when the
+ // condition is true.
+ int32x4_t count = vdupq_n_s32(0);
+ int final_count = 0;
+ uint64_t final_acc = 0;
+ const uint16_t *src_start = src + stride + 1;
+ int h = 1;
+
+ do {
+ int w = 1;
+ const uint16_t *src_ptr = src_start;
+
+ while (w <= (width - 1) - 8) {
+ uint16x8_t mat[3][3];
+ mat[0][0] = vld1q_u16(src_ptr - stride - 1);
+ mat[0][1] = vld1q_u16(src_ptr - stride);
+ mat[0][2] = vld1q_u16(src_ptr - stride + 1);
+ mat[1][0] = vld1q_u16(src_ptr - 1);
+ mat[1][1] = vld1q_u16(src_ptr);
+ mat[1][2] = vld1q_u16(src_ptr + 1);
+ mat[2][0] = vld1q_u16(src_ptr + stride - 1);
+ mat[2][1] = vld1q_u16(src_ptr + stride);
+ mat[2][2] = vld1q_u16(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x8_t gxa = vaddq_u16(mat[0][0], mat[2][0]);
+ uint16x8_t gxb = vaddq_u16(mat[0][2], mat[2][2]);
+ gxa = vaddq_u16(gxa, vaddq_u16(mat[1][0], mat[1][0]));
+ gxb = vaddq_u16(gxb, vaddq_u16(mat[1][2], mat[1][2]));
+
+ uint16x8_t gya = vaddq_u16(mat[0][0], mat[0][2]);
+ uint16x8_t gyb = vaddq_u16(mat[2][0], mat[2][2]);
+ gya = vaddq_u16(gya, vaddq_u16(mat[0][1], mat[0][1]));
+ gyb = vaddq_u16(gyb, vaddq_u16(mat[2][1], mat[2][1]));
+
+ uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+ ga = vrshlq_u16(ga, vdupq_n_s16(8 - bitdepth));
+
+ // Check which vector elements are under the threshold. The Laplacian is
+      // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16 = vcltq_u16(ga, thresh);
+
+ uint16x8_t center = vshlq_n_u16(mat[1][1], 2);
+
+ uint16x8_t adj0 = vaddq_u16(mat[0][1], mat[2][1]);
+ uint16x8_t adj1 = vaddq_u16(mat[1][0], mat[1][2]);
+ uint16x8_t adj = vaddq_u16(adj0, adj1);
+ adj = vaddq_u16(adj, adj);
+
+ uint16x8_t diag0 = vaddq_u16(mat[0][0], mat[0][2]);
+ uint16x8_t diag1 = vaddq_u16(mat[2][0], mat[2][2]);
+ uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+ uint16x8_t v = vabdq_u16(vaddq_u16(center, diag), adj);
+ v = vandq_u16(vrshlq_u16(v, vdupq_n_s16(8 - bitdepth)), thresh_u16);
+ uint32x4_t v_u32 = vpaddlq_u16(v);
+
+ acc = vpadalq_u32(acc, v_u32);
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+ w += 8;
+ src_ptr += 8;
+ }
+
+ if (w <= (width - 1) - 4) {
+ uint16x4_t mat[3][3];
+ mat[0][0] = vld1_u16(src_ptr - stride - 1);
+ mat[0][1] = vld1_u16(src_ptr - stride);
+ mat[0][2] = vld1_u16(src_ptr - stride + 1);
+ mat[1][0] = vld1_u16(src_ptr - 1);
+ mat[1][1] = vld1_u16(src_ptr);
+ mat[1][2] = vld1_u16(src_ptr + 1);
+ mat[2][0] = vld1_u16(src_ptr + stride - 1);
+ mat[2][1] = vld1_u16(src_ptr + stride);
+ mat[2][2] = vld1_u16(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x4_t gxa = vadd_u16(mat[0][0], mat[2][0]);
+ uint16x4_t gxb = vadd_u16(mat[0][2], mat[2][2]);
+ gxa = vadd_u16(gxa, vadd_u16(mat[1][0], mat[1][0]));
+ gxb = vadd_u16(gxb, vadd_u16(mat[1][2], mat[1][2]));
+
+ uint16x4_t gya = vadd_u16(mat[0][0], mat[0][2]);
+ uint16x4_t gyb = vadd_u16(mat[2][0], mat[2][2]);
+ gya = vadd_u16(gya, vadd_u16(mat[0][1], mat[0][1]));
+ gyb = vadd_u16(gyb, vadd_u16(mat[2][1], mat[2][1]));
+
+ uint16x4_t ga = vaba_u16(vabd_u16(gxa, gxb), gya, gyb);
+ ga = vrshl_u16(ga, vdup_n_s16(8 - bitdepth));
+
+ // Check which vector elements are under the threshold. The Laplacian is
+      // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x4_t thresh_u16 = vclt_u16(ga, vget_low_u16(thresh));
+
+ uint16x4_t center = vshl_n_u16(mat[1][1], 2);
+
+ uint16x4_t adj0 = vadd_u16(mat[0][1], mat[2][1]);
+ uint16x4_t adj1 = vadd_u16(mat[1][0], mat[1][2]);
+ uint16x4_t adj = vadd_u16(adj0, adj1);
+ adj = vadd_u16(adj, adj);
+
+ uint16x4_t diag0 = vadd_u16(mat[0][0], mat[0][2]);
+ uint16x4_t diag1 = vadd_u16(mat[2][0], mat[2][2]);
+ uint16x4_t diag = vadd_u16(diag0, diag1);
+
+ uint16x4_t v = vabd_u16(vadd_u16(center, diag), adj);
+ v = vand_u16(v, thresh_u16);
+ uint32x4_t v_u32 = vmovl_u16(vrshl_u16(v, vdup_n_s16(8 - bitdepth)));
+
+ acc = vpadalq_u32(acc, v_u32);
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vaddw_s16(count, vreinterpret_s16_u16(thresh_u16));
+
+ w += 4;
+ src_ptr += 4;
+ }
+
+ while (w < width - 1) {
+ int mat[3][3];
+ mat[0][0] = *(src_ptr - stride - 1);
+ mat[0][1] = *(src_ptr - stride);
+ mat[0][2] = *(src_ptr - stride + 1);
+ mat[1][0] = *(src_ptr - 1);
+ mat[1][1] = *(src_ptr);
+ mat[1][2] = *(src_ptr + 1);
+ mat[2][0] = *(src_ptr + stride - 1);
+ mat[2][1] = *(src_ptr + stride);
+ mat[2][2] = *(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+ 2 * (mat[1][0] - mat[1][2]);
+ const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+ 2 * (mat[0][1] - mat[2][1]);
+ const int ga = ROUND_POWER_OF_TWO(abs(gx) + abs(gy), bitdepth - 8);
+
+ // Accumulate Laplacian.
+ const int is_under = ga < edge_thresh;
+ const int v = 4 * mat[1][1] -
+ 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+ (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+ final_acc += ROUND_POWER_OF_TWO(abs(v), bitdepth - 8) * is_under;
+ final_count += is_under;
+
+ src_ptr++;
+ w++;
+ }
+ src_start += stride;
+ } while (++h < height - 1);
+
+ // We counted negatively, so subtract to get the final value.
+ final_count -= horizontal_add_s32x4(count);
+ final_acc += horizontal_add_u64x2(acc);
+ return (final_count < 16)
+ ? -1.0
+ : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2;
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c b/third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c
new file mode 100644
index 0000000000..6cf835a243
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/txfm_common.h"
+
+static void transpose4x4(int16x8_t in[2], int16x4_t out[4]) {
+ int32x4x2_t b0 =
+ vtrnq_s32(vreinterpretq_s32_s16(in[0]), vreinterpretq_s32_s16(in[1]));
+ int16x4x2_t c0 = vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[0])),
+ vreinterpret_s16_s32(vget_high_s32(b0.val[0])));
+ int16x4x2_t c1 = vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[1])),
+ vreinterpret_s16_s32(vget_high_s32(b0.val[1])));
+ out[0] = c0.val[0];
+ out[1] = c0.val[1];
+ out[2] = c1.val[0];
+ out[3] = c1.val[1];
+}
+
+void av1_fwht4x4_neon(const int16_t *input, tran_low_t *output, int stride) {
+ // Load the 4x4 source in transposed form.
+ int16x4_t a1, b1, c1, d1, e;
+ a1 = vld1_s16(&input[0]);
+ b1 = vld1_s16(&input[1 * stride]);
+ c1 = vld1_s16(&input[2 * stride]);
+ d1 = vld1_s16(&input[3 * stride]);
+
+ // WHT.
+
+ // Row transforms.
+ a1 = vadd_s16(a1, b1);
+ d1 = vsub_s16(d1, c1);
+ e = vhsub_s16(a1, d1);
+ b1 = vsub_s16(e, b1);
+ c1 = vsub_s16(e, c1);
+ a1 = vsub_s16(a1, c1);
+ d1 = vadd_s16(d1, b1);
+
+ int16x8_t x[2];
+ x[0] = vcombine_s16(a1, c1);
+ x[1] = vcombine_s16(d1, b1);
+
+ int16x4_t s[4];
+ transpose4x4(x, s);
+
+ a1 = s[0];
+ b1 = s[1];
+ c1 = s[2];
+ d1 = s[3];
+
+ // Row transforms.
+ a1 = vadd_s16(a1, b1);
+ d1 = vsub_s16(d1, c1);
+ e = vhsub_s16(a1, d1);
+ b1 = vsub_s16(e, b1);
+ c1 = vsub_s16(e, c1);
+ a1 = vsub_s16(a1, c1);
+ d1 = vadd_s16(d1, b1);
+
+ vst1q_s32(&output[0], vshll_n_s16(a1, UNIT_QUANT_SHIFT));
+ vst1q_s32(&output[4], vshll_n_s16(c1, UNIT_QUANT_SHIFT));
+ vst1q_s32(&output[8], vshll_n_s16(d1, UNIT_QUANT_SHIFT));
+ vst1q_s32(&output[12], vshll_n_s16(b1, UNIT_QUANT_SHIFT));
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/ml_neon.c b/third_party/aom/av1/encoder/arm/neon/ml_neon.c
new file mode 100644
index 0000000000..be6ddfd763
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/ml_neon.c
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+
+static void nn_activate8(float32x4_t *out_h, float32x4_t *out_l,
+ const float32x4_t *zero) {
+ *out_h = vmaxq_f32(*out_h, *zero);
+ *out_l = vmaxq_f32(*out_l, *zero);
+}
+
+static void nn_activate4(float32x4_t *x, const float32x4_t *zero) {
+ *x = vmaxq_f32(*x, *zero);
+}
+
+#define CLAMP_0(x) (x = x > 0 ? x : 0)
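+// CLAMP_0 implements the ReLU activation: it clamps x to max(0, x) in place.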
+
+static void nn_propagate_8to1(int num_inputs, const float *const inputs,
+ const float *const weights,
+ const float *layer_bias,
+ float *const output_nodes, bool output_layer) {
+ const float32x4_t zero = vdupq_n_f32(0);
+ float32x4_t vadd = zero;
+ float total = *layer_bias;
+
+ for (int in = 0; in < num_inputs; in += 8) {
+ const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]);
+ const float32x4_t inputs_l = vld1q_f32(&inputs[in]);
+
+ const float32x4_t weights_h = vld1q_f32(&weights[in + 4]);
+ const float32x4_t weights_l = vld1q_f32(&weights[in]);
+
+ vadd = vmlaq_f32(vadd, inputs_h, weights_h);
+ vadd = vmlaq_f32(vadd, inputs_l, weights_l);
+ }
+#if AOM_ARCH_AARCH64
+ total += vaddvq_f32(vadd);
+#else
+ float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd));
+ vadd_lo = vpadd_f32(vadd_lo, vadd_lo);
+ total += vget_lane_f32(vadd_lo, 0);
+#endif
+
+ if (!output_layer) CLAMP_0(total);
+ *output_nodes = total;
+}
+
+static void nn_propagate_xto1(int num_inputs, const float *const inputs,
+ const float *const weights,
+ const float *layer_bias,
+ float *const output_nodes) {
+ float32x4_t vadd = vdupq_n_f32(0);
+
+ float total = *layer_bias;
+ int j = num_inputs;
+ int in = 0;
+ while (j > 7) {
+ const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]);
+ const float32x4_t inputs_l = vld1q_f32(&inputs[in]);
+
+ const float32x4_t weights_h = vld1q_f32(&weights[in + 4]);
+ const float32x4_t weights_l = vld1q_f32(&weights[in]);
+
+ vadd = vmlaq_f32(vadd, inputs_h, weights_h);
+ vadd = vmlaq_f32(vadd, inputs_l, weights_l);
+ in += 8;
+ j -= 8;
+ }
+
+#if AOM_ARCH_AARCH64
+ total += vaddvq_f32(vadd);
+
+#else
+ float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd));
+ vadd_lo = vpadd_f32(vadd_lo, vadd_lo);
+ total += vget_lane_f32(vadd_lo, 0);
+#endif
+ for (; in < num_inputs; in++) total += weights[in] * inputs[in];
+
+ *output_nodes = CLAMP_0(total);
+}
+
+static void nn_propagate_xsto1(int num_inputs, const float *const inputs,
+ const float *const weights,
+ const float *layer_bias,
+ float *const output_nodes) {
+ float total = *layer_bias;
+#if AOM_ARCH_AARCH64
+ const float32x4_t v_inputs = vld1q_f32(inputs);
+ const float32x4_t v_weights = vld1q_f32(weights);
+ const float32x4_t vadd = vmulq_f32(v_inputs, v_weights);
+ total += vaddvq_f32(vadd);
+ int in = 4;
+#else
+ int in = 0;
+#endif
+ for (; in < num_inputs; in++) total += weights[in] * inputs[in];
+
+ *output_nodes = CLAMP_0(total);
+}
+
+static void nn_propagate_4to1(int num_inputs, const float *const inputs,
+ const float *const weights,
+ const float *layer_bias,
+ float *const output_nodes, bool output_layer) {
+ const float32x4_t zero = vdupq_n_f32(0);
+ float32x4_t vadd = zero;
+ float total = *layer_bias;
+
+ for (int in = 0; in < num_inputs; in += 4) {
+ const float32x4_t v_inputs = vld1q_f32(&inputs[in]);
+ const float32x4_t v_weights = vld1q_f32(&weights[in]);
+ vadd = vmlaq_f32(vadd, v_inputs, v_weights);
+ }
+
+#if AOM_ARCH_AARCH64
+ total += vaddvq_f32(vadd);
+#else
+ float32x2_t vadd_lo = vadd_f32(vget_low_f32(vadd), vget_high_f32(vadd));
+ vadd_lo = vpadd_f32(vadd_lo, vadd_lo);
+ total += vget_lane_f32(vadd_lo, 0);
+#endif
+
+ if (!output_layer) CLAMP_0(total);
+ *output_nodes = total;
+}
+
+static void nn_propagate_4to4(int num_inputs, const float *const inputs,
+ const float *const weights,
+ const float *layer_bias,
+ float *const output_nodes, bool output_layer) {
+ float32x4_t outputs = vld1q_f32(layer_bias);
+ const float32x4_t zero = vdupq_n_f32(0);
+
+ float32x4_t mul0[2] = { zero, zero };
+ float32x4_t mul1[2] = { zero, zero };
+ for (int in = 0; in < num_inputs; in += 4) {
+ const float32x4_t v_input = vld1q_f32(&inputs[in]);
+
+ for (int i = 0; i < 2; i++) {
+ const float32x4_t weight0 = vld1q_f32(&weights[in + 2 * i * num_inputs]);
+ mul0[i] = vmlaq_f32(mul0[i], weight0, v_input);
+ const float32x4_t weight1 =
+ vld1q_f32(&weights[in + (2 * i + 1) * num_inputs]);
+ mul1[i] = vmlaq_f32(mul1[i], weight1, v_input);
+ }
+ }
+ for (int i = 0; i < 2; i++)
+#if AOM_ARCH_AARCH64
+ mul0[i] = vpaddq_f32(mul0[i], mul1[i]);
+ const float32x4_t hh = vpaddq_f32(mul0[0], mul0[1]);
+#else
+ mul0[i] =
+ vcombine_f32(vpadd_f32(vget_low_f32(mul0[i]), vget_high_f32(mul0[i])),
+ vpadd_f32(vget_low_f32(mul1[i]), vget_high_f32(mul1[i])));
+ const float32x4_t hh =
+ vcombine_f32(vpadd_f32(vget_low_f32(mul0[0]), vget_high_f32(mul0[0])),
+ vpadd_f32(vget_low_f32(mul0[1]), vget_high_f32(mul0[1])));
+#endif
+
+ outputs = vaddq_f32(outputs, hh);
+ if (!output_layer) nn_activate4(&outputs, &zero);
+ vst1q_f32(output_nodes, outputs);
+}
+
+static void nn_propagate_4to8(const int num_inputs, const float *const inputs,
+ const float *const weights,
+ const float *layer_bias,
+ float *const output_nodes, bool output_layer) {
+ float32x4_t out_h = vld1q_f32(&layer_bias[4]);
+ float32x4_t out_l = vld1q_f32(layer_bias);
+ const float32x4_t zero = vdupq_n_f32(0);
+ float32x4_t mul0[4] = { zero, zero, zero, zero };
+ float32x4_t mul1[4] = { zero, zero, zero, zero };
+
+ for (int in = 0; in < num_inputs; in += 4) {
+ const float32x4_t v_input = vld1q_f32(&inputs[in]);
+ for (int i = 0; i < 4; i++) {
+ const float32x4_t weight0 = vld1q_f32(&weights[in + 2 * i * num_inputs]);
+ const float32x4_t weight1 =
+ vld1q_f32(&weights[in + (2 * i + 1) * num_inputs]);
+ mul0[i] = vmlaq_f32(mul0[i], v_input, weight0);
+ mul1[i] = vmlaq_f32(mul1[i], v_input, weight1);
+ }
+ }
+ for (int i = 0; i < 4; i++)
+#if AOM_ARCH_AARCH64
+ mul0[i] = vpaddq_f32(mul0[i], mul1[i]);
+ const float32x4_t hh0 = vpaddq_f32(mul0[0], mul0[1]);
+ const float32x4_t hh1 = vpaddq_f32(mul0[2], mul0[3]);
+#else
+ mul0[i] =
+ vcombine_f32(vpadd_f32(vget_low_f32(mul0[i]), vget_high_f32(mul0[i])),
+ vpadd_f32(vget_low_f32(mul1[i]), vget_high_f32(mul1[i])));
+ const float32x4_t hh0 =
+ vcombine_f32(vpadd_f32(vget_low_f32(mul0[0]), vget_high_f32(mul0[0])),
+ vpadd_f32(vget_low_f32(mul0[1]), vget_high_f32(mul0[1])));
+ const float32x4_t hh1 =
+ vcombine_f32(vpadd_f32(vget_low_f32(mul0[2]), vget_high_f32(mul0[2])),
+ vpadd_f32(vget_low_f32(mul0[3]), vget_high_f32(mul0[3])));
+#endif
+
+ out_h = vaddq_f32(out_h, hh1);
+ out_l = vaddq_f32(out_l, hh0);
+
+ if (!output_layer) nn_activate8(&out_h, &out_l, &zero);
+ vst1q_f32(&output_nodes[4], out_h);
+ vst1q_f32(output_nodes, out_l);
+}
+
+static void nn_propagate_8to4(const int num_inputs, const float *const inputs,
+ const float *const weights,
+ const float *layer_bias,
+ float *const output_nodes, bool output_layer) {
+ float32x4_t outputs = vld1q_f32(layer_bias);
+ const float32x4_t zero = vdupq_n_f32(0);
+ float32x4_t add[4] = { zero, zero, zero, zero };
+ for (int in = 0; in < num_inputs; in += 8) {
+ const float32x4_t inputs_l = vld1q_f32(&inputs[in]);
+ const float32x4_t inputs_h = vld1q_f32(&inputs[in + 4]);
+
+ for (int i = 0; i < 4; i++) {
+ const float32x4_t weight_l = vld1q_f32(&weights[in + i * num_inputs]);
+ const float32x4_t weight_h = vld1q_f32(&weights[in + i * num_inputs + 4]);
+ add[i] = vmlaq_f32(add[i], inputs_l, weight_l);
+ add[i] = vmlaq_f32(add[i], inputs_h, weight_h);
+ }
+ }
+#if AOM_ARCH_AARCH64
+ const float32x4_t hadd_h = vpaddq_f32(add[2], add[3]);
+ const float32x4_t hadd_l = vpaddq_f32(add[0], add[1]);
+ const float32x4_t haddhadd = vpaddq_f32(hadd_l, hadd_h);
+#else
+ const float32x4_t hadd_h =
+ vcombine_f32(vpadd_f32(vget_low_f32(add[2]), vget_high_f32(add[2])),
+ vpadd_f32(vget_low_f32(add[3]), vget_high_f32(add[3])));
+ const float32x4_t hadd_l =
+ vcombine_f32(vpadd_f32(vget_low_f32(add[0]), vget_high_f32(add[0])),
+ vpadd_f32(vget_low_f32(add[1]), vget_high_f32(add[1])));
+ const float32x4_t haddhadd =
+ vcombine_f32(vpadd_f32(vget_low_f32(hadd_l), vget_high_f32(hadd_l)),
+ vpadd_f32(vget_low_f32(hadd_h), vget_high_f32(hadd_h)));
+#endif
+
+ outputs = vaddq_f32(outputs, haddhadd);
+ if (!output_layer) nn_activate4(&outputs, &zero);
+ vst1q_f32(output_nodes, outputs);
+}
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict_neon(const float *input_nodes,
+ const NN_CONFIG *const nn_config, int reduce_prec,
+ float *const output) {
+ float buf[2][NN_MAX_NODES_PER_LAYER];
+ int buf_index = 0;
+ int num_inputs = nn_config->num_inputs;
+  // Propagate the hidden layers; the final iteration handles the output layer.
+ for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) {
+ const float *layer_weights = nn_config->weights[layer];
+ const float *layer_bias = nn_config->bias[layer];
+ bool output_layer = (layer == nn_config->num_hidden_layers);
+ float *const output_nodes = output_layer ? output : buf[buf_index];
+ const int num_outputs = output_layer ? nn_config->num_outputs
+ : nn_config->num_hidden_nodes[layer];
+
+ if (num_inputs % 4 == 0 && num_outputs % 8 == 0) {
+ for (int out = 0; out < num_outputs; out += 8) {
+ nn_propagate_4to8(num_inputs, input_nodes,
+ &layer_weights[out * num_inputs], &layer_bias[out],
+ &output_nodes[out], output_layer);
+ }
+ } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out += 4) {
+ nn_propagate_8to4(num_inputs, input_nodes,
+ &layer_weights[out * num_inputs], &layer_bias[out],
+ &output_nodes[out], output_layer);
+ }
+ } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out += 4) {
+ nn_propagate_4to4(num_inputs, input_nodes,
+ &layer_weights[out * num_inputs], &layer_bias[out],
+ &output_nodes[out], output_layer);
+ }
+ } else if (num_inputs % 8 == 0) {
+ for (int out = 0; out < num_outputs; out++) {
+ nn_propagate_8to1(num_inputs, input_nodes,
+ &layer_weights[out * num_inputs], &layer_bias[out],
+ &output_nodes[out], output_layer);
+ }
+ } else if (num_inputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out++) {
+ nn_propagate_4to1(num_inputs, input_nodes,
+ &layer_weights[out * num_inputs], &layer_bias[out],
+ &output_nodes[out], output_layer);
+ }
+ } else if (num_inputs > 8) {
+ for (int out = 0; out < num_outputs; out++) {
+ nn_propagate_xto1(num_inputs, input_nodes,
+ &layer_weights[out * num_inputs], &layer_bias[out],
+ &output_nodes[out]);
+ }
+ } else if (num_inputs >= 4) {
+ for (int out = 0; out < num_outputs; out++) {
+ nn_propagate_xsto1(num_inputs, input_nodes,
+ &layer_weights[out * num_inputs], &layer_bias[out],
+ &output_nodes[out]);
+ }
+ } else {
+ for (int node = 0; node < num_outputs; ++node) {
+ float val = layer_bias[node];
+ for (int i = 0; i < num_inputs; ++i)
+ val += layer_weights[node * num_inputs + i] * input_nodes[i];
+ // ReLU as activation function.
+ val = val > 0.0f ? val : 0.0f; // Could use AOMMAX().
+ output_nodes[node] = val;
+ }
+ }
+ input_nodes = output_nodes;
+ num_inputs = num_outputs;
+ buf_index = 1 - buf_index;
+ }
+ if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
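+
+// Dispatch summary for av1_nn_predict_neon above (first matching case wins):
+//   inputs % 4 == 0 && outputs % 8 == 0 -> nn_propagate_4to8
+//   inputs % 8 == 0 && outputs % 4 == 0 -> nn_propagate_8to4
+//   inputs % 4 == 0 && outputs % 4 == 0 -> nn_propagate_4to4
+//   inputs % 8 == 0                     -> nn_propagate_8to1
+//   inputs % 4 == 0                     -> nn_propagate_4to1
+//   inputs > 8                          -> nn_propagate_xto1
+//   inputs >= 4                         -> nn_propagate_xsto1
+//   otherwise                           -> scalar fallback
+//
+// Illustrative usage sketch (editor's addition, not part of upstream aom).
+// The tiny 4-4-1 topology and the zeroed weights are hypothetical; only the
+// NN_CONFIG fields used by av1_nn_predict_neon above (declared in ml.h,
+// included at the top of this file) are relied upon.
+#if 0
+static float predict_example(const float feats[4]) {
+  static const float w_hidden[16] = { 0 };  // 4x4 hidden-layer weights.
+  static const float w_out[4] = { 0 };      // 4x1 output-layer weights.
+  static const float b_hidden[4] = { 0 };
+  static const float b_out[1] = { 0 };
+  NN_CONFIG cfg = { 0 };
+  cfg.num_inputs = 4;  // 4 inputs, 4 hidden nodes: takes the 4to4 path.
+  cfg.num_outputs = 1;  // 4 inputs, 1 output: takes the 4to1 path.
+  cfg.num_hidden_layers = 1;
+  cfg.num_hidden_nodes[0] = 4;
+  cfg.weights[0] = w_hidden;
+  cfg.weights[1] = w_out;
+  cfg.bias[0] = b_hidden;
+  cfg.bias[1] = b_out;
+  float out;
+  av1_nn_predict_neon(feats, &cfg, /*reduce_prec=*/0, &out);
+  return out;
+}
+#endif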
diff --git a/third_party/aom/av1/encoder/arm/neon/pickrst_neon.c b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.c
new file mode 100644
index 0000000000..2e4761f9a4
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.c
@@ -0,0 +1,1217 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/arm/neon/pickrst_neon.h"
+#include "av1/encoder/pickrst.h"
+
+int64_t av1_lowbd_pixel_proj_error_neon(
+ const uint8_t *src, int width, int height, int src_stride,
+ const uint8_t *dat, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int64_t sse = 0;
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ int32x2_t xq_v = vld1_s32(xq);
+ int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), SGRPROJ_RST_BITS);
+
+ do {
+ int j = 0;
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+
+ do {
+ const uint8x8_t d = vld1_u8(&dat[j]);
+ const uint8x8_t s = vld1_u8(&src[j]);
+ int32x4_t flt0_0 = vld1q_s32(&flt0[j]);
+ int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]);
+ int32x4_t flt1_0 = vld1q_s32(&flt1[j]);
+ int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]);
+
+ int32x4_t offset =
+ vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1));
+ int32x4_t v0 = vmlaq_lane_s32(offset, flt0_0, xq_v, 0);
+ int32x4_t v1 = vmlaq_lane_s32(offset, flt0_1, xq_v, 0);
+
+ v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1);
+ v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1);
+
+ int16x8_t d_s16 = vreinterpretq_s16_u16(vmovl_u8(d));
+ v0 = vmlsl_lane_s16(v0, vget_low_s16(d_s16),
+ vreinterpret_s16_s32(xq_sum_v), 0);
+ v1 = vmlsl_lane_s16(v1, vget_high_s16(d_s16),
+ vreinterpret_s16_s32(xq_sum_v), 0);
+
+ int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+ int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+ int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s));
+ int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff);
+ int16x4_t e_lo = vget_low_s16(e);
+ int16x4_t e_hi = vget_high_s16(e);
+
+ sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+ sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+
+ j += 8;
+ } while (j <= width - 8);
+
+ for (int k = j; k < width; ++k) {
+ int32_t u = (dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)) +
+ xq[0] * flt0[k] + xq[1] * flt1[k] - u * (xq[0] + xq[1]);
+ int32_t e =
+ (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
+ sse += e * e;
+ }
+
+ sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+ } else if (params->r[0] > 0 || params->r[1] > 0) {
+ int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+ int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+ int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+ int32x2_t xq_v = vdup_n_s32(xq_active);
+
+ do {
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+ int j = 0;
+
+ do {
+ const uint8x8_t d = vld1_u8(&dat[j]);
+ const uint8x8_t s = vld1_u8(&src[j]);
+ int32x4_t flt_0 = vld1q_s32(&flt[j]);
+ int32x4_t flt_1 = vld1q_s32(&flt[j + 4]);
+ int16x8_t d_s16 =
+ vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
+
+ int32x4_t sub_0 = vsubw_s16(flt_0, vget_low_s16(d_s16));
+ int32x4_t sub_1 = vsubw_s16(flt_1, vget_high_s16(d_s16));
+
+ int32x4_t offset =
+ vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1));
+ int32x4_t v0 = vmlaq_lane_s32(offset, sub_0, xq_v, 0);
+ int32x4_t v1 = vmlaq_lane_s32(offset, sub_1, xq_v, 0);
+
+ int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+ int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
+
+ int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s));
+ int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff);
+ int16x4_t e_lo = vget_low_s16(e);
+ int16x4_t e_hi = vget_high_s16(e);
+
+ sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
+ sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
+
+ j += 8;
+ } while (j <= width - 8);
+
+ for (int k = j; k < width; ++k) {
+ int32_t u = dat[k] << SGRPROJ_RST_BITS;
+ int32_t v = xq_active * (flt[k] - u);
+ int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) +
+ dat[k] - src[k];
+ sse += e * e;
+ }
+
+ sse_s64 = vpadalq_s32(sse_s64, sse_s32);
+
+ dat += dat_stride;
+ src += src_stride;
+ flt += flt_stride;
+ } while (--height != 0);
+ } else {
+ uint32x4_t sse_s32 = vdupq_n_u32(0);
+
+ do {
+ int j = 0;
+
+ do {
+ const uint8x16_t d = vld1q_u8(&dat[j]);
+ const uint8x16_t s = vld1q_u8(&src[j]);
+
+ uint8x16_t diff = vabdq_u8(d, s);
+ uint8x8_t diff_lo = vget_low_u8(diff);
+ uint8x8_t diff_hi = vget_high_u8(diff);
+
+ sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_lo, diff_lo));
+ sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_hi, diff_hi));
+
+ j += 16;
+ } while (j <= width - 16);
+
+ for (int k = j; k < width; ++k) {
+ int32_t e = dat[k] - src[k];
+ sse += e * e;
+ }
+
+ dat += dat_stride;
+ src += src_stride;
+ } while (--height != 0);
+
+ sse_s64 = vreinterpretq_s64_u64(vpaddlq_u32(sse_s32));
+ }
+
+ sse += horizontal_add_s64x2(sse_s64);
+ return sse;
+}
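+
+// Editor's note: per pixel, the kernel above computes
+//   e = ROUND_POWER_OF_TWO(xq[0] * (flt0 - u) + xq[1] * (flt1 - u),
+//                          SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) + dat - src
+// with u = dat << SGRPROJ_RST_BITS, and returns the sum of e^2 over the
+// block; the three branches only specialize which of flt0/flt1 contribute.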
+
+// We can accumulate up to 65536 8-bit multiplication results in a 32-bit
+// accumulator (2^32 / 2^16 = 65536). Since we process 2 pixels at a time,
+// the accumulator limit for the compute stats is 65536 / 2 = 32768.
+#define STAT_ACCUMULATOR_MAX 32768
+
+static INLINE uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) {
+#if AOM_ARCH_AARCH64
+ uint8x16x2_t table = { { a, b } };
+ return vqtbl2_u8(table, idx);
+#else
+ uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b),
+ vget_high_u8(b) } };
+ return vtbl4_u8(table, idx);
+#endif
+}
+
+static INLINE uint8x16_t tbl2q(uint8x16_t a, uint8x16_t b, uint8x16_t idx) {
+#if AOM_ARCH_AARCH64
+ uint8x16x2_t table = { { a, b } };
+ return vqtbl2q_u8(table, idx);
+#else
+ uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b),
+ vget_high_u8(b) } };
+ return vcombine_u8(vtbl4_u8(table, vget_low_u8(idx)),
+ vtbl4_u8(table, vget_high_u8(idx)));
+#endif
+}
+
+// The M matrix is accumulated in STAT_ACCUMULATOR_MAX steps to speed-up the
+// computation. This function computes the final M from the accumulated
+// (src_s64) and the residual parts (src_s32). It also transposes the result as
+// the output needs to be column-major.
+static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64,
+ const int32_t *src_s32, const int wiener_win,
+ int scale) {
+ for (int i = 0; i < wiener_win; ++i) {
+ for (int j = 0; j < wiener_win; ++j) {
+ int tr_idx = j * wiener_win + i;
+ *dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale;
+ }
+ }
+}
+
+// The resulting H is a column-major matrix accumulated from the transposed
+// (column-major) samples of the filter kernel (5x5 or 7x7) viewed as a single
+// vector. For the 7x7 filter case: H(49x49) = [49 x 1] x [1 x 49]. This
+// function transforms back to the originally expected format (double
+// transpose). The H matrix is accumulated in STAT_ACCUMULATOR_MAX steps to
+// speed-up the computation. This function computes the final H from the
+// accumulated (src_s64) and the residual parts (src_s32). The computed H is
+// only an upper triangle matrix, this function also fills the lower triangle of
+// the resulting matrix.
+static void update_H(int64_t *dst, const int64_t *src_s64,
+ const int32_t *src_s32, const int wiener_win, int stride,
+ int scale) {
+ // For a simplified theoretical 3x3 case where `wiener_win` is 3 and
+ // `wiener_win2` is 9, the M matrix is 3x3:
+ // 0, 3, 6
+ // 1, 4, 7
+ // 2, 5, 8
+ //
+ // This is viewed as a vector to compute H (9x9) by vector outer product:
+ // 0, 3, 6, 1, 4, 7, 2, 5, 8
+ //
+ // Double transpose and upper triangle remapping for 3x3 -> 9x9 case:
+ // 0, 3, 6, 1, 4, 7, 2, 5, 8,
+ // 3, 30, 33, 12, 31, 34, 21, 32, 35,
+ // 6, 33, 60, 15, 42, 61, 24, 51, 62,
+ // 1, 12, 15, 10, 13, 16, 11, 14, 17,
+ // 4, 31, 42, 13, 40, 43, 22, 41, 44,
+ // 7, 34, 61, 16, 43, 70, 25, 52, 71,
+ // 2, 21, 24, 11, 22, 25, 20, 23, 26,
+ // 5, 32, 51, 14, 41, 52, 23, 50, 53,
+ // 8, 35, 62, 17, 44, 71, 26, 53, 80,
+ const int wiener_win2 = wiener_win * wiener_win;
+
+ // Loop through the indices according to the remapping above, along the
+ // columns:
+ // 0, wiener_win, 2 * wiener_win, ..., 1, 1 + 2 * wiener_win, ...,
+ // wiener_win - 1, wiener_win - 1 + wiener_win, ...
+ // For the 3x3 case `j` will be: 0, 3, 6, 1, 4, 7, 2, 5, 8.
+ for (int i = 0; i < wiener_win; ++i) {
+ for (int j = i; j < wiener_win2; j += wiener_win) {
+ // These two inner loops are the same as the two outer loops, but running
+ // along rows instead of columns. For the 3x3 case `l` will be:
+ // 0, 3, 6, 1, 4, 7, 2, 5, 8.
+ for (int k = 0; k < wiener_win; ++k) {
+ for (int l = k; l < wiener_win2; l += wiener_win) {
+ // The nominal double transpose indexing would be:
+ // int idx = stride * j + l;
+ // However we need the upper-triangle indices, it is easy with some
+ // min/max operations.
+ int tr_idx = stride * AOMMIN(j, l) + AOMMAX(j, l);
+
+ // Resulting matrix is filled by combining the 64-bit and the residual
+ // 32-bit matrices together with scaling.
+ *dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale;
+ }
+ }
+ }
+ }
+}
+
+// Load a 7x7 matrix into three and a half 128-bit vectors from consecutive
+// rows; the last load address is offset to prevent out-of-bounds access.
+static INLINE void load_and_pack_u8_8x7(uint8x16_t dst[4], const uint8_t *src,
+ ptrdiff_t stride) {
+ dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
+ src += 2 * stride;
+ dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
+ src += 2 * stride;
+ dst[2] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
+ src += 2 * stride;
+ dst[3] = vcombine_u8(vld1_u8(src - 1), vdup_n_u8(0));
+}
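+// Editor's note: dst[3] above holds row 6 loaded from src - 1, so the lanes
+// of shuffle_stats7 below that index into it (values >= 16 in the third and
+// sixth table rows) are shifted by +1 to compensate for the offset.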
+
+static INLINE void compute_stats_win7_neon(const uint8_t *dgd,
+ const uint8_t *src, int width,
+ int height, int dgd_stride,
+ int src_stride, int avg, int64_t *M,
+ int64_t *H, int downsample_factor) {
+ // Matrix names are capitalized to help readability.
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
+ DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
+
+ memset(M_s32, 0, sizeof(M_s32));
+ memset(M_s64, 0, sizeof(M_s64));
+ memset(H_s32, 0, sizeof(H_s32));
+ memset(H_s64, 0, sizeof(H_s64));
+
+ // Look-up tables to create 8x6 matrix with consecutive elements from two 7x7
+ // matrices.
+ // clang-format off
+ DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7[96]) = {
+ 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17,
+ 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19,
+ 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22,
+ 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18,
+ 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20,
+ 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23,
+ };
+ // clang-format on
+
+ const uint8x16_t lut0 = vld1q_u8(shuffle_stats7 + 0);
+ const uint8x16_t lut1 = vld1q_u8(shuffle_stats7 + 16);
+ const uint8x16_t lut2 = vld1q_u8(shuffle_stats7 + 32);
+ const uint8x16_t lut3 = vld1q_u8(shuffle_stats7 + 48);
+ const uint8x16_t lut4 = vld1q_u8(shuffle_stats7 + 64);
+ const uint8x16_t lut5 = vld1q_u8(shuffle_stats7 + 80);
+
+ int acc_cnt = STAT_ACCUMULATOR_MAX;
+ const int src_next = downsample_factor * src_stride - width;
+ const int dgd_next = downsample_factor * dgd_stride - width;
+ const uint8x8_t avg_u8 = vdup_n_u8(avg);
+
+ do {
+ int j = width;
+ while (j >= 2) {
+      // Load two adjacent, overlapping 7x7 matrices: an 8x7 matrix with the
+ // middle 6x7 elements being shared.
+ uint8x16_t dgd_rows[4];
+ load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride);
+
+ const uint8_t *dgd_ptr = dgd + dgd_stride * 6;
+ dgd += 2;
+
+ // Re-arrange (and widen) the combined 8x7 matrix to have the 2 whole 7x7
+ // matrices (1 for each of the 2 pixels) separated into distinct
+ // int16x8_t[6] arrays. These arrays contain 48 elements of the 49 (7x7).
+ // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 49
+ // consecutive elements.
+ int16x8_t dgd_avg0[6];
+ int16x8_t dgd_avg1[6];
+ uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ uint8x16_t dgd_shuf3 = tbl2q(dgd_rows[0], dgd_rows[1], lut3);
+
+ dgd_avg0[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
+ dgd_avg1[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf3), avg_u8));
+ dgd_avg1[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf3), avg_u8));
+
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG1, dgd_avg1[0]);
+ vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);
+
+ uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1);
+ uint8x16_t dgd_shuf4 = tbl2q(dgd_rows[1], dgd_rows[2], lut4);
+
+ dgd_avg0[2] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8));
+ dgd_avg0[3] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8));
+ dgd_avg1[2] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf4), avg_u8));
+ dgd_avg1[3] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf4), avg_u8));
+
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);
+ vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);
+ vst1q_s16(DGD_AVG1 + 24, dgd_avg1[3]);
+
+ uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2);
+ uint8x16_t dgd_shuf5 = tbl2q(dgd_rows[2], dgd_rows[3], lut5);
+
+ dgd_avg0[4] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8));
+ dgd_avg0[5] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8));
+ dgd_avg1[4] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf5), avg_u8));
+ dgd_avg1[5] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf5), avg_u8));
+
+ vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
+ vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);
+ vst1q_s16(DGD_AVG1 + 32, dgd_avg1[4]);
+ vst1q_s16(DGD_AVG1 + 40, dgd_avg1[5]);
+
+ // The remaining last (49th) elements of `dgd - avg`.
+ DGD_AVG0[48] = dgd_ptr[6] - avg;
+ DGD_AVG1[48] = dgd_ptr[7] - avg;
+
+ // Accumulate into row-major variant of matrix M (cross-correlation) for 2
+ // output pixels at a time. M is of size 7 * 7. It needs to be filled such
+ // that multiplying one element from src with each element of a row of the
+ // wiener window will fill one column of M. However this is not very
+ // convenient in terms of memory access, as it means we do contiguous
+ // loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int src_avg1 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
+ update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
+ dgd_avg1[0]);
+ update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
+ dgd_avg1[1]);
+ update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
+ dgd_avg1[2]);
+ update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3],
+ dgd_avg1[3]);
+ update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4],
+ dgd_avg1[4]);
+ update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5],
+ dgd_avg1[5]);
+
+ // Last (49th) element of M_s32 can be computed as scalar more efficiently
+ // for 2 output pixels.
+ M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1;
+
+ // Start accumulating into row-major version of matrix H
+ // (auto-covariance), it expects the DGD_AVG[01] matrices to also be
+ // row-major. H is of size 49 * 49. It is filled by multiplying every pair
+ // of elements of the wiener window together (vector outer product). Since
+ // it is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices,
+ // so we accumulate into a row-major matrix H_s32. At the end of the
+ // algorithm a double transpose transformation will convert H_s32 back to
+ // the expected output layout.
+ update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[48 * WIENER_WIN2_ALIGN2 + 48] +=
+ DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48];
+
+ // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent
+ // overflow.
+ if (--acc_cnt == 0) {
+ acc_cnt = STAT_ACCUMULATOR_MAX;
+
+ accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2);
+
+ // The widening accumulation is only needed for the upper triangle part
+ // of the matrix.
+ int64_t *lh = H_s64;
+ int32_t *lh32 = H_s32;
+ for (int k = 0; k < WIENER_WIN2; ++k) {
+ // The widening accumulation is only run for the relevant parts
+ // (upper-right triangle) in a row 4-element aligned.
+ int k4 = k / 4 * 4;
+ accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4);
+
+ // Last element of the row is computed separately.
+ lh[48] += lh32[48];
+ lh32[48] = 0;
+
+ lh += WIENER_WIN2_ALIGN2;
+ lh32 += WIENER_WIN2_ALIGN2;
+ }
+ }
+
+ j -= 2;
+ }
+
+ // Computations for odd pixel in the row.
+ if (width & 1) {
+      // Load two adjacent, overlapping 7x7 matrices: an 8x7 matrix with the
+ // middle 6x7 elements being shared.
+ uint8x16_t dgd_rows[4];
+ load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride);
+
+ const uint8_t *dgd_ptr = dgd + dgd_stride * 6;
+ ++dgd;
+
+ // Re-arrange (and widen) the combined 8x7 matrix to have a whole 7x7
+ // matrix tightly packed into a int16x8_t[6] array. This array contains
+ // 48 elements of the 49 (7x7). Compute `dgd - avg` for the whole buffer.
+ // The DGD_AVG buffer contains 49 consecutive elements.
+ int16x8_t dgd_avg0[6];
+ uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ dgd_avg0[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
+ vst1q_s16(DGD_AVG0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+
+ uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1);
+ dgd_avg0[2] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8));
+ dgd_avg0[3] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8));
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);
+
+ uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2);
+ dgd_avg0[4] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8));
+ dgd_avg0[5] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8));
+ vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
+ vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);
+
+ // The remaining last (49th) element of `dgd - avg`.
+ DGD_AVG0[48] = dgd_ptr[6] - avg;
+
+ // Accumulate into row-major order variant of matrix M (cross-correlation)
+ // for 1 output pixel at a time. M is of size 7 * 7. It needs to be filled
+ // such that multiplying one element from src with each element of a row
+ // of the wiener window will fill one column of M. However this is not
+ // very convenient in terms of memory access, as it means we do
+ // contiguous loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+ update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+ update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+ update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]);
+ update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]);
+ update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]);
+
+ // Last (49th) element of M_s32 can be computed as scalar more efficiently
+ // for 1 output pixel.
+ M_s32[48] += DGD_AVG0[48] * src_avg0;
+
+ // Start accumulating into row-major order version of matrix H
+ // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
+ // H is of size 49 * 49. It is filled by multiplying every pair of
+ // elements of the wiener window together (vector outer product). Since it
+ // is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+      // column-major. It is not efficient to work with column-major matrices,
+      // so we
+ // accumulate into a row-major matrix H_s32. At the end of the algorithm a
+ // double transpose transformation will convert H_s32 back to the expected
+ // output layout.
+ update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48);
+
+ // The last element of the triangle of H_s32 matrix can be computed as
+ // scalar more efficiently.
+ H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48];
+ }
+
+ src += src_next;
+ dgd += dgd_next;
+ } while (--height != 0);
+
+ acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, downsample_factor);
+
+ update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, downsample_factor);
+}
+
+// Load a 5x5 matrix into two and a half 128-bit vectors from consecutive
+// rows; the last load address is offset to prevent out-of-bounds access.
+static INLINE void load_and_pack_u8_6x5(uint8x16_t dst[3], const uint8_t *src,
+ ptrdiff_t stride) {
+ dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
+ src += 2 * stride;
+ dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
+ src += 2 * stride;
+ dst[2] = vcombine_u8(vld1_u8(src - 3), vdup_n_u8(0));
+}
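+// Editor's note: dst[2] above holds row 4 loaded from src - 3, so the lanes
+// of shuffle_stats5 below that index into it (values >= 16 in the third
+// table row) are shifted by +3 to compensate for the offset.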
+
+static INLINE void compute_stats_win5_neon(const uint8_t *dgd,
+ const uint8_t *src, int width,
+ int height, int dgd_stride,
+ int src_stride, int avg, int64_t *M,
+ int64_t *H, int downsample_factor) {
+ // Matrix names are capitalized to help readability.
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]);
+ DECLARE_ALIGNED(64, int32_t,
+ H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
+ DECLARE_ALIGNED(64, int64_t,
+ H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
+
+ memset(M_s32, 0, sizeof(M_s32));
+ memset(M_s64, 0, sizeof(M_s64));
+ memset(H_s32, 0, sizeof(H_s32));
+ memset(H_s64, 0, sizeof(H_s64));
+
+ // Look-up tables to create 8x3 matrix with consecutive elements from two 5x5
+ // matrices.
+ // clang-format off
+ DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5[48]) = {
+ 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 24,
+ 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 17, 18, 19, 20, 21, 25,
+ 9, 10, 11, 12, 19, 20, 21, 22, 10, 11, 12, 13, 20, 21, 22, 23,
+ };
+ // clang-format on
+
+ const uint8x16_t lut0 = vld1q_u8(shuffle_stats5 + 0);
+ const uint8x16_t lut1 = vld1q_u8(shuffle_stats5 + 16);
+ const uint8x16_t lut2 = vld1q_u8(shuffle_stats5 + 32);
+
+ int acc_cnt = STAT_ACCUMULATOR_MAX;
+ const int src_next = downsample_factor * src_stride - width;
+ const int dgd_next = downsample_factor * dgd_stride - width;
+ const uint8x8_t avg_u8 = vdup_n_u8(avg);
+
+ do {
+ int j = width;
+ while (j >= 2) {
+ // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
+ // middle 4x5 elements being shared.
+ uint8x16_t dgd_rows[3];
+ load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride);
+
+ const uint8_t *dgd_ptr = dgd + dgd_stride * 4;
+ dgd += 2;
+
+ // Re-arrange (and widen) the combined 6x5 matrix to have the 2 whole 5x5
+ // matrices (1 for each of the 2 pixels) separated into distinct
+ // int16x8_t[3] arrays. These arrays contain 24 elements of the 25 (5x5).
+ // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 25
+ // consecutive elements.
+ int16x8_t dgd_avg0[3];
+ int16x8_t dgd_avg1[3];
+ uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[0], dgd_rows[1], lut1);
+ uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[1], dgd_rows[2], lut2);
+
+ dgd_avg0[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[2] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8));
+ dgd_avg1[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8));
+ dgd_avg1[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8));
+ dgd_avg1[2] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8));
+
+ vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+ vst1q_s16(DGD_AVG1 + 0, dgd_avg1[0]);
+ vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);
+ vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);
+
+ // The remaining last (25th) elements of `dgd - avg`.
+ DGD_AVG0[24] = dgd_ptr[4] - avg;
+ DGD_AVG1[24] = dgd_ptr[5] - avg;
+
+ // Accumulate into row-major variant of matrix M (cross-correlation) for 2
+ // output pixels at a time. M is of size 5 * 5. It needs to be filled such
+ // that multiplying one element from src with each element of a row of the
+ // wiener window will fill one column of M. However this is not very
+ // convenient in terms of memory access, as it means we do contiguous
+ // loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int src_avg1 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
+ update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
+ dgd_avg1[0]);
+ update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
+ dgd_avg1[1]);
+ update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
+ dgd_avg1[2]);
+
+ // Last (25th) element of M_s32 can be computed as scalar more efficiently
+ // for 2 output pixels.
+ M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1;
+
+ // Start accumulating into row-major version of matrix H
+ // (auto-covariance), it expects the DGD_AVG[01] matrices to also be
+ // row-major. H is of size 25 * 25. It is filled by multiplying every pair
+ // of elements of the wiener window together (vector outer product). Since
+ // it is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+ // column-major. It is not efficient to work with column-major matrices,
+ // so we accumulate into a row-major matrix H_s32. At the end of the
+ // algorithm a double transpose transformation will convert H_s32 back to
+ // the expected output layout.
+ update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+ DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24];
+
+ // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent
+ // overflow.
+ if (--acc_cnt == 0) {
+ acc_cnt = STAT_ACCUMULATOR_MAX;
+
+ accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2);
+
+ // The widening accumulation is only needed for the upper triangle part
+ // of the matrix.
+ int64_t *lh = H_s64;
+ int32_t *lh32 = H_s32;
+ for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) {
+ // The widening accumulation is only run for the relevant parts
+ // (upper-right triangle) in a row 4-element aligned.
+ int k4 = k / 4 * 4;
+ accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4);
+
+ // Last element of the row is computed separately.
+ lh[24] += lh32[24];
+ lh32[24] = 0;
+
+ lh += WIENER_WIN2_REDUCED_ALIGN2;
+ lh32 += WIENER_WIN2_REDUCED_ALIGN2;
+ }
+ }
+
+ j -= 2;
+ }
+
+ // Computations for odd pixel in the row.
+ if (width & 1) {
+ // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
+ // middle 4x5 elements being shared.
+ uint8x16_t dgd_rows[3];
+ load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride);
+
+ const uint8_t *dgd_ptr = dgd + dgd_stride * 4;
+ ++dgd;
+
+ // Re-arrange (and widen) the combined 6x5 matrix to have a whole 5x5
+ // matrix tightly packed into a int16x8_t[3] array. This array contains
+ // 24 elements of the 25 (5x5). Compute `dgd - avg` for the whole buffer.
+ // The DGD_AVG buffer contains 25 consecutive elements.
+ int16x8_t dgd_avg0[3];
+ uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
+ uint8x8_t dgd_shuf1 = tbl2(dgd_rows[1], dgd_rows[2], vget_low_u8(lut2));
+
+ dgd_avg0[0] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[1] =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
+ dgd_avg0[2] = vreinterpretq_s16_u16(vsubl_u8(dgd_shuf1, avg_u8));
+
+ vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]);
+ vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
+ vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
+
+ // The remaining last (25th) element of `dgd - avg`.
+ DGD_AVG0[24] = dgd_ptr[4] - avg;
+
+ // Accumulate into row-major order variant of matrix M (cross-correlation)
+ // for 1 output pixel at a time. M is of size 5 * 5. It needs to be filled
+ // such that multiplying one element from src with each element of a row
+ // of the wiener window will fill one column of M. However this is not
+ // very convenient in terms of memory access, as it means we do
+ // contiguous loads of dgd but strided stores to M. As a result, we use an
+ // intermediate matrix M_s32 which is instead filled such that one row of
+ // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
+ // then transposed to return M.
+ int src_avg0 = *src++ - avg;
+ int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
+ update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
+ update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
+ update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
+
+ // Last (25th) element of M_s32 can be computed as scalar more efficiently
+ // for 1 output pixel.
+ M_s32[24] += DGD_AVG0[24] * src_avg0;
+
+ // Start accumulating into row-major order version of matrix H
+ // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
+ // H is of size 25 * 25. It is filled by multiplying every pair of
+ // elements of the wiener window together (vector outer product). Since it
+ // is a symmetric matrix, we only compute the upper-right triangle, and
+ // then copy it down to the lower-left later. The upper triangle is
+ // covered by 4x4 tiles. The original algorithm assumes the M matrix is
+ // column-major and the resulting H matrix is also expected to be
+      // column-major. It is not efficient to work with column-major matrices,
+      // so we
+ // accumulate into a row-major matrix H_s32. At the end of the algorithm a
+ // double transpose transformation will convert H_s32 back to the expected
+ // output layout.
+ update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24);
+
+ // The last element of the triangle of H_s32 matrix can be computed as a
+ // scalar more efficiently.
+ H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
+ DGD_AVG0[24] * DGD_AVG0[24];
+ }
+
+ src += src_next;
+ dgd += dgd_next;
+ } while (--height != 0);
+
+ acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, downsample_factor);
+
+ update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2,
+ downsample_factor);
+}
+
+static INLINE uint8_t find_average_neon(const uint8_t *src, int src_stride,
+ int width, int height) {
+ uint64_t sum = 0;
+
+ if (width >= 16) {
+ int h = 0;
+    // We can accumulate up to 257 8-bit values in a 16-bit element
+    // (255 * 257 = 65535 <= UINT16_MAX). Since each 16-bit vector has 8
+    // elements, we can process up to int(257*8/width) rows before we need to
+    // widen to 32-bit vector elements.
+ int h_overflow = 257 * 8 / width;
+ int h_limit = height > h_overflow ? h_overflow : height;
+ uint32x4_t avg_u32 = vdupq_n_u32(0);
+ do {
+ uint16x8_t avg_u16 = vdupq_n_u16(0);
+ do {
+ int j = width;
+ const uint8_t *src_ptr = src;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ avg_u16 = vpadalq_u8(avg_u16, s);
+ j -= 16;
+ src_ptr += 16;
+ } while (j >= 16);
+ if (j >= 8) {
+ uint8x8_t s = vld1_u8(src_ptr);
+ avg_u16 = vaddw_u8(avg_u16, s);
+ j -= 8;
+ src_ptr += 8;
+ }
+ // Scalar tail case.
+ while (j > 0) {
+ sum += src[width - j];
+ j--;
+ }
+ src += src_stride;
+ } while (++h < h_limit);
+ avg_u32 = vpadalq_u16(avg_u32, avg_u16);
+
+ h_limit += h_overflow;
+      h_limit = height > h_limit ? h_limit : height;
+ } while (h < height);
+ return (uint8_t)((horizontal_long_add_u32x4(avg_u32) + sum) /
+ (width * height));
+ }
+ if (width >= 8) {
+ int h = 0;
+    // We can accumulate up to 257 8-bit values in a 16-bit element
+    // (255 * 257 = 65535 <= UINT16_MAX). Since each 16-bit vector has 4
+    // elements, we can process up to int(257*4/width) rows before we need to
+    // widen to 32-bit vector elements.
+ int h_overflow = 257 * 4 / width;
+ int h_limit = height > h_overflow ? h_overflow : height;
+ uint32x2_t avg_u32 = vdup_n_u32(0);
+ do {
+ uint16x4_t avg_u16 = vdup_n_u16(0);
+ do {
+ int j = width;
+ const uint8_t *src_ptr = src;
+ uint8x8_t s = vld1_u8(src_ptr);
+ avg_u16 = vpadal_u8(avg_u16, s);
+ j -= 8;
+ src_ptr += 8;
+ // Scalar tail case.
+ while (j > 0) {
+ sum += src[width - j];
+ j--;
+ }
+ src += src_stride;
+ } while (++h < h_limit);
+ avg_u32 = vpadal_u16(avg_u32, avg_u16);
+
+ h_limit += h_overflow;
+      h_limit = height > h_limit ? h_limit : height;
+ } while (h < height);
+ return (uint8_t)((horizontal_long_add_u32x2(avg_u32) + sum) /
+ (width * height));
+ }
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ sum += src[j];
+ } while (++j < width);
+ src += src_stride;
+ } while (--i != 0);
+ return (uint8_t)(sum / (width * height));
+}
+
+void av1_compute_stats_neon(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int16_t *dgd_avg,
+ int16_t *src_avg, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+ assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4);
+ (void)dgd_avg;
+ (void)src_avg;
+
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = wiener_win >> 1;
+ const int width = h_end - h_start;
+ const int height = v_end - v_start;
+
+ const uint8_t *dgd_start = dgd + h_start + v_start * dgd_stride;
+ const uint8_t *src_start = src + h_start + v_start * src_stride;
+
+ // The wiener window will slide along the dgd frame, centered on each pixel.
+ // For the top left pixel and all the pixels on the side of the frame this
+ // means half of the window will be outside of the frame. As such the actual
+ // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
+ // wider and 2 * wiener_halfwin higher than the original dgd buffer.
+ const int vert_offset = v_start - wiener_halfwin;
+ const int horiz_offset = h_start - wiener_halfwin;
+ const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
+
+ uint8_t avg = find_average_neon(dgd_start, dgd_stride, width, height);
+
+ // Since the height is not necessarily a multiple of the downsample factor,
+ // the last line of src will be scaled according to how many rows remain.
+ int downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+
+ int downsampled_height = height / downsample_factor;
+ int downsample_remainder = height % downsample_factor;
+
+ memset(M, 0, wiener_win2 * sizeof(*M));
+ memset(H, 0, wiener_win2 * wiener_win2 * sizeof(*H));
+
+ // Calculate the M and H matrices for the normal and downsampled cases.
+ if (downsampled_height > 0) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_neon(dgd_win, src_start, width, downsampled_height,
+ dgd_stride, src_stride, avg, M, H,
+ downsample_factor);
+ } else {
+ compute_stats_win5_neon(dgd_win, src_start, width, downsampled_height,
+ dgd_stride, src_stride, avg, M, H,
+ downsample_factor);
+ }
+ }
+
+ // Accumulate the remaining last rows in the downsampled case.
+ if (downsample_remainder > 0) {
+ int remainder_offset = height - downsample_remainder;
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_neon(dgd_win + remainder_offset * dgd_stride,
+ src_start + remainder_offset * src_stride, width,
+ 1, dgd_stride, src_stride, avg, M, H,
+ downsample_remainder);
+ } else {
+ compute_stats_win5_neon(dgd_win + remainder_offset * dgd_stride,
+ src_start + remainder_offset * src_stride, width,
+ 1, dgd_stride, src_stride, avg, M, H,
+ downsample_remainder);
+ }
+ }
+}
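+// Editor's note, a worked example for the downsampling above: with
+// height = 18 and WIENER_STATS_DOWNSAMPLE_FACTOR = 4, downsampled_height = 4
+// rows are accumulated with scale 4, and the remainder pass accumulates 1
+// more row with scale downsample_remainder = 2.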
+
+static INLINE void calc_proj_params_r0_r1_neon(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+
+ int64x2_t h00_lo = vdupq_n_s64(0);
+ int64x2_t h00_hi = vdupq_n_s64(0);
+ int64x2_t h11_lo = vdupq_n_s64(0);
+ int64x2_t h11_hi = vdupq_n_s64(0);
+ int64x2_t h01_lo = vdupq_n_s64(0);
+ int64x2_t h01_hi = vdupq_n_s64(0);
+ int64x2_t c0_lo = vdupq_n_s64(0);
+ int64x2_t c0_hi = vdupq_n_s64(0);
+ int64x2_t c1_lo = vdupq_n_s64(0);
+ int64x2_t c1_hi = vdupq_n_s64(0);
+
+ do {
+ const uint8_t *src_ptr = src8;
+ const uint8_t *dat_ptr = dat8;
+ int32_t *flt0_ptr = flt0;
+ int32_t *flt1_ptr = flt1;
+ int w = width;
+
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t d = vld1_u8(dat_ptr);
+ int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+ int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+ int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+ int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+ int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
+ int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
+
+ int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
+ int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
+ f0_lo = vsubw_s16(f0_lo, vget_low_s16(u));
+ f0_hi = vsubw_s16(f0_hi, vget_high_s16(u));
+ f1_lo = vsubw_s16(f1_lo, vget_low_s16(u));
+ f1_hi = vsubw_s16(f1_hi, vget_high_s16(u));
+
+ h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
+ h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+ h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+ h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+ h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+ h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+ h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+ h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+ h01_lo = vmlal_s32(h01_lo, vget_low_s32(f0_lo), vget_low_s32(f1_lo));
+ h01_lo = vmlal_s32(h01_lo, vget_high_s32(f0_lo), vget_high_s32(f1_lo));
+ h01_hi = vmlal_s32(h01_hi, vget_low_s32(f0_hi), vget_low_s32(f1_hi));
+ h01_hi = vmlal_s32(h01_hi, vget_high_s32(f0_hi), vget_high_s32(f1_hi));
+
+ c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
+ c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+ c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+ c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+ c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+ c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+ c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+ c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt0_ptr += 8;
+ flt1_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src8 += src_stride;
+ dat8 += dat_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+
+ H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
+ H[0][1] = horizontal_add_s64x2(vaddq_s64(h01_lo, h01_hi)) / size;
+ H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+ H[1][0] = H[0][1];
+ C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+ C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+static INLINE void calc_proj_params_r0_neon(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+
+ int64x2_t h00_lo = vdupq_n_s64(0);
+ int64x2_t h00_hi = vdupq_n_s64(0);
+ int64x2_t c0_lo = vdupq_n_s64(0);
+ int64x2_t c0_hi = vdupq_n_s64(0);
+
+ do {
+ const uint8_t *src_ptr = src8;
+ const uint8_t *dat_ptr = dat8;
+ int32_t *flt0_ptr = flt0;
+ int w = width;
+
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t d = vld1_u8(dat_ptr);
+ int32x4_t f0_lo = vld1q_s32(flt0_ptr);
+ int32x4_t f0_hi = vld1q_s32(flt0_ptr + 4);
+
+ int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
+ int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
+
+ int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
+ int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
+ f0_lo = vsubw_s16(f0_lo, vget_low_s16(u));
+ f0_hi = vsubw_s16(f0_hi, vget_high_s16(u));
+
+ h00_lo = vmlal_s32(h00_lo, vget_low_s32(f0_lo), vget_low_s32(f0_lo));
+ h00_lo = vmlal_s32(h00_lo, vget_high_s32(f0_lo), vget_high_s32(f0_lo));
+ h00_hi = vmlal_s32(h00_hi, vget_low_s32(f0_hi), vget_low_s32(f0_hi));
+ h00_hi = vmlal_s32(h00_hi, vget_high_s32(f0_hi), vget_high_s32(f0_hi));
+
+ c0_lo = vmlal_s32(c0_lo, vget_low_s32(f0_lo), vget_low_s32(s_lo));
+ c0_lo = vmlal_s32(c0_lo, vget_high_s32(f0_lo), vget_high_s32(s_lo));
+ c0_hi = vmlal_s32(c0_hi, vget_low_s32(f0_hi), vget_low_s32(s_hi));
+ c0_hi = vmlal_s32(c0_hi, vget_high_s32(f0_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt0_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src8 += src_stride;
+ dat8 += dat_stride;
+ flt0 += flt0_stride;
+ } while (--height != 0);
+
+ H[0][0] = horizontal_add_s64x2(vaddq_s64(h00_lo, h00_hi)) / size;
+ C[0] = horizontal_add_s64x2(vaddq_s64(c0_lo, c0_hi)) / size;
+}
+
+static INLINE void calc_proj_params_r1_neon(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ assert(width % 8 == 0);
+ const int size = width * height;
+
+ int64x2_t h11_lo = vdupq_n_s64(0);
+ int64x2_t h11_hi = vdupq_n_s64(0);
+ int64x2_t c1_lo = vdupq_n_s64(0);
+ int64x2_t c1_hi = vdupq_n_s64(0);
+
+ do {
+ const uint8_t *src_ptr = src8;
+ const uint8_t *dat_ptr = dat8;
+ int32_t *flt1_ptr = flt1;
+ int w = width;
+
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t d = vld1_u8(dat_ptr);
+ int32x4_t f1_lo = vld1q_s32(flt1_ptr);
+ int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
+
+ int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
+ int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
+
+ int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
+ int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
+ f1_lo = vsubw_s16(f1_lo, vget_low_s16(u));
+ f1_hi = vsubw_s16(f1_hi, vget_high_s16(u));
+
+ h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
+ h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
+ h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
+ h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
+
+ c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
+ c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
+ c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
+ c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
+
+ src_ptr += 8;
+ dat_ptr += 8;
+ flt1_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src8 += src_stride;
+ dat8 += dat_stride;
+ flt1 += flt1_stride;
+ } while (--height != 0);
+
+ H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
+ C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
+}
+
+// The function calls 3 subfunctions for the following cases:
+// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
+// of C and H need to be computed.
+// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
+void av1_calc_proj_params_neon(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2],
+ int64_t C[2], const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, dat_stride,
+ flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, dat_stride,
+ flt1, flt1_stride, H, C);
+ }
+}
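+
+// Editor's note: the accumulated H and C above form the normal equations of
+// a least-squares fit. With u = dat << SGRPROJ_RST_BITS, f_k = flt_k - u and
+// s = (src << SGRPROJ_RST_BITS) - u, the code computes
+// H[i][j] = sum(f_i * f_j) / size and C[i] = sum(f_i * s) / size, which the
+// caller then solves as H * xq = C for the projection coefficients.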
diff --git a/third_party/aom/av1/encoder/arm/neon/pickrst_neon.h b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.h
new file mode 100644
index 0000000000..7b72dca34d
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/pickrst_neon.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
+
+#include <arm_neon.h>
+
+#include "av1/common/restoration.h"
+
+// Aligned sizes for Wiener filters.
+#define WIENER_WIN2_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2, 2)
+#define WIENER_WIN2_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2, 3)
+#define WIENER_WIN2_REDUCED ((WIENER_WIN_REDUCED) * (WIENER_WIN_REDUCED))
+#define WIENER_WIN2_REDUCED_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 2)
+#define WIENER_WIN2_REDUCED_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 3)
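+// For reference: WIENER_WIN2 = 49, giving WIENER_WIN2_ALIGN2 = 52 and
+// WIENER_WIN2_ALIGN3 = 56; WIENER_WIN2_REDUCED = 25, giving
+// WIENER_WIN2_REDUCED_ALIGN2 = 28 and WIENER_WIN2_REDUCED_ALIGN3 = 32.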
+
+// Compute 8 values of M (cross correlation) for a single source pixel and
+// accumulate.
+static INLINE void update_M_1pixel(int32_t *M_s32, int16x4_t src_avg,
+ int16x8_t dgd_avg) {
+ int32x4_t lo = vld1q_s32(M_s32 + 0);
+ int32x4_t hi = vld1q_s32(M_s32 + 4);
+
+ lo = vmlal_s16(lo, vget_low_s16(dgd_avg), src_avg);
+ hi = vmlal_s16(hi, vget_high_s16(dgd_avg), src_avg);
+
+ vst1q_s32(M_s32 + 0, lo);
+ vst1q_s32(M_s32 + 4, hi);
+}
+
+// Compute 8 values of M (cross correlation) for two source pixels and
+// accumulate.
+static INLINE void update_M_2pixels(int32_t *M_s32, int16x4_t src_avg0,
+ int16x4_t src_avg1, int16x8_t dgd_avg0,
+ int16x8_t dgd_avg1) {
+ int32x4_t lo = vld1q_s32(M_s32 + 0);
+ int32x4_t hi = vld1q_s32(M_s32 + 4);
+
+ lo = vmlal_s16(lo, vget_low_s16(dgd_avg0), src_avg0);
+ hi = vmlal_s16(hi, vget_high_s16(dgd_avg0), src_avg0);
+ lo = vmlal_s16(lo, vget_low_s16(dgd_avg1), src_avg1);
+ hi = vmlal_s16(hi, vget_high_s16(dgd_avg1), src_avg1);
+
+ vst1q_s32(M_s32 + 0, lo);
+ vst1q_s32(M_s32 + 4, hi);
+}
+
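+// Accumulate the auto-covariance matrix H from one pixel's window of
+// dgd_avg samples: H[i][j] += d[i] * d[j], updating the upper-triangular
+// blocks 4 rows by 4 columns at a time.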
+static INLINE void update_H_1pixel(int32_t *H_s32, const int16_t *dgd_avg,
+ int width, int height) {
+ for (int i = 0; i < height; i += 4) {
+ int16x4_t di = vld1_s16(dgd_avg + i);
+
+ for (int j = i; j < width; j += 4) {
+ int16x4_t dj = vld1_s16(dgd_avg + j);
+ int32x4_t h0 = vld1q_s32(H_s32 + 0 * width + j);
+ int32x4_t h1 = vld1q_s32(H_s32 + 1 * width + j);
+ int32x4_t h2 = vld1q_s32(H_s32 + 2 * width + j);
+ int32x4_t h3 = vld1q_s32(H_s32 + 3 * width + j);
+
+ h0 = vmlal_lane_s16(h0, dj, di, 0);
+ h1 = vmlal_lane_s16(h1, dj, di, 1);
+ h2 = vmlal_lane_s16(h2, dj, di, 2);
+ h3 = vmlal_lane_s16(h3, dj, di, 3);
+
+ vst1q_s32(H_s32 + 0 * width + j, h0);
+ vst1q_s32(H_s32 + 1 * width + j, h1);
+ vst1q_s32(H_s32 + 2 * width + j, h2);
+ vst1q_s32(H_s32 + 3 * width + j, h3);
+ }
+ H_s32 += 4 * width;
+ }
+}
+
+static INLINE void update_H_5x5_2pixels(int32_t *H_s32, const int16_t *dgd_avg0,
+ const int16_t *dgd_avg1) {
+ for (int i = 0; i < 24; i += 4) {
+ int16x4_t di0 = vld1_s16(dgd_avg0 + i);
+ int16x4_t di1 = vld1_s16(dgd_avg1 + i);
+
+ for (int j = i + 0; j < WIENER_WIN2_REDUCED_ALIGN2; j += 4) {
+ int16x4_t dj0 = vld1_s16(dgd_avg0 + j);
+ int16x4_t dj1 = vld1_s16(dgd_avg1 + j);
+ int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j);
+ int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j);
+ int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j);
+ int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j);
+
+ h0 = vmlal_lane_s16(h0, dj0, di0, 0);
+ h0 = vmlal_lane_s16(h0, dj1, di1, 0);
+ h1 = vmlal_lane_s16(h1, dj0, di0, 1);
+ h1 = vmlal_lane_s16(h1, dj1, di1, 1);
+ h2 = vmlal_lane_s16(h2, dj0, di0, 2);
+ h2 = vmlal_lane_s16(h2, dj1, di1, 2);
+ h3 = vmlal_lane_s16(h3, dj0, di0, 3);
+ h3 = vmlal_lane_s16(h3, dj1, di1, 3);
+
+ vst1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j, h0);
+ vst1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j, h1);
+ vst1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j, h2);
+ vst1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j, h3);
+ }
+ H_s32 += 4 * WIENER_WIN2_REDUCED_ALIGN2;
+ }
+}
+
+static INLINE void update_H_7x7_2pixels(int32_t *H_s32, const int16_t *dgd_avg0,
+ const int16_t *dgd_avg1) {
+ for (int i = 0; i < 48; i += 4) {
+ int16x4_t di0 = vld1_s16(dgd_avg0 + i);
+ int16x4_t di1 = vld1_s16(dgd_avg1 + i);
+
+ int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i);
+ int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i);
+ int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i);
+ int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i);
+
+ h0 = vmlal_lane_s16(h0, di0, di0, 0);
+ h0 = vmlal_lane_s16(h0, di1, di1, 0);
+ h1 = vmlal_lane_s16(h1, di0, di0, 1);
+ h1 = vmlal_lane_s16(h1, di1, di1, 1);
+ h2 = vmlal_lane_s16(h2, di0, di0, 2);
+ h2 = vmlal_lane_s16(h2, di1, di1, 2);
+ h3 = vmlal_lane_s16(h3, di0, di0, 3);
+ h3 = vmlal_lane_s16(h3, di1, di1, 3);
+
+ vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i, h0);
+ vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i, h1);
+ vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i, h2);
+ vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i, h3);
+
+ for (int j = i + 4; j < WIENER_WIN2_ALIGN2; j += 4) {
+ int16x4_t dj0 = vld1_s16(dgd_avg0 + j);
+ int16x4_t dj1 = vld1_s16(dgd_avg1 + j);
+ h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j);
+ h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j);
+ h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j);
+ h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j);
+
+ h0 = vmlal_lane_s16(h0, dj0, di0, 0);
+ h0 = vmlal_lane_s16(h0, dj1, di1, 0);
+ h1 = vmlal_lane_s16(h1, dj0, di0, 1);
+ h1 = vmlal_lane_s16(h1, dj1, di1, 1);
+ h2 = vmlal_lane_s16(h2, dj0, di0, 2);
+ h2 = vmlal_lane_s16(h2, dj1, di1, 2);
+ h3 = vmlal_lane_s16(h3, dj0, di0, 3);
+ h3 = vmlal_lane_s16(h3, dj1, di1, 3);
+
+ vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j, h0);
+ vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j, h1);
+ vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j, h2);
+ vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j, h3);
+ }
+ H_s32 += 4 * WIENER_WIN2_ALIGN2;
+ }
+}
+
+// Widen 32-bit src data and accumulate into 64-bit dst. Clear src data.
+static INLINE void accumulate_and_clear(int64_t *dst, int32_t *src,
+ int length) {
+ do {
+ int32x4_t s32 = vld1q_s32(src);
+ vst1q_s32(src, vdupq_n_s32(0));
+ src += 4;
+
+ int64x2_t d_lo = vld1q_s64(dst + 0);
+ int64x2_t d_hi = vld1q_s64(dst + 2);
+
+ d_lo = vaddw_s32(d_lo, vget_low_s32(s32));
+ d_hi = vaddw_s32(d_hi, vget_high_s32(s32));
+
+ vst1q_s64(dst + 0, d_lo);
+ vst1q_s64(dst + 2, d_hi);
+
+ dst += 4;
+ length -= 4;
+ } while (length > 0);
+}
+
+#endif // AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_
diff --git a/third_party/aom/av1/encoder/arm/neon/quantize_neon.c b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c
new file mode 100644
index 0000000000..c3b57ce206
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c
@@ -0,0 +1,928 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#if AOM_ARCH_AARCH64
+ return (uint16_t)vmaxvq_s16(v_eobmax);
+#else
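+  // Armv7 has no horizontal-max instruction, so fold the vector instead:
+  // pairwise-max the two halves, then shift the upper lanes down and max
+  // again until lane 0 holds the overall maximum.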
+ const int16x4_t v_eobmax_3210 =
+ vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+ return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+#endif
+}
+
+static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
+ int16x8_t v_eobmax,
+ uint16x8_t v_mask) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
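+  // Add 1 to the scan indices of lanes holding non-zero coefficients (zero
+  // lanes contribute 0), so the running maximum tracks the end-of-block
+  // count directly.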
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
+ const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
+ return vmaxq_s16(v_eobmax, v_nz_iscan);
+}
+
+static INLINE uint16x8_t quantize_fp_8(const tran_low_t *coeff_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ int16x8_t v_quant, int16x8_t v_dequant,
+ int16x8_t v_round, int16x8_t v_zero) {
+ const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_abs = vabsq_s16(v_coeff);
+ const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
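+  // vqdmulhq_s16 computes (2 * a * b) >> 16, so one further shift right
+  // yields the scalar (tmp * quant) >> 16.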
+ const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1);
+ const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ store_s16q_to_tran_low(&qcoeff_ptr[0], v_qcoeff);
+ store_s16q_to_tran_low(&dqcoeff_ptr[0], v_dqcoeff);
+ return v_nz_mask;
+}
+
+void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+  // TODO(jingning): Decide whether these arguments are still needed once the
+  // quantization process is finalized.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+
+  // Quantization pass: quantize the DC coefficient with the DC constants,
+  // then the remaining AC coefficients with the AC constants.
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ int16x8_t v_quant = vld1q_s16(quant_ptr);
+ int16x8_t v_dequant = vld1q_s16(dequant_ptr);
+ int16x8_t v_round = vld1q_s16(round_ptr);
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+ uint16x8_t v_nz_mask;
+ // process dc and the first seven ac coeffs
+ v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+ v_dequant, v_round, v_zero);
+ v_eobmax_76543210 = get_max_lane_eob(&iscan[0], v_eobmax_76543210, v_nz_mask);
+ // overwrite the dc constants with ac constants
+ v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
+ v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
+ v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);
+
+ count -= 8;
+ // now process the rest of the ac coeffs
+ do {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+ v_dequant, v_round, v_zero);
+ v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+ count -= 8;
+ } while (count > 0);
+ *eob_ptr = get_max_eob(v_eobmax_76543210);
+}
+
+static INLINE uint16x8_t quantize_lp_8(const int16_t *coeff_ptr,
+ int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, int16x8_t v_quant,
+ int16x8_t v_dequant, int16x8_t v_round,
+ int16x8_t v_zero) {
+ const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_abs = vabsq_s16(v_coeff);
+ const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
+ const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1);
+ const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ vst1q_s16(qcoeff_ptr, v_qcoeff);
+ vst1q_s16(dqcoeff_ptr, v_dqcoeff);
+ return v_nz_mask;
+}
+
+void av1_quantize_lp_neon(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+  // Quantization pass: quantize the DC coefficient with the DC constants,
+  // then the remaining AC coefficients with the AC constants.
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ int16x8_t v_quant = vld1q_s16(quant_ptr);
+ int16x8_t v_dequant = vld1q_s16(dequant_ptr);
+ int16x8_t v_round = vld1q_s16(round_ptr);
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+ uint16x8_t v_nz_mask;
+ intptr_t count = n_coeffs;
+
+ // process dc and the first seven ac coeffs
+ v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+ v_dequant, v_round, v_zero);
+ v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+ // overwrite the dc constants with ac constants
+ v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
+ v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
+ v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);
+
+ count -= 8;
+ // now process the rest of the ac coeffs
+ do {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+ v_dequant, v_round, v_zero);
+ v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+ count -= 8;
+ } while (count != 0);
+ *eob_ptr = get_max_eob(v_eobmax_76543210);
+}
+
+static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale_8(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant,
+ int16x8_t v_round, int16x8_t v_zero, int log_scale) {
+ const int16x8_t v_log_scale_minus_1 = vdupq_n_s16(log_scale - 1);
+ const int16x8_t v_neg_log_scale_plus_1 = vdupq_n_s16(-(1 + log_scale));
+ const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
+ const uint16x8_t v_mask =
+ vcgeq_s16(v_abs_coeff, vshlq_s16(v_dequant, v_neg_log_scale_plus_1));
+ // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round),
+ vreinterpretq_s16_u16(v_mask));
+ const int16x8_t v_tmp2 =
+ vqdmulhq_s16(vshlq_s16(v_tmp, v_log_scale_minus_1), v_quant);
+ const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
+ const int16x8_t v_qcoeff =
+ vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign);
+ // Multiplying by dequant here will use all 16 bits. Cast to unsigned before
+ // shifting right. (vshlq_s16 will shift right if shift value is negative)
+ const uint16x8_t v_abs_dqcoeff =
+ vshlq_u16(vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)),
+ vdupq_n_s16(-log_scale));
+ const int16x8_t v_dqcoeff =
+ vsubq_s16(veorq_s16(vreinterpretq_s16_u16(v_abs_dqcoeff), v_coeff_sign),
+ v_coeff_sign);
+ store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+ store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
+ return v_nz_mask;
+}
+
+static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale2_8(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant,
+ int16x8_t v_round, int16x8_t v_zero) {
+ const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
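+  // Zero-bin check: abs_coeff >= dequant >> (1 + log_scale) with
+  // log_scale == 2, evaluated as (abs_coeff << 1) >= (dequant >> 2) to
+  // preserve precision in 16 bits.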
+ const uint16x8_t v_mask =
+ vcgeq_u16(vshlq_n_u16(vreinterpretq_u16_s16(v_abs_coeff), 1),
+ vshrq_n_u16(vreinterpretq_u16_s16(v_dequant), 2));
+ // abs_coeff = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round),
+ vreinterpretq_s16_u16(v_mask));
+ // tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
+ const int16x8_t v_tmp2 =
+ vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1),
+ vreinterpretq_s16_u16(vshrq_n_u16(
+ vreinterpretq_u16_s16(vmulq_s16(v_tmp, v_quant)), 14)));
+ const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
+ const int16x8_t v_qcoeff =
+ vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign);
+ // const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[rc != 0]) >> log_scale;
+ const int16x8_t v_abs_dqcoeff =
+ vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp2, v_dequant), 13),
+ vreinterpretq_s16_u16(vshrq_n_u16(
+ vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)), 2)));
+ const int16x8_t v_dqcoeff =
+ vsubq_s16(veorq_s16(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+ store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+ store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
+ return v_nz_mask;
+}
+
+static AOM_FORCE_INLINE void quantize_fp_no_qmatrix_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
+ const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *iscan,
+ int log_scale) {
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ int16x8_t v_quant = vld1q_s16(quant_ptr);
+ int16x8_t v_dequant = vld1q_s16(dequant_ptr);
+ const int16x8_t v_round_no_scale = vld1q_s16(round_ptr);
+ int16x8_t v_round =
+ vqrdmulhq_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+ intptr_t non_zero_count = n_coeffs;
+
+ assert(n_coeffs > 16);
+ // Pre-scan pass
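+  // Walk backwards over the coefficients in groups of 16 and trim trailing
+  // groups that fall entirely inside the zero-bin, so the main loop below
+  // only quantizes coefficients that may be non-zero.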
+ const int16x8_t v_dequant_scaled =
+ vshlq_s16(v_dequant, vdupq_n_s16(-(1 + log_scale)));
+ const int16x8_t v_zbin_s16 =
+ vdupq_lane_s16(vget_low_s16(v_dequant_scaled), 1);
+ intptr_t i = n_coeffs;
+ do {
+ const int16x8_t v_coeff_a = load_tran_low_to_s16q(coeff_ptr + i - 8);
+ const int16x8_t v_coeff_b = load_tran_low_to_s16q(coeff_ptr + i - 16);
+ const int16x8_t v_abs_coeff_a = vabsq_s16(v_coeff_a);
+ const int16x8_t v_abs_coeff_b = vabsq_s16(v_coeff_b);
+ const uint16x8_t v_mask_a = vcgeq_s16(v_abs_coeff_a, v_zbin_s16);
+ const uint16x8_t v_mask_b = vcgeq_s16(v_abs_coeff_b, v_zbin_s16);
+      // If all 16 coefficients are within the base ZBIN range, discard them.
+ if (horizontal_long_add_u16x8(v_mask_a, v_mask_b) == 0) {
+ non_zero_count -= 16;
+ } else {
+ break;
+ }
+ i -= 16;
+ } while (i > 0);
+
+ const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count;
+ memset(qcoeff_ptr + non_zero_count, 0,
+ remaining_zcoeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr + non_zero_count, 0,
+ remaining_zcoeffs * sizeof(*dqcoeff_ptr));
+
+ // process dc and the first seven ac coeffs
+ uint16x8_t v_nz_mask;
+ if (log_scale == 2) {
+ v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant, v_dequant, v_round, v_zero);
+ } else {
+ v_nz_mask =
+ quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+ v_dequant, v_round, v_zero, log_scale);
+ }
+ v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+ // overwrite the dc constants with ac constants
+ v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
+ v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
+ v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);
+
+ for (intptr_t count = non_zero_count - 8; count > 0; count -= 8) {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ if (log_scale == 2) {
+ v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant, v_dequant, v_round, v_zero);
+ } else {
+ v_nz_mask =
+ quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
+ v_dequant, v_round, v_zero, log_scale);
+ }
+ v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
+ }
+ *eob_ptr = get_max_eob(v_eobmax_76543210);
+}
+
+void av1_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+ quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
+ iscan, 1);
+}
+
+void av1_quantize_fp_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+ quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
+ iscan, 2);
+}
+
+void aom_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ (void)quant_shift_ptr;
+ (void)scan;
+
+ const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ const int16x8_t zero = vdupq_n_s16(0);
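+  // vceqq_s16(zero, zero) sets every lane to all-ones, i.e. -1: the eob
+  // sentinel, so an all-zero block yields eob 0 after the final "+ 1".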
+ int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+
+ int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
+ int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+ int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+ int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+ int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+ int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ int16x8_t v_abs = vabsq_s16(v_coeff);
+
+ vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+
+ uint16x8_t vcond = vcgeq_s16(v_abs, vzbins);
+ uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ vround = vsetq_lane_s16(round_ptr[0], vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+ int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+ store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+ int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+ vround = vsetq_lane_s16(round_ptr[1], vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+ for (int i = 8; i < n_coeffs; i += 8) {
+ v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+ v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ v_abs = vabsq_s16(v_coeff);
+ vcond = vcgeq_s16(v_abs, vzbins);
+
+ nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+ int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+
+ vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+ store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+ int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ }
+ *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
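+// Multiply the 16-bit values in x0 by the 16-bit quantization-matrix weights
+// in x1 and shift right by AOM_QM_BITS: the saturating-doubling high-half
+// product supplies the upper bits and the low 16-bit product, shifted right,
+// supplies the rest.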
+#define QM_MULL_SHIFT(x0, x1) \
+ vreinterpretq_s16_u16(vorrq_u16( \
+ vreinterpretq_u16_s16(vshlq_n_s16( \
+ vqdmulhq_s16(x0, vreinterpretq_s16_u16(x1)), 15 - AOM_QM_BITS)), \
+ vshrq_n_u16(vmulq_u16(vreinterpretq_u16_s16(x0), x1), AOM_QM_BITS)))
+
+static void aom_quantize_b_helper_16x16_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr) {
+ (void)scan;
+
+ uint16x8_t vwt, viwt;
+ const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ const int16x8_t zero = vdupq_n_s16(0);
+ int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+
+ int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
+ int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+ int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+ int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+ int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+ int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ int16x8_t v_abs = vabsq_s16(v_coeff);
+ vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+ uint16x8_t vcond;
+ if (qm_ptr == NULL) {
+ vcond = vcgeq_s16(v_abs, vzbins);
+ } else {
+ vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
+ vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+ }
+ uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ vround = vsetq_lane_s16(round_ptr[0], vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+ int16x8_t vtmp2;
+ if (qm_ptr == NULL) {
+ vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ } else {
+ vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+ vtmp2 = vaddq_s16(vtmp2, vtmp);
+ }
+
+ vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+ store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+
+ if (iqm_ptr != NULL) {
+ viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
+ vdequant = QM_MULL_SHIFT(vdequant, viwt);
+ }
+ int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+ vround = vsetq_lane_s16(round_ptr[1], vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+ for (int i = 8; i < n_coeffs; i += 8) {
+ v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+ v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ v_abs = vabsq_s16(v_coeff);
+
+ if (qm_ptr == NULL) {
+ vcond = vcgeq_s16(v_abs, vzbins);
+ } else {
+ vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
+ vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+ }
+ nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+ int16x8_t vtmp2;
+ if (qm_ptr == NULL) {
+ vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ } else {
+ vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+ vtmp2 = vaddq_s16(vtmp2, vtmp);
+ }
+
+ vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+ store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+
+ if (iqm_ptr != NULL) {
+ viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
+ vdequant = QM_MULL_SHIFT(vdequant, viwt);
+ }
+ int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ }
+ *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
+static void aom_quantize_b_helper_32x32_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr) {
+ (void)scan;
+
+ uint16x8_t vwt, viwt;
+ const int log_scale = 1;
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ const int16x8_t zero = vdupq_n_s16(0);
+ int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
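+  // Reuse the all-ones (-1) lanes as a shift count: vshlq_u16 with a
+  // negative count shifts right, implementing >> log_scale (== 1) in the
+  // dequant path below.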
+ const int16x8_t v_log_scale = v_eobmax_76543210;
+
+ int16x8_t vzbins = vdupq_n_s16(zbins[1]),
+ vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
+ int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+ int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+ int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+ int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+ int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ int16x8_t v_abs = vabsq_s16(v_coeff);
+ vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+ uint16x8_t vcond;
+ if (qm_ptr == NULL) {
+ vcond = vcgeq_s16(v_abs, vzbins);
+ } else {
+ vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
+ vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+ }
+ uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ vround =
+ vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+ int16x8_t vtmp2;
+ if (qm_ptr == NULL) {
+ vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ } else {
+ vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+ vtmp2 = vaddq_s16(vtmp2, vtmp);
+ }
+
+ vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+ store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+
+ if (iqm_ptr != NULL) {
+ viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
+ vdequant = QM_MULL_SHIFT(vdequant, viwt);
+ }
+ int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+ vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+ vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+ vround =
+ vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+ for (int i = 8; i < n_coeffs; i += 8) {
+ v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+ v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ v_abs = vabsq_s16(v_coeff);
+
+ if (qm_ptr == NULL) {
+ vcond = vcgeq_s16(v_abs, vzbins);
+ } else {
+ vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
+ vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+ }
+ nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+ int16x8_t vtmp2;
+ if (qm_ptr == NULL) {
+ vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ } else {
+ vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+ vtmp2 = vaddq_s16(vtmp2, vtmp);
+ }
+ vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);
+
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+ store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+
+ if (iqm_ptr != NULL) {
+ viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
+ vdequant = QM_MULL_SHIFT(vdequant, viwt);
+ }
+ int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+ vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ }
+ *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
+static void aom_quantize_b_helper_64x64_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr) {
+ (void)scan;
+
+ uint16x8_t vwt, viwt;
+ const int log_scale = 2;
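+  // Each 16-bit lane of v_log_scale holds -2 (0xFFFE) so that vshlq_u16
+  // below shifts right by log_scale == 2 in the dequant path.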
+ const int16x8_t v_log_scale =
+ vreinterpretq_s16_s64(vdupq_n_s64(0xFFFEFFFEFFFEFFFE));
+
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ const int16x8_t zero = vdupq_n_s16(0);
+ int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
+ int16x8_t v_ones = vnegq_s16(v_eobmax_76543210);
+
+ int16x8_t vzbins = vdupq_n_s16(zbins[1]),
+ vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
+ int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
+ int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
+ int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+ int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
+ int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ int16x8_t v_abs = vabsq_s16(v_coeff);
+ vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
+ uint16x8_t vcond;
+ if (qm_ptr == NULL) {
+ vcond = vcgeq_s16(v_abs, vzbins);
+ } else {
+ vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
+ vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+ }
+ uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ vround =
+ vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+ int16x8_t vtmp2;
+ if (qm_ptr == NULL) {
+ vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ } else {
+ vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+ vtmp2 = vaddq_s16(vtmp2, vtmp);
+ }
+
+ int16x8_t ones =
+ vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
+ vtmp2 =
+ vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
+ store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
+
+ if (iqm_ptr != NULL) {
+ viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
+ vdequant = QM_MULL_SHIFT(vdequant, viwt);
+ }
+ int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+ vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
+ v_deq_abs =
+ vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs);
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
+
+ vround =
+ vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
+ vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
+ vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
+ vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
+
+ for (int i = 8; i < n_coeffs; i += 8) {
+ v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
+ v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ v_abs = vabsq_s16(v_coeff);
+
+ if (qm_ptr == NULL) {
+ vcond = vcgeq_s16(v_abs, vzbins);
+ } else {
+ vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
+ vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
+ }
+ nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ if (nz_check) {
+ int16x8_t vtmp = vqaddq_s16(v_abs, vround);
+
+ int16x8_t vtmp2;
+ if (qm_ptr == NULL) {
+ vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ } else {
+ vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
+ vtmp2 = vaddq_s16(vtmp2, vtmp);
+ }
+
+ int16x8_t ones =
+ vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
+ vtmp2 =
+ vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
+ int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
+ int16x8_t coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
+ store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
+
+ if (iqm_ptr != NULL) {
+ viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
+ vdequant = QM_MULL_SHIFT(vdequant, viwt);
+ }
+ int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
+ vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
+ v_deq_abs =
+ vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs);
+ vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
+ coeff_nz_mask =
+ vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
+ store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
+
+ uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
+ const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
+ int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
+ v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ }
+ }
+ *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+}
+
+void aom_quantize_b_helper_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+  switch (log_scale) { // log_scale for the AV1 encoder can only be 0, 1, or 2
+ case 0:
+ aom_quantize_b_helper_16x16_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+ iscan, qm_ptr, iqm_ptr);
+ break;
+ case 1:
+ aom_quantize_b_helper_32x32_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+ iscan, qm_ptr, iqm_ptr);
+ break;
+ case 2:
+ aom_quantize_b_helper_64x64_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+ iscan, qm_ptr, iqm_ptr);
+ break;
+ }
+}
+
+void aom_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+ NULL, NULL, 1);
+}
+
+void aom_quantize_b_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+ NULL, NULL, 2);
+}
diff --git a/third_party/aom/av1/encoder/arm/neon/rdopt_neon.c b/third_party/aom/av1/encoder/arm/neon/rdopt_neon.c
new file mode 100644
index 0000000000..7d3bd4c606
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/rdopt_neon.c
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include "av1/encoder/rdopt.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+// Process horizontal and vertical correlations in a 4x4 block of pixels.
+// We actually use the 4x4 pixels to calculate correlations corresponding to
+// the top-left 3x3 pixels, so this function must be called with 1x1 overlap,
+// moving the window along/down by 3 pixels at a time.
+static INLINE void horver_correlation_4x4(const int16_t *diff, int stride,
+ int32x4_t *xy_sum_32,
+ int32x4_t *xz_sum_32,
+ int32x4_t *x_sum_32,
+ int32x4_t *x2_sum_32) {
+ // Pixels in this 4x4 [ a b c d ]
+ // are referred to as: [ e f g h ]
+ // [ i j k l ]
+ // [ m n o p ]
+
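+  // Reinterpreting each row as a 64-bit lane and shifting left by 16 bits
+  // moves every pixel up one slot (zeroing the first), so multiplying a row
+  // by its shifted copy accumulates the neighbour products in one vmlal.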
+ const int16x4_t pixelsa_2_lo = vld1_s16(diff + (0 * stride));
+ const int16x4_t pixelsa_2_sli =
+ vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_2_lo), 16));
+ const int16x4_t pixelsb_2_lo = vld1_s16(diff + (1 * stride));
+ const int16x4_t pixelsb_2_sli =
+ vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_2_lo), 16));
+ const int16x4_t pixelsa_1_lo = vld1_s16(diff + (2 * stride));
+ const int16x4_t pixelsa_1_sli =
+ vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_1_lo), 16));
+ const int16x4_t pixelsb_1_lo = vld1_s16(diff + (3 * stride));
+ const int16x4_t pixelsb_1_sli =
+ vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_1_lo), 16));
+
+ const int16x8_t slli_a = vcombine_s16(pixelsa_1_sli, pixelsa_2_sli);
+
+ *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_1_lo, pixelsa_1_sli);
+ *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_2_lo, pixelsa_2_sli);
+ *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsb_2_lo, pixelsb_2_sli);
+
+ *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_1_sli);
+ *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_2_sli, pixelsb_2_sli);
+ *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_2_sli);
+
+ // Now calculate the straight sums, x_sum += a+b+c+e+f+g+i+j+k
+ // (sum up every element in slli_a and swap_b)
+ *x_sum_32 = vpadalq_s16(*x_sum_32, slli_a);
+ *x_sum_32 = vaddw_s16(*x_sum_32, pixelsb_2_sli);
+
+ // Also sum their squares
+ *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_1_sli, pixelsa_1_sli);
+ *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_2_sli, pixelsa_2_sli);
+ *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsb_2_sli, pixelsb_2_sli);
+}
+
+void av1_get_horver_correlation_full_neon(const int16_t *diff, int stride,
+ int width, int height, float *hcorr,
+ float *vcorr) {
+ // The following notation is used:
+ // x - current pixel
+ // y - right neighbour pixel
+ // z - below neighbour pixel
+ // w - down-right neighbour pixel
+ int64_t xy_sum = 0, xz_sum = 0;
+ int64_t x_sum = 0, x2_sum = 0;
+ int32x4_t zero = vdupq_n_s32(0);
+ int64x2_t v_x_sum = vreinterpretq_s64_s32(zero);
+ int64x2_t v_xy_sum = vreinterpretq_s64_s32(zero);
+ int64x2_t v_xz_sum = vreinterpretq_s64_s32(zero);
+ int64x2_t v_x2_sum = vreinterpretq_s64_s32(zero);
+ // Process horizontal and vertical correlations through the body in 4x4
+  // blocks. This excludes the final row and column, and possibly one extra
+  // row and column, depending on how 3 divides into the width and height.
+
+ for (int i = 0; i <= height - 4; i += 3) {
+ int32x4_t xy_sum_32 = zero;
+ int32x4_t xz_sum_32 = zero;
+ int32x4_t x_sum_32 = zero;
+ int32x4_t x2_sum_32 = zero;
+ for (int j = 0; j <= width - 4; j += 3) {
+ horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32,
+ &xz_sum_32, &x_sum_32, &x2_sum_32);
+ }
+ v_xy_sum = vpadalq_s32(v_xy_sum, xy_sum_32);
+ v_xz_sum = vpadalq_s32(v_xz_sum, xz_sum_32);
+ v_x_sum = vpadalq_s32(v_x_sum, x_sum_32);
+ v_x2_sum = vpadalq_s32(v_x2_sum, x2_sum_32);
+ }
+#if AOM_ARCH_AARCH64
+ xy_sum = vaddvq_s64(v_xy_sum);
+ xz_sum = vaddvq_s64(v_xz_sum);
+ x2_sum = vaddvq_s64(v_x2_sum);
+ x_sum = vaddvq_s64(v_x_sum);
+#else
+ xy_sum = vget_lane_s64(
+ vadd_s64(vget_low_s64(v_xy_sum), vget_high_s64(v_xy_sum)), 0);
+ xz_sum = vget_lane_s64(
+ vadd_s64(vget_low_s64(v_xz_sum), vget_high_s64(v_xz_sum)), 0);
+ x2_sum = vget_lane_s64(
+ vadd_s64(vget_low_s64(v_x2_sum), vget_high_s64(v_x2_sum)), 0);
+ x_sum =
+ vget_lane_s64(vadd_s64(vget_low_s64(v_x_sum), vget_high_s64(v_x_sum)), 0);
+#endif
+ // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols
+ int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0;
+
+ // Do we have 2 rows remaining or just the one? Note that width and height
+ // are powers of 2, so each modulo 3 must be 1 or 2.
+ if (height % 3 == 1) { // Just horiz corrs on the final row
+ const int16_t x0 = diff[(height - 1) * stride];
+ x_sum += x0;
+ x_finalrow += x0;
+ x2_sum += x0 * x0;
+ x2_finalrow += x0 * x0;
+ if (width >= 8) {
+ int32x4_t v_y_sum = zero;
+ int32x4_t v_y2_sum = zero;
+ int32x4_t v_xy_sum_a = zero;
+ int k = width - 1;
+ int j = 0;
+ while ((k - 8) > 0) {
+ const int16x8_t v_x = vld1q_s16(&diff[(height - 1) * stride + j]);
+ const int16x8_t v_y = vld1q_s16(&diff[(height - 1) * stride + j + 1]);
+ const int16x4_t v_x_lo = vget_low_s16(v_x);
+ const int16x4_t v_x_hi = vget_high_s16(v_x);
+ const int16x4_t v_y_lo = vget_low_s16(v_y);
+ const int16x4_t v_y_hi = vget_high_s16(v_y);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+ v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+ v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+ v_y_sum = vpadalq_s16(v_y_sum, v_y);
+ k -= 8;
+ j += 8;
+ }
+
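+      // Handle the last 7 neighbour pairs of the row: shift the final load
+      // so the lane past the row edge is zeroed before accumulating.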
+ const int16x8_t v_l = vld1q_s16(&diff[(height - 1) * stride] + j);
+ const int16x8_t v_x =
+ vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7),
+ vreinterpretq_s16_s32(zero), 1);
+ const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1);
+ const int16x4_t v_x_lo = vget_low_s16(v_x);
+ const int16x4_t v_x_hi = vget_high_s16(v_x);
+ const int16x4_t v_y_lo = vget_low_s16(v_y);
+ const int16x4_t v_y_hi = vget_high_s16(v_y);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+ v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+ v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+ const int32x4_t v_y_sum_a = vpadalq_s16(v_y_sum, v_y);
+ const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a);
+#if AOM_ARCH_AARCH64
+ const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum);
+ xy_sum += vaddvq_s64(v_xy_sum2);
+ const int32_t y = vaddvq_s32(v_y_sum_a);
+ const int64_t y2 = vaddvq_s64(v_y2_sum_a);
+#else
+ xy_sum += vget_lane_s64(
+ vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0);
+ const int64x2_t v_y_a = vpaddlq_s32(v_y_sum_a);
+ const int64_t y =
+ vget_lane_s64(vadd_s64(vget_low_s64(v_y_a), vget_high_s64(v_y_a)), 0);
+ const int64x2_t v_y2_sum_b = vpaddlq_s32(v_y2_sum);
+ int64_t y2 = vget_lane_s64(
+ vadd_s64(vget_low_s64(v_y2_sum_b), vget_high_s64(v_y2_sum_b)), 0);
+#endif
+ x_sum += y;
+ x2_sum += y2;
+ x_finalrow += y;
+ x2_finalrow += y2;
+ } else {
+ for (int j = 0; j < width - 1; ++j) {
+ const int16_t x = diff[(height - 1) * stride + j];
+ const int16_t y = diff[(height - 1) * stride + j + 1];
+ xy_sum += x * y;
+ x_sum += y;
+ x2_sum += y * y;
+ x_finalrow += y;
+ x2_finalrow += y * y;
+ }
+ }
+ } else { // Two rows remaining to do
+ const int16_t x0 = diff[(height - 2) * stride];
+ const int16_t z0 = diff[(height - 1) * stride];
+ x_sum += x0 + z0;
+ x2_sum += x0 * x0 + z0 * z0;
+ x_finalrow += z0;
+ x2_finalrow += z0 * z0;
+ if (width >= 8) {
+ int32x4_t v_y2_sum = zero;
+ int32x4_t v_w2_sum = zero;
+ int32x4_t v_xy_sum_a = zero;
+ int32x4_t v_xz_sum_a = zero;
+ int32x4_t v_x_sum_a = zero;
+ int32x4_t v_w_sum = zero;
+ int k = width - 1;
+ int j = 0;
+ while ((k - 8) > 0) {
+ const int16x8_t v_x = vld1q_s16(&diff[(height - 2) * stride + j]);
+ const int16x8_t v_y = vld1q_s16(&diff[(height - 2) * stride + j + 1]);
+ const int16x8_t v_z = vld1q_s16(&diff[(height - 1) * stride + j]);
+ const int16x8_t v_w = vld1q_s16(&diff[(height - 1) * stride + j + 1]);
+
+ const int16x4_t v_x_lo = vget_low_s16(v_x);
+ const int16x4_t v_y_lo = vget_low_s16(v_y);
+ const int16x4_t v_z_lo = vget_low_s16(v_z);
+ const int16x4_t v_w_lo = vget_low_s16(v_w);
+ const int16x4_t v_x_hi = vget_high_s16(v_x);
+ const int16x4_t v_y_hi = vget_high_s16(v_y);
+ const int16x4_t v_z_hi = vget_high_s16(v_z);
+ const int16x4_t v_w_hi = vget_high_s16(v_w);
+
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi);
+
+ v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo);
+ v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi);
+
+ v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo);
+ v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi);
+ v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+ v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+
+ v_w_sum = vpadalq_s16(v_w_sum, v_w);
+ v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y);
+ v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w);
+
+ k -= 8;
+ j += 8;
+ }
+ const int16x8_t v_l = vld1q_s16(&diff[(height - 2) * stride] + j);
+ const int16x8_t v_x =
+ vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7),
+ vreinterpretq_s16_s32(zero), 1);
+ const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1);
+ const int16x8_t v_l_2 = vld1q_s16(&diff[(height - 1) * stride] + j);
+ const int16x8_t v_z =
+ vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l_2, 7),
+ vreinterpretq_s16_s32(zero), 1);
+ const int16x8_t v_w = vextq_s16(v_l_2, vreinterpretq_s16_s32(zero), 1);
+
+ const int16x4_t v_x_lo = vget_low_s16(v_x);
+ const int16x4_t v_y_lo = vget_low_s16(v_y);
+ const int16x4_t v_z_lo = vget_low_s16(v_z);
+ const int16x4_t v_w_lo = vget_low_s16(v_w);
+ const int16x4_t v_x_hi = vget_high_s16(v_x);
+ const int16x4_t v_y_hi = vget_high_s16(v_y);
+ const int16x4_t v_z_hi = vget_high_s16(v_z);
+ const int16x4_t v_w_hi = vget_high_s16(v_w);
+
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo);
+ v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi);
+
+ v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo);
+ v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi);
+
+ v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo);
+ v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi);
+ v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
+ v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
+
+ v_w_sum = vpadalq_s16(v_w_sum, v_w);
+ v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y);
+ v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w);
+
+#if AOM_ARCH_AARCH64
+ xy_sum += vaddvq_s64(vpaddlq_s32(v_xy_sum_a));
+ xz_sum += vaddvq_s64(vpaddlq_s32(v_xz_sum_a));
+ x_sum += vaddvq_s32(v_x_sum_a);
+ x_finalrow += vaddvq_s32(v_w_sum);
+ int64_t y2 = vaddvq_s64(vpaddlq_s32(v_y2_sum));
+ int64_t w2 = vaddvq_s64(vpaddlq_s32(v_w2_sum));
+#else
+ const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a);
+ xy_sum += vget_lane_s64(
+ vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0);
+ const int64x2_t v_xz_sum2 = vpaddlq_s32(v_xz_sum_a);
+ xz_sum += vget_lane_s64(
+ vadd_s64(vget_low_s64(v_xz_sum2), vget_high_s64(v_xz_sum2)), 0);
+ const int64x2_t v_x_sum2 = vpaddlq_s32(v_x_sum_a);
+ x_sum += vget_lane_s64(
+ vadd_s64(vget_low_s64(v_x_sum2), vget_high_s64(v_x_sum2)), 0);
+ const int64x2_t v_w_sum_a = vpaddlq_s32(v_w_sum);
+ x_finalrow += vget_lane_s64(
+ vadd_s64(vget_low_s64(v_w_sum_a), vget_high_s64(v_w_sum_a)), 0);
+ const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum);
+ int64_t y2 = vget_lane_s64(
+ vadd_s64(vget_low_s64(v_y2_sum_a), vget_high_s64(v_y2_sum_a)), 0);
+ const int64x2_t v_w2_sum_a = vpaddlq_s32(v_w2_sum);
+ int64_t w2 = vget_lane_s64(
+ vadd_s64(vget_low_s64(v_w2_sum_a), vget_high_s64(v_w2_sum_a)), 0);
+#endif
+ x2_sum += y2 + w2;
+ x2_finalrow += w2;
+ } else {
+ for (int j = 0; j < width - 1; ++j) {
+ const int16_t x = diff[(height - 2) * stride + j];
+ const int16_t y = diff[(height - 2) * stride + j + 1];
+ const int16_t z = diff[(height - 1) * stride + j];
+ const int16_t w = diff[(height - 1) * stride + j + 1];
+
+ // Horizontal and vertical correlations for the penultimate row:
+ xy_sum += x * y;
+ xz_sum += x * z;
+
+ // Now just horizontal correlations for the final row:
+ xy_sum += z * w;
+
+ x_sum += y + w;
+ x2_sum += y * y + w * w;
+ x_finalrow += w;
+ x2_finalrow += w * w;
+ }
+ }
+ }
+
+ // Do we have 2 columns remaining or just the one?
+ if (width % 3 == 1) { // Just vert corrs on the final col
+ const int16_t x0 = diff[width - 1];
+ x_sum += x0;
+ x_finalcol += x0;
+ x2_sum += x0 * x0;
+ x2_finalcol += x0 * x0;
+ for (int i = 0; i < height - 1; ++i) {
+ const int16_t x = diff[i * stride + width - 1];
+ const int16_t z = diff[(i + 1) * stride + width - 1];
+ xz_sum += x * z;
+ x_finalcol += z;
+ x2_finalcol += z * z;
+ // So the bottom-right elements don't get counted twice:
+ if (i < height - (height % 3 == 1 ? 2 : 3)) {
+ x_sum += z;
+ x2_sum += z * z;
+ }
+ }
+ } else { // Two cols remaining
+ const int16_t x0 = diff[width - 2];
+ const int16_t y0 = diff[width - 1];
+ x_sum += x0 + y0;
+ x2_sum += x0 * x0 + y0 * y0;
+ x_finalcol += y0;
+ x2_finalcol += y0 * y0;
+ for (int i = 0; i < height - 1; ++i) {
+ const int16_t x = diff[i * stride + width - 2];
+ const int16_t y = diff[i * stride + width - 1];
+ const int16_t z = diff[(i + 1) * stride + width - 2];
+ const int16_t w = diff[(i + 1) * stride + width - 1];
+
+ // Horizontal and vertical correlations for the penultimate col:
+ // Skip these on the last iteration of this loop if we also had two
+      // rows remaining; otherwise the final horizontal and vertical
+      // correlations get erroneously processed twice.
+ if (i < height - 2 || height % 3 == 1) {
+ xy_sum += x * y;
+ xz_sum += x * z;
+ }
+
+ x_finalcol += w;
+ x2_finalcol += w * w;
+ // So the bottom-right elements don't get counted twice:
+ if (i < height - (height % 3 == 1 ? 2 : 3)) {
+ x_sum += z + w;
+ x2_sum += z * z + w * w;
+ }
+
+ // Now just vertical correlations for the final column:
+ xz_sum += y * w;
+ }
+ }
+
+ // Calculate the simple sums and squared-sums
+ int64_t x_firstrow = 0, x_firstcol = 0;
+ int64_t x2_firstrow = 0, x2_firstcol = 0;
+
+ if (width >= 8) {
+ int32x4_t v_x_firstrow = zero;
+ int32x4_t v_x2_firstrow = zero;
+ for (int j = 0; j < width; j += 8) {
+ const int16x8_t v_diff = vld1q_s16(diff + j);
+ const int16x4_t v_diff_lo = vget_low_s16(v_diff);
+ const int16x4_t v_diff_hi = vget_high_s16(v_diff);
+ v_x_firstrow = vpadalq_s16(v_x_firstrow, v_diff);
+ v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_lo, v_diff_lo);
+ v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_hi, v_diff_hi);
+ }
+#if AOM_ARCH_AARCH64
+ x_firstrow += vaddvq_s32(v_x_firstrow);
+ x2_firstrow += vaddvq_s32(v_x2_firstrow);
+#else
+ const int64x2_t v_x_firstrow_64 = vpaddlq_s32(v_x_firstrow);
+ x_firstrow += vget_lane_s64(
+ vadd_s64(vget_low_s64(v_x_firstrow_64), vget_high_s64(v_x_firstrow_64)),
+ 0);
+ const int64x2_t v_x2_firstrow_64 = vpaddlq_s32(v_x2_firstrow);
+ x2_firstrow += vget_lane_s64(vadd_s64(vget_low_s64(v_x2_firstrow_64),
+ vget_high_s64(v_x2_firstrow_64)),
+ 0);
+#endif
+ } else {
+ for (int j = 0; j < width; ++j) {
+ x_firstrow += diff[j];
+ x2_firstrow += diff[j] * diff[j];
+ }
+ }
+ for (int i = 0; i < height; ++i) {
+ x_firstcol += diff[i * stride];
+ x2_firstcol += diff[i * stride] * diff[i * stride];
+ }
+
+ int64_t xhor_sum = x_sum - x_finalcol;
+ int64_t xver_sum = x_sum - x_finalrow;
+ int64_t y_sum = x_sum - x_firstcol;
+ int64_t z_sum = x_sum - x_firstrow;
+ int64_t x2hor_sum = x2_sum - x2_finalcol;
+ int64_t x2ver_sum = x2_sum - x2_finalrow;
+ int64_t y2_sum = x2_sum - x2_firstcol;
+ int64_t z2_sum = x2_sum - x2_firstrow;
+
+ const float num_hor = (float)(height * (width - 1));
+ const float num_ver = (float)((height - 1) * width);
+
+ const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+ const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+
+ const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+ const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+
+ const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+ const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+ if (xhor_var_n > 0 && y_var_n > 0) {
+ *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+ *hcorr = *hcorr < 0 ? 0 : *hcorr;
+ } else {
+ *hcorr = 1.0;
+ }
+ if (xver_var_n > 0 && z_var_n > 0) {
+ *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+ *vcorr = *vcorr < 0 ? 0 : *vcorr;
+ } else {
+ *vcorr = 1.0;
+ }
+}
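The reductions above feed a standard Pearson correlation: with var_n(x) = sum(x^2) - sum(x)^2 / n and cov_n(x, y) = sum(xy) - sum(x)sum(y) / n, each correlation is cov / sqrt(var * var), clamped to [0, 1]. A minimal scalar sketch of that final normalization step (the helper name is hypothetical, for illustration only):

#include <math.h>
#include <stdint.h>

// Hypothetical scalar helper mirroring the normalization above: a Pearson
// correlation computed from raw sums, clamped to [0, 1] as in the source.
static float correlation_from_sums(int64_t x_sum, int64_t y_sum,
                                   int64_t x2_sum, int64_t y2_sum,
                                   int64_t xy_sum, float n) {
  const float x_var_n = x2_sum - (x_sum * x_sum) / n;
  const float y_var_n = y2_sum - (y_sum * y_sum) / n;
  const float cov_n = xy_sum - (x_sum * y_sum) / n;
  if (x_var_n > 0 && y_var_n > 0) {
    const float r = cov_n / sqrtf(x_var_n * y_var_n);
    return r < 0 ? 0 : r;  // Negative correlation is clamped to 0.
  }
  return 1.0f;  // Degenerate (constant) input is treated as fully correlated.
}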
diff --git a/third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c b/third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c
new file mode 100644
index 0000000000..3d17723224
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/reconinter_enc_neon.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+
+#include "av1/encoder/reconinter_enc.h"
+
+void aom_upsampled_pred_neon(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref, int ref_stride,
+ int subpel_search) {
+ // expect xd == NULL only in tests
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+ int plane = 0;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+ InterPredParams inter_pred_params;
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ const int_interpfilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ av1_init_inter_params(
+ &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+ av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+ &inter_pred_params);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter_params = av1_get_filter(subpel_search);
+
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ if (width > 8) {
+ assert(width % 16 == 0);
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t r = vld1q_u8(ref + j);
+ vst1q_u8(comp_pred + j, r);
+ j += 16;
+ } while (j < width);
+ ref += ref_stride;
+ comp_pred += width;
+ } while (--i != 0);
+ } else if (width == 8) {
+ int i = height;
+ do {
+ uint8x8_t r = vld1_u8(ref);
+ vst1_u8(comp_pred, r);
+ ref += ref_stride;
+ comp_pred += width;
+ } while (--i != 0);
+ } else {
+ assert(width == 4);
+ int i = height / 2;
+ do {
+ uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+ vst1_u8(comp_pred, r);
+ ref += 2 * ref_stride;
+ comp_pred += 2 * width;
+ } while (--i != 0);
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const filter_x =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q3 << 1);
+ aom_convolve8_horiz(ref, ref_stride, comp_pred, width, filter_x, 16, NULL,
+ -1, width, height);
+ } else if (!subpel_x_q3) {
+ const int16_t *const filter_y =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q3 << 1);
+ aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, filter_y,
+ 16, width, height);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t,
+ im_block[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+
+ const int16_t *const filter_x =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q3 << 1);
+ const int16_t *const filter_y =
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q3 << 1);
+
+ const int im_stride = MAX_SB_SIZE;
+ const int im_height = (((height - 1) * 8 + subpel_y_q3) >> 3) + SUBPEL_TAPS;
+
+ const int ref_vert_offset = ref_stride * ((SUBPEL_TAPS >> 1) - 1);
+ const int im_vert_offset = im_stride * ((filter_params->taps >> 1) - 1);
+
+ assert(im_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+ aom_convolve8_horiz(ref - ref_vert_offset, ref_stride, im_block,
+ MAX_SB_SIZE, filter_x, 16, NULL, -1, width, im_height);
+ aom_convolve8_vert(im_block + im_vert_offset, MAX_SB_SIZE, comp_pred, width,
+ NULL, -1, filter_y, 16, width, height);
+ }
+}
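In the two-pass path above, the horizontal filter must emit enough intermediate rows to feed the vertical taps that follow. A worked instance of the sizing arithmetic, with input values assumed purely for illustration:

// Worked example of im_height above, assuming height = 32, subpel_y_q3 = 4
// and SUBPEL_TAPS = 8 (the standard 8-tap case):
//   im_height = (((32 - 1) * 8 + 4) >> 3) + 8
//             = (252 >> 3) + 8
//             = 31 + 8 = 39
// The vertical pass needs SUBPEL_TAPS - 1 extra source rows, which is why
// the horizontal pass reads from ref - ref_vert_offset, i.e. starting
// (SUBPEL_TAPS / 2 - 1) rows above the block.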
+
+void aom_comp_avg_upsampled_pred_neon(MACROBLOCKD *xd,
+ const AV1_COMMON *const cm, int mi_row,
+ int mi_col, const MV *const mv,
+ uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, int subpel_search) {
+ aom_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+
+ aom_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred, width);
+}
+
+void aom_dist_wtd_comp_avg_upsampled_pred_neon(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
+ aom_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+
+ aom_dist_wtd_comp_avg_pred_neon(comp_pred, pred, width, height, comp_pred,
+ width, jcp_param);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_upsampled_pred_neon(MACROBLOCKD *xd,
+ const struct AV1Common *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred8, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref8, int ref_stride, int bd,
+ int subpel_search) {
+ // expect xd == NULL only in tests
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+ int plane = 0;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+ InterPredParams inter_pred_params;
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ const int_interpfilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ av1_init_inter_params(
+ &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+ av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+ &inter_pred_params);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter = av1_get_filter(subpel_search);
+
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ if (width > 4) {
+ assert(width % 8 == 0);
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t r = vld1q_u16(ref + j);
+ vst1q_u16(comp_pred + j, r);
+ j += 8;
+ } while (j < width);
+ ref += ref_stride;
+ comp_pred += width;
+ } while (--i != 0);
+ } else if (width == 4) {
+ int i = height;
+ do {
+ uint16x4_t r = vld1_u16(ref);
+ vst1_u16(comp_pred, r);
+ ref += ref_stride;
+ comp_pred += width;
+ } while (--i != 0);
+ } else {
+ assert(width == 2);
+ int i = height / 2;
+ do {
+ uint16x4_t r = load_u16_2x2(ref, ref_stride);
+ store_u16x2_strided_x2(comp_pred, width, r);
+ ref += 2 * ref_stride;
+ comp_pred += 2 * width;
+ } while (--i != 0);
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ aom_highbd_convolve8_horiz_neon(ref8, ref_stride, comp_pred8, width, kernel,
+ 16, NULL, -1, width, height, bd);
+ } else if (!subpel_x_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_highbd_convolve8_vert_neon(ref8, ref_stride, comp_pred8, width, NULL,
+ -1, kernel, 16, width, height, bd);
+ } else {
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+ assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+ aom_highbd_convolve8_horiz_neon(
+ ref8 - ref_stride * ((filter->taps >> 1) - 1), ref_stride,
+ CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+ intermediate_height, bd);
+ aom_highbd_convolve8_vert_neon(
+ CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
+ MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
+ bd);
+ }
+}
+
+void aom_highbd_comp_avg_upsampled_pred_neon(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, int subpel_search) {
+ aom_highbd_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8,
+ ref_stride, bd, subpel_search);
+
+ aom_highbd_comp_avg_pred_neon(comp_pred8, pred8, width, height, comp_pred8,
+ width);
+}
+
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_neon(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+ int subpel_search) {
+ aom_highbd_upsampled_pred_neon(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8,
+ ref_stride, bd, subpel_search);
+
+ aom_highbd_dist_wtd_comp_avg_pred_neon(comp_pred8, pred8, width, height,
+ comp_pred8, width, jcp_param);
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/arm/neon/shift_neon.h b/third_party/aom/av1/encoder/arm/neon/shift_neon.h
new file mode 100644
index 0000000000..d73aef2f25
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/shift_neon.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h" // For AOM_INLINE.
+
+#define SHIFT_LOOP_HELPER(name, type, intrinsic, arg) \
+ static AOM_INLINE void name(const type *in, type *out, int size) { \
+ int i = 0; \
+ do { \
+ out[i] = intrinsic(in[i], arg); \
+ } while (++i < size); \
+ }
+
+SHIFT_LOOP_HELPER(shift_left_2_s16_x4, int16x4_t, vshl_n_s16, 2)
+SHIFT_LOOP_HELPER(shift_left_2_s16_x8, int16x8_t, vshlq_n_s16, 2)
+SHIFT_LOOP_HELPER(shift_left_2_s32_x4, int32x4_t, vshlq_n_s32, 2)
+SHIFT_LOOP_HELPER(shift_right_2_round_s16_x8, int16x8_t, vrshrq_n_s16, 2)
+SHIFT_LOOP_HELPER(shift_right_2_round_s32_x4, int32x4_t, vrshrq_n_s32, 2)
+SHIFT_LOOP_HELPER(shift_right_4_round_s16_x8, int16x8_t, vrshrq_n_s16, 4)
+SHIFT_LOOP_HELPER(shift_right_4_round_s32_x4, int32x4_t, vrshrq_n_s32, 4)
+
+// Addition instructions have slightly better performance compared to shift
+// instructions on some micro-architectures, so use these for shifts by one.
+
+SHIFT_LOOP_HELPER(shift_left_1_s16_x4, int16x4_t, vadd_s16, in[i])
+SHIFT_LOOP_HELPER(shift_left_1_s16_x8, int16x8_t, vaddq_s16, in[i])
+SHIFT_LOOP_HELPER(shift_right_1_round_s16_x4, int16x4_t, vrhadd_s16,
+ vdup_n_s16(0))
+SHIFT_LOOP_HELPER(shift_right_1_round_s16_x8, int16x8_t, vrhaddq_s16,
+ vdupq_n_s16(0))
+SHIFT_LOOP_HELPER(shift_right_1_round_s32_x4, int32x4_t, vrhaddq_s32,
+ vdupq_n_s32(0))
+
+#undef SHIFT_LOOP_HELPER
+
+#endif // AOM_AV1_ENCODER_ARM_NEON_SHIFT_NEON_H_
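The add-based variants rest on two lane-wise identities: x << 1 == x + x, and a rounding right shift by one equals a rounding halving add against zero. A scalar sketch of the second identity, assuming 16-bit lanes:

// Scalar model of vrhadd_s16(x, vdup_n_s16(0)): the rounding halving add
// computes (a + b + 1) >> 1 without intermediate overflow, so with b == 0 it
// matches the rounding shift right by one (vrshr_n_s16(x, 1)) exactly.
static inline int16_t rounding_shift_right_1(int16_t x) {
  return (int16_t)((x + 1) >> 1);  // x promotes to int, so no overflow here.
}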
diff --git a/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c
new file mode 100644
index 0000000000..986f143864
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon.c
@@ -0,0 +1,548 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// For the squared error buffer, add padding for 4 samples.
+#define SSE_STRIDE (BW + 4)
+
+// When using vld1q_u16_x4, compilers may insert an alignment hint of 256
+// bits, hence the 32-byte alignment of this table.
+DECLARE_ALIGNED(32, static const uint16_t, kSlidingWindowMask[]) = {
+ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF
+};
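Each 8-lane row of the mask table keeps five consecutive lanes, at offsets 0 through 3. A sketch of how the masks are consumed by the window loop below:

// For one padded row r of squared errors and output column base col:
//   sum_i = horizontal_add(r & mask row i)   // i = 0..3
// Mask row i keeps lanes [i, i + 4], so sum_i is the horizontal 5-tap sum
// for output column col + i; accumulating five such rows gives the full
// 5x5 windowed sum of squared errors.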
+
+static INLINE void get_squared_error(
+ const uint8_t *frame1, const uint32_t stride1, const uint8_t *frame2,
+ const uint32_t stride2, const uint32_t block_width,
+ const uint32_t block_height, uint16_t *frame_sse,
+ const unsigned int dst_stride) {
+ uint16_t *dst = frame_sse;
+
+ uint32_t i = 0;
+ do {
+ uint32_t j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j);
+ uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j);
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ uint16x8_t sse_lo =
+ vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
+ uint16x8_t sse_hi =
+ vmull_u8(vget_high_u8(abs_diff), vget_high_u8(abs_diff));
+
+ vst1q_u16(dst + j + 2, sse_lo);
+ vst1q_u16(dst + j + 10, sse_hi);
+
+ j += 16;
+ } while (j < block_width);
+
+ dst += dst_stride;
+ } while (++i < block_height);
+}
+
+static INLINE uint16x8_t load_and_pad(const uint16_t *src, const uint32_t col,
+ const uint32_t block_width) {
+ uint16x8_t s = vld1q_u16(src);
+
+ if (col == 0) {
+ const uint16_t lane2 = vgetq_lane_u16(s, 2);
+ s = vsetq_lane_u16(lane2, s, 0);
+ s = vsetq_lane_u16(lane2, s, 1);
+ } else if (col >= block_width - 4) {
+ const uint16_t lane5 = vgetq_lane_u16(s, 5);
+ s = vsetq_lane_u16(lane5, s, 6);
+ s = vsetq_lane_u16(lane5, s, 7);
+ }
+ return s;
+}
+
+static void apply_temporal_filter(
+ const uint8_t *frame, const unsigned int stride, const uint32_t block_width,
+ const uint32_t block_height, const int *subblock_mses,
+ unsigned int *accumulator, uint16_t *count, const uint16_t *frame_sse,
+ const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
+ const double decay_factor, const double inv_factor,
+ const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_neon[BH][BW];
+ const uint16x8x4_t vmask = vld1q_u16_x4(kSlidingWindowMask);
+
+ // Traverse 4 columns at a time - first and last two columns need padding.
+ for (uint32_t col = 0; col < block_width; col += 4) {
+ uint16x8_t vsrc[5];
+ const uint16_t *src = frame_sse + col;
+
+ // Load and pad (for first and last two columns) 3 rows from the top.
+ for (int i = 2; i < 5; i++) {
+ vsrc[i] = load_and_pad(src, col, block_width);
+ src += SSE_STRIDE;
+ }
+
+ // Pad the top 2 rows.
+ vsrc[0] = vsrc[2];
+ vsrc[1] = vsrc[2];
+
+ for (unsigned int row = 0; row < block_height; row++) {
+ for (int i = 0; i < 4; i++) {
+ uint32x4_t vsum = vdupq_n_u32(0);
+ for (int j = 0; j < 5; j++) {
+ vsum = vpadalq_u16(vsum, vandq_u16(vsrc[j], vmask.val[i]));
+ }
+ acc_5x5_neon[row][col + i] = horizontal_add_u32x4(vsum);
+ }
+
+ // Push all rows in the sliding window up one.
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ if (row <= block_height - 4) {
+ // Load next row into the bottom of the sliding window.
+ vsrc[4] = load_and_pad(src, col, block_width);
+ src += SSE_STRIDE;
+ } else {
+ // Pad the bottom 2 rows.
+ vsrc[4] = vsrc[3];
+ }
+ }
+ }
+
+ // Perform filtering.
+ if (tf_wgt_calc_lvl == 0) {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ } else {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ }
+}
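Per pixel, both branches above evaluate the same scalar weight formula, differing only in whether exp() or the approx_exp() polynomial is used. A minimal sketch of the tf_wgt_calc_lvl == 0 form, with all inputs precomputed exactly as in the caller (illustration only, not a replacement for the vector path):

#include <math.h>

// Scalar sketch of the per-pixel filter weight (tf_wgt_calc_lvl == 0 branch);
// AOMMIN and TF_WEIGHT_SCALE are as defined in the aom headers.
static int tf_pixel_weight(uint32_t diff_sse, double inv_num_ref_pixels,
                           double block_error, double weight_factor,
                           double inv_factor, double d_factor,
                           double decay_factor) {
  const double window_error = diff_sse * inv_num_ref_pixels;
  const double combined_error =
      weight_factor * window_error + block_error * inv_factor;
  const double scaled_error =
      AOMMIN(combined_error * d_factor * decay_factor, 7);
  return (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
}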
+
+void av1_apply_temporal_filter_neon(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
+ assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
+ assert(!is_high_bitdepth && "Only support low bit-depth with Neon!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ // Frame information.
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride =
+ frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+ // Filter U-plane and V-plane using Y-plane. This is because motion
+ // search is only done on Y-plane, so the information from Y-plane
+ // will be more accurate. The luma sse sum is reused in both chroma
+ // planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+ }
+ }
+ }
+ }
+ }
+
+ get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
+ plane_h, frame_sse, SSE_STRIDE);
+
+ apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h,
+ subblock_mses, accum + plane_offset,
+ count + plane_offset, frame_sse, luma_sse_sum,
+ inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl);
+
+ plane_offset += plane_h * plane_w;
+ }
+}
+
+double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height,
+ int width, int stride,
+ int edge_thresh) {
+ uint16x8_t thresh = vdupq_n_u16(edge_thresh);
+ uint32x4_t acc = vdupq_n_u32(0);
+  // The count is logically positive, since it tallies how often we are under
+  // the threshold, but it is accumulated negatively to make the best use of
+  // the vclt instruction, which sets every bit of a lane to 1 (i.e. -1) when
+  // the condition is true.
+ int32x4_t count = vdupq_n_s32(0);
+ int final_count = 0;
+ int64_t final_acc = 0;
+ const uint8_t *src_start = src + stride + 1;
+ int h = 1;
+
+ do {
+ int w = 1;
+ const uint8_t *src_ptr = src_start;
+
+ while (w <= (width - 1) - 16) {
+ uint8x16_t mat[3][3];
+ mat[0][0] = vld1q_u8(src_ptr - stride - 1);
+ mat[0][1] = vld1q_u8(src_ptr - stride);
+ mat[0][2] = vld1q_u8(src_ptr - stride + 1);
+ mat[1][0] = vld1q_u8(src_ptr - 1);
+ mat[1][1] = vld1q_u8(src_ptr);
+ mat[1][2] = vld1q_u8(src_ptr + 1);
+ mat[2][0] = vld1q_u8(src_ptr + stride - 1);
+ mat[2][1] = vld1q_u8(src_ptr + stride);
+ mat[2][2] = vld1q_u8(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x8_t gxa_lo =
+ vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[2][0]));
+ uint16x8_t gxa_hi =
+ vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[2][0]));
+ uint16x8_t gxb_lo =
+ vaddl_u8(vget_low_u8(mat[0][2]), vget_low_u8(mat[2][2]));
+ uint16x8_t gxb_hi =
+ vaddl_u8(vget_high_u8(mat[0][2]), vget_high_u8(mat[2][2]));
+ gxa_lo = vaddq_u16(
+ gxa_lo, vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][0])));
+ gxa_hi = vaddq_u16(
+ gxa_hi, vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][0])));
+ gxb_lo = vaddq_u16(
+ gxb_lo, vaddl_u8(vget_low_u8(mat[1][2]), vget_low_u8(mat[1][2])));
+ gxb_hi = vaddq_u16(
+ gxb_hi, vaddl_u8(vget_high_u8(mat[1][2]), vget_high_u8(mat[1][2])));
+
+ uint16x8_t gya_lo =
+ vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2]));
+ uint16x8_t gya_hi =
+ vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2]));
+ uint16x8_t gyb_lo =
+ vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2]));
+ uint16x8_t gyb_hi =
+ vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2]));
+ gya_lo = vaddq_u16(
+ gya_lo, vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[0][1])));
+ gya_hi = vaddq_u16(
+ gya_hi, vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[0][1])));
+ gyb_lo = vaddq_u16(
+ gyb_lo, vaddl_u8(vget_low_u8(mat[2][1]), vget_low_u8(mat[2][1])));
+ gyb_hi = vaddq_u16(
+ gyb_hi, vaddl_u8(vget_high_u8(mat[2][1]), vget_high_u8(mat[2][1])));
+
+ uint16x8_t ga_lo = vabaq_u16(vabdq_u16(gxa_lo, gxb_lo), gya_lo, gyb_lo);
+ uint16x8_t ga_hi = vabaq_u16(vabdq_u16(gxa_hi, gxb_hi), gya_hi, gyb_hi);
+
+ // Check which vector elements are under the threshold. The Laplacian is
+ // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16_lo = vcltq_u16(ga_lo, thresh);
+ uint16x8_t thresh_u16_hi = vcltq_u16(ga_hi, thresh);
+
+ uint16x8_t center_lo = vshll_n_u8(vget_low_u8(mat[1][1]), 2);
+ uint16x8_t center_hi = vshll_n_u8(vget_high_u8(mat[1][1]), 2);
+
+ uint16x8_t adj0_lo =
+ vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[2][1]));
+ uint16x8_t adj0_hi =
+ vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[2][1]));
+ uint16x8_t adj1_lo =
+ vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][2]));
+ uint16x8_t adj1_hi =
+ vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][2]));
+ uint16x8_t adj_lo = vaddq_u16(adj0_lo, adj1_lo);
+ adj_lo = vaddq_u16(adj_lo, adj_lo);
+ uint16x8_t adj_hi = vaddq_u16(adj0_hi, adj1_hi);
+ adj_hi = vaddq_u16(adj_hi, adj_hi);
+
+ uint16x8_t diag0_lo =
+ vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2]));
+ uint16x8_t diag0_hi =
+ vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2]));
+ uint16x8_t diag1_lo =
+ vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2]));
+ uint16x8_t diag1_hi =
+ vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2]));
+ uint16x8_t diag_lo = vaddq_u16(diag0_lo, diag1_lo);
+ uint16x8_t diag_hi = vaddq_u16(diag0_hi, diag1_hi);
+
+ uint16x8_t v_lo = vaddq_u16(center_lo, diag_lo);
+ v_lo = vabdq_u16(v_lo, adj_lo);
+ uint16x8_t v_hi = vaddq_u16(center_hi, diag_hi);
+ v_hi = vabdq_u16(v_hi, adj_hi);
+
+ acc = vpadalq_u16(acc, vandq_u16(v_lo, thresh_u16_lo));
+ acc = vpadalq_u16(acc, vandq_u16(v_hi, thresh_u16_hi));
+
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_lo));
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_hi));
+
+ w += 16;
+ src_ptr += 16;
+ }
+
+ if (w <= (width - 1) - 8) {
+ uint8x8_t mat[3][3];
+ mat[0][0] = vld1_u8(src_ptr - stride - 1);
+ mat[0][1] = vld1_u8(src_ptr - stride);
+ mat[0][2] = vld1_u8(src_ptr - stride + 1);
+ mat[1][0] = vld1_u8(src_ptr - 1);
+ mat[1][1] = vld1_u8(src_ptr);
+ mat[1][2] = vld1_u8(src_ptr + 1);
+ mat[2][0] = vld1_u8(src_ptr + stride - 1);
+ mat[2][1] = vld1_u8(src_ptr + stride);
+ mat[2][2] = vld1_u8(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]);
+ uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]);
+ gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0]));
+ gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2]));
+
+ uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]);
+ uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]);
+ gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1]));
+ gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1]));
+
+ uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+
+ // Check which vector elements are under the threshold. The Laplacian is
+ // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16 = vcltq_u16(ga, thresh);
+
+ uint16x8_t center = vshll_n_u8(mat[1][1], 2);
+
+ uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]);
+ uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]);
+ uint16x8_t adj = vaddq_u16(adj0, adj1);
+ adj = vaddq_u16(adj, adj);
+
+ uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]);
+ uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]);
+ uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+ uint16x8_t v = vaddq_u16(center, diag);
+ v = vabdq_u16(v, adj);
+
+ acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16));
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+ w += 8;
+ src_ptr += 8;
+ }
+
+ if (w <= (width - 1) - 4) {
+ uint16x8_t mask = vcombine_u16(vdup_n_u16(65535), vdup_n_u16(0));
+ uint8x8_t mat[3][3];
+ mat[0][0] = load_u8_4x1(src_ptr - stride - 1);
+ mat[0][1] = load_u8_4x1(src_ptr - stride);
+ mat[0][2] = load_u8_4x1(src_ptr - stride + 1);
+ mat[1][0] = load_u8_4x1(src_ptr - 1);
+ mat[1][1] = load_u8_4x1(src_ptr);
+ mat[1][2] = load_u8_4x1(src_ptr + 1);
+ mat[2][0] = load_u8_4x1(src_ptr + stride - 1);
+ mat[2][1] = load_u8_4x1(src_ptr + stride);
+ mat[2][2] = load_u8_4x1(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]);
+ uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]);
+ gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0]));
+ gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2]));
+
+ uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]);
+ uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]);
+ gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1]));
+ gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1]));
+
+ uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);
+
+ // Check which vector elements are under the threshold. The Laplacian is
+ // then unconditionally computed and we accumulate zeros if we're not
+ // under the threshold. This is much faster than using an if statement.
+ uint16x8_t thresh_u16 = vandq_u16(vcltq_u16(ga, thresh), mask);
+
+ uint16x8_t center = vshll_n_u8(mat[1][1], 2);
+
+ uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]);
+ uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]);
+ uint16x8_t adj = vaddq_u16(adj0, adj1);
+ adj = vaddq_u16(adj, adj);
+
+ uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]);
+ uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]);
+ uint16x8_t diag = vaddq_u16(diag0, diag1);
+
+ uint16x8_t v = vaddq_u16(center, diag);
+ v = vabdq_u16(v, adj);
+
+ acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16));
+ // Add -1 for each lane where the gradient is under the threshold.
+ count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));
+
+ w += 4;
+ src_ptr += 4;
+ }
+
+ while (w < width - 1) {
+ int mat[3][3];
+ mat[0][0] = *(src_ptr - stride - 1);
+ mat[0][1] = *(src_ptr - stride);
+ mat[0][2] = *(src_ptr - stride + 1);
+ mat[1][0] = *(src_ptr - 1);
+ mat[1][1] = *(src_ptr);
+ mat[1][2] = *(src_ptr + 1);
+ mat[2][0] = *(src_ptr + stride - 1);
+ mat[2][1] = *(src_ptr + stride);
+ mat[2][2] = *(src_ptr + stride + 1);
+
+ // Compute Sobel gradients.
+ const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+ 2 * (mat[1][0] - mat[1][2]);
+ const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+ 2 * (mat[0][1] - mat[2][1]);
+ const int ga = abs(gx) + abs(gy);
+
+ // Accumulate Laplacian.
+ const int is_under = ga < edge_thresh;
+ const int v = 4 * mat[1][1] -
+ 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+ (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+ final_acc += abs(v) * is_under;
+ final_count += is_under;
+
+ src_ptr++;
+ w++;
+ }
+ src_start += stride;
+ } while (++h < height - 1);
+
+ // We counted negatively, so subtract to get the final value.
+ final_count -= horizontal_add_s32x4(count);
+ final_acc += horizontal_long_add_u32x4(acc);
+ return (final_count < 16)
+ ? -1.0
+ : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2;
+}
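The closing expression implements Immerkaer's fast noise estimation method; a sketch of the math it encodes:

// The return value computes
//   sigma ~= sqrt(pi / 2) * (1 / (6 * count)) * sum(|L(p)|)
// where L is the 3x3 Laplacian-style kernel
//   [  1 -2  1 ]
//   [ -2  4 -2 ]
//   [  1 -2  1 ]
// and the sum runs only over pixels whose Sobel gradient magnitude is below
// edge_thresh, so strong edges do not inflate the estimate. Fewer than 16
// qualifying pixels is reported as -1.0, i.e. "no reliable estimate".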
diff --git a/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
new file mode 100644
index 0000000000..5a52e701a2
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+// For the squared error buffer, add padding for 4 samples.
+#define SSE_STRIDE (BW + 4)
+
+// clang-format off
+
+DECLARE_ALIGNED(16, static const uint8_t, kSlidingWindowMask[]) = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00,
+ 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00,
+ 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00,
+ 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+// clang-format on
+
+static INLINE void get_abs_diff(const uint8_t *frame1, const uint32_t stride1,
+ const uint8_t *frame2, const uint32_t stride2,
+ const uint32_t block_width,
+ const uint32_t block_height,
+ uint8_t *frame_abs_diff,
+ const unsigned int dst_stride) {
+ uint8_t *dst = frame_abs_diff;
+
+ uint32_t i = 0;
+ do {
+ uint32_t j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j);
+ uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j);
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ vst1q_u8(dst + j + 2, abs_diff);
+ j += 16;
+ } while (j < block_width);
+
+ dst += dst_stride;
+ } while (++i < block_height);
+}
+
+static INLINE uint8x16_t load_and_pad(const uint8_t *src, const uint32_t col,
+ const uint32_t block_width) {
+ uint8x8_t s = vld1_u8(src);
+
+ if (col == 0) {
+ const uint8_t lane2 = vget_lane_u8(s, 2);
+ s = vset_lane_u8(lane2, s, 0);
+ s = vset_lane_u8(lane2, s, 1);
+ } else if (col >= block_width - 4) {
+ const uint8_t lane5 = vget_lane_u8(s, 5);
+ s = vset_lane_u8(lane5, s, 6);
+ s = vset_lane_u8(lane5, s, 7);
+ }
+ return vcombine_u8(s, s);
+}
+
+static void apply_temporal_filter(
+ const uint8_t *frame, const unsigned int stride, const uint32_t block_width,
+ const uint32_t block_height, const int *subblock_mses,
+ unsigned int *accumulator, uint16_t *count, const uint8_t *frame_abs_diff,
+ const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
+ const double decay_factor, const double inv_factor,
+ const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_neon[BH][BW];
+ const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask);
+
+ // Traverse 4 columns at a time - first and last two columns need padding.
+ for (uint32_t col = 0; col < block_width; col += 4) {
+ uint8x16_t vsrc[5][2];
+ const uint8_t *src = frame_abs_diff + col;
+
+ // Load, pad (for first and last two columns) and mask 3 rows from the top.
+ for (int i = 2; i < 5; i++) {
+ const uint8x16_t s = load_and_pad(src, col, block_width);
+ vsrc[i][0] = vandq_u8(s, vmask.val[0]);
+ vsrc[i][1] = vandq_u8(s, vmask.val[1]);
+ src += SSE_STRIDE;
+ }
+
+ // Pad the top 2 rows.
+ vsrc[0][0] = vsrc[2][0];
+ vsrc[0][1] = vsrc[2][1];
+ vsrc[1][0] = vsrc[2][0];
+ vsrc[1][1] = vsrc[2][1];
+
+ for (unsigned int row = 0; row < block_height; row++) {
+ uint32x4_t sum_01 = vdupq_n_u32(0);
+ uint32x4_t sum_23 = vdupq_n_u32(0);
+
+ sum_01 = vdotq_u32(sum_01, vsrc[0][0], vsrc[0][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[1][0], vsrc[1][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[2][0], vsrc[2][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[3][0], vsrc[3][0]);
+ sum_01 = vdotq_u32(sum_01, vsrc[4][0], vsrc[4][0]);
+
+ sum_23 = vdotq_u32(sum_23, vsrc[0][1], vsrc[0][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[1][1], vsrc[1][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[2][1], vsrc[2][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[3][1], vsrc[3][1]);
+ sum_23 = vdotq_u32(sum_23, vsrc[4][1], vsrc[4][1]);
+
+ vst1q_u32(&acc_5x5_neon[row][col], vpaddq_u32(sum_01, sum_23));
+
+ // Push all rows in the sliding window up one.
+ for (int i = 0; i < 4; i++) {
+ vsrc[i][0] = vsrc[i + 1][0];
+ vsrc[i][1] = vsrc[i + 1][1];
+ }
+
+ if (row <= block_height - 4) {
+ // Load next row into the bottom of the sliding window.
+ uint8x16_t s = load_and_pad(src, col, block_width);
+ vsrc[4][0] = vandq_u8(s, vmask.val[0]);
+ vsrc[4][1] = vandq_u8(s, vmask.val[1]);
+ src += SSE_STRIDE;
+ } else {
+ // Pad the bottom 2 rows.
+ vsrc[4][0] = vsrc[3][0];
+ vsrc[4][1] = vsrc[3][1];
+ }
+ }
+ }
+
+ // Perform filtering.
+ if (tf_wgt_calc_lvl == 0) {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ } else {
+ for (unsigned int i = 0, k = 0; i < block_height; i++) {
+ for (unsigned int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame[i * stride + j];
+ const uint32_t diff_sse = acc_5x5_neon[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx =
+ (i >= block_height / 2) * 2 + (j >= block_width / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor;
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+ accumulator[k] += weight * pixel_value;
+ count[k] += weight;
+ }
+ }
+ }
+}
+
+void av1_apply_temporal_filter_neon_dotprod(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
+ assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
+ assert(!is_high_bitdepth && "Only support low bit-depth with Neon!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ // Frame information.
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint8_t frame_abs_diff[SSE_STRIDE * BH] = { 0 };
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride =
+ frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+ // Filter U-plane and V-plane using Y-plane. This is because motion
+ // search is only done on Y-plane, so the information from Y-plane
+ // will be more accurate. The luma sse sum is reused in both chroma
+ // planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] +=
+ (frame_abs_diff[yy * SSE_STRIDE + xx + 2] *
+ frame_abs_diff[yy * SSE_STRIDE + xx + 2]);
+ }
+ }
+ }
+ }
+ }
+
+ get_abs_diff(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
+ plane_h, frame_abs_diff, SSE_STRIDE);
+
+ apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h,
+ subblock_mses, accum + plane_offset,
+ count + plane_offset, frame_abs_diff, luma_sse_sum,
+ inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl);
+
+ plane_offset += plane_h * plane_w;
+ }
+}
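The dotprod variant stores 8-bit absolute differences rather than 16-bit squared errors because the UDOT instruction can square and accumulate in a single step. A sketch of what each vdotq_u32 call above contributes:

// Per 32-bit lane k, vdotq_u32(sum, v, v) computes
//   sum[k] += v[4k]^2 + v[4k+1]^2 + v[4k+2]^2 + v[4k+3]^2
// Squaring the masked absolute differences against themselves and
// accumulating five window rows therefore yields the same 5x5 sum of squared
// errors as the plain Neon version, with no 16-bit widening multiplies.
// This cannot overflow: each |diff| <= 255 and 25 * 255^2 < 2^32.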
diff --git a/third_party/aom/av1/encoder/arm/neon/txfm_neon.h b/third_party/aom/av1/encoder/arm/neon/txfm_neon.h
new file mode 100644
index 0000000000..635364f46a
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/txfm_neon.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_
+#define AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_
+
+#include "aom/aom_integer.h" // For AOM_INLINE.
+
+static AOM_INLINE void ud_adjust_input_and_stride(int ud_flip,
+ const int16_t **input,
+ int *stride, int out_size) {
+ if (ud_flip) {
+ *input = *input + (out_size - 1) * *stride;
+ *stride = -*stride;
+ }
+}
+
+#endif // AOM_AV1_ENCODER_ARM_NEON_TXFM_NEON_H_
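A short usage sketch of the helper above: for an up-down flipped transform the input pointer is moved to the last row and the stride negated, so row indexing walks the block bottom-up. The buffer names here are hypothetical:

// Read a 4-row block bottom-up when ud_flip is set.
const int16_t *in = buf;  // buf and buf_stride are illustrative names.
int stride = buf_stride;
ud_adjust_input_and_stride(/*ud_flip=*/1, &in, &stride, /*out_size=*/4);
// in now points at row 3 and stride is negative, so in[i * stride]
// visits rows 3, 2, 1, 0 for i = 0..3.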
diff --git a/third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c b/third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c
new file mode 100644
index 0000000000..1b35269b33
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/wedge_utils_neon.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c for details of the parameters and
+ * computation.
+ */
+uint64_t av1_wedge_sse_from_residuals_neon(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ assert(N % 64 == 0);
+
+ uint64x2_t v_csse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ int i = 0;
+ do {
+ int32x4_t sum[4];
+ int32x4_t sse[2];
+ int16x4_t sum_s16[4];
+
+ const int16x8_t r1_l = vld1q_s16(r1 + i);
+ const int16x8_t r1_h = vld1q_s16(r1 + i + 8);
+ const int16x8_t d_l = vld1q_s16(d + i);
+ const int16x8_t d_h = vld1q_s16(d + i + 8);
+ // The following three lines are a bit inelegant compared to using a pair
+    // of vmovl_u8()... but they force the compiler to emit a ZIP1, ZIP2 pair,
+    // which can be executed in parallel with the subsequent SSHL instructions.
+ // (SSHL can only be executed on half of the Neon pipes in modern Arm
+ // cores, whereas ZIP1/2 can be executed on all of them.)
+ const uint8x16x2_t m_u16 = vzipq_u8(vld1q_u8(m + i), vdupq_n_u8(0));
+ const int16x8_t m_l = vreinterpretq_s16_u8(m_u16.val[0]);
+ const int16x8_t m_h = vreinterpretq_s16_u8(m_u16.val[1]);
+
+ sum[0] = vshll_n_s16(vget_low_s16(r1_l), WEDGE_WEIGHT_BITS);
+ sum[1] = vshll_n_s16(vget_high_s16(r1_l), WEDGE_WEIGHT_BITS);
+ sum[2] = vshll_n_s16(vget_low_s16(r1_h), WEDGE_WEIGHT_BITS);
+ sum[3] = vshll_n_s16(vget_high_s16(r1_h), WEDGE_WEIGHT_BITS);
+
+ sum[0] = vmlal_s16(sum[0], vget_low_s16(m_l), vget_low_s16(d_l));
+ sum[1] = vmlal_s16(sum[1], vget_high_s16(m_l), vget_high_s16(d_l));
+ sum[2] = vmlal_s16(sum[2], vget_low_s16(m_h), vget_low_s16(d_h));
+ sum[3] = vmlal_s16(sum[3], vget_high_s16(m_h), vget_high_s16(d_h));
+
+ sum_s16[0] = vqmovn_s32(sum[0]);
+ sum_s16[1] = vqmovn_s32(sum[1]);
+ sum_s16[2] = vqmovn_s32(sum[2]);
+ sum_s16[3] = vqmovn_s32(sum[3]);
+
+ sse[0] = vmull_s16(sum_s16[0], sum_s16[0]);
+ sse[1] = vmull_s16(sum_s16[2], sum_s16[2]);
+ sse[0] = vmlal_s16(sse[0], sum_s16[1], sum_s16[1]);
+ sse[1] = vmlal_s16(sse[1], sum_s16[3], sum_s16[3]);
+
+ v_csse[0] = vpadalq_u32(v_csse[0], vreinterpretq_u32_s32(sse[0]));
+ v_csse[1] = vpadalq_u32(v_csse[1], vreinterpretq_u32_s32(sse[1]));
+
+ i += 16;
+ } while (i < N);
+
+ uint64_t csse = horizontal_add_u64x2(vaddq_u64(v_csse[0], v_csse[1]));
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
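A sketch of the scalar form this vector code mirrors (compare av1_wedge_sse_from_residuals_c): each sample blends the residuals in Q(WEDGE_WEIGHT_BITS) mask precision, saturates to int16 (the vqmovn_s32 step above), then squares and accumulates:

// Scalar sketch; the clamp mirrors the vqmovn_s32 saturation above.
uint64_t csse = 0;
for (int i = 0; i < N; ++i) {
  int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
  t = t < INT16_MIN ? INT16_MIN : (t > INT16_MAX ? INT16_MAX : t);
  csse += (uint64_t)((int64_t)t * t);
}
return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);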
+
+int8_t av1_wedge_sign_from_residuals_neon(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int32x4_t acc[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+
+ do {
+ int16x8_t ds_l = vld1q_s16(ds);
+ int16x8_t ds_h = vld1q_s16(ds + 8);
+
+ int8x16_t m_s8 = vreinterpretq_s8_u8(vld1q_u8(m));
+ int16x8_t m_l = vmovl_s8(vget_low_s8(m_s8));
+ int16x8_t m_h = vmovl_s8(vget_high_s8(m_s8));
+
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(ds_l), vget_low_s16(m_l));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(ds_l), vget_high_s16(m_l));
+ acc[2] = vmlal_s16(acc[2], vget_low_s16(ds_h), vget_low_s16(m_h));
+ acc[3] = vmlal_s16(acc[3], vget_high_s16(ds_h), vget_high_s16(m_h));
+
+ ds += 16;
+ m += 16;
+ N -= 16;
+ } while (N != 0);
+
+ int64x2_t sum = vpaddlq_s32(acc[0]);
+ sum = vpadalq_s32(sum, acc[1]);
+ sum = vpadalq_s32(sum, acc[2]);
+ sum = vpadalq_s32(sum, acc[3]);
+
+ return (horizontal_add_s64x2(sum) > limit);
+}
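The decision above reduces to a single mask-weighted sum:

// Scalar form of the sign decision:
//   sign = (sum over i of m[i] * ds[i]) > limit
// where ds[i] is the precomputed difference of squared residuals
// (a[i]^2 - b[i]^2, see av1_wedge_compute_delta_squares below) and limit is
// a caller-supplied threshold.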
+
+void av1_wedge_compute_delta_squares_neon(int16_t *d_ptr, const int16_t *a_ptr,
+ const int16_t *b_ptr, int N) {
+ do {
+ int16x8_t a = vld1q_s16(a_ptr);
+ int16x8_t b = vld1q_s16(b_ptr);
+
+ int32x4_t sq_lo = vmull_s16(vget_low_s16(a), vget_low_s16(a));
+ int32x4_t sq_hi = vmull_s16(vget_high_s16(a), vget_high_s16(a));
+
+ sq_lo = vmlsl_s16(sq_lo, vget_low_s16(b), vget_low_s16(b));
+ sq_hi = vmlsl_s16(sq_hi, vget_high_s16(b), vget_high_s16(b));
+
+ int16x8_t res = vcombine_s16(vqmovn_s32(sq_lo), vqmovn_s32(sq_hi));
+
+ vst1q_s16(d_ptr, res);
+
+ d_ptr += 8;
+ a_ptr += 8;
+ b_ptr += 8;
+ N -= 8;
+ } while (N != 0);
+}
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
new file mode 100644
index 0000000000..6601c19ab3
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
@@ -0,0 +1,1885 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "av1/common/av1_txfm.h"
+
+void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 4;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[4];
+
+ // stage 0;
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[3];
+ bf1[1] = input[1] + input[2];
+ bf1[2] = -input[2] + input[1];
+ bf1[3] = -input[3] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[2];
+ bf1[2] = bf0[1];
+ bf1[3] = bf0[3];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
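Every half_btf call in these transforms is a fixed-point rotation butterfly: a weighted sum of two inputs rounded back down by cos_bit bits, where cospi[j] holds cos(j * pi / 128) scaled by 2^cos_bit. A sketch mirroring half_btf / round_shift from av1/common/av1_txfm.h:

static inline int32_t half_btf_sketch(int32_t w0, int32_t in0, int32_t w1,
                                      int32_t in1, int bit) {
  const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
  // round_shift: add half an LSB of the result, then shift down.
  return (int32_t)((sum + ((int64_t)1 << (bit - 1))) >> bit);
}

For example, stage 2 above computes bf1[0] = round(cos(pi / 4) * (bf0[0] + bf0[1])) in cos_bit fixed point, the DC basis of the 4-point DCT-II.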
+
+void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+ // stage 0;
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[7];
+ bf1[1] = input[1] + input[6];
+ bf1[2] = input[2] + input[5];
+ bf1[3] = input[3] + input[4];
+ bf1[4] = -input[4] + input[3];
+ bf1[5] = -input[5] + input[2];
+ bf1[6] = -input[6] + input[1];
+ bf1[7] = -input[7] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[4];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[6];
+ bf1[4] = bf0[1];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[3];
+ bf1[7] = bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+ // stage 0
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[15];
+ bf1[1] = input[1] + input[14];
+ bf1[2] = input[2] + input[13];
+ bf1[3] = input[3] + input[12];
+ bf1[4] = input[4] + input[11];
+ bf1[5] = input[5] + input[10];
+ bf1[6] = input[6] + input[9];
+ bf1[7] = input[7] + input[8];
+ bf1[8] = -input[8] + input[7];
+ bf1[9] = -input[9] + input[6];
+ bf1[10] = -input[10] + input[5];
+ bf1[11] = -input[11] + input[4];
+ bf1[12] = -input[12] + input[3];
+ bf1[13] = -input[13] + input[2];
+ bf1[14] = -input[14] + input[1];
+ bf1[15] = -input[15] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[8];
+ bf1[2] = bf0[4];
+ bf1[3] = bf0[12];
+ bf1[4] = bf0[2];
+ bf1[5] = bf0[10];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[14];
+ bf1[8] = bf0[1];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[5];
+ bf1[11] = bf0[13];
+ bf1[12] = bf0[3];
+ bf1[13] = bf0[11];
+ bf1[14] = bf0[7];
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 32;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[32];
+
+ // stage 0
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[31];
+ bf1[1] = input[1] + input[30];
+ bf1[2] = input[2] + input[29];
+ bf1[3] = input[3] + input[28];
+ bf1[4] = input[4] + input[27];
+ bf1[5] = input[5] + input[26];
+ bf1[6] = input[6] + input[25];
+ bf1[7] = input[7] + input[24];
+ bf1[8] = input[8] + input[23];
+ bf1[9] = input[9] + input[22];
+ bf1[10] = input[10] + input[21];
+ bf1[11] = input[11] + input[20];
+ bf1[12] = input[12] + input[19];
+ bf1[13] = input[13] + input[18];
+ bf1[14] = input[14] + input[17];
+ bf1[15] = input[15] + input[16];
+ bf1[16] = -input[16] + input[15];
+ bf1[17] = -input[17] + input[14];
+ bf1[18] = -input[18] + input[13];
+ bf1[19] = -input[19] + input[12];
+ bf1[20] = -input[20] + input[11];
+ bf1[21] = -input[21] + input[10];
+ bf1[22] = -input[22] + input[9];
+ bf1[23] = -input[23] + input[8];
+ bf1[24] = -input[24] + input[7];
+ bf1[25] = -input[25] + input[6];
+ bf1[26] = -input[26] + input[5];
+ bf1[27] = -input[27] + input[4];
+ bf1[28] = -input[28] + input[3];
+ bf1[29] = -input[29] + input[2];
+ bf1[30] = -input[30] + input[1];
+ bf1[31] = -input[31] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = -bf0[8] + bf0[7];
+ bf1[9] = -bf0[9] + bf0[6];
+ bf1[10] = -bf0[10] + bf0[5];
+ bf1[11] = -bf0[11] + bf0[4];
+ bf1[12] = -bf0[12] + bf0[3];
+ bf1[13] = -bf0[13] + bf0[2];
+ bf1[14] = -bf0[14] + bf0[1];
+ bf1[15] = -bf0[15] + bf0[0];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = -bf0[20] + bf0[19];
+ bf1[21] = -bf0[21] + bf0[18];
+ bf1[22] = -bf0[22] + bf0[17];
+ bf1[23] = -bf0[23] + bf0[16];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[28] + bf0[27];
+ bf1[29] = bf0[29] + bf0[26];
+ bf1[30] = bf0[30] + bf0[25];
+ bf1[31] = bf0[31] + bf0[24];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = -bf0[18] + bf0[17];
+ bf1[19] = -bf0[19] + bf0[16];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[22] + bf0[21];
+ bf1[23] = bf0[23] + bf0[20];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = -bf0[26] + bf0[25];
+ bf1[27] = -bf0[27] + bf0[24];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[30] + bf0[29];
+ bf1[31] = bf0[31] + bf0[28];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = -bf0[17] + bf0[16];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[19] + bf0[18];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = -bf0[21] + bf0[20];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[23] + bf0[22];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = -bf0[25] + bf0[24];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[27] + bf0[26];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = -bf0[29] + bf0[28];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[31] + bf0[30];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
+ bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
+ bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
+ bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
+ bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
+ bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
+ bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
+ bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
+ bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[16];
+ bf1[2] = bf0[8];
+ bf1[3] = bf0[24];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[20];
+ bf1[6] = bf0[12];
+ bf1[7] = bf0[28];
+ bf1[8] = bf0[2];
+ bf1[9] = bf0[18];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[26];
+ bf1[12] = bf0[6];
+ bf1[13] = bf0[22];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[30];
+ bf1[16] = bf0[1];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[9];
+ bf1[19] = bf0[25];
+ bf1[20] = bf0[5];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[13];
+ bf1[23] = bf0[29];
+ bf1[24] = bf0[3];
+ bf1[25] = bf0[19];
+ bf1[26] = bf0[11];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[7];
+ bf1[29] = bf0[23];
+ bf1[30] = bf0[15];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
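+// The 4-point ADST does not use the butterfly pattern above: it is computed
+// directly from the sinpi_arr() fixed-point sine constants, with an early
+// exit when all four inputs are zero.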
+void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ int bit = cos_bit;
+ const int32_t *sinpi = sinpi_arr(bit);
+ int32_t x0, x1, x2, x3;
+ int32_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ // stage 0
+ av1_range_check_buf(0, input, input, 4, stage_range[0]);
+ x0 = input[0];
+ x1 = input[1];
+ x2 = input[2];
+ x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]);
+ s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]);
+ s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]);
+ s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]);
+ s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]);
+ s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]);
+ s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]);
+ s7 = range_check_value(x0 + x1, stage_range[1]);
+
+ // stage 2
+ s7 = range_check_value(s7 - x3, stage_range[2]);
+
+ // stage 3
+ x0 = range_check_value(s0 + s2, bit + stage_range[3]);
+ x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]);
+ x2 = range_check_value(s1 - s3, bit + stage_range[3]);
+ x3 = range_check_value(s4, bit + stage_range[3]);
+
+ // stage 4
+ x0 = range_check_value(x0 + s5, bit + stage_range[4]);
+ x2 = range_check_value(x2 + s6, bit + stage_range[4]);
+
+ // stage 5
+ s0 = range_check_value(x0 + x3, bit + stage_range[5]);
+ s1 = range_check_value(x1, bit + stage_range[5]);
+ s2 = range_check_value(x2 - x3, bit + stage_range[5]);
+ s3 = range_check_value(x2 - x0, bit + stage_range[5]);
+
+ // stage 6
+ s3 = range_check_value(s3 + x3, bit + stage_range[6]);
+
+ // 1-D transform scaling factor is sqrt(2).
+ output[0] = round_shift(s0, bit);
+ output[1] = round_shift(s1, bit);
+ output[2] = round_shift(s2, bit);
+ output[3] = round_shift(s3, bit);
+ av1_range_check_buf(6, input, output, 4, stage_range[6]);
+}
+
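+// The 8- and 16-point ADSTs reuse the butterfly/ping-pong scheme of the
+// DCTs above, with ADST-specific sign flips in stage 1 and a final output
+// permutation that is not the DCT's bit-reversed order.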
+void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+ // stage 0
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
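+ // Stage 1 reads input[] after it has started writing output[], so the
+ // transform cannot be computed in place.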
+ assert(output != input);
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[7];
+ bf1[2] = -input[3];
+ bf1[3] = input[4];
+ bf1[4] = -input[1];
+ bf1[5] = input[6];
+ bf1[6] = input[2];
+ bf1[7] = -input[5];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = bf0[4] - bf0[6];
+ bf1[7] = bf0[5] - bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = bf0[0] - bf0[4];
+ bf1[5] = bf0[1] - bf0[5];
+ bf1[6] = bf0[2] - bf0[6];
+ bf1[7] = bf0[3] - bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
+ bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
+ bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[6];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[4];
+ bf1[4] = bf0[5];
+ bf1[5] = bf0[2];
+ bf1[6] = bf0[7];
+ bf1[7] = bf0[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+ // stage 0
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ assert(output != input);
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[15];
+ bf1[2] = -input[7];
+ bf1[3] = input[8];
+ bf1[4] = -input[3];
+ bf1[5] = input[12];
+ bf1[6] = input[4];
+ bf1[7] = -input[11];
+ bf1[8] = -input[1];
+ bf1[9] = input[14];
+ bf1[10] = input[6];
+ bf1[11] = -input[9];
+ bf1[12] = input[2];
+ bf1[13] = -input[13];
+ bf1[14] = -input[5];
+ bf1[15] = input[10];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = bf0[4] - bf0[6];
+ bf1[7] = bf0[5] - bf0[7];
+ bf1[8] = bf0[8] + bf0[10];
+ bf1[9] = bf0[9] + bf0[11];
+ bf1[10] = bf0[8] - bf0[10];
+ bf1[11] = bf0[9] - bf0[11];
+ bf1[12] = bf0[12] + bf0[14];
+ bf1[13] = bf0[13] + bf0[15];
+ bf1[14] = bf0[12] - bf0[14];
+ bf1[15] = bf0[13] - bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
+ bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = bf0[0] - bf0[4];
+ bf1[5] = bf0[1] - bf0[5];
+ bf1[6] = bf0[2] - bf0[6];
+ bf1[7] = bf0[3] - bf0[7];
+ bf1[8] = bf0[8] + bf0[12];
+ bf1[9] = bf0[9] + bf0[13];
+ bf1[10] = bf0[10] + bf0[14];
+ bf1[11] = bf0[11] + bf0[15];
+ bf1[12] = bf0[8] - bf0[12];
+ bf1[13] = bf0[9] - bf0[13];
+ bf1[14] = bf0[10] - bf0[14];
+ bf1[15] = bf0[11] - bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
+ bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
+ bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
+ bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
+ bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[8];
+ bf1[1] = bf0[1] + bf0[9];
+ bf1[2] = bf0[2] + bf0[10];
+ bf1[3] = bf0[3] + bf0[11];
+ bf1[4] = bf0[4] + bf0[12];
+ bf1[5] = bf0[5] + bf0[13];
+ bf1[6] = bf0[6] + bf0[14];
+ bf1[7] = bf0[7] + bf0[15];
+ bf1[8] = bf0[0] - bf0[8];
+ bf1[9] = bf0[1] - bf0[9];
+ bf1[10] = bf0[2] - bf0[10];
+ bf1[11] = bf0[3] - bf0[11];
+ bf1[12] = bf0[4] - bf0[12];
+ bf1[13] = bf0[5] - bf0[13];
+ bf1[14] = bf0[6] - bf0[14];
+ bf1[15] = bf0[7] - bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
+ bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
+ bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
+ bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
+ bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
+ bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
+ bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[14];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[12];
+ bf1[4] = bf0[5];
+ bf1[5] = bf0[10];
+ bf1[6] = bf0[7];
+ bf1[7] = bf0[8];
+ bf1[8] = bf0[9];
+ bf1[9] = bf0[6];
+ bf1[10] = bf0[11];
+ bf1[11] = bf0[4];
+ bf1[12] = bf0[13];
+ bf1[13] = bf0[2];
+ bf1[14] = bf0[15];
+ bf1[15] = bf0[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
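+// The identity "transforms" only rescale: by sqrt(2) for N = 4, by 2 for
+// N = 8, by 2 * sqrt(2) for N = 16 and by 4 for N = 32, i.e. by
+// sqrt(N / 2), matching the per-basis-vector gain of the other 1-D kernels.
+// NewSqrt2 is sqrt(2) in Q(NewSqrt2Bits) fixed point.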
+void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ (void)cos_bit;
+ for (int i = 0; i < 4; ++i)
+ output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits);
+ assert(stage_range[0] + NewSqrt2Bits <= 32);
+ av1_range_check_buf(0, input, output, 4, stage_range[0]);
+}
+
+void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ (void)cos_bit;
+ for (int i = 0; i < 8; ++i) output[i] = input[i] * 2;
+ av1_range_check_buf(0, input, output, 8, stage_range[0]);
+}
+
+void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ (void)cos_bit;
+ for (int i = 0; i < 16; ++i)
+ output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits);
+ assert(stage_range[0] + NewSqrt2Bits <= 32);
+ av1_range_check_buf(0, input, output, 16, stage_range[0]);
+}
+
+void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ (void)cos_bit;
+ for (int i = 0; i < 32; ++i) output[i] = input[i] * 4;
+ av1_range_check_buf(0, input, output, 32, stage_range[0]);
+}
+
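+// The 64-point DCT extends the same construction through eleven stages;
+// as above, its final stage emits coefficients in bit-reversed order.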
+void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 64;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[64];
+
+ // stage 0
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[63];
+ bf1[1] = input[1] + input[62];
+ bf1[2] = input[2] + input[61];
+ bf1[3] = input[3] + input[60];
+ bf1[4] = input[4] + input[59];
+ bf1[5] = input[5] + input[58];
+ bf1[6] = input[6] + input[57];
+ bf1[7] = input[7] + input[56];
+ bf1[8] = input[8] + input[55];
+ bf1[9] = input[9] + input[54];
+ bf1[10] = input[10] + input[53];
+ bf1[11] = input[11] + input[52];
+ bf1[12] = input[12] + input[51];
+ bf1[13] = input[13] + input[50];
+ bf1[14] = input[14] + input[49];
+ bf1[15] = input[15] + input[48];
+ bf1[16] = input[16] + input[47];
+ bf1[17] = input[17] + input[46];
+ bf1[18] = input[18] + input[45];
+ bf1[19] = input[19] + input[44];
+ bf1[20] = input[20] + input[43];
+ bf1[21] = input[21] + input[42];
+ bf1[22] = input[22] + input[41];
+ bf1[23] = input[23] + input[40];
+ bf1[24] = input[24] + input[39];
+ bf1[25] = input[25] + input[38];
+ bf1[26] = input[26] + input[37];
+ bf1[27] = input[27] + input[36];
+ bf1[28] = input[28] + input[35];
+ bf1[29] = input[29] + input[34];
+ bf1[30] = input[30] + input[33];
+ bf1[31] = input[31] + input[32];
+ bf1[32] = -input[32] + input[31];
+ bf1[33] = -input[33] + input[30];
+ bf1[34] = -input[34] + input[29];
+ bf1[35] = -input[35] + input[28];
+ bf1[36] = -input[36] + input[27];
+ bf1[37] = -input[37] + input[26];
+ bf1[38] = -input[38] + input[25];
+ bf1[39] = -input[39] + input[24];
+ bf1[40] = -input[40] + input[23];
+ bf1[41] = -input[41] + input[22];
+ bf1[42] = -input[42] + input[21];
+ bf1[43] = -input[43] + input[20];
+ bf1[44] = -input[44] + input[19];
+ bf1[45] = -input[45] + input[18];
+ bf1[46] = -input[46] + input[17];
+ bf1[47] = -input[47] + input[16];
+ bf1[48] = -input[48] + input[15];
+ bf1[49] = -input[49] + input[14];
+ bf1[50] = -input[50] + input[13];
+ bf1[51] = -input[51] + input[12];
+ bf1[52] = -input[52] + input[11];
+ bf1[53] = -input[53] + input[10];
+ bf1[54] = -input[54] + input[9];
+ bf1[55] = -input[55] + input[8];
+ bf1[56] = -input[56] + input[7];
+ bf1[57] = -input[57] + input[6];
+ bf1[58] = -input[58] + input[5];
+ bf1[59] = -input[59] + input[4];
+ bf1[60] = -input[60] + input[3];
+ bf1[61] = -input[61] + input[2];
+ bf1[62] = -input[62] + input[1];
+ bf1[63] = -input[63] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[31];
+ bf1[1] = bf0[1] + bf0[30];
+ bf1[2] = bf0[2] + bf0[29];
+ bf1[3] = bf0[3] + bf0[28];
+ bf1[4] = bf0[4] + bf0[27];
+ bf1[5] = bf0[5] + bf0[26];
+ bf1[6] = bf0[6] + bf0[25];
+ bf1[7] = bf0[7] + bf0[24];
+ bf1[8] = bf0[8] + bf0[23];
+ bf1[9] = bf0[9] + bf0[22];
+ bf1[10] = bf0[10] + bf0[21];
+ bf1[11] = bf0[11] + bf0[20];
+ bf1[12] = bf0[12] + bf0[19];
+ bf1[13] = bf0[13] + bf0[18];
+ bf1[14] = bf0[14] + bf0[17];
+ bf1[15] = bf0[15] + bf0[16];
+ bf1[16] = -bf0[16] + bf0[15];
+ bf1[17] = -bf0[17] + bf0[14];
+ bf1[18] = -bf0[18] + bf0[13];
+ bf1[19] = -bf0[19] + bf0[12];
+ bf1[20] = -bf0[20] + bf0[11];
+ bf1[21] = -bf0[21] + bf0[10];
+ bf1[22] = -bf0[22] + bf0[9];
+ bf1[23] = -bf0[23] + bf0[8];
+ bf1[24] = -bf0[24] + bf0[7];
+ bf1[25] = -bf0[25] + bf0[6];
+ bf1[26] = -bf0[26] + bf0[5];
+ bf1[27] = -bf0[27] + bf0[4];
+ bf1[28] = -bf0[28] + bf0[3];
+ bf1[29] = -bf0[29] + bf0[2];
+ bf1[30] = -bf0[30] + bf0[1];
+ bf1[31] = -bf0[31] + bf0[0];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = bf0[37];
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
+ bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
+ bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
+ bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
+ bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
+ bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
+ bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
+ bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
+ bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
+ bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
+ bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
+ bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = bf0[58];
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = -bf0[8] + bf0[7];
+ bf1[9] = -bf0[9] + bf0[6];
+ bf1[10] = -bf0[10] + bf0[5];
+ bf1[11] = -bf0[11] + bf0[4];
+ bf1[12] = -bf0[12] + bf0[3];
+ bf1[13] = -bf0[13] + bf0[2];
+ bf1[14] = -bf0[14] + bf0[1];
+ bf1[15] = -bf0[15] + bf0[0];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[47];
+ bf1[33] = bf0[33] + bf0[46];
+ bf1[34] = bf0[34] + bf0[45];
+ bf1[35] = bf0[35] + bf0[44];
+ bf1[36] = bf0[36] + bf0[43];
+ bf1[37] = bf0[37] + bf0[42];
+ bf1[38] = bf0[38] + bf0[41];
+ bf1[39] = bf0[39] + bf0[40];
+ bf1[40] = -bf0[40] + bf0[39];
+ bf1[41] = -bf0[41] + bf0[38];
+ bf1[42] = -bf0[42] + bf0[37];
+ bf1[43] = -bf0[43] + bf0[36];
+ bf1[44] = -bf0[44] + bf0[35];
+ bf1[45] = -bf0[45] + bf0[34];
+ bf1[46] = -bf0[46] + bf0[33];
+ bf1[47] = -bf0[47] + bf0[32];
+ bf1[48] = -bf0[48] + bf0[63];
+ bf1[49] = -bf0[49] + bf0[62];
+ bf1[50] = -bf0[50] + bf0[61];
+ bf1[51] = -bf0[51] + bf0[60];
+ bf1[52] = -bf0[52] + bf0[59];
+ bf1[53] = -bf0[53] + bf0[58];
+ bf1[54] = -bf0[54] + bf0[57];
+ bf1[55] = -bf0[55] + bf0[56];
+ bf1[56] = bf0[56] + bf0[55];
+ bf1[57] = bf0[57] + bf0[54];
+ bf1[58] = bf0[58] + bf0[53];
+ bf1[59] = bf0[59] + bf0[52];
+ bf1[60] = bf0[60] + bf0[51];
+ bf1[61] = bf0[61] + bf0[50];
+ bf1[62] = bf0[62] + bf0[49];
+ bf1[63] = bf0[63] + bf0[48];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = -bf0[20] + bf0[19];
+ bf1[21] = -bf0[21] + bf0[18];
+ bf1[22] = -bf0[22] + bf0[17];
+ bf1[23] = -bf0[23] + bf0[16];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[28] + bf0[27];
+ bf1[29] = bf0[29] + bf0[26];
+ bf1[30] = bf0[30] + bf0[25];
+ bf1[31] = bf0[31] + bf0[24];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
+ bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
+ bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
+ bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
+ bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
+ bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
+ bf1[44] = bf0[44];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = bf0[50];
+ bf1[51] = bf0[51];
+ bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
+ bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
+ bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
+ bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
+ bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
+ bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[39];
+ bf1[33] = bf0[33] + bf0[38];
+ bf1[34] = bf0[34] + bf0[37];
+ bf1[35] = bf0[35] + bf0[36];
+ bf1[36] = -bf0[36] + bf0[35];
+ bf1[37] = -bf0[37] + bf0[34];
+ bf1[38] = -bf0[38] + bf0[33];
+ bf1[39] = -bf0[39] + bf0[32];
+ bf1[40] = -bf0[40] + bf0[47];
+ bf1[41] = -bf0[41] + bf0[46];
+ bf1[42] = -bf0[42] + bf0[45];
+ bf1[43] = -bf0[43] + bf0[44];
+ bf1[44] = bf0[44] + bf0[43];
+ bf1[45] = bf0[45] + bf0[42];
+ bf1[46] = bf0[46] + bf0[41];
+ bf1[47] = bf0[47] + bf0[40];
+ bf1[48] = bf0[48] + bf0[55];
+ bf1[49] = bf0[49] + bf0[54];
+ bf1[50] = bf0[50] + bf0[53];
+ bf1[51] = bf0[51] + bf0[52];
+ bf1[52] = -bf0[52] + bf0[51];
+ bf1[53] = -bf0[53] + bf0[50];
+ bf1[54] = -bf0[54] + bf0[49];
+ bf1[55] = -bf0[55] + bf0[48];
+ bf1[56] = -bf0[56] + bf0[63];
+ bf1[57] = -bf0[57] + bf0[62];
+ bf1[58] = -bf0[58] + bf0[61];
+ bf1[59] = -bf0[59] + bf0[60];
+ bf1[60] = bf0[60] + bf0[59];
+ bf1[61] = bf0[61] + bf0[58];
+ bf1[62] = bf0[62] + bf0[57];
+ bf1[63] = bf0[63] + bf0[56];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = -bf0[18] + bf0[17];
+ bf1[19] = -bf0[19] + bf0[16];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[22] + bf0[21];
+ bf1[23] = bf0[23] + bf0[20];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = -bf0[26] + bf0[25];
+ bf1[27] = -bf0[27] + bf0[24];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[30] + bf0[29];
+ bf1[31] = bf0[31] + bf0[28];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
+ bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
+ bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
+ bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = bf0[41];
+ bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
+ bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
+ bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
+ bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
+ bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
+ bf1[54] = bf0[54];
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
+ bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
+ bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
+ bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[35];
+ bf1[33] = bf0[33] + bf0[34];
+ bf1[34] = -bf0[34] + bf0[33];
+ bf1[35] = -bf0[35] + bf0[32];
+ bf1[36] = -bf0[36] + bf0[39];
+ bf1[37] = -bf0[37] + bf0[38];
+ bf1[38] = bf0[38] + bf0[37];
+ bf1[39] = bf0[39] + bf0[36];
+ bf1[40] = bf0[40] + bf0[43];
+ bf1[41] = bf0[41] + bf0[42];
+ bf1[42] = -bf0[42] + bf0[41];
+ bf1[43] = -bf0[43] + bf0[40];
+ bf1[44] = -bf0[44] + bf0[47];
+ bf1[45] = -bf0[45] + bf0[46];
+ bf1[46] = bf0[46] + bf0[45];
+ bf1[47] = bf0[47] + bf0[44];
+ bf1[48] = bf0[48] + bf0[51];
+ bf1[49] = bf0[49] + bf0[50];
+ bf1[50] = -bf0[50] + bf0[49];
+ bf1[51] = -bf0[51] + bf0[48];
+ bf1[52] = -bf0[52] + bf0[55];
+ bf1[53] = -bf0[53] + bf0[54];
+ bf1[54] = bf0[54] + bf0[53];
+ bf1[55] = bf0[55] + bf0[52];
+ bf1[56] = bf0[56] + bf0[59];
+ bf1[57] = bf0[57] + bf0[58];
+ bf1[58] = -bf0[58] + bf0[57];
+ bf1[59] = -bf0[59] + bf0[56];
+ bf1[60] = -bf0[60] + bf0[63];
+ bf1[61] = -bf0[61] + bf0[62];
+ bf1[62] = bf0[62] + bf0[61];
+ bf1[63] = bf0[63] + bf0[60];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = -bf0[17] + bf0[16];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[19] + bf0[18];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = -bf0[21] + bf0[20];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[23] + bf0[22];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = -bf0[25] + bf0[24];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[27] + bf0[26];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = -bf0[29] + bf0[28];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[31] + bf0[30];
+ bf1[32] = bf0[32];
+ bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
+ bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
+ bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
+ bf1[43] = bf0[43];
+ bf1[44] = bf0[44];
+ bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
+ bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
+ bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[52];
+ bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
+ bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
+ bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
+ bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
+ bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
+ bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
+ bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
+ bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
+ bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
+ bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
+ bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
+ bf1[32] = bf0[32] + bf0[33];
+ bf1[33] = -bf0[33] + bf0[32];
+ bf1[34] = -bf0[34] + bf0[35];
+ bf1[35] = bf0[35] + bf0[34];
+ bf1[36] = bf0[36] + bf0[37];
+ bf1[37] = -bf0[37] + bf0[36];
+ bf1[38] = -bf0[38] + bf0[39];
+ bf1[39] = bf0[39] + bf0[38];
+ bf1[40] = bf0[40] + bf0[41];
+ bf1[41] = -bf0[41] + bf0[40];
+ bf1[42] = -bf0[42] + bf0[43];
+ bf1[43] = bf0[43] + bf0[42];
+ bf1[44] = bf0[44] + bf0[45];
+ bf1[45] = -bf0[45] + bf0[44];
+ bf1[46] = -bf0[46] + bf0[47];
+ bf1[47] = bf0[47] + bf0[46];
+ bf1[48] = bf0[48] + bf0[49];
+ bf1[49] = -bf0[49] + bf0[48];
+ bf1[50] = -bf0[50] + bf0[51];
+ bf1[51] = bf0[51] + bf0[50];
+ bf1[52] = bf0[52] + bf0[53];
+ bf1[53] = -bf0[53] + bf0[52];
+ bf1[54] = -bf0[54] + bf0[55];
+ bf1[55] = bf0[55] + bf0[54];
+ bf1[56] = bf0[56] + bf0[57];
+ bf1[57] = -bf0[57] + bf0[56];
+ bf1[58] = -bf0[58] + bf0[59];
+ bf1[59] = bf0[59] + bf0[58];
+ bf1[60] = bf0[60] + bf0[61];
+ bf1[61] = -bf0[61] + bf0[60];
+ bf1[62] = -bf0[62] + bf0[63];
+ bf1[63] = bf0[63] + bf0[62];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 10
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = bf0[26];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
+ bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit);
+ bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
+ bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit);
+ bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
+ bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit);
+ bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
+ bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit);
+ bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
+ bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit);
+ bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
+ bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit);
+ bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
+ bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit);
+ bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
+ bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit);
+ bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
+ bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit);
+ bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
+ bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit);
+ bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
+ bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit);
+ bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
+ bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit);
+ bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
+ bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit);
+ bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
+ bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
+ bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
+ bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 11
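+  // Reorder the butterfly results into natural coefficient order via a
+  // 6-bit bit-reversal permutation: output[k] = step[bitrev6(k)].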
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[32];
+ bf1[2] = bf0[16];
+ bf1[3] = bf0[48];
+ bf1[4] = bf0[8];
+ bf1[5] = bf0[40];
+ bf1[6] = bf0[24];
+ bf1[7] = bf0[56];
+ bf1[8] = bf0[4];
+ bf1[9] = bf0[36];
+ bf1[10] = bf0[20];
+ bf1[11] = bf0[52];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[44];
+ bf1[14] = bf0[28];
+ bf1[15] = bf0[60];
+ bf1[16] = bf0[2];
+ bf1[17] = bf0[34];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[50];
+ bf1[20] = bf0[10];
+ bf1[21] = bf0[42];
+ bf1[22] = bf0[26];
+ bf1[23] = bf0[58];
+ bf1[24] = bf0[6];
+ bf1[25] = bf0[38];
+ bf1[26] = bf0[22];
+ bf1[27] = bf0[54];
+ bf1[28] = bf0[14];
+ bf1[29] = bf0[46];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[62];
+ bf1[32] = bf0[1];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[17];
+ bf1[35] = bf0[49];
+ bf1[36] = bf0[9];
+ bf1[37] = bf0[41];
+ bf1[38] = bf0[25];
+ bf1[39] = bf0[57];
+ bf1[40] = bf0[5];
+ bf1[41] = bf0[37];
+ bf1[42] = bf0[21];
+ bf1[43] = bf0[53];
+ bf1[44] = bf0[13];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[29];
+ bf1[47] = bf0[61];
+ bf1[48] = bf0[3];
+ bf1[49] = bf0[35];
+ bf1[50] = bf0[19];
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[11];
+ bf1[53] = bf0[43];
+ bf1[54] = bf0[27];
+ bf1[55] = bf0[59];
+ bf1[56] = bf0[7];
+ bf1[57] = bf0[39];
+ bf1[58] = bf0[23];
+ bf1[59] = bf0[55];
+ bf1[60] = bf0[15];
+ bf1[61] = bf0[47];
+ bf1[62] = bf0[31];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
new file mode 100644
index 0000000000..9ef54fe4de
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
+#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
+
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
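+// 1-D forward transform kernels. Each transforms one column or row of
+// `input` into `output` at the precision given by cos_bit, with per-stage
+// intermediate range bounds supplied in stage_range.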
+void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
new file mode 100644
index 0000000000..2777cc25bc
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
+#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
+#include "av1/common/enums.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+extern const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL];
+extern const int8_t av1_fwd_cos_bit_col[5][5];
+extern const int8_t av1_fwd_cos_bit_row[5][5];
+#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm2d.c b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c
new file mode 100644
index 0000000000..12a9535a7c
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+
+static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+ switch (txfm_type) {
+ case TXFM_TYPE_DCT4: return av1_fdct4;
+ case TXFM_TYPE_DCT8: return av1_fdct8;
+ case TXFM_TYPE_DCT16: return av1_fdct16;
+ case TXFM_TYPE_DCT32: return av1_fdct32;
+ case TXFM_TYPE_DCT64: return av1_fdct64;
+ case TXFM_TYPE_ADST4: return av1_fadst4;
+ case TXFM_TYPE_ADST8: return av1_fadst8;
+ case TXFM_TYPE_ADST16: return av1_fadst16;
+ case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c;
+ case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c;
+ case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c;
+ case TXFM_TYPE_IDENTITY32: return av1_fidentity32_c;
+ default: assert(0); return NULL;
+ }
+}
+
+void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
+ const TXFM_2D_FLIP_CFG *cfg, int bd) {
+  // cfg->shift holds the per-tx-size round-shift amounts (see
+  // av1_fwd_txfm_shift_ls).
+ const int8_t *shift = cfg->shift;
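+  // stage_range_col/row[] bound the bit width of intermediate values at each
+  // 1-D transform stage; av1_range_check_buf() checks buffers against them.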
+  // The i < MAX_TXFM_STAGE_NUM bound quiets the compiler's array-bounds
+  // warning.
+ for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) {
+ stage_range_col[i] = cfg->stage_range_col[i] + shift[0] + bd + 1;
+ }
+
+  // The i < MAX_TXFM_STAGE_NUM bound quiets the compiler's array-bounds
+  // warning.
+ for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) {
+ stage_range_row[i] = cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1;
+ }
+}
+
+static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
+ const int stride, const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *buf, int bd) {
+ int c, r;
+  // Note: txfm_size_col is the transform width (the length of each row
+  // transform) and txfm_size_row is the transform height (the length of
+  // each column transform); the two differ only for rectangular sizes.
+ const int txfm_size_col = tx_size_wide[cfg->tx_size];
+ const int txfm_size_row = tx_size_high[cfg->tx_size];
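+  // e.g. TX_4X8: txfm_size_col = 4 and txfm_size_row = 8, i.e. an 8-point
+  // column transform followed by a 4-point row transform.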
+  // cfg->shift holds the per-tx-size round-shift amounts (see
+  // av1_fwd_txfm_shift_ls).
+ const int8_t *shift = cfg->shift;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+ int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+ assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
+ assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
+ av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bd);
+
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+ // use output buffer as temp buffer
+ int32_t *temp_in = output;
+ int32_t *temp_out = output + txfm_size_row;
+
+ // Columns
+ for (c = 0; c < txfm_size_col; ++c) {
+ if (cfg->ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * stride + c];
+ } else {
+ for (r = 0; r < txfm_size_row; ++r)
+ // flip upside down
+ temp_in[r] = input[(txfm_size_row - r - 1) * stride + c];
+ }
+ av1_round_shift_array(temp_in, txfm_size_row, -shift[0]);
+ txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+ if (cfg->lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ buf[r * txfm_size_col + c] = temp_out[r];
+ } else {
+ for (r = 0; r < txfm_size_row; ++r)
+ // flip from left to right
+ buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
+ }
+ }
+
+ DECLARE_ALIGNED(16, int32_t, row_buffer[MAX_TX_SIZE]);
+
+ // Rows
+ for (r = 0; r < txfm_size_row; ++r) {
+ txfm_func_row(buf + r * txfm_size_col, row_buffer, cos_bit_row,
+ stage_range_row);
+ av1_round_shift_array(row_buffer, txfm_size_col, -shift[2]);
+ if (abs(rect_type) == 1) {
+ // Multiply everything by Sqrt2 if the transform is rectangular and the
+ // size difference is a factor of 2.
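+      // NewSqrt2 / (1 << NewSqrt2Bits) = 5793 / 4096 ~= sqrt(2).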
+ for (c = 0; c < txfm_size_col; ++c) {
+ row_buffer[c] =
+ round_shift((int64_t)row_buffer[c] * NewSqrt2, NewSqrt2Bits);
+ }
+ }
+ for (c = 0; c < txfm_size_col; ++c) {
+ output[c * txfm_size_row + r] = row_buffer[c];
+ }
+ }
+}
+
+void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[8 * 4];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X4, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[16 * 8];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[32 * 16];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[16 * 4];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X4, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_32x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[32 * 8];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[4 * 4];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X4, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[8 * 8];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[16 * 16];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[32 * 32];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[64 * 64];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+
+ // Zero out top-right 32x32 area.
+ for (int col = 0; col < 32; ++col) {
+ memset(output + col * 64 + 32, 0, 32 * sizeof(*output));
+ }
+ // Zero out the bottom 64x32 area.
+ memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output));
+ // Re-pack non-zero coeffs in the first 32x32 indices.
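+  // The block at col 0 is already in place, so copying starts at col 1.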
+ for (int col = 1; col < 32; ++col) {
+ memcpy(output + col * 32, output + col * 64, 32 * sizeof(*output));
+ }
+}
+
+void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 64]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X64, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out right 32x32 area.
+ for (int col = 0; col < 32; ++col) {
+ memset(output + col * 64 + 32, 0, 32 * sizeof(*output));
+ }
+ // Re-pack non-zero coeffs in the first 32x32 indices.
+ for (int col = 1; col < 32; ++col) {
+ memcpy(output + col * 32, output + col * 64, 32 * sizeof(*output));
+ }
+}
+
+void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[64 * 32];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out the bottom 32x32 area.
+ memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output));
+ // Note: no repacking needed here.
+}
+
+void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X64, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out right 32x16 area.
+ for (int row = 0; row < 16; ++row) {
+ memset(output + row * 64 + 32, 0, 32 * sizeof(*output));
+ }
+ // Re-pack non-zero coeffs in the first 32x16 indices.
+ for (int row = 1; row < 16; ++row) {
+ memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
+ }
+}
+
+void av1_fwd_txfm2d_64x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[64 * 16];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out the bottom 16x32 area.
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+ // Note: no repacking needed here.
+}
+
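+// fwd_shift_WxH[0..2] are the signed shifts applied to the coefficients:
+// [0] before the column transform, [1] between the column and row
+// transforms, [2] after the row transform. Positive values shift up,
+// negative values round-shift down (see fwd_txfm2d_c above).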
+static const int8_t fwd_shift_4x4[3] = { 2, 0, 0 };
+static const int8_t fwd_shift_8x8[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_16x16[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_32x32[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_64x64[3] = { 0, -2, -2 };
+static const int8_t fwd_shift_4x8[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x4[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x16[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x8[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x32[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_32x16[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_32x64[3] = { 0, -2, -2 };
+static const int8_t fwd_shift_64x32[3] = { 2, -4, -2 };
+static const int8_t fwd_shift_4x16[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_16x4[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x32[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_32x8[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x64[3] = { 0, -2, 0 };
+static const int8_t fwd_shift_64x16[3] = { 2, -4, 0 };
+
+const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL] = {
+ fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32,
+ fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16,
+ fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64,
+ fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32,
+ fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16,
+};
+
+const int8_t av1_fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/]
+ [MAX_TXWH_IDX /*txh_idx*/] = {
+ { 13, 13, 13, 0, 0 },
+ { 13, 13, 13, 12, 0 },
+ { 13, 13, 13, 12, 13 },
+ { 0, 13, 13, 12, 13 },
+ { 0, 0, 13, 12, 13 }
+ };
+
+const int8_t av1_fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/]
+ [MAX_TXWH_IDX /*txh_idx*/] = {
+ { 13, 13, 12, 0, 0 },
+ { 13, 13, 13, 12, 0 },
+ { 13, 13, 12, 13, 12 },
+ { 0, 12, 13, 12, 11 },
+ { 0, 0, 12, 11, 10 }
+ };
+
+static const int8_t fdct4_range_mult2[4] = { 0, 2, 3, 3 };
+static const int8_t fdct8_range_mult2[6] = { 0, 2, 4, 5, 5, 5 };
+static const int8_t fdct16_range_mult2[8] = { 0, 2, 4, 6, 7, 7, 7, 7 };
+static const int8_t fdct32_range_mult2[10] = { 0, 2, 4, 6, 8, 9, 9, 9, 9, 9 };
+static const int8_t fdct64_range_mult2[12] = { 0, 2, 4, 6, 8, 10,
+ 11, 11, 11, 11, 11, 11 };
+
+static const int8_t fadst4_range_mult2[7] = { 0, 2, 4, 3, 3, 3, 3 };
+static const int8_t fadst8_range_mult2[8] = { 0, 0, 1, 3, 3, 5, 5, 5 };
+static const int8_t fadst16_range_mult2[10] = { 0, 0, 1, 3, 3, 5, 5, 7, 7, 7 };
+
+static const int8_t fidtx4_range_mult2[1] = { 1 };
+static const int8_t fidtx8_range_mult2[1] = { 2 };
+static const int8_t fidtx16_range_mult2[1] = { 3 };
+static const int8_t fidtx32_range_mult2[1] = { 4 };
+
+static const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = {
+ fdct4_range_mult2, fdct8_range_mult2, fdct16_range_mult2,
+ fdct32_range_mult2, fdct64_range_mult2, fadst4_range_mult2,
+ fadst8_range_mult2, fadst16_range_mult2, fidtx4_range_mult2,
+ fidtx8_range_mult2, fidtx16_range_mult2, fidtx32_range_mult2
+};
+
+static INLINE void set_fwd_txfm_non_scale_range(TXFM_2D_FLIP_CFG *cfg) {
+ av1_zero(cfg->stage_range_col);
+ av1_zero(cfg->stage_range_row);
+
+ const int8_t *const range_mult2_col =
+ fwd_txfm_range_mult2_list[cfg->txfm_type_col];
+ const int stage_num_col = cfg->stage_num_col;
+ // i < MAX_TXFM_STAGE_NUM will quiet -Wstringop-overflow.
+ for (int i = 0; i < stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i)
+ cfg->stage_range_col[i] = (range_mult2_col[i] + 1) >> 1;
+
+ const int8_t *const range_mult2_row =
+ fwd_txfm_range_mult2_list[cfg->txfm_type_row];
+ const int stage_num_row = cfg->stage_num_row;
+ // i < MAX_TXFM_STAGE_NUM will quiet -Wstringop-overflow.
+ for (int i = 0; i < stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) {
+ cfg->stage_range_row[i] =
+ (range_mult2_col[stage_num_col - 1] + range_mult2_row[i] + 1) >> 1;
+ }
+}
+
+void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+ TXFM_2D_FLIP_CFG *cfg) {
+ assert(cfg != NULL);
+ cfg->tx_size = tx_size;
+ set_flip_cfg(tx_type, cfg);
+ const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
+ const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ cfg->shift = av1_fwd_txfm_shift_ls[tx_size];
+ cfg->cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ cfg->cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col];
+ assert(cfg->txfm_type_col != TXFM_TYPE_INVALID);
+ cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row];
+ assert(cfg->txfm_type_row != TXFM_TYPE_INVALID);
+ cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
+ cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];
+ set_fwd_txfm_non_scale_range(cfg);
+}
diff --git a/third_party/aom/av1/encoder/av1_ml_partition_models.h b/third_party/aom/av1/encoder/av1_ml_partition_models.h
new file mode 100644
index 0000000000..2572b138d5
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_ml_partition_models.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_
+#define AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// TODO(kyslov): Replace with proper weights after training AV1 models
+
+#define FEATURES 6
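+// Each model below is a two-layer perceptron: FEATURES inputs, one hidden
+// layer of 8 nodes, and a single output. The *_means/*_vars arrays hold
+// per-feature statistics used to normalize the inputs before inference.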
+static const float av1_var_part_nn_weights_64_layer0[FEATURES * 8] = {
+ 0.35755366f, 0.86281112f, -0.20871686f, 0.0409634f, 0.97305766f,
+ 0.75510254f, 0.04860447f, 0.77095283f, -0.44105278f, -0.3755049f,
+ -0.08456618f, 1.1821136f, -0.73956301f, 1.30016453f, 0.45566902f,
+ 0.4742967f, 0.44213975f, 0.4876028f, 0.26720522f, -0.34429858f,
+ -0.25148252f, -0.49623932f, -0.46747941f, -0.36656624f, 0.10213375f,
+ 0.60262819f, -0.54788715f, -0.27272022f, 1.0995462f, -0.36338376f,
+ -0.64836313f, 0.16057039f, 1.02782791f, 0.9985311f, 0.90607883f,
+ 0.80570411f, -0.07750863f, -0.74006402f, 1.72839526f, 1.72355343f,
+ 1.69288916f, 1.59102043f, 0.14140216f, -1.47262839f, 0.4262519f,
+ -0.33805936f, -0.02449707f, 0.67203692f
+};
+
+static const float av1_var_part_nn_bias_64_layer0[8] = {
+ 0.39995694f, 0.65593756f, 1.12876737f, 1.28790576f,
+ 0.53468556f, 0.3177908f, -0.74388266f, -1.81131248f
+};
+
+static const float av1_var_part_nn_weights_64_layer1[8] = {
+ -1.31174053f, 0.69696917f, 0.78721456f, 0.45326379f,
+ 0.79258322f, 1.74626188f, -5.41831f, 3.33887435f
+};
+
+static const float av1_var_part_nn_bias_64_layer1[1] = { -0.90951047f };
+
+static const float av1_var_part_means_64[FEATURES] = {
+ 5.36750249f, 11.58023127f, 0.25550964f, 0.23809917f, 0.24650665f, 0.22117687f
+};
+static const float av1_var_part_vars_64[FEATURES] = {
+ 0.89599769f, 2.2686018f, 0.02568608f, 0.02523411f, 0.02443085f, 0.01922085f
+};
+
+static const NN_CONFIG av1_var_part_nnconfig_64 = {
+ FEATURES, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ {
+ av1_var_part_nn_weights_64_layer0,
+ av1_var_part_nn_weights_64_layer1,
+ },
+ {
+ av1_var_part_nn_bias_64_layer0,
+ av1_var_part_nn_bias_64_layer1,
+ },
+};
+
+static const float av1_var_part_nn_weights_32_layer0[FEATURES * 8] = {
+ 0.97886049f, -1.66262011f, 0.94902798f, 0.7080922f, 0.91181186f,
+ 0.35222601f, -0.04428585f, 0.42086472f, -0.0206325f, -0.77937809f,
+ -0.70947522f, -1.24463119f, 0.23739497f, -1.34327359f, 0.01024804f,
+ 0.4544633f, -0.96907661f, 0.67279522f, 0.23180693f, 1.54063368f,
+ -0.15700707f, 0.18597331f, 0.34167589f, 0.40736558f, 0.69213366f,
+ -1.33584593f, 1.21190814f, 1.26725267f, 1.21284802f, 1.26611399f,
+ 0.17546514f, -0.30248399f, -1.32589316f, -1.37432674f, -1.37423023f,
+ -1.26890855f, 0.12166347f, -0.94565678f, -1.47475267f, -0.69279948f,
+ -0.10166587f, -0.23489881f, 0.57123565f, 0.80051137f, -1.28411946f,
+ -1.36576732f, -1.30257508f, -1.30575106f
+};
+
+static const float av1_var_part_nn_bias_32_layer0[8] = {
+ -1.6301435f, 0.61879037f, -1.68612662f, 1.66960165f,
+ -0.0838243f, 0.32253287f, -0.65755282f, 0.96661531f
+};
+
+static const float av1_var_part_nn_weights_32_layer1[8] = {
+ 1.99257161f, 0.7331492f, 1.33539961f, 1.13501456f,
+ -2.21154528f, 1.85858542f, -0.85565298f, -1.96410246f
+};
+
+static const float av1_var_part_nn_bias_32_layer1[1] = { -0.14880827f };
+
+static const float av1_var_part_means_32[FEATURES] = {
+ 5.36360686f, 9.88421868f, 0.23543671f, 0.23621205f, 0.23409667f, 0.22855539f
+};
+
+static const float av1_var_part_vars_32[FEATURES] = {
+ 0.89077225f, 2.32312894f, 0.02167654f, 0.02392842f, 0.02466495f, 0.02047641f
+};
+
+static const NN_CONFIG av1_var_part_nnconfig_32 = {
+ FEATURES, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ {
+ av1_var_part_nn_weights_32_layer0,
+ av1_var_part_nn_weights_32_layer1,
+ },
+ {
+ av1_var_part_nn_bias_32_layer0,
+ av1_var_part_nn_bias_32_layer1,
+ },
+};
+
+static const float av1_var_part_nn_weights_16_layer0[FEATURES * 8] = {
+ 0.45118305f, -0.22068295f, 0.4604435f, -0.1446326f, -0.15765035f,
+ 0.42260198f, -0.0945916f, 0.49544996f, 0.62781567f, -0.41564372f,
+ -0.39103292f, 0.44407624f, 0.48382613f, -0.85424238f, -0.00961433f,
+ 0.25383582f, 0.14403897f, 0.00901859f, -0.83201967f, -0.19323284f,
+ 0.59271213f, 0.69487457f, 0.6897112f, 0.62768521f, 0.9204492f,
+ -1.42448347f, -0.16491054f, -0.10114424f, -0.1069687f, -0.11289049f,
+ 0.26290832f, -0.41850393f, 0.17239733f, 0.41770622f, 0.43725942f,
+ 0.19362467f, -0.35955731f, -0.899446f, 0.49726389f, 0.66569571f,
+ 0.65893982f, 0.53199654f, -0.1158694f, -0.26472603f, 0.4155923f,
+ 0.15059544f, 0.09596755f, 0.26247133f
+};
+
+static const float av1_var_part_nn_bias_16_layer0[8] = {
+ 1.64486321f, -0.11851574f, 1.29322833f, -0.61193136f,
+ 0.33027532f, 1.04197232f, -0.80716674f, 0.88681233f
+};
+
+static const float av1_var_part_nn_weights_16_layer1[8] = {
+ -1.02832118f, 0.72800106f, -0.42904783f, 1.44490586f,
+ -1.03888227f, -0.9023916f, -1.51543102f, -0.43059521f
+};
+
+static const float av1_var_part_nn_bias_16_layer1[1] = { -0.85087946f };
+
+static const float av1_var_part_means_16[FEATURES] = {
+ 5.32551326f, 8.218448f, 0.21954822f, 0.22808377f, 0.23019798f, 0.22320699f
+};
+
+static const float av1_var_part_vars_16[FEATURES] = { 0.86806032f, 2.39938956f,
+ 0.01958579f, 0.02437927f,
+ 0.02420755f, 0.0192003f };
+
+static const NN_CONFIG av1_var_part_nnconfig_16 = {
+ FEATURES, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ {
+ av1_var_part_nn_weights_16_layer0,
+ av1_var_part_nn_weights_16_layer1,
+ },
+ {
+ av1_var_part_nn_bias_16_layer0,
+ av1_var_part_nn_bias_16_layer1,
+ },
+};
+
+#undef FEATURES
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_ML_PARTITION_MODELS_H_
diff --git a/third_party/aom/av1/encoder/av1_noise_estimate.c b/third_party/aom/av1/encoder/av1_noise_estimate.c
new file mode 100644
index 0000000000..25007bb6d4
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_noise_estimate.c
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_noise_estimate.h"
+#include "av1/encoder/encoder.h"
+#if CONFIG_AV1_TEMPORAL_DENOISING
+#include "av1/encoder/av1_temporal_denoiser.h"
+#endif
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+// For SVC: only do noise estimation on top spatial layer.
+static INLINE int noise_est_svc(const struct AV1_COMP *const cpi) {
+ return (!cpi->ppi->use_svc ||
+ (cpi->ppi->use_svc &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1));
+}
+#endif
+
+void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
+ const int64_t area = (int64_t)width * height;
+ ne->enabled = 0;
+ ne->level = (area < 1280 * 720) ? kLowLow : kLow;
+ ne->value = 0;
+ ne->count = 0;
+ ne->thresh = 90;
+ ne->last_w = 0;
+ ne->last_h = 0;
+ if (area >= 1920 * 1080) {
+ ne->thresh = 200;
+ } else if (area >= 1280 * 720) {
+ ne->thresh = 140;
+ } else if (area >= 640 * 360) {
+ ne->thresh = 115;
+ }
+ ne->num_frames_estimate = 15;
+ ne->adapt_thresh = (3 * ne->thresh) >> 1;
+}
+
+static int enable_noise_estimation(AV1_COMP *const cpi) {
+ const int resize_pending = is_frame_resize_pending(cpi);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (cpi->common.seq_params->use_highbitdepth) return 0;
+#endif
+// Enable noise estimation if denoising is on.
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
+ cpi->common.width >= 320 && cpi->common.height >= 180)
+ return 1;
+#endif
+  // Only allow noise estimation under certain encoding modes:
+  // 1-pass CBR with cyclic-refresh AQ at speed >= 5, no resize pending,
+  // resolution at least 640x360, and neither SVC nor screen content.
+ if (cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+ cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 &&
+ resize_pending == 0 && !cpi->ppi->use_svc &&
+ cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
+ cpi->common.width * cpi->common.height >= 640 * 360)
+ return 1;
+ else
+ return 0;
+}
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+static void copy_frame(YV12_BUFFER_CONFIG *const dest,
+ const YV12_BUFFER_CONFIG *const src) {
+ const uint8_t *srcbuf = src->y_buffer;
+ uint8_t *destbuf = dest->y_buffer;
+
+ assert(dest->y_width == src->y_width);
+ assert(dest->y_height == src->y_height);
+
+ for (int r = 0; r < dest->y_height; ++r) {
+ memcpy(destbuf, srcbuf, dest->y_width);
+ destbuf += dest->y_stride;
+ srcbuf += src->y_stride;
+ }
+}
+#endif // CONFIG_AV1_TEMPORAL_DENOISING
+
+NOISE_LEVEL av1_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) {
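+  // Map the running noise value onto a level using thresh:
+  //   value > 2 * thresh -> kHigh, value > thresh -> kMedium,
+  //   value > thresh / 2 -> kLow, otherwise kLowLow.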
+ int noise_level = kLowLow;
+ if (ne->value > (ne->thresh << 1)) {
+ noise_level = kHigh;
+ } else {
+ if (ne->value > ne->thresh)
+ noise_level = kMedium;
+ else if (ne->value > (ne->thresh >> 1))
+ noise_level = kLow;
+ else
+ noise_level = kLowLow;
+ }
+ return noise_level;
+}
+
+void av1_update_noise_estimate(AV1_COMP *const cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ NOISE_ESTIMATE *const ne = &cpi->noise_estimate;
+ const int low_res = (cm->width <= 352 && cm->height <= 288);
+ // Estimate of noise level every frame_period frames.
+ int frame_period = 8;
+ int thresh_consec_zeromv = 2;
+ int frame_counter = cm->current_frame.frame_number;
+ // Estimate is between current source and last source.
+ YV12_BUFFER_CONFIG *last_source = cpi->last_source;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) {
+ last_source = &cpi->denoiser.last_source;
+ // Tune these thresholds for different resolutions when denoising is
+ // enabled.
+ if (cm->width > 640 && cm->width <= 1920) {
+ thresh_consec_zeromv = 2;
+ }
+ }
+#endif
+ ne->enabled = enable_noise_estimation(cpi);
+ if (cpi->svc.number_spatial_layers > 1)
+ frame_counter = cpi->svc.current_superframe;
+ if (!ne->enabled || frame_counter % frame_period != 0 ||
+ last_source == NULL ||
+ (cpi->svc.number_spatial_layers == 1 &&
+ (ne->last_w != cm->width || ne->last_h != cm->height))) {
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+ copy_frame(&cpi->denoiser.last_source, cpi->source);
+#endif
+ if (last_source != NULL) {
+ ne->last_w = cm->width;
+ ne->last_h = cm->height;
+ }
+ return;
+ } else if (frame_counter > 60 && cpi->svc.num_encoded_top_layer > 1 &&
+ cpi->rc.frames_since_key > cpi->svc.number_spatial_layers &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
+ cpi->rc.avg_frame_low_motion < (low_res ? 60 : 40)) {
+ // Force noise estimation to 0 and denoiser off if content has high motion.
+ ne->level = kLowLow;
+ ne->count = 0;
+ ne->num_frames_estimate = 10;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
+ cpi->svc.current_superframe > 1) {
+ av1_denoiser_set_noise_level(cpi, ne->level);
+ copy_frame(&cpi->denoiser.last_source, cpi->source);
+ }
+#endif
+ return;
+ } else {
+ unsigned int bin_size = 100;
+ unsigned int hist[MAX_VAR_HIST_BINS] = { 0 };
+ unsigned int hist_avg[MAX_VAR_HIST_BINS];
+ unsigned int max_bin = 0;
+ unsigned int max_bin_count = 0;
+ unsigned int bin_cnt;
+ BLOCK_SIZE bsize = BLOCK_16X16;
+    // Loop over a sub-sample of the frame's 16x16 blocks and, for blocks
+    // that have been encoded with zero/small motion for at least
+    // thresh_consec_zeromv consecutive frames, compute the variance between
+    // the current and last source to update the noise estimate.
+ const uint8_t *src_y = cpi->source->y_buffer;
+ const int src_ystride = cpi->source->y_stride;
+ const uint8_t *last_src_y = last_source->y_buffer;
+ const int last_src_ystride = last_source->y_stride;
+ int mi_row, mi_col;
+ int num_low_motion = 0;
+ int frame_low_motion = 1;
+ for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row += 2) {
+ for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col += 2) {
+ int bl_index =
+ (mi_row >> 1) * (mi_params->mi_cols >> 1) + (mi_col >> 1);
+ if (cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv)
+ num_low_motion++;
+ }
+ }
+ if (num_low_motion <
+ (((3 * (mi_params->mi_rows * mi_params->mi_cols) >> 2)) >> 3))
+ frame_low_motion = 0;
+ for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row++) {
+ for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col++) {
+ // 16x16 blocks, 1/4 sample of frame.
+ if (mi_row % 8 == 0 && mi_col % 8 == 0 &&
+ mi_row < mi_params->mi_rows - 3 &&
+ mi_col < mi_params->mi_cols - 3) {
+ int bl_index =
+ (mi_row >> 1) * (mi_params->mi_cols >> 1) + (mi_col >> 1);
+ int bl_index1 = bl_index + 1;
+ int bl_index2 = bl_index + (mi_params->mi_cols >> 1);
+ int bl_index3 = bl_index2 + 1;
+ int consec_zeromv =
+ AOMMIN(cpi->consec_zero_mv[bl_index],
+ AOMMIN(cpi->consec_zero_mv[bl_index1],
+ AOMMIN(cpi->consec_zero_mv[bl_index2],
+ cpi->consec_zero_mv[bl_index3])));
+          // Only consider blocks that are likely steady background, i.e.,
+          // that have been encoded with zero/low motion for more than
+          // thresh_consec_zeromv frames in a row. consec_zero_mv[] is
+          // defined for 8x8 blocks, so check all 4 sub-blocks of the 16x16
+          // block. Also exclude the frame if high_source_sad is set (i.e., a
+          // scene/content change).
+ if (frame_low_motion && consec_zeromv > thresh_consec_zeromv &&
+ !cpi->rc.high_source_sad) {
+ unsigned int sse;
+ // Compute variance between co-located blocks from current and
+ // last input frames.
+ unsigned int variance = cpi->ppi->fn_ptr[bsize].vf(
+ src_y, src_ystride, last_src_y, last_src_ystride, &sse);
+ unsigned int hist_index = variance / bin_size;
+ if (hist_index < MAX_VAR_HIST_BINS)
+ hist[hist_index]++;
+ else if (hist_index < 3 * (MAX_VAR_HIST_BINS >> 1))
+ hist[MAX_VAR_HIST_BINS - 1]++; // Account for the tail
+ }
+ }
+ src_y += 4;
+ last_src_y += 4;
+ }
+ src_y += (src_ystride << 2) - (mi_params->mi_cols << 2);
+ last_src_y += (last_src_ystride << 2) - (mi_params->mi_cols << 2);
+ }
+ ne->last_w = cm->width;
+ ne->last_h = cm->height;
+ // Adjust histogram to account for effect that histogram flattens
+ // and shifts to zero as scene darkens.
+ if (hist[0] > 10 && (hist[MAX_VAR_HIST_BINS - 1] > hist[0] >> 2)) {
+ hist[0] = 0;
+ hist[1] >>= 2;
+ hist[2] >>= 2;
+ hist[3] >>= 2;
+ hist[4] >>= 1;
+ hist[5] >>= 1;
+ hist[6] = 3 * hist[6] >> 1;
+ hist[MAX_VAR_HIST_BINS - 1] >>= 1;
+ }
+
+ // Average hist[] and find largest bin
+ for (bin_cnt = 0; bin_cnt < MAX_VAR_HIST_BINS; bin_cnt++) {
+ if (bin_cnt == 0)
+ hist_avg[bin_cnt] = (hist[0] + hist[1] + hist[2]) / 3;
+ else if (bin_cnt == MAX_VAR_HIST_BINS - 1)
+ hist_avg[bin_cnt] = hist[MAX_VAR_HIST_BINS - 1] >> 2;
+ else if (bin_cnt == MAX_VAR_HIST_BINS - 2)
+ hist_avg[bin_cnt] = (hist[bin_cnt - 1] + 2 * hist[bin_cnt] +
+ (hist[bin_cnt + 1] >> 1) + 2) >>
+ 2;
+ else
+ hist_avg[bin_cnt] =
+ (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + hist[bin_cnt + 1] + 2) >>
+ 2;
+
+ if (hist_avg[bin_cnt] > max_bin_count) {
+ max_bin_count = hist_avg[bin_cnt];
+ max_bin = bin_cnt;
+ }
+ }
+    // Scale the peak bin by 40 to match the existing thresholds, and blend
+    // it into a 3:1 running average.
+ ne->value = (int)((3 * ne->value + max_bin * 40) >> 2);
+ // Quickly increase VNR strength when the noise level increases suddenly.
+ if (ne->level < kMedium && ne->value > ne->adapt_thresh) {
+ ne->count = ne->num_frames_estimate;
+ } else {
+ ne->count++;
+ }
+ if (ne->count == ne->num_frames_estimate) {
+ // Reset counter and check noise level condition.
+ ne->num_frames_estimate = 30;
+ ne->count = 0;
+ ne->level = av1_noise_estimate_extract_level(ne);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+ av1_denoiser_set_noise_level(cpi, ne->level);
+#endif
+ }
+ }
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+ copy_frame(&cpi->denoiser.last_source, cpi->source);
+#endif
+}
diff --git a/third_party/aom/av1/encoder/av1_noise_estimate.h b/third_party/aom/av1/encoder/av1_noise_estimate.h
new file mode 100644
index 0000000000..85530666f6
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_noise_estimate.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_
+#define AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_
+
+#include "av1/encoder/block.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_VAR_HIST_BINS 20
+
+typedef enum noise_level { kLowLow, kLow, kMedium, kHigh } NOISE_LEVEL;
+
+typedef struct noise_estimate {
+ int enabled;
+ NOISE_LEVEL level;
+ int value;
+ int thresh;
+ int adapt_thresh;
+ int count;
+ int last_w;
+ int last_h;
+ int num_frames_estimate;
+} NOISE_ESTIMATE;
+
+struct AV1_COMP;
+
+void av1_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height);
+
+NOISE_LEVEL av1_noise_estimate_extract_level(NOISE_ESTIMATE *const ne);
+
+void av1_update_noise_estimate(struct AV1_COMP *const cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_NOISE_ESTIMATE_H_
diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c
new file mode 100644
index 0000000000..110d17f434
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_quantize.c
@@ -0,0 +1,917 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/idct.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ *eob_ptr = 0;
+}
+
+int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2],
+ const int16_t dequant_ptr[2],
+ const int16_t round_ptr[2], int log_scale,
+ const int16_t *scan, int coeff_count,
+ const tran_low_t *coeff_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr) {
+ memset(qcoeff_ptr, 0, coeff_count * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, coeff_count * sizeof(*dqcoeff_ptr));
+ const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
+ int eob = 0;
+ for (int i = 0; i < coeff_count; i++) {
+ const int rc = scan[i];
+ const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]);
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp32 = 0;
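+    // Dead-zone test: quantize only when (abs_coeff << (1 + log_scale)) >=
+    // dequant, i.e. the coefficient is at least half a (scaled) step.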
+ if ((abs_coeff << (1 + log_scale)) >= thresh) {
+ abs_coeff = clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX);
+ tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
+ if (tmp32) {
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ const tran_low_t abs_dqcoeff =
+ (tmp32 * dequant_ptr[rc != 0]) >> log_scale;
+ dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
+ }
+ }
+ if (tmp32) eob = i + 1;
+ }
+ return eob;
+}
+
+static void quantize_fp_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, int log_scale) {
+ int i, eob = -1;
+ const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
+  // TODO(jingning): Decide whether these arguments are still needed once the
+  // quantization process is finalized.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (qm_ptr == NULL && iqm_ptr == NULL) {
+ *eob_ptr = av1_quantize_fp_no_qmatrix(quant_ptr, dequant_ptr, round_ptr,
+ log_scale, scan, (int)n_coeffs,
+ coeff_ptr, qcoeff_ptr, dqcoeff_ptr);
+ } else {
+    // Quantize each coefficient in scan order; eob tracks the index of the
+    // last nonzero quantized coefficient.
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const qm_val_t wt = qm_ptr ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const int coeff_sign = AOMSIGN(coeff);
+ int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp32 = 0;
+ if (abs_coeff * wt >=
+ (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
+ abs_coeff += rounding[rc != 0];
+ abs_coeff = clamp64(abs_coeff, INT16_MIN, INT16_MAX);
+ tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
+ }
+
+ if (tmp32) eob = i;
+ }
+ *eob_ptr = eob + 1;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void highbd_quantize_fp_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, int log_scale) {
+ int i;
+ int eob = -1;
+ const int shift = 16 - log_scale;
+  // TODO(jingning): Decide whether these arguments are still needed once the
+  // quantization process is finalized.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)iscan;
+
+ if (qm_ptr || iqm_ptr) {
+    // Quantize each coefficient in scan order; eob tracks the index of the
+    // last nonzero quantized coefficient.
+ for (i = 0; i < count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int abs_qcoeff = 0;
+ if (abs_coeff * wt >=
+ (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
+ const int64_t tmp =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+ abs_qcoeff =
+ (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (abs_qcoeff) eob = i;
+ } else {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+ }
+ } else {
+ const int log_scaled_round_arr[2] = {
+ ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(round_ptr[1], log_scale),
+ };
+ for (i = 0; i < count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int rc01 = (rc != 0);
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int log_scaled_round = log_scaled_round_arr[rc01];
+ if ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) {
+ const int quant = quant_ptr[rc01];
+ const int dequant = dequant_ptr[rc01];
+ const int64_t tmp = (int64_t)abs_coeff + log_scaled_round;
+ const int abs_qcoeff = (int)((tmp * quant) >> shift);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ if (abs_qcoeff) eob = i;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ } else {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 0);
+}
+
+void av1_quantize_lp_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)iscan;
+ int eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  // Quantize each coefficient in scan order; eob tracks the index of the
+  // last nonzero quantized coefficient.
+ for (int i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+ if (tmp) eob = i;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 1);
+}
+
+void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 2);
+}
+
+void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
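+  // qparam->log_scale is 0, 1 or 2; the 32- and 64-point transforms use
+  // extra down-scaling, selected below.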
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 1:
+ av1_quantize_fp_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 2:
+ av1_quantize_fp_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ default: assert(0);
+ }
+ }
+}
+
+void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#if !CONFIG_REALTIME_ONLY
+ if (qparam->use_quant_b_adapt) {
+ // TODO(sarahparker) These quantize_b optimizations need SIMD
+ // implementations
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ aom_quantize_b_adaptive_helper_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ aom_quantize_b_adaptive(coeff_ptr, n_coeffs, p->zbin_QTX,
+ p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
+ p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 1:
+ aom_quantize_b_32x32_adaptive(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 2:
+ aom_quantize_b_64x64_adaptive(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ default: assert(0);
+ }
+ }
+ return;
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ aom_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 1:
+ aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 2:
+ aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ default: assert(0);
+ }
+ }
+}
+
+static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+ int skip_block, const int16_t *round_ptr,
+ const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+ uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+ const int rc = 0;
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int64_t tmp;
+ int eob = -1;
+ int32_t tmp32;
+ int dequant;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
+ INT16_MIN, INT16_MAX);
+ tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (tmp32) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+ (void)sc;
+  assert(qparam->log_scale >= 0 && qparam->log_scale < 3);
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+ quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX,
+ p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX[0],
+ eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ highbd_quantize_fp_helper_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ av1_highbd_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qparam->log_scale);
+ }
+}
+
+void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+#if !CONFIG_REALTIME_ONLY
+ if (qparam->use_quant_b_adapt) {
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ aom_highbd_quantize_b_adaptive_helper_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ aom_highbd_quantize_b_adaptive(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 1:
+ aom_highbd_quantize_b_32x32_adaptive(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 2:
+ aom_highbd_quantize_b_64x64_adaptive(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ default: assert(0);
+ }
+ }
+ return;
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ aom_highbd_quantize_b_helper_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 1:
+ aom_highbd_quantize_b_32x32(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 2:
+ aom_highbd_quantize_b_64x64(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ default: assert(0);
+ }
+ }
+}
+
+static INLINE void highbd_quantize_dc(
+ const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+ const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) {
+ int eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[0] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[0] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[0];
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+ const int64_t tmpw = tmp * wt;
+ const int abs_qcoeff =
+ (int)((tmpw * quant) >> (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const int dequant =
+ (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[0] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (abs_qcoeff) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+ (void)sc;
+
+ highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX,
+ p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr,
+ p->dequant_QTX[0], eob_ptr, qm_ptr, iqm_ptr,
+ qparam->log_scale);
+}
+
+void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan,
+ int log_scale) {
+ highbd_quantize_fp_helper_c(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan, iscan, NULL, NULL,
+ log_scale);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
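+// Compute a 16-bit fixed-point reciprocal of the quantizer step d so the
+// quantizers can divide by d with multiplies and shifts:
+//   l = msb(d), m = 1 + 2^(16+l)/d, quant = m - 2^16, shift = 2^(16-l).
+// A kernel such as aom_quantize_b() then evaluates
+//   (((x * quant) >> 16) + x) ~= x * 2^l / d
+// and multiplies by shift (with a further >> 16) to divide by 2^l, giving
+// approximately x / d. Illustrative example for d = 4: l = 2, m = 65537,
+// quant = 1, shift = 1 << 14.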
+static void invert_quant(int16_t *quant, int16_t *shift, int d) {
+ uint32_t t;
+ int l, m;
+ t = d;
+ l = get_msb(t);
+ m = 1 + (1 << (16 + l)) / d;
+ *quant = (int16_t)(m - (1 << 16));
+ *shift = 1 << (16 - l);
+}
+
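+// The zero-bin factor is expressed in 1/128 units of the quantizer step:
+// av1_build_quantizer() below derives zbin = ROUND_POWER_OF_TWO(factor *
+// quant, 7), i.e. roughly 0.66 * quant (84/128) for small steps, 0.625 *
+// quant (80/128) for larger ones, and 0.5 * quant (64/128) at q == 0.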
+static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) {
+ const int quant = av1_dc_quant_QTX(q, 0, bit_depth);
+ switch (bit_depth) {
+ case AOM_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+ case AOM_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80);
+ case AOM_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80);
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+}
+
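+// Build the per-qindex quantizer tables. For each qindex, entries [0] and [1]
+// hold the DC and AC parameters of each plane; the _fp variants use a plain
+// 2^16 / quant multiplier with a fixed rounding of quant / 2. The AC entry is
+// then replicated into slots [2..7] so SIMD kernels can load full 8-lane
+// vectors.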
+void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,
+ int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q,
+ int v_ac_delta_q, QUANTS *const quants,
+ Dequants *const deq) {
+ int i, q, quant_QTX;
+
+ for (q = 0; q < QINDEX_RANGE; q++) {
+ const int qzbin_factor = get_qzbin_factor(q, bit_depth);
+ const int qrounding_factor = q == 0 ? 64 : 48;
+
+ for (i = 0; i < 2; ++i) {
+ const int qrounding_factor_fp = 64;
+ // y quantizer with TX scale
+ quant_QTX = i == 0 ? av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth)
+ : av1_ac_quant_QTX(q, 0, bit_depth);
+ invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i],
+ quant_QTX);
+ quants->y_quant_fp[q][i] = (1 << 16) / quant_QTX;
+ quants->y_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+ quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+ quants->y_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+ deq->y_dequant_QTX[q][i] = quant_QTX;
+
+ // u quantizer with TX scale
+ quant_QTX = i == 0 ? av1_dc_quant_QTX(q, u_dc_delta_q, bit_depth)
+ : av1_ac_quant_QTX(q, u_ac_delta_q, bit_depth);
+ invert_quant(&quants->u_quant[q][i], &quants->u_quant_shift[q][i],
+ quant_QTX);
+ quants->u_quant_fp[q][i] = (1 << 16) / quant_QTX;
+ quants->u_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+ quants->u_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+ quants->u_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+ deq->u_dequant_QTX[q][i] = quant_QTX;
+
+ // v quantizer with TX scale
+ quant_QTX = i == 0 ? av1_dc_quant_QTX(q, v_dc_delta_q, bit_depth)
+ : av1_ac_quant_QTX(q, v_ac_delta_q, bit_depth);
+ invert_quant(&quants->v_quant[q][i], &quants->v_quant_shift[q][i],
+ quant_QTX);
+ quants->v_quant_fp[q][i] = (1 << 16) / quant_QTX;
+ quants->v_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+ quants->v_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+ quants->v_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+ deq->v_dequant_QTX[q][i] = quant_QTX;
+ }
+
+ for (i = 2; i < 8; i++) { // 8: SIMD width
+ quants->y_quant[q][i] = quants->y_quant[q][1];
+ quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
+ quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
+ quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
+ quants->y_zbin[q][i] = quants->y_zbin[q][1];
+ quants->y_round[q][i] = quants->y_round[q][1];
+ deq->y_dequant_QTX[q][i] = deq->y_dequant_QTX[q][1];
+
+ quants->u_quant[q][i] = quants->u_quant[q][1];
+ quants->u_quant_fp[q][i] = quants->u_quant_fp[q][1];
+ quants->u_round_fp[q][i] = quants->u_round_fp[q][1];
+ quants->u_quant_shift[q][i] = quants->u_quant_shift[q][1];
+ quants->u_zbin[q][i] = quants->u_zbin[q][1];
+ quants->u_round[q][i] = quants->u_round[q][1];
+ deq->u_dequant_QTX[q][i] = deq->u_dequant_QTX[q][1];
+
+ quants->v_quant[q][i] = quants->v_quant[q][1];
+ quants->v_quant_fp[q][i] = quants->v_quant_fp[q][1];
+ quants->v_round_fp[q][i] = quants->v_round_fp[q][1];
+ quants->v_quant_shift[q][i] = quants->v_quant_shift[q][1];
+ quants->v_zbin[q][i] = quants->v_zbin[q][1];
+ quants->v_round[q][i] = quants->v_round[q][1];
+ deq->v_dequant_QTX[q][i] = deq->v_dequant_QTX[q][1];
+ }
+ }
+}
+
+static INLINE bool deltaq_params_have_changed(
+ const DeltaQuantParams *prev_deltaq_params,
+ const CommonQuantParams *quant_params) {
+ return (prev_deltaq_params->y_dc_delta_q != quant_params->y_dc_delta_q ||
+ prev_deltaq_params->u_dc_delta_q != quant_params->u_dc_delta_q ||
+ prev_deltaq_params->v_dc_delta_q != quant_params->v_dc_delta_q ||
+ prev_deltaq_params->u_ac_delta_q != quant_params->u_ac_delta_q ||
+ prev_deltaq_params->v_ac_delta_q != quant_params->v_ac_delta_q);
+}
+
+void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params,
+ const CommonQuantParams *quant_params,
+ aom_bit_depth_t bit_depth) {
+ DeltaQuantParams *const prev_deltaq_params =
+ &enc_quant_dequant_params->prev_deltaq_params;
+
+ // Re-initialize the quantizer only if any of the dc/ac deltaq parameters
+ // change.
+ if (!deltaq_params_have_changed(prev_deltaq_params, quant_params)) return;
+ QUANTS *const quants = &enc_quant_dequant_params->quants;
+ Dequants *const dequants = &enc_quant_dequant_params->dequants;
+ av1_build_quantizer(bit_depth, quant_params->y_dc_delta_q,
+ quant_params->u_dc_delta_q, quant_params->u_ac_delta_q,
+ quant_params->v_dc_delta_q, quant_params->v_ac_delta_q,
+ quants, dequants);
+
+ // Record the state of deltaq parameters.
+ prev_deltaq_params->y_dc_delta_q = quant_params->y_dc_delta_q;
+ prev_deltaq_params->u_dc_delta_q = quant_params->u_dc_delta_q;
+ prev_deltaq_params->v_dc_delta_q = quant_params->v_dc_delta_q;
+ prev_deltaq_params->u_ac_delta_q = quant_params->u_ac_delta_q;
+ prev_deltaq_params->v_ac_delta_q = quant_params->v_ac_delta_q;
+}
+
+void av1_set_q_index(const EncQuantDequantParams *enc_quant_dequant_params,
+ int qindex, MACROBLOCK *x) {
+ const QUANTS *const quants = &enc_quant_dequant_params->quants;
+ const Dequants *const dequants = &enc_quant_dequant_params->dequants;
+ x->qindex = qindex;
+ x->seg_skip_block =
+ 0; // TODO(angiebird): Find a proper place to init this variable.
+
+ // Y
+ x->plane[0].quant_QTX = quants->y_quant[qindex];
+ x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex];
+ x->plane[0].round_fp_QTX = quants->y_round_fp[qindex];
+ x->plane[0].quant_shift_QTX = quants->y_quant_shift[qindex];
+ x->plane[0].zbin_QTX = quants->y_zbin[qindex];
+ x->plane[0].round_QTX = quants->y_round[qindex];
+ x->plane[0].dequant_QTX = dequants->y_dequant_QTX[qindex];
+
+ // U
+ x->plane[1].quant_QTX = quants->u_quant[qindex];
+ x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex];
+ x->plane[1].round_fp_QTX = quants->u_round_fp[qindex];
+ x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex];
+ x->plane[1].zbin_QTX = quants->u_zbin[qindex];
+ x->plane[1].round_QTX = quants->u_round[qindex];
+ x->plane[1].dequant_QTX = dequants->u_dequant_QTX[qindex];
+
+ // V
+ x->plane[2].quant_QTX = quants->v_quant[qindex];
+ x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex];
+ x->plane[2].round_fp_QTX = quants->v_round_fp[qindex];
+ x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex];
+ x->plane[2].zbin_QTX = quants->v_zbin[qindex];
+ x->plane[2].round_QTX = quants->v_round[qindex];
+ x->plane[2].dequant_QTX = dequants->v_dequant_QTX[qindex];
+}
+
+void av1_set_qmatrix(const CommonQuantParams *quant_params, int segment_id,
+ MACROBLOCKD *xd) {
+ const int use_qmatrix = av1_use_qmatrix(quant_params, xd, segment_id);
+ const int qmlevel_y =
+ use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1;
+ const int qmlevel_u =
+ use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1;
+ const int qmlevel_v =
+ use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1;
+ const int qmlevel_ls[MAX_MB_PLANE] = { qmlevel_y, qmlevel_u, qmlevel_v };
+ for (int i = 0; i < MAX_MB_PLANE; ++i) {
+ const int qmlevel = qmlevel_ls[i];
+ memcpy(&xd->plane[i].seg_qmatrix[segment_id],
+ quant_params->gqmatrix[qmlevel][i],
+ sizeof(quant_params->gqmatrix[qmlevel][i]));
+ memcpy(&xd->plane[i].seg_iqmatrix[segment_id],
+ quant_params->giqmatrix[qmlevel][i],
+ sizeof(quant_params->giqmatrix[qmlevel][i]));
+ }
+}
+
+void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
+ int segment_id, const int do_update) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonQuantParams *const quant_params = &cm->quant_params;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+ int qindex_rd;
+
+ const int current_qindex = AOMMAX(
+ 0,
+ AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag
+ ? quant_params->base_qindex + x->delta_qindex
+ : quant_params->base_qindex));
+ const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex);
+
+ if (cpi->oxcf.sb_qp_sweep) {
+ const int current_rd_qindex =
+ AOMMAX(0, AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag
+ ? quant_params->base_qindex +
+ x->rdmult_delta_qindex
+ : quant_params->base_qindex));
+ qindex_rd = av1_get_qindex(&cm->seg, segment_id, current_rd_qindex);
+ } else {
+ qindex_rd = qindex;
+ }
+
+ const int qindex_rdmult = qindex_rd + quant_params->y_dc_delta_q;
+ const int rdmult = av1_compute_rd_mult(
+ qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
+ const int qindex_change = x->qindex != qindex;
+ if (qindex_change || do_update) {
+ av1_set_q_index(&cpi->enc_quant_dequant_params, qindex, x);
+ }
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ if ((segment_id != x->prev_segment_id) ||
+ av1_use_qmatrix(quant_params, xd, segment_id)) {
+ av1_set_qmatrix(quant_params, segment_id, xd);
+ }
+
+ x->seg_skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
+
+ av1_set_error_per_bit(&x->errorperbit, rdmult);
+ av1_set_sad_per_bit(cpi, &x->sadperbit, qindex_rd);
+
+ x->prev_segment_id = segment_id;
+}
+
+void av1_frame_init_quantizer(AV1_COMP *cpi) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ x->prev_segment_id = -1;
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 1);
+}
+
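+// HDR chroma deltaq helpers (used by av1_set_quantizer() below): map the luma
+// qindex to a QP, derive a chroma QP with a linear model, and convert back to
+// a qindex delta, clamped to [-12 * QP_SCALE_FACTOR, 0] so chroma is never
+// quantized more coarsely than luma. The CHROMA_*_QP_* constants and
+// QP_SCALE_FACTOR are defined elsewhere in the encoder.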
+static int adjust_hdr_cb_deltaq(int base_qindex) {
+ double baseQp = base_qindex / QP_SCALE_FACTOR;
+ const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET;
+ const double dcbQP = CHROMA_CB_QP_SCALE * chromaQp * QP_SCALE_FACTOR;
+ int dqpCb = (int)(dcbQP + (dcbQP < 0 ? -0.5 : 0.5));
+ dqpCb = AOMMIN(0, dqpCb);
+ dqpCb = (int)CLIP(dqpCb, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR);
+ return dqpCb;
+}
+
+static int adjust_hdr_cr_deltaq(int base_qindex) {
+ double baseQp = base_qindex / QP_SCALE_FACTOR;
+ const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET;
+ const double dcrQP = CHROMA_CR_QP_SCALE * chromaQp * QP_SCALE_FACTOR;
+ int dqpCr = (int)(dcrQP + (dcrQP < 0 ? -0.5 : 0.5));
+ dqpCr = AOMMIN(0, dqpCr);
+ dqpCr = (int)CLIP(dqpCr, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR);
+ return dqpCr;
+}
+
+void av1_set_quantizer(AV1_COMMON *const cm, int min_qmlevel, int max_qmlevel,
+ int q, int enable_chroma_deltaq, int enable_hdr_deltaq) {
+  // The quantizer has to be reinitialized with av1_init_quantizer() if any
+ // delta_q changes.
+ CommonQuantParams *quant_params = &cm->quant_params;
+ quant_params->base_qindex = AOMMAX(cm->delta_q_info.delta_q_present_flag, q);
+ quant_params->y_dc_delta_q = 0;
+
+ if (enable_chroma_deltaq) {
+ // TODO(aomedia:2717): need to design better delta
+ quant_params->u_dc_delta_q = 2;
+ quant_params->u_ac_delta_q = 2;
+ quant_params->v_dc_delta_q = 2;
+ quant_params->v_ac_delta_q = 2;
+ } else {
+ quant_params->u_dc_delta_q = 0;
+ quant_params->u_ac_delta_q = 0;
+ quant_params->v_dc_delta_q = 0;
+ quant_params->v_ac_delta_q = 0;
+ }
+
+  // Following section 8.3.2 of the T-REC-H.Sup15 document, applied to the
+  // AV1 qindex range [0, 255].
+ if (enable_hdr_deltaq) {
+ int dqpCb = adjust_hdr_cb_deltaq(quant_params->base_qindex);
+ int dqpCr = adjust_hdr_cr_deltaq(quant_params->base_qindex);
+ quant_params->u_dc_delta_q = quant_params->u_ac_delta_q = dqpCb;
+ quant_params->v_dc_delta_q = quant_params->v_ac_delta_q = dqpCr;
+ if (dqpCb != dqpCr) {
+ cm->seq_params->separate_uv_delta_q = 1;
+ }
+ }
+
+ quant_params->qmatrix_level_y =
+ aom_get_qmlevel(quant_params->base_qindex, min_qmlevel, max_qmlevel);
+ quant_params->qmatrix_level_u =
+ aom_get_qmlevel(quant_params->base_qindex + quant_params->u_ac_delta_q,
+ min_qmlevel, max_qmlevel);
+
+ if (!cm->seq_params->separate_uv_delta_q)
+ quant_params->qmatrix_level_v = quant_params->qmatrix_level_u;
+ else
+ quant_params->qmatrix_level_v =
+ aom_get_qmlevel(quant_params->base_qindex + quant_params->v_ac_delta_q,
+ min_qmlevel, max_qmlevel);
+}
+
+// Table that converts 0-63 Q-range values passed in from outside to the
+// Qindex range used internally.
+static const int quantizer_to_qindex[] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48,
+ 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100,
+ 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152,
+ 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204,
+ 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255,
+};
+
+int av1_quantizer_to_qindex(int quantizer) {
+ return quantizer_to_qindex[quantizer];
+}
+
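+// Inverse of av1_quantizer_to_qindex(): returns the smallest 0-63 quantizer
+// whose qindex is at least the given qindex (the mapping is roughly
+// qindex / 4, with the top two entries stretched to 249 and 255).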
+int av1_qindex_to_quantizer(int qindex) {
+ int quantizer;
+
+ for (quantizer = 0; quantizer < 64; ++quantizer)
+ if (quantizer_to_qindex[quantizer] >= qindex) return quantizer;
+
+ return 63;
+}
diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h
new file mode 100644
index 0000000000..040973376d
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_quantize.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_QUANTIZE_H_
+#define AOM_AV1_ENCODER_AV1_QUANTIZE_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/common/scan.h"
+#include "av1/encoder/block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
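+// Per-call quantization parameters for the facades in av1_quantize.c.
+// log_scale selects the extra transform scaling: the facades dispatch
+// log_scale 1 to the 32x32 kernels and log_scale 2 to the 64x64 kernels.
+// qmatrix/iqmatrix are NULL when quantization matrices are not in use.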
+typedef struct QUANT_PARAM {
+ int log_scale;
+ TX_SIZE tx_size;
+ const qm_val_t *qmatrix;
+ const qm_val_t *iqmatrix;
+ int use_quant_b_adapt;
+ int use_optimize_b;
+ int xform_quant_idx;
+} QUANT_PARAM;
+
+typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+// The QUANTS structure is used only for internal quantizer setup in
+// av1_quantize.c.
+// All of its fields use the same coefficient shift/scaling as TX.
+typedef struct {
+  // 0: dc 1: ac 2-7: ac repeated to SIMD width
+ DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
+
+  // TODO(jingning): Re-working of the quantization is in progress; decide
+  // whether to deprecate the current use of y_quant.
+ DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_round_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_round_fp[QINDEX_RANGE][8]);
+
+ DECLARE_ALIGNED(16, int16_t, u_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_round[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_round[QINDEX_RANGE][8]);
+} QUANTS;
+
+// The Dequants structure is used only for internal quantizer setup in
+// av1_quantize.c.
+// Fields are suffixed according to whether or not they're expressed in
+// the same coefficient shift/precision as TX or a fixed Q3 format.
+typedef struct {
+ DECLARE_ALIGNED(16, int16_t,
+ y_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t,
+ u_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t,
+ v_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width
+} Dequants;
+
+// The DeltaQuantParams structure holds the dc/ac deltaq parameters.
+typedef struct {
+ int y_dc_delta_q;
+ int u_dc_delta_q;
+ int u_ac_delta_q;
+ int v_dc_delta_q;
+ int v_ac_delta_q;
+} DeltaQuantParams;
+
+typedef struct {
+ // Quantization parameters for internal quantizer setup.
+ QUANTS quants;
+ // Dequantization parameters for internal quantizer setup.
+ Dequants dequants;
+ // Deltaq parameters to track the state of the dc/ac deltaq parameters in
+ // cm->quant_params. It is used to decide whether the quantizer tables need
+ // to be re-initialized.
+ DeltaQuantParams prev_deltaq_params;
+} EncQuantDequantParams;
+
+struct AV1_COMP;
+struct AV1Common;
+
+void av1_frame_init_quantizer(struct AV1_COMP *cpi);
+
+void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ int segment_id, const int do_update);
+
+void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,
+ int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q,
+ int v_ac_delta_q, QUANTS *const quants,
+ Dequants *const deq);
+
+void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params,
+ const CommonQuantParams *quant_params,
+ aom_bit_depth_t bit_depth);
+
+void av1_set_quantizer(struct AV1Common *const cm, int min_qmlevel,
+ int max_qmlevel, int q, int enable_chroma_deltaq,
+ int enable_hdr_deltaq);
+
+int av1_quantizer_to_qindex(int quantizer);
+
+int av1_qindex_to_quantizer(int qindex);
+
+void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
+
+/*!\brief Quantize transform coefficients without using qmatrix
+ *
+ * quant_ptr, dequant_ptr and round_ptr are size 2 arrays,
+ * where index 0 corresponds to dc coeff and index 1 corresponds to ac coeffs.
+ *
+ * \param[in] quant_ptr 16-bit fixed point representation of inverse
+ * quantize step size, i.e. 2^16/dequant
+ * \param[in] dequant_ptr quantize step size
+ * \param[in] round_ptr rounding
+ * \param[in] log_scale the relative log scale of the transform
+ * coefficients
+ * \param[in] scan scan[i] indicates the position of ith to-be-coded
+ * coefficient
+ * \param[in] coeff_count number of coefficients
+ * \param[out] qcoeff_ptr quantized coefficients
+ * \param[out] dqcoeff_ptr dequantized coefficients
+ *
+ * \return The last non-zero coefficient's scan index plus 1
+ */
+int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2],
+ const int16_t dequant_ptr[2],
+ const int16_t round_ptr[2], int log_scale,
+ const int16_t *scan, int coeff_count,
+ const tran_low_t *coeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr);
+
+void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+/*!\brief Update quantize parameters in MACROBLOCK
+ *
+ * \param[in]  enc_quant_dequant_params This parameter caches the quantize and
+ * dequantize parameters for all q
+ * indices.
+ * \param[in] qindex Quantize index used for the current
+ * superblock.
+ * \param[out] x A superblock data structure for
+ * encoder.
+ */
+void av1_set_q_index(const EncQuantDequantParams *enc_quant_dequant_params,
+ int qindex, MACROBLOCK *x);
+
+/*!\brief Update quantize matrix in MACROBLOCKD based on segment id
+ *
+ * \param[in] quant_params Quantize parameters used by encoder and decoder
+ * \param[in] segment_id Segment id.
+ * \param[out] xd A superblock data structure used by encoder and
+ * decoder.
+ */
+void av1_set_qmatrix(const CommonQuantParams *quant_params, int segment_id,
+ MACROBLOCKD *xd);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_QUANTIZE_H_
diff --git a/third_party/aom/av1/encoder/av1_temporal_denoiser.c b/third_party/aom/av1/encoder/av1_temporal_denoiser.c
new file mode 100644
index 0000000000..3012df6311
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_temporal_denoiser.c
@@ -0,0 +1,805 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_temporal_denoiser.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef OUTPUT_YUV_DENOISED
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
+#endif
+
+static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ (void)bs;
+ return 3 + (increase_denoising ? 1 : 0);
+}
+
+static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ (void)bs;
+ (void)increase_denoising;
+ return 4;
+}
+
+static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ (void)bs;
+ (void)increase_denoising;
+ return 625;
+}
+
+static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 80 : 40);
+}
+
+static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
+ int motion_magnitude) {
+ if (motion_magnitude > noise_motion_thresh(bs, increase_denoising)) {
+ if (increase_denoising)
+ return (1 << num_pels_log2_lookup[bs]) << 2;
+ else
+ return 0;
+ } else {
+ return (1 << num_pels_log2_lookup[bs]) << 4;
+ }
+}
+
+static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+}
+
+// TODO(kyslov): If increase_denoising is enabled in the future,
+// we might need to update the code for calculating 'total_adj' in
+// case the C code is not bit-exact with the corresponding SSE2 code.
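+// Two-pass temporal filter. The first (strong) pass moves each pixel toward
+// the motion-compensated average: small differences are taken verbatim,
+// larger ones are adjusted by 3/4/6 (plus shift_inc). If the accumulated
+// adjustment exceeds the strong threshold, a second pass dampens the result
+// by up to 'delta' per pixel; if the total is still too large, the caller is
+// told to copy the block instead of filtering it.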
+int av1_denoiser_filter_c(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_avg, int mc_avg_stride,
+ uint8_t *avg, int avg_stride, int increase_denoising,
+ BLOCK_SIZE bs, int motion_magnitude) {
+ int r, c;
+ const uint8_t *sig_start = sig;
+ const uint8_t *mc_avg_start = mc_avg;
+ uint8_t *avg_start = avg;
+ int diff, adj, absdiff, delta;
+ int adj_val[] = { 3, 4, 6 };
+ int total_adj = 0;
+ int shift_inc = 1;
+
+  // If motion_magnitude is small, make the denoiser more aggressive by
+  // increasing the adjustment for each level. Add another increment for
+  // blocks that are labeled for increased denoising.
+ if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
+ if (increase_denoising) {
+ shift_inc = 2;
+ }
+ adj_val[0] += shift_inc;
+ adj_val[1] += shift_inc;
+ adj_val[2] += shift_inc;
+ }
+
+ // First attempt to apply a strong temporal denoising filter.
+ for (r = 0; r < block_size_high[bs]; ++r) {
+ for (c = 0; c < block_size_wide[bs]; ++c) {
+ diff = mc_avg[c] - sig[c];
+ absdiff = abs(diff);
+
+ if (absdiff <= absdiff_thresh(bs, increase_denoising)) {
+ avg[c] = mc_avg[c];
+ total_adj += diff;
+ } else {
+ switch (absdiff) {
+ case 4:
+ case 5:
+ case 6:
+ case 7: adj = adj_val[0]; break;
+ case 8:
+ case 9:
+ case 10:
+ case 11:
+ case 12:
+ case 13:
+ case 14:
+ case 15: adj = adj_val[1]; break;
+ default: adj = adj_val[2];
+ }
+ if (diff > 0) {
+ avg[c] = AOMMIN(UINT8_MAX, sig[c] + adj);
+ total_adj += adj;
+ } else {
+ avg[c] = AOMMAX(0, sig[c] - adj);
+ total_adj -= adj;
+ }
+ }
+ }
+ sig += sig_stride;
+ avg += avg_stride;
+ mc_avg += mc_avg_stride;
+ }
+
+ // If the strong filter did not modify the signal too much, we're all set.
+ if (abs(total_adj) <= total_adj_strong_thresh(bs, increase_denoising)) {
+ return FILTER_BLOCK;
+ }
+
+ // Otherwise, we try to dampen the filter if the delta is not too high.
+ delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising)) >>
+ num_pels_log2_lookup[bs]) +
+ 1;
+
+ if (delta >= delta_thresh(bs, increase_denoising)) {
+ return COPY_BLOCK;
+ }
+
+ mc_avg = mc_avg_start;
+ avg = avg_start;
+ sig = sig_start;
+ for (r = 0; r < block_size_high[bs]; ++r) {
+ for (c = 0; c < block_size_wide[bs]; ++c) {
+ diff = mc_avg[c] - sig[c];
+ adj = abs(diff);
+ if (adj > delta) {
+ adj = delta;
+ }
+ if (diff > 0) {
+        // A positive diff means we made a positive adjustment in the first
+        // (strong) pass, so now make a negative adjustment to bring the
+        // denoised signal down.
+ avg[c] = AOMMAX(0, avg[c] - adj);
+ total_adj -= adj;
+ } else {
+        // A negative diff means we made a negative adjustment in the first
+        // (strong) pass, so now make a positive adjustment to bring the
+        // denoised signal up.
+ avg[c] = AOMMIN(UINT8_MAX, avg[c] + adj);
+ total_adj += adj;
+ }
+ }
+ sig += sig_stride;
+ avg += avg_stride;
+ mc_avg += mc_avg_stride;
+ }
+
+ // We can use the filter if it has been sufficiently dampened
+ if (abs(total_adj) <= total_adj_weak_thresh(bs, increase_denoising)) {
+ return FILTER_BLOCK;
+ }
+ return COPY_BLOCK;
+}
+
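+// mi_row/mi_col are in 4x4 mode-info units, so shift left by 2 to get the
+// pixel offset of the block's top-left corner within the frame buffer.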
+static uint8_t *block_start(uint8_t *framebuf, int stride, int mi_row,
+ int mi_col) {
+ return framebuf + (stride * mi_row << 2) + (mi_col << 2);
+}
+
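+// Decide whether the block can be denoised and, if so, build the motion
+// compensated prediction from the denoiser's running average into
+// mc_running_avg_y. Falls back to the zero-mv (GLOBALMV) reference, biased
+// toward LAST_FRAME, when the best inter mode is not clearly better; returns
+// COPY_BLOCK when the block is too small, the SSE or motion magnitude is too
+// large, or the needed denoiser buffer is unallocated.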
+static AV1_DENOISER_DECISION perform_motion_compensation(
+ AV1_COMMON *const cm, AV1_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs,
+ int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx,
+ int motion_magnitude, int *zeromv_filter, int num_spatial_layers, int width,
+ int lst_fb_idx, int gld_fb_idx, int use_svc, int spatial_layer,
+ int use_gf_temporal_ref) {
+ const int sse_diff = (ctx->newmv_sse == UINT_MAX)
+ ? 0
+ : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse);
+ int frame;
+ int denoise_layer_idx = 0;
+ MACROBLOCKD *filter_mbd = &mb->e_mbd;
+ MB_MODE_INFO *mi = filter_mbd->mi[0];
+ MB_MODE_INFO saved_mi;
+ int i;
+ struct buf_2d saved_dst[MAX_MB_PLANE];
+ struct buf_2d saved_pre[MAX_MB_PLANE];
+ // const RefBuffer *saved_block_refs[2];
+ MV_REFERENCE_FRAME saved_frame;
+
+ frame = ctx->best_reference_frame;
+
+ saved_mi = *mi;
+
+  // Avoid denoising small blocks: 16x16 blocks are denoised only when the
+  // noise level is above kDenLow or the frame width is at most 480.
+ if (bs == BLOCK_8X8 || bs == BLOCK_8X16 || bs == BLOCK_16X8 ||
+ (bs == BLOCK_16X16 && width > 480 &&
+ denoiser->denoising_level <= kDenLow))
+ return COPY_BLOCK;
+
+ // If the best reference frame uses inter-prediction and there is enough of a
+ // difference in sum-squared-error, use it.
+ if (frame != INTRA_FRAME && frame != ALTREF_FRAME && frame != GOLDEN_FRAME &&
+ sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) {
+ mi->ref_frame[0] = ctx->best_reference_frame;
+ mi->mode = ctx->best_sse_inter_mode;
+ mi->mv[0] = ctx->best_sse_mv;
+ } else {
+ // Otherwise, use the zero reference frame.
+ frame = ctx->best_zeromv_reference_frame;
+ ctx->newmv_sse = ctx->zeromv_sse;
+ // Bias to last reference.
+ if ((num_spatial_layers > 1 && !use_gf_temporal_ref) ||
+ frame == ALTREF_FRAME ||
+ (frame == GOLDEN_FRAME && use_gf_temporal_ref) ||
+ (frame != LAST_FRAME &&
+ ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) ||
+ denoiser->denoising_level >= kDenHigh))) {
+ frame = LAST_FRAME;
+ ctx->newmv_sse = ctx->zeromv_lastref_sse;
+ }
+ mi->ref_frame[0] = frame;
+ mi->mode = GLOBALMV;
+ mi->mv[0].as_int = 0;
+ ctx->best_sse_inter_mode = GLOBALMV;
+ ctx->best_sse_mv.as_int = 0;
+ *zeromv_filter = 1;
+ if (denoiser->denoising_level > kDenMedium) {
+ motion_magnitude = 0;
+ }
+ }
+
+ saved_frame = frame;
+ // When using SVC, we need to map REF_FRAME to the frame buffer index.
+ if (use_svc) {
+ if (frame == LAST_FRAME)
+ frame = lst_fb_idx + 1;
+ else if (frame == GOLDEN_FRAME)
+ frame = gld_fb_idx + 1;
+ // Shift for the second spatial layer.
+ if (num_spatial_layers - spatial_layer == 2)
+ frame = frame + denoiser->num_ref_frames;
+ denoise_layer_idx = num_spatial_layers - spatial_layer - 1;
+ }
+
+  // Force a copy (no denoising; copy the source into the denoised buffer) if
+  // running_avg_y[frame] is NULL.
+ if (denoiser->running_avg_y[frame].buffer_alloc == NULL) {
+ // Restore everything to its original state
+ *mi = saved_mi;
+ return COPY_BLOCK;
+ }
+
+ if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
+ // Restore everything to its original state
+ *mi = saved_mi;
+ return COPY_BLOCK;
+ }
+ if (motion_magnitude > (noise_motion_thresh(bs, increase_denoising) << 3)) {
+ // Restore everything to its original state
+ *mi = saved_mi;
+ return COPY_BLOCK;
+ }
+
+ // We will restore these after motion compensation.
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ saved_pre[i] = filter_mbd->plane[i].pre[0];
+ saved_dst[i] = filter_mbd->plane[i].dst;
+ }
+
+ // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
+ // struct.
+ set_ref_ptrs(cm, filter_mbd, saved_frame, NONE);
+ av1_setup_pre_planes(filter_mbd, 0, &(denoiser->running_avg_y[frame]), mi_row,
+ mi_col, filter_mbd->block_ref_scale_factors[0], 1);
+ av1_setup_dst_planes(filter_mbd->plane, bs,
+ &(denoiser->mc_running_avg_y[denoise_layer_idx]), mi_row,
+ mi_col, 0, 1);
+
+ av1_enc_build_inter_predictor_y(filter_mbd, mi_row, mi_col);
+
+ // Restore everything to its original state
+ *mi = saved_mi;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ filter_mbd->plane[i].pre[0] = saved_pre[i];
+ filter_mbd->plane[i].dst = saved_dst[i];
+ }
+
+ return FILTER_BLOCK;
+}
+
+void av1_denoiser_denoise(AV1_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
+ BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
+ AV1_DENOISER_DECISION *denoiser_decision,
+ int use_gf_temporal_ref) {
+ int mv_col, mv_row;
+ int motion_magnitude = 0;
+ int zeromv_filter = 0;
+ AV1_DENOISER *denoiser = &cpi->denoiser;
+ AV1_DENOISER_DECISION decision = COPY_BLOCK;
+
+ const int shift =
+ cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2
+ ? denoiser->num_ref_frames
+ : 0;
+ YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift];
+ const int denoise_layer_index =
+ cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1;
+ YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index];
+ uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
+
+ uint8_t *mc_avg_start =
+ block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col);
+ struct buf_2d src = mb->plane[0].src;
+ int increase_denoising = 0;
+ int last_is_reference = cpi->ref_frame_flags & AOM_LAST_FLAG;
+ mv_col = ctx->best_sse_mv.as_mv.col;
+ mv_row = ctx->best_sse_mv.as_mv.row;
+ motion_magnitude = mv_row * mv_row + mv_col * mv_col;
+
+ if (denoiser->denoising_level == kDenHigh) increase_denoising = 1;
+
+  // Copy the block if LAST_FRAME is not a reference.
+  // The last reference doesn't always exist when SVC layers change
+  // dynamically, e.g. the top spatial layer has no last reference when it is
+  // brought up on the fly for the first time.
+ if (last_is_reference && denoiser->denoising_level >= kDenLow &&
+ !ctx->sb_skip_denoising)
+ decision = perform_motion_compensation(
+ &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx,
+ motion_magnitude, &zeromv_filter, cpi->svc.number_spatial_layers,
+ cpi->source->y_width, cpi->ppi->rtc_ref.ref_idx[0],
+ cpi->ppi->rtc_ref.ref_idx[3], cpi->ppi->use_svc,
+ cpi->svc.spatial_layer_id, use_gf_temporal_ref);
+
+ if (decision == FILTER_BLOCK) {
+ decision = av1_denoiser_filter(src.buf, src.stride, mc_avg_start,
+ mc_avg.y_stride, avg_start, avg.y_stride,
+ increase_denoising, bs, motion_magnitude);
+ }
+
+ if (decision == FILTER_BLOCK) {
+ aom_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
+ block_size_wide[bs], block_size_high[bs]);
+ } else { // COPY_BLOCK
+ aom_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
+ block_size_wide[bs], block_size_high[bs]);
+ }
+ *denoiser_decision = decision;
+ if (decision == FILTER_BLOCK && zeromv_filter == 1)
+ *denoiser_decision = FILTER_ZEROMV_BLOCK;
+}
+
+static void copy_frame(YV12_BUFFER_CONFIG *const dest,
+ const YV12_BUFFER_CONFIG *const src) {
+ int r;
+ const uint8_t *srcbuf = src->y_buffer;
+ uint8_t *destbuf = dest->y_buffer;
+
+ assert(dest->y_width == src->y_width);
+ assert(dest->y_height == src->y_height);
+
+ for (r = 0; r < dest->y_height; ++r) {
+ memcpy(destbuf, srcbuf, dest->y_width);
+ destbuf += dest->y_stride;
+ srcbuf += src->y_stride;
+ }
+}
+
+static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest,
+ YV12_BUFFER_CONFIG *const src) {
+ uint8_t *tmp_buf = dest->y_buffer;
+ assert(dest->y_width == src->y_width);
+ assert(dest->y_height == src->y_height);
+ dest->y_buffer = src->y_buffer;
+ src->y_buffer = tmp_buf;
+}
+
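+// Keep the denoiser's running-average buffers in sync with the encoder's
+// reference updates: when exactly one reference slot is refreshed the buffers
+// are swapped (a cheap pointer swap); when several slots take the same
+// denoised frame they must each be copied.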
+void av1_denoiser_update_frame_info(
+ AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct RTC_REF *rtc_ref,
+ struct SVC *svc, FRAME_TYPE frame_type, int refresh_alt_ref_frame,
+ int refresh_golden_frame, int refresh_last_frame, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx, int resized,
+ int svc_refresh_denoiser_buffers, int second_spatial_layer) {
+ const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0;
+  // Copy the source into the denoised reference buffers on KEY_FRAME, or if
+  // the just-encoded frame was resized. For SVC, copy the source if the base
+  // spatial layer was a key frame.
+ if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset ||
+ svc_refresh_denoiser_buffers) {
+ int i;
+ // Start at 1 so as not to overwrite the INTRA_FRAME
+ for (i = 1; i < denoiser->num_ref_frames; ++i) {
+ if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL)
+ copy_frame(&denoiser->running_avg_y[i + shift], &src);
+ }
+ denoiser->reset = 0;
+ return;
+ }
+
+ if (rtc_ref->set_ref_frame_config) {
+ int i;
+ for (i = 0; i < REF_FRAMES; i++) {
+ if (rtc_ref->refresh[svc->spatial_layer_id] & (1 << i))
+ copy_frame(&denoiser->running_avg_y[i + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ } else {
+    // If more than one refresh occurs, we must copy the frame buffer.
+ if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) >
+ 1) {
+ if (refresh_alt_ref_frame) {
+ copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ if (refresh_golden_frame) {
+ copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ if (refresh_last_frame) {
+ copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ } else {
+ if (refresh_alt_ref_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ if (refresh_golden_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ if (refresh_last_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ }
+ }
+}
+
+void av1_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
+ ctx->zeromv_sse = INT64_MAX;
+ ctx->newmv_sse = INT64_MAX;
+ ctx->zeromv_lastref_sse = INT64_MAX;
+ ctx->best_sse_mv.as_int = 0;
+}
+
+void av1_denoiser_update_frame_stats(MB_MODE_INFO *mi, int64_t sse,
+ PREDICTION_MODE mode,
+ PICK_MODE_CONTEXT *ctx) {
+ if (mi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) {
+ ctx->zeromv_sse = sse;
+ ctx->best_zeromv_reference_frame = mi->ref_frame[0];
+ if (mi->ref_frame[0] == LAST_FRAME) ctx->zeromv_lastref_sse = sse;
+ }
+
+ if (mi->mv[0].as_int != 0 && sse < ctx->newmv_sse) {
+ ctx->newmv_sse = sse;
+ ctx->best_sse_inter_mode = mode;
+ ctx->best_sse_mv = mi->mv[0];
+ ctx->best_reference_frame = mi->ref_frame[0];
+ }
+}
+
+static int av1_denoiser_realloc_svc_helper(AV1_COMMON *cm,
+ AV1_DENOISER *denoiser, int fb_idx) {
+ int fail = 0;
+ if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) {
+ fail = aom_alloc_frame_buffer(
+ &denoiser->running_avg_y[fb_idx], cm->width, cm->height,
+ cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+ cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->features.byte_alignment, 0, 0);
+ if (fail) {
+ av1_denoiser_free(denoiser);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser,
+ struct RTC_REF *rtc_ref, struct SVC *svc,
+ int svc_buf_shift, int refresh_alt,
+ int refresh_gld, int refresh_lst, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx) {
+ int fail = 0;
+ if (rtc_ref->set_ref_frame_config) {
+ int i;
+ for (i = 0; i < REF_FRAMES; i++) {
+ if (cm->current_frame.frame_type == KEY_FRAME ||
+ rtc_ref->refresh[svc->spatial_layer_id] & (1 << i)) {
+ fail = av1_denoiser_realloc_svc_helper(cm, denoiser,
+ i + 1 + svc_buf_shift);
+ }
+ }
+ } else {
+ if (refresh_alt) {
+ // Increase the frame buffer index by 1 to map it to the buffer index in
+ // the denoiser.
+ fail = av1_denoiser_realloc_svc_helper(cm, denoiser,
+ alt_fb_idx + 1 + svc_buf_shift);
+ if (fail) return 1;
+ }
+ if (refresh_gld) {
+ fail = av1_denoiser_realloc_svc_helper(cm, denoiser,
+ gld_fb_idx + 1 + svc_buf_shift);
+ if (fail) return 1;
+ }
+ if (refresh_lst) {
+ fail = av1_denoiser_realloc_svc_helper(cm, denoiser,
+ lst_fb_idx + 1 + svc_buf_shift);
+ if (fail) return 1;
+ }
+ }
+ return 0;
+}
+
+int av1_denoiser_alloc(AV1_COMMON *cm, struct SVC *svc, AV1_DENOISER *denoiser,
+ int use_svc, int noise_sen, int width, int height,
+ int ssx, int ssy, int use_highbitdepth, int border) {
+ int i, layer, fail, init_num_ref_frames;
+ const int legacy_byte_alignment = 0;
+ int num_layers = 1;
+ int scaled_width = width;
+ int scaled_height = height;
+ if (use_svc) {
+ LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id *
+ svc->number_temporal_layers +
+ svc->temporal_layer_id];
+ av1_get_layer_resolution(width, height, lc->scaling_factor_num,
+ lc->scaling_factor_den, &scaled_width,
+ &scaled_height);
+    // For SVC: denoise at most the 2 highest spatial layers.
+ if (noise_sen >= 2)
+ // Denoise from one spatial layer below the top.
+ svc->first_layer_denoise = AOMMAX(svc->number_spatial_layers - 2, 0);
+ else
+ // Only denoise the top spatial layer.
+ svc->first_layer_denoise = AOMMAX(svc->number_spatial_layers - 1, 0);
+ num_layers = svc->number_spatial_layers - svc->first_layer_denoise;
+ }
+ assert(denoiser != NULL);
+ denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES;
+ init_num_ref_frames = use_svc ? REF_FRAMES : NONSVC_REF_FRAMES;
+ denoiser->num_layers = num_layers;
+ CHECK_MEM_ERROR(cm, denoiser->running_avg_y,
+ aom_calloc(denoiser->num_ref_frames * num_layers,
+ sizeof(denoiser->running_avg_y[0])));
+ CHECK_MEM_ERROR(
+ cm, denoiser->mc_running_avg_y,
+ aom_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0])));
+
+ for (layer = 0; layer < num_layers; ++layer) {
+ const int denoise_width = (layer == 0) ? width : scaled_width;
+ const int denoise_height = (layer == 0) ? height : scaled_height;
+ for (i = 0; i < init_num_ref_frames; ++i) {
+ fail = aom_alloc_frame_buffer(
+ &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer],
+ denoise_width, denoise_height, ssx, ssy, use_highbitdepth, border,
+ legacy_byte_alignment, 0, 0);
+ if (fail) {
+ av1_denoiser_free(denoiser);
+ return 1;
+ }
+#ifdef OUTPUT_YUV_DENOISED
+ make_grayscale(&denoiser->running_avg_y[i]);
+#endif
+ }
+
+ fail = aom_alloc_frame_buffer(
+ &denoiser->mc_running_avg_y[layer], denoise_width, denoise_height, ssx,
+ ssy, use_highbitdepth, border, legacy_byte_alignment, 0, 0);
+ if (fail) {
+ av1_denoiser_free(denoiser);
+ return 1;
+ }
+ }
+
+  // denoiser->last_source is only used for noise estimation, so allocate it
+  // only for the top layer.
+ fail = aom_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy,
+ use_highbitdepth, border, legacy_byte_alignment,
+ 0, 0);
+ if (fail) {
+ av1_denoiser_free(denoiser);
+ return 1;
+ }
+#ifdef OUTPUT_YUV_DENOISED
+ make_grayscale(&denoiser->running_avg_y[i]);
+#endif
+ denoiser->frame_buffer_initialized = 1;
+ denoiser->denoising_level = kDenMedium;
+ denoiser->prev_denoising_level = kDenMedium;
+ denoiser->reset = 0;
+ denoiser->current_denoiser_frame = 0;
+ return 0;
+}
+
+void av1_denoiser_free(AV1_DENOISER *denoiser) {
+ int i;
+ if (denoiser == NULL) {
+ return;
+ }
+ denoiser->frame_buffer_initialized = 0;
+ for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) {
+ aom_free_frame_buffer(&denoiser->running_avg_y[i]);
+ }
+ aom_free(denoiser->running_avg_y);
+ denoiser->running_avg_y = NULL;
+
+ for (i = 0; i < denoiser->num_layers; ++i) {
+ aom_free_frame_buffer(&denoiser->mc_running_avg_y[i]);
+ }
+
+ aom_free(denoiser->mc_running_avg_y);
+ denoiser->mc_running_avg_y = NULL;
+ aom_free_frame_buffer(&denoiser->last_source);
+}
+
+// TODO(kyslov) Enable when SVC temporal denoising is implemented
+#if 0
+static void force_refresh_longterm_ref(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ // If long term reference is used, force refresh of that slot, so
+ // denoiser buffer for long term reference stays in sync.
+ if (svc->use_gf_temporal_ref_current_layer) {
+ int index = svc->spatial_layer_id;
+ if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1;
+ assert(index >= 0);
+ cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx;
+ cpi->refresh_alt_ref_frame = 1;
+ }
+}
+#endif
+
+void av1_denoiser_set_noise_level(AV1_COMP *const cpi, int noise_level) {
+ AV1_DENOISER *const denoiser = &cpi->denoiser;
+ denoiser->denoising_level = noise_level;
+ if (denoiser->denoising_level > kDenLowLow &&
+ denoiser->prev_denoising_level == kDenLowLow) {
+ denoiser->reset = 1;
+// TODO(kyslov) Enable when SVC temporal denoising is implemented
+#if 0
+ force_refresh_longterm_ref(cpi);
+#endif
+ } else {
+ denoiser->reset = 0;
+ }
+ denoiser->prev_denoising_level = denoiser->denoising_level;
+}
+
+// Scale/increase the partition threshold for denoiser speed-up.
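+// The threshold is scaled by 5/4 by default, and by 3/2 (temporal layers 0-1)
+// or 7/4 (higher layers) for low-sumdiff content, lighting changes, kDenHigh
+// noise, or enhancement temporal layers.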
+int64_t av1_scale_part_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level,
+ CONTENT_STATE_SB content_state,
+ int temporal_layer_id) {
+ if ((content_state.source_sad_nonrd <= kLowSad &&
+ content_state.low_sumdiff) ||
+ (content_state.source_sad_nonrd == kHighSad &&
+ content_state.low_sumdiff) ||
+ (content_state.lighting_change && !content_state.low_sumdiff) ||
+ (noise_level == kDenHigh) || (temporal_layer_id != 0)) {
+ int64_t scaled_thr =
+ (temporal_layer_id < 2) ? (3 * threshold) >> 1 : (7 * threshold) >> 2;
+ return scaled_thr;
+ } else {
+ return (5 * threshold) >> 2;
+ }
+}
+
+// Scale/increase the ac skip threshold for denoiser speed-up.
+int64_t av1_scale_acskip_thresh(int64_t threshold,
+ AV1_DENOISER_LEVEL noise_level, int abs_sumdiff,
+ int temporal_layer_id) {
+ if (noise_level >= kDenLow && abs_sumdiff < 5)
+ threshold *= (noise_level == kDenLow) ? 2
+ : (temporal_layer_id == 2) ? 10
+ : 6;
+ return threshold;
+}
+
+void av1_denoiser_reset_on_first_frame(AV1_COMP *const cpi) {
+ if (/*av1_denoise_svc_non_key(cpi) &&*/
+ cpi->denoiser.current_denoiser_frame == 0) {
+ cpi->denoiser.reset = 1;
+// TODO(kyslov) Enable when SVC temporal denoising is implemented
+#if 0
+ force_refresh_longterm_ref(cpi);
+#endif
+ }
+}
+
+void av1_denoiser_update_ref_frame(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ SVC *const svc = &cpi->svc;
+
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+ cpi->denoiser.denoising_level > kDenLowLow) {
+ int svc_refresh_denoiser_buffers = 0;
+ int denoise_svc_second_layer = 0;
+ FRAME_TYPE frame_type = cm->current_frame.frame_type == INTRA_ONLY_FRAME
+ ? KEY_FRAME
+ : cm->current_frame.frame_type;
+ cpi->denoiser.current_denoiser_frame++;
+ const int resize_pending = is_frame_resize_pending(cpi);
+
+ if (cpi->ppi->use_svc) {
+// TODO(kyslov) Enable when SVC temporal denoising is implemented
+#if 0
+ const int svc_buf_shift =
+ svc->number_spatial_layers - svc->spatial_layer_id == 2
+ ? cpi->denoiser.num_ref_frames
+ : 0;
+ int layer =
+ LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ svc_refresh_denoiser_buffers =
+ lc->is_key_frame || svc->spatial_layer_sync[svc->spatial_layer_id];
+ denoise_svc_second_layer =
+ svc->number_spatial_layers - svc->spatial_layer_id == 2 ? 1 : 0;
+ // Check if we need to allocate extra buffers in the denoiser
+ // for refreshed frames.
+ if (av1_denoiser_realloc_svc(cm, &cpi->denoiser, rtc_ref,
+ svc, svc_buf_shift,
+ cpi->refresh_alt_ref_frame,
+ cpi->refresh_golden_frame,
+ cpi->refresh_last_frame, cpi->alt_fb_idx,
+ cpi->gld_fb_idx, cpi->lst_fb_idx))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to re-allocate denoiser for SVC");
+#endif
+ }
+ av1_denoiser_update_frame_info(
+ &cpi->denoiser, *cpi->source, rtc_ref, svc, frame_type,
+ cpi->refresh_frame.alt_ref_frame, cpi->refresh_frame.golden_frame, 1,
+ rtc_ref->ref_idx[6], rtc_ref->ref_idx[3], rtc_ref->ref_idx[0],
+ resize_pending, svc_refresh_denoiser_buffers, denoise_svc_second_layer);
+ }
+}
+
+#ifdef OUTPUT_YUV_DENOISED
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv) {
+ int r, c;
+ uint8_t *u = yuv->u_buffer;
+ uint8_t *v = yuv->v_buffer;
+
+ for (r = 0; r < yuv->uv_height; ++r) {
+ for (c = 0; c < yuv->uv_width; ++c) {
+ u[c] = UINT8_MAX / 2;
+ v[c] = UINT8_MAX / 2;
+ }
+ u += yuv->uv_stride;
+ v += yuv->uv_stride;
+ }
+}
+
+void aom_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) {
+ unsigned char *src = s->y_buffer;
+ int h = s->y_crop_height;
+
+ do {
+ fwrite(src, s->y_width, 1, yuv_file);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_crop_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_crop_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_file);
+ src += s->uv_stride;
+ } while (--h);
+}
+#endif
diff --git a/third_party/aom/av1/encoder/av1_temporal_denoiser.h b/third_party/aom/av1/encoder/av1_temporal_denoiser.h
new file mode 100644
index 0000000000..14dcccce69
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_temporal_denoiser.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_
+#define AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_
+
+#include "av1/encoder/block.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MOTION_MAGNITUDE_THRESHOLD (8 * 3)
+
+// The denoiser is used in non-SVC real-time mode, which does not use alt-ref,
+// so there is no need to allocate a buffer for it; hence REF_FRAMES - 1.
+#define NONSVC_REF_FRAMES (REF_FRAMES - 1)
+
+// Number of frame buffers when SVC is used: [0] is the current denoised
+// buffer and [1..8] map to REF_FRAMES.
+#define SVC_REF_FRAMES 9
+
+typedef enum av1_denoiser_decision {
+ COPY_BLOCK,
+ FILTER_BLOCK,
+ FILTER_ZEROMV_BLOCK
+} AV1_DENOISER_DECISION;
+
+typedef enum av1_denoiser_level {
+ kDenLowLow,
+ kDenLow,
+ kDenMedium,
+ kDenHigh
+} AV1_DENOISER_LEVEL;
+
+typedef struct av1_denoiser {
+ YV12_BUFFER_CONFIG *running_avg_y;
+ YV12_BUFFER_CONFIG *mc_running_avg_y;
+ YV12_BUFFER_CONFIG last_source;
+ int frame_buffer_initialized;
+ int reset;
+ int num_ref_frames;
+ int num_layers;
+ unsigned int current_denoiser_frame;
+ AV1_DENOISER_LEVEL denoising_level;
+ AV1_DENOISER_LEVEL prev_denoising_level;
+} AV1_DENOISER;
+
+typedef struct {
+ int64_t zero_last_cost_orig;
+ unsigned int *ref_frame_cost;
+ int_mv (*frame_mv)[REF_FRAMES];
+ int reuse_inter_pred;
+ TX_SIZE best_tx_size;
+ PREDICTION_MODE best_mode;
+ MV_REFERENCE_FRAME best_ref_frame;
+ int_interpfilters best_pred_filter;
+ uint8_t best_mode_skip_txfm;
+} AV1_PICKMODE_CTX_DEN;
+
+struct AV1_COMP;
+struct SVC;
+struct RTC_REF;
+
+void av1_denoiser_update_frame_info(
+ AV1_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct RTC_REF *rtc_ref,
+ struct SVC *svc, FRAME_TYPE frame_type, int refresh_alt_ref_frame,
+ int refresh_golden_frame, int refresh_last_frame, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx, int resized,
+ int svc_refresh_denoiser_buffers, int second_spatial_layer);
+
+void av1_denoiser_denoise(struct AV1_COMP *cpi, MACROBLOCK *mb, int mi_row,
+ int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
+ AV1_DENOISER_DECISION *denoiser_decision,
+ int use_gf_temporal_ref);
+
+void av1_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
+
+void av1_denoiser_update_frame_stats(MB_MODE_INFO *mi, int64_t sse,
+ PREDICTION_MODE mode,
+ PICK_MODE_CONTEXT *ctx);
+
+int av1_denoiser_realloc_svc(AV1_COMMON *cm, AV1_DENOISER *denoiser,
+ struct RTC_REF *rtc, struct SVC *svc,
+ int svc_buf_shift, int refresh_alt,
+ int refresh_gld, int refresh_lst, int alt_fb_idx,
+ int gld_fb_idx, int lst_fb_idx);
+
+int av1_denoiser_alloc(AV1_COMMON *cm, struct SVC *svc, AV1_DENOISER *denoiser,
+ int use_svc, int noise_sen, int width, int height,
+ int ssx, int ssy, int use_highbitdepth, int border);
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+// This function is used by both the C and SSE2 denoiser implementations.
+// Define it as a static function within the scope where this header is
+// referenced.
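+// For example, at BLOCK_16X16 (256 pixels) the threshold evaluates to
+// 256 * 3 = 768 when increase_denoising is set and 256 * 2 = 512 otherwise.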
+static INLINE int total_adj_strong_thresh(BLOCK_SIZE bs,
+ int increase_denoising) {
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+}
+#endif
+
+void av1_denoiser_free(AV1_DENOISER *denoiser);
+
+void av1_denoiser_set_noise_level(struct AV1_COMP *const cpi, int noise_level);
+
+void av1_denoiser_reset_on_first_frame(struct AV1_COMP *const cpi);
+
+int64_t av1_scale_part_thresh(int64_t threshold, AV1_DENOISER_LEVEL noise_level,
+ CONTENT_STATE_SB content_state,
+ int temporal_layer_id);
+
+int64_t av1_scale_acskip_thresh(int64_t threshold,
+ AV1_DENOISER_LEVEL noise_level, int abs_sumdiff,
+ int temporal_layer_id);
+
+void av1_denoiser_update_ref_frame(struct AV1_COMP *const cpi);
+
+void aom_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_TEMPORAL_DENOISER_H_
diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c
new file mode 100644
index 0000000000..219784fedf
--- /dev/null
+++ b/third_party/aom/av1/encoder/bitstream.c
@@ -0,0 +1,4248 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "aom/aom_encoder.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem_ops.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#include "av1/common/cdef.h"
+#include "av1/common/cfl.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/pickrst.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+
+#define ENC_MISMATCH_DEBUG 0
+#define SETUP_TIME_OH_CONST 5 // Setup time overhead constant per worker
+#define JOB_DISP_TIME_OH_CONST 1 // Job dispatch time overhead per tile
+
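+// Writes a value v in [0, n) with a truncated binary code: the m = (1 << l) - n
+// smallest values use l - 1 bits and the rest use l bits. For example, with
+// n = 5 (l = 3, m = 3), v = 0..2 take 2 bits while v = 3..4 take 3 bits.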
+static INLINE void write_uniform(aom_writer *w, int n, int v) {
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;
+ if (l == 0) return;
+ if (v < m) {
+ aom_write_literal(w, v, l - 1);
+ } else {
+ aom_write_literal(w, m + ((v - m) >> 1), l - 1);
+ aom_write_literal(w, (v - m) & 1, 1);
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE void loop_restoration_write_sb_coeffs(
+ const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx,
+ aom_writer *const w, int plane, FRAME_COUNTS *counts);
+#endif
+
+static AOM_INLINE void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx,
+ const MB_MODE_INFO *mi,
+ const MB_MODE_INFO *above_mi,
+ const MB_MODE_INFO *left_mi,
+ PREDICTION_MODE mode,
+ aom_writer *w) {
+ assert(!is_intrabc_block(mi));
+ (void)mi;
+ aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi),
+ INTRA_MODES);
+}
+
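+// The single-reference inter mode is signaled as a cascade of binary
+// decisions, each with its own context: first NEWMV vs. the rest, then
+// GLOBALMV vs. the rest, and finally NEARESTMV vs. NEARMV.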
+static AOM_INLINE void write_inter_mode(aom_writer *w, PREDICTION_MODE mode,
+ FRAME_CONTEXT *ec_ctx,
+ const int16_t mode_ctx) {
+ const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+
+ aom_write_symbol(w, mode != NEWMV, ec_ctx->newmv_cdf[newmv_ctx], 2);
+
+ if (mode != NEWMV) {
+ const int16_t zeromv_ctx =
+ (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+ aom_write_symbol(w, mode != GLOBALMV, ec_ctx->zeromv_cdf[zeromv_ctx], 2);
+
+ if (mode != GLOBALMV) {
+ int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ aom_write_symbol(w, mode != NEARESTMV, ec_ctx->refmv_cdf[refmv_ctx], 2);
+ }
+ }
+}
+
+static AOM_INLINE void write_drl_idx(
+ FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi,
+ const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) {
+ assert(mbmi->ref_mv_idx < 3);
+
+ const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
+ if (new_mv) {
+ int idx;
+ for (idx = 0; idx < 2; ++idx) {
+ if (mbmi_ext_frame->ref_mv_count > idx + 1) {
+ uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, idx);
+
+ aom_write_symbol(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_cdf[drl_ctx],
+ 2);
+ if (mbmi->ref_mv_idx == idx) return;
+ }
+ }
+ return;
+ }
+
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+ int idx;
+    // TODO(jingning): Temporary solution to compensate for the NEARESTMV
+    // offset.
+ for (idx = 1; idx < 3; ++idx) {
+ if (mbmi_ext_frame->ref_mv_count > idx + 1) {
+ uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, idx);
+ aom_write_symbol(w, mbmi->ref_mv_idx != (idx - 1),
+ ec_ctx->drl_cdf[drl_ctx], 2);
+ if (mbmi->ref_mv_idx == (idx - 1)) return;
+ }
+ }
+ return;
+ }
+}
+
+static AOM_INLINE void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w,
+ PREDICTION_MODE mode,
+ const int16_t mode_ctx) {
+ assert(is_inter_compound_mode(mode));
+ aom_write_symbol(w, INTER_COMPOUND_OFFSET(mode),
+ xd->tile_ctx->inter_compound_mode_cdf[mode_ctx],
+ INTER_COMPOUND_MODES);
+}
+
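+// Recursively signals the transform partition tree. At each node a binary
+// symbol says whether the coded tx_size is kept (0) or split (1) into the
+// next smaller size, recursing until the coded depth or MAX_VARTX_DEPTH is
+// reached.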
+static AOM_INLINE void write_tx_size_vartx(MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi,
+ TX_SIZE tx_size, int depth,
+ int blk_row, int blk_col,
+ aom_writer *w) {
+ FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+ const int max_blocks_high = max_block_high(xd, mbmi->bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->bsize, 0);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (depth == MAX_VARTX_DEPTH) {
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ return;
+ }
+
+ const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row,
+ mbmi->bsize, tx_size);
+ const int txb_size_index =
+ av1_get_txb_size_index(mbmi->bsize, blk_row, blk_col);
+ const int write_txfm_partition =
+ tx_size == mbmi->inter_tx_size[txb_size_index];
+ if (write_txfm_partition) {
+ aom_write_symbol(w, 0, ec_ctx->txfm_partition_cdf[ctx], 2);
+
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ // TODO(yuec): set correct txfm partition update for qttx
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+
+ aom_write_symbol(w, 1, ec_ctx->txfm_partition_cdf[ctx], 2);
+
+ if (sub_txs == TX_4X4) {
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, sub_txs, tx_size);
+ return;
+ }
+
+ assert(bsw > 0 && bsh > 0);
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ const int offsetr = blk_row + row;
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ const int offsetc = blk_col + col;
+ write_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, w);
+ }
+ }
+ }
+}
+
+static AOM_INLINE void write_selected_tx_size(const MACROBLOCKD *xd,
+ aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ if (block_signals_txsize(bsize)) {
+ const TX_SIZE tx_size = mbmi->tx_size;
+ const int tx_size_ctx = get_tx_size_context(xd);
+ const int depth = tx_size_to_depth(tx_size, bsize);
+ const int max_depths = bsize_to_max_depth(bsize);
+ const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+
+ assert(depth >= 0 && depth <= max_depths);
+ assert(!is_inter_block(mbmi));
+ assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi)));
+
+ aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
+ max_depths + 1);
+ }
+}
+
+static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ uint8_t segment_id, const MB_MODE_INFO *mi,
+ aom_writer *w) {
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 1;
+ } else {
+ const int skip_txfm = mi->skip_txfm;
+ const int ctx = av1_get_skip_txfm_context(xd);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, skip_txfm, ec_ctx->skip_txfm_cdfs[ctx], 2);
+ return skip_txfm;
+ }
+}
+
+static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ uint8_t segment_id, const MB_MODE_INFO *mi,
+ aom_writer *w) {
+ if (!cm->current_frame.skip_mode_info.skip_mode_flag) return 0;
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 0;
+ }
+ const int skip_mode = mi->skip_mode;
+ if (!is_comp_ref_allowed(mi->bsize)) {
+ assert(!skip_mode);
+ return 0;
+ }
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ||
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ // These features imply single-reference mode, while skip mode implies
+ // compound reference. Hence, the two are mutually exclusive.
+ // In other words, skip_mode is implicitly 0 here.
+ assert(!skip_mode);
+ return 0;
+ }
+ const int ctx = av1_get_skip_mode_context(xd);
+ aom_write_symbol(w, skip_mode, xd->tile_ctx->skip_mode_cdfs[ctx], 2);
+ return skip_mode;
+}
+
+static AOM_INLINE void write_is_inter(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd, uint8_t segment_id,
+ aom_writer *w, const int is_inter) {
+ if (!segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ assert(is_inter);
+ return;
+ }
+ const int ctx = av1_get_intra_inter_context(xd);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, is_inter, ec_ctx->intra_inter_cdf[ctx], 2);
+ }
+}
+
+static AOM_INLINE void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi,
+ aom_writer *w) {
+ MOTION_MODE last_motion_mode_allowed =
+ cm->features.switchable_motion_mode
+ ? motion_mode_allowed(cm->global_motion, xd, mbmi,
+ cm->features.allow_warped_motion)
+ : SIMPLE_TRANSLATION;
+ assert(mbmi->motion_mode <= last_motion_mode_allowed);
+ switch (last_motion_mode_allowed) {
+ case SIMPLE_TRANSLATION: break;
+ case OBMC_CAUSAL:
+ aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL,
+ xd->tile_ctx->obmc_cdf[mbmi->bsize], 2);
+ break;
+ default:
+ aom_write_symbol(w, mbmi->motion_mode,
+ xd->tile_ctx->motion_mode_cdf[mbmi->bsize],
+ MOTION_MODES);
+ }
+}
+
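+// Signals a quantizer delta: min(|delta|, DELTA_Q_SMALL) is coded as a symbol;
+// larger magnitudes additionally send an explicit remainder bit count (3 bits)
+// plus the remainder, and a sign bit follows any nonzero delta.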
+static AOM_INLINE void write_delta_qindex(const MACROBLOCKD *xd,
+ int delta_qindex, aom_writer *w) {
+ int sign = delta_qindex < 0;
+ int abs = sign ? -delta_qindex : delta_qindex;
+ int rem_bits, thr;
+ int smallval = abs < DELTA_Q_SMALL ? 1 : 0;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ aom_write_symbol(w, AOMMIN(abs, DELTA_Q_SMALL), ec_ctx->delta_q_cdf,
+ DELTA_Q_PROBS + 1);
+
+ if (!smallval) {
+ rem_bits = get_msb(abs - 1);
+ thr = (1 << rem_bits) + 1;
+ aom_write_literal(w, rem_bits - 1, 3);
+ aom_write_literal(w, abs - thr, rem_bits);
+ }
+ if (abs > 0) {
+ aom_write_bit(w, sign);
+ }
+}
+
+static AOM_INLINE void write_delta_lflevel(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd, int lf_id,
+ int delta_lflevel,
+ int delta_lf_multi, aom_writer *w) {
+ int sign = delta_lflevel < 0;
+ int abs = sign ? -delta_lflevel : delta_lflevel;
+ int rem_bits, thr;
+ int smallval = abs < DELTA_LF_SMALL ? 1 : 0;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ (void)cm;
+
+ if (delta_lf_multi) {
+ assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT
+ : FRAME_LF_COUNT - 2));
+ aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL),
+ ec_ctx->delta_lf_multi_cdf[lf_id], DELTA_LF_PROBS + 1);
+ } else {
+ aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf,
+ DELTA_LF_PROBS + 1);
+ }
+
+ if (!smallval) {
+ rem_bits = get_msb(abs - 1);
+ thr = (1 << rem_bits) + 1;
+ aom_write_literal(w, rem_bits - 1, 3);
+ aom_write_literal(w, abs - thr, rem_bits);
+ }
+ if (abs > 0) {
+ aom_write_bit(w, sign);
+ }
+}
+
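+// Packs the palette color-index map tokens. The first (top-left) index has no
+// coded neighbors, so it uses a truncated binary code; each subsequent index
+// is coded with a CDF conditioned on its neighbor color context.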
+static AOM_INLINE void pack_map_tokens(aom_writer *w, const TokenExtra **tp,
+ int n, int num, MapCdf map_pb_cdf) {
+ const TokenExtra *p = *tp;
+ const int palette_size_idx = n - PALETTE_MIN_SIZE;
+ write_uniform(w, n, p->token); // The first color index.
+ ++p;
+ --num;
+ for (int i = 0; i < num; ++i) {
+ assert((p->color_ctx >= 0) &&
+ (p->color_ctx < PALETTE_COLOR_INDEX_CONTEXTS));
+ aom_cdf_prob *color_map_cdf = map_pb_cdf[palette_size_idx][p->color_ctx];
+ aom_write_symbol(w, p->token, color_map_cdf, n);
+ ++p;
+ }
+ *tp = p;
+}
+
+static AOM_INLINE void pack_txb_tokens(
+ aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, const TokenExtra **tp,
+ const TokenExtra *const tok_end, MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
+ int plane, BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, int block,
+ int blk_row, int blk_col, TX_SIZE tx_size, TOKEN_STATS *token_stats) {
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const TX_SIZE plane_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
+ pd->subsampling_y)
+ : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+ blk_col)];
+
+ if (tx_size == plane_tx_size || plane) {
+ av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane, block, tx_size);
+#if CONFIG_RD_DEBUG
+ TOKEN_STATS tmp_token_stats;
+ init_token_stats(&tmp_token_stats);
+ token_stats->cost += tmp_token_stats.cost;
+#endif
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int step = bsh * bsw;
+ const int row_end =
+ AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+ const int col_end =
+ AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+
+ assert(bsw > 0 && bsh > 0);
+
+ for (int r = 0; r < row_end; r += bsh) {
+ const int offsetr = blk_row + r;
+ for (int c = 0; c < col_end; c += bsw) {
+ const int offsetc = blk_col + c;
+ pack_txb_tokens(w, cm, x, tp, tok_end, xd, mbmi, plane, plane_bsize,
+ bit_depth, block, offsetr, offsetc, sub_txs,
+ token_stats);
+ block += step;
+ }
+ }
+ }
+}
+
+static INLINE void set_spatial_segment_id(
+ const CommonModeInfoParams *const mi_params, uint8_t *segment_ids,
+ BLOCK_SIZE bsize, int mi_row, int mi_col, uint8_t segment_id) {
+ const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw);
+ const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh);
+
+ const int mi_stride = mi_params->mi_cols;
+
+ set_segment_id(segment_ids, mi_offset, xmis, ymis, mi_stride, segment_id);
+}
+
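+// Maps x onto a code in [0, max) in which values close to the predictor "ref"
+// get the smallest indices: x == ref maps to 0 and the codes alternate on
+// either side of ref. For example, with max = 8 and ref = 3, x = 3, 4, 2, 5, 1
+// map to 0, 1, 2, 3, 4. Used to code segment ids against the spatial
+// prediction.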
+int av1_neg_interleave(int x, int ref, int max) {
+ assert(x < max);
+ const int diff = x - ref;
+ if (!ref) return x;
+ if (ref >= (max - 1)) return -x + max - 1;
+ if (2 * ref < max) {
+ if (abs(diff) <= ref) {
+ if (diff > 0)
+ return (diff << 1) - 1;
+ else
+ return ((-diff) << 1);
+ }
+ return x;
+ } else {
+ if (abs(diff) < (max - ref)) {
+ if (diff > 0)
+ return (diff << 1) - 1;
+ else
+ return ((-diff) << 1);
+ }
+ return (max - x) - 1;
+ }
+}
+
+static AOM_INLINE void write_segment_id(AV1_COMP *cpi, MACROBLOCKD *const xd,
+ const MB_MODE_INFO *const mbmi,
+ aom_writer *w,
+ const struct segmentation *seg,
+ struct segmentation_probs *segp,
+ int skip_txfm) {
+ if (!seg->enabled || !seg->update_map) return;
+
+ AV1_COMMON *const cm = &cpi->common;
+ int cdf_num;
+ const uint8_t pred = av1_get_spatial_seg_pred(
+ cm, xd, &cdf_num, cpi->cyclic_refresh->skip_over4x4);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ if (skip_txfm) {
+    // The tx size still needs to be transmitted for intra blocks even if
+    // skip_txfm is true. Changing segment_id may make the tx size invalid,
+    // e.g., changing from lossless to lossy.
+ assert(is_inter_block(mbmi) || !cpi->enc_seg.has_lossless_segment);
+
+ set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize,
+ mi_row, mi_col, pred);
+ set_spatial_segment_id(&cm->mi_params, cpi->enc_seg.map, mbmi->bsize,
+ mi_row, mi_col, pred);
+ /* mbmi is read only but we need to update segment_id */
+ ((MB_MODE_INFO *)mbmi)->segment_id = pred;
+ return;
+ }
+
+ const int coded_id =
+ av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1);
+ aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num];
+ aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS);
+ set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->bsize,
+ mi_row, mi_col, mbmi->segment_id);
+}
+
+#define WRITE_REF_BIT(bname, pname) \
+ aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(xd), 2)
+
+// This function encodes the reference frame(s) of the current block.
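+// Single references are coded with a bit tree: first forward (LAST..GOLDEN)
+// vs. backward (BWDREF..ALTREF), then down to the individual frame. Compound
+// references first signal whether the pair is unidirectional or bidirectional.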
+static AOM_INLINE void write_ref_frames(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd, aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_compound = has_second_ref(mbmi);
+ const uint8_t segment_id = mbmi->segment_id;
+
+  // If segment-level coding of this signal is disabled, or the segment allows
+  // multiple reference frame options, the reference frame is coded explicitly.
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ assert(!is_compound);
+ assert(mbmi->ref_frame[0] ==
+ get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+ } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) ||
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ assert(!is_compound);
+ assert(mbmi->ref_frame[0] == LAST_FRAME);
+ } else {
+      // Signal whether compound prediction is used, unless that is already
+      // determined at the frame or segment level.
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+ if (is_comp_ref_allowed(mbmi->bsize))
+ aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(xd), 2);
+ } else {
+ assert((!is_compound) ==
+ (cm->current_frame.reference_mode == SINGLE_REFERENCE));
+ }
+
+ if (is_compound) {
+ const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
+ ? UNIDIR_COMP_REFERENCE
+ : BIDIR_COMP_REFERENCE;
+ aom_write_symbol(w, comp_ref_type, av1_get_comp_reference_type_cdf(xd),
+ 2);
+
+ if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+ const int bit = mbmi->ref_frame[0] == BWDREF_FRAME;
+ WRITE_REF_BIT(bit, uni_comp_ref_p);
+
+ if (!bit) {
+ assert(mbmi->ref_frame[0] == LAST_FRAME);
+ const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME ||
+ mbmi->ref_frame[1] == GOLDEN_FRAME;
+ WRITE_REF_BIT(bit1, uni_comp_ref_p1);
+ if (bit1) {
+ const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME;
+ WRITE_REF_BIT(bit2, uni_comp_ref_p2);
+ }
+ } else {
+ assert(mbmi->ref_frame[1] == ALTREF_FRAME);
+ }
+
+ return;
+ }
+
+ assert(comp_ref_type == BIDIR_COMP_REFERENCE);
+
+ const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME ||
+ mbmi->ref_frame[0] == LAST3_FRAME);
+ WRITE_REF_BIT(bit, comp_ref_p);
+
+ if (!bit) {
+ const int bit1 = mbmi->ref_frame[0] == LAST2_FRAME;
+ WRITE_REF_BIT(bit1, comp_ref_p1);
+ } else {
+ const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME;
+ WRITE_REF_BIT(bit2, comp_ref_p2);
+ }
+
+ const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME;
+ WRITE_REF_BIT(bit_bwd, comp_bwdref_p);
+
+ if (!bit_bwd) {
+ WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, comp_bwdref_p1);
+ }
+
+ } else {
+ const int bit0 = (mbmi->ref_frame[0] <= ALTREF_FRAME &&
+ mbmi->ref_frame[0] >= BWDREF_FRAME);
+ WRITE_REF_BIT(bit0, single_ref_p1);
+
+ if (bit0) {
+ const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME;
+ WRITE_REF_BIT(bit1, single_ref_p2);
+
+ if (!bit1) {
+ WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6);
+ }
+ } else {
+ const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME ||
+ mbmi->ref_frame[0] == GOLDEN_FRAME);
+ WRITE_REF_BIT(bit2, single_ref_p3);
+
+ if (!bit2) {
+ const int bit3 = mbmi->ref_frame[0] != LAST_FRAME;
+ WRITE_REF_BIT(bit3, single_ref_p4);
+ } else {
+ const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME;
+ WRITE_REF_BIT(bit4, single_ref_p5);
+ }
+ }
+ }
+ }
+}
+
+static AOM_INLINE void write_filter_intra_mode_info(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+ aom_writer *w) {
+ if (av1_filter_intra_allowed(cm, mbmi)) {
+ aom_write_symbol(w, mbmi->filter_intra_mode_info.use_filter_intra,
+ xd->tile_ctx->filter_intra_cdfs[mbmi->bsize], 2);
+ if (mbmi->filter_intra_mode_info.use_filter_intra) {
+ const FILTER_INTRA_MODE mode =
+ mbmi->filter_intra_mode_info.filter_intra_mode;
+ aom_write_symbol(w, mode, xd->tile_ctx->filter_intra_mode_cdf,
+ FILTER_INTRA_MODES);
+ }
+ }
+}
+
+static AOM_INLINE void write_angle_delta(aom_writer *w, int angle_delta,
+ aom_cdf_prob *cdf) {
+ aom_write_symbol(w, angle_delta + MAX_ANGLE_DELTA, cdf,
+ 2 * MAX_ANGLE_DELTA + 1);
+}
+
+static AOM_INLINE void write_mb_interp_filter(AV1_COMMON *const cm,
+ ThreadData *td, aom_writer *w) {
+ const MACROBLOCKD *xd = &td->mb.e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ if (!av1_is_interp_needed(xd)) {
+ int_interpfilters filters = av1_broadcast_interp_filter(
+ av1_unswitchable_filter(cm->features.interp_filter));
+ assert(mbmi->interp_filters.as_int == filters.as_int);
+ (void)filters;
+ return;
+ }
+ if (cm->features.interp_filter == SWITCHABLE) {
+ int dir;
+ for (dir = 0; dir < 2; ++dir) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ InterpFilter filter =
+ av1_extract_interp_filter(mbmi->interp_filters, dir);
+ aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx],
+ SWITCHABLE_FILTERS);
+ ++td->interp_filter_selected[filter];
+ if (cm->seq_params->enable_dual_filter == 0) return;
+ }
+ }
+}
+
+// Transmit color values with delta encoding. Write the first value as a
+// literal and then the delta between each value and the previous one.
+// "min_val" is the smallest possible value of the deltas.
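+// For example, colors {10, 15, 22} at 8-bit depth with min_val = 1 send the
+// literal 10 followed by the biased deltas 5 - 1 = 4 and 7 - 1 = 6; the delta
+// bit width is derived from max_delta (floored at bit_depth - 3) and may
+// shrink as the remaining value range narrows.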
+static AOM_INLINE void delta_encode_palette_colors(const int *colors, int num,
+ int bit_depth, int min_val,
+ aom_writer *w) {
+ if (num <= 0) return;
+ assert(colors[0] < (1 << bit_depth));
+ aom_write_literal(w, colors[0], bit_depth);
+ if (num == 1) return;
+ int max_delta = 0;
+ int deltas[PALETTE_MAX_SIZE];
+ memset(deltas, 0, sizeof(deltas));
+ for (int i = 1; i < num; ++i) {
+ assert(colors[i] < (1 << bit_depth));
+ const int delta = colors[i] - colors[i - 1];
+ deltas[i - 1] = delta;
+ assert(delta >= min_val);
+ if (delta > max_delta) max_delta = delta;
+ }
+ const int min_bits = bit_depth - 3;
+ int bits = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits);
+ assert(bits <= bit_depth);
+ int range = (1 << bit_depth) - colors[0] - min_val;
+ aom_write_literal(w, bits - min_bits, 2);
+ for (int i = 0; i < num - 1; ++i) {
+ aom_write_literal(w, deltas[i] - min_val, bits);
+ range -= deltas[i];
+ bits = AOMMIN(bits, av1_ceil_log2(range));
+ }
+}
+
+// Transmit luma palette color values. First signal if each color in the color
+// cache is used. Those colors that are not in the cache are transmitted with
+// delta encoding.
+static AOM_INLINE void write_palette_colors_y(
+ const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, aom_writer *w) {
+ const int n = pmi->palette_size[0];
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+ int out_cache_colors[PALETTE_MAX_SIZE];
+ uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
+ const int n_out_cache =
+ av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n,
+ cache_color_found, out_cache_colors);
+ int n_in_cache = 0;
+ for (int i = 0; i < n_cache && n_in_cache < n; ++i) {
+ const int found = cache_color_found[i];
+ aom_write_bit(w, found);
+ n_in_cache += found;
+ }
+ assert(n_in_cache + n_out_cache == n);
+ delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 1, w);
+}
+
+// Write chroma palette color values. The U channel is handled like the luma
+// channel. For the V channel, either delta encoding or raw values are used,
+// whichever costs fewer bits.
+static AOM_INLINE void write_palette_colors_uv(
+ const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, aom_writer *w) {
+ const int n = pmi->palette_size[1];
+ const uint16_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE;
+ const uint16_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE;
+ // U channel colors.
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+ int out_cache_colors[PALETTE_MAX_SIZE];
+ uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
+ const int n_out_cache = av1_index_color_cache(
+ color_cache, n_cache, colors_u, n, cache_color_found, out_cache_colors);
+ int n_in_cache = 0;
+ for (int i = 0; i < n_cache && n_in_cache < n; ++i) {
+ const int found = cache_color_found[i];
+ aom_write_bit(w, found);
+ n_in_cache += found;
+ }
+ delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 0, w);
+
+ // V channel colors. Don't use color cache as the colors are not sorted.
+ const int max_val = 1 << bit_depth;
+ int zero_count = 0, min_bits_v = 0;
+ int bits_v =
+ av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v);
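+  // Rate with delta coding: 2 bits for the bit-width header, bit_depth bits
+  // for the first color, and (bits_v + 1) bits (magnitude plus sign) for each
+  // of the remaining n - 1 colors, minus the sign bits saved on zero deltas.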
+ const int rate_using_delta =
+ 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count;
+ const int rate_using_raw = bit_depth * n;
+ if (rate_using_delta < rate_using_raw) { // delta encoding
+ assert(colors_v[0] < (1 << bit_depth));
+ aom_write_bit(w, 1);
+ aom_write_literal(w, bits_v - min_bits_v, 2);
+ aom_write_literal(w, colors_v[0], bit_depth);
+ for (int i = 1; i < n; ++i) {
+ assert(colors_v[i] < (1 << bit_depth));
+ if (colors_v[i] == colors_v[i - 1]) { // No need to signal sign bit.
+ aom_write_literal(w, 0, bits_v);
+ continue;
+ }
+ const int delta = abs((int)colors_v[i] - colors_v[i - 1]);
+ const int sign_bit = colors_v[i] < colors_v[i - 1];
+ if (delta <= max_val - delta) {
+ aom_write_literal(w, delta, bits_v);
+ aom_write_bit(w, sign_bit);
+ } else {
+ aom_write_literal(w, max_val - delta, bits_v);
+ aom_write_bit(w, !sign_bit);
+ }
+ }
+ } else { // Transmit raw values.
+ aom_write_bit(w, 0);
+ for (int i = 0; i < n; ++i) {
+ assert(colors_v[i] < (1 << bit_depth));
+ aom_write_literal(w, colors_v[i], bit_depth);
+ }
+ }
+}
+
+static AOM_INLINE void write_palette_mode_info(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd,
+ const MB_MODE_INFO *const mbmi,
+ aom_writer *w) {
+ const int num_planes = av1_num_planes(cm);
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize));
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+
+ if (mbmi->mode == DC_PRED) {
+ const int n = pmi->palette_size[0];
+ const int palette_y_mode_ctx = av1_get_palette_mode_ctx(xd);
+ aom_write_symbol(
+ w, n > 0,
+ xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_y_mode_ctx], 2);
+ if (n > 0) {
+ aom_write_symbol(w, n - PALETTE_MIN_SIZE,
+ xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
+ PALETTE_SIZES);
+ write_palette_colors_y(xd, pmi, cm->seq_params->bit_depth, w);
+ }
+ }
+
+ const int uv_dc_pred =
+ num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref;
+ if (uv_dc_pred) {
+ const int n = pmi->palette_size[1];
+ const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
+ aom_write_symbol(w, n > 0,
+ xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2);
+ if (n > 0) {
+ aom_write_symbol(w, n - PALETTE_MIN_SIZE,
+ xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
+ PALETTE_SIZES);
+ write_palette_colors_uv(xd, pmi, cm->seq_params->bit_depth, w);
+ }
+ }
+}
+
+void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
+ TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const FeatureFlags *const features = &cm->features;
+ const int is_inter = is_inter_block(mbmi);
+ if (get_ext_tx_types(tx_size, is_inter, features->reduced_tx_set_used) > 1 &&
+ ((!cm->seg.enabled && cm->quant_params.base_qindex > 0) ||
+ (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
+ !mbmi->skip_txfm &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+ const TxSetType tx_set_type = av1_get_ext_tx_set_type(
+ tx_size, is_inter, features->reduced_tx_set_used);
+ const int eset =
+ get_ext_tx_set(tx_size, is_inter, features->reduced_tx_set_used);
+ // eset == 0 should correspond to a set with only DCT_DCT and there
+ // is no need to send the tx_type
+ assert(eset > 0);
+ assert(av1_ext_tx_used[tx_set_type][tx_type]);
+ if (is_inter) {
+ aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type],
+ ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
+ av1_num_ext_tx_set[tx_set_type]);
+ } else {
+ PREDICTION_MODE intra_dir;
+ if (mbmi->filter_intra_mode_info.use_filter_intra)
+ intra_dir =
+ fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode];
+ else
+ intra_dir = mbmi->mode;
+ aom_write_symbol(
+ w, av1_ext_tx_ind[tx_set_type][tx_type],
+ ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir],
+ av1_num_ext_tx_set[tx_set_type]);
+ }
+ }
+}
+
+static AOM_INLINE void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx,
+ BLOCK_SIZE bsize,
+ PREDICTION_MODE mode,
+ aom_writer *w) {
+ aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group_lookup[bsize]],
+ INTRA_MODES);
+}
+
+static AOM_INLINE void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx,
+ UV_PREDICTION_MODE uv_mode,
+ PREDICTION_MODE y_mode,
+ CFL_ALLOWED_TYPE cfl_allowed,
+ aom_writer *w) {
+ aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[cfl_allowed][y_mode],
+ UV_INTRA_MODES - !cfl_allowed);
+}
+
+static AOM_INLINE void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx,
+ uint8_t idx, int8_t joint_sign,
+ aom_writer *w) {
+ aom_write_symbol(w, joint_sign, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS);
+ // Magnitudes are only signaled for nonzero codes.
+ if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+ aom_write_symbol(w, CFL_IDX_U(idx), cdf_u, CFL_ALPHABET_SIZE);
+ }
+ if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+ aom_write_symbol(w, CFL_IDX_V(idx), cdf_v, CFL_ALPHABET_SIZE);
+ }
+}
+
+static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd,
+ aom_writer *w, int skip) {
+ if (cm->features.coded_lossless || cm->features.allow_intrabc) return;
+
+ // At the start of a superblock, mark that we haven't yet written CDEF
+ // strengths for any of the CDEF units contained in this superblock.
+ const int sb_mask = (cm->seq_params->mib_size - 1);
+ const int mi_row_in_sb = (xd->mi_row & sb_mask);
+ const int mi_col_in_sb = (xd->mi_col & sb_mask);
+ if (mi_row_in_sb == 0 && mi_col_in_sb == 0) {
+ xd->cdef_transmitted[0] = xd->cdef_transmitted[1] =
+ xd->cdef_transmitted[2] = xd->cdef_transmitted[3] = false;
+ }
+
+ // CDEF unit size is 64x64 irrespective of the superblock size.
+ const int cdef_size = 1 << (6 - MI_SIZE_LOG2);
+
+ // Find index of this CDEF unit in this superblock.
+ const int index_mask = cdef_size;
+ const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0);
+ const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0);
+ const int index = (cm->seq_params->sb_size == BLOCK_128X128)
+ ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb
+ : 0;
+
+ // Write CDEF strength to the first non-skip coding block in this CDEF unit.
+ if (!xd->cdef_transmitted[index] && !skip) {
+ // CDEF strength for this CDEF unit needs to be stored in the MB_MODE_INFO
+ // of the 1st block in this CDEF unit.
+ const int first_block_mask = ~(cdef_size - 1);
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int grid_idx =
+ get_mi_grid_idx(mi_params, xd->mi_row & first_block_mask,
+ xd->mi_col & first_block_mask);
+ const MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx];
+ aom_write_literal(w, mbmi->cdef_strength, cm->cdef_info.cdef_bits);
+ xd->cdef_transmitted[index] = true;
+ }
+}
+
+static AOM_INLINE void write_inter_segment_id(
+ AV1_COMP *cpi, MACROBLOCKD *const xd, aom_writer *w,
+ const struct segmentation *const seg, struct segmentation_probs *const segp,
+ int skip, int preskip) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ AV1_COMMON *const cm = &cpi->common;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ if (seg->update_map) {
+ if (preskip) {
+ if (!seg->segid_preskip) return;
+ } else {
+ if (seg->segid_preskip) return;
+ if (skip) {
+ write_segment_id(cpi, xd, mbmi, w, seg, segp, 1);
+ if (seg->temporal_update) mbmi->seg_id_predicted = 0;
+ return;
+ }
+ }
+ if (seg->temporal_update) {
+ const int pred_flag = mbmi->seg_id_predicted;
+ aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd);
+ aom_write_symbol(w, pred_flag, pred_cdf, 2);
+ if (!pred_flag) {
+ write_segment_id(cpi, xd, mbmi, w, seg, segp, 0);
+ }
+ if (pred_flag) {
+ set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map,
+ mbmi->bsize, mi_row, mi_col, mbmi->segment_id);
+ }
+ } else {
+ write_segment_id(cpi, xd, mbmi, w, seg, segp, 0);
+ }
+ }
+}
+
+// If delta-q signaling is enabled, writes the delta_q index. Also writes the
+// delta loop filter levels, if those are enabled.
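+// The delta is transmitted in units of delta_q_res; e.g., with delta_q_res = 4
+// a qindex change of +8 relative to the running base is coded as +2.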
+static AOM_INLINE void write_delta_q_params(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd, int skip,
+ aom_writer *w) {
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+
+ if (delta_q_info->delta_q_present_flag) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const int super_block_upper_left =
+ ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) &&
+ ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0);
+
+ if ((bsize != cm->seq_params->sb_size || skip == 0) &&
+ super_block_upper_left) {
+ assert(mbmi->current_qindex > 0);
+ const int reduced_delta_qindex =
+ (mbmi->current_qindex - xd->current_base_qindex) /
+ delta_q_info->delta_q_res;
+ write_delta_qindex(xd, reduced_delta_qindex, w);
+ xd->current_base_qindex = mbmi->current_qindex;
+ if (delta_q_info->delta_lf_present_flag) {
+ if (delta_q_info->delta_lf_multi) {
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ int reduced_delta_lflevel =
+ (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
+ delta_q_info->delta_lf_res;
+ write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, 1, w);
+ xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
+ }
+ } else {
+ int reduced_delta_lflevel =
+ (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
+ delta_q_info->delta_lf_res;
+ write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, 0, w);
+ xd->delta_lf_from_base = mbmi->delta_lf_from_base;
+ }
+ }
+ }
+ }
+}
+
+static AOM_INLINE void write_intra_prediction_modes(const AV1_COMMON *cm,
+ MACROBLOCKD *const xd,
+ int is_keyframe,
+ aom_writer *w) {
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const PREDICTION_MODE mode = mbmi->mode;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+
+ // Y mode.
+ if (is_keyframe) {
+ const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+ write_intra_y_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w);
+ } else {
+ write_intra_y_mode_nonkf(ec_ctx, bsize, mode, w);
+ }
+
+ // Y angle delta.
+ const int use_angle_delta = av1_use_angle_delta(bsize);
+ if (use_angle_delta && av1_is_directional_mode(mode)) {
+ write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y],
+ ec_ctx->angle_delta_cdf[mode - V_PRED]);
+ }
+
+ // UV mode and UV angle delta.
+ if (!cm->seq_params->monochrome && xd->is_chroma_ref) {
+ const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+ write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
+ if (uv_mode == UV_CFL_PRED)
+ write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
+ const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+ if (use_angle_delta && av1_is_directional_mode(intra_mode)) {
+ write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
+ ec_ctx->angle_delta_cdf[intra_mode - V_PRED]);
+ }
+ }
+
+ // Palette.
+ if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
+ write_palette_mode_info(cm, xd, mbmi, w);
+ }
+
+ // Filter intra.
+ write_filter_intra_mode_info(cm, xd, mbmi, w);
+}
+
+static INLINE int16_t mode_context_analyzer(
+ const int16_t mode_context, const MV_REFERENCE_FRAME *const rf) {
+ if (rf[1] <= INTRA_FRAME) return mode_context;
+
+ const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK;
+ const int16_t refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+
+ const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN(
+ newmv_ctx, COMP_NEWMV_CTXS - 1)];
+ return comp_ctx;
+}
+
+static INLINE int_mv get_ref_mv_from_stack(
+ int ref_idx, const MV_REFERENCE_FRAME *ref_frame, int ref_mv_idx,
+ const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame) {
+ const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+ const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext_frame->ref_mv_stack;
+
+ if (ref_frame[1] > INTRA_FRAME) {
+ assert(ref_idx == 0 || ref_idx == 1);
+ return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv
+ : curr_ref_mv_stack[ref_mv_idx].this_mv;
+ }
+
+ assert(ref_idx == 0);
+ return ref_mv_idx < mbmi_ext_frame->ref_mv_count
+ ? curr_ref_mv_stack[ref_mv_idx].this_mv
+ : mbmi_ext_frame->global_mvs[ref_frame_type];
+}
+
+static INLINE int_mv get_ref_mv(const MACROBLOCK *x, int ref_idx) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ int ref_mv_idx = mbmi->ref_mv_idx;
+ if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) {
+ assert(has_second_ref(mbmi));
+ ref_mv_idx += 1;
+ }
+ return get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx,
+ x->mbmi_ext_frame);
+}
+
+static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, ThreadData *const td,
+ aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const struct segmentation *const seg = &cm->seg;
+ struct segmentation_probs *const segp = &ec_ctx->seg;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = x->mbmi_ext_frame;
+ const PREDICTION_MODE mode = mbmi->mode;
+ const uint8_t segment_id = mbmi->segment_id;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const int allow_hp = cm->features.allow_high_precision_mv;
+ const int is_inter = is_inter_block(mbmi);
+ const int is_compound = has_second_ref(mbmi);
+ int ref;
+
+ write_inter_segment_id(cpi, xd, w, seg, segp, 0, 1);
+
+ write_skip_mode(cm, xd, segment_id, mbmi, w);
+
+ assert(IMPLIES(mbmi->skip_mode, mbmi->skip_txfm));
+ const int skip =
+ mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w);
+
+ write_inter_segment_id(cpi, xd, w, seg, segp, skip, 0);
+
+ write_cdef(cm, xd, w, skip);
+
+ write_delta_q_params(cm, xd, skip, w);
+
+ if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
+
+ if (mbmi->skip_mode) return;
+
+ if (!is_inter) {
+ write_intra_prediction_modes(cm, xd, 0, w);
+ } else {
+ int16_t mode_ctx;
+
+ av1_collect_neighbors_ref_counts(xd);
+
+ write_ref_frames(cm, xd, w);
+
+ mode_ctx =
+ mode_context_analyzer(mbmi_ext_frame->mode_context, mbmi->ref_frame);
+
+ // If segment skip is not enabled code the mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+ if (is_inter_compound_mode(mode))
+ write_inter_compound_mode(xd, w, mode, mode_ctx);
+ else if (is_inter_singleref_mode(mode))
+ write_inter_mode(w, mode, ec_ctx, mode_ctx);
+
+ if (mode == NEWMV || mode == NEW_NEWMV || have_nearmv_in_inter_mode(mode))
+ write_drl_idx(ec_ctx, mbmi, mbmi_ext_frame, w);
+ else
+ assert(mbmi->ref_mv_idx == 0);
+ }
+
+ if (mode == NEWMV || mode == NEW_NEWMV) {
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ const int_mv ref_mv = get_ref_mv(x, ref);
+ av1_encode_mv(cpi, w, td, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc,
+ allow_hp);
+ }
+ } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ const int_mv ref_mv = get_ref_mv(x, 1);
+ av1_encode_mv(cpi, w, td, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc,
+ allow_hp);
+ } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ const int_mv ref_mv = get_ref_mv(x, 0);
+ av1_encode_mv(cpi, w, td, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc,
+ allow_hp);
+ }
+
+ if (cpi->common.current_frame.reference_mode != COMPOUND_REFERENCE &&
+ cpi->common.seq_params->enable_interintra_compound &&
+ is_interintra_allowed(mbmi)) {
+ const int interintra = mbmi->ref_frame[1] == INTRA_FRAME;
+ const int bsize_group = size_group_lookup[bsize];
+ aom_write_symbol(w, interintra, ec_ctx->interintra_cdf[bsize_group], 2);
+ if (interintra) {
+ aom_write_symbol(w, mbmi->interintra_mode,
+ ec_ctx->interintra_mode_cdf[bsize_group],
+ INTERINTRA_MODES);
+ if (av1_is_wedge_used(bsize)) {
+ aom_write_symbol(w, mbmi->use_wedge_interintra,
+ ec_ctx->wedge_interintra_cdf[bsize], 2);
+ if (mbmi->use_wedge_interintra) {
+ aom_write_symbol(w, mbmi->interintra_wedge_index,
+ ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES);
+ }
+ }
+ }
+ }
+
+ if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mbmi, w);
+
+    // First write comp_group_idx to indicate the compound inter prediction
+    // mode group in use:
+    // Group A (0): dist_wtd_comp, compound_average
+    // Group B (1): interintra, compound_diffwtd, wedge
+ if (has_second_ref(mbmi)) {
+ const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+ cm->seq_params->enable_masked_compound;
+
+ if (masked_compound_used) {
+ const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
+ aom_write_symbol(w, mbmi->comp_group_idx,
+ ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2);
+ } else {
+ assert(mbmi->comp_group_idx == 0);
+ }
+
+ if (mbmi->comp_group_idx == 0) {
+ if (mbmi->compound_idx)
+ assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE);
+
+ if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) {
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+ aom_write_symbol(w, mbmi->compound_idx,
+ ec_ctx->compound_index_cdf[comp_index_ctx], 2);
+ } else {
+ assert(mbmi->compound_idx == 1);
+ }
+ } else {
+ assert(cpi->common.current_frame.reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode) &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION);
+ assert(masked_compound_used);
+ // compound_diffwtd, wedge
+ assert(mbmi->interinter_comp.type == COMPOUND_WEDGE ||
+ mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+
+ if (is_interinter_compound_used(COMPOUND_WEDGE, bsize))
+ aom_write_symbol(w, mbmi->interinter_comp.type - COMPOUND_WEDGE,
+ ec_ctx->compound_type_cdf[bsize],
+ MASKED_COMPOUND_TYPES);
+
+ if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+ assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+ aom_write_symbol(w, mbmi->interinter_comp.wedge_index,
+ ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES);
+ aom_write_bit(w, mbmi->interinter_comp.wedge_sign);
+ } else {
+ assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+ aom_write_literal(w, mbmi->interinter_comp.mask_type,
+ MAX_DIFFWTD_MASK_BITS);
+ }
+ }
+ }
+ write_mb_interp_filter(cm, td, w);
+ }
+}
+
+static AOM_INLINE void write_intrabc_info(
+ MACROBLOCKD *xd, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame,
+ aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ int use_intrabc = is_intrabc_block(mbmi);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, use_intrabc, ec_ctx->intrabc_cdf, 2);
+ if (use_intrabc) {
+ assert(mbmi->mode == DC_PRED);
+ assert(mbmi->uv_mode == UV_DC_PRED);
+ assert(mbmi->motion_mode == SIMPLE_TRANSLATION);
+ int_mv dv_ref = mbmi_ext_frame->ref_mv_stack[0].this_mv;
+ av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc);
+ }
+}
+
+static AOM_INLINE void write_mb_modes_kf(
+ AV1_COMP *cpi, MACROBLOCKD *xd,
+ const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const struct segmentation *const seg = &cm->seg;
+ struct segmentation_probs *const segp = &ec_ctx->seg;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ if (seg->segid_preskip && seg->update_map)
+ write_segment_id(cpi, xd, mbmi, w, seg, segp, 0);
+
+ const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w);
+
+ if (!seg->segid_preskip && seg->update_map)
+ write_segment_id(cpi, xd, mbmi, w, seg, segp, skip);
+
+ write_cdef(cm, xd, w, skip);
+
+ write_delta_q_params(cm, xd, skip, w);
+
+ if (av1_allow_intrabc(cm)) {
+ write_intrabc_info(xd, mbmi_ext_frame, w);
+ if (is_intrabc_block(mbmi)) return;
+ }
+
+ write_intra_prediction_modes(cm, xd, 1, w);
+}
+
+#if CONFIG_RD_DEBUG
+static AOM_INLINE void dump_mode_info(MB_MODE_INFO *mi) {
+ printf("\nmi->mi_row == %d\n", mi->mi_row);
+ printf("&& mi->mi_col == %d\n", mi->mi_col);
+ printf("&& mi->bsize == %d\n", mi->bsize);
+ printf("&& mi->tx_size == %d\n", mi->tx_size);
+ printf("&& mi->mode == %d\n", mi->mode);
+}
+
+static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats,
+ int plane) {
+ if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) {
+ printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n",
+ plane, rd_stats->txb_coeff_cost[plane], token_stats->cost);
+ return 1;
+ }
+ return 0;
+}
+#endif
+
+#if ENC_MISMATCH_DEBUG
+static AOM_INLINE void enc_dump_logs(
+ const AV1_COMMON *const cm,
+ const MBMIExtFrameBufferInfo *const mbmi_ext_info, int mi_row, int mi_col) {
+ const MB_MODE_INFO *const mbmi = *(
+ cm->mi_params.mi_grid_base + (mi_row * cm->mi_params.mi_stride + mi_col));
+ const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame =
+ mbmi_ext_info->frame_base + get_mi_ext_idx(mi_row, mi_col,
+ cm->mi_params.mi_alloc_bsize,
+ mbmi_ext_info->stride);
+ if (is_inter_block(mbmi)) {
+#define FRAME_TO_CHECK 11
+ if (cm->current_frame.frame_number == FRAME_TO_CHECK &&
+ cm->show_frame == 1) {
+ const BLOCK_SIZE bsize = mbmi->bsize;
+
+ int_mv mv[2] = { 0 };
+ const int is_comp_ref = has_second_ref(mbmi);
+
+ for (int ref = 0; ref < 1 + is_comp_ref; ++ref)
+ mv[ref].as_mv = mbmi->mv[ref].as_mv;
+
+ if (!is_comp_ref) {
+ mv[1].as_int = 0;
+ }
+
+ const int16_t mode_ctx =
+ is_comp_ref ? 0
+ : mode_context_analyzer(mbmi_ext_frame->mode_context,
+ mbmi->ref_frame);
+
+ const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+ int16_t zeromv_ctx = -1;
+ int16_t refmv_ctx = -1;
+
+ if (mbmi->mode != NEWMV) {
+ zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+ if (mbmi->mode != GLOBALMV)
+ refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ }
+
+ printf(
+ "=== ENCODER ===: "
+ "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, "
+ "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, "
+ "ref[1]=%d, motion_mode=%d, mode_ctx=%d, "
+ "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n",
+ cm->current_frame.frame_number, mi_row, mi_col, mbmi->skip_mode,
+ mbmi->mode, bsize, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col,
+ mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0],
+ mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx,
+ zeromv_ctx, refmv_ctx, mbmi->tx_size);
+ }
+ }
+}
+#endif // ENC_MISMATCH_DEBUG
+
+static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, ThreadData *const td,
+ aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &td->mb.e_mbd;
+ MB_MODE_INFO *m = xd->mi[0];
+
+ if (frame_is_intra_only(cm)) {
+ write_mb_modes_kf(cpi, xd, td->mb.mbmi_ext_frame, w);
+ } else {
+    // has_subpel_mv_component needs the ref frame buffers set up to look
+    // up if they are scaled. has_subpel_mv_component is in turn needed by
+    // write_mb_interp_filter, which is called by pack_inter_mode_mvs.
+ set_ref_ptrs(cm, xd, m->ref_frame[0], m->ref_frame[1]);
+
+#if ENC_MISMATCH_DEBUG
+ enc_dump_logs(cm, &cpi->mbmi_ext_info, xd->mi_row, xd->mi_col);
+#endif // ENC_MISMATCH_DEBUG
+
+ pack_inter_mode_mvs(cpi, td, w);
+ }
+}
+
+static AOM_INLINE void write_inter_txb_coeff(
+ AV1_COMMON *const cm, MACROBLOCK *const x, MB_MODE_INFO *const mbmi,
+ aom_writer *w, const TokenExtra **tok, const TokenExtra *const tok_end,
+ TOKEN_STATS *token_stats, const int row, const int col, int *block,
+ const int plane) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+ const int step =
+ tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ const int bkw = tx_size_wide_unit[max_tx_size];
+ const int bkh = tx_size_high_unit[max_tx_size];
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, ss_x, ss_y);
+ const int num_4x4_w = mi_size_wide[plane_bsize];
+ const int num_4x4_h = mi_size_high[plane_bsize];
+ const int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+ const int mu_blocks_high = mi_size_high[max_unit_bsize];
+ const int unit_height = AOMMIN(mu_blocks_high + (row >> ss_y), num_4x4_h);
+ const int unit_width = AOMMIN(mu_blocks_wide + (col >> ss_x), num_4x4_w);
+ for (int blk_row = row >> ss_y; blk_row < unit_height; blk_row += bkh) {
+ for (int blk_col = col >> ss_x; blk_col < unit_width; blk_col += bkw) {
+ pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize,
+ cm->seq_params->bit_depth, *block, blk_row, blk_col,
+ max_tx_size, token_stats);
+ *block += step;
+ }
+ }
+}
+
+static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, MACROBLOCK *const x,
+ aom_writer *w, const TokenExtra **tok,
+ const TokenExtra *const tok_end) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->bsize;
+
+ assert(!mbmi->skip_txfm);
+
+ const int is_inter = is_inter_block(mbmi);
+ if (!is_inter) {
+ av1_write_intra_coeffs_mb(cm, x, w, bsize);
+ } else {
+ int block[MAX_MB_PLANE] = { 0 };
+ assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
+ xd->plane[0].subsampling_y));
+ const int num_4x4_w = mi_size_wide[bsize];
+ const int num_4x4_h = mi_size_high[bsize];
+ TOKEN_STATS token_stats;
+ init_token_stats(&token_stats);
+
+ const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+ assert(max_unit_bsize == get_plane_block_size(BLOCK_64X64,
+ xd->plane[0].subsampling_x,
+ xd->plane[0].subsampling_y));
+ int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+ int mu_blocks_high = mi_size_high[max_unit_bsize];
+ mu_blocks_wide = AOMMIN(num_4x4_w, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(num_4x4_h, mu_blocks_high);
+
+ const int num_planes = av1_num_planes(cm);
+ for (int row = 0; row < num_4x4_h; row += mu_blocks_high) {
+ for (int col = 0; col < num_4x4_w; col += mu_blocks_wide) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;
+ write_inter_txb_coeff(cm, x, mbmi, w, tok, tok_end, &token_stats, row,
+ col, &block[plane], plane);
+ }
+ }
+ }
+#if CONFIG_RD_DEBUG
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (mbmi->bsize >= BLOCK_8X8 &&
+ rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) {
+ dump_mode_info(mbmi);
+ assert(0);
+ }
+ }
+#endif // CONFIG_RD_DEBUG
+ }
+}
+
+static AOM_INLINE void write_modes_b(AV1_COMP *cpi, ThreadData *const td,
+ const TileInfo *const tile, aom_writer *w,
+ const TokenExtra **tok,
+ const TokenExtra *const tok_end,
+ int mi_row, int mi_col) {
+ const AV1_COMMON *cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ MACROBLOCKD *xd = &td->mb.e_mbd;
+ FRAME_CONTEXT *tile_ctx = xd->tile_ctx;
+ const int grid_idx = mi_row * mi_params->mi_stride + mi_col;
+ xd->mi = mi_params->mi_grid_base + grid_idx;
+ td->mb.mbmi_ext_frame =
+ cpi->mbmi_ext_info.frame_base +
+ get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize,
+ cpi->mbmi_ext_info.stride);
+ xd->tx_type_map = mi_params->tx_type_map + grid_idx;
+ xd->tx_type_map_stride = mi_params->mi_stride;
+
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ assert(bsize <= cm->seq_params->sb_size ||
+ (bsize >= BLOCK_SIZES && bsize < BLOCK_SIZES_ALL));
+
+ const int bh = mi_size_high[bsize];
+ const int bw = mi_size_wide[bsize];
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
+ mi_params->mi_cols);
+
+ xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ write_mbmi_b(cpi, td, w);
+
+ for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) {
+ const uint8_t palette_size_plane =
+ mbmi->palette_mode_info.palette_size[plane];
+ assert(!mbmi->skip_mode || !palette_size_plane);
+ if (palette_size_plane > 0) {
+ assert(mbmi->use_intrabc == 0);
+ assert(av1_allow_palette(cm->features.allow_screen_content_tools,
+ mbmi->bsize));
+ assert(!plane || xd->is_chroma_ref);
+ int rows, cols;
+ av1_get_block_dimensions(mbmi->bsize, plane, xd, NULL, NULL, &rows,
+ &cols);
+ assert(*tok < tok_end);
+ MapCdf map_pb_cdf = plane ? tile_ctx->palette_uv_color_index_cdf
+ : tile_ctx->palette_y_color_index_cdf;
+ pack_map_tokens(w, tok, palette_size_plane, rows * cols, map_pb_cdf);
+ }
+ }
+
+ const int is_inter_tx = is_inter_block(mbmi);
+ const int skip_txfm = mbmi->skip_txfm;
+ const uint8_t segment_id = mbmi->segment_id;
+ if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
+ !(is_inter_tx && skip_txfm) && !xd->lossless[segment_id]) {
+ if (is_inter_tx) { // This implies skip flag is 0.
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
+ const int txbh = tx_size_high_unit[max_tx_size];
+ const int txbw = tx_size_wide_unit[max_tx_size];
+ const int width = mi_size_wide[bsize];
+ const int height = mi_size_high[bsize];
+ for (int idy = 0; idy < height; idy += txbh) {
+ for (int idx = 0; idx < width; idx += txbw) {
+ write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w);
+ }
+ }
+ } else {
+ write_selected_tx_size(xd, w);
+ set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, 0, xd);
+ }
+ } else {
+ set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height,
+ skip_txfm && is_inter_tx, xd);
+ }
+
+ if (!mbmi->skip_txfm) {
+ int start = aom_tell_size(w);
+
+ write_tokens_b(cpi, &td->mb, w, tok, tok_end);
+
+ const int end = aom_tell_size(w);
+ td->coefficient_size += end - start;
+ }
+}
+
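+// Signals the partition type of a block. Blocks that extend past the right
+// or bottom frame edge can only use a restricted set of partitions, so a
+// binary CDF gathered from the full partition CDF is written instead; blocks
+// past both edges are implicitly PARTITION_SPLIT and nothing is written.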
+static AOM_INLINE void write_partition(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd, int hbs,
+ int mi_row, int mi_col, PARTITION_TYPE p,
+ BLOCK_SIZE bsize, aom_writer *w) {
+ const int is_partition_point = bsize >= BLOCK_8X8;
+
+ if (!is_partition_point) return;
+
+ const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols;
+ const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ if (!has_rows && !has_cols) {
+ assert(p == PARTITION_SPLIT);
+ return;
+ }
+
+ if (has_rows && has_cols) {
+ aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx],
+ partition_cdf_length(bsize));
+ } else if (!has_rows && has_cols) {
+ assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
+ assert(bsize > BLOCK_8X8);
+ aom_cdf_prob cdf[2];
+ partition_gather_vert_alike(cdf, ec_ctx->partition_cdf[ctx], bsize);
+ aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2);
+ } else {
+ assert(has_rows && !has_cols);
+ assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
+ assert(bsize > BLOCK_8X8);
+ aom_cdf_prob cdf[2];
+ partition_gather_horz_alike(cdf, ec_ctx->partition_cdf[ctx], bsize);
+ aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2);
+ }
+}
+
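+// Recursively writes one superblock (or sub-block): first the loop
+// restoration coefficients for any restoration units anchored in this block,
+// then the partition type, then the contained blocks according to that
+// partition.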
+static AOM_INLINE void write_modes_sb(
+ AV1_COMP *const cpi, ThreadData *const td, const TileInfo *const tile,
+ aom_writer *const w, const TokenExtra **tok,
+ const TokenExtra *const tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ MACROBLOCKD *const xd = &td->mb.e_mbd;
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int hbs = mi_size_wide[bsize] / 2;
+ const int quarter_step = mi_size_wide[bsize] / 4;
+ int i;
+ const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+ if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+#if !CONFIG_REALTIME_ONLY
+ const int num_planes = av1_num_planes(cm);
+ for (int plane = 0; plane < num_planes; ++plane) {
+ int rcol0, rcol1, rrow0, rrow1;
+
+ // Skip some unnecessary work if loop restoration is disabled
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+
+ if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
+ &rcol0, &rcol1, &rrow0, &rrow1)) {
+ const int rstride = cm->rst_info[plane].horz_units;
+ for (int rrow = rrow0; rrow < rrow1; ++rrow) {
+ for (int rcol = rcol0; rcol < rcol1; ++rcol) {
+ const int runit_idx = rcol + rrow * rstride;
+ loop_restoration_write_sb_coeffs(cm, xd, runit_idx, w, plane,
+ td->counts);
+ }
+ }
+ }
+ }
+#endif
+
+ write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
+ switch (partition) {
+ case PARTITION_NONE:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ break;
+ case PARTITION_HORZ:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ if (mi_row + hbs < mi_params->mi_rows)
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ break;
+ case PARTITION_VERT:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ if (mi_col + hbs < mi_params->mi_cols)
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ break;
+ case PARTITION_SPLIT:
+ write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col, subsize);
+ write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs,
+ subsize);
+ write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col,
+ subsize);
+ write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs,
+ subsize);
+ break;
+ case PARTITION_HORZ_A:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ break;
+ case PARTITION_HORZ_B:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_VERT_A:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ break;
+ case PARTITION_VERT_B:
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_HORZ_4:
+ for (i = 0; i < 4; ++i) {
+ int this_mi_row = mi_row + i * quarter_step;
+ if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
+
+ write_modes_b(cpi, td, tile, w, tok, tok_end, this_mi_row, mi_col);
+ }
+ break;
+ case PARTITION_VERT_4:
+ for (i = 0; i < 4; ++i) {
+ int this_mi_col = mi_col + i * quarter_step;
+ if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
+
+ write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, this_mi_col);
+ }
+ break;
+ default: assert(0);
+ }
+
+ // update partition context
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+}
+
+// Populate token pointers appropriately based on token_info.
+static AOM_INLINE void get_token_pointers(const TokenInfo *token_info,
+ const int tile_row, int tile_col,
+ const int sb_row_in_tile,
+ const TokenExtra **tok,
+ const TokenExtra **tok_end) {
+ if (!is_token_info_allocated(token_info)) {
+ *tok = NULL;
+ *tok_end = NULL;
+ return;
+ }
+ *tok = token_info->tplist[tile_row][tile_col][sb_row_in_tile].start;
+ *tok_end =
+ *tok + token_info->tplist[tile_row][tile_col][sb_row_in_tile].count;
+}
+
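+// Writes all mode and coefficient data for a single tile, iterating over
+// superblock rows and resetting the entropy contexts at tile boundaries.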
+static AOM_INLINE void write_modes(AV1_COMP *const cpi, ThreadData *const td,
+ const TileInfo *const tile,
+ aom_writer *const w, int tile_row,
+ int tile_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &td->mb.e_mbd;
+ const int mi_row_start = tile->mi_row_start;
+ const int mi_row_end = tile->mi_row_end;
+ const int mi_col_start = tile->mi_col_start;
+ const int mi_col_end = tile->mi_col_end;
+ const int num_planes = av1_num_planes(cm);
+
+ av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row);
+ av1_init_above_context(&cm->above_contexts, num_planes, tile->tile_row, xd);
+
+ if (cpi->common.delta_q_info.delta_q_present_flag) {
+ xd->current_base_qindex = cpi->common.quant_params.base_qindex;
+ if (cpi->common.delta_q_info.delta_lf_present_flag) {
+ av1_reset_loop_filter_delta(xd, num_planes);
+ }
+ }
+
+ for (int mi_row = mi_row_start; mi_row < mi_row_end;
+ mi_row += cm->seq_params->mib_size) {
+ const int sb_row_in_tile =
+ (mi_row - tile->mi_row_start) >> cm->seq_params->mib_size_log2;
+ const TokenInfo *token_info = &cpi->token_info;
+ const TokenExtra *tok;
+ const TokenExtra *tok_end;
+ get_token_pointers(token_info, tile_row, tile_col, sb_row_in_tile, &tok,
+ &tok_end);
+
+ av1_zero_left_context(xd);
+
+ for (int mi_col = mi_col_start; mi_col < mi_col_end;
+ mi_col += cm->seq_params->mib_size) {
+ td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
+ write_modes_sb(cpi, td, tile, w, &tok, tok_end, mi_row, mi_col,
+ cm->seq_params->sb_size);
+ }
+ assert(tok == tok_end);
+ }
+}
+
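+// Writes the frame-level loop restoration header: a restoration type for
+// each plane and, when any plane uses restoration, the luma and chroma
+// restoration unit sizes.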
+static AOM_INLINE void encode_restoration_mode(
+ AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
+ assert(!cm->features.all_lossless);
+ if (!cm->seq_params->enable_restoration) return;
+ if (cm->features.allow_intrabc) return;
+ const int num_planes = av1_num_planes(cm);
+ int all_none = 1, chroma_none = 1;
+ for (int p = 0; p < num_planes; ++p) {
+ RestorationInfo *rsi = &cm->rst_info[p];
+ if (rsi->frame_restoration_type != RESTORE_NONE) {
+ all_none = 0;
+ chroma_none &= p == 0;
+ }
+ switch (rsi->frame_restoration_type) {
+ case RESTORE_NONE:
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, 0);
+ break;
+ case RESTORE_WIENER:
+ aom_wb_write_bit(wb, 1);
+ aom_wb_write_bit(wb, 0);
+ break;
+ case RESTORE_SGRPROJ:
+ aom_wb_write_bit(wb, 1);
+ aom_wb_write_bit(wb, 1);
+ break;
+ case RESTORE_SWITCHABLE:
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, 1);
+ break;
+ default: assert(0);
+ }
+ }
+ if (!all_none) {
+ assert(cm->seq_params->sb_size == BLOCK_64X64 ||
+ cm->seq_params->sb_size == BLOCK_128X128);
+ const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 128 : 64;
+
+ RestorationInfo *rsi = &cm->rst_info[0];
+
+ assert(rsi->restoration_unit_size >= sb_size);
+ assert(RESTORATION_UNITSIZE_MAX == 256);
+
+ if (sb_size == 64) {
+ aom_wb_write_bit(wb, rsi->restoration_unit_size > 64);
+ }
+ if (rsi->restoration_unit_size > 64) {
+ aom_wb_write_bit(wb, rsi->restoration_unit_size > 128);
+ }
+ }
+
+ if (num_planes > 1) {
+ int s =
+ AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y);
+ if (s && !chroma_none) {
+ aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size !=
+ cm->rst_info[0].restoration_unit_size);
+ assert(cm->rst_info[1].restoration_unit_size ==
+ cm->rst_info[0].restoration_unit_size ||
+ cm->rst_info[1].restoration_unit_size ==
+ (cm->rst_info[0].restoration_unit_size >> s));
+ assert(cm->rst_info[2].restoration_unit_size ==
+ cm->rst_info[1].restoration_unit_size);
+ } else if (!s) {
+ assert(cm->rst_info[1].restoration_unit_size ==
+ cm->rst_info[0].restoration_unit_size);
+ assert(cm->rst_info[2].restoration_unit_size ==
+ cm->rst_info[1].restoration_unit_size);
+ }
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
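+// Writes one Wiener filter. Each tap is coded differentially against the
+// previously signaled filter using a subexponential code; for the reduced
+// chroma window the outermost taps are implicitly zero and are not coded.
+// The reference filter is then updated to the one just written.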
+static AOM_INLINE void write_wiener_filter(int wiener_win,
+ const WienerInfo *wiener_info,
+ WienerInfo *ref_wiener_info,
+ aom_writer *wb) {
+ if (wiener_win == WIENER_WIN)
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+ wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+ else
+ assert(wiener_info->vfilter[0] == 0 &&
+ wiener_info->vfilter[WIENER_WIN - 1] == 0);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV,
+ wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV,
+ wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV);
+ if (wiener_win == WIENER_WIN)
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+ wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+ else
+ assert(wiener_info->hfilter[0] == 0 &&
+ wiener_info->hfilter[WIENER_WIN - 1] == 0);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV,
+ wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV,
+ wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV);
+ memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info));
+}
+
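+// Writes one self-guided projection filter: the parameter set index 'ep'
+// followed by the xqd projection coefficients, coded against the previously
+// signaled values. A coefficient whose corresponding radius is 0 is implied
+// and not coded.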
+static AOM_INLINE void write_sgrproj_filter(const SgrprojInfo *sgrproj_info,
+ SgrprojInfo *ref_sgrproj_info,
+ aom_writer *wb) {
+ aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS);
+ const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep];
+
+ if (params->r[0] == 0) {
+ assert(sgrproj_info->xqd[0] == 0);
+ aom_write_primitive_refsubexpfin(
+ wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+ sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+ } else if (params->r[1] == 0) {
+ aom_write_primitive_refsubexpfin(
+ wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+ sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+ } else {
+ aom_write_primitive_refsubexpfin(
+ wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+ sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+ aom_write_primitive_refsubexpfin(
+ wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+ sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+ }
+
+ memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info));
+}
+
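+// Writes the signaling for a single loop restoration unit: a switchable-type
+// symbol or an on/off flag depending on the frame restoration type, followed
+// by the Wiener or self-guided filter parameters when the unit is not
+// RESTORE_NONE.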
+static AOM_INLINE void loop_restoration_write_sb_coeffs(
+ const AV1_COMMON *const cm, MACROBLOCKD *xd, int runit_idx,
+ aom_writer *const w, int plane, FRAME_COUNTS *counts) {
+ const RestorationUnitInfo *rui = &cm->rst_info[plane].unit_info[runit_idx];
+ const RestorationInfo *rsi = cm->rst_info + plane;
+ RestorationType frame_rtype = rsi->frame_restoration_type;
+ assert(frame_rtype != RESTORE_NONE);
+
+ (void)counts;
+ assert(!cm->features.all_lossless);
+
+ const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
+ WienerInfo *ref_wiener_info = &xd->wiener_info[plane];
+ SgrprojInfo *ref_sgrproj_info = &xd->sgrproj_info[plane];
+ RestorationType unit_rtype = rui->restoration_type;
+
+ if (frame_rtype == RESTORE_SWITCHABLE) {
+ aom_write_symbol(w, unit_rtype, xd->tile_ctx->switchable_restore_cdf,
+ RESTORE_SWITCHABLE_TYPES);
+#if CONFIG_ENTROPY_STATS
+ ++counts->switchable_restore[unit_rtype];
+#endif
+ switch (unit_rtype) {
+ case RESTORE_WIENER:
+#if DEBUG_LR_COSTING
+ assert(!memcmp(
+ ref_wiener_info,
+ &lr_ref_params[RESTORE_SWITCHABLE][plane][runit_idx].wiener_info,
+ sizeof(*ref_wiener_info)));
+#endif
+ write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w);
+ break;
+ case RESTORE_SGRPROJ:
+#if DEBUG_LR_COSTING
+ assert(!memcmp(&ref_sgrproj_info->xqd,
+ &lr_ref_params[RESTORE_SWITCHABLE][plane][runit_idx]
+ .sgrproj_info.xqd,
+ sizeof(ref_sgrproj_info->xqd)));
+#endif
+ write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w);
+ break;
+ default: assert(unit_rtype == RESTORE_NONE); break;
+ }
+ } else if (frame_rtype == RESTORE_WIENER) {
+ aom_write_symbol(w, unit_rtype != RESTORE_NONE,
+ xd->tile_ctx->wiener_restore_cdf, 2);
+#if CONFIG_ENTROPY_STATS
+ ++counts->wiener_restore[unit_rtype != RESTORE_NONE];
+#endif
+ if (unit_rtype != RESTORE_NONE) {
+#if DEBUG_LR_COSTING
+ assert(
+ !memcmp(ref_wiener_info,
+ &lr_ref_params[RESTORE_WIENER][plane][runit_idx].wiener_info,
+ sizeof(*ref_wiener_info)));
+#endif
+ write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w);
+ }
+ } else if (frame_rtype == RESTORE_SGRPROJ) {
+ aom_write_symbol(w, unit_rtype != RESTORE_NONE,
+ xd->tile_ctx->sgrproj_restore_cdf, 2);
+#if CONFIG_ENTROPY_STATS
+ ++counts->sgrproj_restore[unit_rtype != RESTORE_NONE];
+#endif
+ if (unit_rtype != RESTORE_NONE) {
+#if DEBUG_LR_COSTING
+ assert(!memcmp(
+ &ref_sgrproj_info->xqd,
+ &lr_ref_params[RESTORE_SGRPROJ][plane][runit_idx].sgrproj_info.xqd,
+ sizeof(ref_sgrproj_info->xqd)));
+#endif
+ write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w);
+ }
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Only write out the ref delta section if any of the elements
+// will signal a delta.
+static bool is_mode_ref_delta_meaningful(AV1_COMMON *cm) {
+ struct loopfilter *lf = &cm->lf;
+ if (!lf->mode_ref_delta_update) {
+ return 0;
+ }
+ const RefCntBuffer *buf = get_primary_ref_frame_buf(cm);
+ int8_t last_ref_deltas[REF_FRAMES];
+ int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
+ if (buf == NULL) {
+ av1_set_default_ref_deltas(last_ref_deltas);
+ av1_set_default_mode_deltas(last_mode_deltas);
+ } else {
+ memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES);
+ memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS);
+ }
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if (lf->ref_deltas[i] != last_ref_deltas[i]) {
+ return true;
+ }
+ }
+ for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+ if (lf->mode_deltas[i] != last_mode_deltas[i]) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm,
+ struct aom_write_bit_buffer *wb) {
+ assert(!cm->features.coded_lossless);
+ if (cm->features.allow_intrabc) return;
+ const int num_planes = av1_num_planes(cm);
+ struct loopfilter *lf = &cm->lf;
+
+ // Encode the loop filter level and type
+ aom_wb_write_literal(wb, lf->filter_level[0], 6);
+ aom_wb_write_literal(wb, lf->filter_level[1], 6);
+ if (num_planes > 1) {
+ if (lf->filter_level[0] || lf->filter_level[1]) {
+ aom_wb_write_literal(wb, lf->filter_level_u, 6);
+ aom_wb_write_literal(wb, lf->filter_level_v, 6);
+ }
+ }
+ aom_wb_write_literal(wb, lf->sharpness_level, 3);
+
+ aom_wb_write_bit(wb, lf->mode_ref_delta_enabled);
+
+ // Write out loop filter deltas applied at the MB level based on mode or
+ // ref frame (if they are enabled), only if there is information to write.
+ int meaningful = is_mode_ref_delta_meaningful(cm);
+ aom_wb_write_bit(wb, meaningful);
+ if (!meaningful) {
+ return;
+ }
+
+ const RefCntBuffer *buf = get_primary_ref_frame_buf(cm);
+ int8_t last_ref_deltas[REF_FRAMES];
+ int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
+ if (buf == NULL) {
+ av1_set_default_ref_deltas(last_ref_deltas);
+ av1_set_default_mode_deltas(last_mode_deltas);
+ } else {
+ memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES);
+ memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS);
+ }
+ for (int i = 0; i < REF_FRAMES; i++) {
+ const int delta = lf->ref_deltas[i];
+ const int changed = delta != last_ref_deltas[i];
+ aom_wb_write_bit(wb, changed);
+ if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
+ }
+ for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+ const int delta = lf->mode_deltas[i];
+ const int changed = delta != last_mode_deltas[i];
+ aom_wb_write_bit(wb, changed);
+ if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
+ }
+}
+
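+// Writes the CDEF header: the damping value, the number of bits used for
+// per-superblock strength indices, and the table of luma (and, if present,
+// chroma) strengths.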
+static AOM_INLINE void encode_cdef(const AV1_COMMON *cm,
+ struct aom_write_bit_buffer *wb) {
+ assert(!cm->features.coded_lossless);
+ if (!cm->seq_params->enable_cdef) return;
+ if (cm->features.allow_intrabc) return;
+ const int num_planes = av1_num_planes(cm);
+ int i;
+ aom_wb_write_literal(wb, cm->cdef_info.cdef_damping - 3, 2);
+ aom_wb_write_literal(wb, cm->cdef_info.cdef_bits, 2);
+ for (i = 0; i < cm->cdef_info.nb_cdef_strengths; i++) {
+ aom_wb_write_literal(wb, cm->cdef_info.cdef_strengths[i],
+ CDEF_STRENGTH_BITS);
+ if (num_planes > 1)
+ aom_wb_write_literal(wb, cm->cdef_info.cdef_uv_strengths[i],
+ CDEF_STRENGTH_BITS);
+ }
+}
+
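+// Writes a quantizer delta as a presence bit followed, when nonzero, by a
+// 6-bit inverse signed literal.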
+static AOM_INLINE void write_delta_q(struct aom_write_bit_buffer *wb,
+ int delta_q) {
+ if (delta_q != 0) {
+ aom_wb_write_bit(wb, 1);
+ aom_wb_write_inv_signed_literal(wb, delta_q, 6);
+ } else {
+ aom_wb_write_bit(wb, 0);
+ }
+}
+
+static AOM_INLINE void encode_quantization(
+ const CommonQuantParams *const quant_params, int num_planes,
+ bool separate_uv_delta_q, struct aom_write_bit_buffer *wb) {
+ aom_wb_write_literal(wb, quant_params->base_qindex, QINDEX_BITS);
+ write_delta_q(wb, quant_params->y_dc_delta_q);
+ if (num_planes > 1) {
+ int diff_uv_delta =
+ (quant_params->u_dc_delta_q != quant_params->v_dc_delta_q) ||
+ (quant_params->u_ac_delta_q != quant_params->v_ac_delta_q);
+ if (separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta);
+ write_delta_q(wb, quant_params->u_dc_delta_q);
+ write_delta_q(wb, quant_params->u_ac_delta_q);
+ if (diff_uv_delta) {
+ write_delta_q(wb, quant_params->v_dc_delta_q);
+ write_delta_q(wb, quant_params->v_ac_delta_q);
+ }
+ }
+ aom_wb_write_bit(wb, quant_params->using_qmatrix);
+ if (quant_params->using_qmatrix) {
+ aom_wb_write_literal(wb, quant_params->qmatrix_level_y, QM_LEVEL_BITS);
+ aom_wb_write_literal(wb, quant_params->qmatrix_level_u, QM_LEVEL_BITS);
+ if (!separate_uv_delta_q)
+ assert(quant_params->qmatrix_level_u == quant_params->qmatrix_level_v);
+ else
+ aom_wb_write_literal(wb, quant_params->qmatrix_level_v, QM_LEVEL_BITS);
+ }
+}
+
+static AOM_INLINE void encode_segmentation(AV1_COMMON *cm,
+ struct aom_write_bit_buffer *wb) {
+ int i, j;
+ struct segmentation *seg = &cm->seg;
+
+ aom_wb_write_bit(wb, seg->enabled);
+ if (!seg->enabled) return;
+
+ // Write update flags
+ if (cm->features.primary_ref_frame != PRIMARY_REF_NONE) {
+ aom_wb_write_bit(wb, seg->update_map);
+ if (seg->update_map) aom_wb_write_bit(wb, seg->temporal_update);
+ aom_wb_write_bit(wb, seg->update_data);
+ }
+
+ // Segmentation data
+ if (seg->update_data) {
+ for (i = 0; i < MAX_SEGMENTS; i++) {
+ for (j = 0; j < SEG_LVL_MAX; j++) {
+ const int active = segfeature_active(seg, i, j);
+ aom_wb_write_bit(wb, active);
+ if (active) {
+ const int data_max = av1_seg_feature_data_max(j);
+ const int data_min = -data_max;
+ const int ubits = get_unsigned_bits(data_max);
+ const int data = clamp(get_segdata(seg, i, j), data_min, data_max);
+
+ if (av1_is_segfeature_signed(j)) {
+ aom_wb_write_inv_signed_literal(wb, data, ubits);
+ } else {
+ aom_wb_write_literal(wb, data, ubits);
+ }
+ }
+ }
+ }
+ }
+}
+
+static AOM_INLINE void write_frame_interp_filter(
+ InterpFilter filter, struct aom_write_bit_buffer *wb) {
+ aom_wb_write_bit(wb, filter == SWITCHABLE);
+ if (filter != SWITCHABLE)
+ aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS);
+}
+
+// Same function as write_uniform, but writing to the uncompressed header wb.
+static AOM_INLINE void wb_write_uniform(struct aom_write_bit_buffer *wb, int n,
+ int v) {
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;
+ if (l == 0) return;
+ if (v < m) {
+ aom_wb_write_literal(wb, v, l - 1);
+ } else {
+ aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1);
+ aom_wb_write_literal(wb, (v - m) & 1, 1);
+ }
+}
+
+static AOM_INLINE void write_tile_info_max_tile(
+ const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
+ int width_sb =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2);
+ int height_sb =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2);
+ int size_sb, i;
+ const CommonTileParams *const tiles = &cm->tiles;
+
+ aom_wb_write_bit(wb, tiles->uniform_spacing);
+
+ if (tiles->uniform_spacing) {
+ int ones = tiles->log2_cols - tiles->min_log2_cols;
+ while (ones--) {
+ aom_wb_write_bit(wb, 1);
+ }
+ if (tiles->log2_cols < tiles->max_log2_cols) {
+ aom_wb_write_bit(wb, 0);
+ }
+
+ // rows
+ ones = tiles->log2_rows - tiles->min_log2_rows;
+ while (ones--) {
+ aom_wb_write_bit(wb, 1);
+ }
+ if (tiles->log2_rows < tiles->max_log2_rows) {
+ aom_wb_write_bit(wb, 0);
+ }
+ } else {
+ // Explicit tiles with configurable tile widths and heights
+ // columns
+ for (i = 0; i < tiles->cols; i++) {
+ size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
+ wb_write_uniform(wb, AOMMIN(width_sb, tiles->max_width_sb), size_sb - 1);
+ width_sb -= size_sb;
+ }
+ assert(width_sb == 0);
+
+ // rows
+ for (i = 0; i < tiles->rows; i++) {
+ size_sb = tiles->row_start_sb[i + 1] - tiles->row_start_sb[i];
+ wb_write_uniform(wb, AOMMIN(height_sb, tiles->max_height_sb),
+ size_sb - 1);
+ height_sb -= size_sb;
+ }
+ assert(height_sb == 0);
+ }
+}
+
+static AOM_INLINE void write_tile_info(const AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *saved_wb,
+ struct aom_write_bit_buffer *wb) {
+ write_tile_info_max_tile(cm, wb);
+
+ *saved_wb = *wb;
+ if (cm->tiles.rows * cm->tiles.cols > 1) {
+ // tile id used for cdf update
+ aom_wb_write_literal(wb, 0, cm->tiles.log2_cols + cm->tiles.log2_rows);
+ // Number of bytes in tile size - 1
+ aom_wb_write_literal(wb, 3, 2);
+ }
+}
+
+static AOM_INLINE void write_ext_tile_info(
+ const AV1_COMMON *const cm, struct aom_write_bit_buffer *saved_wb,
+ struct aom_write_bit_buffer *wb) {
+ // This information is stored as a separate byte.
+ int mod = wb->bit_offset % CHAR_BIT;
+ if (mod > 0) aom_wb_write_literal(wb, 0, CHAR_BIT - mod);
+ assert(aom_wb_is_byte_aligned(wb));
+
+ *saved_wb = *wb;
+ if (cm->tiles.rows * cm->tiles.cols > 1) {
+ // Note that the last item in the uncompressed header is the data
+ // describing tile configuration.
+ // Number of bytes in tile column size - 1
+ aom_wb_write_literal(wb, 0, 2);
+ // Number of bytes in tile size - 1
+ aom_wb_write_literal(wb, 0, 2);
+ }
+}
+
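+// For large-scale tile coding, searches previously written tiles for one
+// whose coded data is byte-identical to the current tile. Returns the row
+// offset of the match (so the tile can be signaled in copy mode), or 0 if no
+// identical tile is found.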
+static INLINE int find_identical_tile(
+ const int tile_row, const int tile_col,
+ TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) {
+ const MV32 candidate_offset[1] = { { 1, 0 } };
+ const uint8_t *const cur_tile_data =
+ tile_buffers[tile_row][tile_col].data + 4;
+ const size_t cur_tile_size = tile_buffers[tile_row][tile_col].size;
+
+ int i;
+
+ if (tile_row == 0) return 0;
+
+  // TODO(yunqingwang): For now, only the above tile is checked and used.
+  // More candidates, such as the left tile, can be added later.
+ for (i = 0; i < 1; i++) {
+ int row_offset = candidate_offset[0].row;
+ int col_offset = candidate_offset[0].col;
+ int row = tile_row - row_offset;
+ int col = tile_col - col_offset;
+ const uint8_t *tile_data;
+ TileBufferEnc *candidate;
+
+ if (row < 0 || col < 0) continue;
+
+ const uint32_t tile_hdr = mem_get_le32(tile_buffers[row][col].data);
+
+ // Read out tile-copy-mode bit:
+ if ((tile_hdr >> 31) == 1) {
+ // The candidate is a copy tile itself: the offset is stored in bits
+ // 30 through 24 inclusive.
+ row_offset += (tile_hdr >> 24) & 0x7f;
+ row = tile_row - row_offset;
+ }
+
+ candidate = &tile_buffers[row][col];
+
+ if (row_offset >= 128 || candidate->size != cur_tile_size) continue;
+
+ tile_data = candidate->data + 4;
+
+ if (memcmp(tile_data, cur_tile_data, cur_tile_size) != 0) continue;
+
+ // Identical tile found
+ assert(row_offset > 0);
+ return row_offset;
+ }
+
+ // No identical tile found
+ return 0;
+}
+
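+// Signals whether the render size differs from the upscaled frame size and,
+// if so, writes the render width and height.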
+static AOM_INLINE void write_render_size(const AV1_COMMON *cm,
+ struct aom_write_bit_buffer *wb) {
+ const int scaling_active = av1_resize_scaled(cm);
+ aom_wb_write_bit(wb, scaling_active);
+ if (scaling_active) {
+ aom_wb_write_literal(wb, cm->render_width - 1, 16);
+ aom_wb_write_literal(wb, cm->render_height - 1, 16);
+ }
+}
+
+static AOM_INLINE void write_superres_scale(const AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *wb) {
+ const SequenceHeader *const seq_params = cm->seq_params;
+ if (!seq_params->enable_superres) {
+ assert(cm->superres_scale_denominator == SCALE_NUMERATOR);
+ return;
+ }
+
+  // First bit is whether to scale or not.
+ if (cm->superres_scale_denominator == SCALE_NUMERATOR) {
+ aom_wb_write_bit(wb, 0); // no scaling
+ } else {
+ aom_wb_write_bit(wb, 1); // scaling, write scale factor
+ assert(cm->superres_scale_denominator >= SUPERRES_SCALE_DENOMINATOR_MIN);
+ assert(cm->superres_scale_denominator <
+ SUPERRES_SCALE_DENOMINATOR_MIN + (1 << SUPERRES_SCALE_BITS));
+ aom_wb_write_literal(
+ wb, cm->superres_scale_denominator - SUPERRES_SCALE_DENOMINATOR_MIN,
+ SUPERRES_SCALE_BITS);
+ }
+}
+
+static AOM_INLINE void write_frame_size(const AV1_COMMON *cm,
+ int frame_size_override,
+ struct aom_write_bit_buffer *wb) {
+ const int coded_width = cm->superres_upscaled_width - 1;
+ const int coded_height = cm->superres_upscaled_height - 1;
+
+ if (frame_size_override) {
+ const SequenceHeader *seq_params = cm->seq_params;
+ int num_bits_width = seq_params->num_bits_width;
+ int num_bits_height = seq_params->num_bits_height;
+ aom_wb_write_literal(wb, coded_width, num_bits_width);
+ aom_wb_write_literal(wb, coded_height, num_bits_height);
+ }
+
+ write_superres_scale(cm, wb);
+ write_render_size(cm, wb);
+}
+
+static AOM_INLINE void write_frame_size_with_refs(
+ const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
+ int found = 0;
+
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame);
+
+ if (cfg != NULL) {
+ found = cm->superres_upscaled_width == cfg->y_crop_width &&
+ cm->superres_upscaled_height == cfg->y_crop_height;
+ found &= cm->render_width == cfg->render_width &&
+ cm->render_height == cfg->render_height;
+ }
+ aom_wb_write_bit(wb, found);
+ if (found) {
+ write_superres_scale(cm, wb);
+ break;
+ }
+ }
+
+ if (!found) {
+ int frame_size_override = 1; // Always equal to 1 in this function
+ write_frame_size(cm, frame_size_override, wb);
+ }
+}
+
+static AOM_INLINE void write_profile(BITSTREAM_PROFILE profile,
+ struct aom_write_bit_buffer *wb) {
+ assert(profile >= PROFILE_0 && profile < MAX_PROFILES);
+ aom_wb_write_literal(wb, profile, PROFILE_BITS);
+}
+
+static AOM_INLINE void write_bitdepth(const SequenceHeader *const seq_params,
+ struct aom_write_bit_buffer *wb) {
+  // Profile 0/1: [0] for 8-bit, [1] for 10-bit
+  // Profile 2:   [0] for 8-bit, [10] for 10-bit, [11] for 12-bit
+ aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_8 ? 0 : 1);
+ if (seq_params->profile == PROFILE_2 && seq_params->bit_depth != AOM_BITS_8) {
+ aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_10 ? 0 : 1);
+ }
+}
+
+static AOM_INLINE void write_color_config(
+ const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) {
+ write_bitdepth(seq_params, wb);
+ const int is_monochrome = seq_params->monochrome;
+ // monochrome bit
+ if (seq_params->profile != PROFILE_1)
+ aom_wb_write_bit(wb, is_monochrome);
+ else
+ assert(!is_monochrome);
+ if (seq_params->color_primaries == AOM_CICP_CP_UNSPECIFIED &&
+ seq_params->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED &&
+ seq_params->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) {
+ aom_wb_write_bit(wb, 0); // No color description present
+ } else {
+ aom_wb_write_bit(wb, 1); // Color description present
+ aom_wb_write_literal(wb, seq_params->color_primaries, 8);
+ aom_wb_write_literal(wb, seq_params->transfer_characteristics, 8);
+ aom_wb_write_literal(wb, seq_params->matrix_coefficients, 8);
+ }
+ if (is_monochrome) {
+ // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+ aom_wb_write_bit(wb, seq_params->color_range);
+ return;
+ }
+ if (seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
+ seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
+ seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+ assert(seq_params->profile == PROFILE_1 ||
+ (seq_params->profile == PROFILE_2 &&
+ seq_params->bit_depth == AOM_BITS_12));
+ } else {
+ // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+ aom_wb_write_bit(wb, seq_params->color_range);
+ if (seq_params->profile == PROFILE_0) {
+ // 420 only
+ assert(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1);
+ } else if (seq_params->profile == PROFILE_1) {
+ // 444 only
+ assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+ } else if (seq_params->profile == PROFILE_2) {
+ if (seq_params->bit_depth == AOM_BITS_12) {
+ // 420, 444 or 422
+ aom_wb_write_bit(wb, seq_params->subsampling_x);
+ if (seq_params->subsampling_x == 0) {
+ assert(seq_params->subsampling_y == 0 &&
+ "4:4:0 subsampling not allowed in AV1");
+ } else {
+ aom_wb_write_bit(wb, seq_params->subsampling_y);
+ }
+ } else {
+ // 422 only
+ assert(seq_params->subsampling_x == 1 &&
+ seq_params->subsampling_y == 0);
+ }
+ }
+ if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+ }
+ if (seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) {
+ aom_wb_write_literal(wb, seq_params->chroma_sample_position, 2);
+ }
+ }
+ aom_wb_write_bit(wb, seq_params->separate_uv_delta_q);
+}
+
+static AOM_INLINE void write_timing_info_header(
+ const aom_timing_info_t *const timing_info,
+ struct aom_write_bit_buffer *wb) {
+ aom_wb_write_unsigned_literal(wb, timing_info->num_units_in_display_tick, 32);
+ aom_wb_write_unsigned_literal(wb, timing_info->time_scale, 32);
+ aom_wb_write_bit(wb, timing_info->equal_picture_interval);
+ if (timing_info->equal_picture_interval) {
+ aom_wb_write_uvlc(wb, timing_info->num_ticks_per_picture - 1);
+ }
+}
+
+static AOM_INLINE void write_decoder_model_info(
+ const aom_dec_model_info_t *const decoder_model_info,
+ struct aom_write_bit_buffer *wb) {
+ aom_wb_write_literal(
+ wb, decoder_model_info->encoder_decoder_buffer_delay_length - 1, 5);
+ aom_wb_write_unsigned_literal(
+ wb, decoder_model_info->num_units_in_decoding_tick, 32);
+ aom_wb_write_literal(wb, decoder_model_info->buffer_removal_time_length - 1,
+ 5);
+ aom_wb_write_literal(
+ wb, decoder_model_info->frame_presentation_time_length - 1, 5);
+}
+
+static AOM_INLINE void write_dec_model_op_parameters(
+ const aom_dec_model_op_parameters_t *op_params, int buffer_delay_length,
+ struct aom_write_bit_buffer *wb) {
+ aom_wb_write_unsigned_literal(wb, op_params->decoder_buffer_delay,
+ buffer_delay_length);
+ aom_wb_write_unsigned_literal(wb, op_params->encoder_buffer_delay,
+ buffer_delay_length);
+ aom_wb_write_bit(wb, op_params->low_delay_mode_flag);
+}
+
+static AOM_INLINE void write_tu_pts_info(AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *wb) {
+ aom_wb_write_unsigned_literal(
+ wb, cm->frame_presentation_time,
+ cm->seq_params->decoder_model_info.frame_presentation_time_length);
+}
+
+static AOM_INLINE void write_film_grain_params(
+ const AV1_COMP *const cpi, struct aom_write_bit_buffer *wb) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const aom_film_grain_t *const pars = &cm->cur_frame->film_grain_params;
+ aom_wb_write_bit(wb, pars->apply_grain);
+ if (!pars->apply_grain) return;
+
+ aom_wb_write_literal(wb, pars->random_seed, 16);
+
+ if (cm->current_frame.frame_type == INTER_FRAME)
+ aom_wb_write_bit(wb, pars->update_parameters);
+
+ if (!pars->update_parameters) {
+ int ref_frame, ref_idx;
+ for (ref_frame = LAST_FRAME; ref_frame < REF_FRAMES; ref_frame++) {
+ ref_idx = get_ref_frame_map_idx(cm, ref_frame);
+ assert(ref_idx != INVALID_IDX);
+ const RefCntBuffer *const buf = cm->ref_frame_map[ref_idx];
+ if (buf->film_grain_params_present &&
+ aom_check_grain_params_equiv(pars, &buf->film_grain_params)) {
+ break;
+ }
+ }
+ assert(ref_frame < REF_FRAMES);
+ aom_wb_write_literal(wb, ref_idx, 3);
+ return;
+ }
+
+ // Scaling functions parameters
+ aom_wb_write_literal(wb, pars->num_y_points, 4); // max 14
+ for (int i = 0; i < pars->num_y_points; i++) {
+ aom_wb_write_literal(wb, pars->scaling_points_y[i][0], 8);
+ aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8);
+ }
+
+ if (!cm->seq_params->monochrome) {
+ aom_wb_write_bit(wb, pars->chroma_scaling_from_luma);
+ } else {
+ assert(!pars->chroma_scaling_from_luma);
+ }
+
+ if (cm->seq_params->monochrome || pars->chroma_scaling_from_luma ||
+ ((cm->seq_params->subsampling_x == 1) &&
+ (cm->seq_params->subsampling_y == 1) && (pars->num_y_points == 0))) {
+ assert(pars->num_cb_points == 0 && pars->num_cr_points == 0);
+ } else {
+ aom_wb_write_literal(wb, pars->num_cb_points, 4); // max 10
+ for (int i = 0; i < pars->num_cb_points; i++) {
+ aom_wb_write_literal(wb, pars->scaling_points_cb[i][0], 8);
+ aom_wb_write_literal(wb, pars->scaling_points_cb[i][1], 8);
+ }
+
+ aom_wb_write_literal(wb, pars->num_cr_points, 4); // max 10
+ for (int i = 0; i < pars->num_cr_points; i++) {
+ aom_wb_write_literal(wb, pars->scaling_points_cr[i][0], 8);
+ aom_wb_write_literal(wb, pars->scaling_points_cr[i][1], 8);
+ }
+ }
+
+ aom_wb_write_literal(wb, pars->scaling_shift - 8, 2); // 8 + value
+
+ // AR coefficients
+  // Only sent if the corresponding scaling function has
+ // more than 0 points
+
+ aom_wb_write_literal(wb, pars->ar_coeff_lag, 2);
+
+ int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+ int num_pos_chroma = num_pos_luma;
+ if (pars->num_y_points > 0) ++num_pos_chroma;
+
+ if (pars->num_y_points)
+ for (int i = 0; i < num_pos_luma; i++)
+ aom_wb_write_literal(wb, pars->ar_coeffs_y[i] + 128, 8);
+
+ if (pars->num_cb_points || pars->chroma_scaling_from_luma)
+ for (int i = 0; i < num_pos_chroma; i++)
+ aom_wb_write_literal(wb, pars->ar_coeffs_cb[i] + 128, 8);
+
+ if (pars->num_cr_points || pars->chroma_scaling_from_luma)
+ for (int i = 0; i < num_pos_chroma; i++)
+ aom_wb_write_literal(wb, pars->ar_coeffs_cr[i] + 128, 8);
+
+  aom_wb_write_literal(wb, pars->ar_coeff_shift - 6, 2);  // 6 + value
+
+ aom_wb_write_literal(wb, pars->grain_scale_shift, 2);
+
+ if (pars->num_cb_points) {
+ aom_wb_write_literal(wb, pars->cb_mult, 8);
+ aom_wb_write_literal(wb, pars->cb_luma_mult, 8);
+ aom_wb_write_literal(wb, pars->cb_offset, 9);
+ }
+
+ if (pars->num_cr_points) {
+ aom_wb_write_literal(wb, pars->cr_mult, 8);
+ aom_wb_write_literal(wb, pars->cr_luma_mult, 8);
+ aom_wb_write_literal(wb, pars->cr_offset, 9);
+ }
+
+ aom_wb_write_bit(wb, pars->overlap_flag);
+
+ aom_wb_write_bit(wb, pars->clip_to_restricted_range);
+}
+
+static AOM_INLINE void write_sb_size(const SequenceHeader *const seq_params,
+ struct aom_write_bit_buffer *wb) {
+ (void)seq_params;
+ (void)wb;
+ assert(seq_params->mib_size == mi_size_wide[seq_params->sb_size]);
+ assert(seq_params->mib_size == 1 << seq_params->mib_size_log2);
+ assert(seq_params->sb_size == BLOCK_128X128 ||
+ seq_params->sb_size == BLOCK_64X64);
+ aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0);
+}
+
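+// Writes the core sequence header fields: maximum frame dimensions, frame id
+// signaling, superblock size, and the per-sequence coding tool flags.
+// Profile and color configuration are written separately.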
+static AOM_INLINE void write_sequence_header(
+ const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) {
+ aom_wb_write_literal(wb, seq_params->num_bits_width - 1, 4);
+ aom_wb_write_literal(wb, seq_params->num_bits_height - 1, 4);
+ aom_wb_write_literal(wb, seq_params->max_frame_width - 1,
+ seq_params->num_bits_width);
+ aom_wb_write_literal(wb, seq_params->max_frame_height - 1,
+ seq_params->num_bits_height);
+
+ if (!seq_params->reduced_still_picture_hdr) {
+ aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag);
+ if (seq_params->frame_id_numbers_present_flag) {
+ // We must always have delta_frame_id_length < frame_id_length,
+ // in order for a frame to be referenced with a unique delta.
+ // Avoid wasting bits by using a coding that enforces this restriction.
+ aom_wb_write_literal(wb, seq_params->delta_frame_id_length - 2, 4);
+ aom_wb_write_literal(
+ wb,
+ seq_params->frame_id_length - seq_params->delta_frame_id_length - 1,
+ 3);
+ }
+ }
+
+ write_sb_size(seq_params, wb);
+
+ aom_wb_write_bit(wb, seq_params->enable_filter_intra);
+ aom_wb_write_bit(wb, seq_params->enable_intra_edge_filter);
+
+ if (!seq_params->reduced_still_picture_hdr) {
+ aom_wb_write_bit(wb, seq_params->enable_interintra_compound);
+ aom_wb_write_bit(wb, seq_params->enable_masked_compound);
+ aom_wb_write_bit(wb, seq_params->enable_warped_motion);
+ aom_wb_write_bit(wb, seq_params->enable_dual_filter);
+
+ aom_wb_write_bit(wb, seq_params->order_hint_info.enable_order_hint);
+
+ if (seq_params->order_hint_info.enable_order_hint) {
+ aom_wb_write_bit(wb, seq_params->order_hint_info.enable_dist_wtd_comp);
+ aom_wb_write_bit(wb, seq_params->order_hint_info.enable_ref_frame_mvs);
+ }
+ if (seq_params->force_screen_content_tools == 2) {
+ aom_wb_write_bit(wb, 1);
+ } else {
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, seq_params->force_screen_content_tools);
+ }
+ if (seq_params->force_screen_content_tools > 0) {
+ if (seq_params->force_integer_mv == 2) {
+ aom_wb_write_bit(wb, 1);
+ } else {
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, seq_params->force_integer_mv);
+ }
+ } else {
+ assert(seq_params->force_integer_mv == 2);
+ }
+ if (seq_params->order_hint_info.enable_order_hint)
+ aom_wb_write_literal(
+ wb, seq_params->order_hint_info.order_hint_bits_minus_1, 3);
+ }
+
+ aom_wb_write_bit(wb, seq_params->enable_superres);
+ aom_wb_write_bit(wb, seq_params->enable_cdef);
+ aom_wb_write_bit(wb, seq_params->enable_restoration);
+}
+
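+// Writes one global motion model: the transformation type, then the affine
+// and translation parameters, each coded differentially against the
+// reference frame's parameters with a signed subexponential code.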
+static AOM_INLINE void write_global_motion_params(
+ const WarpedMotionParams *params, const WarpedMotionParams *ref_params,
+ struct aom_write_bit_buffer *wb, int allow_hp) {
+ const TransformationType type = params->wmtype;
+
+  // As a workaround for an AV1 spec bug, we avoid choosing TRANSLATION
+  // type models. Check here that we don't accidentally pick one somehow.
+  // See the comments in gm_get_motion_vector() for details on the bug we
+  // are working around here.
+ assert(type != TRANSLATION);
+
+ aom_wb_write_bit(wb, type != IDENTITY);
+ if (type != IDENTITY) {
+ aom_wb_write_bit(wb, type == ROTZOOM);
+ if (type != ROTZOOM) aom_wb_write_bit(wb, type == TRANSLATION);
+ }
+
+ if (type >= ROTZOOM) {
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+ (params->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+ }
+
+ if (type >= AFFINE) {
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+ (params->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ }
+
+ if (type >= TRANSLATION) {
+ const int trans_bits = (type == TRANSLATION)
+ ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+ : GM_ABS_TRANS_BITS;
+ const int trans_prec_diff = (type == TRANSLATION)
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+ : GM_TRANS_PREC_DIFF;
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[0] >> trans_prec_diff),
+ (params->wmmat[0] >> trans_prec_diff));
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[1] >> trans_prec_diff),
+ (params->wmmat[1] >> trans_prec_diff));
+ }
+}
+
+static AOM_INLINE void write_global_motion(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *wb) {
+ AV1_COMMON *const cm = &cpi->common;
+ int frame;
+ for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+ const WarpedMotionParams *ref_params =
+ cm->prev_frame ? &cm->prev_frame->global_motion[frame]
+ : &default_warp_params;
+ write_global_motion_params(&cm->global_motion[frame], ref_params, wb,
+ cm->features.allow_high_precision_mv);
+ // TODO(sarahparker, debargha): The logic in the commented out code below
+ // does not work currently and causes mismatches when resize is on.
+ // Fix it before turning the optimization back on.
+ /*
+ YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_yv12_buf(cpi, frame);
+ if (cpi->source->y_crop_width == ref_buf->y_crop_width &&
+ cpi->source->y_crop_height == ref_buf->y_crop_height) {
+ write_global_motion_params(&cm->global_motion[frame],
+ &cm->prev_frame->global_motion[frame], wb,
+ cm->features.allow_high_precision_mv);
+ } else {
+ assert(cm->global_motion[frame].wmtype == IDENTITY &&
+ "Invalid warp type for frames of different resolutions");
+ }
+ */
+ /*
+ printf("Frame %d/%d: Enc Ref %d: %d %d %d %d\n",
+ cm->current_frame.frame_number, cm->show_frame, frame,
+ cm->global_motion[frame].wmmat[0],
+ cm->global_motion[frame].wmmat[1], cm->global_motion[frame].wmmat[2],
+ cm->global_motion[frame].wmmat[3]);
+ */
+ }
+}
+
+static int check_frame_refs_short_signaling(AV1_COMMON *const cm,
+ bool enable_ref_short_signaling) {
+  // In the RTC case, when resolution < 360p and speed >= 9, we turn on
+  // frame_refs_short_signaling if it won't break the decoder.
+ if (enable_ref_short_signaling) {
+ const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+ const int base =
+ 1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
+
+ const int order_hint_group_cur =
+ cm->current_frame.display_order_hint / base;
+ const int order_hint_group_gld =
+ cm->ref_frame_map[gld_map_idx]->display_order_hint / base;
+ const int relative_dist = cm->current_frame.order_hint -
+ cm->ref_frame_map[gld_map_idx]->order_hint;
+
+    // If the current frame and the GOLDEN frame are in the same order_hint
+    // group and are at most 64 frames apart, return 1.
+ if (order_hint_group_cur == order_hint_group_gld && relative_dist >= 0 &&
+ relative_dist <= 64) {
+ return 1;
+ }
+ return 0;
+ }
+
+ // Check whether all references are distinct frames.
+ const RefCntBuffer *seen_bufs[INTER_REFS_PER_FRAME] = { NULL };
+ int num_refs = 0;
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ if (buf != NULL) {
+ int seen = 0;
+ for (int i = 0; i < num_refs; i++) {
+ if (seen_bufs[i] == buf) {
+ seen = 1;
+ break;
+ }
+ }
+ if (!seen) seen_bufs[num_refs++] = buf;
+ }
+ }
+
+ // We only turn on frame_refs_short_signaling when all references are
+ // distinct.
+ if (num_refs < INTER_REFS_PER_FRAME) {
+    // This indicates that more than one reference frame points to the same
+    // reference buffer, i.e., two or more references are duplicates.
+ return 0;
+ }
+
+  // Check whether the encoder-side ref frame choices match those that will
+  // be derived at the decoder side.
+ int remapped_ref_idx_decoder[REF_FRAMES];
+
+ const int lst_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
+ const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+
+ // Set up the frame refs mapping indexes according to the
+ // frame_refs_short_signaling policy.
+ av1_set_frame_refs(cm, remapped_ref_idx_decoder, lst_map_idx, gld_map_idx);
+
+ // We only turn on frame_refs_short_signaling when the encoder side decision
+ // on ref frames is identical to that at the decoder side.
+ int frame_refs_short_signaling = 1;
+ for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ++ref_idx) {
+    // Compare the reference buffers selected by the encoder-side and
+    // decoder-side decisions for this reference slot.
+ RefCntBuffer *ref_frame_buf_new = NULL;
+ if (remapped_ref_idx_decoder[ref_idx] != INVALID_IDX) {
+ ref_frame_buf_new = cm->ref_frame_map[remapped_ref_idx_decoder[ref_idx]];
+ }
+ if (get_ref_frame_buf(cm, LAST_FRAME + ref_idx) != ref_frame_buf_new) {
+ frame_refs_short_signaling = 0;
+ break;
+ }
+ }
+
+#if 0 // For debug
+ printf("\nFrame=%d: \n", cm->current_frame.frame_number);
+ printf("***frame_refs_short_signaling=%d\n", frame_refs_short_signaling);
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ printf("enc_ref(map_idx=%d)=%d, vs. "
+ "dec_ref(map_idx=%d)=%d\n",
+ get_ref_frame_map_idx(cm, ref_frame), ref_frame,
+ cm->remapped_ref_idx[ref_frame - LAST_FRAME],
+ ref_frame);
+ }
+#endif // 0
+
+ return frame_refs_short_signaling;
+}
+
+// New function based on HLS R18
+static AOM_INLINE void write_uncompressed_header_obu(
+ AV1_COMP *cpi, MACROBLOCKD *const xd, struct aom_write_bit_buffer *saved_wb,
+ struct aom_write_bit_buffer *wb) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const CommonQuantParams *quant_params = &cm->quant_params;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ FeatureFlags *const features = &cm->features;
+
+ if (!cpi->sf.rt_sf.enable_ref_short_signaling ||
+ !seq_params->order_hint_info.enable_order_hint ||
+ seq_params->order_hint_info.enable_ref_frame_mvs) {
+ current_frame->frame_refs_short_signaling = 0;
+ } else {
+ current_frame->frame_refs_short_signaling = 1;
+ }
+
+ if (seq_params->still_picture) {
+ assert(cm->show_existing_frame == 0);
+ assert(cm->show_frame == 1);
+ assert(current_frame->frame_type == KEY_FRAME);
+ }
+ if (!seq_params->reduced_still_picture_hdr) {
+ if (encode_show_existing_frame(cm)) {
+ aom_wb_write_bit(wb, 1); // show_existing_frame
+ aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
+
+ if (seq_params->decoder_model_info_present_flag &&
+ seq_params->timing_info.equal_picture_interval == 0) {
+ write_tu_pts_info(cm, wb);
+ }
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_len = seq_params->frame_id_length;
+ int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
+ aom_wb_write_literal(wb, display_frame_id, frame_id_len);
+ }
+ return;
+ } else {
+ aom_wb_write_bit(wb, 0); // show_existing_frame
+ }
+
+ aom_wb_write_literal(wb, current_frame->frame_type, 2);
+
+ aom_wb_write_bit(wb, cm->show_frame);
+ if (cm->show_frame) {
+ if (seq_params->decoder_model_info_present_flag &&
+ seq_params->timing_info.equal_picture_interval == 0)
+ write_tu_pts_info(cm, wb);
+ } else {
+ aom_wb_write_bit(wb, cm->showable_frame);
+ }
+ if (frame_is_sframe(cm)) {
+ assert(features->error_resilient_mode);
+ } else if (!(current_frame->frame_type == KEY_FRAME && cm->show_frame)) {
+ aom_wb_write_bit(wb, features->error_resilient_mode);
+ }
+ }
+ aom_wb_write_bit(wb, features->disable_cdf_update);
+
+ if (seq_params->force_screen_content_tools == 2) {
+ aom_wb_write_bit(wb, features->allow_screen_content_tools);
+ } else {
+ assert(features->allow_screen_content_tools ==
+ seq_params->force_screen_content_tools);
+ }
+
+ if (features->allow_screen_content_tools) {
+ if (seq_params->force_integer_mv == 2) {
+ aom_wb_write_bit(wb, features->cur_frame_force_integer_mv);
+ } else {
+ assert(features->cur_frame_force_integer_mv ==
+ seq_params->force_integer_mv);
+ }
+ } else {
+ assert(features->cur_frame_force_integer_mv == 0);
+ }
+
+ int frame_size_override_flag = 0;
+
+ if (seq_params->reduced_still_picture_hdr) {
+ assert(cm->superres_upscaled_width == seq_params->max_frame_width &&
+ cm->superres_upscaled_height == seq_params->max_frame_height);
+ } else {
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_len = seq_params->frame_id_length;
+ aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len);
+ }
+
+ if (cm->superres_upscaled_width > seq_params->max_frame_width ||
+ cm->superres_upscaled_height > seq_params->max_frame_height) {
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Frame dimensions are larger than the maximum values");
+ }
+
+ frame_size_override_flag =
+ frame_is_sframe(cm)
+ ? 1
+ : (cm->superres_upscaled_width != seq_params->max_frame_width ||
+ cm->superres_upscaled_height != seq_params->max_frame_height);
+ if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag);
+
+ if (seq_params->order_hint_info.enable_order_hint)
+ aom_wb_write_literal(
+ wb, current_frame->order_hint,
+ seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
+
+ if (!features->error_resilient_mode && !frame_is_intra_only(cm)) {
+ aom_wb_write_literal(wb, features->primary_ref_frame, PRIMARY_REF_BITS);
+ }
+ }
+
+ if (seq_params->decoder_model_info_present_flag) {
+ aom_wb_write_bit(wb, cpi->ppi->buffer_removal_time_present);
+ if (cpi->ppi->buffer_removal_time_present) {
+ for (int op_num = 0;
+ op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
+ if (seq_params->op_params[op_num].decoder_model_param_present_flag) {
+ if (seq_params->operating_point_idc[op_num] == 0 ||
+ ((seq_params->operating_point_idc[op_num] >>
+ cm->temporal_layer_id) &
+ 0x1 &&
+ (seq_params->operating_point_idc[op_num] >>
+ (cm->spatial_layer_id + 8)) &
+ 0x1)) {
+ aom_wb_write_unsigned_literal(
+ wb, cm->buffer_removal_times[op_num],
+ seq_params->decoder_model_info.buffer_removal_time_length);
+ cm->buffer_removal_times[op_num]++;
+ if (cm->buffer_removal_times[op_num] == 0) {
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "buffer_removal_time overflowed");
+ }
+ }
+ }
+ }
+ }
+ }
+
+  // Shown keyframes and switch-frames automatically refresh all reference
+  // frames. For all other frame types, we need to write refresh_frame_flags.
+ if ((current_frame->frame_type == KEY_FRAME && !cm->show_frame) ||
+ current_frame->frame_type == INTER_FRAME ||
+ current_frame->frame_type == INTRA_ONLY_FRAME)
+ aom_wb_write_literal(wb, current_frame->refresh_frame_flags, REF_FRAMES);
+
+ if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xff) {
+ // Write all ref frame order hints if error_resilient_mode == 1
+ if (features->error_resilient_mode &&
+ seq_params->order_hint_info.enable_order_hint) {
+ for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
+ aom_wb_write_literal(
+ wb, cm->ref_frame_map[ref_idx]->order_hint,
+ seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
+ }
+ }
+ }
+
+ if (current_frame->frame_type == KEY_FRAME) {
+ write_frame_size(cm, frame_size_override_flag, wb);
+ assert(!av1_superres_scaled(cm) || !features->allow_intrabc);
+ if (features->allow_screen_content_tools && !av1_superres_scaled(cm))
+ aom_wb_write_bit(wb, features->allow_intrabc);
+ } else {
+ if (current_frame->frame_type == INTRA_ONLY_FRAME) {
+ write_frame_size(cm, frame_size_override_flag, wb);
+ assert(!av1_superres_scaled(cm) || !features->allow_intrabc);
+ if (features->allow_screen_content_tools && !av1_superres_scaled(cm))
+ aom_wb_write_bit(wb, features->allow_intrabc);
+ } else if (current_frame->frame_type == INTER_FRAME ||
+ frame_is_sframe(cm)) {
+ MV_REFERENCE_FRAME ref_frame;
+
+ // NOTE: Error resilient mode turns off frame_refs_short_signaling
+ // automatically.
+#define FRAME_REFS_SHORT_SIGNALING 0
+#if FRAME_REFS_SHORT_SIGNALING
+ current_frame->frame_refs_short_signaling =
+ seq_params->order_hint_info.enable_order_hint;
+#endif // FRAME_REFS_SHORT_SIGNALING
+
+ if (current_frame->frame_refs_short_signaling) {
+        // In the RTC case, when cpi->sf.rt_sf.enable_ref_short_signaling is
+        // true, we turn on frame_refs_short_signaling when the current frame
+        // and the golden frame are in the same order_hint group and their
+        // relative distance is <= 64 (so that the stream stays decodable).
+
+        // For other cases, check_frame_refs_short_signaling() also provides
+        // an example encoder-side implementation, where
+        // frame_refs_short_signaling is only turned on when the encoder-side
+        // decision on ref frames is identical to the one derived at the
+        // decoder side.
+
+ current_frame->frame_refs_short_signaling =
+ check_frame_refs_short_signaling(
+ cm, cpi->sf.rt_sf.enable_ref_short_signaling);
+ }
+
+ if (seq_params->order_hint_info.enable_order_hint)
+ aom_wb_write_bit(wb, current_frame->frame_refs_short_signaling);
+
+ if (current_frame->frame_refs_short_signaling) {
+ const int lst_ref = get_ref_frame_map_idx(cm, LAST_FRAME);
+ aom_wb_write_literal(wb, lst_ref, REF_FRAMES_LOG2);
+
+ const int gld_ref = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+ aom_wb_write_literal(wb, gld_ref, REF_FRAMES_LOG2);
+ }
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ assert(get_ref_frame_map_idx(cm, ref_frame) != INVALID_IDX);
+ if (!current_frame->frame_refs_short_signaling)
+ aom_wb_write_literal(wb, get_ref_frame_map_idx(cm, ref_frame),
+ REF_FRAMES_LOG2);
+ if (seq_params->frame_id_numbers_present_flag) {
+ int i = get_ref_frame_map_idx(cm, ref_frame);
+ int frame_id_len = seq_params->frame_id_length;
+ int diff_len = seq_params->delta_frame_id_length;
+ int delta_frame_id_minus_1 =
+ ((cm->current_frame_id - cm->ref_frame_id[i] +
+ (1 << frame_id_len)) %
+ (1 << frame_id_len)) -
+ 1;
+ if (delta_frame_id_minus_1 < 0 ||
+ delta_frame_id_minus_1 >= (1 << diff_len)) {
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "Invalid delta_frame_id_minus_1");
+ }
+ aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len);
+ }
+ }
+
+ if (!features->error_resilient_mode && frame_size_override_flag) {
+ write_frame_size_with_refs(cm, wb);
+ } else {
+ write_frame_size(cm, frame_size_override_flag, wb);
+ }
+
+ if (!features->cur_frame_force_integer_mv)
+ aom_wb_write_bit(wb, features->allow_high_precision_mv);
+ write_frame_interp_filter(features->interp_filter, wb);
+ aom_wb_write_bit(wb, features->switchable_motion_mode);
+ if (frame_might_allow_ref_frame_mvs(cm)) {
+ aom_wb_write_bit(wb, features->allow_ref_frame_mvs);
+ } else {
+ assert(features->allow_ref_frame_mvs == 0);
+ }
+ }
+ }
+
+ const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) &&
+ !(features->disable_cdf_update);
+ if (cm->tiles.large_scale)
+ assert(features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED);
+
+ if (might_bwd_adapt) {
+ aom_wb_write_bit(
+ wb, features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED);
+ }
+
+ write_tile_info(cm, saved_wb, wb);
+ encode_quantization(quant_params, av1_num_planes(cm),
+ cm->seq_params->separate_uv_delta_q, wb);
+ encode_segmentation(cm, wb);
+
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ if (delta_q_info->delta_q_present_flag) assert(quant_params->base_qindex > 0);
+ if (quant_params->base_qindex > 0) {
+ aom_wb_write_bit(wb, delta_q_info->delta_q_present_flag);
+ if (delta_q_info->delta_q_present_flag) {
+ aom_wb_write_literal(wb, get_msb(delta_q_info->delta_q_res), 2);
+ xd->current_base_qindex = quant_params->base_qindex;
+ if (features->allow_intrabc)
+ assert(delta_q_info->delta_lf_present_flag == 0);
+ else
+ aom_wb_write_bit(wb, delta_q_info->delta_lf_present_flag);
+ if (delta_q_info->delta_lf_present_flag) {
+ aom_wb_write_literal(wb, get_msb(delta_q_info->delta_lf_res), 2);
+ aom_wb_write_bit(wb, delta_q_info->delta_lf_multi);
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ }
+ }
+ }
+
+ if (features->all_lossless) {
+ assert(!av1_superres_scaled(cm));
+ } else {
+ if (!features->coded_lossless) {
+ encode_loopfilter(cm, wb);
+ encode_cdef(cm, wb);
+ }
+ encode_restoration_mode(cm, wb);
+ }
+
+ // Write TX mode
+ if (features->coded_lossless)
+ assert(features->tx_mode == ONLY_4X4);
+ else
+ aom_wb_write_bit(wb, features->tx_mode == TX_MODE_SELECT);
+
+ if (!frame_is_intra_only(cm)) {
+ const int use_hybrid_pred =
+ current_frame->reference_mode == REFERENCE_MODE_SELECT;
+
+ aom_wb_write_bit(wb, use_hybrid_pred);
+ }
+
+ if (current_frame->skip_mode_info.skip_mode_allowed)
+ aom_wb_write_bit(wb, current_frame->skip_mode_info.skip_mode_flag);
+
+ if (frame_might_allow_warped_motion(cm))
+ aom_wb_write_bit(wb, features->allow_warped_motion);
+ else
+ assert(!features->allow_warped_motion);
+
+ aom_wb_write_bit(wb, features->reduced_tx_set_used);
+
+ if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb);
+
+ if (seq_params->film_grain_params_present &&
+ (cm->show_frame || cm->showable_frame))
+ write_film_grain_params(cpi, wb);
+
+ if (cm->tiles.large_scale) write_ext_tile_info(cm, saved_wb, wb);
+}
+
+static int choose_size_bytes(uint32_t size, int spare_msbs) {
+ // Choose the number of bytes required to represent size, without
+ // using the 'spare_msbs' number of most significant bits.
+
+ // Make sure we will fit in 4 bytes to start with.
+ if (spare_msbs > 0 && size >> (32 - spare_msbs) != 0) return -1;
+
+ // Normalise to 32 bits
+ size <<= spare_msbs;
+
+ if (size >> 24 != 0)
+ return 4;
+ else if (size >> 16 != 0)
+ return 3;
+ else if (size >> 8 != 0)
+ return 2;
+ else
+ return 1;
+}
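+
+// Illustrative sketch (not part of the upstream source): a few worked
+// inputs for choose_size_bytes(). With no spare MSBs the result is simply
+// the minimum number of little-endian bytes needed for the value; with one
+// spare MSB (large-scale tile copy mode) the value is first normalised by
+// shifting left, so values near a byte boundary need an extra byte.
+#if 0
+static void choose_size_bytes_examples(void) {
+  assert(choose_size_bytes(0x45, 0) == 1); // fits in one byte
+  assert(choose_size_bytes(0x1234, 0) == 2); // fits in two bytes
+  assert(choose_size_bytes(0x80, 1) == 2); // 0x80 << 1 = 0x100 needs two
+  assert(choose_size_bytes(0xFFFFFFFF, 1) == -1); // cannot spare the MSB
+}
+#endif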
+
+static AOM_INLINE void mem_put_varsize(uint8_t *const dst, const int sz,
+ const int val) {
+ switch (sz) {
+ case 1: dst[0] = (uint8_t)(val & 0xff); break;
+ case 2: mem_put_le16(dst, val); break;
+ case 3: mem_put_le24(dst, val); break;
+ case 4: mem_put_le32(dst, val); break;
+ default: assert(0 && "Invalid size"); break;
+ }
+}
+
+static int remux_tiles(const CommonTileParams *const tiles, uint8_t *dst,
+ const uint32_t data_size, const uint32_t max_tile_size,
+ const uint32_t max_tile_col_size,
+ int *const tile_size_bytes,
+ int *const tile_col_size_bytes) {
+ // Choose the tile size bytes (tsb) and tile column size bytes (tcsb)
+ int tsb;
+ int tcsb;
+
+ if (tiles->large_scale) {
+ // The top bit in the tile size field indicates tile copy mode, so we
+ // have 1 less bit to code the tile size
+ tsb = choose_size_bytes(max_tile_size, 1);
+ tcsb = choose_size_bytes(max_tile_col_size, 0);
+ } else {
+ tsb = choose_size_bytes(max_tile_size, 0);
+ tcsb = 4; // This is ignored
+ (void)max_tile_col_size;
+ }
+
+ assert(tsb > 0);
+ assert(tcsb > 0);
+
+ *tile_size_bytes = tsb;
+ *tile_col_size_bytes = tcsb;
+ if (tsb == 4 && tcsb == 4) return data_size;
+
+ uint32_t wpos = 0;
+ uint32_t rpos = 0;
+
+ if (tiles->large_scale) {
+ int tile_row;
+ int tile_col;
+
+ for (tile_col = 0; tile_col < tiles->cols; tile_col++) {
+ // Every column except the last has a column header
+ if (tile_col < tiles->cols - 1) {
+ uint32_t tile_col_size = mem_get_le32(dst + rpos);
+ rpos += 4;
+
+ // Adjust the tile column size by the number of bytes removed
+ // from the tile size fields.
+ tile_col_size -= (4 - tsb) * tiles->rows;
+
+ mem_put_varsize(dst + wpos, tcsb, tile_col_size);
+ wpos += tcsb;
+ }
+
+ for (tile_row = 0; tile_row < tiles->rows; tile_row++) {
+ // Every row, including the last, has a header
+ uint32_t tile_header = mem_get_le32(dst + rpos);
+ rpos += 4;
+
+ // If this is a copy tile, we need to shift the MSB to the
+ // top bit of the new width, and there is no data to copy.
+ if (tile_header >> 31 != 0) {
+ if (tsb < 4) tile_header >>= 32 - 8 * tsb;
+ mem_put_varsize(dst + wpos, tsb, tile_header);
+ wpos += tsb;
+ } else {
+ mem_put_varsize(dst + wpos, tsb, tile_header);
+ wpos += tsb;
+
+ tile_header += AV1_MIN_TILE_SIZE_BYTES;
+ memmove(dst + wpos, dst + rpos, tile_header);
+ rpos += tile_header;
+ wpos += tile_header;
+ }
+ }
+ }
+
+ assert(rpos > wpos);
+ assert(rpos == data_size);
+
+ return wpos;
+ }
+ const int n_tiles = tiles->cols * tiles->rows;
+ int n;
+
+ for (n = 0; n < n_tiles; n++) {
+ int tile_size;
+
+ if (n == n_tiles - 1) {
+ tile_size = data_size - rpos;
+ } else {
+ tile_size = mem_get_le32(dst + rpos);
+ rpos += 4;
+ mem_put_varsize(dst + wpos, tsb, tile_size);
+ tile_size += AV1_MIN_TILE_SIZE_BYTES;
+ wpos += tsb;
+ }
+
+ memmove(dst + wpos, dst + rpos, tile_size);
+
+ rpos += tile_size;
+ wpos += tile_size;
+ }
+
+ assert(rpos > wpos);
+ assert(rpos == data_size);
+
+ return wpos;
+}
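+
+// Illustrative sketch (hypothetical values, not upstream code): remux_tiles()
+// on a 1x2 non-large-scale tile layout. On input, every tile except the last
+// is preceded by a 4-byte little-endian size field; on output that field is
+// shrunk to tile_size_bytes (here 1) and the payloads are slid down with
+// memmove(), so the function returns a smaller total size.
+#if 0
+static void remux_tiles_example(const CommonTileParams *tiles) {
+  // Assumes tiles->cols = 2, tiles->rows = 1, tiles->large_scale = 0.
+  // data_size = 4 (size field) + 3 (tile 0) + 5 (tile 1) = 12 bytes.
+  uint8_t buf[12] = { 2, 0, 0, 0, // tile 0 size minus AV1_MIN_TILE_SIZE_BYTES
+                      0xAA, 0xBB, 0xCC, // tile 0 payload
+                      0x10, 0x20, 0x30, 0x40, 0x50 }; // tile 1 payload
+  int tsb, tcsb;
+  const int out = remux_tiles(tiles, buf, 12, /*max_tile_size=*/5,
+                              /*max_tile_col_size=*/0, &tsb, &tcsb);
+  assert(tsb == 1 && out == 9); // 3 bytes saved on the one coded size field
+}
+#endif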
+
+uint32_t av1_write_obu_header(AV1LevelParams *const level_params,
+ int *frame_header_count, OBU_TYPE obu_type,
+ int obu_extension, uint8_t *const dst) {
+ if (level_params->keep_level_stats &&
+ (obu_type == OBU_FRAME || obu_type == OBU_FRAME_HEADER))
+ ++(*frame_header_count);
+
+ struct aom_write_bit_buffer wb = { dst, 0 };
+ uint32_t size = 0;
+
+ aom_wb_write_literal(&wb, 0, 1); // forbidden bit.
+ aom_wb_write_literal(&wb, (int)obu_type, 4);
+ aom_wb_write_literal(&wb, obu_extension ? 1 : 0, 1);
+ aom_wb_write_literal(&wb, 1, 1); // obu_has_size_field
+ aom_wb_write_literal(&wb, 0, 1); // reserved
+
+ if (obu_extension) {
+ aom_wb_write_literal(&wb, obu_extension & 0xFF, 8);
+ }
+
+ size = aom_wb_bytes_written(&wb);
+ return size;
+}
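+
+// Illustrative sketch (not upstream code): the first OBU header byte packs,
+// MSB to LSB, forbidden_bit(1) | obu_type(4) | extension_flag(1) |
+// obu_has_size_field(1) | reserved(1). For OBU_FRAME (type 6) with no
+// extension this is 0b00110010 = 0x32.
+#if 0
+static uint8_t obu_header_byte_example(void) {
+  const int forbidden = 0, ext = 0, has_size = 1, reserved = 0;
+  return (uint8_t)((forbidden << 7) | ((int)OBU_FRAME << 3) | (ext << 2) |
+                   (has_size << 1) | reserved); // == 0x32
+}
+#endif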
+
+int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size,
+ uint8_t *dest) {
+ const size_t offset = obu_header_size;
+ size_t coded_obu_size = 0;
+ const uint32_t obu_size = (uint32_t)obu_payload_size;
+ assert(obu_size == obu_payload_size);
+
+ if (aom_uleb_encode(obu_size, sizeof(obu_size), dest + offset,
+ &coded_obu_size) != 0) {
+ return AOM_CODEC_ERROR;
+ }
+
+ return AOM_CODEC_OK;
+}
+
+size_t av1_obu_memmove(size_t obu_header_size, size_t obu_payload_size,
+ uint8_t *data) {
+ const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size);
+ const size_t move_dst_offset = length_field_size + obu_header_size;
+ const size_t move_src_offset = obu_header_size;
+ const size_t move_size = obu_payload_size;
+ memmove(data + move_dst_offset, data + move_src_offset, move_size);
+ return length_field_size;
+}
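+
+// Illustrative sketch (hypothetical buffer, not upstream code): a 300-byte
+// payload needs a 2-byte leb128 length field (0xAC 0x02). av1_obu_memmove()
+// slides the payload up to open that gap after the header, and
+// av1_write_uleb_obu_size() then fills it in.
+#if 0
+static void obu_length_field_example(uint8_t *data) {
+  const size_t obu_header_size = 1, obu_payload_size = 300;
+  assert(aom_uleb_size_in_bytes(obu_payload_size) == 2);
+  const size_t gap = av1_obu_memmove(obu_header_size, obu_payload_size, data);
+  assert(gap == 2);
+  if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+      AOM_CODEC_OK)
+    assert(0);
+  // Layout is now: [header(1)][length(2)][payload(300)].
+}
+#endif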
+
+static AOM_INLINE void add_trailing_bits(struct aom_write_bit_buffer *wb) {
+ if (aom_wb_is_byte_aligned(wb)) {
+ aom_wb_write_literal(wb, 0x80, 8);
+ } else {
+ // assumes that the other bits are already 0s
+ aom_wb_write_bit(wb, 1);
+ }
+}
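+
+// Illustrative sketch (not upstream code): trailing bits are a single 1 bit
+// padded with zeros to the next byte boundary. After three payload bits
+// (e.g. 101) the stop bit yields 0b10110000 = 0xB0 in the final byte.
+#if 0
+static void trailing_bits_example(void) {
+  uint8_t buf[1] = { 0 };
+  struct aom_write_bit_buffer wb = { buf, 0 };
+  aom_wb_write_literal(&wb, 5, 3); // payload bits 101
+  add_trailing_bits(&wb); // writes the stop bit
+  assert(buf[0] == 0xB0);
+}
+#endif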
+
+static AOM_INLINE void write_bitstream_level(AV1_LEVEL seq_level_idx,
+ struct aom_write_bit_buffer *wb) {
+ assert(is_valid_seq_level_idx(seq_level_idx));
+ aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS);
+}
+
+uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params,
+ uint8_t *const dst) {
+ struct aom_write_bit_buffer wb = { dst, 0 };
+ uint32_t size = 0;
+
+ write_profile(seq_params->profile, &wb);
+
+ // Still picture or not
+ aom_wb_write_bit(&wb, seq_params->still_picture);
+ assert(IMPLIES(!seq_params->still_picture,
+ !seq_params->reduced_still_picture_hdr));
+ // whether to use reduced still picture header
+ aom_wb_write_bit(&wb, seq_params->reduced_still_picture_hdr);
+
+ if (seq_params->reduced_still_picture_hdr) {
+ assert(seq_params->timing_info_present == 0);
+ assert(seq_params->decoder_model_info_present_flag == 0);
+ assert(seq_params->display_model_info_present_flag == 0);
+ write_bitstream_level(seq_params->seq_level_idx[0], &wb);
+ } else {
+ aom_wb_write_bit(
+ &wb, seq_params->timing_info_present); // timing info present flag
+
+ if (seq_params->timing_info_present) {
+ // timing_info
+ write_timing_info_header(&seq_params->timing_info, &wb);
+ aom_wb_write_bit(&wb, seq_params->decoder_model_info_present_flag);
+ if (seq_params->decoder_model_info_present_flag) {
+ write_decoder_model_info(&seq_params->decoder_model_info, &wb);
+ }
+ }
+ aom_wb_write_bit(&wb, seq_params->display_model_info_present_flag);
+ aom_wb_write_literal(&wb, seq_params->operating_points_cnt_minus_1,
+ OP_POINTS_CNT_MINUS_1_BITS);
+ int i;
+ for (i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
+ aom_wb_write_literal(&wb, seq_params->operating_point_idc[i],
+ OP_POINTS_IDC_BITS);
+ write_bitstream_level(seq_params->seq_level_idx[i], &wb);
+ if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0)
+ aom_wb_write_bit(&wb, seq_params->tier[i]);
+ if (seq_params->decoder_model_info_present_flag) {
+ aom_wb_write_bit(
+ &wb, seq_params->op_params[i].decoder_model_param_present_flag);
+ if (seq_params->op_params[i].decoder_model_param_present_flag) {
+ write_dec_model_op_parameters(
+ &seq_params->op_params[i],
+ seq_params->decoder_model_info
+ .encoder_decoder_buffer_delay_length,
+ &wb);
+ }
+ }
+ if (seq_params->display_model_info_present_flag) {
+ aom_wb_write_bit(
+ &wb, seq_params->op_params[i].display_model_param_present_flag);
+ if (seq_params->op_params[i].display_model_param_present_flag) {
+ assert(seq_params->op_params[i].initial_display_delay >= 1);
+ assert(seq_params->op_params[i].initial_display_delay <= 10);
+ aom_wb_write_literal(
+ &wb, seq_params->op_params[i].initial_display_delay - 1, 4);
+ }
+ }
+ }
+ }
+ write_sequence_header(seq_params, &wb);
+
+ write_color_config(seq_params, &wb);
+
+ aom_wb_write_bit(&wb, seq_params->film_grain_params_present);
+
+ add_trailing_bits(&wb);
+
+ size = aom_wb_bytes_written(&wb);
+ return size;
+}
+
+static uint32_t write_frame_header_obu(AV1_COMP *cpi, MACROBLOCKD *const xd,
+ struct aom_write_bit_buffer *saved_wb,
+ uint8_t *const dst,
+ int append_trailing_bits) {
+ struct aom_write_bit_buffer wb = { dst, 0 };
+ write_uncompressed_header_obu(cpi, xd, saved_wb, &wb);
+ if (append_trailing_bits) add_trailing_bits(&wb);
+ return aom_wb_bytes_written(&wb);
+}
+
+static uint32_t write_tile_group_header(uint8_t *const dst, int start_tile,
+ int end_tile, int tiles_log2,
+ int tile_start_and_end_present_flag) {
+ struct aom_write_bit_buffer wb = { dst, 0 };
+ uint32_t size = 0;
+
+ if (!tiles_log2) return size;
+
+ aom_wb_write_bit(&wb, tile_start_and_end_present_flag);
+
+ if (tile_start_and_end_present_flag) {
+ aom_wb_write_literal(&wb, start_tile, tiles_log2);
+ aom_wb_write_literal(&wb, end_tile, tiles_log2);
+ }
+
+ size = aom_wb_bytes_written(&wb);
+ return size;
+}
+
+extern void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
+ const char *filename);
+
+typedef struct {
+ uint32_t tg_hdr_size;
+ uint32_t frame_header_size;
+} LargeTileFrameOBU;
+
+// Initialize OBU header for large scale tile case.
+static uint32_t init_large_scale_tile_obu_header(
+ AV1_COMP *const cpi, uint8_t **data, struct aom_write_bit_buffer *saved_wb,
+ LargeTileFrameOBU *lst_obu) {
+ AV1LevelParams *const level_params = &cpi->ppi->level_params;
+ CurrentFrame *const current_frame = &cpi->common.current_frame;
+ // For large_scale_tile case, we always have only one tile group, so it can
+ // be written as an OBU_FRAME.
+ const OBU_TYPE obu_type = OBU_FRAME;
+ lst_obu->tg_hdr_size = av1_write_obu_header(
+ level_params, &cpi->frame_header_count, obu_type, 0, *data);
+ *data += lst_obu->tg_hdr_size;
+
+ const uint32_t frame_header_size =
+ write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, saved_wb, *data, 0);
+ *data += frame_header_size;
+ lst_obu->frame_header_size = frame_header_size;
+ // (yunqing) This test ensures the correctness of large scale tile coding.
+ if (cpi->oxcf.tile_cfg.enable_ext_tile_debug) {
+ char fn[20] = "./fh";
+ fn[4] = current_frame->frame_number / 100 + '0';
+ fn[5] = (current_frame->frame_number % 100) / 10 + '0';
+ fn[6] = (current_frame->frame_number % 10) + '0';
+ fn[7] = '\0';
+ av1_print_uncompressed_frame_header(*data - frame_header_size,
+ frame_header_size, fn);
+ }
+ return frame_header_size;
+}
+
+// Write the total buffer size and related information into the OBU header
+// for the large-scale tile case.
+static void write_large_scale_tile_obu_size(
+ const CommonTileParams *const tiles, uint8_t *const dst, uint8_t *data,
+ struct aom_write_bit_buffer *saved_wb, LargeTileFrameOBU *const lst_obu,
+ int have_tiles, uint32_t *total_size, int max_tile_size,
+ int max_tile_col_size) {
+ int tile_size_bytes = 0;
+ int tile_col_size_bytes = 0;
+ if (have_tiles) {
+ *total_size = remux_tiles(
+ tiles, data, *total_size - lst_obu->frame_header_size, max_tile_size,
+ max_tile_col_size, &tile_size_bytes, &tile_col_size_bytes);
+ *total_size += lst_obu->frame_header_size;
+ }
+
+ // In the EXT_TILE case, only one tile group is used. Following the OBU
+ // syntax, write the current tile group size before the tile data
+ // (including the tile column headers). The tile group size does not
+ // include the bytes that store the size itself.
+ *total_size += lst_obu->tg_hdr_size;
+ const uint32_t obu_payload_size = *total_size - lst_obu->tg_hdr_size;
+ const size_t length_field_size =
+ av1_obu_memmove(lst_obu->tg_hdr_size, obu_payload_size, dst);
+ if (av1_write_uleb_obu_size(lst_obu->tg_hdr_size, obu_payload_size, dst) !=
+ AOM_CODEC_OK)
+ assert(0);
+
+ *total_size += (uint32_t)length_field_size;
+ saved_wb->bit_buffer += length_field_size;
+
+ // Now fill in the gaps in the uncompressed header.
+ if (have_tiles) {
+ assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4);
+ aom_wb_overwrite_literal(saved_wb, tile_col_size_bytes - 1, 2);
+
+ assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+ aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
+ }
+}
+
+// Store information on each large scale tile in the OBU header.
+static void write_large_scale_tile_obu(
+ AV1_COMP *const cpi, uint8_t *const dst, LargeTileFrameOBU *const lst_obu,
+ int *const largest_tile_id, uint32_t *total_size, const int have_tiles,
+ unsigned int *const max_tile_size, unsigned int *const max_tile_col_size) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+
+ TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
+ const int tile_cols = tiles->cols;
+ const int tile_rows = tiles->rows;
+ unsigned int tile_size = 0;
+
+ av1_reset_pack_bs_thread_data(&cpi->td);
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileInfo tile_info;
+ const int is_last_col = (tile_col == tile_cols - 1);
+ const uint32_t col_offset = *total_size;
+
+ av1_tile_set_col(&tile_info, cm, tile_col);
+
+ // The last column does not have a column header
+ if (!is_last_col) *total_size += 4;
+
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+ const int data_offset = have_tiles ? 4 : 0;
+ const int tile_idx = tile_row * tile_cols + tile_col;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+ av1_tile_set_row(&tile_info, cm, tile_row);
+ aom_writer mode_bc;
+
+ buf->data = dst + *total_size + lst_obu->tg_hdr_size;
+
+ // If CONFIG_EXT_TILE = 1, every tile in the row has a header,
+ // even the last one, unless no tiling is used at all.
+ *total_size += data_offset;
+ cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+ mode_bc.allow_update_cdf = !tiles->large_scale;
+ mode_bc.allow_update_cdf =
+ mode_bc.allow_update_cdf && !cm->features.disable_cdf_update;
+ aom_start_encode(&mode_bc, buf->data + data_offset);
+ write_modes(cpi, &cpi->td, &tile_info, &mode_bc, tile_row, tile_col);
+ if (aom_stop_encode(&mode_bc) < 0) {
+ aom_internal_error(cm->error, AOM_CODEC_ERROR, "Error writing modes");
+ }
+ tile_size = mode_bc.pos;
+ buf->size = tile_size;
+
+ // Record the maximum tile size we see, so we can compact headers later.
+ if (tile_size > *max_tile_size) {
+ *max_tile_size = tile_size;
+ *largest_tile_id = tile_cols * tile_row + tile_col;
+ }
+
+ if (have_tiles) {
+ // tile header: size of this tile, or copy offset
+ uint32_t tile_header = tile_size - AV1_MIN_TILE_SIZE_BYTES;
+ const int tile_copy_mode =
+ ((AOMMAX(tiles->width, tiles->height) << MI_SIZE_LOG2) <= 256) ? 1
+ : 0;
+
+ // If tile_copy_mode = 1, check whether this tile is a copy tile.
+ // Copy tiles are very unlikely on key frames, so skip the search
+ // there to avoid unnecessary work.
+ if (cm->current_frame.frame_type != KEY_FRAME && tile_copy_mode) {
+ const int identical_tile_offset =
+ find_identical_tile(tile_row, tile_col, tile_buffers);
+
+ // Indicate a copy-tile by setting the most significant bit.
+ // The row-offset to copy from is stored in the highest byte.
+ // remux_tiles will move these around later
+ if (identical_tile_offset > 0) {
+ tile_size = 0;
+ tile_header = identical_tile_offset | 0x80;
+ tile_header <<= 24;
+ }
+ }
+
+ mem_put_le32(buf->data, (MEM_VALUE_T)tile_header);
+ }
+
+ *total_size += tile_size;
+ }
+ if (!is_last_col) {
+ uint32_t col_size = *total_size - col_offset - 4;
+ mem_put_le32(dst + col_offset + lst_obu->tg_hdr_size, col_size);
+
+ // Record the maximum tile column size we see.
+ *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size);
+ }
+ }
+ av1_accumulate_pack_bs_thread_data(cpi, &cpi->td);
+}
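+
+// Illustrative sketch (hypothetical helper, not upstream code): how a reader
+// would interpret the 32-bit tile header written above. The MSB marks a copy
+// tile; the row offset to copy from sits in the high byte, and a copy tile
+// carries no payload.
+#if 0
+static void parse_tile_header_example(uint32_t tile_header) {
+  if (tile_header >> 31) {
+    const int row_offset = (tile_header >> 24) & 0x7F; // rows above to copy
+    (void)row_offset; // no tile payload follows
+  } else {
+    const uint32_t size = tile_header + AV1_MIN_TILE_SIZE_BYTES;
+    (void)size; // a payload of this many bytes follows
+  }
+}
+#endif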
+
+// Packs information in the obu header for large scale tiles.
+static INLINE uint32_t pack_large_scale_tiles_in_tg_obus(
+ AV1_COMP *const cpi, uint8_t *const dst,
+ struct aom_write_bit_buffer *saved_wb, int *const largest_tile_id) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+ uint32_t total_size = 0;
+ unsigned int max_tile_size = 0;
+ unsigned int max_tile_col_size = 0;
+ const int have_tiles = tiles->cols * tiles->rows > 1;
+ uint8_t *data = dst;
+
+ LargeTileFrameOBU lst_obu;
+
+ total_size +=
+ init_large_scale_tile_obu_header(cpi, &data, saved_wb, &lst_obu);
+
+ write_large_scale_tile_obu(cpi, dst, &lst_obu, largest_tile_id, &total_size,
+ have_tiles, &max_tile_size, &max_tile_col_size);
+
+ write_large_scale_tile_obu_size(tiles, dst, data, saved_wb, &lst_obu,
+ have_tiles, &total_size, max_tile_size,
+ max_tile_col_size);
+
+ return total_size;
+}
+
+// Writes obu, tile group and uncompressed headers to bitstream.
+void av1_write_obu_tg_tile_headers(AV1_COMP *const cpi, MACROBLOCKD *const xd,
+ PackBSParams *const pack_bs_params,
+ const int tile_idx) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+ int *const curr_tg_hdr_size = &pack_bs_params->curr_tg_hdr_size;
+ const int tg_size =
+ (tiles->rows * tiles->cols + cpi->num_tg - 1) / cpi->num_tg;
+
+ // Write Tile group, frame and OBU header
+ // A new tile group begins at this tile. Write the obu header and
+ // tile group header
+ const OBU_TYPE obu_type = (cpi->num_tg == 1) ? OBU_FRAME : OBU_TILE_GROUP;
+ *curr_tg_hdr_size = av1_write_obu_header(
+ &cpi->ppi->level_params, &cpi->frame_header_count, obu_type,
+ pack_bs_params->obu_extn_header, pack_bs_params->tile_data_curr);
+ pack_bs_params->obu_header_size = *curr_tg_hdr_size;
+
+ if (cpi->num_tg == 1)
+ *curr_tg_hdr_size += write_frame_header_obu(
+ cpi, xd, pack_bs_params->saved_wb,
+ pack_bs_params->tile_data_curr + *curr_tg_hdr_size, 0);
+ *curr_tg_hdr_size += write_tile_group_header(
+ pack_bs_params->tile_data_curr + *curr_tg_hdr_size, tile_idx,
+ AOMMIN(tile_idx + tg_size - 1, tiles->cols * tiles->rows - 1),
+ (tiles->log2_rows + tiles->log2_cols), cpi->num_tg > 1);
+ *pack_bs_params->total_size += *curr_tg_hdr_size;
+}
+
+// Pack tile data in the bitstream with tile_group, frame
+// and OBU header.
+void av1_pack_tile_info(AV1_COMP *const cpi, ThreadData *const td,
+ PackBSParams *const pack_bs_params) {
+ aom_writer mode_bc;
+ AV1_COMMON *const cm = &cpi->common;
+ int tile_row = pack_bs_params->tile_row;
+ int tile_col = pack_bs_params->tile_col;
+ uint32_t *const total_size = pack_bs_params->total_size;
+ TileInfo tile_info;
+ av1_tile_set_col(&tile_info, cm, tile_col);
+ av1_tile_set_row(&tile_info, cm, tile_row);
+ mode_bc.allow_update_cdf = 1;
+ mode_bc.allow_update_cdf =
+ mode_bc.allow_update_cdf && !cm->features.disable_cdf_update;
+
+ unsigned int tile_size;
+
+ const int num_planes = av1_num_planes(cm);
+ av1_reset_loop_restoration(&td->mb.e_mbd, num_planes);
+
+ pack_bs_params->buf.data = pack_bs_params->dst + *total_size;
+
+ // The last tile of the tile group does not have a header.
+ if (!pack_bs_params->is_last_tile_in_tg) *total_size += 4;
+
+ // Pack tile data
+ aom_start_encode(&mode_bc, pack_bs_params->dst + *total_size);
+ write_modes(cpi, td, &tile_info, &mode_bc, tile_row, tile_col);
+ if (aom_stop_encode(&mode_bc) < 0) {
+ aom_internal_error(td->mb.e_mbd.error_info, AOM_CODEC_ERROR,
+ "Error writing modes");
+ }
+ tile_size = mode_bc.pos;
+ assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES);
+
+ pack_bs_params->buf.size = tile_size;
+
+ // Write tile size
+ if (!pack_bs_params->is_last_tile_in_tg) {
+ // size of this tile
+ mem_put_le32(pack_bs_params->buf.data, tile_size - AV1_MIN_TILE_SIZE_BYTES);
+ }
+}
+
+void av1_write_last_tile_info(
+ AV1_COMP *const cpi, const FrameHeaderInfo *fh_info,
+ struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size,
+ uint8_t *curr_tg_start, uint32_t *const total_size,
+ uint8_t **tile_data_start, int *const largest_tile_id,
+ int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header) {
+ // write current tile group size
+ const uint32_t obu_payload_size =
+ (uint32_t)(*curr_tg_data_size) - obu_header_size;
+ const size_t length_field_size =
+ av1_obu_memmove(obu_header_size, obu_payload_size, curr_tg_start);
+ if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size,
+ curr_tg_start) != AOM_CODEC_OK) {
+ assert(0);
+ }
+ *curr_tg_data_size += (int)length_field_size;
+ *total_size += (uint32_t)length_field_size;
+ *tile_data_start += length_field_size;
+ if (cpi->num_tg == 1) {
+ // if this tg is combined with the frame header then update saved
+ // frame header base offset according to length field size
+ saved_wb->bit_buffer += length_field_size;
+ }
+
+ if (!(*is_first_tg) && cpi->common.features.error_resilient_mode) {
+ // Make room for a duplicate Frame Header OBU.
+ memmove(curr_tg_start + fh_info->total_length, curr_tg_start,
+ *curr_tg_data_size);
+
+ // Insert a copy of the Frame Header OBU.
+ memcpy(curr_tg_start, fh_info->frame_header, fh_info->total_length);
+
+ // Force context update tile to be the first tile in error
+ // resilient mode as the duplicate frame headers will have
+ // context_update_tile_id set to 0
+ *largest_tile_id = 0;
+
+ // Rewrite the OBU header to change the OBU type to Redundant Frame
+ // Header.
+ av1_write_obu_header(&cpi->ppi->level_params, &cpi->frame_header_count,
+ OBU_REDUNDANT_FRAME_HEADER, obu_extn_header,
+ &curr_tg_start[fh_info->obu_header_byte_offset]);
+
+ *curr_tg_data_size += (int)(fh_info->total_length);
+ *total_size += (uint32_t)(fh_info->total_length);
+ }
+ *is_first_tg = 0;
+}
+
+void av1_reset_pack_bs_thread_data(ThreadData *const td) {
+ td->coefficient_size = 0;
+ td->max_mv_magnitude = 0;
+ av1_zero(td->interp_filter_selected);
+}
+
+void av1_accumulate_pack_bs_thread_data(AV1_COMP *const cpi,
+ ThreadData const *td) {
+ int do_max_mv_magnitude_update = 1;
+ cpi->rc.coefficient_size += td->coefficient_size;
+
+ // Disable max_mv_magnitude update for parallel frames based on update flag.
+ if (!cpi->do_frame_data_update) do_max_mv_magnitude_update = 0;
+
+ if (cpi->sf.mv_sf.auto_mv_step_size && do_max_mv_magnitude_update)
+ cpi->mv_search_params.max_mv_magnitude =
+ AOMMAX(cpi->mv_search_params.max_mv_magnitude, td->max_mv_magnitude);
+
+ for (InterpFilter filter = EIGHTTAP_REGULAR; filter < SWITCHABLE; filter++)
+ cpi->common.cur_frame->interp_filter_selected[filter] +=
+ td->interp_filter_selected[filter];
+}
+
+// Store information related to each default tile in the OBU header.
+static void write_tile_obu(
+ AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
+ struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
+ const FrameHeaderInfo *fh_info, int *const largest_tile_id,
+ unsigned int *max_tile_size, uint32_t *const obu_header_size,
+ uint8_t **tile_data_start) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ const CommonTileParams *const tiles = &cm->tiles;
+ const int tile_cols = tiles->cols;
+ const int tile_rows = tiles->rows;
+ // Fixed size tile groups for the moment
+ const int num_tg_hdrs = cpi->num_tg;
+ const int tg_size = (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs;
+ int tile_count = 0;
+ size_t curr_tg_data_size = 0;
+ uint8_t *tile_data_curr = dst;
+ int new_tg = 1;
+ int is_first_tg = 1;
+
+ av1_reset_pack_bs_thread_data(&cpi->td);
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ const int tile_idx = tile_row * tile_cols + tile_col;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+
+ int is_last_tile_in_tg = 0;
+ if (new_tg) {
+ tile_data_curr = dst + *total_size;
+ tile_count = 0;
+ }
+ tile_count++;
+
+ if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1))
+ is_last_tile_in_tg = 1;
+
+ xd->tile_ctx = &this_tile->tctx;
+
+ // PackBSParams stores all parameters required to pack tile and header
+ // info.
+ PackBSParams pack_bs_params;
+ pack_bs_params.dst = dst;
+ pack_bs_params.curr_tg_hdr_size = 0;
+ pack_bs_params.is_last_tile_in_tg = is_last_tile_in_tg;
+ pack_bs_params.new_tg = new_tg;
+ pack_bs_params.obu_extn_header = obu_extn_header;
+ pack_bs_params.obu_header_size = 0;
+ pack_bs_params.saved_wb = saved_wb;
+ pack_bs_params.tile_col = tile_col;
+ pack_bs_params.tile_row = tile_row;
+ pack_bs_params.tile_data_curr = tile_data_curr;
+ pack_bs_params.total_size = total_size;
+
+ if (new_tg)
+ av1_write_obu_tg_tile_headers(cpi, xd, &pack_bs_params, tile_idx);
+
+ av1_pack_tile_info(cpi, &cpi->td, &pack_bs_params);
+
+ if (new_tg) {
+ curr_tg_data_size = pack_bs_params.curr_tg_hdr_size;
+ *tile_data_start += pack_bs_params.curr_tg_hdr_size;
+ *obu_header_size = pack_bs_params.obu_header_size;
+ new_tg = 0;
+ }
+ if (is_last_tile_in_tg) new_tg = 1;
+
+ curr_tg_data_size +=
+ (pack_bs_params.buf.size + (is_last_tile_in_tg ? 0 : 4));
+
+ if (pack_bs_params.buf.size > *max_tile_size) {
+ *largest_tile_id = tile_idx;
+ *max_tile_size = (unsigned int)pack_bs_params.buf.size;
+ }
+
+ if (is_last_tile_in_tg)
+ av1_write_last_tile_info(cpi, fh_info, saved_wb, &curr_tg_data_size,
+ tile_data_curr, total_size, tile_data_start,
+ largest_tile_id, &is_first_tg,
+ *obu_header_size, obu_extn_header);
+ *total_size += (uint32_t)pack_bs_params.buf.size;
+ }
+ }
+ av1_accumulate_pack_bs_thread_data(cpi, &cpi->td);
+}
+
+// Write the total buffer size and related information into the OBU header
+// for the default tile case.
+static void write_tile_obu_size(AV1_COMP *const cpi, uint8_t *const dst,
+ struct aom_write_bit_buffer *saved_wb,
+ int largest_tile_id, uint32_t *const total_size,
+ unsigned int max_tile_size,
+ uint32_t obu_header_size,
+ uint8_t *tile_data_start) {
+ const CommonTileParams *const tiles = &cpi->common.tiles;
+
+ // Fill in context_update_tile_id, indicating the tile to use for the
+ // CDF update. The encoder currently sets it to the largest tile
+ // (but this is up to the encoder).
+ aom_wb_overwrite_literal(saved_wb, largest_tile_id,
+ (tiles->log2_cols + tiles->log2_rows));
+ // If there is more than one tile group, tile_size_bytes takes the default
+ // value 4 and does not need to be set. For a single tile group it is set
+ // in the section below.
+ if (cpi->num_tg != 1) return;
+ int tile_size_bytes = 4, unused;
+ const uint32_t tile_data_offset = (uint32_t)(tile_data_start - dst);
+ const uint32_t tile_data_size = *total_size - tile_data_offset;
+
+ *total_size = remux_tiles(tiles, tile_data_start, tile_data_size,
+ max_tile_size, 0, &tile_size_bytes, &unused);
+ *total_size += tile_data_offset;
+ assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+
+ aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
+
+ // Update the OBU length if remux_tiles() reduced the size.
+ uint64_t payload_size;
+ size_t length_field_size;
+ int res =
+ aom_uleb_decode(dst + obu_header_size, *total_size - obu_header_size,
+ &payload_size, &length_field_size);
+ assert(res == 0);
+ (void)res;
+
+ const uint64_t new_payload_size =
+ *total_size - obu_header_size - length_field_size;
+ if (new_payload_size != payload_size) {
+ size_t new_length_field_size;
+ res = aom_uleb_encode(new_payload_size, length_field_size,
+ dst + obu_header_size, &new_length_field_size);
+ assert(res == 0);
+ if (new_length_field_size < length_field_size) {
+ const size_t src_offset = obu_header_size + length_field_size;
+ const size_t dst_offset = obu_header_size + new_length_field_size;
+ memmove(dst + dst_offset, dst + src_offset, (size_t)payload_size);
+ *total_size -= (int)(length_field_size - new_length_field_size);
+ }
+ }
+}
+
+// As per the experiments, single-threaded bitstream packing is better for
+// frames with a smaller bitstream size, because the setup overhead of the
+// multithreaded path exceeds the time required to pack such small
+// bitstreams. This function computes the required number of workers based
+// on the setup-time and job-dispatch-time overheads for the given tiles and
+// available workers.
+static int calc_pack_bs_mt_workers(const TileDataEnc *tile_data, int num_tiles,
+ int avail_workers, bool pack_bs_mt_enabled) {
+ if (!pack_bs_mt_enabled) return 1;
+
+ uint64_t frame_abs_sum_level = 0;
+
+ for (int idx = 0; idx < num_tiles; idx++)
+ frame_abs_sum_level += tile_data[idx].abs_sum_level;
+
+ int ideal_num_workers = 1;
+ const float job_disp_time_const = (float)num_tiles * JOB_DISP_TIME_OH_CONST;
+ float max_sum = 0.0;
+
+ for (int num_workers = avail_workers; num_workers > 1; num_workers--) {
+ const float fas_per_worker_const =
+ ((float)(num_workers - 1) / num_workers) * frame_abs_sum_level;
+ const float setup_time_const = (float)num_workers * SETUP_TIME_OH_CONST;
+ const float this_sum = fas_per_worker_const - setup_time_const -
+ job_disp_time_const / num_workers;
+
+ if (this_sum > max_sum) {
+ max_sum = this_sum;
+ ideal_num_workers = num_workers;
+ }
+ }
+ return ideal_num_workers;
+}
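+
+// Worked form of the heuristic above (a restatement, not upstream text):
+// each candidate worker count w is scored as
+//
+//   score(w) = frame_abs_sum_level * (w - 1) / w   // parallelizable work
+//              - w * SETUP_TIME_OH_CONST           // thread setup cost
+//              - num_tiles * JOB_DISP_TIME_OH_CONST / w // dispatch cost
+//
+// and the w with the largest positive score wins; if no score beats zero,
+// a single worker is used, which is the small-frame case described above.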
+
+static INLINE uint32_t pack_tiles_in_tg_obus(
+ AV1_COMP *const cpi, uint8_t *const dst,
+ struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header,
+ const FrameHeaderInfo *fh_info, int *const largest_tile_id) {
+ const CommonTileParams *const tiles = &cpi->common.tiles;
+ uint32_t total_size = 0;
+ unsigned int max_tile_size = 0;
+ uint32_t obu_header_size = 0;
+ uint8_t *tile_data_start = dst;
+ const int tile_cols = tiles->cols;
+ const int tile_rows = tiles->rows;
+ const int num_tiles = tile_rows * tile_cols;
+
+ const int num_workers = calc_pack_bs_mt_workers(
+ cpi->tile_data, num_tiles, cpi->mt_info.num_mod_workers[MOD_PACK_BS],
+ cpi->mt_info.pack_bs_mt_enabled);
+
+ if (num_workers > 1) {
+ av1_write_tile_obu_mt(cpi, dst, &total_size, saved_wb, obu_extension_header,
+ fh_info, largest_tile_id, &max_tile_size,
+ &obu_header_size, &tile_data_start, num_workers);
+ } else {
+ write_tile_obu(cpi, dst, &total_size, saved_wb, obu_extension_header,
+ fh_info, largest_tile_id, &max_tile_size, &obu_header_size,
+ &tile_data_start);
+ }
+
+ if (num_tiles > 1)
+ write_tile_obu_size(cpi, dst, saved_wb, *largest_tile_id, &total_size,
+ max_tile_size, obu_header_size, tile_data_start);
+ return total_size;
+}
+
+static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
+ struct aom_write_bit_buffer *saved_wb,
+ uint8_t obu_extension_header,
+ const FrameHeaderInfo *fh_info,
+ int *const largest_tile_id) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+ *largest_tile_id = 0;
+
+ // Select the coding strategy (temporal or spatial)
+ if (cm->seg.enabled && cm->seg.update_map) {
+ if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) {
+ cm->seg.temporal_update = 0;
+ } else {
+ cm->seg.temporal_update = 1;
+ if (cpi->td.rd_counts.seg_tmp_pred_cost[0] <
+ cpi->td.rd_counts.seg_tmp_pred_cost[1])
+ cm->seg.temporal_update = 0;
+ }
+ }
+
+ if (tiles->large_scale)
+ return pack_large_scale_tiles_in_tg_obus(cpi, dst, saved_wb,
+ largest_tile_id);
+
+ return pack_tiles_in_tg_obus(cpi, dst, saved_wb, obu_extension_header,
+ fh_info, largest_tile_id);
+}
+
+static size_t av1_write_metadata_obu(const aom_metadata_t *metadata,
+ uint8_t *const dst) {
+ size_t coded_metadata_size = 0;
+ const uint64_t metadata_type = (uint64_t)metadata->type;
+ if (aom_uleb_encode(metadata_type, sizeof(metadata_type), dst,
+ &coded_metadata_size) != 0) {
+ return 0;
+ }
+ memcpy(dst + coded_metadata_size, metadata->payload, metadata->sz);
+ // Add trailing bits.
+ dst[coded_metadata_size + metadata->sz] = 0x80;
+ return (uint32_t)(coded_metadata_size + metadata->sz + 1);
+}
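+
+// Illustrative sketch (hypothetical metadata, not upstream code): with a
+// type value that leb128-encodes in one byte and a 3-byte payload, the
+// payload written above is 5 bytes: [type(1)][payload(3)][trailing 0x80].
+#if 0
+static void metadata_obu_layout_example(void) {
+  uint8_t dst[8];
+  uint8_t payload[3] = { 'a', 'b', 'c' };
+  aom_metadata_t md = { /*type=*/1, payload, /*sz=*/3, AOM_MIF_ANY_FRAME };
+  assert(av1_write_metadata_obu(&md, dst) == 5);
+  assert(dst[0] == 1 && dst[4] == 0x80);
+}
+#endif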
+
+static size_t av1_write_metadata_array(AV1_COMP *const cpi, uint8_t *dst) {
+ if (!cpi->source) return 0;
+ AV1_COMMON *const cm = &cpi->common;
+ aom_metadata_array_t *arr = cpi->source->metadata;
+ if (!arr) return 0;
+ size_t obu_header_size = 0;
+ size_t obu_payload_size = 0;
+ size_t total_bytes_written = 0;
+ size_t length_field_size = 0;
+ for (size_t i = 0; i < arr->sz; i++) {
+ aom_metadata_t *current_metadata = arr->metadata_array[i];
+ if (current_metadata && current_metadata->payload) {
+ if ((cm->current_frame.frame_type == KEY_FRAME &&
+ current_metadata->insert_flag == AOM_MIF_KEY_FRAME) ||
+ (cm->current_frame.frame_type != KEY_FRAME &&
+ current_metadata->insert_flag == AOM_MIF_NON_KEY_FRAME) ||
+ current_metadata->insert_flag == AOM_MIF_ANY_FRAME) {
+ obu_header_size = av1_write_obu_header(&cpi->ppi->level_params,
+ &cpi->frame_header_count,
+ OBU_METADATA, 0, dst);
+ obu_payload_size =
+ av1_write_metadata_obu(current_metadata, dst + obu_header_size);
+ length_field_size =
+ av1_obu_memmove(obu_header_size, obu_payload_size, dst);
+ if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, dst) ==
+ AOM_CODEC_OK) {
+ const size_t obu_size = obu_header_size + obu_payload_size;
+ dst += obu_size + length_field_size;
+ total_bytes_written += obu_size + length_field_size;
+ } else {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Error writing metadata OBU size");
+ }
+ }
+ }
+ }
+ return total_bytes_written;
+}
+
+int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
+ int *const largest_tile_id) {
+ uint8_t *data = dst;
+ uint32_t data_size;
+ AV1_COMMON *const cm = &cpi->common;
+ AV1LevelParams *const level_params = &cpi->ppi->level_params;
+ uint32_t obu_header_size = 0;
+ uint32_t obu_payload_size = 0;
+ FrameHeaderInfo fh_info = { NULL, 0, 0 };
+ const uint8_t obu_extension_header =
+ cm->temporal_layer_id << 5 | cm->spatial_layer_id << 3 | 0;
+
+ // If no non-zero delta_q has been used, reset delta_q_present_flag
+ if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) {
+ cm->delta_q_info.delta_q_present_flag = 0;
+ }
+
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_reset_write();
+#endif
+
+ cpi->frame_header_count = 0;
+
+ // The TD is now written outside the frame encode loop
+
+ // Write the sequence header OBU at each key frame or intra-only frame,
+ // preceded by its uleb128-encoded size.
+ if (cm->current_frame.frame_type == INTRA_ONLY_FRAME ||
+ cm->current_frame.frame_type == KEY_FRAME) {
+ obu_header_size = av1_write_obu_header(
+ level_params, &cpi->frame_header_count, OBU_SEQUENCE_HEADER, 0, data);
+ obu_payload_size =
+ av1_write_sequence_header_obu(cm->seq_params, data + obu_header_size);
+ const size_t length_field_size =
+ av1_obu_memmove(obu_header_size, obu_payload_size, data);
+ if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+ data += obu_header_size + obu_payload_size + length_field_size;
+ }
+
+ // write metadata obus before the frame obu that has the show_frame flag set
+ if (cm->show_frame) data += av1_write_metadata_array(cpi, data);
+
+ const int write_frame_header =
+ (cpi->num_tg > 1 || encode_show_existing_frame(cm));
+ struct aom_write_bit_buffer saved_wb = { NULL, 0 };
+ size_t length_field = 0;
+ if (write_frame_header) {
+ // Write Frame Header OBU.
+ fh_info.frame_header = data;
+ obu_header_size =
+ av1_write_obu_header(level_params, &cpi->frame_header_count,
+ OBU_FRAME_HEADER, obu_extension_header, data);
+ obu_payload_size = write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, &saved_wb,
+ data + obu_header_size, 1);
+
+ length_field = av1_obu_memmove(obu_header_size, obu_payload_size, data);
+ if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+ fh_info.obu_header_byte_offset = 0;
+ fh_info.total_length = obu_header_size + obu_payload_size + length_field;
+ data += fh_info.total_length;
+ }
+
+ if (encode_show_existing_frame(cm)) {
+ data_size = 0;
+ } else {
+ // Since length_field is determined adaptively after frame header
+ // encoding, saved_wb must be adjusted accordingly.
+ if (saved_wb.bit_buffer != NULL) {
+ saved_wb.bit_buffer += length_field;
+ }
+
+ // Each tile group OBU will be preceded by the uleb128-encoded size of
+ // the tile group OBU.
+ data_size = write_tiles_in_tg_obus(
+ cpi, data, &saved_wb, obu_extension_header, &fh_info, largest_tile_id);
+ }
+ data += data_size;
+ *size = data - dst;
+ return AOM_CODEC_OK;
+}
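+
+// Summary sketch (a restatement, not upstream text) of the OBU ordering
+// produced by av1_pack_bitstream() for a key frame with several tile groups:
+//
+//   [OBU_SEQUENCE_HEADER][OBU_METADATA ...][OBU_FRAME_HEADER]
+//   [OBU_TILE_GROUP] ... [OBU_TILE_GROUP]
+//
+// With a single tile group the frame header and tile data are folded into
+// one OBU_FRAME; for show_existing_frame only the frame header is emitted.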
diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h
new file mode 100644
index 0000000000..12e8a630db
--- /dev/null
+++ b/third_party/aom/av1/encoder/bitstream.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_BITSTREAM_H_
+#define AOM_AV1_ENCODER_BITSTREAM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/level.h"
+#include "aom_dsp/bitwriter.h"
+
+struct aom_write_bit_buffer;
+struct AV1_COMP;
+struct ThreadData;
+
+/*!\cond */
+
+// Stores the location and size of a tile's data in the bitstream. Used to
+// identify identical tiles later.
+typedef struct {
+ uint8_t *data;
+ size_t size;
+} TileBufferEnc;
+
+typedef struct {
+ uint8_t *frame_header;
+ size_t obu_header_byte_offset;
+ size_t total_length;
+} FrameHeaderInfo;
+
+typedef struct {
+ struct aom_write_bit_buffer *saved_wb; // Bit stream buffer writer structure
+ TileBufferEnc buf; // Structure to hold bitstream buffer and size
+ uint32_t *total_size; // Size of the bitstream buffer for the tile in bytes
+ uint8_t *dst; // Base address of tile bitstream buffer
+ uint8_t *tile_data_curr; // Base address of tile-group bitstream buffer
+ size_t tile_buf_size; // Available bitstream buffer for the tile in bytes
+ uint8_t obu_extn_header; // Presence of OBU extension header
+ uint32_t obu_header_size; // Size of the OBU header
+ int curr_tg_hdr_size; // Size of the obu, tg, frame headers
+ int tile_size_mi; // Tile size in mi units
+ int tile_row; // Row index of the tile being packed
+ int tile_col; // Column index of the tile being packed
+ int is_last_tile_in_tg; // Flag to indicate last tile in a tile-group
+ int new_tg; // Flag to indicate starting of a new tile-group
+} PackBSParams;
+
+typedef struct {
+ uint64_t abs_sum_level;
+ uint16_t tile_idx;
+} PackBSTileOrder;
+
+// Pack bitstream data for pack bitstream multi-threading.
+typedef struct {
+#if CONFIG_MULTITHREAD
+ // Mutex lock used while dispatching jobs.
+ pthread_mutex_t *mutex_;
+#endif
+ // Tile order structure of pack bitstream multithreading.
+ PackBSTileOrder pack_bs_tile_order[MAX_TILES];
+
+ // Index of next job to be processed.
+ int next_job_idx;
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool pack_bs_mt_exit;
+} AV1EncPackBSSync;
+
+/*!\endcond */
+
+// Writes only the OBU Sequence Header payload, and returns the size of the
+// payload written to 'dst'. This function does not write the OBU header, the
+// optional extension, or the OBU size to 'dst'.
+uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params,
+ uint8_t *const dst);
+
+// Writes the OBU header byte, and the OBU header extension byte when
+// 'obu_extension' is non-zero. Returns number of bytes written to 'dst'.
+uint32_t av1_write_obu_header(AV1LevelParams *const level_params,
+ int *frame_header_count, OBU_TYPE obu_type,
+ int obu_extension, uint8_t *const dst);
+
+int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size,
+ uint8_t *dest);
+
+// Pack tile data in the bitstream with tile_group, frame
+// and OBU header.
+void av1_pack_tile_info(struct AV1_COMP *const cpi, struct ThreadData *const td,
+ PackBSParams *const pack_bs_params);
+
+void av1_write_last_tile_info(
+ struct AV1_COMP *const cpi, const FrameHeaderInfo *fh_info,
+ struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size,
+ uint8_t *curr_tg_start, uint32_t *const total_size,
+ uint8_t **tile_data_start, int *const largest_tile_id,
+ int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header);
+
+/*!\brief Pack the bitstream for one frame
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ */
+int av1_pack_bitstream(struct AV1_COMP *const cpi, uint8_t *dst, size_t *size,
+ int *const largest_tile_id);
+
+void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
+ TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w);
+
+void av1_reset_pack_bs_thread_data(struct ThreadData *const td);
+
+void av1_accumulate_pack_bs_thread_data(struct AV1_COMP *const cpi,
+ struct ThreadData const *td);
+
+void av1_write_obu_tg_tile_headers(struct AV1_COMP *const cpi,
+ MACROBLOCKD *const xd,
+ PackBSParams *const pack_bs_params,
+ const int tile_idx);
+
+int av1_neg_interleave(int x, int ref, int max);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_BITSTREAM_H_
diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h
new file mode 100644
index 0000000000..33d2d8c2a0
--- /dev/null
+++ b/third_party/aom/av1/encoder/block.h
@@ -0,0 +1,1515 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*! \file
+ * Declares various structs used to encode the current partition block.
+ */
+#ifndef AOM_AV1_ENCODER_BLOCK_H_
+#define AOM_AV1_ENCODER_BLOCK_H_
+
+#include "av1/common/blockd.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/entropy.h"
+#include "av1/common/enums.h"
+#include "av1/common/mvref_common.h"
+
+#include "av1/encoder/enc_enums.h"
+#include "av1/encoder/mcomp_structs.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/partition_cnn_weights.h"
+#endif
+
+#include "av1/encoder/hash_motion.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//! Minimum linear dimension of a tpl block
+#define MIN_TPL_BSIZE_1D 16
+//! Maximum number of tpl block in a super block
+#define MAX_TPL_BLK_IN_SB (MAX_SB_SIZE / MIN_TPL_BSIZE_1D)
+//! Number of txfm hash records kept for the partition block.
+#define RD_RECORD_BUFFER_LEN 8
+
+/*! Maximum value taken by transform type probabilities */
+#define MAX_TX_TYPE_PROB 1024
+
+//! Compute color sensitivity index for given plane
+#define COLOR_SENS_IDX(plane) ((plane)-1)
+
+//! Enable timer statistics of mode search in non-rd
+#define COLLECT_NONRD_PICK_MODE_STAT 0
+
+/*!\cond */
+#if COLLECT_NONRD_PICK_MODE_STAT
+#include "aom_ports/aom_timer.h"
+
+typedef struct _mode_search_stat_nonrd {
+ int32_t num_blocks[BLOCK_SIZES];
+ int64_t total_block_times[BLOCK_SIZES];
+ int32_t num_searches[BLOCK_SIZES][MB_MODE_COUNT];
+ int32_t num_nonskipped_searches[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t search_times[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t nonskipped_search_times[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t ms_time[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t ifs_time[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t model_rd_time[BLOCK_SIZES][MB_MODE_COUNT];
+ int64_t txfm_time[BLOCK_SIZES][MB_MODE_COUNT];
+ struct aom_usec_timer timer1;
+ struct aom_usec_timer timer2;
+ struct aom_usec_timer bsize_timer;
+} mode_search_stat_nonrd;
+#endif // COLLECT_NONRD_PICK_MODE_STAT
+/*!\endcond */
+
+/*! \brief Superblock level encoder info
+ *
+ * SuperblockEnc stores superblock level information used by the encoder for
+ * more efficient encoding. Currently this is mostly used to store TPL data
+ * for the current superblock.
+ */
+typedef struct {
+ //! Minimum partition size for the sb.
+ BLOCK_SIZE min_partition_size;
+ //! Maximum partition size for the sb.
+ BLOCK_SIZE max_partition_size;
+
+ /*****************************************************************************
+ * \name TPL Info
+ *
+ * Information gathered from tpl_model at tpl block precision for the
+ * superblock to speed up the encoding process.
+ ****************************************************************************/
+ /**@{*/
+ //! Number of TPL blocks in this superblock.
+ int tpl_data_count;
+ //! TPL's estimate of inter cost for each tpl block.
+ int64_t tpl_inter_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB];
+ //! TPL's estimate of intra cost for each tpl block.
+ int64_t tpl_intra_cost[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB];
+ //! Motion vectors found by TPL model for each tpl block.
+ int_mv tpl_mv[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB][INTER_REFS_PER_FRAME];
+ //! TPL's stride for the arrays in this struct.
+ int tpl_stride;
+ /**@}*/
+} SuperBlockEnc;
+
+/*! \brief Stores the best performing modes.
+ */
+typedef struct {
+ //! The mbmi used to reconstruct the winner mode.
+ MB_MODE_INFO mbmi;
+ //! Rdstats of the winner mode.
+ RD_STATS rd_cost;
+ //! Rdcost of the winner mode
+ int64_t rd;
+ //! Luma rate of the winner mode.
+ int rate_y;
+ //! Chroma rate of the winner mode.
+ int rate_uv;
+ //! The color map needed to reconstruct palette mode.
+ uint8_t color_index_map[MAX_SB_SQUARE];
+ //! The current winner mode.
+ THR_MODES mode_index;
+} WinnerModeStats;
+
+/*! \brief Each source plane of the current macroblock
+ *
+ * This struct also stores the txfm buffers and quantizer settings.
+ */
+typedef struct macroblock_plane {
+ //! Stores source - pred so the txfm can be computed later
+ int16_t *src_diff;
+ //! Dequantized coefficients
+ tran_low_t *dqcoeff;
+ //! Quantized coefficients
+ tran_low_t *qcoeff;
+ //! Transformed coefficients
+ tran_low_t *coeff;
+ //! Location of the end of qcoeff (end of block).
+ uint16_t *eobs;
+ //! Contexts used to code the transform coefficients.
+ uint8_t *txb_entropy_ctx;
+ //! A buffer containing the source frame.
+ struct buf_2d src;
+
+ /*! \name Quantizer Settings
+ *
+ * \attention These are used/accessed only in the quantization process.
+ * RDO does not and *must not* depend on any of these values.
+ * All values below share the coefficient scale/shift used in TX.
+ */
+ /**@{*/
+ //! Quantization step size used by AV1_XFORM_QUANT_FP.
+ const int16_t *quant_fp_QTX;
+ //! Offset used for rounding in the quantizer process by AV1_XFORM_QUANT_FP.
+ const int16_t *round_fp_QTX;
+ //! Quantization step size used by AV1_XFORM_QUANT_B.
+ const int16_t *quant_QTX;
+ //! Offset used for rounding in the quantizer process by AV1_XFORM_QUANT_B.
+ const int16_t *round_QTX;
+ //! Scale factor to shift coefficients toward zero. Only used by QUANT_B.
+ const int16_t *quant_shift_QTX;
+ //! Size of the quantization bin around 0. Only Used by QUANT_B
+ const int16_t *zbin_QTX;
+ //! Dequantizer
+ const int16_t *dequant_QTX;
+ /**@}*/
+} MACROBLOCK_PLANE;
+
+/*! \brief Costs for encoding the coefficients within a level.
+ *
+ * Covers everything including txb_skip, eob, dc_sign,
+ */
+typedef struct {
+ //! Cost to skip txfm for the current txfm block.
+ int txb_skip_cost[TXB_SKIP_CONTEXTS][2];
+ /*! \brief Cost for encoding the base_eob of a level.
+ *
+ * Decoder uses base_eob to derive the base level as base_level := base_eob + 1.
+ */
+ int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3];
+ /*! \brief Cost for encoding the base level of a coefficient.
+ *
+ * The coded symbol gives min(level, 3) directly; a value of 3 is extended
+ * by the coefficient range symbols (see lps_cost).
+ */
+ int base_cost[SIG_COEF_CONTEXTS][8];
+ /*! \brief Cost for encoding the last non-zero coefficient.
+ *
+ * Eob is derived from eob_extra at the decoder as eob := eob_extra + 1
+ */
+ int eob_extra_cost[EOB_COEF_CONTEXTS][2];
+ //! Cost for encoding the dc_sign
+ int dc_sign_cost[DC_SIGN_CONTEXTS][2];
+ //! Cost for encoding an increment to the coefficient
+ int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1];
+} LV_MAP_COEFF_COST;
+
+/*! \brief Costs for encoding the eob.
+ */
+typedef struct {
+ //! eob_cost.
+ int eob_cost[2][11];
+} LV_MAP_EOB_COST;
+
+/*! \brief Stores the transforms coefficients for the whole superblock.
+ */
+typedef struct {
+ //! The transformed coefficients.
+ tran_low_t *tcoeff[MAX_MB_PLANE];
+ //! Where the transformed coefficients end.
+ uint16_t *eobs[MAX_MB_PLANE];
+ /*! \brief Transform block entropy contexts.
+ *
+ * Each element is used as a bit field.
+ * - Bits 0~3: txb_skip_ctx
+ * - Bits 4~5: dc_sign_ctx.
+ */
+ uint8_t *entropy_ctx[MAX_MB_PLANE];
+} CB_COEFF_BUFFER;
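+
+/* Illustrative sketch (hypothetical helpers, not upstream code): packing and
+ * unpacking the entropy_ctx bit field documented above.
+ */
+#if 0
+static INLINE uint8_t pack_entropy_ctx(int txb_skip_ctx, int dc_sign_ctx) {
+  return (uint8_t)((txb_skip_ctx & 0x0F) | ((dc_sign_ctx & 0x03) << 4));
+}
+static INLINE int get_txb_skip_ctx(uint8_t ctx) { return ctx & 0x0F; }
+static INLINE int get_dc_sign_ctx(uint8_t ctx) { return (ctx >> 4) & 0x03; }
+#endif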
+
+/*! \brief Extended mode info derived from mbmi.
+ */
+typedef struct {
+ // TODO(angiebird): Reduce the buffer size according to sb_type
+ //! The reference mv list for the current block.
+ CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE];
+ //! The weights used to compute the ref mvs.
+ uint16_t weight[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE];
+ //! Number of ref mvs in the drl.
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+ //! Global mvs
+ int_mv global_mvs[REF_FRAMES];
+ //! Context used to encode the current mode.
+ int16_t mode_context[MODE_CTX_REF_FRAMES];
+} MB_MODE_INFO_EXT;
+
+/*! \brief Stores best extended mode information at frame level.
+ *
+ * The frame level in here is used in bitstream preparation stage. The
+ * information in \ref MB_MODE_INFO_EXT are copied to this struct to save
+ * memory.
+ */
+typedef struct {
+ //! \copydoc MB_MODE_INFO_EXT::ref_mv_stack
+ CANDIDATE_MV ref_mv_stack[USABLE_REF_MV_STACK_SIZE];
+ //! \copydoc MB_MODE_INFO_EXT::weight
+ uint16_t weight[USABLE_REF_MV_STACK_SIZE];
+ //! \copydoc MB_MODE_INFO_EXT::ref_mv_count
+ uint8_t ref_mv_count;
+ // TODO(Ravi/Remya): Reduce the buffer size of global_mvs
+ //! \copydoc MB_MODE_INFO_EXT::global_mvs
+ int_mv global_mvs[REF_FRAMES];
+ //! \copydoc MB_MODE_INFO_EXT::mode_context
+ int16_t mode_context;
+ //! Offset of current coding block's coeff buffer relative to the sb.
+ uint16_t cb_offset[PLANE_TYPES];
+} MB_MODE_INFO_EXT_FRAME;
+
+/*! \brief Inter-mode txfm results for a partition block.
+ */
+typedef struct {
+ //! Txfm size used if the current mode is intra mode.
+ TX_SIZE tx_size;
+ //! Txfm sizes used if the current mode is inter mode.
+ TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+ //! Map showing which txfm block skips the txfm process.
+ uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ //! Map showing the txfm types for each block.
+ uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ //! Rd_stats for the whole partition block.
+ RD_STATS rd_stats;
+ //! Hash value of the current record.
+ uint32_t hash_value;
+} MB_RD_INFO;
+
+/*! \brief Hash records of the inter-mode transform results
+ *
+ * Hash records of the inter-mode transform results for a whole partition block
+ * based on the residue. Since this operates on the partition block level, this
+ * can give us a whole txfm partition tree.
+ */
+typedef struct {
+ /*! Circular buffer that stores the inter-mode txfm results of a partition
+ * block.
+ */
+ MB_RD_INFO mb_rd_info[RD_RECORD_BUFFER_LEN];
+ //! Index to insert the newest rd record.
+ int index_start;
+ //! Number of info stored in this record.
+ int num;
+ //! Hash function
+ CRC32C crc_calculator;
+} MB_RD_RECORD;
+
+//! Number of compound rd stats
+#define MAX_COMP_RD_STATS 64
+/*! \brief Rdcost stats in compound mode.
+ */
+typedef struct {
+ //! Rate of the compound modes.
+ int32_t rate[COMPOUND_TYPES];
+ //! Distortion of the compound modes.
+ int64_t dist[COMPOUND_TYPES];
+ //! Estimated rate of the compound modes.
+ int32_t model_rate[COMPOUND_TYPES];
+ //! Estimated distortion of the compound modes.
+ int64_t model_dist[COMPOUND_TYPES];
+ //! Rate need to send the mask type.
+ int comp_rs2[COMPOUND_TYPES];
+ //! Motion vector for each predictor.
+ int_mv mv[2];
+ //! Ref frame for each predictor.
+ MV_REFERENCE_FRAME ref_frames[2];
+ //! Current prediction mode.
+ PREDICTION_MODE mode;
+ //! Current interpolation filter.
+ int_interpfilters filter;
+ //! Refmv index in the drl.
+ int ref_mv_idx;
+ //! Whether the predictors are GLOBALMV.
+ int is_global[2];
+ //! Current parameters for interinter mode.
+ INTERINTER_COMPOUND_DATA interinter_comp;
+} COMP_RD_STATS;
+
+/*! \brief Contains buffers used to speed up rdopt for obmc.
+ *
+ * See the comments for calc_target_weighted_pred for details.
+ */
+typedef struct {
+ /*! \brief A new source weighted with the above and left predictors.
+ *
+ * Used to efficiently construct multiple obmc predictors during rdopt.
+ */
+ int32_t *wsrc;
+ /*! \brief A new mask constructed from the original horz/vert mask.
+ *
+ * \copydetails wsrc
+ */
+ int32_t *mask;
+ /*! \brief Prediction from the up predictor.
+ *
+ * Used to build the obmc predictor.
+ */
+ uint8_t *above_pred;
+ /*! \brief Prediction from the up predictor.
+ *
+ * \copydetails above_pred
+ */
+ uint8_t *left_pred;
+} OBMCBuffer;
+
+/*! \brief Contains color maps used in palette mode.
+ */
+typedef struct {
+ //! The best color map found.
+ uint8_t best_palette_color_map[MAX_PALETTE_SQUARE];
+ //! A temporary buffer used for k-means clustering.
+ int16_t kmeans_data_buf[2 * MAX_PALETTE_SQUARE];
+} PALETTE_BUFFER;
+
+/*! \brief Contains buffers used by av1_compound_type_rd()
+ *
+ * For sizes and alignment of these arrays, refer to
+ * alloc_compound_type_rd_buffers() function.
+ */
+typedef struct {
+ //! First prediction.
+ uint8_t *pred0;
+ //! Second prediction.
+ uint8_t *pred1;
+ //! Source - first prediction.
+ int16_t *residual1;
+ //! Second prediction - first prediction.
+ int16_t *diff10;
+ //! Backup of the best segmentation mask.
+ uint8_t *tmp_best_mask_buf;
+} CompoundTypeRdBuffers;
+
+/*! \brief Holds some parameters related to partitioning schemes in AV1.
+ */
+// TODO(chiyotsai@google.com): Consolidate this with SIMPLE_MOTION_DATA_TREE
+typedef struct {
+#if !CONFIG_REALTIME_ONLY
+ // The following 4 parameters are used for cnn-based partitioning on intra
+ // frame.
+ /*! \brief Current index on the partition block quad tree.
+ *
+ * Used to index into the cnn buffer for partition decision.
+ */
+ int quad_tree_idx;
+ //! Whether the CNN buffer contains valid output.
+ int cnn_output_valid;
+ //! A buffer used by our segmentation CNN for intra-frame partitioning.
+ float cnn_buffer[CNN_OUT_BUF_SIZE];
+  //! Log of the quantization parameter of the ancestor BLOCK_64X64.
+ float log_q;
+#endif
+
+ /*! \brief Variance of the subblocks in the superblock.
+ *
+ * This is used by rt mode for variance based partitioning.
+   * The indices correspond to the following block sizes:
+ * - 0 - 128x128
+ * - 1-2 - 128x64
+ * - 3-4 - 64x128
+ * - 5-8 - 64x64
+ * - 9-16 - 64x32
+ * - 17-24 - 32x64
+ * - 25-40 - 32x32
+ * - 41-104 - 16x16
+ */
+ uint8_t variance_low[105];
+} PartitionSearchInfo;
+
+/*!\cond */
+enum {
+ /**
+ * Do not prune transform depths.
+ */
+ TX_PRUNE_NONE = 0,
+ /**
+ * Prune largest transform (depth 0) based on NN model.
+ */
+ TX_PRUNE_LARGEST = 1,
+ /**
+ * Prune split transforms (depth>=1) based on NN model.
+ */
+ TX_PRUNE_SPLIT = 2,
+} UENUM1BYTE(TX_PRUNE_TYPE);
+/*!\endcond */
+
+/*! \brief Defines the parameters used to perform txfm search.
+ *
+ * For the most part, this determines how various speed features are used.
+ */
+typedef struct {
+ /*! \brief Whether to limit the intra txfm search type to the default txfm.
+ *
+ * This could either be a result of either sequence parameter or speed
+ * features.
+ */
+ int use_default_intra_tx_type;
+
+  /*! Probability threshold used for conditionally forcing tx type. */
+ int default_inter_tx_type_prob_thresh;
+
+ //! Whether to prune 2d transforms based on 1d transform results.
+ int prune_2d_txfm_mode;
+
+ /*! \brief Variable from \ref WinnerModeParams based on current eval mode.
+ *
+ * See the documentation for \ref WinnerModeParams for more detail.
+ */
+ unsigned int coeff_opt_thresholds[2];
+ /*! \copydoc coeff_opt_thresholds */
+ unsigned int tx_domain_dist_threshold;
+ /*! \copydoc coeff_opt_thresholds */
+ TX_SIZE_SEARCH_METHOD tx_size_search_method;
+ /*! \copydoc coeff_opt_thresholds */
+ unsigned int use_transform_domain_distortion;
+ /*! \copydoc coeff_opt_thresholds */
+ unsigned int skip_txfm_level;
+
+ /*! \brief How to search for the optimal tx_size
+ *
+ * If ONLY_4X4, use TX_4X4; if TX_MODE_LARGEST, use the largest tx_size for
+ * the current partition block; if TX_MODE_SELECT, search through the whole
+ * tree.
+ *
+ * \attention
+   * Although this looks suspiciously similar to a bitstream element, this
+ * tx_mode_search_type is only used internally by the encoder, and is *not*
+ * written to the bitstream. It determines what kind of tx_mode would be
+ * searched. For example, we might set it to TX_MODE_LARGEST to find a good
+ * candidate, then code it as TX_MODE_SELECT.
+ */
+ TX_MODE tx_mode_search_type;
+
+ /*!
+ * Determines whether a block can be predicted as transform skip or DC only
+ * based on residual mean and variance.
+ * Type 0 : No skip block or DC only block prediction
+ * Type 1 : Prediction of skip block based on residual mean and variance
+ * Type 2 : Prediction of skip block or DC only block based on residual mean
+ * and variance
+ */
+ unsigned int predict_dc_level;
+
+ /*!
+ * Whether or not we should use the quantization matrix as weights for PSNR
+ * during RD search.
+ */
+ int use_qm_dist_metric;
+
+ /*!
+ * Keep track of previous mode evaluation stage type. This will be used to
+ * reset mb rd hash record when mode evaluation type changes.
+ */
+ int mode_eval_type;
+
+#if !CONFIG_REALTIME_ONLY
+ //! Indicates the transform depths for which RD evaluation is skipped.
+ TX_PRUNE_TYPE nn_prune_depths_for_intra_tx;
+
+ /*! \brief Indicates if NN model should be invoked to prune transform depths.
+ *
+ * Used to signal whether NN model should be evaluated to prune the R-D
+ * evaluation of specific transform depths.
+ */
+ bool enable_nn_prune_intra_tx_depths;
+#endif
+} TxfmSearchParams;
+
+/*!\cond */
+#define MAX_NUM_8X8_TXBS ((MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1))
+#define MAX_NUM_16X16_TXBS ((MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2))
+#define MAX_NUM_32X32_TXBS ((MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3))
+#define MAX_NUM_64X64_TXBS ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4))
+/*!\endcond */
+
+/*! \brief Stores various encoding/search decisions related to txfm search.
+ *
+ * This struct contains a cache of previous txfm results, and some buffers for
+ * the current txfm decision.
+ */
+typedef struct {
+ //! Whether to skip transform and quantization on a partition block level.
+ uint8_t skip_txfm;
+
+ /*! \brief Whether to skip transform and quantization on a txfm block level.
+ *
+ * Skips transform and quantization on a transform block level inside the
+ * current partition block. Each element of this array is used as a bit-field.
+   * So, for example, if we are skipping on the luma plane, then the last bit
+   * would be set to 1.
+ */
+ uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+ /*! \brief Transform types inside the partition block
+ *
+ * Keeps a record of what kind of transform to use for each of the transform
+ * block inside the partition block.
+ * \attention The buffer here is *never* directly used. Instead, this just
+ * allocates the memory for MACROBLOCKD::tx_type_map during rdopt on the
+ * partition block. So if we need to save memory, we could move the allocation
+ * to pick_sb_mode instead.
+ */
+ uint8_t tx_type_map_[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+ //! Txfm hash records of inter-modes.
+ MB_RD_RECORD *mb_rd_record;
+
+ /*! \brief Number of txb splits.
+ *
+ * Keep track of how many times we've used split tx partition for transform
+ * blocks. Somewhat misleadingly, this parameter doesn't actually keep track
+   * of the count for the current block. Instead, it's a cumulative count
+   * across the whole frame. The main usage is that if txb_split_count is zero,
+   * then
+ * we can signal TX_MODE_LARGEST at frame level.
+ */
+ // TODO(chiyotsai@google.com): Move this to a more appropriate location such
+ // as ThreadData.
+ unsigned int txb_split_count;
+#if CONFIG_SPEED_STATS
+ //! For debugging. Used to check how many txfm searches we are doing.
+ unsigned int tx_search_count;
+#endif // CONFIG_SPEED_STATS
+} TxfmSearchInfo;
+#undef MAX_NUM_8X8_TXBS
+#undef MAX_NUM_16X16_TXBS
+#undef MAX_NUM_32X32_TXBS
+#undef MAX_NUM_64X64_TXBS
+
+/*! \brief Holds the entropy costs for various modes sent to the bitstream.
+ *
+ * \attention This does not include the costs for mv and transformed
+ * coefficients.
+ */
+typedef struct {
+ /*****************************************************************************
+ * \name Partition Costs
+ ****************************************************************************/
+ /**@{*/
+ //! Cost for coding the partition.
+ int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Intra Costs: General
+ ****************************************************************************/
+ /**@{*/
+ //! Luma mode cost for inter frame.
+ int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
+ //! Luma mode cost for intra frame.
+ int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+ //! Chroma mode cost
+ int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
+ //! filter_intra_cost
+ int filter_intra_cost[BLOCK_SIZES_ALL][2];
+ //! filter_intra_mode_cost
+ int filter_intra_mode_cost[FILTER_INTRA_MODES];
+ //! angle_delta_cost
+ int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
+
+  //! Rate associated with each alpha codeword.
+ int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Intra Costs: Screen Contents
+ ****************************************************************************/
+ /**@{*/
+ //! intrabc_cost
+ int intrabc_cost[2];
+
+ //! palette_y_size_cost
+ int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ //! palette_uv_size_cost
+ int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ //! palette_y_color_cost
+ int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ //! palette_uv_color_cost
+ int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ //! palette_y_mode_cost
+ int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
+ //! palette_uv_mode_cost
+ int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Inter Costs: MV Modes
+ ****************************************************************************/
+ /**@{*/
+ //! skip_mode_cost
+ int skip_mode_cost[SKIP_MODE_CONTEXTS][2];
+ //! newmv_mode_cost
+ int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
+ //! zeromv_mode_cost
+ int zeromv_mode_cost[GLOBALMV_MODE_CONTEXTS][2];
+ //! refmv_mode_cost
+ int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
+ //! drl_mode_cost0
+ int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Inter Costs: Ref Frame Types
+ ****************************************************************************/
+ /**@{*/
+ //! single_ref_cost
+ int single_ref_cost[REF_CONTEXTS][SINGLE_REFS - 1][2];
+ //! comp_inter_cost
+ int comp_inter_cost[COMP_INTER_CONTEXTS][2];
+ //! comp_ref_type_cost
+ int comp_ref_type_cost[COMP_REF_TYPE_CONTEXTS]
+ [CDF_SIZE(COMP_REFERENCE_TYPES)];
+ //! uni_comp_ref_cost
+ int uni_comp_ref_cost[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1]
+ [CDF_SIZE(2)];
+ /*! \brief Cost for signaling ref_frame[0] in bidir-comp mode
+ *
+ * Includes LAST_FRAME, LAST2_FRAME, LAST3_FRAME, and GOLDEN_FRAME.
+ */
+ int comp_ref_cost[REF_CONTEXTS][FWD_REFS - 1][2];
+ /*! \brief Cost for signaling ref_frame[1] in bidir-comp mode
+ *
+ * Includes ALTREF_FRAME, ALTREF2_FRAME, and BWDREF_FRAME.
+ */
+ int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Inter Costs: Compound Types
+ ****************************************************************************/
+ /**@{*/
+ //! intra_inter_cost
+ int intra_inter_cost[INTRA_INTER_CONTEXTS][2];
+ //! inter_compound_mode_cost
+ int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+ //! compound_type_cost
+ int compound_type_cost[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES];
+ //! wedge_idx_cost
+ int wedge_idx_cost[BLOCK_SIZES_ALL][16];
+ //! interintra_cost
+ int interintra_cost[BLOCK_SIZE_GROUPS][2];
+ //! wedge_interintra_cost
+ int wedge_interintra_cost[BLOCK_SIZES_ALL][2];
+ //! interintra_mode_cost
+ int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Inter Costs: Compound Masks
+ ****************************************************************************/
+ /**@{*/
+ //! comp_idx_cost
+ int comp_idx_cost[COMP_INDEX_CONTEXTS][2];
+ //! comp_group_idx_cost
+ int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Inter Costs: Motion Modes/Filters
+ ****************************************************************************/
+ /**@{*/
+ //! motion_mode_cost
+ int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES];
+ //! motion_mode_cost1
+ int motion_mode_cost1[BLOCK_SIZES_ALL][2];
+ //! switchable_interp_costs
+ int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Txfm Mode Costs
+ ****************************************************************************/
+ /**@{*/
+ //! skip_txfm_cost
+ int skip_txfm_cost[SKIP_CONTEXTS][2];
+ //! tx_size_cost
+ int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
+ //! txfm_partition_cost
+ int txfm_partition_cost[TXFM_PARTITION_CONTEXTS][2];
+ //! inter_tx_type_costs
+ int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+ //! intra_tx_type_costs
+ int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Restoration Mode Costs
+ ****************************************************************************/
+ /**@{*/
+ //! switchable_restore_cost
+ int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES];
+ //! wiener_restore_cost
+ int wiener_restore_cost[2];
+ //! sgrproj_restore_cost
+ int sgrproj_restore_cost[2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Segmentation Mode Costs
+ ****************************************************************************/
+ /**@{*/
+ //! tmp_pred_cost
+ int tmp_pred_cost[SEG_TEMPORAL_PRED_CTXS][2];
+ //! spatial_pred_cost
+ int spatial_pred_cost[SPATIAL_PREDICTION_PROBS][MAX_SEGMENTS];
+ /**@}*/
+} ModeCosts;
+
+/*! \brief Holds mv costs for encoding and motion search.
+ */
+typedef struct {
+ /*****************************************************************************
+ * \name Encoding Costs
+ * Here are the entropy costs needed to encode a given mv.
+   * \ref nmv_cost_alloc and \ref nmv_cost_hp_alloc are two arrays that hold
+   * the memory for the mv costs. But since the motion vectors can be
+   * negative, we shift the pointers to the middle of each array and store the
+   * results in \ref nmv_cost and \ref nmv_cost_hp for easier referencing (so,
+   * for example, nmv_cost[i][v] is valid for v in [-MV_MAX, MV_MAX]). Finally,
+   * \ref mv_cost_stack points to the \ref nmv_cost array with the mv precision
+   * we are currently working with. In essence, only \ref mv_cost_stack is
+   * needed for motion search; the others can be considered private.
+ ****************************************************************************/
+ /**@{*/
+ //! Costs for coding the zero components.
+ int nmv_joint_cost[MV_JOINTS];
+
+  //! Memory for the 1/4-pel motion vector costs.
+  int nmv_cost_alloc[2][MV_VALS];
+  //! Memory for the 1/8-pel motion vector costs.
+  int nmv_cost_hp_alloc[2][MV_VALS];
+ //! Points to the middle of \ref nmv_cost_alloc
+ int *nmv_cost[2];
+ //! Points to the middle of \ref nmv_cost_hp_alloc
+ int *nmv_cost_hp[2];
+  //! Points to the \ref nmv_cost or \ref nmv_cost_hp array in use.
+ int **mv_cost_stack;
+ /**@}*/
+} MvCosts;
+
+/*! \brief Holds mv costs for intrabc.
+ */
+typedef struct {
+ /*! Costs for coding the joint mv. */
+ int joint_mv[MV_JOINTS];
+
+ /*! \brief Cost of transmitting the actual motion vector.
+   * dv_costs_alloc[0][i] is the cost of a motion vector with vertical
+   * component (mv_row) equal to i - MV_MAX. dv_costs_alloc[1][i] is the cost
+   * of a motion vector with horizontal component (mv_col) equal to i - MV_MAX.
+ */
+ int dv_costs_alloc[2][MV_VALS];
+
+ /*! Points to the middle of \ref dv_costs_alloc. */
+ int *dv_costs[2];
+} IntraBCMVCosts;
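+
+/* Illustrative use (not part of libaom): since \ref dv_costs points to the
+ * middle of \ref dv_costs_alloc, the cost of a dv component v in
+ * [-MV_MAX, MV_MAX] is simply dv_costs[0][v] for the row component and
+ * dv_costs[1][v] for the column component. */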
+
+/*! \brief Holds the costs needed to encode the coefficients
+ */
+typedef struct {
+ //! Costs for coding the coefficients.
+ LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES];
+ //! Costs for coding the eobs.
+ LV_MAP_EOB_COST eob_costs[7][2];
+} CoeffCosts;
+
+/*!\cond */
+// 4: NEAREST, NEW, NEAR, GLOBAL
+#define SINGLE_REF_MODES ((REF_FRAMES - 1) * 4)
+/*!\endcond */
+struct inter_modes_info;
+
+/*! \brief Holds the motion samples for warp motion model estimation
+ */
+typedef struct {
+ //! Number of samples.
+ int num;
+ //! Sample locations in current frame.
+ int pts[16];
+  //! Sample locations in the reference frame.
+ int pts_inref[16];
+} WARP_SAMPLE_INFO;
+
+/*!\cond */
+typedef enum {
+ kZeroSad = 0,
+ kVeryLowSad = 1,
+ kLowSad = 2,
+ kMedSad = 3,
+ kHighSad = 4
+} SOURCE_SAD;
+
+typedef struct {
+ //! SAD levels in non-rd path
+ SOURCE_SAD source_sad_nonrd;
+  //! SAD levels in the rd path, for variance-based partitioning qindex
+  //! thresholds.
+ SOURCE_SAD source_sad_rd;
+ int lighting_change;
+ int low_sumdiff;
+} CONTENT_STATE_SB;
+
+// Structure to hold pixel level gradient info.
+typedef struct {
+ uint16_t abs_dx_abs_dy_sum;
+ int8_t hist_bin_idx;
+ bool is_dx_zero;
+} PixelLevelGradientInfo;
+
+// Structure to hold the variance and log(1 + variance) for 4x4 sub-blocks.
+typedef struct {
+ double log_var;
+ int var;
+} Block4x4VarInfo;
+
+#ifndef NDEBUG
+typedef struct SetOffsetsLoc {
+ int mi_row;
+ int mi_col;
+ BLOCK_SIZE bsize;
+} SetOffsetsLoc;
+#endif // NDEBUG
+
+/*!\endcond */
+
+/*! \brief Encoder's parameters related to the current coding block.
+ *
+ * This struct contains most of the information the encoder needs to encode the
+ * current coding block. This includes the src and pred buffer, a copy of the
+ * decoder's view of the current block, and the txfm coefficients. It also
+ * contains various buffers and data used to speed up the encoding process.
+ */
+typedef struct macroblock {
+ /*****************************************************************************
+ * \name Source, Buffers and Decoder
+ ****************************************************************************/
+ /**@{*/
+  /*! \brief Each of the encoding planes.
+   *
+   * An array holding the src buffer for each plane of the current block. It
+ * also contains the txfm and quantized txfm coefficients.
+ */
+ struct macroblock_plane plane[MAX_MB_PLANE];
+
+ /*! \brief Decoder's view of current coding block.
+ *
+ * Contains the encoder's copy of what the decoder sees in the current block.
+ * Most importantly, this struct contains pointers to mbmi that is used in
+ * final bitstream packing.
+ */
+ MACROBLOCKD e_mbd;
+
+ /*! \brief Derived coding information.
+ *
+ * Contains extra information not transmitted in the bitstream but are
+ * derived. For example, this contains the stack of ref_mvs.
+ */
+ MB_MODE_INFO_EXT mbmi_ext;
+
+ /*! \brief Finalized mbmi_ext for the whole frame.
+ *
+ * Contains the finalized info in mbmi_ext that gets used at the frame level
+ * for bitstream packing.
+ */
+ MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame;
+
+ //! Entropy context for the current row.
+ FRAME_CONTEXT *row_ctx;
+ /*! \brief Entropy context for the current tile.
+ *
+ * This context will be used to update color_map_cdf pointer which would be
+   * used during pack bitstream. For the single-thread and tile-multithreading
+   * cases this pointer will be the same as xd->tile_ctx, but in the row-mt case
+ * xd->tile_ctx will point to a temporary context while tile_pb_ctx will point
+ * to the accurate tile context.
+ */
+ FRAME_CONTEXT *tile_pb_ctx;
+
+ /*! \brief Buffer of transformed coefficients
+ *
+ * Points to cb_coef_buff in the AV1_COMP struct, which contains the finalized
+ * coefficients. This is here to conveniently copy the best coefficients to
+ * frame level for bitstream packing. Since CB_COEFF_BUFFER is allocated on a
+ * superblock level, we need to combine it with cb_offset to get the proper
+ * position for the current coding block.
+ */
+ CB_COEFF_BUFFER *cb_coef_buff;
+ //! Offset of current coding block's coeff buffer relative to the sb.
+ uint16_t cb_offset[PLANE_TYPES];
+
+ //! Modified source and masks used for fast OBMC search.
+ OBMCBuffer obmc_buffer;
+ //! Buffer to store the best palette map.
+ PALETTE_BUFFER *palette_buffer;
+ //! Buffer used for compound_type_rd().
+ CompoundTypeRdBuffers comp_rd_buffer;
+ //! Buffer to store convolution during averaging process in compound mode.
+ CONV_BUF_TYPE *tmp_conv_dst;
+
+ /*! \brief Temporary buffer to hold prediction.
+ *
+ * Points to a buffer that is used to hold temporary prediction results. This
+ * is used in two ways:
+ * - This is a temporary buffer used to ping-pong the prediction in
+ * handle_inter_mode.
+   * - xd->tmp_obmc_bufs also points to this buffer, and is used in obmc
+ * prediction.
+ */
+ uint8_t *tmp_pred_bufs[2];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Rdopt Costs
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief Quantization index for the current partition block.
+ *
+   * This is used as the index to find the quantization parameters for luma and
+ * chroma transformed coefficients.
+ */
+ int qindex;
+
+ /*! \brief Difference between frame-level qindex and current qindex.
+ *
+ * This is used to track whether a non-zero delta for qindex is used at least
+ * once in the current frame.
+ */
+ int delta_qindex;
+
+ /*! \brief Difference between frame-level qindex and qindex used to
+ * compute rdmult (lambda).
+ *
+ * rdmult_delta_qindex is assigned the same as delta_qindex before qp sweep.
+ * During qp sweep, delta_qindex is changed and used to calculate the actual
+ * quant params, while rdmult_delta_qindex remains the same, and is used to
+ * calculate the rdmult in "set_deltaq_rdmult".
+ */
+ int rdmult_delta_qindex;
+
+ /*! \brief Current qindex (before being adjusted by delta_q_res) used to
+ * derive rdmult_delta_qindex.
+ */
+ int rdmult_cur_qindex;
+
+ /*! \brief Rate-distortion multiplier.
+ *
+ * The rd multiplier used to determine the rate-distortion trade-off. This is
+ * roughly proportional to the inverse of q-index for a given frame, but this
+ * can be manipulated for better rate-control. For example, in tune_ssim
+ * mode, this is scaled by a factor related to the variance of the current
+ * block.
+ */
+ int rdmult;
+
+ //! Intra only, per sb rd adjustment.
+ int intra_sb_rdmult_modifier;
+
+ //! Superblock level distortion propagation factor.
+ double rb;
+
+ //! Energy in the current source coding block. Used to calculate \ref rdmult
+ int mb_energy;
+ //! Energy in the current source superblock. Used to calculate \ref rdmult
+ int sb_energy_level;
+
+ //! The rate needed to signal a mode to the bitstream.
+ ModeCosts mode_costs;
+
+ //! The rate needed to encode a new motion vector to the bitstream and some
+ //! multipliers for motion search.
+ MvCosts *mv_costs;
+
+ /*! The rate needed to encode a new motion vector to the bitstream in intrabc
+ * mode.
+ */
+ IntraBCMVCosts *dv_costs;
+
+ //! The rate needed to signal the txfm coefficients to the bitstream.
+ CoeffCosts coeff_costs;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Rate to Distortion Multipliers
+ ****************************************************************************/
+ /**@{*/
+ //! A multiplier that converts mv cost to l2 error.
+ int errorperbit;
+ //! A multiplier that converts mv cost to l1 error.
+ int sadperbit;
+ /**@}*/
+
+ /******************************************************************************
+ * \name Segmentation
+ *****************************************************************************/
+ /**@{*/
+ /*! \brief Skip mode for the segment
+ *
+ * A syntax element of the segmentation mode. In skip_block mode, all mvs are
+   * set to 0 and all txfms are skipped.
+ */
+ int seg_skip_block;
+
+ /*! \brief Number of segment 1 blocks
+   * Actual number of (4x4) blocks to which delta-q was applied,
+ * for segment 1.
+ */
+ int actual_num_seg1_blocks;
+
+ /*!\brief Number of segment 2 blocks
+   * Actual number of (4x4) blocks to which delta-q was applied,
+ * for segment 2.
+ */
+ int actual_num_seg2_blocks;
+
+ /*!\brief Number of zero motion vectors
+ */
+ int cnt_zeromv;
+
+ /*!\brief Flag to force zeromv-skip at superblock level, for nonrd path.
+ *
+ * 0/1 imply zeromv-skip is disabled/enabled. 2 implies that the blocks
+ * in the superblock may be marked as zeromv-skip at block level.
+ */
+ int force_zeromv_skip_for_sb;
+
+ /*!\brief Flag to force zeromv-skip at block level, for nonrd path.
+ */
+ int force_zeromv_skip_for_blk;
+
+ /*! \brief Previous segment id for which qmatrices were updated.
+   * This is used to bypass setting of qmatrices if there is no change in
+   * qindex.
+ */
+ int prev_segment_id;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Superblock
+ ****************************************************************************/
+ /**@{*/
+ //! Information on a whole superblock level.
+ // TODO(chiyotsai@google.com): Refactor this out of macroblock
+ SuperBlockEnc sb_enc;
+
+ /*! \brief Characteristics of the current superblock.
+ *
+ * Characteristics like whether the block has high sad, low sad, etc. This is
+ * only used by av1 realtime mode.
+ */
+ CONTENT_STATE_SB content_state_sb;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Reference Frame Search
+ ****************************************************************************/
+ /**@{*/
+  /*! \brief SAD of the prediction given the predicted mv, for each ref frame.
+ *
+ * This is used to measure how viable a reference frame is.
+ */
+ int pred_mv_sad[REF_FRAMES];
+ /*! \brief The minimum of \ref pred_mv_sad.
+ *
+ * Index 0 stores the minimum \ref pred_mv_sad across past reference frames.
+ * Index 1 stores the minimum \ref pred_mv_sad across future reference frames.
+ */
+ int best_pred_mv_sad[2];
+ //! The sad of the 1st mv ref (nearest).
+ int pred_mv0_sad[REF_FRAMES];
+ //! The sad of the 2nd mv ref (near).
+ int pred_mv1_sad[REF_FRAMES];
+
+ /*! \brief Disables certain ref frame pruning based on tpl.
+ *
+ * Determines whether a given ref frame is "good" based on data from the TPL
+   * model. If so, this stops selective_ref_frame from pruning the given ref
+ * frame at block level.
+ */
+ uint8_t tpl_keep_ref_frame[REF_FRAMES];
+
+ /*! \brief Warp motion samples buffer.
+ *
+ * Store the motion samples used for warp motion.
+ */
+ WARP_SAMPLE_INFO warp_sample_info[REF_FRAMES];
+
+ /*! \brief Reference frames picked by the square subblocks in a superblock.
+ *
+ * Keeps track of ref frames that are selected by square partition blocks
+ * within a superblock, in MI resolution. They can be used to prune ref frames
+ * for rectangular blocks.
+ */
+ int picked_ref_frames_mask[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+ /*! \brief Prune ref frames in real-time mode.
+ *
+ * Determines whether to prune reference frames in real-time mode. For the
+   * most part, this is the same as
+ * cpi->sf.rt_sf.nonrd_prune_ref_frame_search, but this can be selectively
+ * turned off if the only frame available is GOLDEN_FRAME.
+ */
+ int nonrd_prune_ref_frame_search;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Partition Search
+ ****************************************************************************/
+ /**@{*/
+ //! Stores some partition-search related buffers.
+ PartitionSearchInfo part_search_info;
+
+ /*! \brief Whether to disable some features to force a mode in current block.
+ *
+   * In some cases, our speed features can be overly aggressive and prune every
+   * mode searched in the superblock. When this happens, we set
+ * must_find_valid_partition to 1 to reduce the number of speed features, and
+ * recode the superblock again.
+ */
+ int must_find_valid_partition;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Prediction Mode Search
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief Inter skip mode.
+ *
+ * Skip mode tries to use the closest forward and backward references for
+ * inter prediction. Skip here means to skip transmitting the reference
+ * frames, not to be confused with skip_txfm.
+ */
+ int skip_mode;
+
+ /*! \brief Factors used for rd-thresholding.
+ *
+   * Determines an rd threshold used to decide whether to continue searching
+   * the current mode. If the current best rd is already <= threshold, we skip
+ * the current mode.
+ */
+ int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+
+ /*! \brief Tracks the winner modes in the current coding block.
+ *
+ * Winner mode is a two-pass strategy to find the best prediction mode. In the
+ * first pass, we search the prediction modes with a limited set of txfm
+ * options, and keep the top modes. These modes are called the winner modes.
+ * In the second pass, we retry the winner modes with more thorough txfm
+ * options.
+ */
+ WinnerModeStats *winner_mode_stats;
+ //! Tracks how many winner modes there are.
+ int winner_mode_count;
+
+ /*! \brief The model used for rd-estimation to avoid txfm
+ *
+ * These are for inter_mode_rd_model_estimation, which is another two pass
+ * approach. In this speed feature, we collect data in the first couple frames
+ * to build an rd model to estimate the rdcost of a prediction model based on
+ * the residue error. Once enough data is collected, this speed feature uses
+ * the estimated rdcost to find the most performant prediction mode. Then we
+   * follow up with a second pass to find the best transform for the mode.
+   * Determines whether to use the reduced-complexity transform block search
+   * model to select prediction modes, or the full-complexity model to select
+   * the transform kernel.
+ */
+ TXFM_RD_MODEL rd_model;
+
+ /*! \brief Stores the inter mode information needed to build an rd model.
+ *
+ * These are for inter_mode_rd_model_estimation, which is another two pass
+ * approach. In this speed feature, we collect data in the first couple frames
+ * to build an rd model to estimate the rdcost of a prediction model based on
+ * the residue error. Once enough data is collected, this speed feature uses
+ * the estimated rdcost to find the most performant prediction mode. Then we
+   * follow up with a second pass to find the best transform for the mode.
+ */
+ // TODO(any): try to consolidate this speed feature with winner mode
+ // processing.
+ struct inter_modes_info *inter_modes_info;
+
+ //! How to blend the compound predictions.
+ uint8_t compound_idx;
+
+  //! A cache of compound type search results so they can be reused later.
+ COMP_RD_STATS comp_rd_stats[MAX_COMP_RD_STATS];
+ //! The idx for the latest compound mode in the cache \ref comp_rd_stats.
+ int comp_rd_stats_idx;
+
+ /*! \brief Whether to recompute the luma prediction.
+ *
+ * In interpolation search, we can usually skip recalculating the luma
+ * prediction because it is already calculated by a previous predictor. This
+ * flag signifies that some modes might have been skipped, so we need to
+ * rebuild the prediction.
+ */
+ int recalc_luma_mc_data;
+
+ /*! \brief Data structure to speed up intrabc search.
+ *
+ * Contains the hash table, hash function, and buffer used for intrabc.
+ */
+ IntraBCHashInfo intrabc_hash_info;
+
+ /*! \brief Whether to reuse the mode stored in mb_mode_cache. */
+ int use_mb_mode_cache;
+ /*! \brief The mode to reuse during \ref av1_rd_pick_intra_mode_sb and
+ * \ref av1_rd_pick_inter_mode. */
+ const MB_MODE_INFO *mb_mode_cache;
+ /*! \brief Pointer to the buffer which caches gradient information.
+ *
+ * Pointer to the array of structures to store gradient information of each
+   * pixel in a superblock. The buffer consists of MAX_SB_SQUARE pixel-level
+ * structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV).
+ */
+ PixelLevelGradientInfo *pixel_gradient_info;
+ /*! \brief Flags indicating the availability of cached gradient info. */
+ bool is_sb_gradient_cached[PLANE_TYPES];
+
+ /*! \brief Flag to reuse predicted samples of inter block. */
+ bool reuse_inter_pred;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name MV Search
+ ****************************************************************************/
+ /**@{*/
+ /*! \brief Context used to determine the initial step size in motion search.
+ *
+   * This context is defined as the \f$l_\infty\f$ norm of the best ref_mvs for
+ * each frame.
+ */
+ unsigned int max_mv_context[REF_FRAMES];
+
+ /*! \brief Limit for the range of motion vectors.
+ *
+ * These define limits to motion vector components to prevent them from
+ * extending outside the UMV borders
+ */
+ FullMvLimits mv_limits;
+
+ /*! \brief Buffer for storing the search site config.
+ *
+ * When resize mode or super resolution mode is on, the stride of the
+ * reference frame does not always match what's specified in \ref
+   * MotionVectorSearchParams::search_site_cfg. When this happens, we update the
+   * search_site_cfg buffer here and use it for motion search.
+ */
+ search_site_config search_site_cfg_buf[NUM_DISTINCT_SEARCH_METHODS];
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Txfm Search
+ ****************************************************************************/
+ /**@{*/
+  /*! \brief Parameters that control how the txfm search is done.
+ *
+ * Stores various txfm search related parameters such as txfm_type, txfm_size,
+ * trellis eob search, etc.
+ */
+ TxfmSearchParams txfm_search_params;
+
+ /*! \brief Results of the txfm searches that have been done.
+ *
+ * Caches old txfm search results and keeps the current txfm decisions to
+ * facilitate rdopt.
+ */
+ TxfmSearchInfo txfm_search_info;
+
+  /*! \brief Whether there is strong color activity.
+ *
+ * Used in REALTIME coding mode to enhance the visual quality at the boundary
+ * of moving color objects.
+ */
+ uint8_t color_sensitivity_sb[MAX_MB_PLANE - 1];
+ //! Color sensitivity flag for the superblock for golden reference.
+ uint8_t color_sensitivity_sb_g[MAX_MB_PLANE - 1];
+ //! Color sensitivity flag for the superblock for altref reference.
+ uint8_t color_sensitivity_sb_alt[MAX_MB_PLANE - 1];
+ //! Color sensitivity flag for the coding block.
+ uint8_t color_sensitivity[MAX_MB_PLANE - 1];
+ //! Coding block distortion value for uv/color, minimum over the inter modes.
+ int64_t min_dist_inter_uv;
+
+ //! The buffer used by search_tx_type() to swap dqcoeff in macroblockd_plane
+ // so we can keep dqcoeff of the best tx_type.
+ tran_low_t *dqcoeff_buf;
+ /**@}*/
+
+ /*****************************************************************************
+ * \name Misc
+ ****************************************************************************/
+ /**@{*/
+ //! Variance of the source frame.
+ unsigned int source_variance;
+ //! Flag to indicate coding block is zero sad.
+ int block_is_zero_sad;
+ //! Flag to indicate superblock ME in variance partition is determined to be
+ // good/reliable, and so the superblock MV will be tested in the
+ // nonrd_pickmode. This is only used for LAST_FRAME.
+ int sb_me_partition;
+ //! Flag to indicate to test the superblock MV for the coding block in the
+ // nonrd_pickmode.
+ int sb_me_block;
+ //! Motion vector from superblock MV derived from int_pro_motion() in
+ // the variance_partitioning.
+ int_mv sb_me_mv;
+ //! SSE of the current predictor.
+ unsigned int pred_sse[REF_FRAMES];
+ //! Prediction for ML based partition.
+#if CONFIG_RT_ML_PARTITIONING
+ DECLARE_ALIGNED(16, uint8_t, est_pred[128 * 128]);
+#endif
+ /**@}*/
+
+ /*! \brief NONE partition evaluated for merge.
+ *
+   * In the variance-based partitioning scheme, NONE & SPLIT partitions are
+   * evaluated to check whether SPLIT can be merged into NONE. This flag
+   * signifies that the partition was evaluated in this scheme.
+ */
+ int try_merge_partition;
+
+ /*! \brief Pointer to buffer which caches sub-block variances in a superblock.
+ *
+ * Pointer to the array of structures to store source variance information of
+ * each 4x4 sub-block in a superblock. Block4x4VarInfo structure is used to
+ * store source variance and log of source variance of each 4x4 sub-block.
+ */
+ Block4x4VarInfo *src_var_info_of_4x4_sub_blocks;
+#ifndef NDEBUG
+ /*! \brief A hash to make sure av1_set_offsets is called */
+ SetOffsetsLoc last_set_offsets_loc;
+#endif // NDEBUG
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ mode_search_stat_nonrd ms_stat_nonrd;
+#endif // COLLECT_NONRD_PICK_MODE_STAT
+
+ /*!\brief Number of pixels in current thread that choose palette mode in the
+   * fast encoding stage for screen content tool determination.
+ */
+ int palette_pixels;
+
+ /*!\brief Pointer to the structure which stores the statistics used by
+ * sb-level multi-pass encoding.
+ */
+ struct SB_FIRST_PASS_STATS *sb_stats_cache;
+
+ /*!\brief Pointer to the structure which stores the statistics used by
+ * first-pass when superblock is searched twice consecutively.
+ */
+ struct SB_FIRST_PASS_STATS *sb_fp_stats;
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+ /*!\brief Pointer to RD_STATS structure to be used in
+ * av1_rd_partition_search().
+ */
+ RD_STATS *rdcost;
+#endif // CONFIG_PARTITION_SEARCH_ORDER
+} MACROBLOCK;
+#undef SINGLE_REF_MODES
+
+/*!\cond */
+// Zeroes out 'n_stats' elements in the array x->winner_mode_stats.
+// For 'color_index_map', only the portion covered by the block size is
+// zeroed, not the whole array.
+static INLINE void zero_winner_mode_stats(BLOCK_SIZE bsize, int n_stats,
+ WinnerModeStats *stats) {
+ // When winner mode stats are not required, the memory allocation is avoided
+ // for x->winner_mode_stats. The stats pointer will be NULL in such cases.
+ if (stats == NULL) return;
+
+ const int block_height = block_size_high[bsize];
+ const int block_width = block_size_wide[bsize];
+ for (int i = 0; i < n_stats; ++i) {
+ WinnerModeStats *const stat = &stats[i];
+ memset(&stat->mbmi, 0, sizeof(stat->mbmi));
+ memset(&stat->rd_cost, 0, sizeof(stat->rd_cost));
+ memset(&stat->rd, 0, sizeof(stat->rd));
+ memset(&stat->rate_y, 0, sizeof(stat->rate_y));
+ memset(&stat->rate_uv, 0, sizeof(stat->rate_uv));
+ // Do not reset the whole array as it is CPU intensive.
+ memset(&stat->color_index_map, 0,
+ block_width * block_height * sizeof(stat->color_index_map[0]));
+ memset(&stat->mode_index, 0, sizeof(stat->mode_index));
+ }
+}
+
+static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
+ static const char LUT[BLOCK_SIZES_ALL] = {
+ 0, // BLOCK_4X4
+ 1, // BLOCK_4X8
+ 1, // BLOCK_8X4
+ 0, // BLOCK_8X8
+ 1, // BLOCK_8X16
+ 1, // BLOCK_16X8
+ 0, // BLOCK_16X16
+ 1, // BLOCK_16X32
+ 1, // BLOCK_32X16
+ 0, // BLOCK_32X32
+ 1, // BLOCK_32X64
+ 1, // BLOCK_64X32
+ 0, // BLOCK_64X64
+ 0, // BLOCK_64X128
+ 0, // BLOCK_128X64
+ 0, // BLOCK_128X128
+ 1, // BLOCK_4X16
+ 1, // BLOCK_16X4
+ 1, // BLOCK_8X32
+ 1, // BLOCK_32X8
+ 1, // BLOCK_16X64
+ 1, // BLOCK_64X16
+ };
+
+ return LUT[bsize];
+}
+
+static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi) {
+ return is_rect_tx_allowed_bsize(mbmi->bsize) &&
+ !xd->lossless[mbmi->segment_id];
+}
+
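+// Returns how many times the largest rectangular txfm allowed for 'bsize'
+// must be split (via sub_tx_size_map) to reach 'tx_size'. For example, for
+// BLOCK_16X16 the depth is 0 for TX_16X16, 1 for TX_8X8 and 2 for TX_4X4.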
+static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) {
+ TX_SIZE ctx_size = max_txsize_rect_lookup[bsize];
+ int depth = 0;
+ while (tx_size != ctx_size) {
+ depth++;
+ ctx_size = sub_tx_size_map[ctx_size];
+ assert(depth <= MAX_TX_DEPTH);
+ }
+ return depth;
+}
+
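+// Each entry of txb_skip is a bit-field: bit 'plane' (0 = Y, 1 = U, 2 = V)
+// records whether the txfm of that plane is skipped for the given block. In
+// debug builds, bits 4..6 are per-plane flags used to detect reads of planes
+// that were never set.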
+static INLINE void set_blk_skip(uint8_t txb_skip[], int plane, int blk_idx,
+ int skip) {
+ if (skip)
+ txb_skip[blk_idx] |= 1UL << plane;
+ else
+ txb_skip[blk_idx] &= ~(1UL << plane);
+#ifndef NDEBUG
+ // Set chroma planes to uninitialized states when luma is set to check if
+ // it will be set later
+ if (plane == 0) {
+ txb_skip[blk_idx] |= 1UL << (1 + 4);
+ txb_skip[blk_idx] |= 1UL << (2 + 4);
+ }
+
+ // Clear the initialization checking bit
+ txb_skip[blk_idx] &= ~(1UL << (plane + 4));
+#endif
+}
+
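+// For example, set_blk_skip(txb_skip, /*plane=*/0, idx, 1) marks the luma
+// txfm of block 'idx' as skipped, after which is_blk_skip(txb_skip, 0, idx)
+// returns 1.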
+static INLINE int is_blk_skip(uint8_t *txb_skip, int plane, int blk_idx) {
+#ifndef NDEBUG
+ // Check if this is initialized
+ assert(!(txb_skip[blk_idx] & (1UL << (plane + 4))));
+
+  // Only the bits within the 0x77 mask are ever set on purpose, so any bit
+  // within 0x88 indicates garbage data.
+ assert((txb_skip[blk_idx] & 0x88) == 0);
+#endif
+ return (txb_skip[blk_idx] >> plane) & 1;
+}
+
+/*!\endcond */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_BLOCK_H_
diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c
new file mode 100644
index 0000000000..6ad2ddaf25
--- /dev/null
+++ b/third_party/aom/av1/encoder/blockiness.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/common.h"
+#include "av1/common/filter.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+static int horizontal_filter(const uint8_t *s) {
+ return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
+}
+
+static int vertical_filter(const uint8_t *s, int p) {
+ return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6;
+}
+
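+// Integer approximation of the population variance, E[x^2] - (E[x])^2,
+// computed with truncating integer division.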
+static int variance(int sum, int sum_squared, int size) {
+ return sum_squared / size - (sum / size) * (sum / size);
+}
+// Calculate a blockiness level for a vertical block edge.
+// This function returns a new blockiness metric that's defined as
+
+// p0 p1 p2 p3
+// q0 q1 q2 q3
+// block edge ->
+// r0 r1 r2 r3
+// s0 s1 s2 s3
+
+// blockiness = p0*-2+q0*6+r0*-6+s0*2 +
+// p1*-2+q1*6+r1*-6+s1*2 +
+// p2*-2+q2*6+r2*-6+s2*2 +
+// p3*-2+q3*6+r3*-6+s3*2 ;
+
+// reconstructed_blockiness = max(abs(blockiness from reconstructed buffer) -
+//                                abs(blockiness from source buffer), 0)
+//
+// I make the assumption that flat blocks are much more visible than high
+// contrast blocks. As such, I scale the result of the blockiness calc
+// by dividing the blockiness by the variance of the pixels on either side
+// of the edge as follows:
+// var_0 = (q0^2+q1^2+q2^2+q3^2)/4 - ((q0 + q1 + q2 + q3) / 4)^2
+// var_1 = (r0^2+r1^2+r2^2+r3^2)/4 - ((r0 + r1 + r2 + r3) / 4)^2
+// The returned blockiness is the scaled value
+// Reconstructed blockiness / ( 1 + var_0 + var_1 ) ;
+static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r,
+ int rp, int size) {
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, s += sp, r += rp) {
+ s_blockiness += horizontal_filter(s);
+ r_blockiness += horizontal_filter(r);
+ sum_0 += s[0];
+ sum_sq_0 += s[0] * s[0];
+ sum_1 += s[-1];
+ sum_sq_1 += s[-1] * s[-1];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// Calculate a blockiness level for a horizontal block edge
+// same as above.
+static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r,
+ int rp, int size) {
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, ++s, ++r) {
+ s_blockiness += vertical_filter(s, sp);
+ r_blockiness += vertical_filter(r, rp);
+ sum_0 += s[0];
+ sum_sq_0 += s[0] * s[0];
+ sum_1 += s[-sp];
+ sum_sq_1 += s[-sp] * s[-sp];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// This function returns the blockiness for the entire frame currently by
+// looking at all borders in steps of 4.
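+//
+// Illustrative call (the frame names are assumed, not from this file),
+// comparing a source frame with its reconstruction:
+//   double b = av1_get_blockiness(src->y_buffer, src->y_stride,
+//                                 rec->y_buffer, rec->y_stride,
+//                                 width, height);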
+double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch, int width,
+ int height) {
+ double blockiness = 0;
+ int i, j;
+ for (i = 0; i < height;
+ i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+ for (j = 0; j < width; j += 4) {
+ if (i > 0 && i < height && j > 0 && j < width) {
+ blockiness +=
+ blockiness_vertical(img1 + j, img1_pitch, img2 + j, img2_pitch, 4);
+ blockiness += blockiness_horizontal(img1 + j, img1_pitch, img2 + j,
+ img2_pitch, 4);
+ }
+ }
+ }
+ blockiness /= width * height / 16;
+ return blockiness;
+}
diff --git a/third_party/aom/av1/encoder/cnn.c b/third_party/aom/av1/encoder/cnn.c
new file mode 100644
index 0000000000..598b362753
--- /dev/null
+++ b/third_party/aom/av1/encoder/cnn.c
@@ -0,0 +1,1189 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/cnn.h"
+
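+// Clamps index 'a' into the valid range [0, hi - 1]; used to implement
+// replicate padding.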
+#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a)))
+
+typedef struct {
+ const float **input;
+ int in_width;
+ int in_height;
+ int in_stride;
+ const CNN_LAYER_CONFIG *layer_config;
+ float **output;
+ int out_stride;
+ int start_idx;
+ int th_step;
+} CONVOLVE_OPS;
+
+static INLINE float softsign(float x) { return x / (fabsf(x) + 1.0f); }
+
+static INLINE float relu(float x) { return (x < 0) ? 0 : x; }
+
+typedef struct {
+ int allocsize;
+ int channels;
+ int width, height, stride;
+ float *buf[CNN_MAX_CHANNELS];
+} TENSOR;
+
+static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(*tensor)); }
+
+static void free_tensor(TENSOR *tensor) {
+ if (tensor->allocsize) {
+ aom_free(tensor->buf[0]);
+ tensor->buf[0] = NULL;
+ tensor->allocsize = 0;
+ }
+}
+
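+// (Re)allocates a tensor as a single contiguous buffer of
+// channels * width * height floats, where buf[c] is a view into buf[0] at
+// offset c * width * height. The allocation only ever grows; a smaller
+// request reuses the existing buffer.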
+static bool realloc_tensor(TENSOR *tensor, int channels, int width,
+ int height) {
+ const int newallocsize = channels * width * height;
+ if (tensor->allocsize < newallocsize) {
+ free_tensor(tensor);
+ tensor->buf[0] =
+ (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize);
+ if (!tensor->buf[0]) return false;
+ tensor->allocsize = newallocsize;
+ }
+ tensor->width = width;
+ tensor->height = height;
+ tensor->stride = width;
+ tensor->channels = channels;
+ for (int c = 1; c < channels; ++c)
+ tensor->buf[c] = &tensor->buf[0][c * width * height];
+ return true;
+}
+
+static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset,
+ TENSOR *dst) {
+ assert(src->width == dst->width);
+ assert(src->height == dst->height);
+ assert(copy_channels <= src->channels);
+ if (src->stride == dst->width && dst->stride == dst->width) {
+ for (int c = 0; c < copy_channels; ++c) {
+ memcpy(dst->buf[dst_offset + c], src->buf[c],
+ sizeof(*dst->buf[0]) * src->width * src->height);
+ }
+ } else {
+ for (int c = 0; c < copy_channels; ++c) {
+ for (int r = 0; r < dst->height; ++r) {
+ memcpy(&dst->buf[dst_offset + c][r * dst->stride],
+ &src->buf[c][r * src->stride],
+ dst->width * sizeof(*dst->buf[c]));
+ }
+ }
+ }
+}
+
+static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS],
+ int channels, int width, int height, int stride) {
+ tensor->allocsize = 0;
+ tensor->channels = channels;
+ tensor->width = width;
+ tensor->height = height;
+ tensor->stride = stride;
+ if (buf) {
+ for (int c = 0; c < channels; ++c) tensor->buf[c] = buf[c];
+ } else {
+ for (int c = 0; c < channels; ++c) tensor->buf[c] = NULL;
+ }
+}
+
+static void swap_tensor(TENSOR *t1, TENSOR *t2) {
+ TENSOR t = *t1;
+ *t1 = *t2;
+ *t2 = t;
+}
+
+// The concatenated tensor goes into dst, with the channels of the original
+// dst first, followed by the channels of src.
+static bool concat_tensor(const TENSOR *src, TENSOR *dst) {
+ assert(src->width == dst->width);
+ assert(src->height == dst->height);
+
+ const int dst_channels = dst->channels;
+ const int channels = dst->channels + src->channels;
+ const int newallocsize = channels * dst->width * dst->height;
+ if (dst->allocsize < newallocsize) {
+ TENSOR t;
+ init_tensor(&t);
+ // allocate new buffers and copy first the dst channels
+ if (!realloc_tensor(&t, channels, dst->width, dst->height)) return false;
+ copy_tensor(dst, dst->channels, 0, &t);
+ // Swap the tensors and free the old buffers
+ swap_tensor(dst, &t);
+ free_tensor(&t);
+ }
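+  // Re-stitch the per-channel views: 'channels' has grown even if the
+  // existing allocation was already large enough.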
+ for (int c = 1; c < channels; ++c)
+ dst->buf[c] = &dst->buf[0][c * dst->width * dst->height];
+ // Copy the channels in src after the first dst_channels channels.
+ copy_tensor(src, src->channels, dst_channels, dst);
+ return true;
+}
+
+int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) {
+ return (t1->width == t2->width && t1->height == t2->height);
+}
+
+int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) {
+ return (t1->channels == t2->channels && t1->width == t2->width &&
+ t1->height == t2->height);
+}
+
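+// Computes the output dimensions of a single layer. Worked example
+// (illustrative): with in_width = 16, skip_width = 4, filter_width = 5 and
+// deconvolve off, PADDING_SAME_* gives out_width = (16 + 4 - 1) / 4 = 4,
+// while PADDING_VALID gives out_width = (16 - 5 + 4) / 4 = 3.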
+void av1_find_cnn_layer_output_size(int in_width, int in_height,
+ const CNN_LAYER_CONFIG *layer_config,
+ int *out_width, int *out_height) {
+ assert(layer_config->skip_width > 0);
+ assert(layer_config->skip_height > 0);
+ if (!layer_config->deconvolve) {
+ switch (layer_config->pad) {
+ case PADDING_SAME_ZERO:
+ case PADDING_SAME_REPLICATE:
+ *out_width = (in_width + layer_config->skip_width - 1) /
+ layer_config->skip_width;
+ *out_height = (in_height + layer_config->skip_height - 1) /
+ layer_config->skip_height;
+ break;
+ case PADDING_VALID:
+ *out_width =
+ (in_width - layer_config->filter_width + layer_config->skip_width) /
+ layer_config->skip_width;
+ *out_height = (in_height - layer_config->filter_height +
+ layer_config->skip_height) /
+ layer_config->skip_height;
+ break;
+ default: assert(0 && "Unknown padding type");
+ }
+ } else {
+ switch (layer_config->pad) {
+ case PADDING_SAME_ZERO:
+ case PADDING_SAME_REPLICATE:
+ *out_width = in_width * layer_config->skip_width;
+ *out_height = in_height * layer_config->skip_height;
+ break;
+ case PADDING_VALID:
+ *out_width = (in_width - 1) * layer_config->skip_width +
+ layer_config->filter_width;
+ *out_height = (in_height - 1) * layer_config->skip_height +
+ layer_config->filter_height;
+ break;
+ default: assert(0 && "Unknown padding type");
+ }
+ }
+}
+
+void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config,
+ int channels_per_branch[]) {
+ int branch = layer_config->branch;
+ const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
+ if (layer_config->branch_copy_type == BRANCH_INPUT) {
+ channels_per_branch[b] = layer_config->in_channels;
+ } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
+ channels_per_branch[b] = layer_config->out_channels;
+ } else if (layer_config->branch_copy_type == BRANCH_COMBINED) {
+ channels_per_branch[b] = layer_config->out_channels;
+ for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
+ if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
+ assert(channels_per_branch[c] > 0);
+ channels_per_branch[b] += channels_per_branch[c];
+ }
+ }
+ }
+ }
+ }
+ channels_per_branch[branch] = layer_config->out_channels;
+ for (int c = 0; c < CNN_MAX_BRANCHES; ++c) {
+ if ((branch_config->branches_to_combine & (1 << c)) && c != branch) {
+ assert(channels_per_branch[c] > 0);
+ channels_per_branch[branch] += channels_per_branch[c];
+ }
+ }
+}
+
+#if CONFIG_DEBUG
+static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) {
+ const int num_layers = cnn_config->num_layers;
+ const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config;
+
+ for (int idx = 0; idx < num_layers; idx++) {
+ if (layer_configs[idx].output_num != -1) {
+ return 1;
+ }
+ }
+ return 0;
+}
+#endif
+
+void av1_find_cnn_output_size(int in_width, int in_height,
+ const CNN_CONFIG *cnn_config, int *out_width,
+ int *out_height, int *out_channels) {
+ int channels_per_branch[CNN_MAX_BRANCHES] = { 0 };
+ int i_width[CNN_MAX_BRANCHES] = { 0 };
+ int i_height[CNN_MAX_BRANCHES] = { 0 };
+ i_width[0] = in_width + cnn_config->ext_width * 2;
+ i_height[0] = in_height + cnn_config->ext_height * 2;
+
+#if CONFIG_DEBUG
+ assert(cnn_has_at_least_one_output(cnn_config));
+#endif
+
+ for (int i = 0; i < cnn_config->num_layers; ++i) {
+ const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i];
+ const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
+ const int branch = layer_config->branch;
+ int o_width = 0, o_height = 0;
+
+ if (layer_config->branch_copy_type == BRANCH_INPUT) {
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
+ assert(i_width[branch] > 0 && i_height[branch] > 0);
+ i_width[b] = i_width[branch];
+ i_height[b] = i_height[branch];
+ }
+ }
+ }
+
+ av1_find_cnn_layer_output_size(i_width[branch], i_height[branch],
+ layer_config, &o_width, &o_height);
+ i_width[branch] = o_width;
+ i_height[branch] = o_height;
+
+ if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
+ i_width[b] = o_width;
+ i_height[b] = o_height;
+ }
+ }
+ }
+
+ find_cnn_out_channels(layer_config, channels_per_branch);
+
+ const int output_num = layer_config->output_num;
+ if (output_num != -1) { // Current layer is an output layer
+ out_width[output_num] = o_width;
+ out_height[output_num] = o_height;
+ out_channels[output_num] = channels_per_branch[layer_config->branch];
+ }
+ }
+}
+
+static INLINE int get_start_shift_convolve(int width, int filt_width,
+ int stride) {
+ const int mod = (width % stride);
+ const int filt_off = (filt_width - 1) / 2;
+ const int dif = (mod ? mod - 1 : stride - 1);
+ return AOMMIN((dif + (filt_width % 2)) / 2, filt_off);
+}
+
+void av1_cnn_add_c(float **output, int channels, int width, int height,
+ int stride, const float **add) {
+ for (int c = 0; c < channels; ++c) {
+ for (int i = 0; i < height; ++i)
+ for (int j = 0; j < width; ++j)
+ output[c][i * stride + j] += add[c][i * stride + j];
+ }
+}
+
+void av1_cnn_activate_c(float **output, int channels, int width, int height,
+ int stride, ACTIVATION layer_activation) {
+ if (layer_activation == RELU) {
+ for (int c = 0; c < channels; ++c) {
+ for (int i = 0; i < height; ++i)
+ for (int j = 0; j < width; ++j)
+ output[c][i * stride + j] = relu(output[c][i * stride + j]);
+ }
+ } else if (layer_activation == SOFTSIGN) {
+ for (int c = 0; c < channels; ++c) {
+ for (int i = 0; i < height; ++i)
+ for (int j = 0; j < width; ++j)
+ output[c][i * stride + j] = softsign(output[c][i * stride + j]);
+ }
+ } else if (layer_activation == SIGMOID) {
+ assert(0 && "Sigmoid has not been supported in CNN."); // TO DO
+ } else if (layer_activation != NONE) {
+ assert(0 && "Unknown activation type");
+ }
+}
+
+static bool copy_active_tensor_to_branches(const TENSOR *layer_active_tensor,
+ const CNN_LAYER_CONFIG *layer_config,
+ int branch, TENSOR branch_output[]) {
+ const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->input_to_branches & (1 << b)) && b != branch) {
+      // Copy the layer's active tensor to the output tensor of branch b if b
+      // is set in the mask. That output then becomes the input of the first
+      // layer of branch b (the current layer is not the branch's first layer).
+ int copy_channels = branch_config->channels_to_copy > 0
+ ? branch_config->channels_to_copy
+ : layer_active_tensor->channels;
+ if (!realloc_tensor(&branch_output[b], copy_channels,
+ layer_active_tensor->width,
+ layer_active_tensor->height)) {
+ return false;
+ }
+ copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]);
+ }
+ }
+ return true;
+}
+
+// CNNConvolve for the case where maxpool is 1, skip_width or skip_height is
+// greater than 1, and the padding is PADDING_SAME_ZERO: the convolution is
+// evaluated at every position inside each skip_width x skip_height window and
+// the maximum result is kept.
+static void convolve_maxpool_padding_zero(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ const int cstep, const int filter_width_half,
+ const int filter_height_half) {
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
+ for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
+ for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
+ ++hh) {
+ for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
+ ++ww) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ const int ii = hh + l - filter_height_half;
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int jj = ww + m - filter_width_half;
+ if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
+ continue;
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ const float a = sum;
+ if (h == hh && w == ww)
+ output[i][u * out_stride + v] = a;
+ else
+ output[i][u * out_stride + v] =
+ AOMMAX(output[i][u * out_stride + v], a);
+ }
+ }
+ }
+ }
+ }
+}
+
+// CNNConvolve variant for maxpool set to 1, either skip_width or skip_height
+// greater than 1, and padding equal to PADDING_SAME_REPLICATE.
+static void convolve_maxpool_padding_replicate(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ const int cstep, const int filter_width_half,
+ const int filter_height_half) {
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int h = 0, u = 0; h < in_height; h += layer_config->skip_height, ++u) {
+ for (int w = 0, v = 0; w < in_width; w += layer_config->skip_width, ++v) {
+ for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
+ ++hh) {
+ for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
+ ++ww) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ const int ii =
+ CLAMPINDEX(hh + l - filter_height_half, in_height);
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int jj =
+ CLAMPINDEX(ww + m - filter_width_half, in_width);
+ assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ const float a = sum;
+ if (h == hh && w == ww)
+ output[i][u * out_stride + v] = a;
+ else
+ output[i][u * out_stride + v] =
+ AOMMAX(output[i][u * out_stride + v], a);
+ }
+ }
+ }
+ }
+ }
+}
+
+// CNNConvolve variant for maxpool set to 1, either skip_width or skip_height
+// greater than 1, and padding equal to PADDING_VALID.
+static void convolve_maxpool_padding_valid(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ const int cstep) {
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
+ h += layer_config->skip_height, ++u) {
+ for (int w = 0, v = 0; w < in_width - layer_config->filter_width + 1;
+ w += layer_config->skip_width, ++v) {
+ for (int hh = h; hh < AOMMIN(in_height, h + layer_config->skip_height);
+ ++hh) {
+ for (int ww = w; ww < AOMMIN(in_width, w + layer_config->skip_width);
+ ++ww) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ const int ii = hh + l;
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int jj = ww + m;
+ assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ const float a = sum;
+ if (h == hh && w == ww)
+ output[i][u * out_stride + v] = a;
+ else
+ output[i][u * out_stride + v] =
+ AOMMAX(output[i][u * out_stride + v], a);
+ }
+ }
+ }
+ }
+ }
+}
+
+// CNNConvolve variant for maxpool set to 0, with filter_height and
+// filter_width equal to 1.
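+// A 1x1 filter reduces the convolution to a per-pixel matrix multiply across
+// channels. Note that, unlike the other variants, multithreading here splits
+// the work over output columns: start_idx and step index pixels, not
+// channels.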
+static void convolve_element_wise(const float **input, int in_width,
+ int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config,
+ float **output, int out_stride, int start_idx,
+ int step) {
+ const int start_h = get_start_shift_convolve(
+ in_height, layer_config->filter_height, layer_config->skip_height);
+ const int start_w =
+ get_start_shift_convolve(in_width, layer_config->filter_width,
+ layer_config->skip_width) +
+ start_idx * layer_config->skip_width;
+ const int out_w_step = AOMMAX(step, 1);
+ const int in_w_step = layer_config->skip_width * out_w_step;
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int h = start_h, u = 0; h < in_height;
+ h += layer_config->skip_height, ++u) {
+ const int in_h = h * in_stride;
+ const int out_h = u * out_stride + start_idx;
+ for (int w = start_w, out_index = out_h; w < in_width;
+ w += in_w_step, out_index += out_w_step) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ sum += layer_config->weights[k * layer_config->out_channels + i] *
+ input[k][in_h + w];
+ }
+ output[i][out_index] = sum;
+ }
+ }
+ }
+}
+
+// CNNConvolve variant for maxpool set to 0 and padding equal to
+// PADDING_SAME_ZERO.
+static void convolve_no_maxpool_padding_zero(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ int start_idx, const int cstep, const int filter_width_half,
+ const int filter_height_half, const int ii_shift, const int jj_shift,
+ const int channel_step) {
+ const int start_h = get_start_shift_convolve(
+ in_height, layer_config->filter_height, layer_config->skip_height);
+ const int start_w = get_start_shift_convolve(
+ in_width, layer_config->filter_width, layer_config->skip_width);
+ const int end_ii_shift = filter_height_half + 1;
+ const int end_jj_shift = filter_width_half + 1;
+  // *_filter_margin stores the number of filter pixels along a dimension
+  // that fall in the padded region outside the image (i.e. the intersection
+  // of the filter with the complement of the image in the extended image).
+ const int top_filter_margin = layer_config->filter_width * ii_shift;
+ const int right_filter_margin = end_jj_shift - in_width;
+ for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+ for (int h = start_h, u = 0; h < in_height;
+ h += layer_config->skip_height, ++u) {
+ const int out_h = u * out_stride;
+ const int top_cstep =
+ AOMMAX(0, top_filter_margin - h * layer_config->filter_width) *
+ cstep +
+ i;
+ const int start_ii = AOMMAX(0, h - ii_shift);
+ const int end_ii = AOMMIN(in_height, h + end_ii_shift);
+ for (int w = start_w, out_index = out_h; w < in_width;
+ w += layer_config->skip_width, ++out_index) {
+ const int left_cstep = AOMMAX(0, jj_shift - w) * cstep;
+ const int right_cstep = AOMMAX(0, right_filter_margin + w) * cstep;
+ const int start_jj = AOMMAX(0, w - jj_shift);
+ const int end_jj = AOMMIN(in_width, w + end_jj_shift);
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + top_cstep;
+ for (int ii = start_ii; ii < end_ii; ++ii) {
+ off += left_cstep;
+ for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) {
+ sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
+ }
+ off += right_cstep;
+ }
+ }
+ output[i][out_index] = sum;
+ }
+ }
+ }
+}
+
+// CNNConvolve variant for maxpool set to 0 and padding equal to
+// PADDING_SAME_REPLICATE.
+static void convolve_no_maxpool_padding_replicate(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ int start_idx, const int cstep, const int ii_shift, const int jj_shift,
+ const int channel_step) {
+ // h and w are shifted to an offset coordinate system to reduce in-loop
+ // computation.
+ const int start_h =
+ get_start_shift_convolve(in_height, layer_config->filter_height,
+ layer_config->skip_height) -
+ ii_shift;
+ const int start_w =
+ get_start_shift_convolve(in_width, layer_config->filter_width,
+ layer_config->skip_width) -
+ jj_shift;
+ const int end_h = in_height - ii_shift;
+ const int end_w = in_width - jj_shift;
+ for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+ for (int h = start_h, u = 0; h < end_h;
+ h += layer_config->skip_height, ++u) {
+ const int out_h = u * out_stride;
+ const int upper_ii_index = layer_config->filter_height + h;
+ for (int w = start_w, out_index = out_h; w < end_w;
+ w += layer_config->skip_width, ++out_index) {
+ const int upper_jj_index = layer_config->filter_width + w;
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int ii = h; ii < upper_ii_index; ++ii) {
+ const int clamped_ii = CLAMPINDEX(ii, in_height);
+ for (int jj = w; jj < upper_jj_index; ++jj) {
+ const int clamped_jj = CLAMPINDEX(jj, in_width);
+ assert(clamped_ii >= 0 && clamped_ii < in_height &&
+ clamped_jj >= 0 && clamped_jj < in_width);
+ sum += layer_config->weights[off] *
+ input[k][clamped_ii * in_stride + clamped_jj];
+ off += cstep;
+ }
+ }
+ }
+ output[i][out_index] = sum;
+ }
+ }
+ }
+}
+
+// CNNConvolve variant for maxpool set to 0 and padding equal to
+// PADDING_VALID.
+void av1_cnn_convolve_no_maxpool_padding_valid_c(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
+ int start_idx, int cstep, int channel_step) {
+ assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) ||
+ !layer_config->maxpool);
+ assert(layer_config->filter_height > 1 || layer_config->filter_width > 1);
+ assert(layer_config->pad == PADDING_VALID);
+ for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+ for (int h = 0, u = 0; h < in_height - layer_config->filter_height + 1;
+ h += layer_config->skip_height, ++u) {
+ const int out_h = u * out_stride;
+ const int upper_ii_index = layer_config->filter_height + h;
+ for (int w = 0, out_index = out_h;
+ w < in_width - layer_config->filter_width + 1;
+ w += layer_config->skip_width, ++out_index) {
+ const int upper_jj_index = layer_config->filter_width + w;
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int ii = h; ii < upper_ii_index; ++ii) {
+ for (int jj = w; jj < upper_jj_index; ++jj) {
+ assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+ sum += layer_config->weights[off] * input[k][ii * in_stride + jj];
+ off += cstep;
+ }
+ }
+ }
+ output[i][out_index] = sum;
+ }
+ }
+ }
+}
+
+static void av1_cnn_convolve(const float **input, int in_width, int in_height,
+ int in_stride,
+ const CNN_LAYER_CONFIG *layer_config,
+ float **output, int out_stride, int start_idx,
+ int step) {
+ assert(!layer_config->deconvolve);
+ const int cstep = layer_config->in_channels * layer_config->out_channels;
+ const int filter_height_half = layer_config->filter_height >> 1;
+ const int filter_width_half = layer_config->filter_width >> 1;
+ const int channel_step = AOMMAX(step, 1);
+
+ if (layer_config->maxpool &&
+ (layer_config->skip_height > 1 || layer_config->skip_width > 1)) {
+ switch (layer_config->pad) {
+ case PADDING_SAME_ZERO:
+ convolve_maxpool_padding_zero(input, in_width, in_height, in_stride,
+ layer_config, output, out_stride, cstep,
+ filter_width_half, filter_height_half);
+ break;
+ case PADDING_SAME_REPLICATE:
+ convolve_maxpool_padding_replicate(
+ input, in_width, in_height, in_stride, layer_config, output,
+ out_stride, cstep, filter_width_half, filter_height_half);
+ break;
+ case PADDING_VALID:
+ convolve_maxpool_padding_valid(input, in_width, in_height, in_stride,
+ layer_config, output, out_stride, cstep);
+ break;
+ default: assert(0 && "Unknown padding type");
+ }
+ } else {
+ // Results in element-wise matrix multiplication.
+ if (layer_config->filter_height == 1 && layer_config->filter_width == 1) {
+ convolve_element_wise(input, in_width, in_height, in_stride, layer_config,
+ output, out_stride, start_idx, step);
+ return;
+ }
+ const int ii_shift =
+ filter_height_half - (layer_config->filter_height - 1) % 2;
+ const int jj_shift =
+ filter_width_half - (layer_config->filter_width - 1) % 2;
+ switch (layer_config->pad) {
+ case PADDING_SAME_ZERO:
+ convolve_no_maxpool_padding_zero(
+ input, in_width, in_height, in_stride, layer_config, output,
+ out_stride, start_idx, cstep, filter_width_half, filter_height_half,
+ ii_shift, jj_shift, channel_step);
+ break;
+ case PADDING_SAME_REPLICATE:
+ convolve_no_maxpool_padding_replicate(
+ input, in_width, in_height, in_stride, layer_config, output,
+ out_stride, start_idx, cstep, ii_shift, jj_shift, channel_step);
+ break;
+ case PADDING_VALID:
+ av1_cnn_convolve_no_maxpool_padding_valid(
+ input, in_width, in_height, in_stride, layer_config, output,
+ out_stride, start_idx, cstep, channel_step);
+ break;
+ default: assert(0 && "Unknown padding type");
+ }
+ }
+}
+
+static int convolve_layer(void *arg1, void *arg2) {
+ const CONVOLVE_OPS *convolve_ops = arg1;
+ (void)arg2;
+ av1_cnn_convolve(
+ convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height,
+ convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output,
+ convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step);
+ return 1;
+}
+
+static void convolve_layer_mt(const float **input, int in_width, int in_height,
+ int in_stride,
+ const CNN_LAYER_CONFIG *layer_config,
+ const CNN_THREAD_DATA *thread_data,
+ float **output, int out_stride) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ const int num_workers = thread_data->num_workers;
+ assert(thread_data->workers);
+
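+  // Work is interleaved across workers: worker th handles output channels
+  // (or columns, for 1x1 filters) th, th + num_workers, th + 2 * num_workers,
+  // and so on.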
+ CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS];
+ for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
+ AVxWorker *const worker = &thread_data->workers[th];
+ winterface->reset(worker);
+
+ CONVOLVE_OPS convolve_op = { input, in_width, in_height,
+ in_stride, layer_config, output,
+ out_stride, th, num_workers };
+ convolve_ops[th] = convolve_op;
+ worker->hook = convolve_layer;
+ worker->data1 = &(convolve_ops[th]);
+ worker->data2 = NULL;
+
+ // Start convolving.
+ if (th == num_workers - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+ }
+
+ // Wait until all workers have finished.
+ for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) {
+ winterface->sync(&thread_data->workers[th]);
+ }
+}
+
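+// Start offset of the deconvolution filter, e.g. filt_width = 5, stride = 2
+// gives AOMMAX(5 - 2, 0) / 2 = 1.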
+static INLINE int get_start_shift_deconvolve(int filt_width, int stride) {
+ const int dif = AOMMAX(filt_width - stride, 0);
+ return dif / 2;
+}
+
+void av1_cnn_batchnorm_c(float **image, int channels, int width, int height,
+ int stride, const float *gamma, const float *beta,
+ const float *mean, const float *std) {
+  assert(gamma && beta && mean && std && "batchnorm has null parameter!");
+ for (int ch = 0; ch < channels; ch++) {
+ const float ch_gamma = gamma[ch];
+ const float ch_beta = beta[ch];
+ const float ch_mean = mean[ch];
+ const float ch_std = std[ch];
+ float *image_row = image[ch];
+
+ for (int row = 0; row < height; row++) {
+ for (int col = 0; col < width; col++) {
+ image_row[col] =
+ ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta;
+ }
+ image_row += stride;
+ }
+ }
+}
+
+void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height,
+ int in_stride, const CNN_LAYER_CONFIG *layer_config,
+ float **output, int out_stride) {
+ assert(layer_config->deconvolve);
+
+ const int cstep = layer_config->in_channels * layer_config->out_channels;
+
+ int out_width = 0;
+ int out_height = 0;
+ av1_find_cnn_layer_output_size(in_width, in_height, layer_config, &out_width,
+ &out_height);
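+  // Transposed convolution: each output pixel (u, v) accumulates only the
+  // filter taps whose back-projected positions land on the integer input
+  // grid, i.e. where the shifted coordinates h and w are divisible by
+  // skip_height and skip_width respectively.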
+ switch (layer_config->pad) {
+ case PADDING_SAME_ZERO:
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int u = 0; u < out_height; ++u) {
+ for (int v = 0; v < out_width; ++v) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ const int h =
+ u - l +
+ get_start_shift_deconvolve(layer_config->filter_height,
+ layer_config->skip_height);
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int w =
+ v - m +
+ get_start_shift_deconvolve(layer_config->filter_width,
+ layer_config->skip_width);
+ if ((h % layer_config->skip_height) != 0 ||
+ (w % layer_config->skip_width) != 0)
+ continue;
+ const int ii = h / layer_config->skip_height;
+ const int jj = w / layer_config->skip_width;
+ if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
+ continue;
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ output[i][u * out_stride + v] = sum;
+ }
+ }
+ }
+ break;
+ case PADDING_SAME_REPLICATE:
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int u = 0; u < out_height; ++u) {
+ for (int v = 0; v < out_width; ++v) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ const int h =
+ u - l +
+ get_start_shift_deconvolve(layer_config->filter_height,
+ layer_config->skip_height);
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int w =
+ v - m +
+ get_start_shift_deconvolve(layer_config->filter_width,
+ layer_config->skip_width);
+ if ((h % layer_config->skip_height) != 0 ||
+ (w % layer_config->skip_width) != 0)
+ continue;
+ const int ii =
+ CLAMPINDEX(h / layer_config->skip_height, in_height);
+ const int jj =
+ CLAMPINDEX(w / layer_config->skip_width, in_width);
+ assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width);
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ output[i][u * out_stride + v] = sum;
+ }
+ }
+ }
+ break;
+ case PADDING_VALID:
+ for (int i = 0; i < layer_config->out_channels; ++i) {
+ for (int u = 0; u < out_height; ++u) {
+ for (int v = 0; v < out_width; ++v) {
+ float sum = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ int off = k * layer_config->out_channels + i;
+ for (int l = 0; l < layer_config->filter_height; ++l) {
+ const int h = u - l;
+ for (int m = 0; m < layer_config->filter_width;
+ ++m, off += cstep) {
+ const int w = v - m;
+ if ((h % layer_config->skip_height) != 0 ||
+ (w % layer_config->skip_width) != 0)
+ continue;
+ const int ii = h / layer_config->skip_height;
+ const int jj = w / layer_config->skip_width;
+ if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width)
+ continue;
+ sum += layer_config->weights[off] *
+ input[k][ii * in_stride + jj];
+ }
+ }
+ }
+ output[i][u * out_stride + v] = sum;
+ }
+ }
+ }
+ break;
+ default: assert(0 && "Unknown padding type");
+ }
+}
+
+bool av1_cnn_predict_c(const float **input, int in_width, int in_height,
+ int in_stride, const CNN_CONFIG *cnn_config,
+ const CNN_THREAD_DATA *thread_data,
+ CNN_MULTI_OUT *output_struct) {
+ bool success = false;
+ TENSOR tensor1[CNN_MAX_BRANCHES] = { { 0 } };
+ TENSOR tensor2[CNN_MAX_BRANCHES] = { { 0 } };
+
+ float **output[CNN_MAX_BRANCHES];
+ const int *out_chs = output_struct->output_channels;
+ output[0] = output_struct->output_buffer;
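+  // output_buffer packs one channel-plane pointer per channel for all
+  // outputs back to back, so each output's pointer array starts right after
+  // the previous output's channels.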
+ for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) {
+ output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1];
+ }
+
+ int i_width = in_width;
+ int i_height = in_height;
+ int o_width = 0, o_height = 0;
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ init_tensor(&tensor1[b]);
+ init_tensor(&tensor2[b]);
+ }
+
+ const int *out_stride = output_struct->output_strides;
+ for (int layer = 0; layer < cnn_config->num_layers; ++layer) {
+ const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer];
+ const int branch = layer_config->branch;
+ const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config;
+
+ // Allocate input tensor
+ if (layer == 0) { // First layer
+ assert(branch == 0); // First layer must be primary branch
+ assign_tensor(&tensor1[branch], (float **)input,
+ layer_config->in_channels, in_width, in_height, in_stride);
+ } else { // Non-first layer
+ // Swap tensor1 and tensor2
+ swap_tensor(&tensor1[branch], &tensor2[branch]);
+
+ i_width = tensor1[branch].width;
+ i_height = tensor1[branch].height;
+ }
+
+ // Allocate output tensor
+ av1_find_cnn_layer_output_size(i_width, i_height, layer_config, &o_width,
+ &o_height);
+ const int output_num = layer_config->output_num;
+ if (output_num == -1) { // Non-output layer
+ if (!realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width,
+ o_height)) {
+ goto Error;
+ }
+ } else { // Output layer
+ free_tensor(&tensor2[branch]);
+ assign_tensor(&tensor2[branch], output[output_num],
+ layer_config->out_channels, o_width, o_height,
+ out_stride[output_num]);
+ }
+
+    // If we are combining branches, make sure that the branch to combine
+ // is different from the current branch.
+ assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC,
+ !(branch_config->branches_to_combine & (1 << branch))));
+
+ if (layer_config->branch_copy_type == BRANCH_INPUT) {
+ if (!copy_active_tensor_to_branches(&tensor1[branch], layer_config,
+ branch, tensor2)) {
+ goto Error;
+ }
+ }
+ // Check consistency of input and output channels
+ assert(tensor1[branch].channels == layer_config->in_channels);
+ assert(tensor2[branch].channels == layer_config->out_channels);
+
+ // Convolve/Deconvolve
+ if (!cnn_config->layer_config[layer].deconvolve) {
+ if (thread_data->num_workers > 1) {
+ convolve_layer_mt((const float **)tensor1[branch].buf,
+ tensor1[branch].width, tensor1[branch].height,
+ tensor1[branch].stride, layer_config, thread_data,
+ tensor2[branch].buf, tensor2[branch].stride);
+ } else {
+ av1_cnn_convolve((const float **)tensor1[branch].buf,
+ tensor1[branch].width, tensor1[branch].height,
+ tensor1[branch].stride, layer_config,
+ tensor2[branch].buf, tensor2[branch].stride, 0, 1);
+ }
+ } else {
+ av1_cnn_deconvolve((const float **)tensor1[branch].buf,
+ tensor1[branch].width, tensor1[branch].height,
+ tensor1[branch].stride, layer_config,
+ tensor2[branch].buf, tensor2[branch].stride);
+ }
+
+ if (layer_config->branch_copy_type == BRANCH_OUTPUT) {
+ if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
+ branch, tensor2)) {
+ goto Error;
+ }
+ }
+
+ // Add tensors from other branches if needed
+ if (layer_config->branch_combine_type == BRANCH_ADD) {
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
+ assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch]));
+ av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels,
+ tensor2[branch].width, tensor2[branch].height,
+ tensor2[branch].stride, (const float **)tensor2[b].buf);
+ }
+ }
+ }
+
+ // Non-linearity
+ av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels,
+ tensor2[branch].width, tensor2[branch].height,
+ tensor2[branch].stride, layer_config->activation);
+
+ if (layer_config->bn_params.bn_gamma) {
+ av1_cnn_batchnorm(
+ tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width,
+ tensor2[branch].height, tensor2[branch].stride,
+ layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta,
+ layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std);
+ }
+
+ // Concatenate tensors
+ if (layer_config->branch_combine_type == BRANCH_CAT) {
+ if (output_num == -1) { // Non-output layer
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
+ assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
+ assert(tensor2[b].channels > 0);
+ if (!concat_tensor(&tensor2[b], &tensor2[branch])) goto Error;
+ }
+ }
+ } else { // Output layer
+ const int existing_channels = tensor2[branch].channels;
+ int num_chs = existing_channels;
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
+ assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
+ // Needed only to assign the new channel buffers
+ num_chs += tensor2[b].channels;
+ }
+ }
+ assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width,
+ o_height, out_stride[output_num]);
+
+ num_chs = existing_channels;
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ if ((branch_config->branches_to_combine & (1 << b)) && b != branch) {
+ assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch]));
+            // Copy the channels of branch b into the concatenated output.
+ copy_tensor(&tensor2[b], tensor2[b].channels, num_chs,
+ &tensor2[branch]);
+ num_chs += tensor2[b].channels;
+ }
+ }
+ }
+ }
+
+ if (layer_config->branch_copy_type == BRANCH_COMBINED) {
+ if (!copy_active_tensor_to_branches(&tensor2[branch], layer_config,
+ branch, tensor2)) {
+ goto Error;
+ }
+ }
+ }
+
+ success = true;
+Error:
+ for (int b = 0; b < CNN_MAX_BRANCHES; ++b) {
+ free_tensor(&tensor1[b]);
+ free_tensor(&tensor2[b]);
+ }
+ return success;
+}
+
+// Assumes the output has already been properly allocated.
+// Assumes all input image buffers have the same resolution and stride.
+bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
+ int stride, const CNN_CONFIG *cnn_config,
+ const CNN_THREAD_DATA *thread_data,
+ CNN_MULTI_OUT *output) {
+ const float max_val = 255.0;
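+  // Input pixels are scaled by 1 / max_val into [0, 1] floats before being
+  // fed to the network.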
+
+ const int in_width = width + 2 * cnn_config->ext_width;
+ const int in_height = height + 2 * cnn_config->ext_height;
+ const int in_channels = cnn_config->layer_config[0].in_channels;
+ float *inputs[CNN_MAX_CHANNELS];
+ float *input_ =
+ (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
+ if (!input_) return false;
+ const int in_stride = in_width;
+
+ for (int c = 0; c < in_channels; ++c) {
+ inputs[c] = input_ + c * in_stride * in_height;
+ float *input =
+ inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;
+
+ if (cnn_config->strict_bounds) {
+ for (int i = 0; i < height; ++i)
+ for (int j = 0; j < width; ++j)
+ input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+ // extend left and right
+ for (int i = 0; i < height; ++i) {
+ for (int j = -cnn_config->ext_width; j < 0; ++j)
+ input[i * in_stride + j] = input[i * in_stride];
+ for (int j = width; j < width + cnn_config->ext_width; ++j)
+ input[i * in_stride + j] = input[i * in_stride + width - 1];
+ }
+ // extend top and bottom
+ for (int i = -cnn_config->ext_height; i < 0; ++i)
+ memcpy(&input[i * in_stride - cnn_config->ext_width],
+ &input[-cnn_config->ext_width], in_width * sizeof(*input));
+ for (int i = height; i < height + cnn_config->ext_height; ++i)
+ memcpy(&input[i * in_stride - cnn_config->ext_width],
+ &input[(height - 1) * in_stride - cnn_config->ext_width],
+ in_width * sizeof(*input));
+ } else {
+ for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
+ ++i)
+ for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
+ ++j)
+ input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+ }
+ }
+ bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
+ in_stride, cnn_config, thread_data, output);
+
+ aom_free(input_);
+ return success;
+}
+
+// Assumes the output has already been properly allocated.
+// Assumes all input image buffers have the same resolution and stride.
+bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
+ int stride,
+ const CNN_CONFIG *cnn_config,
+ const CNN_THREAD_DATA *thread_data,
+ int bit_depth,
+ CNN_MULTI_OUT *output) {
+ const float max_val = (float)((1 << bit_depth) - 1);
+
+ const int in_width = width + 2 * cnn_config->ext_width;
+ const int in_height = height + 2 * cnn_config->ext_height;
+ const int in_channels = cnn_config->layer_config[0].in_channels;
+ float *inputs[CNN_MAX_CHANNELS];
+ float *input_ =
+ (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_));
+ if (!input_) return false;
+ const int in_stride = in_width;
+
+ for (int c = 0; c < in_channels; ++c) {
+ inputs[c] = input_ + c * in_stride * in_height;
+ float *input =
+ inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width;
+
+ if (cnn_config->strict_bounds) {
+ for (int i = 0; i < height; ++i)
+ for (int j = 0; j < width; ++j)
+ input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+ // extend left and right
+ for (int i = 0; i < height; ++i) {
+ for (int j = -cnn_config->ext_width; j < 0; ++j)
+ input[i * in_stride + j] = input[i * in_stride];
+ for (int j = width; j < width + cnn_config->ext_width; ++j)
+ input[i * in_stride + j] = input[i * in_stride + width - 1];
+ }
+ // extend top and bottom
+ for (int i = -cnn_config->ext_height; i < 0; ++i)
+ memcpy(&input[i * in_stride - cnn_config->ext_width],
+ &input[-cnn_config->ext_width], in_width * sizeof(*input));
+ for (int i = height; i < height + cnn_config->ext_height; ++i)
+ memcpy(&input[i * in_stride - cnn_config->ext_width],
+ &input[(height - 1) * in_stride - cnn_config->ext_width],
+ in_width * sizeof(*input));
+ } else {
+ for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height;
+ ++i)
+ for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width;
+ ++j)
+ input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val;
+ }
+ }
+
+ bool success = av1_cnn_predict((const float **)inputs, in_width, in_height,
+ in_stride, cnn_config, thread_data, output);
+
+ aom_free(input_);
+ return success;
+}
diff --git a/third_party/aom/av1/encoder/cnn.h b/third_party/aom/av1/encoder/cnn.h
new file mode 100644
index 0000000000..df6401f73f
--- /dev/null
+++ b/third_party/aom/av1/encoder/cnn.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_CNN_H_
+#define AOM_AV1_ENCODER_CNN_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <math.h>
+#include <stdbool.h>
+
+#include "aom_util/aom_thread.h"
+#include "config/av1_rtcd.h"
+
+struct AV1Common;
+
+#define CNN_MAX_HIDDEN_LAYERS 64
+#define CNN_MAX_LAYERS (CNN_MAX_HIDDEN_LAYERS + 1)
+#define CNN_MAX_CHANNELS 256
+#define CNN_MAX_BRANCHES 4
+#define CNN_MAX_THREADS 32
+
+#define NO_BRANCH_CONFIG \
+ { 0, 0, 0 }
+#define NO_BN_PARAMS \
+ { NULL, NULL, NULL, NULL }
+
+enum {
+ PADDING_SAME_ZERO, // tensorflow's SAME padding with pixels outside
+ // the image area assumed to be 0 (default)
+ PADDING_SAME_REPLICATE, // tensorflow's SAME padding with pixels outside
+ // the image area replicated from closest edge
+ PADDING_VALID // tensorflow's VALID padding
+} UENUM1BYTE(PADDING_TYPE);
+
+// Times when input tensor may be copied to branches given in input_to_branches.
+// BRANCH_NO_COPY: doesn't copy any tensor.
+// BRANCH_INPUT: copies the input tensor to branches.
+// BRANCH_OUTPUT: copies the convolved tensor to branches.
+// BRANCH_COMBINED: copies the combined (after convolving and branch combining)
+//     tensor. If no combination happens at this layer, then this option
+//     has the same effect as BRANCH_OUTPUT.
+enum {
+ BRANCH_NO_COPY,
+ BRANCH_INPUT,
+ BRANCH_OUTPUT,
+ BRANCH_COMBINED
+} UENUM1BYTE(BRANCH_COPY);
+
+// Types of combining branches with output of current layer:
+// BRANCH_NOC: no branch combining
+// BRANCH_ADD: Add previously stored branch tensor to output of layer
+// BRANCH_CAT: Concatenate branch tensor to output of layer
+enum { BRANCH_NOC, BRANCH_ADD, BRANCH_CAT } UENUM1BYTE(BRANCH_COMBINE);
+
+// The parameters used to scale each channel in batch
+// normalization. The processing is done on a per-channel basis,
+// e.g. bn_mean[c] is the mean for all pixels in channel c. This
+// is always applied after activation. The output is given by
+// out[c,i,j] = norm[c,i,j] * bn_gamma[c] + bn_beta[c], where
+// norm[c,i,j] = (in[c,i,j] - bn_mean[c]) / bn_std[c].
+// Here we assume that the effect of variance_epsilon is already
+// taken into account when bn_std is calculated. The pointers
+// need to be either all null or all valid. If all null, then
+// batchnorm is disabled; otherwise batchnorm is applied.
+struct CNN_BATCHNORM_PARAMS {
+ const float *bn_gamma;
+ const float *bn_beta;
+ const float *bn_mean;
+ const float *bn_std;
+};
+
+struct CNN_BRANCH_CONFIG {
+ int input_to_branches; // If nonzero, copy the active tensor to the current
+ // layer and store for future use in branches
+ // specified in the field as a binary mask. For
+ // example, if input_to_branch = 0x06, it means the
+ // input tensor to the current branch is copied to
+ // branches 1 and 2 (where 0 represents the primary
+ // branch). One restriction is that the mask
+ // cannot indicate copying to the current branch.
+ // If greater than 0, only copies the channels up
+ // to the given index.
+ int channels_to_copy; // Within the layer, input a copy of active
+ // tensor to branches given in input_to_branches.
+  int branches_to_combine; // mask of branches to combine with the output of
+                           // the current layer, if
+                           // branch_combine_type != BRANCH_NOC.
+                           // For example, if branches_to_combine = 0x0A,
+                           // it means that branches 1 and 3 are combined
+                           // with the current branch.
+};
+
+struct CNN_LAYER_CONFIG {
+ int in_channels;
+ int filter_width;
+ int filter_height;
+ int out_channels;
+ int skip_width;
+ int skip_height;
+  int maxpool; // whether to use maxpool or not (only effective when
+               // skip_width or skip_height is > 1)
+  const float *weights; // array of length filter_height x filter_width x
+                        // in_channels x out_channels, where the innermost
+                        // scan is out_channels and the outermost scan is
+ // filter_height.
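+                        // Equivalently, the weight for input channel k,
+                        // output channel i and filter tap (l, m) is at
+                        // linear index
+                        // ((l * filter_width + m) * in_channels + k) *
+                        //     out_channels + i.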
+ const float *bias; // array of length out_channels
+ PADDING_TYPE pad; // padding type
+ ACTIVATION activation; // the activation function to use after convolution
+ int deconvolve; // whether this is a deconvolution layer.
+ // 0: If skip_width or skip_height are > 1, then we
+ // reduce resolution
+ // 1: If skip_width or skip_height are > 1, then we
+ // increase resolution
+ int branch; // branch index in [0, CNN_MAX_BRANCHES - 1], where
+ // 0 refers to the primary branch.
+ BRANCH_COPY branch_copy_type;
+ BRANCH_COMBINE branch_combine_type;
+ struct CNN_BRANCH_CONFIG branch_config;
+ struct CNN_BATCHNORM_PARAMS
+ bn_params; // A struct that contains the parameters
+ // used for batch normalization.
+ int output_num; // The output buffer idx to which the layer output is
+ // written. Set to -1 to disable writing it to the output. In
+ // the case that branch_combine_type is BRANCH_CAT, all
+ // concatenated channels will be written to output. In the
+ // case of BRANCH_ADD, the output will be the result of
+ // summation.
+};
+
+struct CNN_CONFIG {
+ int num_layers; // number of CNN layers ( = number of hidden layers + 1)
+ int is_residue; // whether the output activation is a residue
+ int ext_width, ext_height; // extension horizontally and vertically
+ int strict_bounds; // whether the input bounds are strict or not.
+ // If strict, the extension area is filled by
+ // replication; if not strict, image data is
+ // assumed available beyond the bounds.
+ CNN_LAYER_CONFIG layer_config[CNN_MAX_LAYERS];
+};
+
+struct CNN_THREAD_DATA {
+ int num_workers;
+ AVxWorker *workers;
+};
+
+struct CNN_MULTI_OUT {
+ int num_outputs;
+ const int *output_channels;
+ const int *output_strides;
+ float **output_buffer;
+};
+
+// Function to return size of output
+void av1_find_cnn_output_size(int in_width, int in_height,
+ const CNN_CONFIG *cnn_config, int *out_width,
+ int *out_height, int *out_channels);
+
+// Function to return output width and output height of given layer.
+void av1_find_cnn_layer_output_size(int in_width, int in_height,
+ const CNN_LAYER_CONFIG *layer_config,
+ int *out_width, int *out_height);
+
+// Prediction functions from a set of input image buffers. These functions
+// support CNNs with multiple outputs.
+bool av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
+ int stride, const CNN_CONFIG *cnn_config,
+ const CNN_THREAD_DATA *thread_data,
+ struct CNN_MULTI_OUT *output);
+bool av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
+ int stride,
+ const CNN_CONFIG *cnn_config,
+ const CNN_THREAD_DATA *thread_data,
+ int bit_depth, CNN_MULTI_OUT *output);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_CNN_H_
diff --git a/third_party/aom/av1/encoder/compound_type.c b/third_party/aom/av1/encoder/compound_type.c
new file mode 100644
index 0000000000..3b0ee88241
--- /dev/null
+++ b/third_party/aom/av1/encoder/compound_type.c
@@ -0,0 +1,1678 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/pred_common.h"
+#include "av1/encoder/compound_type.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tx_search.h"
+
+typedef int64_t (*pick_interinter_mask_type)(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
+ const uint8_t *const p0, const uint8_t *const p1,
+ const int16_t *const residual1, const int16_t *const diff10,
+ uint64_t *best_sse);
+
+// Checks if characteristics of search match
+static INLINE int is_comp_rd_match(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ const COMP_RD_STATS *st,
+ const MB_MODE_INFO *const mi,
+ int32_t *comp_rate, int64_t *comp_dist,
+ int32_t *comp_model_rate,
+ int64_t *comp_model_dist, int *comp_rs2) {
+ // TODO(ranjit): Ensure that compound type search use regular filter always
+ // and check if following check can be removed
+ // Check if interp filter matches with previous case
+ if (st->filter.as_int != mi->interp_filters.as_int) return 0;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ // Match MV and reference indices
+ for (int i = 0; i < 2; ++i) {
+ if ((st->ref_frames[i] != mi->ref_frame[i]) ||
+ (st->mv[i].as_int != mi->mv[i].as_int)) {
+ return 0;
+ }
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[i]];
+ if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0;
+ }
+
+ int reuse_data[COMPOUND_TYPES] = { 1, 1, 0, 0 };
+ // For compound wedge, reuse data if newmv search is disabled when NEWMV is
+ // present or if NEWMV is not present in either of the directions
+ if ((!have_newmv_in_inter_mode(mi->mode) &&
+ !have_newmv_in_inter_mode(st->mode)) ||
+ (cpi->sf.inter_sf.disable_interinter_wedge_newmv_search))
+ reuse_data[COMPOUND_WEDGE] = 1;
+ // For compound diffwtd, reuse data if fast search is enabled (no newmv search
+ // when NEWMV is present) or if NEWMV is not present in either of the
+ // directions
+ if (cpi->sf.inter_sf.enable_fast_compound_mode_search ||
+ (!have_newmv_in_inter_mode(mi->mode) &&
+ !have_newmv_in_inter_mode(st->mode)))
+ reuse_data[COMPOUND_DIFFWTD] = 1;
+
+ // Store the stats for the different compound types
+ for (int comp_type = COMPOUND_AVERAGE; comp_type < COMPOUND_TYPES;
+ comp_type++) {
+ if (reuse_data[comp_type]) {
+ comp_rate[comp_type] = st->rate[comp_type];
+ comp_dist[comp_type] = st->dist[comp_type];
+ comp_model_rate[comp_type] = st->model_rate[comp_type];
+ comp_model_dist[comp_type] = st->model_dist[comp_type];
+ comp_rs2[comp_type] = st->comp_rs2[comp_type];
+ }
+ }
+ return 1;
+}
+
+// Checks if similar compound type search case is accounted earlier
+// If found, returns relevant rd data
+static INLINE int find_comp_rd_in_stats(const AV1_COMP *const cpi,
+ const MACROBLOCK *x,
+ const MB_MODE_INFO *const mbmi,
+ int32_t *comp_rate, int64_t *comp_dist,
+ int32_t *comp_model_rate,
+ int64_t *comp_model_dist, int *comp_rs2,
+ int *match_index) {
+ for (int j = 0; j < x->comp_rd_stats_idx; ++j) {
+ if (is_comp_rd_match(cpi, x, &x->comp_rd_stats[j], mbmi, comp_rate,
+ comp_dist, comp_model_rate, comp_model_dist,
+ comp_rs2)) {
+ *match_index = j;
+ return 1;
+ }
+ }
+ return 0; // no match result found
+}
+
+static INLINE bool enable_wedge_search(
+ MACROBLOCK *const x, const unsigned int disable_wedge_var_thresh) {
+  // Enable wedge search if the source variance is above the threshold.
+ return x->source_variance > disable_wedge_var_thresh;
+}
+
+static INLINE bool enable_wedge_interinter_search(MACROBLOCK *const x,
+ const AV1_COMP *const cpi) {
+ return enable_wedge_search(
+ x, cpi->sf.inter_sf.disable_interinter_wedge_var_thresh) &&
+ cpi->oxcf.comp_type_cfg.enable_interinter_wedge;
+}
+
+static INLINE bool enable_wedge_interintra_search(MACROBLOCK *const x,
+ const AV1_COMP *const cpi) {
+ return enable_wedge_search(
+ x, cpi->sf.inter_sf.disable_interintra_wedge_var_thresh) &&
+ cpi->oxcf.comp_type_cfg.enable_interintra_wedge;
+}
+
+static int8_t estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
+ const BLOCK_SIZE bsize, const uint8_t *pred0,
+ int stride0, const uint8_t *pred1,
+ int stride1) {
+ static const BLOCK_SIZE split_qtr[BLOCK_SIZES_ALL] = {
+ // 4X4
+ BLOCK_INVALID,
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+ // 8X16, 16X8, 16X16
+ BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
+ // 16X32, 32X16, 32X32
+ BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
+ // 32X64, 64X32, 64X64
+ BLOCK_16X32, BLOCK_32X16, BLOCK_32X32,
+ // 64x128, 128x64, 128x128
+ BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
+ // 4X16, 16X4, 8X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+ // 32X8, 16X64, 64X16
+ BLOCK_16X4, BLOCK_8X32, BLOCK_32X8
+ };
+ const struct macroblock_plane *const p = &x->plane[0];
+ const uint8_t *src = p->src.buf;
+ int src_stride = p->src.stride;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int bw_by2 = bw >> 1;
+ const int bh_by2 = bh >> 1;
+ uint32_t esq[2][2];
+ int64_t tl, br;
+
+ const BLOCK_SIZE f_index = split_qtr[bsize];
+ assert(f_index != BLOCK_INVALID);
+
+ if (is_cur_buf_hbd(&x->e_mbd)) {
+ pred0 = CONVERT_TO_BYTEPTR(pred0);
+ pred1 = CONVERT_TO_BYTEPTR(pred1);
+ }
+
+  // Residual variance computation over relevant quadrants in order to
+ // find TL + BR, TL = sum(1st,2nd,3rd) quadrants of (pred0 - pred1),
+ // BR = sum(2nd,3rd,4th) quadrants of (pred1 - pred0)
+ // The 2nd and 3rd quadrants cancel out in TL + BR
+ // Hence TL + BR = 1st quadrant of (pred0-pred1) + 4th of (pred1-pred0)
+ // TODO(nithya): Sign estimation assumes 45 degrees (1st and 4th quadrants)
+ // for all codebooks; experiment with other quadrant combinations for
+ // 0, 90 and 135 degrees also.
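+  // A positive tl + br indicates pred1 fits the source better in the
+  // top-left and pred0 in the bottom-right; the returned sign selects that
+  // wedge orientation.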
+ cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
+ cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+ pred0 + bh_by2 * stride0 + bw_by2, stride0,
+ &esq[0][1]);
+ cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
+  cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+                               pred1 + bh_by2 * stride1 + bw_by2, stride1,
+                               &esq[1][1]);
+
+ tl = ((int64_t)esq[0][0]) - ((int64_t)esq[1][0]);
+ br = ((int64_t)esq[1][1]) - ((int64_t)esq[0][1]);
+ return (tl + br > 0);
+}
+
+// Choose the best wedge index and sign
+static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const uint8_t *const p0,
+ const int16_t *const residual1,
+ const int16_t *const diff10,
+ int8_t *const best_wedge_sign,
+ int8_t *const best_wedge_index, uint64_t *best_sse) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = bw * bh;
+ assert(N >= 64);
+ int rate;
+ int64_t dist;
+ int64_t rd, best_rd = INT64_MAX;
+ int8_t wedge_index;
+ int8_t wedge_sign;
+ const int8_t wedge_types = get_wedge_types_lookup(bsize);
+ const uint8_t *mask;
+ uint64_t sse;
+ const int hbd = is_cur_buf_hbd(xd);
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+
+ DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]); // src - pred0
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (hbd) {
+ aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p0), bw);
+ } else {
+ aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
+ }
+#else
+ (void)hbd;
+ aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
+#endif
+
+ int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) -
+ (int64_t)aom_sum_squares_i16(residual1, N)) *
+ (1 << WEDGE_WEIGHT_BITS) / 2;
+ int16_t *ds = residual0;
+
+ av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
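+  // ds[i] now holds residual0[i]^2 - residual1[i]^2 (ds aliases residual0);
+  // av1_wedge_sign_from_residuals compares the mask-weighted sum of these
+  // deltas against sign_limit to pick the sign with the smaller masked SSE.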
+
+ for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
+
+ wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
+
+ mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
+
+ rate += x->mode_costs.wedge_idx_cost[bsize][wedge_index];
+ rd = RDCOST(x->rdmult, rate, dist);
+
+ if (rd < best_rd) {
+ *best_wedge_index = wedge_index;
+ *best_wedge_sign = wedge_sign;
+ best_rd = rd;
+ *best_sse = sse;
+ }
+ }
+
+ return best_rd -
+ RDCOST(x->rdmult,
+ x->mode_costs.wedge_idx_cost[bsize][*best_wedge_index], 0);
+}
+
+// Choose the best wedge index for the specified sign
+static int64_t pick_wedge_fixed_sign(
+ const AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const int16_t *const residual1,
+ const int16_t *const diff10, const int8_t wedge_sign,
+ int8_t *const best_wedge_index, uint64_t *best_sse) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = bw * bh;
+ assert(N >= 64);
+ int rate;
+ int64_t dist;
+ int64_t rd, best_rd = INT64_MAX;
+ int8_t wedge_index;
+ const int8_t wedge_types = get_wedge_types_lookup(bsize);
+ const uint8_t *mask;
+ uint64_t sse;
+ const int hbd = is_cur_buf_hbd(xd);
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+ for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
+ rate += x->mode_costs.wedge_idx_cost[bsize][wedge_index];
+ rd = RDCOST(x->rdmult, rate, dist);
+
+ if (rd < best_rd) {
+ *best_wedge_index = wedge_index;
+ best_rd = rd;
+ *best_sse = sse;
+ }
+ }
+ return best_rd -
+ RDCOST(x->rdmult,
+ x->mode_costs.wedge_idx_cost[bsize][*best_wedge_index], 0);
+}
+
+static int64_t pick_interinter_wedge(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize,
+ const uint8_t *const p0, const uint8_t *const p1,
+ const int16_t *const residual1, const int16_t *const diff10,
+ uint64_t *best_sse) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int bw = block_size_wide[bsize];
+
+ int64_t rd;
+ int8_t wedge_index = -1;
+ int8_t wedge_sign = 0;
+
+ assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+ assert(cpi->common.seq_params->enable_masked_compound);
+
+ if (cpi->sf.inter_sf.fast_wedge_sign_estimate) {
+ wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw);
+ rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, wedge_sign,
+ &wedge_index, best_sse);
+ } else {
+ rd = pick_wedge(cpi, x, bsize, p0, residual1, diff10, &wedge_sign,
+ &wedge_index, best_sse);
+ }
+
+ mbmi->interinter_comp.wedge_sign = wedge_sign;
+ mbmi->interinter_comp.wedge_index = wedge_index;
+ return rd;
+}
+
+static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, const BLOCK_SIZE bsize,
+ const uint8_t *const p0,
+ const uint8_t *const p1,
+ const int16_t *const residual1,
+ const int16_t *const diff10,
+ uint64_t *best_sse) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = 1 << num_pels_log2_lookup[bsize];
+ int rate;
+ int64_t dist;
+ DIFFWTD_MASK_TYPE cur_mask_type;
+ int64_t best_rd = INT64_MAX;
+ DIFFWTD_MASK_TYPE best_mask_type = 0;
+ const int hbd = is_cur_buf_hbd(xd);
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+ DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
+ uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask };
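+  // DIFFWTD_38 is built directly into xd->seg_mask and DIFFWTD_38_INV into
+  // the local buffer, so the local mask is copied back only if the inverse
+  // type wins.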
+ // try each mask type and its inverse
+ for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) {
+ // build mask and inverse
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (hbd)
+ av1_build_compound_diffwtd_mask_highbd(
+ tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
+ CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd);
+ else
+ av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type,
+ p0, bw, p1, bw, bh, bw);
+#else
+ (void)hbd;
+ av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type, p0,
+ bw, p1, bw, bh, bw);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+ // compute rd for mask
+ uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10,
+ tmp_mask[cur_mask_type], N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
+ const int64_t rd0 = RDCOST(x->rdmult, rate, dist);
+
+ if (rd0 < best_rd) {
+ best_mask_type = cur_mask_type;
+ best_rd = rd0;
+ *best_sse = sse;
+ }
+ }
+ mbmi->interinter_comp.mask_type = best_mask_type;
+ if (best_mask_type == DIFFWTD_38_INV) {
+ memcpy(xd->seg_mask, seg_mask, N * 2);
+ }
+ return best_rd;
+}
+
+static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize,
+ const uint8_t *const p0,
+ const uint8_t *const p1) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(av1_is_wedge_used(bsize));
+ assert(cpi->common.seq_params->enable_interintra_compound);
+
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1
+ DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p1), bw);
+ aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+ CONVERT_TO_BYTEPTR(p0), bw);
+ } else {
+ aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw);
+ aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw);
+ }
+#else
+ aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw);
+ aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw);
+#endif
+ int8_t wedge_index = -1;
+ uint64_t sse;
+ int64_t rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, 0,
+ &wedge_index, &sse);
+
+ mbmi->interintra_wedge_index = wedge_index;
+ return rd;
+}
+
+static AOM_INLINE void get_inter_predictors_masked_compound(
+ MACROBLOCK *x, const BLOCK_SIZE bsize, uint8_t **preds0, uint8_t **preds1,
+ int16_t *residual1, int16_t *diff10, int *strides) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ // get inter predictors to use for masked compound modes
+ av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0, preds0,
+ strides);
+ av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1, preds1,
+ strides);
+ const struct buf_2d *const src = &x->plane[0].src;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(*preds1), bw);
+ aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1),
+ bw, CONVERT_TO_BYTEPTR(*preds0), bw);
+ } else {
+ aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1,
+ bw);
+ aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw);
+ }
+#else
+ aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, bw);
+ aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw);
+#endif
+}
+
+// Computes the rd cost for the given interintra mode and updates the best
+static INLINE void compute_best_interintra_mode(
+ const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd,
+ MACROBLOCK *const x, const int *const interintra_mode_cost,
+ const BUFFER_SET *orig_dst, uint8_t *intrapred, const uint8_t *tmp_buf,
+ INTERINTRA_MODE *best_interintra_mode, int64_t *best_interintra_rd,
+ INTERINTRA_MODE interintra_mode, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ int rate;
+ uint8_t skip_txfm_sb;
+ int64_t dist, skip_sse_sb;
+ const int bw = block_size_wide[bsize];
+ mbmi->interintra_mode = interintra_mode;
+ int rmode = interintra_mode_cost[interintra_mode];
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](cpi, bsize, x, xd, 0, 0, &rate, &dist,
+ &skip_txfm_sb, &skip_sse_sb, NULL,
+ NULL, NULL);
+ int64_t rd = RDCOST(x->rdmult, rate + rmode, dist);
+ if (rd < *best_interintra_rd) {
+ *best_interintra_rd = rd;
+ *best_interintra_mode = mbmi->interintra_mode;
+ }
+}
+
+static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
+ MACROBLOCK *x, int64_t ref_best_rd,
+ RD_STATS *rd_stats) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ if (ref_best_rd < 0) return INT64_MAX;
+ av1_subtract_plane(x, bs, 0);
+ const int64_t rd = av1_estimate_txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs,
+ max_txsize_rect_lookup[bs]);
+ if (rd != INT64_MAX) {
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ if (rd_stats->skip_txfm) {
+ const int s1 = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+ rd_stats->rate = s1;
+ } else {
+ const int s0 = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+ rd_stats->rate += s0;
+ }
+ }
+ return rd;
+}
+
+// Computes the rd_threshold for smooth interintra rd search.
+static AOM_INLINE int64_t compute_rd_thresh(MACROBLOCK *const x,
+ int total_mode_rate,
+ int64_t ref_best_rd) {
+ const int64_t rd_thresh = get_rd_thresh_from_best_rd(
+ ref_best_rd, (1 << INTER_INTRA_RD_THRESH_SHIFT),
+ INTER_INTRA_RD_THRESH_SCALE);
+ const int64_t mode_rd = RDCOST(x->rdmult, total_mode_rate, 0);
+ return (rd_thresh - mode_rd);
+}
+
+// Computes the best wedge interintra mode
+static AOM_INLINE int64_t compute_best_wedge_interintra(
+ const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd,
+ MACROBLOCK *const x, const int *const interintra_mode_cost,
+ const BUFFER_SET *orig_dst, uint8_t *intrapred_, uint8_t *tmp_buf_,
+ int *best_mode, int *best_wedge_index, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bw = block_size_wide[bsize];
+ int64_t best_interintra_rd_wedge = INT64_MAX;
+ int64_t best_total_rd = INT64_MAX;
+ uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
+ for (INTERINTRA_MODE mode = 0; mode < INTERINTRA_MODES; ++mode) {
+ mbmi->interintra_mode = mode;
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ int64_t rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+ const int rate_overhead =
+ interintra_mode_cost[mode] +
+ x->mode_costs.wedge_idx_cost[bsize][mbmi->interintra_wedge_index];
+ const int64_t total_rd = rd + RDCOST(x->rdmult, rate_overhead, 0);
+ if (total_rd < best_total_rd) {
+ best_total_rd = total_rd;
+ best_interintra_rd_wedge = rd;
+ *best_mode = mbmi->interintra_mode;
+ *best_wedge_index = mbmi->interintra_wedge_index;
+ }
+ }
+ return best_interintra_rd_wedge;
+}
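+
+// Note: the value returned above is the raw wedge rd of the winning mode,
+// without the mode/wedge signaling overhead; the overhead is folded into
+// total_rd only to select the winner.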
+
+static int handle_smooth_inter_intra_mode(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+ MB_MODE_INFO *mbmi, int64_t ref_best_rd, int *rate_mv,
+ INTERINTRA_MODE *best_interintra_mode, int64_t *best_rd,
+ int *best_mode_rate, const BUFFER_SET *orig_dst, uint8_t *tmp_buf,
+ uint8_t *intrapred, HandleInterModeArgs *args) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *const interintra_mode_cost =
+ mode_costs->interintra_mode_cost[size_group_lookup[bsize]];
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bw = block_size_wide[bsize];
+
+ mbmi->use_wedge_interintra = 0;
+
+ if (cpi->sf.inter_sf.reuse_inter_intra_mode == 0 ||
+ *best_interintra_mode == INTERINTRA_MODES) {
+ int64_t best_interintra_rd = INT64_MAX;
+ for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES;
+ ++cur_mode) {
+ if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra ||
+ cpi->sf.intra_sf.disable_smooth_intra) &&
+ cur_mode == II_SMOOTH_PRED)
+ continue;
+ compute_best_interintra_mode(
+ cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred, tmp_buf,
+ best_interintra_mode, &best_interintra_rd, cur_mode, bsize);
+ }
+ args->inter_intra_mode[mbmi->ref_frame[0]] = *best_interintra_mode;
+ }
+ assert(IMPLIES(!cpi->oxcf.comp_type_cfg.enable_smooth_interintra,
+ *best_interintra_mode != II_SMOOTH_PRED));
+ // Recompute prediction if required
+ bool interintra_mode_reuse = cpi->sf.inter_sf.reuse_inter_intra_mode ||
+ *best_interintra_mode != INTERINTRA_MODES;
+ if (interintra_mode_reuse || *best_interintra_mode != INTERINTRA_MODES - 1) {
+ mbmi->interintra_mode = *best_interintra_mode;
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+
+ // Compute rd cost for best smooth_interintra
+ RD_STATS rd_stats;
+ const int is_wedge_used = av1_is_wedge_used(bsize);
+ const int rmode =
+ interintra_mode_cost[*best_interintra_mode] +
+ (is_wedge_used ? mode_costs->wedge_interintra_cost[bsize][0] : 0);
+ const int total_mode_rate = rmode + *rate_mv;
+ const int64_t rd_thresh = compute_rd_thresh(x, total_mode_rate, ref_best_rd);
+ int64_t rd = estimate_yrd_for_sb(cpi, bsize, x, rd_thresh, &rd_stats);
+ if (rd != INT64_MAX) {
+ rd = RDCOST(x->rdmult, total_mode_rate + rd_stats.rate, rd_stats.dist);
+ } else {
+ return IGNORE_MODE;
+ }
+ *best_rd = rd;
+ *best_mode_rate = rmode;
+  // Return early if the best rd is not good enough: the mode is kept only if
+  // best_rd, scaled up by INTER_INTRA_RD_THRESH_SCALE /
+  // 2^INTER_INTRA_RD_THRESH_SHIFT, still fits within ref_best_rd.
+ if (ref_best_rd < INT64_MAX &&
+ (*best_rd >> INTER_INTRA_RD_THRESH_SHIFT) * INTER_INTRA_RD_THRESH_SCALE >
+ ref_best_rd) {
+ return IGNORE_MODE;
+ }
+ return 0;
+}
+
+static int handle_wedge_inter_intra_mode(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+ MB_MODE_INFO *mbmi, int *rate_mv, INTERINTRA_MODE *best_interintra_mode,
+ int64_t *best_rd, const BUFFER_SET *orig_dst, uint8_t *tmp_buf_,
+ uint8_t *tmp_buf, uint8_t *intrapred_, uint8_t *intrapred,
+ HandleInterModeArgs *args, int *tmp_rate_mv, int *rate_overhead,
+ int_mv *tmp_mv, int64_t best_rd_no_wedge) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *const interintra_mode_cost =
+ mode_costs->interintra_mode_cost[size_group_lookup[bsize]];
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bw = block_size_wide[bsize];
+ const int try_smooth_interintra =
+ cpi->oxcf.comp_type_cfg.enable_smooth_interintra;
+
+ mbmi->use_wedge_interintra = 1;
+
+ if (!cpi->sf.inter_sf.fast_interintra_wedge_search) {
+ // Exhaustive search of all wedge and mode combinations.
+ int best_mode = 0;
+ int best_wedge_index = 0;
+ *best_rd = compute_best_wedge_interintra(
+ cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred_, tmp_buf_,
+ &best_mode, &best_wedge_index, bsize);
+ mbmi->interintra_mode = best_mode;
+ mbmi->interintra_wedge_index = best_wedge_index;
+ if (best_mode != INTERINTRA_MODES - 1) {
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ }
+ } else if (!try_smooth_interintra) {
+ if (*best_interintra_mode == INTERINTRA_MODES) {
+ mbmi->interintra_mode = INTERINTRA_MODES - 1;
+ *best_interintra_mode = INTERINTRA_MODES - 1;
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ // Pick wedge mask based on INTERINTRA_MODES - 1
+ *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+ // Find the best interintra mode for the chosen wedge mask
+ for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES;
+ ++cur_mode) {
+ compute_best_interintra_mode(
+ cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred,
+ tmp_buf, best_interintra_mode, best_rd, cur_mode, bsize);
+ }
+ args->inter_intra_mode[mbmi->ref_frame[0]] = *best_interintra_mode;
+ mbmi->interintra_mode = *best_interintra_mode;
+
+ // Recompute prediction if required
+ if (*best_interintra_mode != INTERINTRA_MODES - 1) {
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ }
+ } else {
+ // Pick wedge mask for the best interintra mode (reused)
+ mbmi->interintra_mode = *best_interintra_mode;
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+ }
+ } else {
+ // Pick wedge mask for the best interintra mode from smooth_interintra
+ *best_rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+ }
+
+ *rate_overhead =
+ interintra_mode_cost[mbmi->interintra_mode] +
+ mode_costs->wedge_idx_cost[bsize][mbmi->interintra_wedge_index] +
+ mode_costs->wedge_interintra_cost[bsize][1];
+ *best_rd += RDCOST(x->rdmult, *rate_overhead + *rate_mv, 0);
+
+ int64_t rd = INT64_MAX;
+ const int_mv mv0 = mbmi->mv[0];
+ // Refine motion vector for NEWMV case.
+ if (have_newmv_in_inter_mode(mbmi->mode)) {
+ int rate_sum;
+ uint8_t skip_txfm_sb;
+ int64_t dist_sum, skip_sse_sb;
+    // Get the negated wedge mask.
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(mbmi->interintra_wedge_index, 1, bsize);
+ av1_compound_single_motion_search(cpi, x, bsize, &tmp_mv->as_mv, intrapred,
+ mask, bw, tmp_rate_mv, 0);
+ if (mbmi->mv[0].as_int != tmp_mv->as_int) {
+ mbmi->mv[0].as_int = tmp_mv->as_int;
+ // Set ref_frame[1] to NONE_FRAME temporarily so that the intra
+ // predictor is not calculated again in av1_enc_build_inter_predictor().
+ mbmi->ref_frame[1] = NONE_FRAME;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ mbmi->ref_frame[1] = INTRA_FRAME;
+ av1_combine_interintra(xd, bsize, 0, xd->plane[AOM_PLANE_Y].dst.buf,
+ xd->plane[AOM_PLANE_Y].dst.stride, intrapred, bw);
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &skip_txfm_sb,
+ &skip_sse_sb, NULL, NULL, NULL);
+ rd =
+ RDCOST(x->rdmult, *tmp_rate_mv + *rate_overhead + rate_sum, dist_sum);
+ }
+ }
+ if (rd >= *best_rd) {
+ tmp_mv->as_int = mv0.as_int;
+ *tmp_rate_mv = *rate_mv;
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+ // Evaluate closer to true rd
+ RD_STATS rd_stats;
+ const int64_t mode_rd = RDCOST(x->rdmult, *rate_overhead + *tmp_rate_mv, 0);
+ const int64_t tmp_rd_thresh = best_rd_no_wedge - mode_rd;
+ rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats);
+ if (rd != INT64_MAX) {
+ rd = RDCOST(x->rdmult, *rate_overhead + *tmp_rate_mv + rd_stats.rate,
+ rd_stats.dist);
+ } else {
+ if (*best_rd == INT64_MAX) return IGNORE_MODE;
+ }
+ *best_rd = rd;
+ return 0;
+}
+
+int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
+ HandleInterModeArgs *args, int64_t ref_best_rd,
+ int *rate_mv, int *tmp_rate2,
+ const BUFFER_SET *orig_dst) {
+ const int try_smooth_interintra =
+ cpi->oxcf.comp_type_cfg.enable_smooth_interintra;
+
+ const int is_wedge_used = av1_is_wedge_used(bsize);
+ const int try_wedge_interintra =
+ is_wedge_used && enable_wedge_interintra_search(x, cpi);
+
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int bw = block_size_wide[bsize];
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
+ uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
+ uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // Single reference inter prediction
+ mbmi->ref_frame[1] = NONE_FRAME;
+ xd->plane[0].dst.buf = tmp_buf;
+ xd->plane[0].dst.stride = bw;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ const int num_planes = av1_num_planes(cm);
+
+ // Restore the buffers for intra prediction
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ mbmi->ref_frame[1] = INTRA_FRAME;
+ INTERINTRA_MODE best_interintra_mode =
+ args->inter_intra_mode[mbmi->ref_frame[0]];
+
+ // Compute smooth_interintra
+ int64_t best_interintra_rd_nowedge = INT64_MAX;
+ int best_mode_rate = INT_MAX;
+ if (try_smooth_interintra) {
+ int ret = handle_smooth_inter_intra_mode(
+ cpi, x, bsize, mbmi, ref_best_rd, rate_mv, &best_interintra_mode,
+ &best_interintra_rd_nowedge, &best_mode_rate, orig_dst, tmp_buf,
+ intrapred, args);
+ if (ret == IGNORE_MODE) {
+ return IGNORE_MODE;
+ }
+ }
+
+ // Compute wedge interintra
+ int64_t best_interintra_rd_wedge = INT64_MAX;
+ const int_mv mv0 = mbmi->mv[0];
+ int_mv tmp_mv = mv0;
+ int tmp_rate_mv = 0;
+ int rate_overhead = 0;
+ if (try_wedge_interintra) {
+ int ret = handle_wedge_inter_intra_mode(
+ cpi, x, bsize, mbmi, rate_mv, &best_interintra_mode,
+ &best_interintra_rd_wedge, orig_dst, tmp_buf_, tmp_buf, intrapred_,
+ intrapred, args, &tmp_rate_mv, &rate_overhead, &tmp_mv,
+ best_interintra_rd_nowedge);
+ if (ret == IGNORE_MODE) {
+ return IGNORE_MODE;
+ }
+ }
+
+ if (best_interintra_rd_nowedge == INT64_MAX &&
+ best_interintra_rd_wedge == INT64_MAX) {
+ return IGNORE_MODE;
+ }
+ if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ *tmp_rate2 += tmp_rate_mv - *rate_mv;
+ *rate_mv = tmp_rate_mv;
+ best_mode_rate = rate_overhead;
+ } else if (try_smooth_interintra && try_wedge_interintra) {
+    // If smooth was best but its values were overwritten while evaluating
+    // the wedge mode, recompute the smooth prediction.
+ mbmi->use_wedge_interintra = 0;
+ mbmi->interintra_mode = best_interintra_mode;
+ mbmi->mv[0].as_int = mv0.as_int;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ }
+ *tmp_rate2 += best_mode_rate;
+
+ if (num_planes > 1) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_U, num_planes - 1);
+ }
+ return 0;
+}
+
+// Computes the valid compound_types to be evaluated
+static INLINE int compute_valid_comp_types(MACROBLOCK *x,
+ const AV1_COMP *const cpi,
+ BLOCK_SIZE bsize,
+ int masked_compound_used,
+ int mode_search_mask,
+ COMPOUND_TYPE *valid_comp_types) {
+ const AV1_COMMON *cm = &cpi->common;
+ int valid_type_count = 0;
+ int comp_type, valid_check;
+ int8_t enable_masked_type[MASKED_COMPOUND_TYPES] = { 0, 0 };
+
+ const int try_average_comp = (mode_search_mask & (1 << COMPOUND_AVERAGE));
+ const int try_distwtd_comp =
+ ((mode_search_mask & (1 << COMPOUND_DISTWTD)) &&
+ cm->seq_params->order_hint_info.enable_dist_wtd_comp == 1 &&
+ cpi->sf.inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED);
+
+ // Check if COMPOUND_AVERAGE and COMPOUND_DISTWTD are valid cases
+ for (comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD;
+ comp_type++) {
+ valid_check =
+ (comp_type == COMPOUND_AVERAGE) ? try_average_comp : try_distwtd_comp;
+ if (valid_check && is_interinter_compound_used(comp_type, bsize))
+ valid_comp_types[valid_type_count++] = comp_type;
+ }
+ // Check if COMPOUND_WEDGE and COMPOUND_DIFFWTD are valid cases
+ if (masked_compound_used) {
+ // enable_masked_type[0] corresponds to COMPOUND_WEDGE
+ // enable_masked_type[1] corresponds to COMPOUND_DIFFWTD
+ enable_masked_type[0] = enable_wedge_interinter_search(x, cpi);
+ enable_masked_type[1] = cpi->oxcf.comp_type_cfg.enable_diff_wtd_comp;
+ for (comp_type = COMPOUND_WEDGE; comp_type <= COMPOUND_DIFFWTD;
+ comp_type++) {
+ if ((mode_search_mask & (1 << comp_type)) &&
+ is_interinter_compound_used(comp_type, bsize) &&
+ enable_masked_type[comp_type - COMPOUND_WEDGE])
+ valid_comp_types[valid_type_count++] = comp_type;
+ }
+ }
+ return valid_type_count;
+}
+
+// Calculates the mode signaling cost for each compound type.
+static INLINE void calc_masked_type_cost(
+ const ModeCosts *mode_costs, BLOCK_SIZE bsize, int comp_group_idx_ctx,
+ int comp_index_ctx, int masked_compound_used, int *masked_type_cost) {
+ av1_zero_array(masked_type_cost, COMPOUND_TYPES);
+ // Account for group index cost when wedge and/or diffwtd prediction are
+ // enabled
+ if (masked_compound_used) {
+ // Compound group index of average and distwtd is 0
+ // Compound group index of wedge and diffwtd is 1
+ masked_type_cost[COMPOUND_AVERAGE] +=
+ mode_costs->comp_group_idx_cost[comp_group_idx_ctx][0];
+ masked_type_cost[COMPOUND_DISTWTD] += masked_type_cost[COMPOUND_AVERAGE];
+ masked_type_cost[COMPOUND_WEDGE] +=
+ mode_costs->comp_group_idx_cost[comp_group_idx_ctx][1];
+ masked_type_cost[COMPOUND_DIFFWTD] += masked_type_cost[COMPOUND_WEDGE];
+ }
+
+ // Compute the cost to signal compound index/type
+ masked_type_cost[COMPOUND_AVERAGE] +=
+ mode_costs->comp_idx_cost[comp_index_ctx][1];
+ masked_type_cost[COMPOUND_DISTWTD] +=
+ mode_costs->comp_idx_cost[comp_index_ctx][0];
+ masked_type_cost[COMPOUND_WEDGE] += mode_costs->compound_type_cost[bsize][0];
+ masked_type_cost[COMPOUND_DIFFWTD] +=
+ mode_costs->compound_type_cost[bsize][1];
+}
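+
+// For reference, with masked compounds enabled the sums above reduce to:
+//   AVERAGE: comp_group_idx_cost[ctx][0] + comp_idx_cost[ctx][1]
+//   DISTWTD: comp_group_idx_cost[ctx][0] + comp_idx_cost[ctx][0]
+//   WEDGE:   comp_group_idx_cost[ctx][1] + compound_type_cost[bsize][0]
+//   DIFFWTD: comp_group_idx_cost[ctx][1] + compound_type_cost[bsize][1]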
+
+// Updates mbmi structure with the relevant compound type info
+static INLINE void update_mbmi_for_compound_type(MB_MODE_INFO *mbmi,
+ COMPOUND_TYPE cur_type) {
+ mbmi->interinter_comp.type = cur_type;
+ mbmi->comp_group_idx = (cur_type >= COMPOUND_WEDGE);
+ mbmi->compound_idx = (cur_type != COMPOUND_DISTWTD);
+}
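+
+// The assignments above imply: AVERAGE -> (comp_group_idx 0, compound_idx 1),
+// DISTWTD -> (0, 0), and WEDGE/DIFFWTD -> (1, 1), following the enum order
+// COMPOUND_AVERAGE < COMPOUND_DISTWTD < COMPOUND_WEDGE < COMPOUND_DIFFWTD.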
+
+// When a match is found, populates the compound type data, calculates the
+// rd cost using the stored stats, and updates the mbmi appropriately.
+static INLINE int populate_reuse_comp_type_data(
+ const MACROBLOCK *x, MB_MODE_INFO *mbmi,
+ BEST_COMP_TYPE_STATS *best_type_stats, int_mv *cur_mv, int32_t *comp_rate,
+ int64_t *comp_dist, int *comp_rs2, int *rate_mv, int64_t *rd,
+ int match_index) {
+ const int winner_comp_type =
+ x->comp_rd_stats[match_index].interinter_comp.type;
+ if (comp_rate[winner_comp_type] == INT_MAX)
+ return best_type_stats->best_compmode_interinter_cost;
+ update_mbmi_for_compound_type(mbmi, winner_comp_type);
+ mbmi->interinter_comp = x->comp_rd_stats[match_index].interinter_comp;
+ *rd = RDCOST(
+ x->rdmult,
+ comp_rs2[winner_comp_type] + *rate_mv + comp_rate[winner_comp_type],
+ comp_dist[winner_comp_type]);
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ return comp_rs2[winner_comp_type];
+}
+
+// Updates rd cost and relevant compound type data for the best compound type
+static INLINE void update_best_info(const MB_MODE_INFO *const mbmi, int64_t *rd,
+ BEST_COMP_TYPE_STATS *best_type_stats,
+ int64_t best_rd_cur,
+ int64_t comp_model_rd_cur, int rs2) {
+ *rd = best_rd_cur;
+ best_type_stats->comp_best_model_rd = comp_model_rd_cur;
+ best_type_stats->best_compound_data = mbmi->interinter_comp;
+ best_type_stats->best_compmode_interinter_cost = rs2;
+}
+
+// Updates best_mv for masked compound types
+static INLINE void update_mask_best_mv(const MB_MODE_INFO *const mbmi,
+ int_mv *best_mv, int *best_tmp_rate_mv,
+ int tmp_rate_mv) {
+ *best_tmp_rate_mv = tmp_rate_mv;
+ best_mv[0].as_int = mbmi->mv[0].as_int;
+ best_mv[1].as_int = mbmi->mv[1].as_int;
+}
+
+static INLINE void save_comp_rd_search_stat(
+ MACROBLOCK *x, const MB_MODE_INFO *const mbmi, const int32_t *comp_rate,
+ const int64_t *comp_dist, const int32_t *comp_model_rate,
+ const int64_t *comp_model_dist, const int_mv *cur_mv, const int *comp_rs2) {
+ const int offset = x->comp_rd_stats_idx;
+ if (offset < MAX_COMP_RD_STATS) {
+ COMP_RD_STATS *const rd_stats = x->comp_rd_stats + offset;
+ memcpy(rd_stats->rate, comp_rate, sizeof(rd_stats->rate));
+ memcpy(rd_stats->dist, comp_dist, sizeof(rd_stats->dist));
+ memcpy(rd_stats->model_rate, comp_model_rate, sizeof(rd_stats->model_rate));
+ memcpy(rd_stats->model_dist, comp_model_dist, sizeof(rd_stats->model_dist));
+ memcpy(rd_stats->comp_rs2, comp_rs2, sizeof(rd_stats->comp_rs2));
+ memcpy(rd_stats->mv, cur_mv, sizeof(rd_stats->mv));
+ memcpy(rd_stats->ref_frames, mbmi->ref_frame, sizeof(rd_stats->ref_frames));
+ rd_stats->mode = mbmi->mode;
+ rd_stats->filter = mbmi->interp_filters;
+ rd_stats->ref_mv_idx = mbmi->ref_mv_idx;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ for (int i = 0; i < 2; ++i) {
+ const WarpedMotionParams *const wm =
+ &xd->global_motion[mbmi->ref_frame[i]];
+ rd_stats->is_global[i] = is_global_mv_block(mbmi, wm->wmtype);
+ }
+ memcpy(&rd_stats->interinter_comp, &mbmi->interinter_comp,
+ sizeof(rd_stats->interinter_comp));
+ ++x->comp_rd_stats_idx;
+ }
+}
+
+static INLINE int get_interinter_compound_mask_rate(
+ const ModeCosts *const mode_costs, const MB_MODE_INFO *const mbmi) {
+ const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
+ // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD
+ if (compound_type == COMPOUND_WEDGE) {
+ return av1_is_wedge_used(mbmi->bsize)
+ ? av1_cost_literal(1) +
+ mode_costs
+ ->wedge_idx_cost[mbmi->bsize]
+ [mbmi->interinter_comp.wedge_index]
+ : 0;
+ } else {
+ assert(compound_type == COMPOUND_DIFFWTD);
+ return av1_cost_literal(1);
+ }
+}
+
+// Backs up the rate, distortion and model rd values for future reuse.
+static INLINE void backup_stats(COMPOUND_TYPE cur_type, int32_t *comp_rate,
+ int64_t *comp_dist, int32_t *comp_model_rate,
+ int64_t *comp_model_dist, int rate_sum,
+ int64_t dist_sum, RD_STATS *rd_stats,
+ int *comp_rs2, int rs2) {
+ comp_rate[cur_type] = rd_stats->rate;
+ comp_dist[cur_type] = rd_stats->dist;
+ comp_model_rate[cur_type] = rate_sum;
+ comp_model_dist[cur_type] = dist_sum;
+ comp_rs2[cur_type] = rs2;
+}
+
+static INLINE int save_mask_search_results(const PREDICTION_MODE this_mode,
+                                           const int reuse_level) {
+  return reuse_level || (this_mode == NEW_NEWMV);
+}
+
+static INLINE int prune_mode_by_skip_rd(const AV1_COMP *const cpi,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ const BLOCK_SIZE bsize,
+ int64_t ref_skip_rd, int mode_rate) {
+ int eval_txfm = 1;
+ const int txfm_rd_gate_level =
+ get_txfm_rd_gate_level(cpi->common.seq_params->enable_masked_compound,
+ cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
+ TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0);
+ // Check if the mode is good enough based on skip rd
+ if (txfm_rd_gate_level) {
+ int64_t sse_y = compute_sse_plane(x, xd, PLANE_TYPE_Y, bsize);
+ int64_t skip_rd = RDCOST(x->rdmult, mode_rate, (sse_y << 4));
+ eval_txfm =
+ check_txfm_eval(x, bsize, ref_skip_rd, skip_rd, txfm_rd_gate_level, 1);
+ }
+ return eval_txfm;
+}
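+
+// Note: the (sse_y << 4) term above scales the pixel-domain sse by 16, which
+// is presumably the same distortion scale at which ref_skip_rd was formed, so
+// the two skip rds are directly comparable.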
+
+static int64_t masked_compound_type_rd(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
+ const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2,
+ int rate_mv, const BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0,
+ uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides,
+ int mode_rate, int64_t rd_thresh, int *calc_pred_masked_compound,
+ int32_t *comp_rate, int64_t *comp_dist, int32_t *comp_model_rate,
+ int64_t *comp_model_dist, const int64_t comp_best_model_rd,
+ int64_t *const comp_model_rd_cur, int *comp_rs2, int64_t ref_skip_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int64_t best_rd_cur = INT64_MAX;
+ int64_t rd = INT64_MAX;
+ const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
+ // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD
+ assert(compound_type == COMPOUND_WEDGE || compound_type == COMPOUND_DIFFWTD);
+ int rate_sum;
+ uint8_t tmp_skip_txfm_sb;
+ int64_t dist_sum, tmp_skip_sse_sb;
+ pick_interinter_mask_type pick_interinter_mask[2] = { pick_interinter_wedge,
+ pick_interinter_seg };
+
+ // TODO(any): Save pred and mask calculation as well into records. However
+ // this may increase memory requirements as compound segment mask needs to be
+ // stored in each record.
+ if (*calc_pred_masked_compound) {
+ get_inter_predictors_masked_compound(x, bsize, preds0, preds1, residual1,
+ diff10, strides);
+ *calc_pred_masked_compound = 0;
+ }
+ if (compound_type == COMPOUND_WEDGE) {
+ unsigned int sse;
+ if (is_cur_buf_hbd(xd))
+ (void)cpi->ppi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides,
+ CONVERT_TO_BYTEPTR(*preds1), *strides,
+ &sse);
+ else
+ (void)cpi->ppi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides,
+ &sse);
+ const unsigned int mse =
+ ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]);
+ // If two predictors are very similar, skip wedge compound mode search
+ if (mse < 8 || (!have_newmv_in_inter_mode(this_mode) && mse < 64)) {
+ *comp_model_rd_cur = INT64_MAX;
+ return INT64_MAX;
+ }
+ }
+ // Function pointer to pick the appropriate mask
+ // compound_type == COMPOUND_WEDGE, calls pick_interinter_wedge()
+ // compound_type == COMPOUND_DIFFWTD, calls pick_interinter_seg()
+ uint64_t cur_sse = UINT64_MAX;
+ best_rd_cur = pick_interinter_mask[compound_type - COMPOUND_WEDGE](
+ cpi, x, bsize, *preds0, *preds1, residual1, diff10, &cur_sse);
+ *rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+ best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
+ assert(cur_sse != UINT64_MAX);
+ int64_t skip_rd_cur = RDCOST(x->rdmult, *rs2 + rate_mv, (cur_sse << 4));
+
+  // Although the true rate_mv might differ after motion search, this mode is
+  // unlikely to be the best considering the transform rd cost and other mode
+  // overhead costs.
+ int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0);
+ if (mode_rd > rd_thresh) {
+ *comp_model_rd_cur = INT64_MAX;
+ return INT64_MAX;
+ }
+
+ // Check if the mode is good enough based on skip rd
+ // TODO(nithya): Handle wedge_newmv_search if extending for lower speed
+ // setting
+ const int txfm_rd_gate_level =
+ get_txfm_rd_gate_level(cm->seq_params->enable_masked_compound,
+ cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
+ TX_SEARCH_COMP_TYPE_MODE, /*eval_motion_mode=*/0);
+ if (txfm_rd_gate_level) {
+ int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd_cur,
+ txfm_rd_gate_level, 1);
+ if (!eval_txfm) {
+ *comp_model_rd_cur = INT64_MAX;
+ return INT64_MAX;
+ }
+ }
+
+  // Compute the cost if no matching record is found; otherwise, reuse data.
+ if (comp_rate[compound_type] == INT_MAX) {
+ // Check whether new MV search for wedge is to be done
+ int wedge_newmv_search =
+ have_newmv_in_inter_mode(this_mode) &&
+ (compound_type == COMPOUND_WEDGE) &&
+ (!cpi->sf.inter_sf.disable_interinter_wedge_newmv_search);
+
+ // Search for new MV if needed and build predictor
+ if (wedge_newmv_search) {
+ *out_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+ bsize, this_mode);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, ctx, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ } else {
+ *out_rate_mv = rate_mv;
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
+ preds1, strides);
+ }
+ // Get the RD cost from model RD
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb,
+ &tmp_skip_sse_sb, NULL, NULL, NULL);
+ rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
+ *comp_model_rd_cur = rd;
+    // Revert to the pre-search MV if the refined MV gives a worse rd cost.
+ if (wedge_newmv_search) {
+ if (rd >= best_rd_cur) {
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ *out_rate_mv = rate_mv;
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+ strides, preds1, strides);
+ *comp_model_rd_cur = best_rd_cur;
+ }
+ }
+ if (cpi->sf.inter_sf.prune_comp_type_by_model_rd &&
+ (*comp_model_rd_cur > comp_best_model_rd) &&
+ comp_best_model_rd != INT64_MAX) {
+ *comp_model_rd_cur = INT64_MAX;
+ return INT64_MAX;
+ }
+ // Compute RD cost for the current type
+ RD_STATS rd_stats;
+ const int64_t tmp_mode_rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv, 0);
+ const int64_t tmp_rd_thresh = rd_thresh - tmp_mode_rd;
+ rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats);
+ if (rd != INT64_MAX) {
+ rd =
+ RDCOST(x->rdmult, *rs2 + *out_rate_mv + rd_stats.rate, rd_stats.dist);
+ // Backup rate and distortion for future reuse
+ backup_stats(compound_type, comp_rate, comp_dist, comp_model_rate,
+ comp_model_dist, rate_sum, dist_sum, &rd_stats, comp_rs2,
+ *rs2);
+ }
+ } else {
+ // Reuse data as matching record is found
+ assert(comp_dist[compound_type] != INT64_MAX);
+    // When disable_interinter_wedge_newmv_search is set, motion refinement
+    // is disabled, so the stored rate and distortion can be reused here as
+    // well.
+ assert(IMPLIES((have_newmv_in_inter_mode(this_mode) &&
+ (compound_type == COMPOUND_WEDGE)),
+ cpi->sf.inter_sf.disable_interinter_wedge_newmv_search));
+ assert(mbmi->mv[0].as_int == cur_mv[0].as_int);
+ assert(mbmi->mv[1].as_int == cur_mv[1].as_int);
+ *out_rate_mv = rate_mv;
+ // Calculate RD cost based on stored stats
+ rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_rate[compound_type],
+ comp_dist[compound_type]);
+ // Recalculate model rdcost with the updated rate
+ *comp_model_rd_cur =
+ RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_model_rate[compound_type],
+ comp_model_dist[compound_type]);
+ }
+ return rd;
+}
+
+// Scaling factors used to gate the wedge/compound segment search based on
+// the best approximate rd so far, indexed by prune_comp_type_by_comp_avg.
+static int comp_type_rd_threshold_mul[3] = { 1, 11, 12 };
+static int comp_type_rd_threshold_div[3] = { 3, 16, 16 };
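+// For example, at prune_comp_type_by_comp_avg level 1 the masked types are
+// evaluated only when (*rd / 16) * 11 < ref_best_rd; level 0 applies the
+// looser gate (*rd / 3) * 1, and level 2 tightens it to (*rd / 16) * 12.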
+
+int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ HandleInterModeArgs *args, BLOCK_SIZE bsize,
+ int_mv *cur_mv, int mode_search_mask,
+ int masked_compound_used, const BUFFER_SET *orig_dst,
+ const BUFFER_SET *tmp_dst,
+ const CompoundTypeRdBuffers *buffers, int *rate_mv,
+ int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd,
+ int64_t ref_skip_rd, int *is_luma_interp_done,
+ int64_t rd_thresh) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ int ref_frame = av1_ref_frame_type(mbmi->ref_frame);
+ const int bw = block_size_wide[bsize];
+ int rs2;
+ int_mv best_mv[2];
+ int best_tmp_rate_mv = *rate_mv;
+ BEST_COMP_TYPE_STATS best_type_stats;
+ // Initializing BEST_COMP_TYPE_STATS
+ best_type_stats.best_compound_data.type = COMPOUND_AVERAGE;
+ best_type_stats.best_compmode_interinter_cost = 0;
+ best_type_stats.comp_best_model_rd = INT64_MAX;
+
+ uint8_t *preds0[1] = { buffers->pred0 };
+ uint8_t *preds1[1] = { buffers->pred1 };
+ int strides[1] = { bw };
+ int tmp_rate_mv;
+ COMPOUND_TYPE cur_type;
+ // Local array to store the mask cost for different compound types
+ int masked_type_cost[COMPOUND_TYPES];
+
+ int calc_pred_masked_compound = 1;
+ int64_t comp_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+ INT64_MAX };
+ int32_t comp_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+ int comp_rs2[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+ int32_t comp_model_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX,
+ INT_MAX };
+ int64_t comp_model_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+ INT64_MAX };
+ int match_index = 0;
+ const int match_found =
+ find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist, comp_model_rate,
+ comp_model_dist, comp_rs2, &match_index);
+ best_mv[0].as_int = cur_mv[0].as_int;
+ best_mv[1].as_int = cur_mv[1].as_int;
+ *rd = INT64_MAX;
+
+ // Local array to store the valid compound types to be evaluated in the core
+ // loop
+ COMPOUND_TYPE valid_comp_types[COMPOUND_TYPES] = {
+ COMPOUND_AVERAGE, COMPOUND_DISTWTD, COMPOUND_WEDGE, COMPOUND_DIFFWTD
+ };
+ int valid_type_count = 0;
+  // compute_valid_comp_types() returns the number of valid compound types
+  // to be evaluated and populates them in the local array valid_comp_types[].
+ valid_type_count = compute_valid_comp_types(
+ x, cpi, bsize, masked_compound_used, mode_search_mask, valid_comp_types);
+
+ // The following context indices are independent of compound type
+ const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+
+ // Populates masked_type_cost local array for the 4 compound types
+ calc_masked_type_cost(&x->mode_costs, bsize, comp_group_idx_ctx,
+ comp_index_ctx, masked_compound_used, masked_type_cost);
+
+ int64_t comp_model_rd_cur = INT64_MAX;
+ int64_t best_rd_cur = ref_best_rd;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // If the match is found, calculate the rd cost using the
+ // stored stats and update the mbmi appropriately.
+ if (match_found && cpi->sf.inter_sf.reuse_compound_type_decision) {
+ return populate_reuse_comp_type_data(x, mbmi, &best_type_stats, cur_mv,
+ comp_rate, comp_dist, comp_rs2,
+ rate_mv, rd, match_index);
+ }
+
+ // If COMPOUND_AVERAGE is not valid, use the spare buffer
+ if (valid_comp_types[0] != COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
+
+ // Loop over valid compound types
+ for (int i = 0; i < valid_type_count; i++) {
+ cur_type = valid_comp_types[i];
+
+ if (args->cmp_mode[ref_frame] == COMPOUND_AVERAGE) {
+ if (cur_type == COMPOUND_WEDGE) continue;
+ }
+
+ comp_model_rd_cur = INT64_MAX;
+ tmp_rate_mv = *rate_mv;
+ best_rd_cur = INT64_MAX;
+ ref_best_rd = AOMMIN(ref_best_rd, *rd);
+ update_mbmi_for_compound_type(mbmi, cur_type);
+ rs2 = masked_type_cost[cur_type];
+
+ int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+ if (mode_rd >= ref_best_rd) continue;
+
+    // Derive the flags that indicate whether the MV refinement process is to
+    // be skipped.
+ const int enable_fast_compound_mode_search =
+ cpi->sf.inter_sf.enable_fast_compound_mode_search;
+ const bool skip_mv_refinement_for_avg_distwtd =
+ enable_fast_compound_mode_search == 3 ||
+ (enable_fast_compound_mode_search == 2 && (this_mode != NEW_NEWMV));
+ const bool skip_mv_refinement_for_diffwtd =
+ (!enable_fast_compound_mode_search && cur_type == COMPOUND_DIFFWTD);
+
+ // Case COMPOUND_AVERAGE and COMPOUND_DISTWTD
+ if (cur_type < COMPOUND_WEDGE) {
+ if (skip_mv_refinement_for_avg_distwtd) {
+ int rate_sum;
+ uint8_t tmp_skip_txfm_sb;
+ int64_t dist_sum, tmp_skip_sse_sb;
+
+        // Compute the cost if no matching record is found, else reuse data.
+ if (comp_rate[cur_type] == INT_MAX) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1;
+ // Compute RD cost for the current type
+ RD_STATS est_rd_stats;
+ const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh) - mode_rd;
+ int64_t est_rd = INT64_MAX;
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ rs2 + *rate_mv);
+ // Evaluate further if skip rd is low enough
+ if (eval_txfm) {
+ est_rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh,
+ &est_rd_stats);
+ }
+ if (est_rd != INT64_MAX) {
+ best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ comp_model_rd_cur =
+ RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
+ // Backup rate and distortion for future reuse
+ backup_stats(cur_type, comp_rate, comp_dist, comp_model_rate,
+ comp_model_dist, rate_sum, dist_sum, &est_rd_stats,
+ comp_rs2, rs2);
+ }
+ } else {
+ // Calculate RD cost based on stored stats
+ assert(comp_dist[cur_type] != INT64_MAX);
+ best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + comp_rate[cur_type],
+ comp_dist[cur_type]);
+ // Recalculate model rdcost with the updated rate
+ comp_model_rd_cur =
+ RDCOST(x->rdmult, rs2 + *rate_mv + comp_model_rate[cur_type],
+ comp_model_dist[cur_type]);
+ }
+ } else {
+ tmp_rate_mv = *rate_mv;
+ if (have_newmv_in_inter_mode(this_mode)) {
+ InterPredParams inter_pred_params;
+ av1_dist_wtd_comp_weight_assign(
+ &cpi->common, mbmi, &inter_pred_params.conv_params.fwd_offset,
+ &inter_pred_params.conv_params.bck_offset,
+ &inter_pred_params.conv_params.use_dist_wtd_comp_avg, 1);
+ int mask_value = inter_pred_params.conv_params.fwd_offset * 4;
+ memset(xd->seg_mask, mask_value,
+ sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE);
+ tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+ bsize, this_mode);
+ }
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1;
+
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ rs2 + *rate_mv);
+ if (eval_txfm) {
+ RD_STATS est_rd_stats;
+ estimate_yrd_for_sb(cpi, bsize, x, INT64_MAX, &est_rd_stats);
+
+ best_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ }
+ }
+
+      // Use the spare buffer for the following compound type evaluations.
+ if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
+ } else if (cur_type == COMPOUND_WEDGE) {
+ int best_mask_index = 0;
+ int best_wedge_sign = 0;
+ int_mv tmp_mv[2] = { mbmi->mv[0], mbmi->mv[1] };
+ int best_rs2 = 0;
+ int best_rate_mv = *rate_mv;
+ int wedge_mask_size = get_wedge_types_lookup(bsize);
+ int need_mask_search = args->wedge_index == -1;
+ int wedge_newmv_search =
+ have_newmv_in_inter_mode(this_mode) &&
+ !cpi->sf.inter_sf.disable_interinter_wedge_newmv_search;
+
+ if (need_mask_search && !wedge_newmv_search) {
+        // Shortcut: build the single-reference predictors once instead of
+        // rebuilding them for every wedge mask.
+ av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0,
+ preds0, strides);
+ av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1,
+ preds1, strides);
+ }
+
+ for (int wedge_mask = 0; wedge_mask < wedge_mask_size && need_mask_search;
+ ++wedge_mask) {
+ for (int wedge_sign = 0; wedge_sign < 2; ++wedge_sign) {
+ tmp_rate_mv = *rate_mv;
+ mbmi->interinter_comp.wedge_index = wedge_mask;
+ mbmi->interinter_comp.wedge_sign = wedge_sign;
+ rs2 = masked_type_cost[cur_type];
+ rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+ mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+ if (mode_rd >= ref_best_rd / 2) continue;
+
+ if (wedge_newmv_search) {
+ tmp_rate_mv = av1_interinter_compound_motion_search(
+ cpi, x, cur_mv, bsize, this_mode);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst,
+ bsize, AOM_PLANE_Y, AOM_PLANE_Y);
+ } else {
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+ strides, preds1, strides);
+ }
+
+ RD_STATS est_rd_stats;
+ int64_t this_rd_cur = INT64_MAX;
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ rs2 + *rate_mv);
+ if (eval_txfm) {
+ this_rd_cur = estimate_yrd_for_sb(
+ cpi, bsize, x, AOMMIN(best_rd_cur, ref_best_rd), &est_rd_stats);
+ }
+ if (this_rd_cur < INT64_MAX) {
+ this_rd_cur =
+ RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ }
+ if (this_rd_cur < best_rd_cur) {
+ best_mask_index = wedge_mask;
+ best_wedge_sign = wedge_sign;
+ best_rd_cur = this_rd_cur;
+ tmp_mv[0] = mbmi->mv[0];
+ tmp_mv[1] = mbmi->mv[1];
+ best_rate_mv = tmp_rate_mv;
+ best_rs2 = rs2;
+ }
+ }
+        // Consider the asymmetric partitions for an oblique angle only if
+        // the corresponding symmetric partition is the best so far.
+        // Note: For horizontal and vertical types, both symmetric and
+        // asymmetric partitions are always considered.
+ if (cpi->sf.inter_sf.enable_fast_wedge_mask_search) {
+ // The first 4 entries in wedge_codebook_16_heqw/hltw/hgtw[16]
+ // correspond to symmetric partitions of the 4 oblique angles, the
+ // next 4 entries correspond to the vertical/horizontal
+ // symmetric/asymmetric partitions and the last 8 entries correspond
+ // to the asymmetric partitions of oblique types.
+ const int idx_before_asym_oblique = 7;
+ const int last_oblique_sym_idx = 3;
+ if (wedge_mask == idx_before_asym_oblique) {
+ if (best_mask_index > last_oblique_sym_idx) {
+ break;
+ } else {
+            // Map the best symmetric oblique mask to one less than its first
+            // asymmetric mask index; the loop increment then lands on the
+            // asymmetric pair. (sym -> asym indices:)
+            // WEDGE_OBLIQUE27: sym - 0, asym - 8, 9
+            // WEDGE_OBLIQUE63: sym - 1, asym - 12, 13
+            // WEDGE_OBLIQUE117: sym - 2, asym - 14, 15
+            // WEDGE_OBLIQUE153: sym - 3, asym - 10, 11
+ const int asym_mask_idx[4] = { 7, 11, 13, 9 };
+ wedge_mask = asym_mask_idx[best_mask_index];
+ wedge_mask_size = wedge_mask + 3;
+ }
+ }
+ }
+ }
+
+ if (need_mask_search) {
+ if (save_mask_search_results(
+ this_mode, cpi->sf.inter_sf.reuse_mask_search_results)) {
+ args->wedge_index = best_mask_index;
+ args->wedge_sign = best_wedge_sign;
+ }
+ } else {
+ mbmi->interinter_comp.wedge_index = args->wedge_index;
+ mbmi->interinter_comp.wedge_sign = args->wedge_sign;
+ rs2 = masked_type_cost[cur_type];
+ rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+ if (wedge_newmv_search) {
+ tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+ bsize, this_mode);
+ }
+
+ best_mask_index = args->wedge_index;
+ best_wedge_sign = args->wedge_sign;
+ tmp_mv[0] = mbmi->mv[0];
+ tmp_mv[1] = mbmi->mv[1];
+ best_rate_mv = tmp_rate_mv;
+ best_rs2 = masked_type_cost[cur_type];
+ best_rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ best_rs2 + *rate_mv);
+ if (eval_txfm) {
+ RD_STATS est_rd_stats;
+ estimate_yrd_for_sb(cpi, bsize, x, INT64_MAX, &est_rd_stats);
+ best_rd_cur =
+ RDCOST(x->rdmult, best_rs2 + tmp_rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ }
+ }
+
+ mbmi->interinter_comp.wedge_index = best_mask_index;
+ mbmi->interinter_comp.wedge_sign = best_wedge_sign;
+ mbmi->mv[0] = tmp_mv[0];
+ mbmi->mv[1] = tmp_mv[1];
+ tmp_rate_mv = best_rate_mv;
+ rs2 = best_rs2;
+ } else if (skip_mv_refinement_for_diffwtd) {
+ int_mv tmp_mv[2];
+ int best_mask_index = 0;
+ rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+ int need_mask_search = args->diffwtd_index == -1;
+
+ for (int mask_index = 0; mask_index < 2 && need_mask_search;
+ ++mask_index) {
+ tmp_rate_mv = *rate_mv;
+ mbmi->interinter_comp.mask_type = mask_index;
+ if (have_newmv_in_inter_mode(this_mode)) {
+          // Hard-coded seg mask seed values for the two DIFFWTD mask types
+          // (38, and 26 = 64 - 38).
+ int mask_value = mask_index == 0 ? 38 : 26;
+ memset(xd->seg_mask, mask_value,
+ sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE);
+ tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+ bsize, this_mode);
+ }
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ RD_STATS est_rd_stats;
+ int64_t this_rd_cur = INT64_MAX;
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ rs2 + *rate_mv);
+ if (eval_txfm) {
+ this_rd_cur =
+ estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
+ }
+ if (this_rd_cur < INT64_MAX) {
+ this_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ }
+
+ if (this_rd_cur < best_rd_cur) {
+ best_rd_cur = this_rd_cur;
+ best_mask_index = mbmi->interinter_comp.mask_type;
+ tmp_mv[0] = mbmi->mv[0];
+ tmp_mv[1] = mbmi->mv[1];
+ }
+ }
+
+ if (need_mask_search) {
+ if (save_mask_search_results(this_mode, 0))
+ args->diffwtd_index = best_mask_index;
+ } else {
+ mbmi->interinter_comp.mask_type = args->diffwtd_index;
+ rs2 = masked_type_cost[cur_type];
+ rs2 += get_interinter_compound_mask_rate(&x->mode_costs, mbmi);
+
+ int mask_value = mbmi->interinter_comp.mask_type == 0 ? 38 : 26;
+ memset(xd->seg_mask, mask_value,
+ sizeof(xd->seg_mask[0]) * 2 * MAX_SB_SQUARE);
+
+ if (have_newmv_in_inter_mode(this_mode)) {
+ tmp_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+ bsize, this_mode);
+ }
+ best_mask_index = mbmi->interinter_comp.mask_type;
+ tmp_mv[0] = mbmi->mv[0];
+ tmp_mv[1] = mbmi->mv[1];
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ RD_STATS est_rd_stats;
+ int64_t this_rd_cur = INT64_MAX;
+ int eval_txfm = prune_mode_by_skip_rd(cpi, x, xd, bsize, ref_skip_rd,
+ rs2 + *rate_mv);
+ if (eval_txfm) {
+ this_rd_cur =
+ estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
+ }
+ if (this_rd_cur < INT64_MAX) {
+ best_rd_cur = RDCOST(x->rdmult, rs2 + tmp_rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ }
+ }
+
+ mbmi->interinter_comp.mask_type = best_mask_index;
+ mbmi->mv[0] = tmp_mv[0];
+ mbmi->mv[1] = tmp_mv[1];
+ } else {
+ // Handle masked compound types
+ bool eval_masked_comp_type = true;
+ if (*rd != INT64_MAX) {
+ // Factors to control gating of compound type selection based on best
+ // approximate rd so far
+ const int max_comp_type_rd_threshold_mul =
+ comp_type_rd_threshold_mul[cpi->sf.inter_sf
+ .prune_comp_type_by_comp_avg];
+ const int max_comp_type_rd_threshold_div =
+ comp_type_rd_threshold_div[cpi->sf.inter_sf
+ .prune_comp_type_by_comp_avg];
+        // Evaluate COMPOUND_WEDGE / COMPOUND_DIFFWTD only if the approximate
+        // cost is within the threshold.
+ const int64_t approx_rd = ((*rd / max_comp_type_rd_threshold_div) *
+ max_comp_type_rd_threshold_mul);
+ if (approx_rd >= ref_best_rd) eval_masked_comp_type = false;
+ }
+
+ if (eval_masked_comp_type) {
+ const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh);
+ best_rd_cur = masked_compound_type_rd(
+ cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst,
+ &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10,
+ strides, rd_stats->rate, tmp_rd_thresh, &calc_pred_masked_compound,
+ comp_rate, comp_dist, comp_model_rate, comp_model_dist,
+ best_type_stats.comp_best_model_rd, &comp_model_rd_cur, comp_rs2,
+ ref_skip_rd);
+ }
+ }
+
+ // Update stats for best compound type
+ if (best_rd_cur < *rd) {
+ update_best_info(mbmi, rd, &best_type_stats, best_rd_cur,
+ comp_model_rd_cur, rs2);
+ if (have_newmv_in_inter_mode(this_mode))
+ update_mask_best_mv(mbmi, best_mv, &best_tmp_rate_mv, tmp_rate_mv);
+ }
+    // Reset to the original MVs for the next iteration.
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ }
+
+ mbmi->comp_group_idx =
+ (best_type_stats.best_compound_data.type < COMPOUND_WEDGE) ? 0 : 1;
+ mbmi->compound_idx =
+ !(best_type_stats.best_compound_data.type == COMPOUND_DISTWTD);
+ mbmi->interinter_comp = best_type_stats.best_compound_data;
+
+ if (have_newmv_in_inter_mode(this_mode)) {
+ mbmi->mv[0].as_int = best_mv[0].as_int;
+ mbmi->mv[1].as_int = best_mv[1].as_int;
+ rd_stats->rate += best_tmp_rate_mv - *rate_mv;
+ *rate_mv = best_tmp_rate_mv;
+ }
+
+ if (this_mode == NEW_NEWMV)
+ args->cmp_mode[ref_frame] = mbmi->interinter_comp.type;
+
+ restore_dst_buf(xd, *orig_dst, 1);
+ if (!match_found)
+ save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rate,
+ comp_model_dist, cur_mv, comp_rs2);
+ return best_type_stats.best_compmode_interinter_cost;
+}
diff --git a/third_party/aom/av1/encoder/compound_type.h b/third_party/aom/av1/encoder/compound_type.h
new file mode 100644
index 0000000000..a028a35093
--- /dev/null
+++ b/third_party/aom/av1/encoder/compound_type.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_COMPOUND_TYPE_H_
+#define AOM_AV1_ENCODER_COMPOUND_TYPE_H_
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/interp_search.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Structure to store the stats related to the best compound type.
+typedef struct {
+ INTERINTER_COMPOUND_DATA best_compound_data;
+ int64_t comp_best_model_rd;
+ int best_compmode_interinter_cost;
+} BEST_COMP_TYPE_STATS;
+
+#define IGNORE_MODE -1
+// Searches for the best inter-intra mode. Returns IGNORE_MODE if no good mode
+// is found, 0 otherwise.
+int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
+ HandleInterModeArgs *args, int64_t ref_best_rd,
+ int *rate_mv, int *tmp_rate2,
+ const BUFFER_SET *orig_dst);
+
+int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ HandleInterModeArgs *args, BLOCK_SIZE bsize,
+ int_mv *cur_mv, int mode_search_mask,
+ int masked_compound_used, const BUFFER_SET *orig_dst,
+ const BUFFER_SET *tmp_dst,
+ const CompoundTypeRdBuffers *buffers, int *rate_mv,
+ int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd,
+ int64_t ref_skip_rd, int *is_luma_interp_done,
+ int64_t rd_thresh);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_COMPOUND_TYPE_H_
diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c
new file mode 100644
index 0000000000..aafe55d2d0
--- /dev/null
+++ b/third_party/aom/av1/encoder/context_tree.c
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+#include <assert.h>
+
+void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
+ PICK_MODE_CONTEXT *src_ctx) {
+ dst_ctx->mic = src_ctx->mic;
+ dst_ctx->mbmi_ext_best = src_ctx->mbmi_ext_best;
+
+ dst_ctx->num_4x4_blk = src_ctx->num_4x4_blk;
+ dst_ctx->skippable = src_ctx->skippable;
+#if CONFIG_INTERNAL_STATS
+ dst_ctx->best_mode_index = src_ctx->best_mode_index;
+#endif // CONFIG_INTERNAL_STATS
+
+ memcpy(dst_ctx->blk_skip, src_ctx->blk_skip,
+ sizeof(uint8_t) * src_ctx->num_4x4_blk);
+ av1_copy_array(dst_ctx->tx_type_map, src_ctx->tx_type_map,
+ src_ctx->num_4x4_blk);
+
+ dst_ctx->rd_stats = src_ctx->rd_stats;
+ dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready;
+}
+
+void av1_setup_shared_coeff_buffer(const SequenceHeader *const seq_params,
+ PC_TREE_SHARED_BUFFERS *shared_bufs,
+ struct aom_internal_error_info *error) {
+ const int num_planes = seq_params->monochrome ? 1 : MAX_MB_PLANE;
+ const int max_sb_square_y = 1 << num_pels_log2_lookup[seq_params->sb_size];
+ const int max_sb_square_uv = max_sb_square_y >> (seq_params->subsampling_x +
+ seq_params->subsampling_y);
+ for (int i = 0; i < num_planes; i++) {
+ const int max_num_pix =
+ (i == AOM_PLANE_Y) ? max_sb_square_y : max_sb_square_uv;
+ AOM_CHECK_MEM_ERROR(error, shared_bufs->coeff_buf[i],
+ aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+ AOM_CHECK_MEM_ERROR(error, shared_bufs->qcoeff_buf[i],
+ aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+ AOM_CHECK_MEM_ERROR(error, shared_bufs->dqcoeff_buf[i],
+ aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+ }
+}
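+
+// For example, with a 128x128 superblock (num_pels_log2 = 14) and 4:2:0
+// subsampling, each luma buffer holds 1 << 14 = 16384 tran_low_t values and
+// each chroma buffer holds 16384 >> 2 = 4096.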
+
+void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs) {
+ for (int i = 0; i < 3; i++) {
+ aom_free(shared_bufs->coeff_buf[i]);
+ aom_free(shared_bufs->qcoeff_buf[i]);
+ aom_free(shared_bufs->dqcoeff_buf[i]);
+ shared_bufs->coeff_buf[i] = NULL;
+ shared_bufs->qcoeff_buf[i] = NULL;
+ shared_bufs->dqcoeff_buf[i] = NULL;
+ }
+}
+
+PICK_MODE_CONTEXT *av1_alloc_pmc(const struct AV1_COMP *const cpi,
+ BLOCK_SIZE bsize,
+ PC_TREE_SHARED_BUFFERS *shared_bufs) {
+ PICK_MODE_CONTEXT *volatile ctx = NULL;
+ const AV1_COMMON *const cm = &cpi->common;
+ struct aom_internal_error_info error;
+
+ if (setjmp(error.jmp)) {
+ av1_free_pmc(ctx, av1_num_planes(cm));
+ return NULL;
+ }
+ error.setjmp = 1;
+
+ AOM_CHECK_MEM_ERROR(&error, ctx, aom_calloc(1, sizeof(*ctx)));
+ ctx->rd_mode_is_ready = 0;
+
+ const int num_planes = av1_num_planes(cm);
+ const int num_pix = block_size_wide[bsize] * block_size_high[bsize];
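+  // num_pix / 16: statistics such as blk_skip and tx_type_map are tracked at
+  // 4x4 granularity.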
+ const int num_blk = num_pix / 16;
+
+ AOM_CHECK_MEM_ERROR(&error, ctx->blk_skip,
+ aom_calloc(num_blk, sizeof(*ctx->blk_skip)));
+ AOM_CHECK_MEM_ERROR(&error, ctx->tx_type_map,
+ aom_calloc(num_blk, sizeof(*ctx->tx_type_map)));
+ ctx->num_4x4_blk = num_blk;
+
+ for (int i = 0; i < num_planes; ++i) {
+ ctx->coeff[i] = shared_bufs->coeff_buf[i];
+ ctx->qcoeff[i] = shared_bufs->qcoeff_buf[i];
+ ctx->dqcoeff[i] = shared_bufs->dqcoeff_buf[i];
+ AOM_CHECK_MEM_ERROR(&error, ctx->eobs[i],
+ aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
+ AOM_CHECK_MEM_ERROR(
+ &error, ctx->txb_entropy_ctx[i],
+ aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i])));
+ }
+
+ if (num_pix <= MAX_PALETTE_SQUARE) {
+ for (int i = 0; i < 2; ++i) {
+ if (cm->features.allow_screen_content_tools) {
+ AOM_CHECK_MEM_ERROR(
+ &error, ctx->color_index_map[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
+ } else {
+ ctx->color_index_map[i] = NULL;
+ }
+ }
+ }
+
+ av1_invalid_rd_stats(&ctx->rd_stats);
+
+ return ctx;
+}
+
+void av1_reset_pmc(PICK_MODE_CONTEXT *ctx) {
+ av1_zero_array(ctx->blk_skip, ctx->num_4x4_blk);
+ av1_zero_array(ctx->tx_type_map, ctx->num_4x4_blk);
+ av1_invalid_rd_stats(&ctx->rd_stats);
+}
+
+void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes) {
+ if (ctx == NULL) return;
+
+ aom_free(ctx->blk_skip);
+ ctx->blk_skip = NULL;
+ aom_free(ctx->tx_type_map);
+ for (int i = 0; i < num_planes; ++i) {
+ ctx->coeff[i] = NULL;
+ ctx->qcoeff[i] = NULL;
+ ctx->dqcoeff[i] = NULL;
+ aom_free(ctx->eobs[i]);
+ ctx->eobs[i] = NULL;
+ aom_free(ctx->txb_entropy_ctx[i]);
+ ctx->txb_entropy_ctx[i] = NULL;
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ if (ctx->color_index_map[i]) {
+ aom_free(ctx->color_index_map[i]);
+ ctx->color_index_map[i] = NULL;
+ }
+ }
+
+ aom_free(ctx);
+}
+
+PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize) {
+ PC_TREE *pc_tree = aom_calloc(1, sizeof(*pc_tree));
+ if (pc_tree == NULL) return NULL;
+
+ pc_tree->partitioning = PARTITION_NONE;
+ pc_tree->block_size = bsize;
+
+ return pc_tree;
+}
+
+#define FREE_PMC_NODE(CTX) \
+ do { \
+ av1_free_pmc(CTX, num_planes); \
+ CTX = NULL; \
+ } while (0)
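+
+// Note: FREE_PMC_NODE expands as a single statement via the do/while (0)
+// idiom and relies on the enclosing function providing num_planes.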
+
+void av1_free_pc_tree_recursive(PC_TREE *pc_tree, int num_planes, int keep_best,
+ int keep_none,
+ PARTITION_SEARCH_TYPE partition_search_type) {
+ if (pc_tree == NULL) return;
+
+ // Avoid freeing of extended partitions as they are not supported when
+ // partition_search_type is VAR_BASED_PARTITION.
+ if (partition_search_type == VAR_BASED_PARTITION && !keep_best &&
+ !keep_none) {
+ FREE_PMC_NODE(pc_tree->none);
+
+ for (int i = 0; i < 2; ++i) {
+ FREE_PMC_NODE(pc_tree->horizontal[i]);
+ FREE_PMC_NODE(pc_tree->vertical[i]);
+ }
+
+#if !defined(NDEBUG) && !CONFIG_REALTIME_ONLY
+ for (int i = 0; i < 3; ++i) {
+ assert(pc_tree->horizontala[i] == NULL);
+ assert(pc_tree->horizontalb[i] == NULL);
+ assert(pc_tree->verticala[i] == NULL);
+ assert(pc_tree->verticalb[i] == NULL);
+ }
+ for (int i = 0; i < 4; ++i) {
+ assert(pc_tree->horizontal4[i] == NULL);
+ assert(pc_tree->vertical4[i] == NULL);
+ }
+#endif
+
+ for (int i = 0; i < 4; ++i) {
+ if (pc_tree->split[i] != NULL) {
+ av1_free_pc_tree_recursive(pc_tree->split[i], num_planes, 0, 0,
+ partition_search_type);
+ pc_tree->split[i] = NULL;
+ }
+ }
+ aom_free(pc_tree);
+ return;
+ }
+
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+
+ if (!keep_none && (!keep_best || (partition != PARTITION_NONE)))
+ FREE_PMC_NODE(pc_tree->none);
+
+ for (int i = 0; i < 2; ++i) {
+ if (!keep_best || (partition != PARTITION_HORZ))
+ FREE_PMC_NODE(pc_tree->horizontal[i]);
+ if (!keep_best || (partition != PARTITION_VERT))
+ FREE_PMC_NODE(pc_tree->vertical[i]);
+ }
+#if !CONFIG_REALTIME_ONLY
+ for (int i = 0; i < 3; ++i) {
+ if (!keep_best || (partition != PARTITION_HORZ_A))
+ FREE_PMC_NODE(pc_tree->horizontala[i]);
+ if (!keep_best || (partition != PARTITION_HORZ_B))
+ FREE_PMC_NODE(pc_tree->horizontalb[i]);
+ if (!keep_best || (partition != PARTITION_VERT_A))
+ FREE_PMC_NODE(pc_tree->verticala[i]);
+ if (!keep_best || (partition != PARTITION_VERT_B))
+ FREE_PMC_NODE(pc_tree->verticalb[i]);
+ }
+ for (int i = 0; i < 4; ++i) {
+ if (!keep_best || (partition != PARTITION_HORZ_4))
+ FREE_PMC_NODE(pc_tree->horizontal4[i]);
+ if (!keep_best || (partition != PARTITION_VERT_4))
+ FREE_PMC_NODE(pc_tree->vertical4[i]);
+ }
+#endif
+ if (!keep_best || (partition != PARTITION_SPLIT)) {
+ for (int i = 0; i < 4; ++i) {
+ if (pc_tree->split[i] != NULL) {
+ av1_free_pc_tree_recursive(pc_tree->split[i], num_planes, 0, 0,
+ partition_search_type);
+ pc_tree->split[i] = NULL;
+ }
+ }
+ }
+
+ if (!keep_best && !keep_none) aom_free(pc_tree);
+}
+
+int av1_setup_sms_tree(AV1_COMP *const cpi, ThreadData *td) {
+ // The structure 'sms_tree' is used to store the simple motion search data for
+ // partition pruning in inter frames. Hence, the memory allocations and
+ // initializations related to it are avoided for allintra encoding mode.
+ if (cpi->oxcf.kf_cfg.key_freq_max == 0) return 0;
+
+ AV1_COMMON *const cm = &cpi->common;
+ const int stat_generation_stage = is_stat_generation_stage(cpi);
+ const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128;
+ const int tree_nodes =
+ av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+ int sms_tree_index = 0;
+ SIMPLE_MOTION_DATA_TREE *this_sms;
+ int square_index = 1;
+ int nodes;
+
+ aom_free(td->sms_tree);
+ td->sms_tree =
+ (SIMPLE_MOTION_DATA_TREE *)aom_calloc(tree_nodes, sizeof(*td->sms_tree));
+ if (!td->sms_tree) return -1;
+ this_sms = &td->sms_tree[0];
+
+ if (!stat_generation_stage) {
+ const int leaf_factor = is_sb_size_128 ? 4 : 1;
+ const int leaf_nodes = 256 * leaf_factor;
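+    // For a 128x128 superblock the walk below consumes 1024 leaves plus
+    // 256 + 64 + 16 + 4 + 1 internal nodes (1365 in total); for 64x64, 256
+    // leaves and 341 nodes, assuming av1_get_pc_tree_nodes() allocated at
+    // least that many entries.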
+
+ // Sets up all the leaf nodes in the tree.
+ for (sms_tree_index = 0; sms_tree_index < leaf_nodes; ++sms_tree_index) {
+ SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
+ tree->block_size = square[0];
+ }
+
+ // Each node has 4 leaf nodes, fill each block_size level of the tree
+ // from leafs to the root.
+ for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+ for (int i = 0; i < nodes; ++i) {
+ SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
+ tree->block_size = square[square_index];
+ for (int j = 0; j < 4; j++) tree->split[j] = this_sms++;
+ ++sms_tree_index;
+ }
+ ++square_index;
+ }
+ } else {
+ // Allocation for firstpass/LAP stage
+ // TODO(Mufaddal): refactor square_index to use a common block_size macro
+ // from firstpass.c
+ SIMPLE_MOTION_DATA_TREE *const tree = &td->sms_tree[sms_tree_index];
+ square_index = 2;
+ tree->block_size = square[square_index];
+ }
+
+ // Set up the root node for the largest superblock size
+ td->sms_root = &td->sms_tree[tree_nodes - 1];
+ return 0;
+}
+
+void av1_free_sms_tree(ThreadData *td) {
+ aom_free(td->sms_tree);
+ td->sms_tree = NULL;
+}
diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h
new file mode 100644
index 0000000000..0be7ccbb54
--- /dev/null
+++ b/third_party/aom/av1/encoder/context_tree.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_CONTEXT_TREE_H_
+#define AOM_AV1_ENCODER_CONTEXT_TREE_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/speed_features.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_PRIMARY;
+struct AV1_COMP;
+struct AV1Common;
+struct ThreadData;
+
+typedef struct {
+ tran_low_t *coeff_buf[MAX_MB_PLANE];
+ tran_low_t *qcoeff_buf[MAX_MB_PLANE];
+ tran_low_t *dqcoeff_buf[MAX_MB_PLANE];
+} PC_TREE_SHARED_BUFFERS;
+
+// Structure to hold a snapshot of the coding context during mode picking.
+typedef struct PICK_MODE_CONTEXT {
+ MB_MODE_INFO mic;
+ MB_MODE_INFO_EXT_FRAME mbmi_ext_best;
+ uint8_t *color_index_map[2];
+ uint8_t *blk_skip;
+
+ tran_low_t *coeff[MAX_MB_PLANE];
+ tran_low_t *qcoeff[MAX_MB_PLANE];
+ tran_low_t *dqcoeff[MAX_MB_PLANE];
+ uint16_t *eobs[MAX_MB_PLANE];
+ uint8_t *txb_entropy_ctx[MAX_MB_PLANE];
+ uint8_t *tx_type_map;
+
+ int num_4x4_blk;
+  // For the current partition, skippable is set to 1 only if the
+  // coefficients of all Y, U, and V transform blocks are quantized to 0.
+ int skippable;
+#if CONFIG_INTERNAL_STATS
+ THR_MODES best_mode_index;
+#endif // CONFIG_INTERNAL_STATS
+ RD_STATS rd_stats;
+
+ int rd_mode_is_ready; // Flag to indicate whether rd pick mode decision has
+ // been made.
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ int64_t newmv_sse;
+ int64_t zeromv_sse;
+ int64_t zeromv_lastref_sse;
+ PREDICTION_MODE best_sse_inter_mode;
+ int_mv best_sse_mv;
+ MV_REFERENCE_FRAME best_reference_frame;
+ MV_REFERENCE_FRAME best_zeromv_reference_frame;
+ int sb_skip_denoising;
+#endif
+} PICK_MODE_CONTEXT;
+
+typedef struct PC_TREE {
+ PARTITION_TYPE partitioning;
+ BLOCK_SIZE block_size;
+ PICK_MODE_CONTEXT *none;
+ PICK_MODE_CONTEXT *horizontal[2];
+ PICK_MODE_CONTEXT *vertical[2];
+#if !CONFIG_REALTIME_ONLY
+ PICK_MODE_CONTEXT *horizontala[3];
+ PICK_MODE_CONTEXT *horizontalb[3];
+ PICK_MODE_CONTEXT *verticala[3];
+ PICK_MODE_CONTEXT *verticalb[3];
+ PICK_MODE_CONTEXT *horizontal4[4];
+ PICK_MODE_CONTEXT *vertical4[4];
+#endif
+ struct PC_TREE *split[4];
+ int index;
+} PC_TREE;
+
+typedef struct SIMPLE_MOTION_DATA_TREE {
+ BLOCK_SIZE block_size;
+ PARTITION_TYPE partitioning;
+ struct SIMPLE_MOTION_DATA_TREE *split[4];
+
+  // Simple motion search features
+ FULLPEL_MV start_mvs[REF_FRAMES];
+ unsigned int sms_none_feat[2];
+ unsigned int sms_rect_feat[8];
+ int sms_none_valid;
+ int sms_rect_valid;
+} SIMPLE_MOTION_DATA_TREE;
+
+void av1_setup_shared_coeff_buffer(const SequenceHeader *const seq_params,
+ PC_TREE_SHARED_BUFFERS *shared_bufs,
+ struct aom_internal_error_info *error);
+void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs);
+
+PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize);
+void av1_free_pc_tree_recursive(PC_TREE *tree, int num_planes, int keep_best,
+ int keep_none,
+ PARTITION_SEARCH_TYPE partition_search_type);
+
+PICK_MODE_CONTEXT *av1_alloc_pmc(const struct AV1_COMP *const cpi,
+ BLOCK_SIZE bsize,
+ PC_TREE_SHARED_BUFFERS *shared_bufs);
+void av1_reset_pmc(PICK_MODE_CONTEXT *ctx);
+void av1_free_pmc(PICK_MODE_CONTEXT *ctx, int num_planes);
+void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
+ PICK_MODE_CONTEXT *src_ctx);
+
+static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {
+ BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128,
+};
+
+static AOM_INLINE int av1_get_pc_tree_nodes(const int is_sb_size_128,
+ int stat_generation_stage) {
+ const int tree_nodes_inc = is_sb_size_128 ? 1024 : 0;
+ const int tree_nodes =
+ stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
+ return tree_nodes;
+}
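+
+// Worked through: 256 + 64 + 16 + 4 + 1 = 341 nodes for a 64x64 superblock,
+// 1024 + 341 = 1365 when 128x128 superblocks add one more leaf level, and a
+// single node during stat generation, where no partition search is run.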
+
+// Returns 0 on success, -1 on memory allocation failure.
+int av1_setup_sms_tree(struct AV1_COMP *const cpi, struct ThreadData *td);
+void av1_free_sms_tree(struct ThreadData *td);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_CONTEXT_TREE_H_
diff --git a/third_party/aom/av1/encoder/cost.c b/third_party/aom/av1/encoder/cost.c
new file mode 100644
index 0000000000..323e2aed58
--- /dev/null
+++ b/third_party/aom/av1/encoder/cost.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "av1/encoder/cost.h"
+#include "av1/common/entropy.h"
+
+// Entry i - 128 is round(-log2(i / 256.0) * (1 << AV1_PROB_COST_SHIFT)) for
+// i = 128..255.
+const uint16_t av1_prob_cost[128] = {
+ 512, 506, 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435,
+ 430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, 366, 361,
+ 356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, 307, 302, 298, 294,
+ 289, 285, 281, 277, 273, 268, 264, 260, 256, 252, 248, 244, 240, 236, 232,
+ 228, 224, 220, 216, 212, 209, 205, 201, 197, 194, 190, 186, 182, 179, 175,
+ 171, 168, 164, 161, 157, 153, 150, 146, 143, 139, 136, 132, 129, 125, 122,
+ 119, 115, 112, 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73,
+ 70, 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29, 26,
+ 23, 20, 18, 15, 12, 9, 6, 3,
+};
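+
+/* Hypothetical standalone generator for the table above (illustration only,
+ * never compiled into the encoder; it would also need <math.h>): entry
+ * i - 128 is the cost of a symbol whose 8-bit probability is i / 256, in
+ * 1/512-bit units since AV1_PROB_COST_SHIFT == 9. */
+#if 0
+static uint16_t prob_cost_entry(int i) {
+  assert(i >= 128 && i < 256);
+  return (uint16_t)lround(-log2(i / 256.0) * (1 << AV1_PROB_COST_SHIFT));
+}
+#endif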
+
+void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
+ const int *inv_map) {
+  aom_cdf_prob prev_cdf = 0;
+  for (int i = 0;; ++i) {
+ aom_cdf_prob p15 = AOM_ICDF(cdf[i]) - prev_cdf;
+ p15 = (p15 < EC_MIN_PROB) ? EC_MIN_PROB : p15;
+ prev_cdf = AOM_ICDF(cdf[i]);
+
+ if (inv_map)
+ costs[inv_map[i]] = av1_cost_symbol(p15);
+ else
+ costs[i] = av1_cost_symbol(p15);
+
+ // Stop once we reach the end of the CDF
+ if (cdf[i] == AOM_ICDF(CDF_PROB_TOP)) break;
+ }
+}
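+
+/* Minimal usage sketch (illustration only), assuming libaom's inverted-CDF
+ * convention: the stored value is AOM_ICDF(x) == 32768 - x and the array
+ * terminates at AOM_ICDF(CDF_PROB_TOP) == 0. For a binary symbol with
+ * P(0) = 24576 / 32768 = 3/4: */
+#if 0
+static void cost_tokens_example(int costs[2]) {
+  const aom_cdf_prob cdf[2] = { AOM_ICDF(24576), AOM_ICDF(CDF_PROB_TOP) };
+  av1_cost_tokens_from_cdf(costs, cdf, /*inv_map=*/NULL);
+  // costs[0] == 212 (~0.41 bits << 9), costs[1] == 1024 (2 bits << 9).
+}
+#endif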
diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h
new file mode 100644
index 0000000000..be0241a820
--- /dev/null
+++ b/third_party/aom/av1/encoder/cost.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_COST_H_
+#define AOM_AV1_ENCODER_COST_H_
+
+#include "aom_dsp/prob.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const uint16_t av1_prob_cost[128];
+
+// The factor to scale from cost in bits to cost in av1_prob_cost units.
+#define AV1_PROB_COST_SHIFT 9
+
+// Cost of coding an n-bit literal, using 128 (i.e. 50%) probability for
+// each bit.
+#define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT))
+
+// Calculate the cost of a symbol with probability p15 / 2^15
+static INLINE int av1_cost_symbol(aom_cdf_prob p15) {
+  // p15 can fall outside the range [1, CDF_PROB_TOP - 1]. Clamp it so that
+  // the cost calculation below works correctly; otherwise, if p15 ==
+  // CDF_PROB_TOP, shift would be -1 and "p15 << shift" would be wrong.
+ p15 = (aom_cdf_prob)clamp(p15, 1, CDF_PROB_TOP - 1);
+ assert(0 < p15 && p15 < CDF_PROB_TOP);
+ const int shift = CDF_PROB_BITS - 1 - get_msb(p15);
+ const int prob = get_prob(p15 << shift, CDF_PROB_TOP);
+ assert(prob >= 128);
+ return av1_prob_cost[prob - 128] + av1_cost_literal(shift);
+}
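+
+// Worked example: for p15 = 8192 (probability 1/4), get_msb(8192) == 13, so
+// shift == 1; prob == get_prob(16384, 32768) == 128; the result is
+// av1_prob_cost[0] + av1_cost_literal(1) == 512 + 512 == 1024, i.e. exactly
+// 2 bits in 1/512-bit units.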
+
+void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
+ const int *inv_map);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_COST_H_
diff --git a/third_party/aom/av1/encoder/deltaq4_model.c b/third_party/aom/av1/encoder/deltaq4_model.c
new file mode 100644
index 0000000000..60a7e6d2cf
--- /dev/null
+++ b/third_party/aom/av1/encoder/deltaq4_model.c
@@ -0,0 +1,7776 @@
+/* Embedded file: model.tflite */
+const int av1_deltaq4_model_fsize = 101032;
+const unsigned char av1_deltaq4_model_file[101032] = {
+ 0x1c, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x14, 0x00, 0x20, 0x00, 0x1c,
+ 0x00, 0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00,
+ 0x04, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00,
+ 0x00, 0xc0, 0x00, 0x00, 0x00, 0xc0, 0x7e, 0x01, 0x00, 0xd0, 0x7e, 0x01, 0x00,
+ 0x24, 0x8a, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04,
+ 0x00, 0x00, 0x00, 0x6a, 0x80, 0xfe, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x1c, 0x00,
+ 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x73, 0x65, 0x72,
+ 0x76, 0x69, 0x6e, 0x67, 0x5f, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x00,
+ 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xb4, 0xff, 0xff, 0xff, 0x14,
+ 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x64, 0x65,
+ 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+ 0x00, 0x04, 0x00, 0x00, 0x00, 0xca, 0x81, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00,
+ 0x10, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+ 0x31, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00,
+ 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04,
+ 0x00, 0x08, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+ 0x13, 0x00, 0x00, 0x00, 0x6d, 0x69, 0x6e, 0x5f, 0x72, 0x75, 0x6e, 0x74, 0x69,
+ 0x6d, 0x65, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x00, 0x17, 0x00,
+ 0x00, 0x00, 0xfc, 0x7d, 0x01, 0x00, 0xf4, 0x7d, 0x01, 0x00, 0xdc, 0x7d, 0x01,
+ 0x00, 0x84, 0x7d, 0x01, 0x00, 0xf4, 0x7c, 0x01, 0x00, 0xa4, 0x7c, 0x01, 0x00,
+ 0x74, 0x7c, 0x01, 0x00, 0x5c, 0x7c, 0x01, 0x00, 0x4c, 0x5c, 0x00, 0x00, 0xbc,
+ 0x5b, 0x00, 0x00, 0x8c, 0x5a, 0x00, 0x00, 0x7c, 0x48, 0x00, 0x00, 0x6c, 0x00,
+ 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00,
+ 0x00, 0x4c, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00,
+ 0x34, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x04,
+ 0x00, 0x00, 0x00, 0x7e, 0x82, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00,
+ 0x00, 0x00, 0x31, 0x2e, 0x35, 0x2e, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x50, 0x77, 0xfe, 0xff, 0x54, 0x77, 0xfe, 0xff,
+ 0x58, 0x77, 0xfe, 0xff, 0x5c, 0x77, 0xfe, 0xff, 0x60, 0x77, 0xfe, 0xff, 0x64,
+ 0x77, 0xfe, 0xff, 0x68, 0x77, 0xfe, 0xff, 0x6c, 0x77, 0xfe, 0xff, 0x70, 0x77,
+ 0xfe, 0xff, 0xbe, 0x82, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00,
+ 0x00, 0x3e, 0x84, 0xfc, 0x3b, 0xef, 0x95, 0x2f, 0xbd, 0xd3, 0x21, 0x96, 0xbd,
+ 0x11, 0x9a, 0xc6, 0x3d, 0xd9, 0x7e, 0x0c, 0xbe, 0xcb, 0xd2, 0x8c, 0xbb, 0x60,
+ 0xf5, 0x92, 0xbd, 0x70, 0xce, 0x9e, 0x3d, 0x26, 0x67, 0xc4, 0x3d, 0x9b, 0x2a,
+ 0x8b, 0x3b, 0x3b, 0xdd, 0x2a, 0xbd, 0xf9, 0x09, 0x8a, 0xbd, 0x1b, 0xae, 0xd7,
+ 0x3c, 0xbf, 0x39, 0x87, 0xbd, 0x4c, 0x9e, 0xe2, 0x3d, 0x50, 0x9c, 0xe7, 0xbd,
+ 0x1e, 0x58, 0x57, 0x3d, 0x38, 0x8c, 0x58, 0xbd, 0x48, 0x9f, 0x4a, 0x3d, 0xcb,
+ 0x1c, 0x93, 0xbd, 0xeb, 0xb8, 0x5a, 0xbc, 0x63, 0x04, 0x4b, 0xbd, 0x9b, 0x76,
+ 0xa8, 0x3d, 0x20, 0xb4, 0x69, 0x3d, 0xee, 0xcc, 0xe5, 0x3a, 0x4f, 0x40, 0x02,
+ 0x3e, 0x21, 0x2e, 0x03, 0x3e, 0x25, 0x77, 0x99, 0xbd, 0xf5, 0xa1, 0xd0, 0x3c,
+ 0xc5, 0x15, 0xeb, 0x3c, 0x58, 0xb5, 0xb7, 0x3c, 0x80, 0x63, 0x33, 0xbd, 0xc9,
+ 0x66, 0x63, 0xbd, 0xf6, 0xef, 0xb8, 0xbd, 0xd7, 0xbf, 0x9f, 0x3b, 0x93, 0x68,
+ 0x35, 0x3d, 0x60, 0xfc, 0xf3, 0xbd, 0xed, 0xd9, 0x35, 0xbd, 0x57, 0xef, 0x8a,
+ 0x3d, 0x31, 0x97, 0xa4, 0x3d, 0x8e, 0x55, 0xe2, 0x3d, 0x27, 0xa5, 0xe9, 0x3d,
+ 0x36, 0x26, 0x67, 0xbc, 0xeb, 0xd1, 0x9e, 0xbd, 0xc7, 0xcd, 0x37, 0x3d, 0x31,
+ 0xfc, 0xce, 0x3d, 0x5e, 0xe3, 0x96, 0xbd, 0xeb, 0x24, 0x4d, 0x3c, 0xe6, 0x00,
+ 0xe2, 0xbd, 0x9b, 0x00, 0x17, 0xbd, 0xee, 0x9f, 0xc4, 0xbd, 0x6a, 0xcd, 0xba,
+ 0xbc, 0x2c, 0x2b, 0x97, 0xbd, 0x8a, 0x02, 0x68, 0xbc, 0xc3, 0x46, 0x9f, 0xbd,
+ 0x85, 0x3d, 0xc2, 0x3d, 0xbc, 0x16, 0x22, 0x3c, 0xf1, 0xca, 0xdf, 0x3d, 0xaf,
+ 0xef, 0xbc, 0x3c, 0x4c, 0xde, 0xe8, 0xbd, 0x5c, 0x5a, 0xc9, 0xbb, 0x35, 0xe5,
+ 0xc1, 0x3d, 0x14, 0xc7, 0xba, 0xbc, 0x05, 0xfb, 0x1d, 0x3d, 0x61, 0x23, 0xb7,
+ 0xbb, 0x17, 0x50, 0xb0, 0xbd, 0x14, 0x5b, 0xf4, 0xbd, 0xb1, 0x4d, 0x40, 0x3d,
+ 0x7e, 0x3d, 0xd8, 0x3d, 0x35, 0x2e, 0x90, 0x3d, 0x93, 0xcd, 0x0d, 0xbe, 0x8d,
+ 0x60, 0x70, 0x3d, 0x4a, 0x7c, 0xf2, 0x3c, 0x07, 0x2a, 0x7f, 0x3d, 0x2c, 0xab,
+ 0xd8, 0x3d, 0xb3, 0x1f, 0x1d, 0xbd, 0x44, 0x69, 0xf7, 0x3c, 0x71, 0xfd, 0x5e,
+ 0x3c, 0xc8, 0x14, 0x28, 0x3d, 0x71, 0x2e, 0x0c, 0x3b, 0x7f, 0xa3, 0xb5, 0x3d,
+ 0x55, 0x5c, 0x07, 0x3e, 0x0f, 0xf0, 0x3b, 0x3c, 0xd9, 0xc2, 0xbd, 0xbc, 0x71,
+ 0xaa, 0xc5, 0xbb, 0xa3, 0x86, 0xc7, 0x3d, 0xcf, 0x37, 0x95, 0xbd, 0x09, 0x63,
+ 0xc3, 0x3d, 0x0c, 0x01, 0x4e, 0xbd, 0xf1, 0xf9, 0x8d, 0x3d, 0xe2, 0x98, 0x45,
+ 0x3d, 0x76, 0xbc, 0x3b, 0x3d, 0x2a, 0xa2, 0x47, 0x3d, 0x8c, 0x1d, 0xae, 0xbd,
+ 0x5f, 0x35, 0x8c, 0xbd, 0x17, 0xeb, 0x05, 0x3d, 0x75, 0x62, 0xdb, 0xbd, 0x37,
+ 0xf8, 0xea, 0x3d, 0xf8, 0xa6, 0x6c, 0xbd, 0x8a, 0x86, 0x03, 0x3d, 0x67, 0x6c,
+ 0x8d, 0xbd, 0x58, 0xaf, 0xc5, 0xbd, 0x36, 0x51, 0x14, 0xbe, 0x60, 0xac, 0xe3,
+ 0x3d, 0x86, 0x4f, 0xf4, 0x3c, 0xf6, 0xa3, 0x29, 0x3d, 0xc3, 0x1d, 0x9a, 0x3c,
+ 0x44, 0xdc, 0x0e, 0xbc, 0x6b, 0x97, 0x8f, 0x3c, 0xc9, 0x3d, 0x88, 0xbc, 0x74,
+ 0x90, 0x9d, 0x3d, 0x0f, 0x02, 0xec, 0xbd, 0x12, 0xec, 0xb2, 0x3d, 0x6c, 0x32,
+ 0x31, 0x3d, 0x0b, 0x84, 0x35, 0x3d, 0xfc, 0xc2, 0x3c, 0x3d, 0x59, 0xdf, 0x16,
+ 0x3d, 0x8e, 0x29, 0xee, 0x3d, 0x83, 0xc3, 0xb7, 0xbd, 0x66, 0xbd, 0x84, 0xbd,
+ 0xb7, 0x49, 0x1b, 0x3d, 0x3f, 0xc1, 0x4a, 0x3d, 0x1a, 0x7d, 0xdf, 0x3d, 0xee,
+ 0x12, 0xb1, 0x3c, 0x29, 0x47, 0xe6, 0xbd, 0xd6, 0x04, 0xd6, 0x3d, 0xc2, 0x31,
+ 0x6f, 0xbd, 0xb0, 0x2c, 0x3e, 0xbd, 0x20, 0xd8, 0x43, 0xbd, 0x2d, 0x0c, 0x26,
+ 0xbd, 0x23, 0x47, 0x06, 0xbe, 0xb9, 0xd2, 0xb9, 0xbd, 0x7b, 0xef, 0xc8, 0x3d,
+ 0x23, 0x06, 0x06, 0x3d, 0x65, 0xc6, 0x45, 0xbd, 0x20, 0xc9, 0x24, 0xbc, 0xf7,
+ 0x2b, 0xf5, 0x3d, 0x41, 0x91, 0x15, 0xbd, 0x90, 0xbe, 0x0f, 0x3d, 0xe8, 0x94,
+ 0x8c, 0xbd, 0xdf, 0x96, 0x72, 0x3c, 0x8d, 0xb4, 0xed, 0x3d, 0x33, 0xf0, 0xb3,
+ 0xbd, 0x60, 0x49, 0xbc, 0xbd, 0x32, 0xf2, 0xd5, 0x3d, 0x3e, 0x3e, 0x6b, 0xbd,
+ 0xb4, 0x31, 0x09, 0x3e, 0xc6, 0x40, 0xfb, 0xbc, 0x75, 0x1a, 0x88, 0xbd, 0xbf,
+ 0x13, 0xb2, 0xbd, 0xe3, 0x78, 0xc4, 0xba, 0x68, 0xfc, 0x10, 0x3e, 0x27, 0x4c,
+ 0xf5, 0x3c, 0xfc, 0x68, 0x27, 0x3d, 0xb2, 0x2c, 0xe0, 0x3c, 0x6e, 0x4f, 0x9a,
+ 0xbb, 0xbb, 0x9f, 0xa1, 0xbd, 0x91, 0x7b, 0x9a, 0xbc, 0x17, 0x21, 0x52, 0xba,
+ 0x39, 0x8e, 0x4c, 0xbd, 0x03, 0xf5, 0xe5, 0x3d, 0x3a, 0x22, 0xcd, 0xbd, 0x90,
+ 0x1c, 0x78, 0xbd, 0x3f, 0xb1, 0x8d, 0xbd, 0xfc, 0x77, 0x25, 0xbe, 0x48, 0x9a,
+ 0xfd, 0x3c, 0xca, 0x6a, 0xa2, 0x3d, 0x45, 0xd6, 0x7a, 0xbd, 0xce, 0x9d, 0xbf,
+ 0x3d, 0x94, 0x1c, 0xbe, 0xbd, 0xcc, 0xc4, 0x83, 0xbc, 0xe9, 0xc7, 0xf3, 0xbc,
+ 0xdc, 0x31, 0x19, 0x39, 0x3a, 0x36, 0xea, 0x3d, 0x40, 0xa6, 0x72, 0xbd, 0x66,
+ 0xeb, 0x85, 0xb9, 0x68, 0xa0, 0x97, 0xbd, 0xa7, 0xeb, 0xa9, 0x3c, 0x4d, 0x79,
+ 0xf9, 0x3c, 0x55, 0x67, 0xb2, 0x3c, 0x80, 0x2a, 0x8f, 0xbd, 0xd5, 0x70, 0x17,
+ 0x3b, 0x41, 0xfb, 0xed, 0xbd, 0xae, 0xfe, 0x0e, 0xbd, 0x6d, 0x06, 0xd6, 0xbc,
+ 0x90, 0xc9, 0xd1, 0x3d, 0xb4, 0x6c, 0x19, 0x3b, 0xa3, 0x4f, 0x11, 0x3c, 0xb1,
+ 0x71, 0xc1, 0xbd, 0xcc, 0x5b, 0x20, 0xbc, 0x7a, 0xb5, 0xe9, 0x3d, 0x6f, 0x8c,
+ 0x95, 0x3d, 0x10, 0x56, 0x79, 0xbd, 0x45, 0x06, 0x69, 0x3c, 0xe4, 0x89, 0x9f,
+ 0xbd, 0xad, 0x43, 0x82, 0xbd, 0x7a, 0x1f, 0xbd, 0xbd, 0xbb, 0x25, 0x9b, 0x3c,
+ 0x27, 0xdc, 0x0f, 0xbe, 0x42, 0x7b, 0xe1, 0x3d, 0xaa, 0xd9, 0xcb, 0xbd, 0xa4,
+ 0xdf, 0x0e, 0x3e, 0xdd, 0x57, 0xbe, 0xbd, 0xf0, 0xb7, 0x87, 0xbd, 0xbb, 0x8a,
+ 0x73, 0xbd, 0x20, 0x8b, 0xb5, 0x3c, 0xb3, 0xac, 0x57, 0xbd, 0x4a, 0x5c, 0x68,
+ 0x3d, 0x46, 0xc5, 0x6e, 0x3b, 0x44, 0xd8, 0x22, 0xbd, 0xc8, 0x88, 0x93, 0xbd,
+ 0x71, 0x42, 0xd3, 0xbc, 0x80, 0x60, 0xf6, 0xbc, 0xe0, 0xb7, 0x04, 0x3d, 0xcb,
+ 0x28, 0xf7, 0xbd, 0xfd, 0x2e, 0x9d, 0xbd, 0xd8, 0x81, 0x5b, 0x3d, 0x90, 0x88,
+ 0x06, 0xbd, 0xb1, 0x2d, 0x8b, 0xbc, 0x74, 0x4d, 0x80, 0xbd, 0x1b, 0xce, 0x54,
+ 0x3d, 0xd3, 0xea, 0x89, 0xbd, 0x7a, 0x0a, 0xc6, 0x3c, 0x8b, 0x33, 0xa2, 0x3d,
+ 0x68, 0xe5, 0x8b, 0x3d, 0xcf, 0x19, 0x63, 0xbd, 0x50, 0x05, 0xc1, 0xbd, 0x2b,
+ 0x1f, 0xc4, 0xbc, 0x9f, 0xed, 0xaf, 0xbd, 0xc6, 0x72, 0x07, 0xbb, 0xc1, 0x58,
+ 0xa2, 0x3d, 0xf6, 0x27, 0x43, 0xbc, 0xa1, 0x5b, 0x36, 0x3d, 0x6b, 0x6b, 0x20,
+ 0x3d, 0x03, 0xb0, 0xfb, 0xbd, 0xf9, 0xf7, 0x9b, 0xbd, 0x9a, 0xbf, 0x92, 0x3d,
+ 0xa2, 0x0c, 0x5c, 0x3d, 0xd2, 0xc2, 0x73, 0xbd, 0x5c, 0xd3, 0xac, 0x3d, 0x9f,
+ 0x28, 0xa6, 0x3d, 0x23, 0xf4, 0x46, 0xbd, 0xf5, 0xfe, 0x6b, 0x3d, 0x2d, 0x03,
+ 0x56, 0x3d, 0x0c, 0x21, 0xe8, 0x3c, 0x6f, 0xdb, 0xe5, 0xbd, 0xd4, 0x8c, 0xe3,
+ 0xbd, 0xdf, 0x9d, 0x62, 0x3d, 0x38, 0xa0, 0xd1, 0xbd, 0x67, 0x9e, 0x8d, 0xbc,
+ 0xab, 0x78, 0x46, 0x3d, 0xf8, 0x88, 0x8e, 0xbc, 0x5a, 0x87, 0xd3, 0xbd, 0x40,
+ 0xba, 0xab, 0xbd, 0x45, 0xf8, 0x9a, 0x3d, 0x77, 0x60, 0x49, 0xbd, 0xa5, 0x29,
+ 0x98, 0xbc, 0xf9, 0xa7, 0x6b, 0x3d, 0xf8, 0x57, 0x1b, 0x3e, 0xf9, 0x7f, 0xcb,
+ 0x3d, 0xc8, 0x38, 0x3f, 0xbb, 0x0e, 0x77, 0xd9, 0x3d, 0xa9, 0x8f, 0xca, 0x3d,
+ 0x78, 0xbc, 0x92, 0x3d, 0xde, 0xe4, 0x31, 0xbc, 0x7f, 0x35, 0xec, 0x3d, 0x0b,
+ 0x98, 0x5c, 0x3d, 0x3a, 0x86, 0xa0, 0x3d, 0x9d, 0xb7, 0xad, 0xbd, 0x42, 0x3c,
+ 0xc2, 0xbc, 0x26, 0x4b, 0x7b, 0x3d, 0xbe, 0x8b, 0x0a, 0xb9, 0x28, 0x3e, 0xc5,
+ 0x3d, 0xef, 0xac, 0xbb, 0xbd, 0xb3, 0xcc, 0x69, 0xbd, 0xb9, 0xff, 0x07, 0x3d,
+ 0x30, 0xf6, 0x26, 0x3d, 0xa9, 0x18, 0xe6, 0x3d, 0x85, 0x72, 0xdb, 0xbd, 0xda,
+ 0x6e, 0xa1, 0x3d, 0x3b, 0x16, 0xf7, 0x3c, 0xb1, 0x3d, 0x96, 0xbd, 0xd9, 0x88,
+ 0xeb, 0x3b, 0x52, 0x76, 0x9a, 0xbd, 0xb9, 0x81, 0x1a, 0xbd, 0x81, 0x94, 0x96,
+ 0xbc, 0xd4, 0x4b, 0xe8, 0x3d, 0x0f, 0x6c, 0xe4, 0xbc, 0xc0, 0xbd, 0xab, 0x3c,
+ 0x1b, 0xdd, 0x76, 0x3c, 0x98, 0x18, 0xae, 0xbd, 0xfb, 0x1a, 0x6f, 0xbd, 0x72,
+ 0x50, 0x83, 0xbd, 0x46, 0x0b, 0x12, 0xbc, 0x64, 0x93, 0xf2, 0x3d, 0x1f, 0xad,
+ 0x71, 0x3b, 0xcf, 0x26, 0x77, 0xbd, 0x8b, 0x31, 0x2d, 0xbd, 0x0d, 0xb7, 0x54,
+ 0x3b, 0x5b, 0x00, 0xc4, 0x3d, 0x57, 0x4c, 0x58, 0x3d, 0x11, 0x4c, 0x15, 0x3d,
+ 0x1a, 0xfc, 0xa2, 0xbc, 0xf2, 0xed, 0xea, 0x3d, 0x9e, 0xad, 0xf7, 0xbd, 0x47,
+ 0x8d, 0x41, 0x3d, 0xce, 0xc5, 0x96, 0xbb, 0x2a, 0x72, 0xa0, 0xbd, 0x93, 0x27,
+ 0x9a, 0xbd, 0x3f, 0xcb, 0xef, 0xbb, 0xb5, 0xa5, 0x1e, 0x3d, 0xd6, 0x2a, 0xfd,
+ 0xbc, 0xf5, 0xe0, 0xd4, 0xbc, 0xa1, 0x7d, 0x9d, 0x3d, 0xbb, 0x60, 0x22, 0xbd,
+ 0x32, 0x15, 0x16, 0x3e, 0x80, 0x77, 0xb7, 0xbc, 0xba, 0x1c, 0xa4, 0xbd, 0x45,
+ 0xb7, 0x0b, 0xbd, 0x6a, 0x33, 0x9a, 0x3d, 0xfc, 0x27, 0xab, 0xbc, 0x10, 0xcd,
+ 0x2c, 0x3e, 0xb3, 0xf1, 0xa5, 0x3d, 0x03, 0xf7, 0xa3, 0x3c, 0x25, 0x0c, 0xe1,
+ 0x3c, 0xc4, 0x82, 0xaa, 0xbd, 0x3a, 0x4a, 0x15, 0x3c, 0x5c, 0x56, 0x9e, 0x3d,
+ 0x96, 0x52, 0xee, 0x3d, 0x67, 0xf7, 0x96, 0x3d, 0x3e, 0xb0, 0xd6, 0xbd, 0x6e,
+ 0xbd, 0x8e, 0xbd, 0x16, 0xb3, 0x85, 0x3d, 0x84, 0xca, 0x6e, 0xbd, 0x0f, 0xfc,
+ 0x40, 0x3d, 0x2d, 0xe0, 0xdc, 0x3d, 0xc1, 0xa1, 0xde, 0x39, 0x30, 0x79, 0xe7,
+ 0x3d, 0x0a, 0xab, 0xba, 0x3d, 0x35, 0x57, 0xc7, 0xbd, 0x7e, 0x38, 0xa1, 0x3d,
+ 0xe3, 0x25, 0x60, 0x3d, 0x47, 0xbd, 0x56, 0x3d, 0x62, 0xcf, 0xf6, 0x3d, 0xad,
+ 0x06, 0xd5, 0xbd, 0x41, 0xda, 0xe8, 0x3a, 0x81, 0xcb, 0xbb, 0x3d, 0xce, 0x38,
+ 0x4c, 0xbc, 0x17, 0xc0, 0x88, 0xbd, 0x12, 0x25, 0xd7, 0xbd, 0x3b, 0xf5, 0x9b,
+ 0xbd, 0x4e, 0xa0, 0xb1, 0xbc, 0xa1, 0x8c, 0x9c, 0x3d, 0xc5, 0x2f, 0xb3, 0x3d,
+ 0xe0, 0xc2, 0x08, 0x3e, 0x0b, 0xcc, 0x2f, 0x3d, 0x87, 0x3f, 0x1d, 0x3e, 0x76,
+ 0xcd, 0xc3, 0xbd, 0x4f, 0x1d, 0xd4, 0xbd, 0x65, 0x6f, 0x00, 0x3e, 0x95, 0x4f,
+ 0x9a, 0x3d, 0xa2, 0x66, 0x28, 0xbd, 0xaf, 0x81, 0x90, 0x3d, 0x16, 0x50, 0xde,
+ 0x3b, 0x65, 0xec, 0xe3, 0xbd, 0x47, 0x6c, 0x34, 0xbc, 0xae, 0xe8, 0xe5, 0xbd,
+ 0x5b, 0x7c, 0xa6, 0xbb, 0x1d, 0x4d, 0x8d, 0xbc, 0xb1, 0x7a, 0x1d, 0x3e, 0xbf,
+ 0x37, 0xe6, 0xbc, 0x7b, 0x0c, 0x70, 0x3d, 0x09, 0x57, 0xe2, 0x3d, 0x10, 0x4a,
+ 0x35, 0xbc, 0x5d, 0x58, 0xf5, 0xbc, 0xb9, 0x89, 0xa1, 0x3d, 0x6a, 0xb2, 0x68,
+ 0xbd, 0xf4, 0xf6, 0x03, 0x3e, 0xf1, 0xc6, 0x3a, 0xbd, 0xf5, 0x3b, 0xe2, 0x3d,
+ 0x3a, 0xd2, 0x4a, 0x3d, 0xe7, 0xb8, 0x9e, 0xbd, 0x18, 0xe7, 0xd9, 0x3c, 0x1d,
+ 0x95, 0x8e, 0x3d, 0xde, 0x6f, 0x9e, 0xbc, 0xae, 0x7d, 0x0f, 0x3e, 0xb0, 0xf3,
+ 0x04, 0x3d, 0xe0, 0xdc, 0x6b, 0x3d, 0x02, 0x2c, 0xee, 0xbd, 0x7c, 0xb2, 0x9f,
+ 0xbd, 0xae, 0x94, 0xc3, 0x3c, 0x82, 0xba, 0xab, 0x3d, 0x07, 0x80, 0xde, 0x3c,
+ 0x75, 0xec, 0xb3, 0xbd, 0x34, 0x42, 0x74, 0xbd, 0x44, 0xce, 0x7a, 0x3d, 0x21,
+ 0xac, 0x28, 0xbe, 0xb1, 0xbb, 0x14, 0xbd, 0xe2, 0xe1, 0xdb, 0x3c, 0x41, 0x82,
+ 0xc7, 0x3d, 0x3e, 0x0f, 0x9c, 0xbd, 0x92, 0x4e, 0x97, 0x3d, 0x69, 0x45, 0xf2,
+ 0x3d, 0xc3, 0x86, 0xc4, 0xbb, 0x57, 0x0f, 0xb1, 0x3d, 0x8c, 0xa7, 0xc6, 0x3d,
+ 0x27, 0xe2, 0xf3, 0xbc, 0xdd, 0x31, 0x44, 0xbd, 0x94, 0x2c, 0x29, 0xbc, 0xe6,
+ 0xeb, 0xd1, 0xbd, 0x74, 0xf9, 0x02, 0x3d, 0x43, 0x51, 0x92, 0xbd, 0x38, 0xb8,
+ 0x72, 0x3d, 0x73, 0xd3, 0x89, 0xbc, 0x06, 0x13, 0xdb, 0x3d, 0x75, 0xc5, 0xb2,
+ 0x3b, 0x9a, 0xe9, 0x95, 0xbc, 0xd2, 0x6a, 0x05, 0x3e, 0x65, 0xc5, 0xa3, 0x3d,
+ 0x59, 0x09, 0x72, 0xbd, 0x93, 0x0e, 0x85, 0xbc, 0x0d, 0x55, 0x6b, 0xbd, 0x55,
+ 0x64, 0x16, 0xbd, 0x50, 0x04, 0x9f, 0x3d, 0x93, 0x37, 0x14, 0xbd, 0xe9, 0x24,
+ 0x58, 0x3d, 0x04, 0x8e, 0xe9, 0xbd, 0xe4, 0x6e, 0x2b, 0xbd, 0x43, 0xbc, 0xba,
+ 0xbd, 0x80, 0xa1, 0xc3, 0xbd, 0x32, 0x81, 0xf5, 0xbd, 0x94, 0x5a, 0x10, 0x3d,
+ 0xfb, 0x5d, 0x27, 0x3c, 0xd7, 0x26, 0xc5, 0x3d, 0xf5, 0xc3, 0x4b, 0x3d, 0x32,
+ 0xca, 0xdc, 0x3d, 0xb2, 0xe8, 0x35, 0xbc, 0xb2, 0x47, 0xb9, 0xbd, 0xfa, 0x59,
+ 0x29, 0xbe, 0xab, 0x6f, 0x0a, 0x3e, 0x81, 0xa5, 0x10, 0xbd, 0x73, 0x96, 0x99,
+ 0xbd, 0x39, 0x77, 0x23, 0xbc, 0xa8, 0x50, 0xf8, 0xbd, 0x4c, 0x1d, 0xdd, 0xbd,
+ 0xf8, 0xf5, 0xb9, 0xbd, 0x65, 0x4e, 0x12, 0x3e, 0xc0, 0xa1, 0x7a, 0xbd, 0x16,
+ 0x33, 0x27, 0x3d, 0xc4, 0xc6, 0x31, 0x3b, 0x0e, 0xcd, 0x48, 0xbd, 0xd2, 0x7f,
+ 0xb4, 0xbd, 0x2c, 0x3a, 0x8b, 0x3c, 0x6f, 0x43, 0x59, 0x3d, 0x4e, 0x8a, 0x52,
+ 0x3d, 0x91, 0x68, 0xc4, 0x3d, 0xa2, 0x78, 0x16, 0xbd, 0xe5, 0x2c, 0x60, 0x3d,
+ 0x7f, 0x73, 0x8f, 0x3d, 0x9f, 0x70, 0x09, 0xbe, 0xf2, 0xf2, 0x05, 0x3c, 0x1e,
+ 0x58, 0x98, 0x3d, 0xec, 0xfc, 0x03, 0x3e, 0x88, 0xbf, 0x56, 0xbd, 0x2b, 0xc8,
+ 0x99, 0xbd, 0x9e, 0x13, 0x9a, 0xbc, 0x4f, 0x72, 0xca, 0xbd, 0x79, 0x6e, 0xef,
+ 0x3d, 0x87, 0xc3, 0x80, 0xbc, 0xe7, 0xef, 0x05, 0x3d, 0xc7, 0x99, 0x0a, 0x3d,
+ 0x17, 0x7c, 0x56, 0x3d, 0x01, 0xab, 0xd3, 0xbd, 0x48, 0x8b, 0xa2, 0xbd, 0x06,
+ 0xad, 0xcc, 0xbc, 0xf0, 0xf5, 0x6d, 0xbd, 0x6a, 0x67, 0x0c, 0xbe, 0x7e, 0x2e,
+ 0x6e, 0x3d, 0x53, 0x50, 0x29, 0xbd, 0x8c, 0x40, 0xb3, 0x3d, 0x5c, 0x9a, 0x0f,
+ 0xbd, 0xe9, 0x4e, 0x0a, 0x3e, 0x4d, 0x05, 0xac, 0x3d, 0xf9, 0x1a, 0x8e, 0x3d,
+ 0x0d, 0x69, 0xa6, 0xbd, 0x88, 0x94, 0x60, 0x3d, 0x48, 0x2a, 0x8a, 0xbb, 0x5a,
+ 0x5d, 0x39, 0x3d, 0x88, 0x56, 0xc8, 0x3c, 0xb8, 0x91, 0x93, 0x3a, 0x64, 0x69,
+ 0x8b, 0x3d, 0x4b, 0x48, 0x43, 0xbd, 0xb8, 0x91, 0xa7, 0xbd, 0x92, 0x96, 0xe5,
+ 0x3d, 0x4c, 0x62, 0xd6, 0x3d, 0xa6, 0x7a, 0x88, 0xbd, 0x6c, 0xdb, 0xc6, 0x3d,
+ 0x1c, 0x4d, 0xab, 0x3d, 0xe0, 0x1d, 0x57, 0x3c, 0x2a, 0xa3, 0x0c, 0x3d, 0xac,
+ 0xff, 0xe8, 0xbb, 0x12, 0x86, 0x89, 0xbd, 0xc6, 0x68, 0xd3, 0xbd, 0xe7, 0xb0,
+ 0xa6, 0xbc, 0x3c, 0xd2, 0xfa, 0xbb, 0xf2, 0xd6, 0xda, 0xbd, 0x80, 0x95, 0xc5,
+ 0xbd, 0x0a, 0x19, 0x93, 0xbd, 0x94, 0xc1, 0xe4, 0xbd, 0xdd, 0x20, 0x18, 0x3e,
+ 0xb3, 0x48, 0xba, 0xbd, 0xdd, 0x6b, 0x86, 0xbd, 0x3d, 0xbc, 0xb1, 0xbd, 0xbe,
+ 0xc1, 0x7f, 0xbc, 0xfc, 0x54, 0x83, 0x3d, 0xb5, 0x4e, 0x1e, 0xbd, 0x5f, 0x54,
+ 0xc3, 0x3c, 0xe4, 0x2e, 0x0a, 0x3e, 0xc9, 0x05, 0x05, 0x3d, 0xc7, 0x8d, 0x2c,
+ 0xbc, 0x37, 0x21, 0xc2, 0xbc, 0xea, 0x7e, 0x96, 0x3d, 0x64, 0x7a, 0xca, 0x3d,
+ 0xcb, 0xcf, 0xc8, 0x3b, 0x5a, 0xd4, 0x00, 0xbe, 0x5f, 0x49, 0xd0, 0x3d, 0xbe,
+ 0x56, 0x15, 0x3e, 0x3f, 0x1d, 0x9e, 0xbd, 0xd4, 0x91, 0xa9, 0x3d, 0xf1, 0xea,
+ 0x4b, 0xbb, 0x78, 0x4a, 0xa5, 0x3c, 0xc2, 0x9b, 0xac, 0xbd, 0x8c, 0xd3, 0x94,
+ 0xbd, 0xb1, 0x52, 0x94, 0xbd, 0x55, 0xdd, 0x0d, 0xbe, 0x93, 0x2e, 0xa1, 0x3d,
+ 0x31, 0x1e, 0xe0, 0x3c, 0xaf, 0xba, 0x6c, 0x3d, 0x8e, 0xec, 0x8f, 0xbd, 0x38,
+ 0x79, 0xd2, 0xbc, 0x21, 0x7e, 0x9d, 0x3d, 0xbb, 0x21, 0xeb, 0x3d, 0x6e, 0x68,
+ 0xec, 0x3d, 0xc2, 0xf4, 0xb6, 0xbd, 0x80, 0xe2, 0x91, 0xbc, 0x45, 0xa5, 0x8f,
+ 0xbb, 0xf8, 0xb2, 0xc7, 0xbd, 0xe4, 0x47, 0x3a, 0xbd, 0xa2, 0x4f, 0xe9, 0xbd,
+ 0xcc, 0x37, 0x53, 0x3c, 0x51, 0x03, 0x4f, 0x3d, 0x35, 0xa2, 0xfa, 0x3d, 0xea,
+ 0x64, 0x7b, 0xbc, 0xbf, 0x49, 0xfb, 0x3d, 0x3d, 0x8e, 0x7b, 0x3b, 0x9c, 0x4b,
+ 0x35, 0xbd, 0x62, 0xf1, 0x10, 0xbe, 0xac, 0xd2, 0xd8, 0xbd, 0x80, 0x00, 0x9d,
+ 0x3d, 0xcc, 0x19, 0xaf, 0xbc, 0x97, 0x73, 0xdb, 0x3d, 0x6d, 0xb6, 0xf3, 0x3d,
+ 0x19, 0xe7, 0x7a, 0xbd, 0xcf, 0xba, 0xc6, 0x3c, 0x77, 0xfc, 0x23, 0x3d, 0xd6,
+ 0xfe, 0x3f, 0x3d, 0x73, 0xf2, 0xdb, 0xbd, 0x3d, 0x21, 0x95, 0xbb, 0x58, 0xb8,
+ 0x86, 0xbd, 0x01, 0x3c, 0x6f, 0x3d, 0xaf, 0x2e, 0x3e, 0xbd, 0x7b, 0x6d, 0x73,
+ 0xbd, 0x33, 0xe2, 0x5f, 0xbc, 0x64, 0x5f, 0xdb, 0xbd, 0x31, 0xf5, 0xb6, 0xbd,
+ 0xfc, 0x90, 0xd4, 0xbd, 0x25, 0xd8, 0xc4, 0xbd, 0x38, 0xdf, 0xb9, 0x3d, 0x89,
+ 0x14, 0x8b, 0x3d, 0x8d, 0x05, 0x2c, 0xbd, 0x20, 0xb8, 0xa3, 0xbc, 0xaf, 0x68,
+ 0x12, 0x3d, 0xce, 0x53, 0xb0, 0xbd, 0xca, 0x8a, 0x95, 0x3d, 0x11, 0x84, 0x8a,
+ 0x3d, 0x6d, 0xbd, 0x67, 0xbb, 0xe8, 0xd5, 0x76, 0xbc, 0xac, 0xc8, 0xfb, 0xbd,
+ 0xa9, 0x8b, 0xa4, 0xbb, 0x3e, 0x3a, 0xba, 0x3d, 0xe2, 0xa5, 0x50, 0x3d, 0xf0,
+ 0x4d, 0x81, 0x3b, 0x96, 0x79, 0x31, 0xbd, 0x87, 0xaf, 0xe5, 0x3a, 0x27, 0xb7,
+ 0xa5, 0x3d, 0xd4, 0x71, 0xb5, 0xbd, 0x95, 0x06, 0xd1, 0xbd, 0x82, 0x3d, 0x1c,
+ 0xbc, 0xdc, 0xe4, 0x6e, 0x3d, 0x21, 0xcf, 0x80, 0xbc, 0xbe, 0xc7, 0xb7, 0xbc,
+ 0x21, 0x87, 0x3c, 0x3d, 0x11, 0x3a, 0x67, 0xbd, 0xa5, 0xd3, 0xe8, 0xbd, 0x9a,
+ 0xb7, 0xc2, 0x3d, 0x2e, 0xa7, 0x86, 0xbc, 0xbe, 0x03, 0x26, 0xbc, 0x5e, 0x12,
+ 0x08, 0xbe, 0x1d, 0xd9, 0xf8, 0xbd, 0xf3, 0x79, 0xe4, 0xbd, 0x38, 0xaa, 0x04,
+ 0x3e, 0x98, 0x40, 0xa7, 0x3d, 0xfa, 0xd9, 0xce, 0xbd, 0x08, 0x73, 0x16, 0xb9,
+ 0xd6, 0x47, 0x2c, 0x3d, 0x08, 0xb5, 0x8b, 0xbd, 0x04, 0x66, 0x70, 0x3c, 0x9f,
+ 0xe6, 0xe4, 0xbd, 0x7f, 0xcd, 0xa5, 0x3b, 0x5b, 0x92, 0x8b, 0xbd, 0x29, 0x55,
+ 0x19, 0xbd, 0x79, 0x98, 0x26, 0x3d, 0x32, 0x3d, 0xc3, 0xb9, 0x29, 0x8a, 0x05,
+ 0xbe, 0xe8, 0x61, 0x92, 0x3d, 0x4f, 0x64, 0xa9, 0x3d, 0x00, 0x9a, 0xa0, 0xbd,
+ 0x34, 0xcc, 0xd8, 0x3c, 0xcd, 0x8a, 0xaf, 0x3d, 0x69, 0xc6, 0x5c, 0x3c, 0xe0,
+ 0x76, 0xd3, 0x3d, 0x49, 0x6a, 0x79, 0x3b, 0x33, 0x10, 0xbd, 0x3c, 0xe9, 0x47,
+ 0x2a, 0xbd, 0x7f, 0xb4, 0x3e, 0xbb, 0x80, 0xd2, 0x18, 0xbe, 0xf3, 0x5c, 0x90,
+ 0xbd, 0x0b, 0x88, 0xaf, 0xbd, 0x24, 0x0c, 0x94, 0xbd, 0xfd, 0xa9, 0xa1, 0xbd,
+ 0x40, 0xc9, 0x82, 0xbd, 0x24, 0x56, 0xa0, 0x3c, 0xa0, 0x3e, 0x09, 0x3e, 0x30,
+ 0x93, 0xc7, 0x3d, 0x03, 0xa3, 0x0c, 0x3c, 0x88, 0xdc, 0x96, 0x3d, 0xac, 0x34,
+ 0xc7, 0xbd, 0x64, 0xb0, 0xe5, 0x3d, 0x61, 0x56, 0xc8, 0x3d, 0x08, 0x55, 0x99,
+ 0x3d, 0xb5, 0xa9, 0x56, 0xbd, 0xfb, 0x4f, 0x95, 0xbd, 0xe9, 0xeb, 0x55, 0x3d,
+ 0xbf, 0x4c, 0xdf, 0xbd, 0xbf, 0x4a, 0x12, 0xbb, 0x93, 0x9d, 0x65, 0xbd, 0x26,
+ 0xd0, 0xce, 0x3d, 0x89, 0x19, 0x64, 0xbd, 0x91, 0x3d, 0x3f, 0x3d, 0x23, 0x3a,
+ 0x3b, 0xbd, 0xc8, 0x9d, 0x20, 0xbc, 0xa1, 0x2c, 0xff, 0xbb, 0x8c, 0x39, 0xb2,
+ 0x3b, 0xf3, 0xbe, 0x86, 0x3d, 0xa3, 0xfa, 0xcc, 0xbd, 0x3d, 0x3c, 0x07, 0xbe,
+ 0xd4, 0xb4, 0xa7, 0xbd, 0x94, 0xfc, 0x71, 0x3d, 0x8b, 0xe6, 0x2e, 0x3d, 0x94,
+ 0x30, 0x41, 0xbd, 0xb3, 0x63, 0x18, 0x3d, 0xbf, 0x35, 0x3c, 0xbb, 0x4c, 0xaa,
+ 0xd9, 0xbd, 0x20, 0x83, 0xa1, 0x3d, 0xdb, 0xca, 0x49, 0x3c, 0x1d, 0xbb, 0xac,
+ 0xbb, 0x3c, 0xea, 0x1c, 0xbc, 0x5b, 0xc3, 0xd1, 0x3d, 0x15, 0xd3, 0xc9, 0xbd,
+ 0xb9, 0x30, 0x12, 0xbb, 0xe3, 0x34, 0xde, 0xbd, 0xa0, 0x31, 0xeb, 0xbd, 0xc2,
+ 0x64, 0xe2, 0x3d, 0xb2, 0xfd, 0xf4, 0xbd, 0x45, 0xa5, 0xbe, 0x3c, 0xa1, 0x40,
+ 0x56, 0xbd, 0x52, 0x01, 0xed, 0x3d, 0xd0, 0x6b, 0xfc, 0xbd, 0xef, 0x73, 0xb2,
+ 0xbd, 0x03, 0xa0, 0xcd, 0xbd, 0x24, 0x69, 0xbe, 0x3c, 0x76, 0xcd, 0x9e, 0x3d,
+ 0xbe, 0xcb, 0x3b, 0x3d, 0x55, 0x49, 0x4e, 0xbd, 0x99, 0xe9, 0xd5, 0xbc, 0x9c,
+ 0x73, 0x88, 0x3c, 0x9a, 0x64, 0x75, 0xbd, 0x53, 0x89, 0xb2, 0xbd, 0x73, 0xa4,
+ 0xb9, 0x3d, 0xa8, 0x68, 0xf3, 0xbd, 0x2a, 0xf3, 0x89, 0xbd, 0x8d, 0x63, 0x85,
+ 0x3c, 0xbb, 0x72, 0x63, 0x3d, 0x29, 0x8a, 0xe8, 0xbd, 0x87, 0x03, 0xab, 0x3d,
+ 0xbf, 0x88, 0x44, 0xbd, 0x74, 0x28, 0xae, 0xbd, 0xf7, 0xe8, 0x87, 0xbd, 0x16,
+ 0x46, 0x04, 0xbd, 0x87, 0xf6, 0xcf, 0xbd, 0x8b, 0x67, 0x44, 0xbd, 0xac, 0xd4,
+ 0xa5, 0xbd, 0xed, 0x0b, 0xf2, 0xbd, 0x20, 0x9e, 0xf5, 0xbd, 0xc1, 0xbd, 0x70,
+ 0x3d, 0xae, 0xfe, 0x77, 0x3d, 0x27, 0x07, 0x82, 0xbd, 0xbe, 0x56, 0x19, 0xbd,
+ 0xae, 0x94, 0xc9, 0xbd, 0x7a, 0x52, 0xc6, 0xbd, 0x4e, 0x64, 0x4d, 0x3c, 0xf7,
+ 0xe4, 0x18, 0x3d, 0xef, 0x06, 0xa4, 0xbd, 0x8c, 0xad, 0xa8, 0xbd, 0xab, 0xcc,
+ 0x62, 0xbc, 0x4a, 0x7c, 0x09, 0xba, 0x01, 0x0d, 0x2b, 0xbd, 0x3d, 0x77, 0xb6,
+ 0x3b, 0xd3, 0x48, 0xc8, 0x3d, 0x89, 0xcf, 0x05, 0x3e, 0xdb, 0x48, 0x92, 0x3d,
+ 0x1e, 0xa5, 0xc9, 0x3c, 0xc7, 0xad, 0x74, 0x3d, 0x66, 0x26, 0x4e, 0xbd, 0x8f,
+ 0x4c, 0x85, 0x3d, 0xe2, 0x14, 0xe3, 0x3d, 0xad, 0x90, 0x2b, 0xbd, 0xcd, 0x7c,
+ 0xf4, 0x3d, 0xe6, 0xae, 0x98, 0x3c, 0xa6, 0x86, 0x66, 0x3c, 0x18, 0x11, 0x1f,
+ 0xbc, 0xb8, 0xe5, 0xa3, 0xbc, 0xea, 0xd7, 0x47, 0xbd, 0x39, 0x8a, 0xbb, 0x3d,
+ 0x1c, 0x27, 0x4c, 0xba, 0x50, 0x9a, 0x4b, 0xbd, 0xda, 0x55, 0x5c, 0xbd, 0xa7,
+ 0xd6, 0xb4, 0x3d, 0x40, 0x3f, 0xa0, 0xbd, 0x26, 0xa7, 0xba, 0xbd, 0x4c, 0xc0,
+ 0x5c, 0x3d, 0x5c, 0xe1, 0x96, 0x3d, 0x50, 0xd9, 0x36, 0xbb, 0x8b, 0xf8, 0x7e,
+ 0xbb, 0xb4, 0x9c, 0xf0, 0x3d, 0x88, 0xf4, 0xa8, 0xbd, 0x92, 0x72, 0x0e, 0xbd,
+ 0x18, 0xc1, 0xa0, 0x3c, 0x78, 0x3f, 0xc6, 0xbd, 0xfa, 0xec, 0xe8, 0xbd, 0xa4,
+ 0xbc, 0x3d, 0xbd, 0x47, 0x9d, 0xc6, 0xbc, 0x8e, 0x10, 0x4b, 0x3d, 0x18, 0x89,
+ 0x51, 0xbd, 0x26, 0xd5, 0x9b, 0xbd, 0xb9, 0xbb, 0x0a, 0xbe, 0xa7, 0x0f, 0x8f,
+ 0x3d, 0x62, 0x63, 0x4b, 0xbb, 0xfe, 0x46, 0x56, 0xbd, 0x64, 0xcc, 0xbb, 0x3d,
+ 0x85, 0x17, 0x52, 0x3d, 0x08, 0xa8, 0x0e, 0x3d, 0x75, 0xdc, 0x4c, 0xbd, 0xf9,
+ 0xc3, 0x92, 0x3d, 0xe0, 0x13, 0x84, 0x3d, 0xa1, 0x30, 0xe8, 0xbd, 0x2d, 0x2b,
+ 0xd0, 0xbd, 0x68, 0x62, 0x91, 0xbc, 0x32, 0xd7, 0xd3, 0xbb, 0xac, 0xd6, 0xdb,
+ 0x3d, 0x0d, 0x70, 0xe9, 0xbd, 0xed, 0xea, 0x69, 0x3d, 0xa4, 0xa3, 0x99, 0x3d,
+ 0x60, 0xa0, 0xcd, 0xbd, 0xd8, 0x9b, 0x20, 0x3c, 0x29, 0x39, 0xaf, 0x3d, 0xd3,
+ 0x2d, 0x2e, 0x3d, 0x10, 0xd7, 0x60, 0x3d, 0x2b, 0x82, 0xb1, 0xbd, 0x3d, 0x6b,
+ 0x94, 0xbd, 0x73, 0xa6, 0x24, 0x3d, 0x33, 0x6b, 0xf9, 0xbd, 0x94, 0xe1, 0xac,
+ 0x3d, 0xdf, 0x2c, 0x77, 0x3d, 0x82, 0x66, 0xa0, 0x3c, 0x9d, 0x7c, 0xd1, 0xbd,
+ 0x67, 0x66, 0x39, 0x3d, 0x1b, 0xb4, 0x5e, 0x3d, 0x0a, 0x50, 0x7f, 0x3d, 0x1a,
+ 0x08, 0x6c, 0x3d, 0x6c, 0x55, 0xac, 0xbd, 0x27, 0x4d, 0x04, 0xbc, 0x28, 0x6e,
+ 0x54, 0x3c, 0x8d, 0x2e, 0x95, 0xbd, 0x56, 0x25, 0xd5, 0x3a, 0x8d, 0xf8, 0xde,
+ 0xbd, 0x53, 0xd6, 0xe0, 0x3c, 0x09, 0xfc, 0x3f, 0x3d, 0x95, 0x29, 0xbe, 0xba,
+ 0x9b, 0x98, 0xa6, 0x3d, 0xfd, 0xd1, 0xe1, 0x3d, 0x00, 0x2a, 0x04, 0xbe, 0x06,
+ 0x73, 0x8b, 0xbd, 0x1e, 0x77, 0xcd, 0x3d, 0xf3, 0x47, 0x01, 0xbe, 0x41, 0x8d,
+ 0xd2, 0xbc, 0x98, 0xba, 0x02, 0xbe, 0x14, 0x4e, 0x84, 0xbc, 0x7b, 0xee, 0xc1,
+ 0x3d, 0x5c, 0x1f, 0x5f, 0xbd, 0x66, 0x1e, 0xd4, 0xbd, 0xa7, 0x18, 0x51, 0x3d,
+ 0xaa, 0xbb, 0x7f, 0x3b, 0x9a, 0x15, 0x33, 0x3d, 0xcd, 0x6b, 0x8d, 0x3d, 0x9c,
+ 0x73, 0x6d, 0xbd, 0x76, 0x3e, 0x54, 0x3c, 0x3d, 0x4f, 0xe4, 0x3d, 0x89, 0xaf,
+ 0xf9, 0x3d, 0x0f, 0x5f, 0x8b, 0xbd, 0x5d, 0xcc, 0x9c, 0xbd, 0x8b, 0x08, 0xf1,
+ 0xbd, 0xe3, 0xc3, 0x04, 0xbd, 0x5f, 0x0b, 0xf8, 0x3d, 0x4f, 0xd8, 0xaf, 0x3d,
+ 0x2f, 0xff, 0x3e, 0x3d, 0x07, 0xf0, 0x5f, 0xbb, 0xcd, 0x6b, 0xbd, 0xbd, 0x0a,
+ 0x80, 0xee, 0x3d, 0x58, 0xa2, 0xbd, 0x3c, 0xa6, 0x43, 0xf9, 0xbc, 0x7e, 0x76,
+ 0xbb, 0x3d, 0x0b, 0x75, 0x11, 0xb9, 0x7c, 0x78, 0x46, 0x3d, 0xe9, 0xf0, 0x73,
+ 0x3d, 0x6d, 0x01, 0x50, 0xbc, 0x6f, 0x55, 0x80, 0x3d, 0x88, 0x5d, 0xd4, 0xbc,
+ 0x20, 0x61, 0x94, 0xbd, 0xbd, 0x32, 0xa3, 0x3c, 0x91, 0x29, 0xb3, 0xbd, 0x7a,
+ 0x60, 0x62, 0xbc, 0xd8, 0x67, 0x99, 0xbb, 0xea, 0xd6, 0x4a, 0xbd, 0xb2, 0xb3,
+ 0x14, 0xbd, 0x15, 0x9f, 0xf6, 0x3d, 0xc4, 0x35, 0xbe, 0xbd, 0xc6, 0x0b, 0x63,
+ 0x3d, 0x43, 0x76, 0x43, 0xbd, 0x4f, 0x5e, 0x18, 0xbc, 0x6b, 0xac, 0xb1, 0x3d,
+ 0x4e, 0xca, 0xd8, 0xbd, 0x2f, 0xef, 0xc3, 0x3d, 0x96, 0xc3, 0x48, 0x3c, 0x1c,
+ 0x73, 0x17, 0x3d, 0x56, 0x34, 0xfb, 0x3c, 0x25, 0xa7, 0xb2, 0x3d, 0x29, 0x5e,
+ 0xac, 0x3d, 0xdd, 0x3b, 0x80, 0x3d, 0x5a, 0xec, 0x37, 0x3c, 0xdc, 0xf9, 0x92,
+ 0x3b, 0x66, 0x0b, 0xc6, 0xbd, 0x75, 0x09, 0xfc, 0xbc, 0x55, 0xd9, 0xea, 0xbd,
+ 0x01, 0xed, 0x7a, 0x3c, 0x90, 0x7d, 0x5e, 0xbd, 0xb8, 0x38, 0xc9, 0x3d, 0xb8,
+ 0x23, 0xa6, 0x3d, 0xb8, 0x83, 0x01, 0x3e, 0xe8, 0x22, 0xda, 0x3c, 0x66, 0xf5,
+ 0x92, 0x3d, 0x82, 0xe0, 0x87, 0x3c, 0x6f, 0xa1, 0x6e, 0x3d, 0x27, 0xca, 0xaf,
+ 0x3c, 0x7f, 0x68, 0xd6, 0xbd, 0x38, 0x98, 0x93, 0x3d, 0x4d, 0xdc, 0x5e, 0x3d,
+ 0xc8, 0xb8, 0xb2, 0x3d, 0xab, 0xeb, 0x8a, 0xbb, 0x39, 0x48, 0xbb, 0xbd, 0x17,
+ 0xe6, 0x0f, 0x3d, 0x57, 0x79, 0xea, 0xbc, 0xb2, 0x5e, 0xdb, 0x3d, 0x0c, 0x19,
+ 0xc7, 0xbd, 0xeb, 0x33, 0x2b, 0x3d, 0x4b, 0x15, 0xf6, 0x3d, 0x96, 0x9b, 0xa1,
+ 0xbc, 0x5c, 0xc8, 0x03, 0xbd, 0x88, 0x56, 0x21, 0x3e, 0x85, 0x0c, 0xa5, 0x3c,
+ 0x85, 0xcb, 0xf4, 0xbd, 0x61, 0x03, 0x4d, 0x3c, 0xf1, 0xf4, 0x8c, 0xbd, 0x7b,
+ 0x39, 0x34, 0x3b, 0xf4, 0xa2, 0x47, 0xbc, 0x10, 0x2d, 0xfc, 0xbd, 0xe8, 0xdd,
+ 0xe6, 0x3c, 0xa5, 0x7c, 0x85, 0x3c, 0x3f, 0xcd, 0xeb, 0xbc, 0x42, 0x94, 0xba,
+ 0xbd, 0x50, 0x23, 0xe3, 0xbd, 0x92, 0xf6, 0xa7, 0xbd, 0x5c, 0x36, 0xd0, 0xbd,
+ 0x27, 0x9e, 0x18, 0x3e, 0x33, 0x9a, 0xe8, 0xbc, 0x80, 0x3a, 0x5d, 0x3d, 0xd0,
+ 0xdc, 0x9c, 0xbd, 0xa3, 0x93, 0x51, 0xbd, 0x36, 0xab, 0x7a, 0x3d, 0x74, 0x9c,
+ 0x63, 0x3d, 0x1c, 0x19, 0x9b, 0xbd, 0xa6, 0x10, 0xb4, 0xbd, 0xf4, 0x80, 0xb4,
+ 0xbc, 0xd3, 0x9c, 0xd2, 0xbc, 0x6d, 0x1b, 0x68, 0xbd, 0x31, 0x6a, 0xfd, 0xbd,
+ 0xdc, 0xa4, 0x82, 0xbd, 0xa7, 0xe7, 0x37, 0xbd, 0x5c, 0xd1, 0x07, 0xbd, 0x4e,
+ 0x82, 0x15, 0xbc, 0x31, 0x43, 0x16, 0x3e, 0xe2, 0xf3, 0x1e, 0x3e, 0x62, 0x22,
+ 0x14, 0x3e, 0x27, 0x65, 0x0d, 0x39, 0xaa, 0x9e, 0x8f, 0x3d, 0xdd, 0x59, 0x4c,
+ 0x3c, 0x4a, 0xc5, 0xc5, 0xbd, 0x4a, 0xa5, 0xc7, 0x3b, 0xb9, 0x73, 0xcc, 0x3d,
+ 0x10, 0x62, 0x5c, 0x3c, 0x87, 0xd8, 0xb2, 0xbd, 0x15, 0x50, 0xf8, 0x3d, 0xd7,
+ 0x7f, 0x91, 0xbd, 0xf4, 0x07, 0xfb, 0x3c, 0x93, 0x09, 0xae, 0xbc, 0x54, 0x19,
+ 0x76, 0x3a, 0x42, 0x4f, 0xbe, 0xbc, 0x6a, 0xef, 0xee, 0x3d, 0x98, 0x97, 0xb7,
+ 0x3d, 0x33, 0x07, 0x3c, 0xbd, 0xe0, 0xc2, 0x46, 0x3c, 0x33, 0x5f, 0x80, 0x3c,
+ 0x4d, 0x5e, 0xff, 0xbc, 0x4e, 0x02, 0xe8, 0xbc, 0x1f, 0x5b, 0xcd, 0xbc, 0x2d,
+ 0x41, 0x8a, 0x3d, 0x2d, 0xeb, 0x5e, 0xbd, 0xff, 0x53, 0xb0, 0x3d, 0x7c, 0x37,
+ 0xb0, 0x3c, 0x0b, 0xc9, 0x87, 0xbd, 0x32, 0xd1, 0xe6, 0xbb, 0xc0, 0x2f, 0xcf,
+ 0x3d, 0x42, 0x5e, 0xb5, 0x3d, 0xd4, 0xbf, 0x36, 0xbd, 0x26, 0xd8, 0xf1, 0xbd,
+ 0xf3, 0x8b, 0xc2, 0x3d, 0x1d, 0xd9, 0xe7, 0xbb, 0xab, 0xf9, 0x16, 0x3d, 0x13,
+ 0x82, 0x93, 0x3d, 0x5e, 0xab, 0xbc, 0xbd, 0x57, 0xf5, 0x2f, 0x3c, 0x86, 0x19,
+ 0x96, 0x3c, 0x17, 0xb1, 0x3e, 0x3d, 0xcd, 0xfd, 0x72, 0xbd, 0xae, 0x8d, 0xbf,
+ 0x3c, 0x5e, 0x94, 0x5c, 0x3d, 0x16, 0x67, 0x88, 0x3d, 0xf1, 0xcb, 0x43, 0xbd,
+ 0xc5, 0x5e, 0x6b, 0xbd, 0xa0, 0xc2, 0xdb, 0x3d, 0x94, 0x36, 0x11, 0xbd, 0x26,
+ 0xb6, 0xb2, 0xbd, 0xe6, 0x9d, 0x93, 0xbd, 0x66, 0x04, 0x5e, 0xbd, 0xed, 0xfe,
+ 0xaf, 0xbb, 0xbc, 0x70, 0x50, 0x3d, 0x0a, 0xeb, 0xd0, 0xbd, 0x3d, 0x06, 0xb5,
+ 0x3d, 0xa7, 0x77, 0x31, 0xbd, 0x5f, 0x4b, 0xa6, 0xbd, 0x9b, 0x0f, 0x96, 0xbc,
+ 0x7e, 0x02, 0xd4, 0xbc, 0x39, 0x52, 0xc4, 0xbd, 0xc3, 0x4e, 0x09, 0x3e, 0x5c,
+ 0xc9, 0x48, 0x3d, 0xa4, 0x28, 0x36, 0xbd, 0xe3, 0xa7, 0x31, 0x3b, 0xdd, 0x29,
+ 0xf4, 0x3d, 0x30, 0x52, 0x76, 0x3d, 0x10, 0xa8, 0x27, 0x3c, 0x0c, 0x16, 0x56,
+ 0x3d, 0x84, 0xd6, 0x1a, 0xbd, 0x34, 0xea, 0xaa, 0x3c, 0x8b, 0xaa, 0x50, 0xbc,
+ 0x02, 0x56, 0xc2, 0x3c, 0xee, 0x61, 0xe8, 0xbd, 0xf2, 0xaa, 0xb0, 0x3d, 0x22,
+ 0xd5, 0x23, 0x3e, 0x2d, 0x7d, 0x62, 0xbd, 0x8a, 0x95, 0x6d, 0xbc, 0x6a, 0xaf,
+ 0xb4, 0xbb, 0x34, 0x65, 0xad, 0x3d, 0x14, 0xff, 0xda, 0xbd, 0x43, 0xdc, 0x04,
+ 0xbd, 0x26, 0xed, 0xa8, 0xbd, 0x97, 0xc7, 0xc3, 0x3d, 0x76, 0x2d, 0xd3, 0xbc,
+ 0xe1, 0xc3, 0xbd, 0xbd, 0x75, 0x52, 0xca, 0x3c, 0x84, 0xfa, 0x13, 0x3c, 0x2e,
+ 0xea, 0x00, 0xbd, 0xb9, 0xbc, 0xcf, 0x3d, 0xcb, 0x67, 0x65, 0xbd, 0xda, 0x95,
+ 0xac, 0xbd, 0x51, 0x71, 0xed, 0x3c, 0xaf, 0xe1, 0x2c, 0xbd, 0xbf, 0x09, 0x2c,
+ 0xba, 0xd1, 0xdc, 0xab, 0xbd, 0x60, 0xab, 0x71, 0xbc, 0x10, 0xa2, 0x2b, 0xbd,
+ 0xb7, 0xba, 0x8f, 0xbd, 0x5e, 0x4b, 0x18, 0x3d, 0x4f, 0x72, 0xa6, 0xbc, 0xbb,
+ 0x54, 0xc5, 0x3d, 0x2a, 0x54, 0xeb, 0xbd, 0x5b, 0x2e, 0x67, 0xbd, 0xc0, 0xd2,
+ 0x61, 0x3b, 0x30, 0x8d, 0x34, 0x3d, 0xaa, 0x2e, 0xfe, 0xbc, 0x37, 0xa2, 0x7b,
+ 0xbd, 0xb0, 0x0d, 0x7c, 0xbd, 0x05, 0x3f, 0x39, 0x3d, 0x52, 0xfc, 0xb2, 0x3d,
+ 0xe8, 0x4a, 0xe6, 0xbd, 0x49, 0x3f, 0xd0, 0x3c, 0x1d, 0x43, 0x1a, 0xbd, 0x52,
+ 0xcc, 0xc7, 0x3d, 0x6a, 0x3f, 0x72, 0x3b, 0x47, 0x6e, 0xdb, 0xbd, 0x6b, 0x97,
+ 0xc2, 0xbd, 0xa0, 0x78, 0xe5, 0xbc, 0x01, 0xb0, 0xd8, 0xbc, 0xd0, 0x9f, 0x9f,
+ 0xbc, 0x51, 0x99, 0x79, 0x3d, 0xf1, 0xd4, 0x1d, 0x3b, 0xe6, 0x19, 0x78, 0x3c,
+ 0xb0, 0x8a, 0x8e, 0xbd, 0x90, 0xfc, 0xc9, 0x3d, 0x91, 0xe7, 0x85, 0x3d, 0xdd,
+ 0xe2, 0x09, 0x3d, 0xb6, 0xf7, 0x5a, 0xbd, 0x26, 0xe8, 0xdc, 0xbd, 0x42, 0xca,
+ 0x18, 0xbd, 0x2a, 0x1d, 0xb4, 0xbd, 0x83, 0x0b, 0xf1, 0x3a, 0xbd, 0x7b, 0x15,
+ 0x3c, 0xf1, 0x7b, 0xa6, 0xbd, 0x55, 0xe4, 0x4d, 0xbd, 0xed, 0x07, 0xf8, 0xbc,
+ 0xf3, 0x73, 0xa0, 0x3d, 0x75, 0x8a, 0xc5, 0xbd, 0x44, 0x2f, 0x7f, 0x3d, 0x35,
+ 0x6c, 0x87, 0x3c, 0x61, 0x2c, 0x4b, 0xbc, 0x67, 0xde, 0x7d, 0xbd, 0x17, 0xaf,
+ 0xe9, 0x3c, 0xaa, 0xd5, 0x0c, 0x3d, 0x98, 0xf5, 0xd8, 0xbc, 0x86, 0xa5, 0x2c,
+ 0xbb, 0xad, 0x8e, 0x43, 0x3d, 0xd2, 0x59, 0xbd, 0xbd, 0x94, 0xc9, 0x69, 0xbd,
+ 0x15, 0xa0, 0x81, 0x3d, 0x18, 0x49, 0x1e, 0x3d, 0xe7, 0xd7, 0xb5, 0xbd, 0x1f,
+ 0x20, 0x10, 0xbd, 0xb0, 0x8b, 0xe0, 0xbd, 0xe0, 0x7c, 0x46, 0x3d, 0x1f, 0xc6,
+ 0x5c, 0xbd, 0xbc, 0xc1, 0x1b, 0x3d, 0xc1, 0x1c, 0xc5, 0xbd, 0xf3, 0x52, 0x48,
+ 0xbb, 0x39, 0x79, 0x86, 0x3d, 0x72, 0xbd, 0x36, 0x3c, 0xa5, 0xd7, 0x95, 0xbd,
+ 0x73, 0xe0, 0x13, 0x3c, 0xe4, 0x9a, 0x50, 0xbd, 0x90, 0x58, 0x93, 0xbd, 0x3d,
+ 0x9e, 0xac, 0x3d, 0x57, 0x08, 0xbb, 0x3d, 0x4e, 0xaf, 0x84, 0xbd, 0xdc, 0x16,
+ 0xbc, 0xbd, 0x51, 0x1a, 0xbf, 0x3d, 0x62, 0x61, 0x97, 0x3d, 0x7a, 0xeb, 0x45,
+ 0x3d, 0xa1, 0x27, 0xe7, 0x3d, 0x20, 0xcb, 0x45, 0xbd, 0xc3, 0x36, 0xda, 0x3d,
+ 0xa2, 0x88, 0x48, 0x3d, 0x7c, 0x0d, 0x0d, 0x3b, 0x00, 0xa8, 0xaf, 0xbd, 0xda,
+ 0x09, 0x51, 0xbd, 0xbd, 0xb3, 0x99, 0xbc, 0x6e, 0x40, 0x6a, 0xbd, 0x31, 0xdb,
+ 0x71, 0x3c, 0x14, 0x0e, 0x0b, 0xbd, 0xe8, 0x4f, 0xae, 0xbd, 0xbb, 0xf3, 0xd4,
+ 0x3d, 0xad, 0xdb, 0x8d, 0x3c, 0x72, 0x12, 0x66, 0xbd, 0x1f, 0xea, 0x98, 0xbd,
+ 0xf7, 0xd0, 0x68, 0x3d, 0x47, 0x27, 0x13, 0x3d, 0xe9, 0x9d, 0xa2, 0xbd, 0x01,
+ 0x07, 0xa9, 0x3d, 0x81, 0xa9, 0xa2, 0x3c, 0x54, 0x75, 0xb5, 0xbc, 0xbc, 0x9f,
+ 0x8e, 0x3c, 0xdd, 0x55, 0x8c, 0x3c, 0xf6, 0x8f, 0xdc, 0x3d, 0x63, 0x45, 0xe7,
+ 0x3c, 0xc2, 0x06, 0x48, 0x3c, 0x63, 0x7a, 0xe9, 0xbd, 0xb0, 0x14, 0x3f, 0x3d,
+ 0x1b, 0x99, 0xe4, 0xbd, 0x0d, 0xa5, 0x89, 0x3d, 0x5d, 0x1e, 0xc4, 0xbd, 0x9b,
+ 0x12, 0x8e, 0x3d, 0x47, 0xa7, 0xb6, 0xbc, 0xc7, 0x3f, 0xf3, 0xbd, 0x82, 0x32,
+ 0x8f, 0xbd, 0xed, 0x11, 0xbe, 0x3d, 0xe4, 0x1e, 0xc6, 0xbc, 0x9d, 0x73, 0xee,
+ 0xbd, 0xce, 0x18, 0xe3, 0xbd, 0x3f, 0x2c, 0x90, 0xbd, 0xc6, 0x82, 0xad, 0x3d,
+ 0xa4, 0x9e, 0xf1, 0xbd, 0x6e, 0x4f, 0xe7, 0x3d, 0x63, 0x8b, 0x28, 0xbd, 0x0a,
+ 0x66, 0x80, 0xbd, 0xa0, 0xa5, 0x84, 0xbd, 0xb0, 0xce, 0xbb, 0xbd, 0x72, 0xba,
+ 0xa1, 0xbd, 0x42, 0x55, 0xa6, 0xbd, 0x36, 0x00, 0xce, 0x3d, 0x11, 0x44, 0xbc,
+ 0x3b, 0xb4, 0x63, 0xa9, 0x3d, 0x07, 0x61, 0x9b, 0x3d, 0x50, 0xb7, 0xb3, 0xbd,
+ 0xe1, 0xcc, 0x74, 0xbd, 0xa1, 0x8e, 0x6c, 0x3d, 0xa6, 0x54, 0xb6, 0xbd, 0xce,
+ 0xde, 0xb4, 0x3c, 0x29, 0xd3, 0x31, 0xbc, 0x74, 0x1c, 0x78, 0xbd, 0xa7, 0xa4,
+ 0x25, 0xbb, 0x01, 0xe0, 0x85, 0x3d, 0x67, 0xc7, 0xbd, 0xbc, 0xae, 0xdb, 0x3a,
+ 0xbd, 0xaa, 0x9c, 0xdd, 0xbd, 0x7a, 0x65, 0xaa, 0xbc, 0x11, 0x1d, 0x53, 0xbd,
+ 0xc0, 0xf8, 0x3a, 0xbd, 0x50, 0xd4, 0x84, 0xbc, 0x3b, 0x49, 0x7f, 0xbd, 0x44,
+ 0x79, 0xde, 0x3d, 0xb9, 0x83, 0xfb, 0x3d, 0x12, 0x34, 0x8d, 0xbd, 0x0a, 0x31,
+ 0xf0, 0x3c, 0x16, 0x71, 0x4e, 0xbd, 0xc4, 0x6a, 0x5f, 0x3d, 0x5a, 0xbe, 0x7e,
+ 0x3d, 0xca, 0x56, 0xe7, 0xbc, 0xe7, 0xa1, 0xb8, 0xbd, 0xf7, 0xac, 0x17, 0x3d,
+ 0xf1, 0x7c, 0x83, 0xbd, 0xe4, 0x5f, 0xec, 0xbd, 0x18, 0x92, 0xa9, 0xbb, 0x71,
+ 0x9a, 0x3d, 0xbd, 0xd1, 0x18, 0x20, 0xbd, 0x94, 0xfa, 0xbd, 0x3d, 0x2f, 0x1f,
+ 0x85, 0xbd, 0xc1, 0xc3, 0xa3, 0x3d, 0x36, 0xdb, 0x96, 0x3d, 0xa5, 0xae, 0x4e,
+ 0xbc, 0xaa, 0x11, 0x9c, 0xbd, 0x44, 0xa2, 0x95, 0x3d, 0xe7, 0x39, 0x73, 0x3b,
+ 0x1d, 0x57, 0x86, 0xbd, 0x14, 0x17, 0xa7, 0xbd, 0xaf, 0xc3, 0x09, 0xbd, 0x2f,
+ 0x90, 0x20, 0xbd, 0x08, 0x91, 0x9c, 0x3c, 0x88, 0x0c, 0xd1, 0x3d, 0x56, 0x99,
+ 0x9d, 0xbd, 0xb3, 0x75, 0xb2, 0x3d, 0xa1, 0x04, 0x59, 0xbb, 0x44, 0x0a, 0x6f,
+ 0x3b, 0x5a, 0x42, 0xce, 0xbd, 0x1b, 0x3b, 0x91, 0x3d, 0x14, 0xb8, 0xdf, 0xbd,
+ 0x85, 0x51, 0x8c, 0xbc, 0xa7, 0xd5, 0x5f, 0x3d, 0xe7, 0x88, 0x61, 0xbd, 0x97,
+ 0x11, 0xd9, 0x39, 0x5c, 0x0b, 0x6d, 0xbd, 0xe4, 0xe3, 0xb1, 0xbd, 0xeb, 0xfe,
+ 0xeb, 0xbd, 0xd3, 0x37, 0x66, 0x3c, 0x4b, 0x72, 0x49, 0xbd, 0x12, 0x06, 0xbf,
+ 0x3b, 0x12, 0x40, 0x77, 0x3d, 0x7c, 0x9d, 0x92, 0x3d, 0xb2, 0xcd, 0xad, 0x3d,
+ 0xb2, 0xe3, 0x65, 0x3d, 0x91, 0x55, 0xbd, 0x3c, 0x31, 0x00, 0xc0, 0xbd, 0xc9,
+ 0x3b, 0x46, 0x3d, 0x51, 0xd9, 0xa6, 0x3d, 0xb9, 0xcb, 0xaf, 0xbd, 0xf8, 0x85,
+ 0xd4, 0xbd, 0x47, 0x6f, 0xf2, 0xbd, 0x70, 0xd4, 0x13, 0x3d, 0x2c, 0x38, 0x55,
+ 0x3d, 0x61, 0x11, 0xd7, 0x3d, 0x62, 0x90, 0xed, 0xbc, 0xd0, 0x71, 0x79, 0xbd,
+ 0xc5, 0xc9, 0x87, 0xbd, 0x6d, 0x23, 0x96, 0xbc, 0xc1, 0x06, 0x9b, 0xbd, 0xc8,
+ 0x2d, 0xfc, 0xbc, 0x79, 0x8d, 0xb8, 0xbd, 0xb3, 0x32, 0xca, 0xbc, 0x17, 0x71,
+ 0xd3, 0xbd, 0x51, 0x07, 0xc6, 0xbc, 0x59, 0x04, 0x49, 0x3d, 0x15, 0x14, 0x8a,
+ 0xbd, 0xd0, 0xae, 0xa4, 0xbd, 0x4c, 0x5f, 0xdd, 0x3d, 0xb5, 0x52, 0xbc, 0x3b,
+ 0x4d, 0xca, 0x3f, 0xbd, 0x85, 0x21, 0xb0, 0xbd, 0x9e, 0x8b, 0xc3, 0xbd, 0x51,
+ 0xd9, 0xa8, 0x3d, 0x53, 0x49, 0xd1, 0x3c, 0x35, 0x6f, 0xe3, 0xbd, 0x7f, 0xe2,
+ 0x9e, 0xbd, 0x42, 0xd8, 0x14, 0xbd, 0x00, 0x6f, 0x19, 0x3d, 0xe1, 0x4e, 0x53,
+ 0x3d, 0xda, 0xc8, 0x66, 0xbd, 0xf1, 0x51, 0xea, 0xbd, 0x8a, 0x7f, 0xbb, 0x3d,
+ 0xa6, 0x85, 0x10, 0xbd, 0x4e, 0xcc, 0xd7, 0x3d, 0x8b, 0x94, 0xad, 0xbd, 0xaa,
+ 0x92, 0x92, 0xbc, 0xdb, 0xcd, 0x3a, 0x3d, 0x43, 0x71, 0x99, 0x3d, 0xa0, 0xeb,
+ 0xe1, 0x3d, 0xbe, 0x5e, 0xe3, 0x3c, 0x43, 0x28, 0x98, 0xbd, 0x04, 0x2b, 0x96,
+ 0xbd, 0xc6, 0x1a, 0x21, 0xbb, 0xce, 0xba, 0xd3, 0xbd, 0x57, 0xee, 0x04, 0x3d,
+ 0x87, 0xf6, 0x8a, 0xbb, 0xda, 0x72, 0x99, 0x3d, 0xcb, 0x2f, 0x8a, 0x3d, 0x1f,
+ 0x20, 0xb5, 0xbd, 0xbe, 0x1f, 0x1e, 0xbd, 0x17, 0x5e, 0x84, 0xbd, 0xfd, 0xce,
+ 0xb2, 0xbd, 0xfc, 0xcc, 0x74, 0x3d, 0x66, 0x53, 0xca, 0x3c, 0x35, 0x5e, 0x9e,
+ 0x3d, 0x6c, 0x9b, 0xb4, 0x3d, 0x08, 0xbd, 0x90, 0x3d, 0x45, 0xc0, 0xc1, 0xbd,
+ 0x83, 0x2c, 0xd3, 0xbc, 0x85, 0xa9, 0x81, 0xbc, 0xa4, 0x47, 0xbc, 0x3d, 0xc2,
+ 0xc6, 0x91, 0xbb, 0x45, 0xf7, 0x51, 0x3d, 0x7c, 0x74, 0x32, 0x3d, 0x64, 0x6d,
+ 0x67, 0xbd, 0xaf, 0x34, 0x37, 0x3d, 0xea, 0xb0, 0x95, 0xbd, 0xe6, 0x42, 0x22,
+ 0x3d, 0xe4, 0x2b, 0xf9, 0xbd, 0x27, 0x85, 0x8c, 0xbc, 0x57, 0x16, 0xd4, 0x3d,
+ 0x0d, 0x41, 0xb9, 0xbc, 0xde, 0xf7, 0xb3, 0xbc, 0xb1, 0x86, 0x5a, 0x3d, 0x16,
+ 0x06, 0x99, 0x3d, 0x36, 0x5c, 0xf2, 0x3d, 0x96, 0x49, 0xfc, 0xbd, 0xd0, 0xda,
+ 0x0b, 0xbd, 0x74, 0x35, 0xfd, 0x3d, 0x3c, 0x9d, 0x12, 0xbd, 0x88, 0xae, 0xc0,
+ 0xbd, 0xd6, 0xe7, 0x5e, 0x3d, 0x31, 0x3f, 0xba, 0xbd, 0x0a, 0x05, 0xb9, 0xbd,
+ 0x8d, 0xe3, 0x35, 0xbd, 0x83, 0xd0, 0x26, 0xbd, 0x04, 0xba, 0x97, 0xbc, 0x46,
+ 0x99, 0xbf, 0xbd, 0xa1, 0x44, 0x75, 0x3b, 0xb8, 0x9b, 0x07, 0x3e, 0x32, 0xe6,
+ 0xd5, 0xbd, 0xc0, 0x9f, 0xf3, 0x3d, 0x7f, 0x4f, 0x36, 0xbc, 0x42, 0xda, 0xe3,
+ 0x3d, 0x3b, 0xb2, 0x5c, 0x3c, 0x97, 0x30, 0xd7, 0x3d, 0x51, 0xe8, 0xea, 0xbc,
+ 0x6e, 0x73, 0x4d, 0x3d, 0x2f, 0x77, 0xb5, 0x3b, 0x0b, 0x79, 0xc1, 0x3c, 0x2f,
+ 0xd9, 0x8c, 0xbd, 0x0e, 0x78, 0xbf, 0xbd, 0x3c, 0xec, 0x84, 0x3d, 0x59, 0xa9,
+ 0xaa, 0xbd, 0x35, 0xdc, 0xe4, 0xbd, 0x91, 0xcf, 0x2e, 0x3d, 0x3c, 0x17, 0x0d,
+ 0xbc, 0x10, 0xd0, 0xf9, 0x3d, 0xab, 0xca, 0xf9, 0xbd, 0x4b, 0xd7, 0x9b, 0x3d,
+ 0xd0, 0x10, 0xc9, 0xbd, 0x11, 0x82, 0x05, 0x3e, 0xd0, 0x14, 0x21, 0xbd, 0x6d,
+ 0x61, 0x99, 0xbd, 0xae, 0x85, 0x7a, 0xbd, 0x67, 0xc0, 0x86, 0xbb, 0x1e, 0xd0,
+ 0xbf, 0x3d, 0x92, 0x46, 0xf8, 0xbc, 0x0d, 0xad, 0xa1, 0x3c, 0xea, 0x8d, 0xd0,
+ 0x3c, 0x61, 0x10, 0x49, 0x3c, 0x8a, 0x7e, 0xe9, 0xbc, 0x31, 0x95, 0xdf, 0xb9,
+ 0xb5, 0x03, 0x0d, 0x3d, 0x0b, 0xf5, 0xd9, 0xbb, 0xba, 0x95, 0x8f, 0xbd, 0x7c,
+ 0x81, 0xde, 0xbd, 0xfc, 0x64, 0xcb, 0x3d, 0x0e, 0x80, 0x2c, 0x3d, 0x64, 0xa8,
+ 0x0b, 0x3d, 0x58, 0xd7, 0xcc, 0xbc, 0x06, 0x10, 0x81, 0x3d, 0xd6, 0x24, 0x2f,
+ 0xbe, 0x2f, 0x77, 0x4e, 0xbd, 0x53, 0x72, 0x1a, 0xbd, 0xc1, 0x05, 0x6e, 0x3d,
+ 0x0b, 0x99, 0x8e, 0xbd, 0x30, 0x10, 0x04, 0xbd, 0xc3, 0x1c, 0x00, 0xbd, 0xf1,
+ 0x16, 0xba, 0xbd, 0x00, 0x43, 0x03, 0xbc, 0xb8, 0x2d, 0xf4, 0x3c, 0x18, 0x18,
+ 0x4d, 0x3d, 0x70, 0x7c, 0x99, 0xb9, 0x49, 0xef, 0xd2, 0xbc, 0x8a, 0xa4, 0x11,
+ 0x3d, 0xe4, 0x8b, 0x5b, 0xbc, 0x16, 0xc1, 0x8c, 0xb9, 0x71, 0xa4, 0x37, 0x3d,
+ 0xb2, 0xa4, 0xb0, 0x3c, 0x79, 0x6c, 0x8a, 0x3d, 0xb6, 0x86, 0x96, 0x3c, 0x06,
+ 0xd1, 0x58, 0xbd, 0xae, 0x40, 0x92, 0xbc, 0x4c, 0x63, 0xa7, 0x3d, 0xac, 0x67,
+ 0xb4, 0xbd, 0x5b, 0xda, 0x17, 0xbd, 0xeb, 0xfc, 0x09, 0x3d, 0x44, 0x95, 0x68,
+ 0x3c, 0x03, 0xee, 0xd7, 0x3d, 0x57, 0x9f, 0xc2, 0x3d, 0x9c, 0xa6, 0xe7, 0x3b,
+ 0xff, 0x8e, 0xcd, 0xbc, 0x22, 0x41, 0xf7, 0x3c, 0x19, 0xe0, 0x1d, 0xbd, 0xae,
+ 0xcc, 0xe2, 0x3b, 0x70, 0xb1, 0x9f, 0x3d, 0xd8, 0x1d, 0xb7, 0x3d, 0xa1, 0xde,
+ 0x4d, 0x3c, 0x12, 0xb6, 0x08, 0x3e, 0x1d, 0x9c, 0xbf, 0x3d, 0xd8, 0x48, 0x4a,
+ 0xbb, 0x07, 0xd1, 0x5e, 0xbd, 0xd3, 0x82, 0xb1, 0x3d, 0x82, 0xef, 0x8d, 0x3d,
+ 0x40, 0x79, 0xe5, 0xbc, 0x3f, 0x85, 0x8b, 0x3d, 0x6a, 0xa3, 0xa7, 0xbd, 0xed,
+ 0xd4, 0xaf, 0xbd, 0x15, 0xf2, 0x96, 0xbd, 0x16, 0x8b, 0xf2, 0xbc, 0xdc, 0x5f,
+ 0xc8, 0xbd, 0xef, 0x46, 0xb3, 0xbd, 0x41, 0x7a, 0x8c, 0xbd, 0x24, 0xfe, 0x62,
+ 0xbd, 0xdf, 0xab, 0x89, 0xbb, 0xa9, 0x9c, 0xd6, 0x3d, 0xf5, 0xc0, 0x2c, 0x3d,
+ 0x20, 0x81, 0xef, 0x3d, 0x1d, 0x1f, 0xd8, 0x3d, 0xe3, 0xea, 0xb7, 0xbc, 0xe5,
+ 0x98, 0xb7, 0x3d, 0x97, 0x67, 0x48, 0x3d, 0x42, 0x5e, 0x10, 0xbe, 0x52, 0xdd,
+ 0xb2, 0xbd, 0x79, 0x0f, 0x60, 0x3d, 0x7e, 0xc5, 0x1c, 0x3d, 0x9b, 0x47, 0x8a,
+ 0xbd, 0xfe, 0x5a, 0x90, 0xba, 0xb3, 0x60, 0x7e, 0xbd, 0x59, 0x16, 0x7e, 0xbd,
+ 0xb6, 0xb7, 0x01, 0x3d, 0x0d, 0x3c, 0xed, 0xbc, 0x0d, 0x44, 0x3c, 0xbb, 0x77,
+ 0x3f, 0xf6, 0xbc, 0x74, 0x91, 0xb9, 0x3d, 0x15, 0xa6, 0x38, 0xbd, 0x6f, 0xa1,
+ 0x39, 0x3d, 0xc8, 0x2e, 0xd8, 0x3d, 0x70, 0xf9, 0x7c, 0xbc, 0x17, 0x9c, 0xa5,
+ 0x3a, 0xfd, 0x15, 0x0a, 0x3d, 0x55, 0x8c, 0xa7, 0x3d, 0xff, 0x06, 0x22, 0xbd,
+ 0x2d, 0x31, 0x15, 0xbe, 0x70, 0x92, 0x92, 0xbd, 0x29, 0x8a, 0x0d, 0x3b, 0x6b,
+ 0xca, 0x3d, 0xbd, 0xf2, 0xe1, 0x28, 0xbc, 0x36, 0x7a, 0x44, 0xbc, 0xea, 0x62,
+ 0xd9, 0x3a, 0xd2, 0xdd, 0x9e, 0xbc, 0xda, 0xce, 0x16, 0xbe, 0x79, 0x5e, 0x97,
+ 0x3b, 0x26, 0x34, 0x38, 0xbd, 0x77, 0x5d, 0x97, 0x3c, 0xc6, 0xcb, 0x84, 0xbd,
+ 0xed, 0xa4, 0xda, 0x3d, 0xd2, 0x4f, 0x6d, 0xbc, 0x35, 0x16, 0xdc, 0xbd, 0xea,
+ 0xfb, 0x08, 0xbe, 0x84, 0xea, 0x1e, 0xbd, 0x0e, 0x3a, 0x60, 0xb8, 0x4f, 0x4b,
+ 0x0a, 0xbe, 0xfe, 0x33, 0x87, 0x3d, 0x63, 0x5e, 0x8d, 0x3d, 0x68, 0x29, 0x17,
+ 0x3e, 0xa5, 0x25, 0x8f, 0xbc, 0x0a, 0x09, 0x78, 0xbd, 0x43, 0x98, 0x6d, 0xbd,
+ 0x98, 0xa8, 0xa0, 0xbd, 0x7c, 0xa3, 0x13, 0x3d, 0xd4, 0xb8, 0x6d, 0xbc, 0x20,
+ 0x1f, 0xc5, 0xbc, 0x06, 0xb5, 0x16, 0x3e, 0xcd, 0x4d, 0x90, 0xbd, 0xb8, 0xcc,
+ 0xd4, 0x3d, 0xbd, 0xe9, 0xd1, 0xbd, 0x90, 0x68, 0xcf, 0x3d, 0xa7, 0xc6, 0x08,
+ 0xbe, 0x1c, 0xe5, 0x5c, 0xbd, 0x6e, 0x56, 0xa6, 0x3d, 0x74, 0x4f, 0xa5, 0x3d,
+ 0x96, 0x2b, 0x5a, 0x3d, 0xbe, 0xc6, 0x9b, 0xbd, 0x94, 0x33, 0x18, 0x3d, 0x57,
+ 0x1a, 0x6b, 0xbd, 0xd7, 0x3d, 0x03, 0xbe, 0x6a, 0x36, 0x65, 0xbd, 0x13, 0x36,
+ 0xbf, 0x3d, 0x82, 0x9a, 0x0a, 0x3d, 0x3c, 0x1d, 0xca, 0xbd, 0x0c, 0x40, 0x0e,
+ 0xbe, 0x3f, 0x94, 0xae, 0xbd, 0x1f, 0x7e, 0x89, 0x3d, 0xe3, 0xbf, 0x30, 0xbe,
+ 0x7a, 0x48, 0x23, 0x3a, 0xe5, 0x0e, 0x5d, 0x3d, 0x91, 0xd3, 0xf2, 0x3d, 0xb6,
+ 0xef, 0x4a, 0xbd, 0xd4, 0xb3, 0x08, 0xbe, 0xa9, 0xba, 0xac, 0x3d, 0x31, 0x40,
+ 0x86, 0x3d, 0xc2, 0xc7, 0x04, 0xbe, 0x7c, 0x3b, 0xdb, 0x3d, 0x11, 0x25, 0x04,
+ 0xbd, 0x3f, 0x5d, 0xf3, 0xbc, 0xc2, 0x3f, 0xfb, 0x3c, 0x12, 0xac, 0xf4, 0xbd,
+ 0xa7, 0xc4, 0x32, 0x3c, 0xc9, 0xea, 0xe3, 0x3c, 0x7d, 0xda, 0x36, 0x3c, 0x43,
+ 0x55, 0x09, 0x3e, 0x5f, 0xd8, 0x22, 0xbd, 0x33, 0xf5, 0x29, 0x3e, 0xb8, 0x23,
+ 0x8a, 0xbc, 0xfb, 0x3f, 0x52, 0xbe, 0xec, 0x1c, 0x79, 0x3d, 0x09, 0x9e, 0x24,
+ 0xbd, 0x5b, 0x3c, 0xd3, 0xbd, 0x9f, 0x0b, 0x1f, 0x3e, 0x1f, 0xa2, 0xfc, 0xbd,
+ 0x3b, 0x42, 0x9b, 0x3b, 0x0a, 0xae, 0xc4, 0xbc, 0x8b, 0xc8, 0xa7, 0x3d, 0x88,
+ 0xaa, 0x9b, 0xbd, 0xaa, 0x37, 0xb6, 0x3d, 0x0d, 0x6a, 0x15, 0x3d, 0x47, 0xa8,
+ 0x87, 0x3d, 0x53, 0xb1, 0xe3, 0x3d, 0xf7, 0x63, 0x0e, 0x3c, 0x37, 0x70, 0x8e,
+ 0xbc, 0xc5, 0x5c, 0x32, 0xbe, 0x72, 0x7a, 0xd5, 0x3d, 0xcb, 0xac, 0xc7, 0xbd,
+ 0x6f, 0xf1, 0x3a, 0xbd, 0x74, 0x40, 0x99, 0x3d, 0x35, 0x16, 0x88, 0xbc, 0xb4,
+ 0x80, 0x14, 0x3e, 0x0b, 0x98, 0xd9, 0x3c, 0xa7, 0x98, 0x17, 0xbc, 0x6e, 0xd0,
+ 0x60, 0xbb, 0xd9, 0xc2, 0x8f, 0x3d, 0xea, 0x37, 0xe1, 0xbd, 0x00, 0x42, 0xfd,
+ 0x3d, 0xde, 0xb0, 0x3a, 0x3d, 0x4f, 0xe2, 0x50, 0x3c, 0x76, 0x9f, 0x42, 0xbd,
+ 0x73, 0x18, 0x4e, 0xbe, 0x9b, 0xfd, 0x69, 0xbd, 0x69, 0xb2, 0x88, 0xbc, 0x6a,
+ 0x13, 0x3e, 0xbd, 0x29, 0xf0, 0x0c, 0x3c, 0x1f, 0x81, 0x18, 0x3d, 0x03, 0x2e,
+ 0x0c, 0x3e, 0xff, 0xf1, 0x4a, 0xbc, 0xb7, 0x9c, 0x14, 0xbe, 0xd5, 0x52, 0xce,
+ 0xbd, 0xf6, 0x45, 0xf0, 0x3d, 0x8d, 0xc8, 0x55, 0xbd, 0x8f, 0xf0, 0x88, 0x3d,
+ 0x8c, 0x8f, 0x20, 0xbd, 0x38, 0x7c, 0x4d, 0x3e, 0x6d, 0xba, 0x95, 0xbd, 0xdc,
+ 0x7b, 0x0d, 0xbe, 0x3d, 0xbf, 0x2d, 0x3c, 0xee, 0xf6, 0xcb, 0x3c, 0x42, 0x85,
+ 0x2e, 0x3d, 0x43, 0x4c, 0xb3, 0x3d, 0xe6, 0x70, 0x91, 0xbd, 0x58, 0x98, 0xfd,
+ 0x3d, 0x70, 0x75, 0x52, 0xbd, 0xb7, 0x44, 0x34, 0xbe, 0x62, 0x65, 0xdc, 0xbd,
+ 0xb8, 0xc7, 0x83, 0x3c, 0x0d, 0x0a, 0xaa, 0xbd, 0x09, 0xcb, 0x92, 0x3c, 0xbd,
+ 0x5d, 0xc7, 0xb9, 0x3a, 0x4e, 0xa6, 0xbd, 0xd8, 0xfb, 0xa6, 0xbd, 0xcd, 0xfc,
+ 0x72, 0xbe, 0x12, 0xdc, 0x4d, 0xbd, 0x0a, 0x7c, 0x5d, 0x3d, 0x8c, 0xce, 0x7a,
+ 0x3d, 0xe8, 0x3d, 0x83, 0xbd, 0x0d, 0x6c, 0x9e, 0x3d, 0x14, 0xb3, 0x3c, 0x3d,
+ 0x05, 0x0e, 0xdf, 0x3d, 0xf7, 0x27, 0xb7, 0xbd, 0xa3, 0x18, 0x08, 0x3d, 0x54,
+ 0xdb, 0x6a, 0x3c, 0x93, 0x1a, 0x80, 0xbd, 0xf9, 0x13, 0x05, 0x3e, 0xd9, 0x61,
+ 0x87, 0x3d, 0x08, 0xa5, 0x9b, 0xbd, 0x70, 0x5d, 0xc9, 0xbc, 0x9b, 0x99, 0x94,
+ 0xbd, 0xc5, 0x6e, 0xd4, 0xbd, 0xc8, 0x60, 0xad, 0x3d, 0x29, 0x62, 0x05, 0xbd,
+ 0x83, 0xd8, 0xc1, 0xbd, 0xa2, 0x72, 0xf1, 0x3d, 0x57, 0x3f, 0x2e, 0xbb, 0xb8,
+ 0x1a, 0xcf, 0xbc, 0xc3, 0xda, 0x96, 0xbd, 0xd3, 0xbc, 0x81, 0xbd, 0xca, 0x52,
+ 0xa1, 0xbb, 0xe8, 0xaf, 0x6a, 0x3d, 0x49, 0xaa, 0xf8, 0x3c, 0x5f, 0x2a, 0x9a,
+ 0xbd, 0xcb, 0x12, 0x6b, 0xbd, 0xc9, 0x4a, 0x8f, 0xbc, 0xce, 0x3c, 0xfd, 0x3d,
+ 0x71, 0x17, 0xed, 0x3d, 0x54, 0x40, 0xea, 0xbd, 0xcb, 0x7f, 0x2d, 0xbd, 0x2c,
+ 0x13, 0x86, 0x3d, 0xcd, 0x8c, 0x44, 0xbd, 0xe4, 0x65, 0xa6, 0xbb, 0x06, 0x81,
+ 0x04, 0x3d, 0x64, 0x45, 0x8e, 0x3d, 0xef, 0x80, 0x22, 0xbd, 0x35, 0x90, 0xaa,
+ 0xbd, 0x02, 0xb6, 0x48, 0x3d, 0x76, 0xba, 0x39, 0x3d, 0xf3, 0xce, 0x66, 0xbd,
+ 0x3f, 0x8e, 0xf1, 0xbd, 0x2a, 0x81, 0x0e, 0xbd, 0x82, 0x05, 0x0b, 0x3e, 0x7b,
+ 0xdb, 0x2f, 0x3d, 0x86, 0xe3, 0xba, 0x3d, 0xac, 0x47, 0x17, 0x3e, 0xcb, 0x96,
+ 0x8f, 0x3c, 0x3b, 0x58, 0xe7, 0xbd, 0x38, 0x64, 0x46, 0xbe, 0x9e, 0x73, 0x88,
+ 0xbd, 0x0f, 0xf0, 0x8e, 0xbd, 0xc1, 0x4c, 0x00, 0xbd, 0x70, 0xbb, 0x54, 0xbd,
+ 0x74, 0x55, 0x20, 0x3b, 0x1f, 0x22, 0x8d, 0x3d, 0xc9, 0x1d, 0xce, 0x3c, 0xad,
+ 0x53, 0x3f, 0x3d, 0x7e, 0xd8, 0xb2, 0x3d, 0x9e, 0xc0, 0xf5, 0x3d, 0x79, 0x01,
+ 0x32, 0xbd, 0x49, 0x13, 0x2e, 0x3d, 0xff, 0x7a, 0xce, 0x3d, 0xb5, 0xbc, 0x46,
+ 0x3d, 0x43, 0xa5, 0xc8, 0xbd, 0xf2, 0x4d, 0xd3, 0x3b, 0x78, 0x3e, 0x39, 0x3d,
+ 0x2c, 0x01, 0xc7, 0xbd, 0x5d, 0x5b, 0x8d, 0xbd, 0xb1, 0x3b, 0xa3, 0xbd, 0x1f,
+ 0x70, 0x6e, 0x3c, 0x62, 0x07, 0x58, 0xbd, 0x29, 0xd9, 0xc8, 0xba, 0x13, 0xa6,
+ 0xd3, 0xbd, 0xc1, 0x45, 0xbf, 0xbc, 0x3e, 0x9f, 0xea, 0xbc, 0x7c, 0x4d, 0xcc,
+ 0x3d, 0x6c, 0x0c, 0x2e, 0xbd, 0xcf, 0xa0, 0x9a, 0x3b, 0x83, 0x9e, 0xfa, 0xbd,
+ 0x77, 0x21, 0xaa, 0x3d, 0xcf, 0x18, 0xf5, 0xbd, 0xfe, 0x30, 0x79, 0x3d, 0x24,
+ 0x33, 0x4d, 0x3d, 0xf7, 0x5f, 0x54, 0x3d, 0xda, 0x9d, 0xc9, 0xbd, 0x28, 0x08,
+ 0x16, 0x3d, 0x53, 0x5a, 0xf6, 0xbc, 0xa5, 0x86, 0x84, 0xbd, 0x91, 0x39, 0xc5,
+ 0xbc, 0x54, 0x2b, 0xda, 0xbd, 0x49, 0x34, 0xae, 0xbd, 0x9d, 0xad, 0x3a, 0xbd,
+ 0x43, 0x59, 0xf1, 0x3d, 0x5c, 0xef, 0x06, 0x3e, 0xc7, 0xe0, 0x32, 0x3d, 0x43,
+ 0xb3, 0x87, 0x3d, 0x12, 0x6c, 0x02, 0xbe, 0x9c, 0xdc, 0x02, 0x3e, 0x22, 0xcc,
+ 0x1b, 0xbe, 0x46, 0x37, 0xe8, 0x3d, 0xf0, 0x11, 0x3b, 0xbd, 0x0d, 0x62, 0x51,
+ 0x3d, 0x8b, 0x64, 0x2f, 0x3d, 0x57, 0x97, 0x5e, 0x3d, 0x53, 0xdd, 0xd6, 0x3c,
+ 0x00, 0xf5, 0xfb, 0xbc, 0x6f, 0x83, 0xea, 0x3b, 0xec, 0x88, 0x20, 0xbb, 0xe5,
+ 0x7f, 0xe6, 0x3d, 0xe6, 0xc4, 0xb5, 0x3d, 0x05, 0x76, 0x0f, 0xbe, 0x4a, 0x2f,
+ 0x61, 0xbd, 0xa0, 0x69, 0xe2, 0x3d, 0xab, 0xc9, 0xb4, 0x3d, 0xeb, 0xd7, 0x88,
+ 0xbc, 0x8f, 0x65, 0xfb, 0xbd, 0xc5, 0xca, 0x93, 0xbc, 0x1f, 0xe5, 0xa9, 0x3d,
+ 0x0b, 0x34, 0x06, 0x3e, 0xbd, 0x9e, 0xe1, 0x3d, 0x58, 0x9d, 0xec, 0xbd, 0x60,
+ 0x28, 0xe3, 0xbc, 0x62, 0x2e, 0x85, 0x3d, 0xec, 0x10, 0xb6, 0x3d, 0xd4, 0x0e,
+ 0x55, 0x3d, 0x6a, 0xd9, 0x22, 0xbd, 0xa4, 0x2c, 0xb0, 0xbd, 0x8f, 0x8c, 0x8b,
+ 0x3d, 0x05, 0xa0, 0xbb, 0x3d, 0x7b, 0xf7, 0xc0, 0x3d, 0xca, 0x2f, 0x90, 0xbc,
+ 0x07, 0x79, 0xe3, 0xbd, 0x8b, 0x7d, 0x83, 0xbd, 0xfe, 0x8a, 0x93, 0xbc, 0xc0,
+ 0xe9, 0xd0, 0x3d, 0xfb, 0x88, 0x76, 0xbc, 0x2d, 0x4b, 0x99, 0x3c, 0x69, 0x04,
+ 0xd3, 0x3c, 0xb6, 0xd2, 0x88, 0x3d, 0xeb, 0xe2, 0x71, 0xbd, 0xa8, 0xb5, 0x98,
+ 0x3d, 0x08, 0x79, 0xea, 0xbd, 0x7c, 0x53, 0x03, 0xbd, 0xb1, 0xda, 0xf9, 0xbd,
+ 0xf1, 0x53, 0x83, 0xbc, 0xa0, 0xb3, 0x49, 0xbd, 0x7c, 0x79, 0x07, 0x3c, 0x68,
+ 0x60, 0x21, 0x3c, 0xb1, 0x1f, 0x38, 0x3d, 0x5d, 0x0c, 0x4e, 0x3d, 0x36, 0x83,
+ 0x62, 0x3c, 0x87, 0x96, 0x22, 0xbd, 0xd2, 0x3a, 0x09, 0x3c, 0xa2, 0x6e, 0x7a,
+ 0xbd, 0x54, 0xc7, 0x31, 0xbc, 0x3a, 0x58, 0x1e, 0xbd, 0x51, 0x31, 0x94, 0x3d,
+ 0x28, 0x85, 0xde, 0xbc, 0x52, 0x0e, 0xce, 0xbd, 0x79, 0x6a, 0xfb, 0xbd, 0x0f,
+ 0x76, 0x14, 0xbd, 0xb4, 0xf0, 0xb3, 0x3c, 0x30, 0x4e, 0xab, 0xbd, 0xbc, 0x21,
+ 0x2a, 0x3d, 0xa7, 0x29, 0x93, 0x3d, 0x05, 0x5e, 0x79, 0x3c, 0xc0, 0xdc, 0x93,
+ 0xbd, 0x8c, 0x46, 0xd3, 0x3d, 0x6d, 0xef, 0x21, 0x3d, 0xcd, 0x62, 0xe5, 0x3d,
+ 0xf2, 0x5f, 0xbc, 0xbd, 0xec, 0xb5, 0x6e, 0x3d, 0x8f, 0xdd, 0xd1, 0x3c, 0xb6,
+ 0x13, 0x93, 0xbd, 0x1e, 0x1d, 0x0a, 0x3e, 0xfe, 0x00, 0x0a, 0x3d, 0xfe, 0xea,
+ 0x70, 0x3c, 0x1e, 0x69, 0x94, 0xbd, 0x54, 0x92, 0xdf, 0x3d, 0x8d, 0xc4, 0xe3,
+ 0xbd, 0xa8, 0x26, 0xc1, 0x3d, 0x90, 0x69, 0x97, 0x3d, 0x5f, 0xf7, 0x21, 0x3e,
+ 0xd8, 0xf4, 0x13, 0x3d, 0x8e, 0x0f, 0x2a, 0x3d, 0x1a, 0xf3, 0xe8, 0x3d, 0xb1,
+ 0x70, 0x75, 0xbd, 0x3d, 0x10, 0x87, 0x3d, 0xf2, 0x55, 0x8f, 0xbd, 0x7f, 0x15,
+ 0x07, 0xbe, 0xe0, 0x3c, 0xba, 0x3d, 0x6d, 0x1f, 0xc2, 0xbc, 0xd6, 0xbf, 0x2c,
+ 0xbd, 0x01, 0x4c, 0x87, 0x3c, 0xd8, 0xe5, 0x93, 0x3d, 0x6e, 0x5a, 0x12, 0x3d,
+ 0xff, 0x3a, 0xd1, 0x3d, 0xfa, 0x05, 0x0a, 0x3d, 0x5a, 0xce, 0xa3, 0xbc, 0xc5,
+ 0x2b, 0xd8, 0x3d, 0x98, 0xb3, 0xce, 0xbd, 0x6b, 0x72, 0x90, 0x3d, 0xa7, 0x35,
+ 0xbb, 0xbd, 0xe2, 0xcb, 0xae, 0xbc, 0x8e, 0xe3, 0x74, 0x3d, 0xcd, 0x32, 0xcf,
+ 0xbd, 0x76, 0x8d, 0x1d, 0x3d, 0x27, 0xc5, 0x0c, 0xbe, 0x27, 0x7e, 0x6c, 0xbd,
+ 0x54, 0xf1, 0xdb, 0x3d, 0x39, 0x03, 0xed, 0xbc, 0xd7, 0x4b, 0xe1, 0x3a, 0x19,
+ 0x67, 0x90, 0x3d, 0xf5, 0x03, 0x89, 0x3d, 0x31, 0x9d, 0xd4, 0x3a, 0x06, 0x9d,
+ 0x05, 0x3e, 0xde, 0xaf, 0x63, 0xbd, 0xed, 0xfe, 0x54, 0x3c, 0xdd, 0x40, 0xc5,
+ 0xbd, 0xf5, 0x54, 0x0d, 0xbc, 0x3e, 0xaa, 0xcd, 0x3c, 0x08, 0x18, 0xbf, 0xbd,
+ 0x79, 0x2e, 0x90, 0xbd, 0x15, 0xe3, 0x8a, 0x3d, 0x7b, 0x54, 0x7c, 0xbd, 0x85,
+ 0x07, 0xd0, 0x3d, 0xfb, 0x39, 0x01, 0xbd, 0x12, 0x57, 0xf0, 0xbd, 0x56, 0x7c,
+ 0x8d, 0xbd, 0xae, 0x9e, 0xaf, 0x3c, 0x90, 0xc3, 0x85, 0x3d, 0x9c, 0x00, 0x88,
+ 0x3d, 0x1f, 0x9a, 0x8f, 0xbd, 0x80, 0xef, 0xc4, 0xb9, 0x60, 0xba, 0x5b, 0xbd,
+ 0x05, 0x25, 0xd8, 0x3c, 0x76, 0x60, 0x6d, 0x3d, 0xc5, 0xf0, 0xe1, 0x3c, 0x0d,
+ 0x00, 0xf7, 0x3d, 0x57, 0xb7, 0x24, 0x3d, 0x2c, 0x11, 0x06, 0xbe, 0x48, 0x15,
+ 0x5b, 0xbd, 0x0c, 0x67, 0x22, 0xbd, 0xc9, 0x10, 0x07, 0x3c, 0x69, 0x42, 0xbb,
+ 0xbd, 0x5b, 0x32, 0xb8, 0xbd, 0x62, 0x5e, 0x35, 0xbd, 0xfc, 0xe1, 0x22, 0xbd,
+ 0xff, 0xb3, 0x51, 0xbd, 0x6e, 0x4d, 0x2d, 0x3c, 0xfb, 0xca, 0xc5, 0xbd, 0x15,
+ 0x16, 0x32, 0x3d, 0x50, 0xff, 0xbe, 0xbd, 0xf7, 0x84, 0x5e, 0xbb, 0x27, 0xa2,
+ 0x17, 0x3c, 0x83, 0x85, 0xda, 0xbd, 0xd3, 0x8f, 0xd8, 0x3d, 0x19, 0xd4, 0x9d,
+ 0xbd, 0x05, 0x56, 0xbd, 0x3b, 0x80, 0x5c, 0x8d, 0xbd, 0x02, 0x07, 0x01, 0x3e,
+ 0x46, 0x0a, 0xd0, 0x3c, 0x28, 0x0a, 0x74, 0x3d, 0x45, 0xd8, 0x9c, 0x3d, 0x51,
+ 0x8c, 0xe1, 0x3d, 0x94, 0x9d, 0x44, 0xbc, 0x1a, 0xfd, 0x6d, 0x3d, 0x6a, 0xa7,
+ 0x00, 0x3e, 0x03, 0xb0, 0xa5, 0xbd, 0x84, 0xb6, 0x94, 0x3c, 0x6e, 0x1b, 0xd2,
+ 0xbd, 0xff, 0xcf, 0xbd, 0xbd, 0x7f, 0x7c, 0x6c, 0xbd, 0xa0, 0xb0, 0x4a, 0xbd,
+ 0x8c, 0xfc, 0xca, 0xbc, 0xf4, 0xa1, 0x81, 0xbd, 0x22, 0xad, 0xe2, 0x3c, 0xfa,
+ 0x91, 0xaf, 0x3d, 0xf4, 0x2e, 0x19, 0xbd, 0x0b, 0x57, 0x71, 0xbc, 0x21, 0xca,
+ 0x8d, 0x3c, 0xee, 0x8c, 0x2b, 0x3a, 0x46, 0x1a, 0xc1, 0xbb, 0x51, 0xbe, 0x2c,
+ 0xbd, 0xc0, 0x3f, 0x40, 0x3d, 0xb2, 0xbb, 0x96, 0x3d, 0x88, 0x43, 0x23, 0xbe,
+ 0x26, 0xd9, 0xe8, 0xbd, 0xf7, 0xfc, 0x9d, 0xbd, 0x4e, 0xf6, 0xd3, 0xbc, 0x2a,
+ 0xda, 0xba, 0xbd, 0xe1, 0x21, 0xe1, 0x3d, 0x81, 0xea, 0x2e, 0xbd, 0xde, 0xaa,
+ 0xd2, 0xbb, 0xde, 0x20, 0xbe, 0x3d, 0x15, 0x2f, 0x44, 0x3d, 0x37, 0x58, 0x6e,
+ 0xbd, 0xcd, 0x34, 0x4c, 0xbb, 0x8d, 0xad, 0x08, 0xbc, 0xd9, 0xe2, 0x21, 0x3d,
+ 0xfe, 0x8b, 0xab, 0x3d, 0xa2, 0x7f, 0x47, 0xbd, 0xad, 0xbe, 0xe3, 0xbc, 0x5f,
+ 0x5d, 0x20, 0x3d, 0xa7, 0xa7, 0x19, 0xbe, 0x27, 0x1b, 0x8a, 0xbd, 0x2e, 0xcf,
+ 0x4d, 0x3d, 0x68, 0x43, 0xb0, 0x3d, 0x54, 0xe8, 0xec, 0x3b, 0x5f, 0x47, 0x57,
+ 0xbd, 0xde, 0x1b, 0xc4, 0x3d, 0xd2, 0x08, 0xfa, 0xbb, 0x23, 0x97, 0xe5, 0x3d,
+ 0xb3, 0x70, 0x6b, 0x3d, 0x33, 0x68, 0x2a, 0xbc, 0xbb, 0xc7, 0xb5, 0xbd, 0x31,
+ 0xe2, 0xcd, 0xbd, 0xe3, 0x77, 0x44, 0x3d, 0xb1, 0xf5, 0x60, 0x3d, 0x03, 0x24,
+ 0xf7, 0xbd, 0x6c, 0x04, 0xb0, 0x3c, 0xba, 0x53, 0xa9, 0xbd, 0xcb, 0x94, 0x03,
+ 0xbe, 0x19, 0x25, 0xfc, 0xbb, 0x8d, 0xaf, 0xe5, 0x3d, 0x95, 0xec, 0xa3, 0x3d,
+ 0xca, 0x8d, 0xcb, 0xbd, 0x71, 0x02, 0xee, 0x3c, 0x31, 0x55, 0xdf, 0xbd, 0x85,
+ 0xd6, 0x69, 0x3d, 0xa1, 0xd8, 0x1d, 0x3d, 0xd6, 0x60, 0x12, 0xbb, 0x46, 0x47,
+ 0x46, 0x3d, 0x75, 0xf9, 0x97, 0x3d, 0x4c, 0xd5, 0x87, 0x3d, 0xc4, 0x77, 0xb7,
+ 0x3c, 0x0a, 0xd5, 0x08, 0x3d, 0x7f, 0x4d, 0x74, 0xbd, 0xdd, 0x0e, 0x07, 0xbe,
+ 0x0d, 0xb1, 0x51, 0xbb, 0x95, 0xf0, 0xa7, 0x3d, 0x8d, 0xdc, 0xe7, 0xbd, 0x11,
+ 0x22, 0xd1, 0x3d, 0x81, 0xad, 0x8c, 0x3d, 0x51, 0x36, 0x1e, 0x3d, 0xe3, 0x75,
+ 0x01, 0x3e, 0xa1, 0xd1, 0x9a, 0x3d, 0x4f, 0xd4, 0xc4, 0x3d, 0x50, 0x2a, 0x61,
+ 0x3c, 0x9a, 0xd5, 0xbd, 0xbd, 0x37, 0xd1, 0xd5, 0x3c, 0xd5, 0x83, 0x8e, 0x3d,
+ 0xbd, 0x05, 0xb6, 0xbb, 0x52, 0x6b, 0x66, 0x3d, 0x25, 0xcb, 0x0c, 0xbe, 0x3a,
+ 0xff, 0xd3, 0xbd, 0xaf, 0xdc, 0xb3, 0xbd, 0xde, 0xdf, 0x06, 0x3d, 0x91, 0x0f,
+ 0xc8, 0xbd, 0x62, 0xa1, 0x8f, 0xbc, 0x1c, 0x36, 0x40, 0x3c, 0x7d, 0x4f, 0xfa,
+ 0x3d, 0x99, 0x76, 0xd5, 0x3d, 0xc3, 0x21, 0x5c, 0xbb, 0x61, 0x54, 0x52, 0xbc,
+ 0xc4, 0x07, 0x9b, 0xbd, 0xb3, 0x00, 0x44, 0xbc, 0xbe, 0x1b, 0x06, 0xbd, 0x35,
+ 0x4c, 0x5d, 0x3d, 0x6b, 0x45, 0x17, 0xbd, 0x10, 0xd6, 0xe5, 0xbd, 0x40, 0x57,
+ 0x83, 0x3d, 0x62, 0xd1, 0x64, 0xbd, 0x79, 0x90, 0xbd, 0xbc, 0xce, 0xf0, 0x07,
+ 0x3e, 0xc0, 0xbd, 0xaf, 0x3d, 0x88, 0xe1, 0x84, 0xbd, 0xf0, 0xdb, 0x4c, 0x3d,
+ 0x17, 0x35, 0x02, 0x3b, 0x30, 0x1c, 0xed, 0xbd, 0x4f, 0xfc, 0xda, 0x3d, 0x92,
+ 0x80, 0x87, 0xbc, 0x02, 0x74, 0x1a, 0xbe, 0xdc, 0xb1, 0xb3, 0xbd, 0x6c, 0x01,
+ 0xc0, 0xbc, 0x8f, 0x2d, 0x8c, 0x3d, 0xf5, 0x96, 0xc0, 0xbd, 0x77, 0xbc, 0x7f,
+ 0xbd, 0x8a, 0x64, 0xf1, 0x3c, 0xb7, 0x6c, 0xb4, 0xbd, 0x1c, 0x6f, 0x84, 0x3d,
+ 0xa1, 0xd5, 0xc0, 0xbd, 0xbf, 0x63, 0xd4, 0x3d, 0xd6, 0xd7, 0xe7, 0x3d, 0x89,
+ 0x1e, 0x64, 0x3c, 0xf3, 0x81, 0xbe, 0xbd, 0xb3, 0x57, 0xe9, 0xbd, 0x84, 0x5e,
+ 0x9a, 0x3d, 0x77, 0x22, 0x01, 0xbe, 0x53, 0xa3, 0xb8, 0xbd, 0xc0, 0x62, 0xff,
+ 0x3b, 0x9a, 0xfb, 0xbd, 0x3d, 0x13, 0x1a, 0xeb, 0x3b, 0x3b, 0x96, 0x78, 0x3d,
+ 0xfc, 0xc6, 0x93, 0x3d, 0xfc, 0x33, 0x92, 0x3d, 0xcc, 0xc1, 0x62, 0xbd, 0x63,
+ 0x7c, 0x77, 0xbd, 0x69, 0x92, 0x05, 0xbd, 0xbd, 0xee, 0xb8, 0x3a, 0xa2, 0x9d,
+ 0x0e, 0xbe, 0xf3, 0xba, 0xed, 0xbd, 0x2f, 0x6a, 0xaa, 0x3d, 0x77, 0x4a, 0xc6,
+ 0x3d, 0x4f, 0xe7, 0xa8, 0x3d, 0x1e, 0x3f, 0xbb, 0xbd, 0xae, 0x6c, 0xb8, 0xbc,
+ 0x75, 0xf1, 0x6d, 0xbd, 0xc1, 0x5d, 0x11, 0xbe, 0x2b, 0xe2, 0x4f, 0xbd, 0x54,
+ 0x21, 0xf6, 0x3b, 0x5c, 0xe2, 0x96, 0x3c, 0xbe, 0xe8, 0x2e, 0x3d, 0x38, 0x39,
+ 0x93, 0x3c, 0xc3, 0x50, 0xbc, 0x3d, 0x67, 0x1d, 0xc4, 0x3d, 0xe6, 0x29, 0x56,
+ 0xbc, 0x4d, 0x70, 0x4d, 0x3c, 0xd2, 0xca, 0xc4, 0xbd, 0xa1, 0x30, 0x3b, 0xbd,
+ 0x97, 0x9b, 0xb5, 0xbd, 0x65, 0x99, 0x9b, 0xbd, 0xb5, 0x65, 0xb7, 0xbd, 0x51,
+ 0xe1, 0x9a, 0xbd, 0x2f, 0x56, 0x4a, 0xbb, 0x9c, 0x68, 0x98, 0xbd, 0x36, 0x75,
+ 0x73, 0xbd, 0x19, 0xe1, 0x83, 0xbd, 0x37, 0x69, 0xee, 0x3d, 0xe7, 0xd1, 0xad,
+ 0xbd, 0x3b, 0x29, 0x95, 0xbd, 0xcd, 0x10, 0x75, 0x3d, 0xb4, 0x82, 0xc2, 0xbc,
+ 0x72, 0xd7, 0x91, 0x3d, 0xc8, 0x77, 0x49, 0xbd, 0x96, 0x67, 0x4d, 0xbd, 0xc5,
+ 0x75, 0x98, 0xbd, 0x96, 0x67, 0xcc, 0x3d, 0xba, 0x7a, 0x1e, 0xbe, 0x30, 0x3a,
+ 0x02, 0x3d, 0xc1, 0xf8, 0x78, 0x3d, 0x46, 0xfc, 0xc1, 0x3d, 0x99, 0x3c, 0xc5,
+ 0xbd, 0xbc, 0x69, 0x39, 0x3d, 0x7f, 0x95, 0xf0, 0x3b, 0x50, 0x78, 0x57, 0xbd,
+ 0xfa, 0xf7, 0xa9, 0xbc, 0xb2, 0xae, 0x2b, 0x3c, 0x22, 0x75, 0x0d, 0x3e, 0x63,
+ 0xaa, 0x03, 0x3d, 0xfa, 0x00, 0xd7, 0x3d, 0xc3, 0xcb, 0x60, 0x3c, 0xab, 0xf2,
+ 0x61, 0x3c, 0x1b, 0x9a, 0x38, 0xbd, 0x1a, 0x33, 0xef, 0xbd, 0x9e, 0x11, 0xc5,
+ 0x3d, 0xf5, 0xb1, 0x99, 0xbc, 0x65, 0xee, 0x5e, 0xbc, 0xde, 0x02, 0xe8, 0xbd,
+ 0xef, 0x87, 0x58, 0x3d, 0x0e, 0x01, 0xcf, 0x3d, 0x51, 0xf7, 0xcb, 0xbc, 0x9e,
+ 0x48, 0x50, 0xbd, 0xd2, 0xc8, 0x88, 0xbc, 0x56, 0x0a, 0x18, 0x3e, 0x49, 0xa6,
+ 0xce, 0xbd, 0x9d, 0x8d, 0xf4, 0x3d, 0xd9, 0x71, 0x7e, 0x3d, 0x49, 0xcb, 0x67,
+ 0x3d, 0x3d, 0x4f, 0xdb, 0x3c, 0x8c, 0x3b, 0xaa, 0xbd, 0xce, 0xc4, 0x1f, 0x3d,
+ 0xda, 0x94, 0xaa, 0x3c, 0x4c, 0xae, 0x89, 0x3d, 0xac, 0x7e, 0x8d, 0x3d, 0xff,
+ 0xfe, 0xf7, 0x3d, 0x89, 0xba, 0xbd, 0xbd, 0x98, 0xc1, 0x5c, 0x3d, 0x9a, 0xcf,
+ 0x1b, 0xba, 0xdb, 0x22, 0xf3, 0x3d, 0x3a, 0xa6, 0x58, 0xbd, 0x6b, 0x7d, 0x2b,
+ 0x3d, 0x22, 0x6f, 0xa2, 0xbd, 0x95, 0xf3, 0x07, 0x3e, 0x14, 0xfb, 0x7a, 0x3d,
+ 0xda, 0x56, 0x40, 0xbd, 0x85, 0xe7, 0xcf, 0xbd, 0x7f, 0x4c, 0xb8, 0x3c, 0xf0,
+ 0x6d, 0xc1, 0xbd, 0xb1, 0x01, 0xbd, 0x3d, 0xb4, 0xc0, 0xc0, 0xbd, 0x4f, 0x5f,
+ 0xca, 0xbd, 0x4e, 0x96, 0xe1, 0x3d, 0x92, 0x0a, 0xa6, 0x3d, 0xd6, 0xd9, 0xb7,
+ 0x3d, 0x8b, 0x52, 0xa8, 0x3d, 0xa9, 0xe6, 0xb4, 0xbc, 0x16, 0x49, 0xc0, 0x3b,
+ 0xed, 0x64, 0xd1, 0x3d, 0xf1, 0xaf, 0x20, 0xbc, 0x8f, 0x44, 0xd9, 0x3b, 0xc0,
+ 0x7a, 0xb4, 0x3d, 0x31, 0xb6, 0x15, 0xbe, 0x82, 0x8e, 0x62, 0xbd, 0xb3, 0x93,
+ 0x1e, 0xbd, 0xae, 0x33, 0x8c, 0xbd, 0x82, 0xf3, 0xa6, 0x3c, 0xd2, 0x41, 0xb2,
+ 0xbc, 0x58, 0x37, 0xce, 0x3d, 0xb9, 0xd2, 0xce, 0x3d, 0x99, 0x90, 0x69, 0x3d,
+ 0xc3, 0x4b, 0xc8, 0x3d, 0xba, 0xfa, 0xcb, 0x3d, 0xee, 0x4a, 0xfe, 0xbc, 0x24,
+ 0xc5, 0x3c, 0xbd, 0x5a, 0x95, 0xb3, 0xbd, 0xb1, 0xc0, 0x1f, 0xbd, 0x61, 0x53,
+ 0xb4, 0x3c, 0x2e, 0x79, 0xc7, 0xbd, 0xd6, 0x70, 0x9d, 0xbd, 0x9d, 0xe7, 0x16,
+ 0x3d, 0x4f, 0xe9, 0xa9, 0xbc, 0x7d, 0xbb, 0x7c, 0xbd, 0xf0, 0xdf, 0xe9, 0xbc,
+ 0x66, 0xc4, 0x3f, 0xbd, 0xfc, 0xd3, 0x20, 0xbd, 0xd3, 0x4f, 0x36, 0xbd, 0x72,
+ 0x8d, 0xec, 0x3d, 0x79, 0xbc, 0xaa, 0x3d, 0x69, 0x95, 0xe7, 0x3d, 0x46, 0xb6,
+ 0xcc, 0xbc, 0xdd, 0x97, 0x70, 0xbd, 0x96, 0x31, 0x0c, 0xbe, 0x48, 0x86, 0xeb,
+ 0x3d, 0x74, 0xf6, 0xa3, 0x3c, 0xe8, 0x26, 0xa1, 0x3d, 0xe3, 0xdd, 0x70, 0xbd,
+ 0xcf, 0xbd, 0x02, 0x3c, 0x13, 0x3e, 0xbc, 0xbd, 0x69, 0xad, 0x05, 0xbd, 0xc0,
+ 0xad, 0x53, 0x3c, 0xb6, 0x7c, 0xb2, 0xbd, 0x27, 0xc3, 0xfd, 0xbc, 0x5f, 0x42,
+ 0xc5, 0x3d, 0x2f, 0x17, 0xd6, 0x3d, 0xb2, 0x68, 0xda, 0xbd, 0x95, 0xe5, 0x4f,
+ 0x3c, 0xae, 0x99, 0xe4, 0x3d, 0x8f, 0x5c, 0xde, 0xbd, 0xf1, 0x87, 0x02, 0xbb,
+ 0x17, 0x17, 0x7a, 0x3d, 0x75, 0x72, 0x1f, 0x3d, 0x70, 0x34, 0xa4, 0xbd, 0x43,
+ 0x2a, 0xb2, 0x3d, 0xd9, 0x5a, 0xc7, 0x3d, 0xa5, 0x58, 0xc6, 0x3d, 0xa3, 0xb8,
+ 0x76, 0xbd, 0x5b, 0xf5, 0x27, 0x3c, 0x58, 0xfa, 0x60, 0x3c, 0xcc, 0x2e, 0xd4,
+ 0x3d, 0x71, 0xc3, 0x54, 0x3c, 0x75, 0xe3, 0x6b, 0x3d, 0x29, 0xf3, 0x9a, 0x3d,
+ 0x9d, 0x62, 0x8b, 0xbd, 0xcd, 0xa8, 0x9f, 0xbd, 0xee, 0xaa, 0xbf, 0x3c, 0xd7,
+ 0xe4, 0x20, 0xbd, 0x9f, 0x2c, 0xa4, 0x3c, 0x3a, 0x5e, 0x76, 0xbd, 0x9b, 0xcb,
+ 0x07, 0x3e, 0x3e, 0x33, 0x34, 0x3d, 0x69, 0x57, 0x26, 0x3c, 0xf5, 0x54, 0xef,
+ 0xbd, 0xf5, 0x3d, 0xe9, 0xbd, 0x8e, 0xed, 0x2b, 0x3d, 0x86, 0xf8, 0xb2, 0x3c,
+ 0xb2, 0x7f, 0x45, 0x3d, 0xe1, 0x4f, 0xbd, 0x3c, 0xa7, 0xc8, 0x91, 0xbd, 0xea,
+ 0x4c, 0xc5, 0x3d, 0x7a, 0x60, 0x7c, 0x3d, 0xce, 0x3e, 0xb6, 0x3d, 0xc3, 0x22,
+ 0x52, 0xbd, 0xbf, 0x54, 0xd3, 0xbc, 0xc7, 0xe0, 0xe1, 0xbd, 0x08, 0x86, 0xc8,
+ 0x3c, 0x98, 0x6c, 0xc3, 0xbd, 0xe6, 0xe1, 0x25, 0xbd, 0xdb, 0x07, 0x53, 0xbb,
+ 0xbd, 0x04, 0x5f, 0xbd, 0x12, 0xfd, 0xe6, 0xbd, 0x2d, 0x0f, 0xe8, 0x3d, 0x9e,
+ 0x08, 0x47, 0x3d, 0x93, 0xc8, 0xdc, 0xbd, 0x97, 0x91, 0xc9, 0xbd, 0xbd, 0x45,
+ 0x88, 0xbd, 0x45, 0x8e, 0x0b, 0xbe, 0x8f, 0xb7, 0xd1, 0xbd, 0x9b, 0x3c, 0xc2,
+ 0x3c, 0x04, 0xc5, 0xda, 0xba, 0xce, 0x19, 0x9a, 0x3d, 0xaf, 0xee, 0x25, 0x3e,
+ 0xdf, 0x56, 0x48, 0xbd, 0x9d, 0x42, 0x02, 0x3e, 0x2c, 0x6a, 0xef, 0x3c, 0x25,
+ 0x99, 0x07, 0x3c, 0x74, 0xa1, 0xca, 0x3c, 0xae, 0x08, 0x9e, 0x3c, 0xe5, 0xec,
+ 0x25, 0xbd, 0x63, 0x8f, 0xd5, 0x3d, 0xf3, 0x4a, 0xc5, 0xbc, 0xab, 0x02, 0x53,
+ 0xbd, 0x3e, 0xec, 0x5e, 0x3d, 0xea, 0xf2, 0x8f, 0x3d, 0xb9, 0xa3, 0x91, 0xbd,
+ 0xa9, 0x34, 0x93, 0xbd, 0xd4, 0x95, 0x78, 0x3d, 0x84, 0x2b, 0x04, 0x3e, 0xe7,
+ 0x61, 0x87, 0x3d, 0x41, 0x40, 0xe9, 0x3d, 0x3f, 0xea, 0xdc, 0xbc, 0xc9, 0xfd,
+ 0xa4, 0x3d, 0xf6, 0xd5, 0x69, 0x3d, 0xa5, 0x93, 0x99, 0xbb, 0x21, 0x84, 0x76,
+ 0x3d, 0xaa, 0xf2, 0x52, 0x3d, 0xbb, 0x3d, 0x9f, 0xbd, 0xd3, 0xd6, 0x6c, 0x3d,
+ 0xe6, 0xb2, 0xcc, 0xbc, 0x18, 0x3b, 0x30, 0x3d, 0x25, 0xcf, 0xc5, 0xbc, 0xe0,
+ 0xfd, 0xb4, 0x3c, 0x5c, 0x92, 0x6b, 0x3d, 0xa8, 0x01, 0x17, 0x3d, 0xf6, 0xed,
+ 0xa2, 0xbd, 0x42, 0x7b, 0xec, 0x3d, 0x8e, 0x87, 0xd7, 0x3d, 0xfa, 0x30, 0xb7,
+ 0x3d, 0x54, 0x66, 0x38, 0xbd, 0x68, 0xb5, 0xa9, 0xbd, 0x30, 0x1e, 0x7d, 0x3d,
+ 0x93, 0xf4, 0xd5, 0xbc, 0x69, 0x6a, 0x98, 0xbd, 0x8f, 0x2b, 0x4f, 0xbd, 0xd3,
+ 0x99, 0x9a, 0xbd, 0x9b, 0x72, 0xfe, 0xbc, 0xaf, 0xc3, 0xad, 0xbd, 0xe2, 0xdf,
+ 0xde, 0x3c, 0xdc, 0x3e, 0xd3, 0x3d, 0x46, 0xb7, 0x92, 0xbd, 0x22, 0xd0, 0x21,
+ 0xbd, 0x7a, 0x5e, 0xae, 0x3c, 0xb6, 0x91, 0xa4, 0x3d, 0xba, 0xda, 0x8f, 0xbc,
+ 0xad, 0xb4, 0x18, 0x3b, 0xb1, 0x16, 0x9c, 0xbd, 0x2f, 0xf7, 0x89, 0xbd, 0x89,
+ 0x33, 0xba, 0xbd, 0x03, 0x89, 0x61, 0xbd, 0xa8, 0x17, 0x50, 0xbd, 0xf5, 0xfe,
+ 0x1a, 0x3d, 0xd2, 0x25, 0x02, 0x3d, 0xbb, 0xc9, 0x67, 0xbd, 0xc8, 0x32, 0xe0,
+ 0x3d, 0x8e, 0xb2, 0x9e, 0xbd, 0x57, 0x57, 0x2a, 0xbc, 0xb4, 0xc4, 0x76, 0x3d,
+ 0xfd, 0x46, 0x11, 0x3b, 0x38, 0x45, 0xe8, 0x3a, 0x90, 0x49, 0xc6, 0xbd, 0xc3,
+ 0x50, 0x0b, 0xbe, 0x19, 0xca, 0xd9, 0x3d, 0x17, 0x4d, 0xe0, 0x3d, 0x68, 0x36,
+ 0x3f, 0xbc, 0x3a, 0x6e, 0xda, 0xbd, 0x50, 0xd8, 0xde, 0x3d, 0x6f, 0x09, 0x29,
+ 0xbe, 0x9d, 0x50, 0x03, 0xbd, 0x9a, 0x25, 0xf6, 0xbd, 0x43, 0xa2, 0xbc, 0x3d,
+ 0x9a, 0x55, 0xa5, 0x3d, 0xa9, 0x0d, 0x2f, 0xbd, 0x5c, 0x8e, 0x22, 0xbd, 0x2e,
+ 0xc1, 0x58, 0xbd, 0x5a, 0x05, 0x2c, 0xbd, 0xec, 0x19, 0xa1, 0xbd, 0xd7, 0x75,
+ 0x7b, 0x3d, 0x9a, 0xcf, 0x82, 0x3c, 0x46, 0xc6, 0xff, 0x3c, 0x37, 0xc8, 0xca,
+ 0x3d, 0xa0, 0xb7, 0x28, 0x3d, 0xaa, 0xb5, 0x2f, 0x3d, 0xaa, 0xa3, 0x9e, 0xbb,
+ 0x01, 0x2b, 0xd6, 0xbd, 0xa5, 0x6d, 0xb1, 0x3d, 0x2c, 0x3d, 0x97, 0xbc, 0x63,
+ 0xfb, 0x18, 0xbe, 0xb9, 0xa9, 0xcb, 0x3d, 0xb0, 0x7d, 0xb4, 0x3d, 0x22, 0x6a,
+ 0x65, 0x3d, 0x7a, 0xaf, 0xf5, 0xba, 0xed, 0x29, 0x0e, 0x3d, 0x5c, 0xd5, 0x6f,
+ 0xbd, 0xbe, 0xd9, 0xa0, 0xbc, 0x05, 0x8b, 0xe2, 0x3c, 0x35, 0xec, 0x8b, 0xbc,
+ 0xa9, 0x59, 0x0d, 0x3c, 0x0b, 0x4c, 0x56, 0x3c, 0x39, 0x59, 0xad, 0xbd, 0x41,
+ 0x06, 0xe3, 0xbd, 0xb1, 0xcd, 0xaa, 0x3d, 0xa8, 0xcc, 0xa1, 0xbd, 0x35, 0x63,
+ 0x36, 0xbd, 0x44, 0xf9, 0x43, 0x3c, 0xee, 0x2c, 0xdb, 0x3c, 0x79, 0xd4, 0x78,
+ 0x3d, 0x81, 0x34, 0x96, 0x3d, 0xc0, 0x43, 0xda, 0x3b, 0x9f, 0x9c, 0x0b, 0xbd,
+ 0xaf, 0x07, 0xac, 0x3d, 0xcf, 0xe3, 0xf0, 0x3c, 0x44, 0x9b, 0xf8, 0x3d, 0xd4,
+ 0x1f, 0x4e, 0xbd, 0xa6, 0xab, 0x9f, 0x3d, 0xcb, 0xd4, 0x30, 0x3d, 0x4b, 0xd4,
+ 0x17, 0x3d, 0x7e, 0xf2, 0x3d, 0x3b, 0x47, 0x47, 0xac, 0x3b, 0x2f, 0xda, 0xa8,
+ 0xbd, 0xb0, 0x53, 0xde, 0xbd, 0x2e, 0x06, 0xdc, 0x3d, 0x9a, 0x92, 0x9a, 0xbd,
+ 0x86, 0xf9, 0xf2, 0xbd, 0xb0, 0x9b, 0xd6, 0xbd, 0x8f, 0x36, 0x53, 0x3d, 0x09,
+ 0x68, 0x99, 0x3d, 0x25, 0xbb, 0xeb, 0x3d, 0x76, 0x5e, 0xfb, 0xbc, 0x24, 0x11,
+ 0x05, 0xbd, 0xcf, 0xaf, 0xb7, 0xbd, 0x97, 0xcd, 0x65, 0xbd, 0xeb, 0x59, 0xf7,
+ 0xb8, 0x95, 0x28, 0xb1, 0xbc, 0xff, 0xba, 0x91, 0xbd, 0x58, 0x33, 0xf0, 0x3c,
+ 0x42, 0x68, 0xd9, 0xbd, 0xa7, 0x71, 0x95, 0xbb, 0x41, 0x0b, 0x6a, 0x3d, 0xe4,
+ 0x83, 0x06, 0x3d, 0xae, 0x90, 0xa0, 0xbd, 0xfe, 0xf5, 0x27, 0xbd, 0x7f, 0xdc,
+ 0xb4, 0x3d, 0x32, 0xf0, 0x75, 0xbd, 0x99, 0xfa, 0x7b, 0x3d, 0x5f, 0xca, 0x7a,
+ 0x3d, 0xd9, 0x7e, 0x49, 0xbd, 0x7f, 0x2b, 0x5b, 0x3d, 0x02, 0x92, 0x46, 0xbb,
+ 0x20, 0x77, 0x5b, 0x3c, 0x57, 0xa6, 0xd1, 0x3a, 0x74, 0x68, 0xb2, 0xbd, 0xa2,
+ 0x4c, 0x0a, 0xbe, 0xb9, 0xcf, 0x43, 0xbd, 0xd6, 0x2e, 0x2d, 0xbc, 0x0f, 0x5d,
+ 0xde, 0x3d, 0xfc, 0xdc, 0x1c, 0xb9, 0x6d, 0x7b, 0x91, 0xbc, 0x33, 0x39, 0x97,
+ 0x3d, 0x37, 0xcf, 0x1f, 0x3d, 0xb3, 0x0b, 0xe3, 0x3d, 0x45, 0xbe, 0xa0, 0x3d,
+ 0xda, 0x7c, 0x0e, 0x3d, 0x66, 0xd7, 0x25, 0xbd, 0xa7, 0xe0, 0x0f, 0x3d, 0xd2,
+ 0x48, 0x8f, 0xbc, 0x2b, 0xbd, 0x9a, 0x3d, 0xf9, 0xe3, 0xd9, 0x3d, 0x0d, 0x1e,
+ 0xf3, 0x3c, 0x12, 0xc5, 0xfe, 0xbc, 0x59, 0x75, 0x9f, 0x3c, 0x76, 0x0e, 0x46,
+ 0xbd, 0xa3, 0x5d, 0xb9, 0x3d, 0x8c, 0x5a, 0xc9, 0x3c, 0xb5, 0x90, 0xbd, 0x3d,
+ 0xe5, 0xaa, 0x42, 0x3d, 0xaf, 0x43, 0x9b, 0xbd, 0x50, 0x0e, 0xc9, 0xbc, 0xea,
+ 0x53, 0x75, 0x3d, 0xfd, 0x0d, 0x4b, 0x3d, 0x7d, 0xc8, 0x17, 0x3d, 0xdd, 0xf0,
+ 0xb5, 0xbd, 0x00, 0x53, 0xf4, 0xba, 0xa6, 0x3a, 0x54, 0xbd, 0x7f, 0x57, 0x5f,
+ 0xbd, 0x00, 0x98, 0x56, 0xbd, 0xe6, 0x33, 0xbe, 0x3c, 0xe2, 0x66, 0x96, 0x3c,
+ 0x41, 0x08, 0x88, 0x3c, 0x66, 0x40, 0x88, 0xbd, 0xfd, 0x89, 0xbb, 0x3d, 0xa6,
+ 0xde, 0x99, 0x3a, 0xa4, 0x22, 0xf4, 0x3c, 0x94, 0xbc, 0xaf, 0xbd, 0x94, 0x01,
+ 0xcd, 0xbd, 0x89, 0x93, 0x0d, 0x3d, 0x74, 0x5a, 0xdf, 0x3b, 0x5b, 0x0a, 0xce,
+ 0xbd, 0xee, 0x6d, 0x87, 0x3d, 0x7c, 0x6a, 0xb0, 0x3d, 0x6d, 0xb0, 0x7b, 0x3c,
+ 0x6f, 0xb8, 0x4e, 0x3d, 0x06, 0x6a, 0x25, 0xbd, 0x7c, 0xb9, 0xcc, 0x3d, 0xf5,
+ 0x54, 0xb0, 0xbd, 0xf3, 0xf9, 0xe1, 0xbd, 0xcf, 0x6d, 0x91, 0x3c, 0x8d, 0x15,
+ 0xa4, 0x3c, 0x15, 0xa1, 0x86, 0x3d, 0x47, 0x35, 0xc3, 0xbd, 0x34, 0xa8, 0x16,
+ 0xbd, 0x11, 0xda, 0x49, 0x3d, 0x45, 0xb4, 0x61, 0x3d, 0x41, 0x15, 0xbf, 0xbc,
+ 0xd4, 0x07, 0xfa, 0x3d, 0xb0, 0x3a, 0x18, 0x3d, 0xda, 0x7f, 0x69, 0xbd, 0x6b,
+ 0xec, 0x9f, 0xbd, 0x6e, 0xfc, 0xe6, 0x3d, 0xc9, 0x5d, 0xb4, 0x3d, 0xa2, 0x1d,
+ 0x12, 0xbc, 0x51, 0x23, 0xce, 0xbd, 0x0a, 0x20, 0x86, 0xbc, 0xc4, 0x1f, 0xbe,
+ 0x3d, 0x18, 0x10, 0x6a, 0x3d, 0xe1, 0x58, 0x9f, 0x3c, 0x22, 0x7f, 0xc9, 0xbc,
+ 0x1a, 0xed, 0x1e, 0xbe, 0x47, 0x93, 0x87, 0x3c, 0x4d, 0x77, 0x31, 0xbc, 0xf9,
+ 0x29, 0xb2, 0x3d, 0xa9, 0xb3, 0x77, 0xbd, 0x43, 0x16, 0x0a, 0x3d, 0x88, 0x2f,
+ 0x98, 0x3d, 0x3b, 0x7c, 0x2b, 0x3d, 0xfc, 0x29, 0x07, 0x3e, 0xa6, 0x27, 0x93,
+ 0xbd, 0x5a, 0xa8, 0x13, 0xbe, 0xa8, 0xb8, 0x88, 0xbd, 0x9b, 0x64, 0xc5, 0xbc,
+ 0xef, 0xb1, 0xe6, 0x3d, 0x33, 0x47, 0xc3, 0x38, 0x56, 0x92, 0x7b, 0xbd, 0x87,
+ 0x81, 0xc7, 0x3c, 0x94, 0xe2, 0x21, 0x3c, 0xc2, 0x28, 0x75, 0x3d, 0xb7, 0x6f,
+ 0x8b, 0xbd, 0x2b, 0xdd, 0x09, 0xbc, 0x1f, 0xb9, 0xbc, 0xbd, 0xd6, 0xef, 0x90,
+ 0xbd, 0x52, 0xc7, 0xa5, 0xbc, 0xf7, 0x2c, 0x4d, 0x3c, 0xc7, 0xfe, 0x94, 0x3c,
+ 0x24, 0x12, 0x46, 0xbc, 0x95, 0x3b, 0x59, 0x3c, 0x64, 0x96, 0xd7, 0xbc, 0xb3,
+ 0x3c, 0xc7, 0xbd, 0xe6, 0x41, 0xbc, 0x3d, 0x70, 0xd8, 0x5c, 0x3b, 0xe2, 0x16,
+ 0x88, 0xbd, 0x21, 0x12, 0xfc, 0x3d, 0xbd, 0x55, 0x1e, 0xbe, 0x3a, 0xf9, 0x1f,
+ 0xbd, 0x59, 0xd3, 0x27, 0xbd, 0x14, 0x3b, 0xd7, 0x3d, 0x13, 0xf9, 0x66, 0x3d,
+ 0x79, 0x92, 0x77, 0xbd, 0x9a, 0x35, 0x63, 0x3d, 0x07, 0xf2, 0x75, 0xbc, 0xc1,
+ 0x6f, 0x73, 0x3d, 0x0f, 0x02, 0xc2, 0x3c, 0xd0, 0x45, 0x0c, 0x3d, 0x37, 0x87,
+ 0x5e, 0x3d, 0x03, 0x9e, 0xce, 0x3d, 0x2b, 0x90, 0x13, 0xbd, 0xf4, 0x1a, 0xc5,
+ 0xbd, 0xdf, 0x42, 0xdb, 0x3d, 0x47, 0x02, 0x58, 0xbd, 0x0f, 0x74, 0x1a, 0xbd,
+ 0x1d, 0x5f, 0x05, 0x3d, 0x99, 0x81, 0xff, 0xbc, 0x56, 0x85, 0xb3, 0x3d, 0xac,
+ 0x62, 0x17, 0xbd, 0xaa, 0x30, 0xc3, 0x3d, 0xdc, 0x53, 0x0f, 0xbe, 0x9b, 0x95,
+ 0x49, 0x3d, 0xf8, 0x4e, 0xa7, 0x3d, 0x76, 0x74, 0x10, 0xbd, 0x2c, 0xe0, 0x9c,
+ 0x3d, 0x7b, 0xc1, 0xc7, 0xbd, 0x15, 0x39, 0xe6, 0x3d, 0x52, 0xb3, 0xff, 0xbd,
+ 0x72, 0x77, 0xd3, 0x3d, 0x6a, 0xc4, 0xfb, 0x3c, 0x27, 0x15, 0x5b, 0x3d, 0xba,
+ 0xa2, 0x6b, 0xbd, 0x2b, 0xbc, 0x02, 0x3e, 0x6c, 0x7c, 0xda, 0x3c, 0x24, 0xa1,
+ 0x61, 0xbb, 0xfb, 0x9b, 0xc9, 0xbc, 0x20, 0xcb, 0x93, 0xbc, 0x95, 0x98, 0x6c,
+ 0xbd, 0x96, 0x34, 0xda, 0x3d, 0x5b, 0xa3, 0xe1, 0xbc, 0x71, 0xff, 0x07, 0x3d,
+ 0x5e, 0x18, 0xd0, 0xbd, 0xc1, 0x9e, 0x26, 0x3e, 0x8b, 0x3d, 0x9c, 0x3d, 0x90,
+ 0xe5, 0x84, 0x3d, 0x0d, 0xaa, 0x37, 0x3b, 0x99, 0x2d, 0xf6, 0x3c, 0x40, 0x23,
+ 0xca, 0x3d, 0x1c, 0x56, 0xb4, 0xbd, 0xa9, 0x04, 0x97, 0xbd, 0x41, 0xa7, 0x9e,
+ 0x3a, 0xb3, 0xfe, 0xb9, 0xbd, 0xf9, 0x34, 0x02, 0xbd, 0x44, 0x97, 0xb4, 0xbd,
+ 0x67, 0x43, 0x80, 0xbd, 0xb0, 0xce, 0x36, 0xbd, 0x28, 0x48, 0xa2, 0x3d, 0x32,
+ 0x52, 0xd3, 0x3d, 0x2a, 0xd4, 0x12, 0x3e, 0x8e, 0x41, 0xd5, 0x3c, 0x5e, 0x6b,
+ 0x64, 0xbd, 0x19, 0x1a, 0xee, 0xbd, 0x91, 0xf3, 0xb1, 0xbb, 0x9e, 0x4f, 0x9b,
+ 0x3d, 0x50, 0x3a, 0x9d, 0x3d, 0x25, 0xbc, 0xb5, 0xbd, 0xf7, 0xd6, 0x7b, 0x3d,
+ 0x69, 0x87, 0x94, 0xbb, 0xed, 0x33, 0x31, 0xbd, 0x8f, 0xf3, 0xaa, 0xbd, 0x5b,
+ 0x0b, 0xc0, 0x3d, 0xd9, 0xac, 0x60, 0xbd, 0x24, 0xa6, 0x9c, 0x3d, 0xfb, 0x17,
+ 0x3f, 0x3d, 0x49, 0x6a, 0x97, 0x3d, 0x02, 0xe9, 0xef, 0xbd, 0x44, 0xbe, 0xb5,
+ 0xbc, 0x61, 0x77, 0x94, 0xbb, 0x9e, 0x6d, 0xe1, 0xbc, 0xfa, 0x8c, 0xf2, 0xbc,
+ 0x9c, 0xfc, 0x45, 0xbd, 0xed, 0x91, 0xde, 0xbd, 0xcd, 0xa8, 0xe7, 0x3d, 0x4e,
+ 0x05, 0x10, 0xbe, 0x33, 0x4d, 0xa1, 0x3c, 0x01, 0x95, 0x91, 0x3d, 0x33, 0xf9,
+ 0x13, 0xbd, 0x78, 0x50, 0x03, 0xbd, 0x7f, 0xa1, 0xd7, 0xbd, 0x0f, 0xe3, 0x92,
+ 0x3d, 0x46, 0x19, 0x9e, 0x3d, 0xa8, 0xa7, 0x06, 0xbc, 0x0e, 0x64, 0xa6, 0x3d,
+ 0xb4, 0x52, 0xe8, 0xbd, 0x87, 0xc6, 0x8f, 0xbd, 0x50, 0x8c, 0xbf, 0xbb, 0x76,
+ 0x39, 0x34, 0x3d, 0xd2, 0x2f, 0x0b, 0xbd, 0xf4, 0xa3, 0x51, 0xbd, 0xb0, 0x28,
+ 0x7d, 0xbd, 0x83, 0x61, 0x57, 0x3d, 0xca, 0x95, 0xb5, 0x3d, 0xdc, 0x22, 0x32,
+ 0xbc, 0x58, 0xb3, 0x69, 0xbd, 0x09, 0x10, 0x79, 0x3c, 0x3c, 0x79, 0x35, 0xbd,
+ 0xa0, 0x99, 0xa9, 0xbd, 0xdf, 0x93, 0x18, 0x3e, 0x6f, 0x5f, 0xad, 0x3d, 0xb2,
+ 0x0b, 0x8e, 0xbd, 0xf5, 0xf2, 0xaa, 0x3d, 0xf2, 0x2e, 0xa9, 0xbd, 0xf6, 0xe2,
+ 0x23, 0x3d, 0x17, 0xa2, 0xaf, 0x3d, 0xd9, 0x35, 0x8e, 0xbd, 0xf1, 0x8d, 0x08,
+ 0x3e, 0xcc, 0x76, 0xb4, 0xbd, 0x71, 0xb4, 0xc9, 0xbd, 0x00, 0x10, 0xd4, 0xbc,
+ 0xbe, 0x87, 0xf0, 0x3c, 0xe8, 0x15, 0xad, 0xbd, 0xfb, 0x2e, 0x5e, 0xbd, 0x6f,
+ 0x3b, 0x99, 0xbc, 0x77, 0xc7, 0xe5, 0xbd, 0xf4, 0x52, 0x03, 0xbe, 0x74, 0x7b,
+ 0x00, 0xbe, 0xe8, 0x51, 0x8c, 0x3d, 0xe1, 0x8d, 0x1c, 0xbc, 0x3d, 0x3c, 0x16,
+ 0x3d, 0x94, 0x51, 0xd5, 0x3d, 0xff, 0x2e, 0xb0, 0x3d, 0xf5, 0x3c, 0xaa, 0xbc,
+ 0x39, 0x6b, 0xb2, 0x3d, 0x1f, 0x8b, 0x44, 0x3d, 0xe4, 0xa4, 0xa8, 0x3d, 0xa9,
+ 0xbc, 0x81, 0x3d, 0x67, 0x10, 0x83, 0xbd, 0x03, 0x1b, 0x08, 0x3d, 0xed, 0xef,
+ 0x29, 0x3d, 0x46, 0x38, 0x58, 0xbc, 0x98, 0x03, 0xa3, 0x3d, 0x7d, 0xd6, 0x34,
+ 0xbd, 0x36, 0xbd, 0xf7, 0x3d, 0xe7, 0xf9, 0x5d, 0xbd, 0x9c, 0x88, 0x87, 0x3d,
+ 0x85, 0x7d, 0xa3, 0x3d, 0x81, 0x29, 0x75, 0xbc, 0xca, 0x17, 0x97, 0x3d, 0xbf,
+ 0xd1, 0x04, 0x3e, 0xc9, 0x18, 0xfa, 0x3b, 0x0f, 0x59, 0xc3, 0x3d, 0x40, 0xa6,
+ 0x05, 0xbd, 0x5e, 0x98, 0x8d, 0x3c, 0x8f, 0x73, 0xff, 0x3c, 0xb2, 0x58, 0xde,
+ 0xbc, 0x97, 0x10, 0x04, 0xbd, 0x2d, 0xd2, 0x1c, 0x3d, 0xac, 0x03, 0x6e, 0xbd,
+ 0xa8, 0x9a, 0xa8, 0x3d, 0x1c, 0x0e, 0x41, 0x3d, 0x30, 0x7a, 0xab, 0xbd, 0xec,
+ 0x58, 0x14, 0xbd, 0xac, 0xe9, 0x9e, 0xbb, 0x0b, 0x14, 0x02, 0x3d, 0xac, 0x78,
+ 0x00, 0x3e, 0xa1, 0xb6, 0xc2, 0xbd, 0x04, 0x51, 0x91, 0xbc, 0x57, 0x51, 0xf1,
+ 0xbd, 0x95, 0x42, 0x49, 0x3d, 0x91, 0x54, 0xa2, 0x3c, 0xbd, 0x0f, 0x03, 0xbe,
+ 0x0a, 0xf8, 0x17, 0xbd, 0xbb, 0x25, 0x14, 0x3d, 0xf2, 0x00, 0x19, 0xbd, 0x79,
+ 0xea, 0x85, 0xbd, 0x4a, 0xf9, 0xb6, 0xbc, 0x4f, 0x1c, 0x34, 0xbc, 0x2e, 0x3e,
+ 0x31, 0x3d, 0xe3, 0x63, 0x5e, 0xbd, 0x63, 0xf1, 0xaf, 0x3d, 0x4e, 0xee, 0xaa,
+ 0x3d, 0x91, 0xc0, 0xcc, 0xbc, 0xc3, 0x43, 0xb2, 0xbc, 0xab, 0x9d, 0x54, 0xbd,
+ 0x0b, 0x92, 0xa3, 0xbc, 0xc5, 0xe0, 0xf6, 0x3d, 0xb5, 0x2d, 0x52, 0xbd, 0x89,
+ 0x8d, 0xf0, 0xbd, 0xd4, 0x40, 0x0c, 0xbe, 0x88, 0xf8, 0xaa, 0x3d, 0xc6, 0x0d,
+ 0x10, 0x3d, 0xe0, 0x7d, 0xcb, 0xbc, 0x14, 0x58, 0xba, 0x3a, 0x11, 0x9d, 0x24,
+ 0xbd, 0x14, 0x54, 0x03, 0x3b, 0x2c, 0xb4, 0x7d, 0x3c, 0x5a, 0x71, 0x99, 0xbd,
+ 0x5d, 0xa3, 0xa3, 0xbd, 0xfc, 0xd0, 0xe5, 0x39, 0x4a, 0x6c, 0xf8, 0xbd, 0x81,
+ 0x0e, 0xab, 0x3d, 0x0d, 0x40, 0x9a, 0x3d, 0x89, 0xff, 0x07, 0x3d, 0xd4, 0x8c,
+ 0x97, 0x3b, 0x8a, 0x7a, 0xc5, 0x3c, 0xbb, 0xbf, 0xe3, 0x3a, 0xcb, 0x47, 0x41,
+ 0x3d, 0x80, 0x8d, 0x29, 0x3d, 0x16, 0xe7, 0xf6, 0xbc, 0x01, 0x5f, 0xc0, 0x3d,
+ 0xf1, 0x20, 0xe3, 0xbc, 0xec, 0x9f, 0x29, 0x3e, 0x8f, 0x46, 0x8d, 0x3d, 0x20,
+ 0x99, 0xe9, 0x3c, 0x90, 0x04, 0x00, 0x3e, 0x35, 0xda, 0xba, 0xbd, 0x6c, 0xc5,
+ 0x5b, 0x3d, 0x9a, 0x42, 0x41, 0xbd, 0x1a, 0x84, 0x6f, 0x3d, 0x94, 0xc4, 0x0c,
+ 0xbd, 0x08, 0x43, 0x8a, 0x3d, 0xd8, 0xdb, 0xa4, 0x3d, 0xac, 0xc6, 0xa8, 0x3d,
+ 0xa5, 0xf4, 0xff, 0xb9, 0xdc, 0x01, 0x58, 0xbc, 0x43, 0x37, 0xf0, 0x3d, 0xed,
+ 0x73, 0x3b, 0xbd, 0x8d, 0x1f, 0x00, 0x3c, 0x4c, 0x89, 0x71, 0x3d, 0xb0, 0xbf,
+ 0x4e, 0x3d, 0x1e, 0x61, 0x83, 0xbd, 0x82, 0xf6, 0x02, 0xbe, 0x3c, 0x97, 0xf9,
+ 0x3d, 0x06, 0x96, 0x97, 0x3d, 0x5c, 0x13, 0xd7, 0xbd, 0xce, 0x77, 0x88, 0xbd,
+ 0x26, 0x76, 0xba, 0x3c, 0x46, 0x28, 0xc4, 0x3d, 0x35, 0x72, 0x8d, 0x3c, 0x3e,
+ 0x63, 0x81, 0xbd, 0x06, 0x13, 0x9b, 0x3d, 0xf9, 0x80, 0x20, 0x3d, 0x9c, 0xfb,
+ 0x94, 0x3c, 0x50, 0x2c, 0x16, 0xbd, 0xdb, 0x7d, 0x59, 0xbd, 0x7a, 0xa8, 0x8d,
+ 0x3d, 0x8b, 0x56, 0x94, 0xbd, 0xa5, 0x49, 0x8b, 0x3d, 0x76, 0xae, 0x99, 0xbc,
+ 0x6e, 0x40, 0x84, 0x3d, 0xe0, 0x5a, 0x40, 0xbd, 0x33, 0xb8, 0x0b, 0xbd, 0x96,
+ 0x14, 0x25, 0x3c, 0x3e, 0x5c, 0x78, 0xbd, 0x31, 0x40, 0x06, 0x3e, 0x05, 0x0b,
+ 0xb7, 0x3c, 0x24, 0x3e, 0xe5, 0xbd, 0x94, 0x06, 0x12, 0x3d, 0x14, 0x07, 0x96,
+ 0xbd, 0x14, 0x1d, 0x80, 0xbd, 0xfc, 0xd3, 0x66, 0xbd, 0xfa, 0xef, 0x67, 0x3d,
+ 0x62, 0x1e, 0x9f, 0x3c, 0x27, 0x05, 0x2a, 0xbc, 0xbb, 0x0b, 0xa2, 0x3d, 0x07,
+ 0x02, 0xaf, 0x3d, 0xcb, 0x9d, 0xc9, 0x3d, 0xbe, 0x5c, 0x15, 0x3b, 0x73, 0xc6,
+ 0x92, 0xbd, 0x70, 0x29, 0xe4, 0x3d, 0x46, 0xa2, 0xb2, 0xbc, 0x56, 0xb8, 0xe1,
+ 0x3d, 0x82, 0xf9, 0x0d, 0xbd, 0x9b, 0x59, 0xa8, 0xbd, 0x42, 0x59, 0x98, 0x3d,
+ 0xae, 0x31, 0x22, 0xbd, 0x0d, 0xa2, 0x1f, 0x3e, 0xc8, 0xfd, 0x58, 0xbc, 0x4e,
+ 0xd4, 0xca, 0x3d, 0xbd, 0x39, 0x81, 0xbd, 0x7c, 0x0a, 0x25, 0x3e, 0xdb, 0x88,
+ 0x7f, 0x3c, 0xf1, 0x64, 0x07, 0x3e, 0xd2, 0x99, 0x1d, 0x3d, 0x2c, 0xc9, 0xb0,
+ 0xbd, 0x7a, 0xe0, 0x9d, 0xbc, 0x9e, 0x93, 0x19, 0x3d, 0x7f, 0xfd, 0xd2, 0xbc,
+ 0xec, 0x44, 0xd5, 0x3d, 0x69, 0x81, 0xbf, 0x3d, 0x9e, 0xff, 0xac, 0x3c, 0x60,
+ 0x6b, 0x6a, 0xbd, 0xe6, 0x22, 0x48, 0xbd, 0x3b, 0xc4, 0xa3, 0xbd, 0x0c, 0xd3,
+ 0xf5, 0x3c, 0x08, 0x03, 0x62, 0x3c, 0x5c, 0x46, 0x16, 0x3e, 0xd3, 0x2a, 0xce,
+ 0x3c, 0xfc, 0x31, 0xa8, 0x3d, 0xbd, 0x02, 0x95, 0x3c, 0xe8, 0xc7, 0x7a, 0x3c,
+ 0xff, 0xc5, 0xf8, 0x3c, 0x3a, 0xb0, 0x79, 0x3b, 0xe6, 0xfd, 0x37, 0xbd, 0x5e,
+ 0xd3, 0x06, 0x3e, 0x21, 0x21, 0xe8, 0x3c, 0xa1, 0x6f, 0xf1, 0x3d, 0xa6, 0xc2,
+ 0x54, 0x3d, 0x9c, 0xae, 0x9c, 0x3d, 0xcb, 0xfd, 0x0a, 0x3c, 0x3e, 0x2e, 0x00,
+ 0xbd, 0xdc, 0xf2, 0x4b, 0xbd, 0x7a, 0xdf, 0xbd, 0x3d, 0xbd, 0x27, 0x8b, 0x3c,
+ 0x1c, 0x12, 0x2d, 0xbd, 0xf9, 0xf3, 0x28, 0x3e, 0x4c, 0x90, 0xb3, 0xbd, 0x49,
+ 0xfc, 0x84, 0x3d, 0x2e, 0xc1, 0x82, 0x3d, 0x54, 0xc7, 0x62, 0x3d, 0xcb, 0x24,
+ 0xf9, 0x3d, 0xf4, 0x6a, 0x2b, 0x3c, 0x38, 0x27, 0x1c, 0xbd, 0x05, 0xf1, 0xf5,
+ 0x3d, 0xc0, 0x87, 0xa2, 0x3d, 0x7e, 0x5c, 0x92, 0x3d, 0xef, 0x33, 0xad, 0x3d,
+ 0x34, 0xff, 0x43, 0x3d, 0x87, 0x47, 0xc6, 0x3d, 0x58, 0x18, 0x76, 0xbd, 0x1d,
+ 0x74, 0x9e, 0x3d, 0xae, 0x41, 0xb1, 0xbc, 0x7d, 0x42, 0x94, 0xbd, 0x37, 0x01,
+ 0x66, 0x3d, 0xb4, 0x18, 0x96, 0xbd, 0x69, 0x31, 0xc4, 0x3c, 0xe7, 0x09, 0x00,
+ 0xbe, 0x46, 0x1a, 0x2b, 0xbd, 0x76, 0xd4, 0x7b, 0xbd, 0x48, 0xcd, 0xfc, 0x3b,
+ 0xf9, 0x98, 0xf6, 0xbc, 0x33, 0x91, 0x2c, 0xbe, 0xe1, 0x08, 0xf5, 0xbd, 0xb0,
+ 0xcd, 0x79, 0x3d, 0xd3, 0x1d, 0x0f, 0x3e, 0x5a, 0x9f, 0x13, 0xbd, 0x7d, 0x6b,
+ 0x44, 0x3c, 0xcf, 0x14, 0x38, 0x3d, 0xe3, 0xfb, 0x47, 0x3d, 0x37, 0x1e, 0x2f,
+ 0x3c, 0x89, 0xa0, 0xb2, 0xbd, 0x89, 0x21, 0x81, 0xbd, 0x04, 0xda, 0xc5, 0x3d,
+ 0xa7, 0xa8, 0x16, 0xbc, 0x07, 0x2e, 0xc1, 0xbb, 0x8c, 0x6f, 0xc2, 0x3c, 0x3b,
+ 0x0c, 0x03, 0xbd, 0x74, 0xc2, 0xa5, 0x3d, 0x3f, 0xeb, 0xb2, 0xbd, 0x2f, 0x66,
+ 0x94, 0xbd, 0x4f, 0x30, 0xab, 0xbd, 0xc4, 0xdd, 0x45, 0x3d, 0x4a, 0xb7, 0x48,
+ 0x3d, 0x55, 0x77, 0x26, 0x3e, 0xbe, 0x1c, 0x96, 0xbb, 0x5b, 0xca, 0x62, 0xbd,
+ 0xcf, 0x1e, 0xd3, 0x3c, 0xa7, 0x0e, 0xb9, 0xbd, 0x67, 0x75, 0x2b, 0xbd, 0x26,
+ 0x12, 0xd5, 0xbc, 0xb6, 0x0f, 0xc0, 0xbd, 0x12, 0xab, 0x23, 0x3d, 0xf6, 0x23,
+ 0xb2, 0x3d, 0x3f, 0x71, 0x83, 0x3d, 0x2a, 0x08, 0x95, 0xbc, 0xd8, 0x6e, 0xdc,
+ 0xbd, 0x1c, 0x85, 0xa6, 0xbd, 0xc4, 0xbc, 0x52, 0xbd, 0xa8, 0xe0, 0x9c, 0x3d,
+ 0xf8, 0xa9, 0xe5, 0x3d, 0xfe, 0xbd, 0x9c, 0x3d, 0x9d, 0x62, 0xc3, 0x3c, 0xe6,
+ 0x95, 0xd6, 0xbc, 0x08, 0x07, 0x68, 0xbc, 0x99, 0x7b, 0xe4, 0xbd, 0xcf, 0x18,
+ 0xb0, 0x3d, 0xdb, 0x65, 0x8e, 0xbd, 0x47, 0x34, 0xa9, 0xbd, 0x65, 0xab, 0x0a,
+ 0xbe, 0xb3, 0x57, 0x24, 0xbe, 0x1f, 0xce, 0xa2, 0xbc, 0xd2, 0x8a, 0xb7, 0xbc,
+ 0x1e, 0xd4, 0x53, 0x3d, 0xec, 0x02, 0x14, 0xbd, 0xd7, 0xc2, 0x05, 0x3d, 0x05,
+ 0xe3, 0xcb, 0xbc, 0x18, 0xc7, 0x9d, 0x3d, 0x99, 0x69, 0x0a, 0xbe, 0xee, 0x58,
+ 0xa1, 0x3d, 0xae, 0xa3, 0x36, 0xbe, 0x5c, 0x5d, 0x9c, 0xbd, 0x39, 0xfb, 0x00,
+ 0xbd, 0x38, 0xcd, 0x70, 0xbd, 0x2f, 0x77, 0xf2, 0xbd, 0x8a, 0x7d, 0x74, 0xbd,
+ 0x4b, 0x08, 0x7b, 0xbd, 0x42, 0xaf, 0x4a, 0xba, 0x56, 0x2e, 0x80, 0xbd, 0x81,
+ 0x9b, 0xb9, 0x3d, 0xf0, 0x6d, 0x86, 0x3c, 0xfe, 0x53, 0x82, 0xbd, 0xb8, 0xac,
+ 0x56, 0xbd, 0xf7, 0xc9, 0x14, 0x3d, 0xea, 0xe6, 0x1f, 0xbd, 0x9f, 0x23, 0xd0,
+ 0xbd, 0x73, 0xd5, 0x6a, 0x3d, 0x24, 0xdb, 0xba, 0xbd, 0xf5, 0xf1, 0xda, 0xbc,
+ 0xe6, 0x8b, 0x34, 0xbd, 0x6c, 0x15, 0x8a, 0x3c, 0x26, 0x05, 0x63, 0x3d, 0x27,
+ 0xc2, 0x8b, 0xbd, 0x62, 0xb2, 0x83, 0x3d, 0x71, 0x11, 0x50, 0xbc, 0x67, 0x3d,
+ 0xe4, 0x3d, 0xa5, 0x3d, 0x59, 0xbd, 0x18, 0xa4, 0x70, 0x3c, 0x6b, 0x86, 0x9c,
+ 0x3d, 0xa6, 0xe4, 0xbf, 0x3d, 0x3a, 0x8f, 0xe2, 0xbd, 0xd7, 0xf8, 0x71, 0x3d,
+ 0x1d, 0x46, 0x00, 0xbd, 0x3c, 0x59, 0xc0, 0xbc, 0x1f, 0x60, 0x50, 0xbd, 0x91,
+ 0xe2, 0xe6, 0xbd, 0x4c, 0x72, 0xb6, 0xbd, 0x49, 0x1e, 0xba, 0x3d, 0xdd, 0x1e,
+ 0x77, 0xbc, 0x35, 0x26, 0xab, 0x3c, 0x63, 0x83, 0xd7, 0xbd, 0x41, 0x6f, 0xa8,
+ 0x3d, 0x6d, 0xf0, 0x50, 0xbd, 0xdc, 0x5f, 0x2f, 0xbd, 0x73, 0x67, 0xce, 0xbc,
+ 0x10, 0x47, 0x0b, 0xbd, 0xdc, 0x85, 0x41, 0x3c, 0xcd, 0x61, 0xc9, 0xbd, 0x9d,
+ 0x79, 0x77, 0x3d, 0xbd, 0xe5, 0xb5, 0xbd, 0xa4, 0x88, 0xf7, 0xbd, 0x43, 0xf7,
+ 0x5e, 0x3b, 0x95, 0x23, 0x26, 0xbd, 0x39, 0x1e, 0xa7, 0x3d, 0x60, 0xd5, 0x2e,
+ 0xbd, 0x78, 0xa7, 0x1b, 0x3d, 0xad, 0x5b, 0xcd, 0x3d, 0x73, 0xba, 0x9d, 0xbd,
+ 0xb7, 0xe0, 0x91, 0x3d, 0xa7, 0x90, 0x8e, 0x3d, 0x12, 0x0d, 0x11, 0x3d, 0x6d,
+ 0xf8, 0x9b, 0xbd, 0x7d, 0xd4, 0xdf, 0x3d, 0x67, 0x4c, 0xa3, 0x3d, 0x21, 0x33,
+ 0x88, 0xbc, 0xc8, 0xd2, 0xc7, 0xbd, 0x93, 0xea, 0x80, 0xbd, 0x4d, 0xe7, 0x42,
+ 0xbd, 0x0b, 0x43, 0xfb, 0xbc, 0xb0, 0x8c, 0x7f, 0xbc, 0x16, 0x83, 0xc3, 0x3d,
+ 0x42, 0xd0, 0x86, 0xbd, 0x7f, 0x6f, 0xa6, 0x3d, 0xed, 0xee, 0x4c, 0x3d, 0xc9,
+ 0x3e, 0x03, 0x3d, 0x72, 0x47, 0x9e, 0xbd, 0x2f, 0x66, 0xda, 0x3d, 0x3d, 0x45,
+ 0x80, 0x3b, 0x3c, 0xab, 0xa6, 0xbd, 0x73, 0xe8, 0x9f, 0xbd, 0xf6, 0x76, 0xc2,
+ 0xbd, 0x18, 0xaf, 0xb4, 0x3d, 0x94, 0x94, 0x9f, 0xbd, 0x46, 0xcd, 0xad, 0xbd,
+ 0xdb, 0xe6, 0x87, 0xbd, 0x67, 0x03, 0x07, 0x3d, 0x05, 0xc2, 0x84, 0xbc, 0xb7,
+ 0x1f, 0x8d, 0xbd, 0x19, 0x72, 0xa1, 0x3d, 0xd8, 0xa5, 0x52, 0x3d, 0x63, 0x90,
+ 0x03, 0xbd, 0xf5, 0xe3, 0xcd, 0x3d, 0xd8, 0xfb, 0x9c, 0x3d, 0x74, 0xd7, 0x06,
+ 0xbd, 0x8c, 0xb5, 0xdd, 0xbd, 0x20, 0x07, 0xba, 0xbd, 0x83, 0xa1, 0xd2, 0x3d,
+ 0x4c, 0x58, 0xe3, 0x3d, 0x31, 0x7d, 0xe1, 0xbd, 0x29, 0x06, 0xa1, 0xbd, 0x64,
+ 0xa9, 0x2e, 0xbd, 0x79, 0x6c, 0xb5, 0xbd, 0x8f, 0xe5, 0xac, 0x3d, 0x68, 0xc1,
+ 0xc3, 0x3c, 0xd5, 0xa7, 0xf2, 0xbd, 0x2e, 0x24, 0x40, 0xbd, 0xd6, 0x39, 0xe7,
+ 0x3d, 0xe0, 0xaf, 0x02, 0xbd, 0xe1, 0xd6, 0xe1, 0xbd, 0xfa, 0xa0, 0x25, 0x3d,
+ 0x26, 0xe8, 0x57, 0x3d, 0xa5, 0x58, 0xf6, 0xbd, 0xd2, 0x32, 0x0f, 0xbd, 0x8e,
+ 0xa1, 0x8d, 0x3c, 0xb6, 0x98, 0xce, 0xbc, 0x71, 0x96, 0xfa, 0xbc, 0xe2, 0x69,
+ 0x35, 0x3c, 0x3d, 0x07, 0x21, 0x3d, 0xc1, 0x9f, 0x8a, 0x3d, 0x0a, 0x9e, 0x64,
+ 0xbd, 0x3b, 0x91, 0x57, 0xbb, 0x99, 0x41, 0x8c, 0x3d, 0xcf, 0x60, 0x8f, 0xbd,
+ 0x5e, 0xe6, 0x25, 0xbd, 0xec, 0x60, 0xb0, 0xbd, 0xcf, 0xd7, 0x87, 0x3d, 0x1a,
+ 0x3f, 0x4e, 0xbd, 0xd7, 0xbf, 0x78, 0xbd, 0xe3, 0x77, 0xd9, 0x3d, 0x81, 0xd8,
+ 0x81, 0xbd, 0x52, 0x2a, 0xd3, 0x3d, 0xc1, 0x32, 0x80, 0xbd, 0xaa, 0xbf, 0x9d,
+ 0x3d, 0xbf, 0x21, 0x3b, 0x3d, 0x30, 0x5e, 0x9e, 0xbd, 0xfa, 0xf3, 0xda, 0xbc,
+ 0x41, 0xeb, 0x9c, 0xbd, 0x71, 0x88, 0xd3, 0xbc, 0xf1, 0x4c, 0x00, 0xbd, 0x38,
+ 0xd5, 0x2f, 0x3c, 0xcd, 0xd9, 0x3e, 0x3d, 0xf4, 0xf8, 0xa4, 0x3d, 0xbc, 0x2f,
+ 0x0e, 0xbd, 0x28, 0x35, 0x34, 0x3d, 0x3a, 0x20, 0x5c, 0x3d, 0x97, 0x22, 0xdb,
+ 0xbd, 0x75, 0xd3, 0x5f, 0xbd, 0xf9, 0x3b, 0x66, 0xbd, 0x4a, 0x18, 0xe7, 0xbb,
+ 0x4e, 0x21, 0x5d, 0xbd, 0x9c, 0x6c, 0x45, 0xbd, 0x2c, 0xb8, 0xe7, 0x3c, 0x65,
+ 0xbf, 0x45, 0x3d, 0x15, 0xbb, 0xa5, 0xbd, 0x7e, 0x1c, 0xba, 0xbd, 0xfa, 0x2d,
+ 0xfc, 0x3c, 0xc2, 0xfb, 0x20, 0xbd, 0x62, 0xc3, 0xa6, 0xbd, 0xae, 0x66, 0xc1,
+ 0x3b, 0x8e, 0x5e, 0x29, 0xbd, 0x1a, 0x5d, 0x27, 0xbd, 0xce, 0x36, 0xaf, 0xbd,
+ 0x6d, 0x03, 0xdd, 0x3d, 0xb5, 0x5d, 0x95, 0x3c, 0xd2, 0x9d, 0x60, 0xbd, 0xf0,
+ 0xb5, 0x60, 0xbc, 0x80, 0x21, 0x34, 0xbd, 0xf1, 0x05, 0xc8, 0x3b, 0x2c, 0x2a,
+ 0x2f, 0x3e, 0x99, 0x23, 0x3c, 0x3d, 0x73, 0x2f, 0xe4, 0x3d, 0xc8, 0x22, 0xce,
+ 0x3d, 0xbf, 0x98, 0xad, 0xbd, 0xa5, 0xb2, 0xd4, 0xbd, 0x6d, 0xca, 0x3b, 0xbe,
+ 0xd1, 0xa0, 0x95, 0x3c, 0xa0, 0xed, 0xe1, 0x3b, 0x8c, 0x5d, 0x6f, 0x3d, 0x10,
+ 0x04, 0x88, 0xbd, 0x76, 0x62, 0xe7, 0x3d, 0x53, 0x28, 0x8c, 0xbd, 0x7b, 0x4f,
+ 0x5d, 0xbd, 0x2e, 0x69, 0x8b, 0x3c, 0xe7, 0x7f, 0x79, 0x3c, 0x2e, 0xe5, 0xbf,
+ 0x3c, 0x56, 0x90, 0xf6, 0xbc, 0x8a, 0xc6, 0x3b, 0x3d, 0x86, 0xbf, 0xb8, 0xbd,
+ 0xe6, 0xf7, 0xd7, 0xbc, 0xc5, 0x96, 0xcb, 0x3d, 0x48, 0xe0, 0x9a, 0xbd, 0xd8,
+ 0xe1, 0x45, 0xbd, 0xa7, 0x00, 0xd7, 0xbd, 0xda, 0x57, 0x1c, 0xbc, 0x8e, 0x49,
+ 0x40, 0x3d, 0x8b, 0x52, 0x0a, 0x3d, 0xe2, 0xe8, 0x1b, 0xbd, 0x74, 0xd1, 0x0f,
+ 0x3e, 0x17, 0x20, 0xc1, 0x3d, 0x3a, 0xbe, 0x8a, 0xbd, 0xa4, 0xd5, 0xca, 0x3c,
+ 0x4f, 0x17, 0x82, 0xbc, 0x1f, 0xea, 0x09, 0xbd, 0x8e, 0xcb, 0xd0, 0x3d, 0x9c,
+ 0x1a, 0x36, 0xbd, 0x99, 0xee, 0x5b, 0xbd, 0x5c, 0x1d, 0x10, 0xbe, 0x9e, 0x99,
+ 0x22, 0x3d, 0x8f, 0x8f, 0xda, 0x3c, 0x42, 0xa7, 0x2e, 0x3d, 0x37, 0x33, 0x03,
+ 0xbe, 0x11, 0x7b, 0x8f, 0xbd, 0xb8, 0xa1, 0x7e, 0x3d, 0x31, 0x04, 0x62, 0x3d,
+ 0x93, 0x03, 0xfe, 0x3b, 0x59, 0x82, 0xa0, 0xbd, 0x07, 0xb8, 0x24, 0x3d, 0x7a,
+ 0x45, 0xf2, 0x3d, 0xab, 0xf4, 0xd7, 0xbd, 0x2f, 0xbd, 0xc6, 0x3d, 0xb2, 0x1c,
+ 0x47, 0x3d, 0xbe, 0xf6, 0xb2, 0x3d, 0xe2, 0xd0, 0x92, 0xbd, 0x0d, 0xec, 0xb2,
+ 0xbd, 0x40, 0x5c, 0xc0, 0xbd, 0xa8, 0xf7, 0x0e, 0x3c, 0xef, 0x56, 0xb1, 0xbd,
+ 0x91, 0x09, 0x4f, 0xbd, 0x47, 0x51, 0xcc, 0x3d, 0xcd, 0x6d, 0x85, 0xbd, 0xfe,
+ 0xb2, 0x6f, 0xbd, 0x3f, 0x9b, 0xec, 0x3c, 0x64, 0x20, 0x98, 0xbb, 0x82, 0x78,
+ 0x09, 0x3d, 0x2f, 0xbf, 0xe7, 0xbc, 0x5d, 0x5e, 0x01, 0xbd, 0x0c, 0xca, 0x4b,
+ 0x3d, 0xf2, 0xa2, 0x89, 0xbd, 0xa6, 0x59, 0x54, 0x3d, 0x62, 0x46, 0x04, 0x3c,
+ 0x99, 0x2f, 0x48, 0xbd, 0x22, 0x21, 0x1b, 0xbd, 0x07, 0x3b, 0xb4, 0xbd, 0x88,
+ 0x42, 0x0a, 0x3e, 0x7e, 0x29, 0xc3, 0xbb, 0xab, 0x7a, 0x86, 0x3d, 0xe7, 0x26,
+ 0xc0, 0x3c, 0xac, 0x99, 0x0f, 0xbd, 0x6e, 0xdb, 0x74, 0x3d, 0xba, 0x02, 0xdb,
+ 0x3d, 0x3c, 0x38, 0xae, 0x3d, 0xdf, 0x34, 0xe1, 0xbd, 0x53, 0xa6, 0x26, 0xbe,
+ 0x26, 0xa7, 0x82, 0x3d, 0x7b, 0x0f, 0x03, 0xbe, 0x85, 0xb6, 0xaa, 0xbc, 0xc5,
+ 0x08, 0xbf, 0x3c, 0x4f, 0xd1, 0xa8, 0xbb, 0x9f, 0x58, 0xa6, 0x3c, 0x51, 0xdc,
+ 0xfb, 0x3d, 0x2e, 0x30, 0xab, 0xbd, 0x38, 0x19, 0x19, 0x3c, 0xa2, 0x6a, 0x7c,
+ 0x3d, 0x1d, 0x52, 0xd5, 0xbc, 0x15, 0x5f, 0xb3, 0x3b, 0x9b, 0xd8, 0x75, 0xbd,
+ 0x5f, 0xa1, 0x13, 0xbd, 0xdc, 0xc7, 0xfd, 0xbb, 0x44, 0x9b, 0x73, 0xbd, 0x41,
+ 0x1d, 0x82, 0xbd, 0xa7, 0x0b, 0x15, 0x3c, 0x87, 0x91, 0x80, 0x3c, 0x74, 0x55,
+ 0xab, 0xbd, 0xf4, 0xb6, 0x3d, 0x3b, 0xa7, 0x2c, 0xcd, 0xbd, 0x19, 0xa5, 0x96,
+ 0xbc, 0xea, 0x8f, 0xfa, 0x3d, 0x98, 0x47, 0x12, 0xbd, 0xfc, 0x40, 0x62, 0x3d,
+ 0x72, 0x61, 0xa0, 0xbd, 0x79, 0x4d, 0x71, 0x3d, 0x2f, 0x4a, 0x89, 0x3d, 0xb8,
+ 0xdc, 0x98, 0x3d, 0x66, 0x46, 0x6f, 0x3d, 0xa2, 0xf2, 0x0d, 0x3d, 0x36, 0xf5,
+ 0xd4, 0x3c, 0xb9, 0xe5, 0x88, 0x3d, 0xa4, 0x93, 0x05, 0x3e, 0x64, 0x7e, 0x18,
+ 0xbe, 0xb6, 0x47, 0x76, 0x3d, 0x8e, 0x31, 0xca, 0x3d, 0x2f, 0x72, 0xf3, 0x3d,
+ 0x73, 0x45, 0x0d, 0x3e, 0xf4, 0x52, 0xfa, 0xbc, 0x40, 0x37, 0x88, 0xbd, 0x44,
+ 0x13, 0xae, 0xbc, 0x25, 0x7e, 0x0a, 0xbd, 0xbe, 0x26, 0x45, 0xbd, 0x2c, 0xf1,
+ 0x37, 0x3d, 0x29, 0xbd, 0x9f, 0xbd, 0xcb, 0xff, 0x1c, 0xbd, 0x62, 0xf2, 0xa0,
+ 0xba, 0x20, 0x57, 0xa8, 0xbc, 0xaa, 0xc1, 0x9c, 0xbd, 0xfb, 0xd0, 0x3b, 0x3d,
+ 0xe2, 0xae, 0x3f, 0x3d, 0x41, 0x4d, 0x93, 0x3d, 0x28, 0x11, 0xcc, 0x3d, 0x52,
+ 0x6e, 0x06, 0x3e, 0x8f, 0x9b, 0xc0, 0x3d, 0x40, 0xb0, 0xa4, 0xbc, 0xb0, 0x45,
+ 0x86, 0x3d, 0xc9, 0x85, 0x40, 0xbd, 0xfa, 0xdb, 0xe3, 0xbd, 0xf3, 0x0e, 0x9b,
+ 0x3d, 0x48, 0x39, 0x03, 0xbe, 0xc4, 0xfc, 0x2f, 0xbd, 0xb9, 0xbf, 0xbe, 0x3d,
+ 0xd9, 0x2f, 0x11, 0xbd, 0x71, 0x6a, 0x75, 0x3c, 0x89, 0x2b, 0xc2, 0xbd, 0x21,
+ 0x82, 0xd4, 0xbd, 0x36, 0xcc, 0xf5, 0x3d, 0xa3, 0x91, 0x3d, 0x3d, 0x16, 0xd1,
+ 0x7d, 0xbd, 0x40, 0xba, 0x75, 0x3b, 0x5a, 0x82, 0xfa, 0x3d, 0xc1, 0x09, 0xaf,
+ 0x3d, 0x1e, 0x44, 0xa3, 0x3d, 0xd7, 0x2a, 0x37, 0xbd, 0xd9, 0x72, 0xcc, 0x3d,
+ 0x58, 0x58, 0x9a, 0xbd, 0xea, 0x90, 0x35, 0xbc, 0x0e, 0x69, 0x92, 0x3c, 0x68,
+ 0x7e, 0x5c, 0xbc, 0x0a, 0xba, 0x55, 0x3d, 0x7e, 0xd4, 0xb9, 0x3b, 0x45, 0x5b,
+ 0xe7, 0xbd, 0x6b, 0xe6, 0xd5, 0xbc, 0xbc, 0x3e, 0x14, 0xbd, 0xe8, 0xb5, 0x09,
+ 0x3d, 0xbd, 0xde, 0xaf, 0x3d, 0xcf, 0x2d, 0x94, 0xbd, 0x12, 0x0f, 0xac, 0x3d,
+ 0x21, 0x99, 0xc2, 0xbd, 0x45, 0x93, 0x0d, 0x3d, 0x8a, 0x1e, 0xe4, 0x3d, 0xe8,
+ 0xfe, 0xb2, 0x3d, 0x0e, 0x69, 0xb8, 0xbd, 0xab, 0x2a, 0x91, 0xbc, 0x02, 0x24,
+ 0x8f, 0xbd, 0xef, 0x96, 0xa7, 0x3b, 0x39, 0x39, 0xda, 0xbd, 0x31, 0x03, 0xcd,
+ 0x3d, 0xe5, 0xf7, 0x4c, 0x3c, 0xca, 0x45, 0x3f, 0x3c, 0xb4, 0xf6, 0x8c, 0xbd,
+ 0x4a, 0x36, 0x4f, 0x3c, 0x5c, 0xe7, 0x56, 0x3d, 0xe3, 0x81, 0xd6, 0xbd, 0x44,
+ 0x9d, 0x3d, 0xbd, 0xb2, 0xf5, 0xe2, 0x3d, 0xaa, 0xd0, 0xff, 0xbc, 0x49, 0x86,
+ 0x4b, 0x3d, 0x79, 0x40, 0x51, 0xbd, 0x60, 0xd2, 0x91, 0xbd, 0x9d, 0x61, 0x26,
+ 0xbe, 0x32, 0x82, 0xe5, 0x3d, 0xa3, 0x28, 0xc5, 0xbc, 0x3f, 0x02, 0x08, 0xbd,
+ 0x9b, 0xe8, 0xca, 0x3d, 0xb4, 0x34, 0xed, 0x3c, 0x48, 0x7f, 0xea, 0x3d, 0xd6,
+ 0x07, 0xa1, 0xbd, 0xf9, 0xad, 0x18, 0x3c, 0xba, 0x0d, 0x8b, 0x3d, 0xa6, 0x13,
+ 0x0f, 0x3e, 0x25, 0xfc, 0x99, 0x3c, 0xc4, 0x8e, 0xc1, 0x3c, 0xfe, 0xa2, 0x14,
+ 0x3d, 0x0f, 0x96, 0xd5, 0xbc, 0x21, 0x99, 0xbb, 0xbc, 0xd7, 0x9c, 0xd1, 0x3d,
+ 0x14, 0xd2, 0xa2, 0x3d, 0x8b, 0x64, 0xd9, 0xbd, 0x11, 0x36, 0xa2, 0x3c, 0xec,
+ 0xbe, 0x24, 0xbd, 0x9f, 0x0f, 0x2a, 0x3d, 0x9d, 0xd5, 0xa6, 0xbd, 0xba, 0xe4,
+ 0x83, 0xbd, 0xc1, 0xce, 0x45, 0xbd, 0x4a, 0x99, 0x8c, 0xbd, 0xa0, 0x8d, 0x99,
+ 0x3b, 0xf1, 0x4b, 0x7a, 0xbc, 0x9d, 0x76, 0xd1, 0xbd, 0x65, 0x96, 0xd5, 0x3d,
+ 0x65, 0xd5, 0x0a, 0xbd, 0x03, 0xb9, 0x60, 0x3c, 0xbe, 0xb3, 0x0e, 0xbe, 0xf3,
+ 0x86, 0xf3, 0x3d, 0x28, 0xc1, 0x0f, 0x3d, 0x88, 0x69, 0xc0, 0xbc, 0x0e, 0x06,
+ 0x7e, 0x3d, 0x42, 0x82, 0xa5, 0x3d, 0x28, 0x95, 0x1b, 0x3d, 0xb7, 0x6d, 0xac,
+ 0xbd, 0xe0, 0xc9, 0x14, 0xbd, 0x5c, 0xf4, 0xb3, 0x3d, 0x74, 0x9e, 0xd4, 0xbd,
+ 0x8d, 0x9a, 0xed, 0x3c, 0x9c, 0xe3, 0x01, 0x3d, 0x08, 0x0d, 0xc5, 0xbd, 0xc5,
+ 0xba, 0xa7, 0xbd, 0xf2, 0xf8, 0x30, 0x3c, 0x41, 0x3c, 0xa8, 0x3d, 0x15, 0x63,
+ 0x60, 0xbd, 0x31, 0x27, 0xc6, 0xbc, 0x61, 0x0f, 0xe8, 0xbd, 0xcf, 0x0c, 0xbb,
+ 0xbc, 0xf5, 0x06, 0xbd, 0x3d, 0x99, 0x20, 0xb4, 0x3c, 0x5c, 0x27, 0x2d, 0xbd,
+ 0x5f, 0x29, 0x4b, 0xbd, 0xe6, 0x17, 0xef, 0x3d, 0x9c, 0x60, 0x84, 0xbd, 0x6a,
+ 0x76, 0xce, 0x3d, 0xf7, 0x48, 0x92, 0x3d, 0x6a, 0x72, 0xa3, 0x3d, 0x07, 0x7e,
+ 0x04, 0x3e, 0x71, 0x2a, 0xa8, 0x3d, 0x9a, 0x94, 0x74, 0x3d, 0x78, 0x1b, 0xf6,
+ 0x3d, 0x98, 0x1e, 0xfd, 0xbc, 0x3a, 0xf5, 0xc4, 0x39, 0x5f, 0x45, 0xc6, 0x3d,
+ 0x14, 0xc4, 0x8b, 0x3d, 0xea, 0x0c, 0x16, 0xbd, 0x43, 0x08, 0x98, 0x3c, 0x42,
+ 0x6d, 0x04, 0x3d, 0x8f, 0x4f, 0xc5, 0xbd, 0x88, 0x9e, 0x35, 0xbd, 0xfd, 0x1d,
+ 0xfc, 0xbc, 0x82, 0x9f, 0xa5, 0x3c, 0xfe, 0xe2, 0x30, 0xbc, 0x6a, 0x80, 0xf1,
+ 0x3c, 0xc0, 0x61, 0x39, 0x3d, 0xcd, 0x81, 0x08, 0xbe, 0x6f, 0xa9, 0xa9, 0xbd,
+ 0x51, 0x50, 0x2b, 0xba, 0xaa, 0xd4, 0xa1, 0xbd, 0x13, 0x64, 0xdf, 0xbd, 0xa4,
+ 0xd4, 0x5c, 0xbc, 0x2d, 0x83, 0xad, 0xbd, 0xc3, 0x31, 0x07, 0x3d, 0x7d, 0x7a,
+ 0x97, 0xbc, 0xa7, 0x23, 0xf7, 0xbd, 0x61, 0x7f, 0xda, 0xbd, 0x1d, 0x39, 0xd4,
+ 0xbd, 0x0b, 0x50, 0x8f, 0xbc, 0xfc, 0xa2, 0x06, 0x3e, 0x7b, 0x0e, 0x90, 0x3d,
+ 0xf8, 0xa0, 0x9d, 0xbd, 0x25, 0x0f, 0x6d, 0x3d, 0xae, 0x7f, 0xb7, 0xbc, 0xe9,
+ 0x1f, 0x10, 0xbe, 0x5b, 0x7f, 0x52, 0xbd, 0xe5, 0x86, 0x0d, 0xbd, 0x03, 0x12,
+ 0x58, 0x3c, 0xee, 0x04, 0xaa, 0xbd, 0x08, 0x85, 0x0a, 0x3d, 0x73, 0x0b, 0x93,
+ 0xbd, 0x4c, 0x42, 0x0d, 0xbd, 0xe9, 0xa4, 0x7f, 0x3d, 0x3b, 0x8a, 0xa8, 0x3c,
+ 0xa6, 0x4d, 0x88, 0x3d, 0x44, 0xe9, 0x1e, 0x3c, 0x05, 0x39, 0xd0, 0x3d, 0x09,
+ 0xc4, 0xc7, 0x3b, 0xdb, 0x43, 0x88, 0xbd, 0xb2, 0x44, 0x9d, 0x3d, 0x00, 0x42,
+ 0x13, 0xbe, 0x25, 0x15, 0x9a, 0x3d, 0xee, 0x5d, 0x9d, 0x3d, 0x04, 0x63, 0x5b,
+ 0xbb, 0x67, 0x1c, 0x9e, 0x3d, 0xe1, 0x8e, 0xb4, 0x3d, 0x68, 0xae, 0x8c, 0x3d,
+ 0x1a, 0xdc, 0xac, 0x3d, 0xdb, 0x00, 0x86, 0x3d, 0x60, 0xb7, 0x07, 0xbd, 0x92,
+ 0x7c, 0xbc, 0xbd, 0x47, 0xb6, 0x8f, 0x3c, 0x16, 0x03, 0xc1, 0x3d, 0xbb, 0x65,
+ 0x94, 0x3d, 0x0c, 0x98, 0x05, 0xbd, 0xf1, 0xe1, 0xc2, 0x3d, 0xb5, 0xf2, 0x01,
+ 0xbe, 0xf2, 0xe0, 0x01, 0x3d, 0xb4, 0x4a, 0xa5, 0x3d, 0x7c, 0x67, 0x97, 0x3d,
+ 0xa4, 0xbe, 0x52, 0x3d, 0x17, 0x60, 0x1c, 0x3d, 0x95, 0x83, 0x5b, 0xbc, 0x33,
+ 0x59, 0xd3, 0xbd, 0x45, 0x05, 0xf7, 0xbd, 0xa5, 0x82, 0xbe, 0x3d, 0x91, 0xc4,
+ 0x46, 0x3d, 0x5c, 0x4b, 0x27, 0xb8, 0x32, 0xe3, 0xf9, 0x3c, 0xdf, 0xcb, 0xcc,
+ 0x3d, 0xc3, 0x94, 0x6f, 0xbd, 0x10, 0xa2, 0xec, 0x3d, 0x2e, 0xaf, 0x09, 0xbc,
+ 0x49, 0x91, 0x8d, 0x3d, 0x6e, 0xc8, 0xc5, 0xbc, 0x45, 0x0e, 0x66, 0xbc, 0x37,
+ 0xd6, 0xfd, 0xbc, 0x2a, 0xea, 0x81, 0xbd, 0xf7, 0xc2, 0xc2, 0x3d, 0x12, 0x27,
+ 0x6b, 0x3c, 0x97, 0x69, 0xf3, 0x3b, 0xc8, 0xb7, 0xa6, 0xbc, 0xd6, 0xdf, 0x96,
+ 0xbc, 0xe0, 0x8a, 0x1b, 0x3e, 0xe3, 0x34, 0xc5, 0x3c, 0x96, 0xcd, 0x12, 0xbe,
+ 0xcd, 0x75, 0x5a, 0x3c, 0x81, 0xd5, 0xd6, 0xbd, 0x2f, 0x97, 0x6e, 0xbd, 0x92,
+ 0x28, 0x45, 0xbc, 0x81, 0xaf, 0xce, 0x3d, 0xc3, 0x35, 0xd3, 0x3d, 0x97, 0x1f,
+ 0x99, 0x3c, 0x48, 0xb6, 0x5b, 0x3d, 0x98, 0x96, 0x9d, 0x3d, 0xed, 0x0a, 0xa3,
+ 0x3c, 0x5e, 0x72, 0xe5, 0xbb, 0xad, 0x65, 0xaa, 0xbd, 0x16, 0x57, 0x8c, 0xbd,
+ 0x4a, 0x37, 0x6b, 0xbd, 0x18, 0x35, 0xbe, 0xbd, 0xa8, 0xaa, 0x07, 0xbd, 0xbe,
+ 0xcb, 0xf5, 0xbb, 0xbe, 0x69, 0xad, 0x3c, 0x1f, 0x82, 0x54, 0x3d, 0x32, 0xbe,
+ 0x87, 0xbd, 0x67, 0x54, 0x41, 0x3d, 0x46, 0xb6, 0x2e, 0xbd, 0x04, 0xb2, 0x75,
+ 0x3c, 0xb8, 0xf0, 0xcd, 0xbc, 0x63, 0x01, 0x7f, 0x3d, 0x92, 0xb6, 0x84, 0xbd,
+ 0x43, 0x6b, 0xe0, 0x3d, 0x4a, 0xa8, 0xb3, 0x3c, 0x05, 0x93, 0x8f, 0xbd, 0xca,
+ 0xa0, 0x84, 0x3d, 0x84, 0x4b, 0x27, 0x3e, 0x68, 0xce, 0xe2, 0xbd, 0x30, 0x5d,
+ 0x22, 0x3d, 0xa3, 0x3c, 0xc0, 0x3d, 0xc3, 0xa5, 0x37, 0xbd, 0xc8, 0xb2, 0xa3,
+ 0x3d, 0x79, 0xee, 0x82, 0x3d, 0xc6, 0xb3, 0xab, 0x3a, 0x72, 0xa4, 0x65, 0xbb,
+ 0x5c, 0x20, 0xa7, 0x3d, 0xdd, 0xd9, 0xe5, 0xba, 0xbe, 0xcb, 0x9d, 0xbd, 0xdc,
+ 0x19, 0xc5, 0xbd, 0xa8, 0x93, 0xc8, 0x3d, 0x4d, 0x2f, 0x1a, 0x3d, 0x24, 0x73,
+ 0xa2, 0x3d, 0x11, 0xb1, 0x08, 0x3e, 0x8a, 0x27, 0xcf, 0x3d, 0xb6, 0xee, 0xab,
+ 0xbd, 0x1f, 0xd7, 0xe1, 0x3d, 0x5d, 0xcf, 0x5f, 0xbd, 0x8e, 0xa9, 0xb0, 0x3c,
+ 0x86, 0xb9, 0x31, 0x3d, 0xd7, 0xa8, 0x92, 0xbd, 0x7f, 0x37, 0xd0, 0x3d, 0x4c,
+ 0xbb, 0xb6, 0x3d, 0xa4, 0x4d, 0x09, 0xbd, 0xc5, 0x8e, 0x0f, 0xbd, 0xbf, 0x27,
+ 0xa8, 0xbd, 0x62, 0x94, 0xb2, 0x3d, 0x2d, 0x35, 0xe8, 0x3d, 0xd5, 0x78, 0xee,
+ 0xbd, 0x2a, 0x5b, 0x5a, 0xbd, 0x72, 0x89, 0x4d, 0x3d, 0x7f, 0x5b, 0xfd, 0xb8,
+ 0x11, 0x80, 0x58, 0xbd, 0x69, 0xa9, 0xbc, 0xbc, 0xdb, 0xe9, 0xd3, 0xbc, 0x45,
+ 0x3b, 0xf5, 0xbc, 0xa6, 0x28, 0xc5, 0x3d, 0xe2, 0x48, 0x31, 0x3d, 0x49, 0xab,
+ 0x36, 0x3b, 0xca, 0xd2, 0xc6, 0xbc, 0x29, 0x1f, 0x5a, 0x3d, 0x90, 0xe6, 0x3b,
+ 0xbd, 0xf7, 0x5f, 0xa0, 0x3d, 0xb7, 0xc1, 0x91, 0x3d, 0x18, 0xcc, 0xc4, 0x3c,
+ 0x0a, 0xc0, 0x8a, 0xbd, 0x2a, 0x5e, 0x63, 0xbd, 0xa1, 0x2f, 0xb7, 0xbc, 0xf2,
+ 0xfb, 0xac, 0x3b, 0xa4, 0xed, 0x17, 0x3d, 0xc1, 0x09, 0x59, 0xbd, 0xe9, 0xf7,
+ 0xf4, 0x3d, 0xad, 0xe5, 0x8f, 0xbd, 0xa9, 0x9e, 0xd0, 0x3d, 0x0a, 0x98, 0x40,
+ 0xbd, 0xbc, 0x1f, 0x95, 0x3d, 0x0b, 0x17, 0xf0, 0x3c, 0x64, 0x3f, 0x60, 0xbd,
+ 0xc0, 0xb2, 0xc7, 0x3b, 0x42, 0x3f, 0x62, 0x3c, 0x6a, 0x39, 0x8c, 0xbd, 0xbf,
+ 0x72, 0xfd, 0xbd, 0x47, 0x3d, 0xd1, 0xbd, 0x7c, 0x0b, 0x6d, 0x3d, 0xf3, 0x4a,
+ 0xda, 0xbc, 0xce, 0x57, 0x9d, 0x3d, 0xf0, 0x13, 0x53, 0x3b, 0x94, 0x39, 0x31,
+ 0x3d, 0x3d, 0xa7, 0x3f, 0xbd, 0xfa, 0x3e, 0x6b, 0x3d, 0xfb, 0x19, 0xa9, 0x3d,
+ 0x07, 0xfc, 0x5e, 0xbd, 0xfa, 0x47, 0xd3, 0x3d, 0xd6, 0x83, 0x9a, 0xbd, 0x2c,
+ 0xa9, 0x14, 0x3e, 0x01, 0xb5, 0x7e, 0x3d, 0x27, 0xfb, 0x00, 0x3a, 0x7d, 0xe5,
+ 0x35, 0xbd, 0x68, 0x50, 0x05, 0xbc, 0x87, 0xdb, 0x19, 0x3d, 0xbe, 0x2e, 0xe3,
+ 0x3d, 0xe4, 0x41, 0x07, 0xbd, 0x53, 0x57, 0xcc, 0xb9, 0x28, 0x92, 0x96, 0x3d,
+ 0xb6, 0x14, 0xa4, 0xbc, 0xad, 0x84, 0x69, 0x3c, 0x19, 0xe4, 0xde, 0xbd, 0x3b,
+ 0xad, 0x04, 0xbe, 0xd9, 0xe3, 0xbc, 0x3d, 0x5b, 0x59, 0xd3, 0x3d, 0x00, 0x12,
+ 0xcc, 0xbd, 0x2d, 0x0c, 0x8a, 0xbd, 0xc6, 0x1c, 0x79, 0x3d, 0x03, 0xf3, 0x14,
+ 0xbc, 0xb7, 0x28, 0xa6, 0x3d, 0x28, 0x0d, 0xa5, 0xbd, 0xa9, 0x8e, 0x32, 0x3b,
+ 0x60, 0xef, 0x30, 0x3d, 0x21, 0x9f, 0x68, 0xbc, 0x13, 0x02, 0x83, 0xbc, 0x21,
+ 0x90, 0x9e, 0x3c, 0x78, 0xfa, 0xf4, 0xbc, 0xf9, 0x40, 0x6e, 0x3a, 0x11, 0xdb,
+ 0x05, 0x3e, 0xc1, 0xb7, 0xff, 0x3b, 0x04, 0x47, 0x65, 0xbd, 0x6b, 0x8a, 0x85,
+ 0xbd, 0x30, 0xd5, 0x95, 0x3d, 0x3c, 0x4a, 0x92, 0x3d, 0xa6, 0x20, 0x11, 0x3d,
+ 0x03, 0xd8, 0xb1, 0x3c, 0x7d, 0x1e, 0x0b, 0xbd, 0xe9, 0x0a, 0x92, 0x3d, 0x7e,
+ 0x9d, 0xb8, 0x3c, 0xb5, 0x1e, 0x6d, 0x3d, 0x6d, 0x4e, 0x6f, 0x3d, 0xbc, 0x1e,
+ 0xdc, 0x3c, 0x2e, 0x87, 0xa0, 0x3d, 0x2d, 0x00, 0x5c, 0xb8, 0x8f, 0xfb, 0xb3,
+ 0xbd, 0x9e, 0x36, 0x08, 0x3d, 0xa4, 0x19, 0xe0, 0xbb, 0x5f, 0xc0, 0xb7, 0xbb,
+ 0xc7, 0x3c, 0x78, 0x3d, 0x53, 0xe4, 0x65, 0x3d, 0xca, 0xdf, 0xc9, 0x3d, 0x18,
+ 0x8b, 0x27, 0xbd, 0x19, 0x05, 0xa6, 0x3d, 0x23, 0xa2, 0xa2, 0x3d, 0xc2, 0x4b,
+ 0xac, 0xbd, 0x1b, 0x23, 0xd7, 0xbd, 0xc2, 0x53, 0x97, 0x3d, 0x2e, 0xb2, 0x45,
+ 0xbd, 0x73, 0x7b, 0xbc, 0xbd, 0x33, 0xfc, 0x47, 0xbc, 0x0b, 0x36, 0x91, 0x3d,
+ 0xaa, 0x1e, 0x0b, 0xbd, 0xc8, 0x3a, 0xda, 0x3c, 0x22, 0x29, 0xc5, 0x3d, 0x62,
+ 0x18, 0xf3, 0x3c, 0x75, 0x25, 0xc1, 0xbc, 0xe8, 0x19, 0xb8, 0x3d, 0x30, 0x46,
+ 0x47, 0x3d, 0x22, 0x80, 0x9f, 0xbc, 0x59, 0xcc, 0xcf, 0x3d, 0x00, 0x51, 0x95,
+ 0xbc, 0x8b, 0x00, 0xbf, 0xbc, 0xf5, 0xca, 0x89, 0xbd, 0xca, 0x56, 0xe4, 0x3d,
+ 0x7f, 0x86, 0x24, 0x3e, 0x23, 0xd7, 0x14, 0x3d, 0xe2, 0x8f, 0xa7, 0xbc, 0x1d,
+ 0x6d, 0xb3, 0x3c, 0xa4, 0x8a, 0x85, 0xbd, 0x4a, 0x36, 0x40, 0xbd, 0x20, 0xa4,
+ 0xa7, 0xbd, 0xfe, 0x10, 0xa3, 0xbc, 0xa3, 0x3b, 0xce, 0x3d, 0x88, 0x99, 0x12,
+ 0xbd, 0x3d, 0x58, 0xd5, 0xbd, 0x76, 0xe5, 0x7f, 0x3c, 0x87, 0xa0, 0x68, 0xbd,
+ 0x8a, 0xd4, 0xb7, 0xbd, 0xdb, 0x68, 0x6f, 0x3c, 0x22, 0x84, 0x2e, 0xbc, 0x94,
+ 0x63, 0xa6, 0xbc, 0x35, 0xa4, 0xa9, 0x3d, 0x17, 0xec, 0x0d, 0xbd, 0xd4, 0x25,
+ 0x9b, 0xbd, 0xf1, 0x84, 0x04, 0xbd, 0x3a, 0x19, 0xdd, 0x3d, 0xd8, 0xba, 0xb1,
+ 0x3d, 0xb2, 0xb7, 0x21, 0xbd, 0xeb, 0x7e, 0x19, 0x3d, 0xb9, 0xd3, 0xb9, 0x3b,
+ 0xa5, 0x6a, 0x88, 0xbd, 0xdc, 0x78, 0x99, 0xbd, 0xf4, 0x9f, 0xc4, 0x3d, 0x23,
+ 0xfe, 0x49, 0xbb, 0xbe, 0xa0, 0x98, 0xbb, 0x05, 0xe8, 0x84, 0xbd, 0x0e, 0x24,
+ 0x20, 0x3d, 0x30, 0x96, 0x80, 0xbd, 0xd8, 0x1e, 0xef, 0x3c, 0x0a, 0xad, 0xfe,
+ 0x3d, 0xa3, 0xaa, 0x3b, 0xbd, 0x24, 0xd1, 0xb9, 0xbd, 0xfd, 0xb4, 0xd6, 0x3c,
+ 0xe7, 0xfe, 0xe9, 0xbb, 0xf7, 0xd6, 0xaa, 0x3c, 0xa5, 0x35, 0xc1, 0xbc, 0x39,
+ 0xbd, 0x00, 0xbe, 0x19, 0xed, 0x3b, 0x3d, 0x7f, 0x4e, 0x99, 0x3d, 0x09, 0x63,
+ 0xe3, 0xbd, 0x74, 0xc3, 0x73, 0xbd, 0xb7, 0x7d, 0xa4, 0x3d, 0x68, 0x37, 0x50,
+ 0xbd, 0xb0, 0xb0, 0xe8, 0xbd, 0x28, 0x4f, 0xa7, 0xbd, 0x22, 0x85, 0x9e, 0xbd,
+ 0x32, 0xce, 0x12, 0x3e, 0x60, 0x47, 0xbb, 0x3c, 0xdb, 0xa8, 0xc6, 0x3d, 0x50,
+ 0xcf, 0x0c, 0x3d, 0x4b, 0x7d, 0x9c, 0x3b, 0xa9, 0xeb, 0xb9, 0xbd, 0x07, 0x97,
+ 0x13, 0x3c, 0xbe, 0x6b, 0x8f, 0xbd, 0x9c, 0xb3, 0xa9, 0x3d, 0x64, 0xd6, 0x96,
+ 0xbd, 0x75, 0x6a, 0xc4, 0x3c, 0x20, 0xb6, 0x7e, 0x3d, 0x9b, 0x0e, 0x0c, 0x3e,
+ 0xf3, 0xd5, 0xc5, 0x3d, 0x54, 0xb8, 0xdf, 0xbd, 0x12, 0x6e, 0xf2, 0x3a, 0x7b,
+ 0xe4, 0xaa, 0x3c, 0xe3, 0x7c, 0xb5, 0xbd, 0xe6, 0x11, 0x05, 0x3d, 0xc6, 0x65,
+ 0xa2, 0x3d, 0x95, 0x9e, 0x0c, 0x3d, 0x7f, 0xfe, 0xea, 0xbc, 0x22, 0x51, 0xcf,
+ 0x3b, 0x7b, 0xdd, 0x98, 0xbc, 0x6e, 0x2f, 0xba, 0xbc, 0xb3, 0x8e, 0xe6, 0xbd,
+ 0x5e, 0x5e, 0x76, 0x3d, 0x3e, 0xd4, 0xaf, 0xbd, 0x25, 0xbc, 0xa8, 0x3d, 0xb0,
+ 0xd0, 0x81, 0x3c, 0x4c, 0x3f, 0x52, 0x3c, 0x10, 0xd7, 0x13, 0xbd, 0xd0, 0x83,
+ 0x02, 0x3e, 0xd3, 0x03, 0xa5, 0x3d, 0xeb, 0xa7, 0xca, 0xbd, 0x91, 0x09, 0x1b,
+ 0x3d, 0x7a, 0x8c, 0xbf, 0x3c, 0x89, 0x04, 0xdb, 0xbd, 0xf8, 0xfc, 0x56, 0xbd,
+ 0x8a, 0x66, 0x36, 0x3d, 0x42, 0x8f, 0x6e, 0xbd, 0xc9, 0x79, 0x87, 0x3d, 0xbf,
+ 0xfb, 0x26, 0x3d, 0x56, 0xeb, 0xbc, 0xbb, 0x3b, 0xa7, 0x17, 0x3d, 0x17, 0x46,
+ 0x27, 0x3d, 0x87, 0xfb, 0xb4, 0x3d, 0x09, 0x7b, 0x9d, 0xbc, 0xf4, 0xdc, 0x30,
+ 0x3d, 0xca, 0xee, 0xf7, 0xbd, 0x08, 0x73, 0xec, 0x3d, 0x60, 0xed, 0x24, 0x3d,
+ 0x77, 0xa3, 0x26, 0x3c, 0x07, 0x95, 0xe2, 0x3c, 0x27, 0x2f, 0xde, 0x3c, 0xd3,
+ 0x8a, 0x94, 0xbc, 0x58, 0x57, 0xaa, 0xbd, 0x86, 0xdd, 0x0d, 0x3d, 0x29, 0x14,
+ 0x56, 0x3d, 0x94, 0xdf, 0xa8, 0x3d, 0x33, 0x86, 0xbd, 0x3d, 0xb2, 0x8a, 0x7b,
+ 0x3c, 0x8d, 0x7b, 0x26, 0xbc, 0x2f, 0x59, 0xb8, 0xbd, 0x65, 0xc2, 0x87, 0xbd,
+ 0xd3, 0x4b, 0x76, 0x3d, 0x16, 0x20, 0x22, 0x3d, 0xb9, 0xef, 0x62, 0x3b, 0xda,
+ 0x3b, 0x6b, 0x3d, 0xce, 0x75, 0x59, 0x3d, 0x90, 0xde, 0x33, 0x3d, 0x77, 0x8b,
+ 0xf7, 0x3d, 0x98, 0xfd, 0xa0, 0xbd, 0xcc, 0xa0, 0xd2, 0x3d, 0xec, 0x73, 0x84,
+ 0xbd, 0x2c, 0x7a, 0x34, 0x3c, 0xbd, 0x44, 0x07, 0x3e, 0xd8, 0xf6, 0x74, 0xbd,
+ 0x0a, 0x72, 0x8c, 0xbd, 0xad, 0xd3, 0xd5, 0xbd, 0x78, 0xf7, 0xc9, 0x3d, 0x28,
+ 0xef, 0x5f, 0x3d, 0x01, 0xbf, 0x80, 0xbd, 0xcc, 0xd6, 0x01, 0xbd, 0x37, 0x34,
+ 0x75, 0xbd, 0x4a, 0x00, 0x87, 0x3d, 0x4c, 0xd9, 0x4c, 0xbb, 0xcd, 0x86, 0x42,
+ 0xbd, 0x7b, 0xef, 0x1a, 0x3d, 0x98, 0x2b, 0x3a, 0x3d, 0x97, 0x7a, 0x18, 0x3c,
+ 0xd0, 0x24, 0xe6, 0xbd, 0xcd, 0xc5, 0xc2, 0x3c, 0x8d, 0x69, 0x7f, 0xbc, 0xed,
+ 0xef, 0x88, 0xbd, 0x54, 0x72, 0xd6, 0x3d, 0xc4, 0x5b, 0xba, 0x3d, 0x13, 0xd9,
+ 0x1d, 0xbd, 0xa9, 0x69, 0xd5, 0x3d, 0xf6, 0xab, 0x4b, 0x3d, 0xaf, 0x3c, 0xab,
+ 0x3d, 0xad, 0x17, 0x02, 0x3d, 0xfe, 0x82, 0x97, 0xbd, 0xe7, 0x5b, 0xca, 0x3d,
+ 0x0d, 0x04, 0x1b, 0x3d, 0x6a, 0x95, 0xb5, 0x3d, 0xa7, 0x5f, 0xc5, 0x3d, 0x57,
+ 0xf4, 0xdc, 0x3d, 0x25, 0xf3, 0xa2, 0xbd, 0xad, 0x96, 0xd3, 0x3d, 0x16, 0xb7,
+ 0x2f, 0xbe, 0x61, 0x4c, 0xaa, 0x3d, 0x71, 0x82, 0xcc, 0x3d, 0x44, 0x36, 0xbb,
+ 0x3d, 0xba, 0x8f, 0xca, 0xbc, 0xe0, 0xa3, 0x63, 0x3c, 0xfa, 0x02, 0xb3, 0xbd,
+ 0x0a, 0xcf, 0x00, 0xbe, 0x4b, 0xce, 0x7e, 0xbd, 0xe9, 0x90, 0xcf, 0x3b, 0x32,
+ 0x0d, 0xa9, 0xbd, 0x54, 0x4d, 0x42, 0x3d, 0x30, 0x36, 0x32, 0x3d, 0x04, 0xa6,
+ 0xb2, 0xbd, 0x79, 0x05, 0x0a, 0x3e, 0xbb, 0x45, 0xe6, 0x3c, 0xfd, 0xf6, 0x79,
+ 0x3d, 0x1c, 0x9f, 0x1d, 0x3d, 0xe5, 0x27, 0x97, 0x3c, 0x31, 0xf4, 0x02, 0xbd,
+ 0x30, 0x19, 0x45, 0x3d, 0xa4, 0x54, 0x06, 0x3d, 0x94, 0x4d, 0xb9, 0xbd, 0x3b,
+ 0x21, 0xdf, 0xbd, 0xbb, 0x79, 0x1f, 0xbd, 0x41, 0x34, 0x9f, 0x3d, 0x02, 0x58,
+ 0xb8, 0x3d, 0xe1, 0xb2, 0x03, 0xbe, 0x5e, 0x71, 0x29, 0x3d, 0x9e, 0xf7, 0xbf,
+ 0xbd, 0xc7, 0x01, 0x75, 0xbd, 0x0d, 0xe3, 0x14, 0xbd, 0x38, 0x23, 0xa3, 0x3d,
+ 0x93, 0xbc, 0xaa, 0xbd, 0xc9, 0x19, 0x91, 0x3d, 0xcb, 0xba, 0x69, 0x3d, 0xfc,
+ 0xfa, 0xd7, 0x3d, 0x95, 0xd9, 0x38, 0xbd, 0x4e, 0x3f, 0x75, 0x3d, 0x73, 0xdb,
+ 0x15, 0xbe, 0xdf, 0x76, 0x8d, 0x3d, 0x0f, 0xb1, 0x13, 0x3d, 0x90, 0x32, 0x24,
+ 0x3e, 0x3a, 0x17, 0xf9, 0xbd, 0xcd, 0xd1, 0x38, 0xbd, 0x27, 0xf4, 0x9b, 0xbd,
+ 0x10, 0x6c, 0xa3, 0xbc, 0x1e, 0x12, 0x42, 0x3d, 0xee, 0x38, 0xff, 0xbc, 0xb4,
+ 0x28, 0x2e, 0x3d, 0xba, 0x69, 0xbd, 0xbc, 0x7c, 0x69, 0xbb, 0xbc, 0x1a, 0xe8,
+ 0xde, 0xbd, 0xd8, 0xa2, 0x17, 0x3c, 0xb8, 0x9e, 0xb6, 0xbb, 0xae, 0x5e, 0x96,
+ 0x3c, 0x4f, 0xbb, 0x03, 0xbd, 0x8f, 0x72, 0xb4, 0xbc, 0x94, 0x57, 0xd7, 0x3d,
+ 0xf5, 0xe3, 0xaf, 0xbc, 0xa4, 0x0c, 0x0d, 0xbd, 0x13, 0xbb, 0x83, 0x3d, 0x62,
+ 0x06, 0xda, 0x3d, 0xb7, 0xa5, 0x1c, 0x3e, 0x90, 0xd8, 0x86, 0xbd, 0xf5, 0x7e,
+ 0xd0, 0xbd, 0x8b, 0x5e, 0xcb, 0xbd, 0x0e, 0x81, 0xf5, 0xbd, 0xfe, 0xf3, 0xe4,
+ 0xbc, 0xe2, 0xc9, 0xd6, 0xbc, 0x4c, 0xa9, 0xc8, 0x3b, 0x04, 0xd2, 0x49, 0xbc,
+ 0xf0, 0xb2, 0xa5, 0xbd, 0xc7, 0xd6, 0xea, 0x3d, 0xa6, 0xa6, 0x77, 0x3d, 0xdf,
+ 0x24, 0x03, 0x3d, 0x05, 0x9e, 0x86, 0xbd, 0xce, 0x27, 0x31, 0x3d, 0x46, 0x54,
+ 0xa4, 0x3d, 0x27, 0x9b, 0x35, 0xbd, 0x28, 0x86, 0x68, 0xbb, 0x2c, 0x1e, 0xc1,
+ 0xbd, 0xda, 0x7e, 0xa2, 0x3b, 0xa6, 0xe6, 0xe9, 0x3d, 0x8a, 0xcf, 0x0f, 0x3d,
+ 0x5e, 0xf0, 0x6f, 0xbd, 0xa0, 0xc6, 0xb1, 0xbb, 0x08, 0xc6, 0x77, 0xbc, 0x6d,
+ 0x17, 0x16, 0xbd, 0xf5, 0xc6, 0x21, 0x3d, 0x70, 0x2a, 0x11, 0xbd, 0x3f, 0x5a,
+ 0x6c, 0xbd, 0xfb, 0xd9, 0xbc, 0x3d, 0x91, 0x33, 0xb4, 0x3c, 0xc1, 0xc7, 0x84,
+ 0x3d, 0xd9, 0xca, 0x41, 0xbd, 0xd8, 0x5d, 0xec, 0x3d, 0x17, 0xe2, 0x94, 0x3d,
+ 0xbf, 0x3f, 0x04, 0xbe, 0x24, 0xa8, 0x66, 0xbd, 0xc4, 0xcd, 0xc0, 0x3d, 0x07,
+ 0xce, 0x9e, 0xbd, 0x67, 0x5d, 0xe0, 0x3d, 0x9e, 0xdd, 0x1c, 0xbe, 0x77, 0xe5,
+ 0x5c, 0x3d, 0x98, 0x1f, 0xaf, 0x3d, 0x8a, 0xfd, 0x02, 0x3e, 0x9f, 0x9a, 0xba,
+ 0xbc, 0x40, 0xe9, 0xbb, 0x3c, 0x4e, 0x51, 0x10, 0xbc, 0xc6, 0xcc, 0x81, 0x3d,
+ 0x83, 0x18, 0x78, 0xbc, 0x7f, 0x25, 0xe8, 0xbd, 0x2e, 0xa6, 0xcb, 0x3c, 0x2f,
+ 0x8c, 0x3e, 0x3c, 0x38, 0xdc, 0x67, 0xbb, 0x57, 0xf8, 0xbd, 0x3d, 0xa2, 0x4b,
+ 0x13, 0x3e, 0x6d, 0x76, 0x64, 0x3d, 0xcf, 0x5e, 0x98, 0x3c, 0x09, 0xc1, 0x8a,
+ 0x3c, 0x42, 0x2b, 0x82, 0x3d, 0xa3, 0x83, 0x4a, 0x3d, 0xe3, 0x74, 0xb9, 0xbb,
+ 0x26, 0xf8, 0x62, 0x3d, 0xd6, 0x4d, 0xa4, 0xbc, 0x68, 0x44, 0x13, 0x3d, 0x3b,
+ 0x7d, 0x54, 0x3d, 0xf4, 0xdf, 0x8c, 0x3d, 0xef, 0x72, 0xcf, 0xbd, 0x4e, 0xd6,
+ 0x85, 0x3c, 0x6a, 0x11, 0x38, 0xbc, 0xa5, 0xec, 0x83, 0xbd, 0x23, 0x95, 0x86,
+ 0xbd, 0x93, 0xa0, 0xbf, 0x3c, 0x91, 0xc5, 0x11, 0xbd, 0x96, 0x1b, 0x23, 0x3d,
+ 0xbc, 0x6d, 0x00, 0x3d, 0x55, 0xb7, 0x9d, 0x3d, 0x44, 0x45, 0x8d, 0x3c, 0x83,
+ 0x34, 0x19, 0xbd, 0x1c, 0x2e, 0xbe, 0xbd, 0xfb, 0x4b, 0xd5, 0x3c, 0x25, 0xec,
+ 0xd9, 0xba, 0xe0, 0xcd, 0xa9, 0x3d, 0x72, 0x99, 0xa1, 0x3d, 0xa6, 0xa1, 0x91,
+ 0xbd, 0xc8, 0x70, 0x39, 0xbd, 0x33, 0x54, 0x24, 0x3d, 0x80, 0x25, 0xd8, 0x3c,
+ 0x3c, 0x36, 0xdb, 0x3b, 0x04, 0x22, 0x3c, 0xbd, 0xc8, 0x81, 0xfb, 0x3d, 0x89,
+ 0x15, 0xe1, 0x3d, 0xa5, 0x9d, 0x17, 0xbd, 0x68, 0xad, 0x64, 0xbd, 0xad, 0xbd,
+ 0x59, 0xbc, 0xfc, 0x1a, 0xa5, 0xbd, 0xf5, 0x88, 0x44, 0x3d, 0x53, 0xa7, 0x9b,
+ 0x3d, 0x2e, 0x00, 0x93, 0xbd, 0xbd, 0xb1, 0xb9, 0x3c, 0x61, 0x54, 0xc8, 0x3c,
+ 0xe3, 0xe9, 0xd7, 0x3d, 0x78, 0xe2, 0xe0, 0x3d, 0x6c, 0xe0, 0x08, 0xbe, 0x80,
+ 0xc2, 0xaf, 0x3d, 0x2a, 0x5c, 0x10, 0xbd, 0x60, 0xcb, 0xf0, 0x3d, 0x7a, 0xa1,
+ 0xf0, 0xbb, 0x02, 0x56, 0xa9, 0x3d, 0x11, 0xf1, 0x1c, 0x3c, 0x39, 0xec, 0xa9,
+ 0xbd, 0x73, 0xfd, 0x24, 0xbd, 0xd5, 0x86, 0x8c, 0x3d, 0xdc, 0x85, 0x21, 0x3c,
+ 0xa7, 0x6f, 0xf6, 0x3d, 0xe0, 0x6b, 0x0c, 0xbd, 0x08, 0x15, 0xf2, 0x3d, 0xd6,
+ 0x6a, 0xed, 0x3d, 0xda, 0xc1, 0x51, 0xbd, 0x27, 0x6e, 0x11, 0xbe, 0xbe, 0x8f,
+ 0xcf, 0xbc, 0xa9, 0xf1, 0x05, 0x3d, 0xa1, 0x30, 0x8d, 0xbd, 0x35, 0x5e, 0x97,
+ 0xbd, 0xee, 0x02, 0x9d, 0xbc, 0xf8, 0xba, 0xe9, 0xbd, 0x61, 0xe1, 0xb5, 0xbd,
+ 0xaa, 0x6d, 0x0c, 0xbd, 0xeb, 0x1f, 0x5d, 0xbd, 0x17, 0x11, 0xda, 0x3c, 0xe3,
+ 0x75, 0x55, 0xbd, 0x8b, 0x40, 0x4a, 0x3d, 0xb2, 0x5b, 0x17, 0xbd, 0xc2, 0xbb,
+ 0x66, 0xbd, 0x42, 0x20, 0xf7, 0x3d, 0x05, 0x75, 0xff, 0xbd, 0xce, 0xd3, 0xca,
+ 0x3c, 0x76, 0x10, 0xbb, 0x3d, 0x66, 0xa2, 0xcc, 0xbc, 0x96, 0x30, 0xf7, 0xba,
+ 0xad, 0xa8, 0x16, 0xbc, 0x32, 0x10, 0x77, 0x3b, 0x98, 0xde, 0x1f, 0xbd, 0xc7,
+ 0xd6, 0x72, 0x3d, 0x33, 0xea, 0xe1, 0x3d, 0xb5, 0x5d, 0x8d, 0x3c, 0xfe, 0xf1,
+ 0x64, 0x3d, 0x3f, 0xe1, 0x88, 0x3c, 0x0d, 0xa2, 0x92, 0x3d, 0x52, 0x90, 0x20,
+ 0xbd, 0xcd, 0x17, 0x88, 0xbd, 0xf7, 0xf1, 0x7b, 0x3d, 0x55, 0xbe, 0x9c, 0x3b,
+ 0x1a, 0x3f, 0xd1, 0x3c, 0x46, 0xbe, 0x0d, 0x3d, 0x53, 0xd7, 0xd9, 0x3d, 0xda,
+ 0x58, 0xb5, 0xbc, 0x3a, 0x41, 0x78, 0xbd, 0x78, 0xc0, 0x54, 0xbd, 0x3c, 0x27,
+ 0x10, 0x3e, 0x16, 0x00, 0xe9, 0x3b, 0x6e, 0xcd, 0xc5, 0x3d, 0xd9, 0xf0, 0x82,
+ 0x3d, 0x44, 0x3e, 0x82, 0x3d, 0xde, 0x31, 0x83, 0x3d, 0x10, 0x32, 0x4e, 0xbd,
+ 0x13, 0x46, 0xd7, 0xbd, 0x60, 0xa0, 0xbb, 0xbc, 0x33, 0xc9, 0xb0, 0xbd, 0x8d,
+ 0x52, 0xfb, 0x3d, 0x5e, 0xa7, 0x07, 0x3d, 0x05, 0xd7, 0xb7, 0x3d, 0x34, 0x8c,
+ 0x71, 0x3d, 0xcf, 0x5d, 0x66, 0xbd, 0x2a, 0x61, 0x1c, 0x3d, 0xa5, 0xa5, 0x70,
+ 0xbd, 0xd2, 0xb9, 0x67, 0x3b, 0x9e, 0x63, 0x5a, 0x3d, 0xbe, 0xea, 0xd4, 0xbc,
+ 0x57, 0xe9, 0xb5, 0x3d, 0x03, 0xe4, 0xa6, 0x3d, 0xc4, 0x6b, 0xb3, 0x3d, 0x6e,
+ 0x60, 0x9f, 0x3d, 0xac, 0x31, 0xa0, 0x3d, 0xcf, 0xcc, 0xb5, 0x3d, 0xd0, 0x80,
+ 0xd6, 0x3d, 0xb9, 0x3f, 0x96, 0xbd, 0x2d, 0x17, 0x17, 0xbb, 0x6f, 0xf2, 0xe4,
+ 0xbd, 0x17, 0x51, 0x6e, 0x3d, 0xc2, 0xe2, 0xc2, 0x3d, 0xfe, 0x71, 0x59, 0x3d,
+ 0x0e, 0x1c, 0x78, 0xbd, 0xc9, 0xc7, 0xbc, 0xbd, 0x40, 0xb0, 0xa8, 0x3d, 0xbf,
+ 0xff, 0x42, 0xbd, 0xe4, 0x2e, 0x67, 0x3d, 0xca, 0x73, 0x81, 0xbd, 0x0b, 0x0d,
+ 0xf3, 0x3d, 0xce, 0x97, 0x70, 0x3d, 0xe9, 0x59, 0xe9, 0x3d, 0x45, 0x22, 0x73,
+ 0xbd, 0x24, 0xb8, 0xdf, 0x3d, 0x96, 0xbb, 0x3f, 0x3c, 0x02, 0xed, 0x65, 0x3d,
+ 0x84, 0x40, 0x25, 0x3c, 0x6c, 0xc5, 0xd2, 0x3c, 0xea, 0x38, 0x4a, 0x3d, 0xf9,
+ 0xa2, 0xc9, 0x3d, 0x6f, 0x30, 0xbc, 0x3a, 0x2d, 0xd5, 0x81, 0xbd, 0xd2, 0xae,
+ 0xa3, 0xbb, 0x8e, 0x91, 0xe7, 0x3c, 0x28, 0x6b, 0xc4, 0xbd, 0xf3, 0x0c, 0xbf,
+ 0xbc, 0x66, 0xf8, 0xd3, 0x3b, 0x6d, 0x3e, 0x01, 0x3d, 0xf3, 0xbf, 0xc2, 0xbc,
+ 0x0d, 0xc5, 0x6f, 0xbd, 0xb7, 0x9b, 0x9c, 0x3d, 0xeb, 0x79, 0x88, 0x3d, 0x81,
+ 0x8a, 0x7d, 0xbc, 0xde, 0x8b, 0x14, 0x3d, 0xa4, 0x3f, 0x7d, 0x3d, 0xb4, 0x27,
+ 0xa9, 0x3d, 0xb7, 0x75, 0x51, 0x3d, 0xff, 0x73, 0x85, 0x3d, 0x3f, 0xf3, 0x51,
+ 0x3d, 0xe6, 0xdd, 0xe2, 0xbb, 0x83, 0xc7, 0x65, 0xbd, 0x6a, 0x16, 0xb6, 0xbd,
+ 0xcf, 0xe8, 0x90, 0x3d, 0x5b, 0xc8, 0xad, 0xbc, 0xa1, 0x27, 0x29, 0xbd, 0x57,
+ 0xbd, 0x3d, 0x3d, 0x61, 0x4e, 0x41, 0xbc, 0x21, 0x2f, 0x29, 0x3d, 0x55, 0x0b,
+ 0xba, 0x3d, 0xaa, 0x67, 0xf3, 0xba, 0x7d, 0x60, 0xe4, 0x3d, 0xab, 0xe7, 0x20,
+ 0xbd, 0x01, 0x71, 0x9f, 0x3d, 0x5a, 0xd5, 0x95, 0xbd, 0x2f, 0x75, 0xd5, 0x3d,
+ 0x7c, 0x91, 0xf6, 0x3d, 0xaa, 0xd6, 0x0c, 0x3d, 0x6d, 0x1c, 0xd9, 0xbd, 0xb4,
+ 0x4e, 0x82, 0xbc, 0x3f, 0x5a, 0x1a, 0x3b, 0xb4, 0x94, 0xfb, 0x3d, 0x0a, 0x71,
+ 0x3c, 0xbd, 0x97, 0xba, 0x12, 0xbc, 0xfd, 0x3d, 0x33, 0xbd, 0xa3, 0x4d, 0x01,
+ 0x3e, 0x54, 0xe2, 0x33, 0xbd, 0x8d, 0x32, 0x5d, 0x3d, 0x92, 0x84, 0xcb, 0x3d,
+ 0x91, 0x67, 0xde, 0xbd, 0x4b, 0xfd, 0xc7, 0xbd, 0x4b, 0x11, 0x04, 0xbe, 0x3e,
+ 0xde, 0xac, 0x3d, 0xe4, 0x9e, 0x3c, 0x3d, 0x5e, 0x7d, 0xfb, 0x3d, 0xfd, 0x4d,
+ 0xae, 0x3d, 0x63, 0xcf, 0x6f, 0xbd, 0xa0, 0x4f, 0x8b, 0x3d, 0x46, 0x2c, 0x84,
+ 0xbd, 0xda, 0x69, 0x11, 0x3b, 0xca, 0x5b, 0x1c, 0xbd, 0x59, 0x23, 0x26, 0x3e,
+ 0x16, 0xb1, 0x68, 0xbd, 0x1c, 0xd4, 0x98, 0xbd, 0x9c, 0x91, 0x6e, 0xbd, 0xa5,
+ 0xc6, 0x55, 0xbc, 0xd0, 0xf3, 0xcc, 0xbd, 0xe8, 0x91, 0xe0, 0xbd, 0xdf, 0xe3,
+ 0xb4, 0x3d, 0x04, 0x77, 0xc2, 0xbd, 0xcc, 0x21, 0xda, 0xbd, 0x7d, 0xed, 0x1d,
+ 0x3d, 0x1c, 0xa9, 0x0f, 0x3e, 0x25, 0x19, 0x67, 0x3d, 0xcc, 0x29, 0x65, 0xbd,
+ 0x34, 0x00, 0xdd, 0x3d, 0xe3, 0x04, 0x15, 0xbd, 0x79, 0xb8, 0x50, 0xbd, 0x98,
+ 0x5b, 0x44, 0xbc, 0x32, 0x55, 0xd1, 0x3d, 0x19, 0x20, 0x2a, 0xbd, 0xbd, 0x28,
+ 0xb6, 0x3c, 0x33, 0xf4, 0xc4, 0xbb, 0x95, 0x26, 0x9f, 0xbb, 0x93, 0xb7, 0x7f,
+ 0x3d, 0x16, 0xbc, 0x5f, 0x3d, 0x0a, 0x14, 0x82, 0x3c, 0x3a, 0x40, 0x12, 0x3e,
+ 0x99, 0x9c, 0xbe, 0x3c, 0x6c, 0x22, 0x72, 0x3d, 0xb3, 0x18, 0x10, 0xbe, 0x2b,
+ 0x6f, 0x4b, 0x3d, 0xaf, 0x83, 0x90, 0x3c, 0x67, 0x6b, 0x57, 0x3d, 0xae, 0xba,
+ 0x1d, 0xbd, 0x42, 0x58, 0xda, 0xbd, 0xcd, 0x16, 0xc6, 0xbd, 0x28, 0x11, 0xa1,
+ 0xbd, 0xc3, 0xfa, 0x6b, 0x3d, 0xff, 0x35, 0xc4, 0x3d, 0xca, 0x54, 0x9d, 0x3d,
+ 0x65, 0xc0, 0x0a, 0x3d, 0xbe, 0xbd, 0x73, 0xbc, 0xee, 0xf8, 0xfb, 0x3a, 0x88,
+ 0xcf, 0x2c, 0x3d, 0xa4, 0x2d, 0xb9, 0x3d, 0x30, 0xbf, 0x9c, 0xbd, 0x16, 0xf6,
+ 0x97, 0x3c, 0x72, 0xf4, 0x12, 0x3d, 0x4c, 0xc6, 0x01, 0xbd, 0x68, 0x2e, 0xc0,
+ 0xbd, 0x38, 0xd4, 0x2c, 0x3d, 0xe6, 0xb4, 0xbf, 0x3d, 0xf5, 0x15, 0x66, 0xbd,
+ 0x29, 0x0f, 0x83, 0x3d, 0x44, 0x2b, 0xb0, 0x3d, 0xa1, 0x53, 0xeb, 0x3d, 0xc6,
+ 0x86, 0x8a, 0x3d, 0xe0, 0x36, 0x48, 0xbd, 0x29, 0xff, 0x22, 0xbd, 0xff, 0x33,
+ 0xae, 0x3d, 0xa2, 0x5b, 0x13, 0xbd, 0x1d, 0x6f, 0x9e, 0x3d, 0x0e, 0x6d, 0x09,
+ 0x3d, 0x7f, 0x06, 0x01, 0xbe, 0xc8, 0x08, 0xc7, 0x3d, 0xc2, 0xe8, 0xae, 0x3d,
+ 0xe6, 0x4a, 0xc7, 0x3d, 0x29, 0x40, 0xb3, 0x3d, 0xb5, 0x99, 0x83, 0xbd, 0xa4,
+ 0x23, 0x8f, 0x3d, 0x4a, 0xa2, 0x9c, 0x3d, 0x0d, 0xe2, 0x04, 0x3d, 0x40, 0xff,
+ 0x07, 0x3d, 0xa4, 0x8c, 0x30, 0x3d, 0x75, 0x00, 0x1c, 0x3d, 0x45, 0x9b, 0x02,
+ 0x3e, 0xb2, 0xce, 0x2e, 0x3d, 0x16, 0x9d, 0x3f, 0xbd, 0x8e, 0xf1, 0x1b, 0xbc,
+ 0x9b, 0x59, 0x04, 0xbd, 0xae, 0xd7, 0xd3, 0x3d, 0x2b, 0x15, 0x05, 0x3b, 0x12,
+ 0xec, 0x5d, 0x3c, 0x30, 0xe9, 0xea, 0x3d, 0x58, 0xe5, 0xe4, 0xbd, 0x9b, 0x54,
+ 0x86, 0xbd, 0xf0, 0x47, 0x4e, 0xbd, 0x21, 0xa7, 0xef, 0x3b, 0x89, 0xf9, 0x23,
+ 0x3d, 0xec, 0x14, 0x48, 0xbd, 0xfc, 0x86, 0x20, 0x3e, 0x08, 0x69, 0x95, 0x3d,
+ 0x26, 0x08, 0xb6, 0xbd, 0xd9, 0xe2, 0xb3, 0xbd, 0x27, 0x6f, 0xf0, 0x3d, 0x9d,
+ 0xc4, 0x1c, 0xbe, 0x1a, 0x6e, 0x22, 0x3d, 0xc5, 0xe3, 0x68, 0x3d, 0x45, 0x2d,
+ 0x8a, 0xbb, 0xbe, 0xf3, 0x84, 0x3d, 0x63, 0xef, 0x10, 0x3d, 0x54, 0xfa, 0xde,
+ 0x3c, 0x57, 0x4c, 0xc4, 0x3d, 0xa7, 0x44, 0x8b, 0xbd, 0x9e, 0xf0, 0x33, 0xbd,
+ 0x9a, 0x6c, 0x89, 0x3d, 0x6c, 0xc9, 0x21, 0xbe, 0x0e, 0x60, 0x9d, 0xbd, 0xd9,
+ 0x35, 0x1f, 0xbd, 0x0d, 0x4f, 0x9a, 0x3d, 0xd4, 0x24, 0xca, 0x3d, 0xc4, 0x5c,
+ 0x45, 0xbd, 0x28, 0x24, 0xea, 0x3c, 0xee, 0xea, 0xef, 0xbd, 0x4d, 0xae, 0x89,
+ 0x3d, 0x91, 0x99, 0x79, 0xbc, 0xb6, 0x1b, 0xc2, 0x3d, 0xcb, 0x8d, 0xb4, 0xbc,
+ 0x63, 0xaa, 0x7f, 0xbd, 0x19, 0xbc, 0xe6, 0xbc, 0x82, 0x28, 0x4e, 0xbd, 0xf4,
+ 0x7a, 0xbc, 0x3d, 0xe4, 0xe7, 0xcd, 0xbd, 0x2c, 0xe3, 0xda, 0xbd, 0xc6, 0x98,
+ 0xec, 0x3d, 0xd7, 0xfc, 0xf8, 0xbc, 0xd4, 0x80, 0x76, 0x3d, 0xbf, 0x17, 0x3e,
+ 0xbd, 0x20, 0x69, 0x48, 0x3a, 0x1c, 0x2c, 0xa2, 0x3d, 0xc2, 0x8b, 0x95, 0x3d,
+ 0xc4, 0xb5, 0xa9, 0x3d, 0x43, 0x5b, 0xde, 0xbc, 0xf1, 0x1e, 0x0f, 0xbd, 0x52,
+ 0x3e, 0xbb, 0x3d, 0xff, 0xaf, 0xfd, 0x3d, 0x66, 0x65, 0x59, 0x3d, 0x03, 0x95,
+ 0x55, 0x3d, 0x97, 0x22, 0x04, 0xbe, 0xcb, 0x24, 0x32, 0xbd, 0xf3, 0x26, 0xa5,
+ 0xbd, 0xaa, 0xd3, 0xdb, 0xbc, 0x75, 0x5b, 0x41, 0xbd, 0x2e, 0x2c, 0xc4, 0x3d,
+ 0xd5, 0x98, 0xc4, 0x3c, 0xa3, 0x19, 0x01, 0x3c, 0x4e, 0x3f, 0x3c, 0x3d, 0xea,
+ 0xee, 0x2d, 0xbd, 0x3f, 0x97, 0x13, 0xbc, 0xed, 0xdd, 0x55, 0x3d, 0x49, 0xba,
+ 0xfb, 0xbd, 0x5c, 0xbd, 0xc9, 0xbd, 0xe8, 0x9f, 0xad, 0x3d, 0x9c, 0x26, 0x32,
+ 0xbd, 0xf6, 0xfa, 0x15, 0xbe, 0x09, 0x88, 0xc0, 0xbd, 0xe2, 0xcc, 0xaf, 0xbd,
+ 0xdb, 0x22, 0x56, 0x3d, 0x78, 0x3f, 0x0f, 0xbc, 0x50, 0xe5, 0x93, 0xbd, 0x55,
+ 0x90, 0x09, 0x3d, 0xac, 0xec, 0x6d, 0xbd, 0x93, 0x0e, 0xce, 0xbc, 0x5b, 0xde,
+ 0x85, 0x3d, 0x08, 0x1d, 0x4b, 0x3d, 0x8f, 0x16, 0xf4, 0xbd, 0x89, 0xf8, 0x83,
+ 0xbd, 0x65, 0xf3, 0xf8, 0xbc, 0xe3, 0x37, 0x09, 0x3b, 0x37, 0x89, 0x91, 0xbc,
+ 0x69, 0xea, 0x2f, 0xbd, 0x2c, 0xf2, 0xbf, 0x3c, 0xd0, 0x57, 0xa7, 0x3d, 0xae,
+ 0x94, 0xbf, 0x3d, 0x15, 0x1d, 0x63, 0x3d, 0x53, 0x20, 0x4b, 0xbd, 0x4f, 0xf2,
+ 0x00, 0x3e, 0x29, 0x36, 0x54, 0xbd, 0x49, 0x2d, 0x8c, 0xbd, 0x29, 0xbc, 0xb6,
+ 0x3d, 0x08, 0xc4, 0xc7, 0x3d, 0xb6, 0x3d, 0xf9, 0xbd, 0x84, 0x0f, 0xa1, 0x3d,
+ 0xe8, 0x20, 0xb1, 0xbd, 0x8b, 0xf6, 0xa8, 0xbd, 0x51, 0xec, 0x75, 0x3d, 0x85,
+ 0xeb, 0x13, 0xbe, 0x5c, 0xe5, 0x4f, 0x3d, 0xe5, 0x90, 0xf3, 0xbc, 0x5a, 0xb0,
+ 0x39, 0xbd, 0xbf, 0x7a, 0x63, 0x3d, 0xa4, 0x35, 0x08, 0x3e, 0xae, 0x8a, 0xa6,
+ 0xbd, 0x4d, 0x53, 0x46, 0xbd, 0x8e, 0xb0, 0x46, 0xbc, 0x9d, 0x94, 0x15, 0x3d,
+ 0x6d, 0xdc, 0x62, 0x3c, 0x75, 0x33, 0x29, 0x3d, 0x61, 0xba, 0x3d, 0x3d, 0x0a,
+ 0xdb, 0x72, 0xbc, 0x18, 0x43, 0xdb, 0xbc, 0xb0, 0xca, 0x83, 0xbc, 0x33, 0x9b,
+ 0x12, 0xbe, 0xdb, 0x85, 0xb2, 0xbd, 0xe1, 0x52, 0xc7, 0xbd, 0xd6, 0xbc, 0x12,
+ 0xbd, 0x19, 0x0f, 0x90, 0xbc, 0x75, 0xb0, 0x4c, 0x3d, 0x91, 0x46, 0xd2, 0x3b,
+ 0xae, 0x95, 0x0e, 0x3d, 0x51, 0xa0, 0x74, 0x3d, 0x9b, 0x73, 0x90, 0xba, 0xec,
+ 0x61, 0x85, 0x3c, 0xaa, 0x01, 0xb7, 0x3d, 0x83, 0x19, 0x96, 0xbd, 0xeb, 0x6f,
+ 0xce, 0x3c, 0x46, 0x50, 0x15, 0xbe, 0x4c, 0x9d, 0xe2, 0xbb, 0xee, 0x86, 0x59,
+ 0xbb, 0xd9, 0xea, 0x8c, 0x3d, 0x5e, 0x80, 0x96, 0x3b, 0x9e, 0x36, 0xf2, 0x3d,
+ 0xfc, 0x4e, 0xa8, 0x3c, 0x67, 0x32, 0xb0, 0x3d, 0x93, 0xf9, 0x1a, 0x3d, 0x71,
+ 0x3b, 0xaa, 0xbd, 0xd4, 0xcf, 0x34, 0x3d, 0x93, 0x11, 0x84, 0xbd, 0x76, 0x9c,
+ 0xc7, 0x3d, 0x6b, 0xee, 0xd5, 0xbd, 0xb6, 0x03, 0xd8, 0x3d, 0xb8, 0x56, 0x53,
+ 0xbd, 0x61, 0x89, 0xab, 0xbd, 0x69, 0x71, 0x46, 0xbc, 0x79, 0x31, 0x81, 0xbd,
+ 0xa0, 0xaa, 0x9d, 0xbc, 0xab, 0x17, 0x0c, 0x3d, 0x31, 0xb8, 0x0a, 0x3d, 0xc3,
+ 0x40, 0xb4, 0xbd, 0xab, 0xb6, 0x97, 0x3d, 0xc1, 0x3a, 0x47, 0x3d, 0x31, 0xdc,
+ 0xdb, 0xbc, 0xb4, 0x23, 0x60, 0xbc, 0x9d, 0x47, 0x93, 0x3d, 0xc9, 0x69, 0xa1,
+ 0x3d, 0xbb, 0x2f, 0x7a, 0x3d, 0x07, 0x8d, 0x91, 0x3d, 0x20, 0xdb, 0xca, 0x3d,
+ 0xf8, 0x44, 0xd3, 0xbd, 0x68, 0xfc, 0x66, 0xbc, 0xfa, 0xab, 0x29, 0x3d, 0xcb,
+ 0xb6, 0xa4, 0x3d, 0x9e, 0xbd, 0x06, 0x3d, 0xd1, 0x54, 0xb1, 0x3d, 0x06, 0x7e,
+ 0xcb, 0xbd, 0x24, 0x71, 0xc4, 0x3d, 0x08, 0x17, 0x40, 0x3d, 0x7a, 0xf7, 0xae,
+ 0xbd, 0xc0, 0x66, 0xc1, 0xbd, 0xfa, 0x2a, 0x22, 0xbd, 0xf0, 0x3d, 0xd2, 0xbc,
+ 0x2e, 0xc7, 0x71, 0xbd, 0xc5, 0x4f, 0xd0, 0xbd, 0xf7, 0x68, 0x85, 0xbd, 0xab,
+ 0xeb, 0x92, 0xbd, 0x5e, 0xb7, 0xe8, 0xbd, 0x66, 0xc1, 0xef, 0xbd, 0xb7, 0x07,
+ 0x06, 0xbd, 0x5b, 0x2f, 0x40, 0x3d, 0xd6, 0xb0, 0xa8, 0xbd, 0xb8, 0x1a, 0xe8,
+ 0x3d, 0x9f, 0xb7, 0xc4, 0x3d, 0x3c, 0xb5, 0x8f, 0xbd, 0x23, 0x9f, 0xbc, 0x3d,
+ 0xfd, 0x90, 0x88, 0xbd, 0xa2, 0xa9, 0x27, 0xbc, 0x41, 0xe4, 0xd7, 0xbd, 0x29,
+ 0x97, 0x07, 0xbd, 0xff, 0x72, 0x04, 0x3c, 0x56, 0x5a, 0x34, 0xbd, 0xf4, 0x8a,
+ 0x9d, 0xbd, 0x7e, 0x5d, 0x83, 0xbd, 0xd2, 0x00, 0x4e, 0x3d, 0xbe, 0x7e, 0x5d,
+ 0x3d, 0x03, 0xd1, 0x38, 0xbd, 0xb2, 0x2b, 0xbc, 0xbd, 0x04, 0xa8, 0x4d, 0x3d,
+ 0xa8, 0x0b, 0xaa, 0xbd, 0x84, 0x50, 0xac, 0xbd, 0x09, 0xef, 0xbf, 0xbc, 0xfa,
+ 0xb8, 0xb2, 0xbd, 0xeb, 0x7e, 0xd9, 0x3d, 0x54, 0x08, 0xda, 0xbd, 0x21, 0x24,
+ 0x61, 0xbd, 0xae, 0x1e, 0xae, 0xbd, 0xb4, 0x50, 0x3a, 0xbc, 0x2e, 0x07, 0xe9,
+ 0xbd, 0xec, 0xb1, 0x9d, 0xbd, 0x88, 0x5d, 0xca, 0xbc, 0x0c, 0x8a, 0x8c, 0x3d,
+ 0x58, 0x56, 0xf9, 0x3c, 0x57, 0x0f, 0xe7, 0x3d, 0xd4, 0xd9, 0x1c, 0xbd, 0x87,
+ 0xfe, 0x38, 0xbd, 0x1c, 0x08, 0x17, 0xbd, 0x72, 0xbb, 0xc1, 0xbc, 0x5b, 0xa9,
+ 0xf7, 0xba, 0xf2, 0xd5, 0x34, 0xbd, 0x71, 0x2f, 0x4b, 0xbd, 0x6a, 0xd6, 0xab,
+ 0xbd, 0x07, 0x81, 0xcd, 0x3d, 0x03, 0xf0, 0x2e, 0x3d, 0xcd, 0x20, 0xd4, 0xbd,
+ 0x0e, 0xf4, 0x3f, 0xbc, 0xf3, 0xed, 0xe1, 0x3d, 0xf6, 0xc4, 0x82, 0x3d, 0x0b,
+ 0x42, 0x48, 0x3d, 0xf9, 0xcd, 0x87, 0x3d, 0x91, 0x7d, 0x49, 0x3b, 0x9a, 0xc7,
+ 0x28, 0xbd, 0xf6, 0x02, 0xc3, 0x3d, 0x6e, 0x82, 0xa4, 0xbd, 0x41, 0x1f, 0xe7,
+ 0x3d, 0x44, 0x06, 0x76, 0x3d, 0x3b, 0xbc, 0xc1, 0x3b, 0x20, 0xf7, 0x7c, 0xbd,
+ 0x0d, 0x0d, 0xe0, 0xbd, 0x2b, 0xa5, 0xc5, 0x3d, 0x51, 0x84, 0x6f, 0xbd, 0xd0,
+ 0x24, 0x22, 0x3d, 0x33, 0x68, 0xb7, 0x3d, 0x37, 0x88, 0x87, 0x3d, 0x24, 0x04,
+ 0x98, 0xbd, 0x1b, 0xba, 0x04, 0xbd, 0x48, 0x09, 0xdf, 0x3b, 0xac, 0x9e, 0x3c,
+ 0xbd, 0x4b, 0xbf, 0x2c, 0x3c, 0x07, 0xba, 0xf4, 0xbd, 0x6e, 0x91, 0x84, 0x3d,
+ 0x99, 0x5a, 0x7e, 0x3c, 0x21, 0x9e, 0xeb, 0x3c, 0xde, 0x69, 0x18, 0x3d, 0x1f,
+ 0x8f, 0xaa, 0x3d, 0x09, 0x55, 0x08, 0xbd, 0x42, 0xf3, 0xe5, 0xbd, 0x61, 0x6b,
+ 0x82, 0xbd, 0xe1, 0xe2, 0xd2, 0x3d, 0x3f, 0xd1, 0xb6, 0x3d, 0xf9, 0xf5, 0xc7,
+ 0xbd, 0x47, 0x47, 0x90, 0xbd, 0x74, 0xa3, 0x42, 0xbd, 0xa5, 0xda, 0x3e, 0x3d,
+ 0xaf, 0x45, 0xc1, 0x3d, 0x68, 0x46, 0xe5, 0xbd, 0x79, 0x83, 0x31, 0x3d, 0x7e,
+ 0xd3, 0xce, 0x3c, 0xea, 0x30, 0xca, 0xbd, 0x00, 0xb0, 0xae, 0x3b, 0x66, 0x91,
+ 0xde, 0xbd, 0x0e, 0x11, 0xc0, 0xbd, 0xd0, 0x6a, 0x41, 0xbd, 0x6d, 0x7a, 0x8e,
+ 0xbd, 0x0a, 0xe2, 0x70, 0x3d, 0x7b, 0x4d, 0xcf, 0x3d, 0x2c, 0x2b, 0x3d, 0xbd,
+ 0x7e, 0xc3, 0x6f, 0xbd, 0xd0, 0x38, 0xac, 0x3c, 0xac, 0x35, 0xd0, 0xbd, 0x88,
+ 0x08, 0xe3, 0xbd, 0x78, 0x27, 0xbf, 0x3d, 0x80, 0x1e, 0xf8, 0xbc, 0x52, 0x7a,
+ 0x84, 0xbc, 0x77, 0x84, 0xbb, 0xbc, 0x22, 0xdf, 0x2b, 0x3d, 0xa8, 0x16, 0xe9,
+ 0xbd, 0xec, 0xab, 0xda, 0x3b, 0xb9, 0x2f, 0x9b, 0x3d, 0x28, 0x97, 0xd6, 0x3d,
+ 0x08, 0xde, 0x2c, 0xbc, 0x8a, 0x6c, 0x29, 0x3d, 0xdd, 0xfe, 0xa4, 0xbc, 0x13,
+ 0xb3, 0x4e, 0xbc, 0x4f, 0x72, 0x81, 0xbc, 0x33, 0x6c, 0xcc, 0x3d, 0x1c, 0xbc,
+ 0x76, 0xbc, 0xfd, 0xd7, 0x8f, 0xbd, 0x99, 0xfd, 0x53, 0xbd, 0x2c, 0x76, 0x80,
+ 0xbd, 0x65, 0x2e, 0x1d, 0xbd, 0x9d, 0xd5, 0x8e, 0x3d, 0xeb, 0x16, 0xac, 0x3d,
+ 0xa6, 0x14, 0x3d, 0x3d, 0x75, 0x14, 0x97, 0x3d, 0x5e, 0x11, 0xf5, 0xbc, 0xca,
+ 0x20, 0x46, 0xbb, 0xb1, 0x04, 0xa1, 0xbd, 0x90, 0xcd, 0x3a, 0x3d, 0x70, 0xaf,
+ 0x01, 0xbe, 0x9d, 0xe3, 0xb2, 0xbd, 0xc3, 0xdf, 0x99, 0x3d, 0x20, 0x09, 0xab,
+ 0x3d, 0x35, 0x91, 0x06, 0xbd, 0x10, 0x3a, 0xa0, 0xbc, 0xc2, 0xd1, 0xad, 0x3d,
+ 0x60, 0x90, 0xe4, 0x3d, 0x9f, 0x47, 0xfd, 0x3c, 0x84, 0xa1, 0x5f, 0x3d, 0x06,
+ 0x5e, 0xf0, 0x3c, 0xab, 0x8c, 0x07, 0xbc, 0xf4, 0x6c, 0x16, 0x3d, 0x64, 0x06,
+ 0x04, 0xbe, 0xa8, 0x16, 0x85, 0x3d, 0xea, 0x1a, 0xa1, 0xbd, 0x0d, 0xb4, 0xdc,
+ 0xbd, 0xf4, 0x77, 0xc0, 0xbc, 0x5d, 0x03, 0x28, 0xbd, 0x29, 0x7d, 0xcc, 0xbc,
+ 0xae, 0x19, 0x9f, 0x3d, 0x09, 0x2a, 0xcd, 0x3d, 0xa4, 0x58, 0xaa, 0xbd, 0x6d,
+ 0xb8, 0xa9, 0x3c, 0xa1, 0xb7, 0xe6, 0xbd, 0xa9, 0x41, 0x9a, 0xbd, 0x69, 0xa4,
+ 0xab, 0x3c, 0xdd, 0x32, 0xa9, 0x3d, 0x19, 0x90, 0xd4, 0x3d, 0x52, 0xa8, 0xea,
+ 0xbd, 0x1e, 0x3d, 0xd4, 0x39, 0x84, 0x91, 0x03, 0xbe, 0xc9, 0x63, 0x3f, 0x3d,
+ 0x81, 0x1e, 0xe0, 0x3d, 0x05, 0xc5, 0x95, 0xbd, 0x2e, 0x1d, 0xc9, 0xbd, 0xf2,
+ 0x9c, 0x7c, 0xbc, 0x69, 0x19, 0xdb, 0xbc, 0x09, 0x3d, 0x6f, 0xbd, 0x58, 0x94,
+ 0xf8, 0x3d, 0x2c, 0x78, 0xb6, 0x3d, 0x96, 0xbe, 0xf8, 0x3d, 0x98, 0x4e, 0xb6,
+ 0x3d, 0x1a, 0xa0, 0x90, 0x3d, 0xa3, 0xeb, 0xd2, 0xbd, 0x4c, 0xfb, 0x2d, 0xbd,
+ 0xcb, 0xca, 0xa8, 0xbc, 0xa7, 0xca, 0x80, 0xbd, 0x65, 0xe2, 0x87, 0xbd, 0x9d,
+ 0x9a, 0x25, 0x3c, 0xc7, 0xf2, 0xcc, 0x3c, 0x38, 0x81, 0x48, 0xbd, 0xd3, 0x83,
+ 0xea, 0x3d, 0x4f, 0x72, 0xad, 0xbd, 0x6d, 0xef, 0x3f, 0xbc, 0x22, 0xc7, 0xbf,
+ 0xbc, 0xb6, 0x25, 0x64, 0x3c, 0x82, 0x76, 0x53, 0xbd, 0xd7, 0x9a, 0x89, 0x3c,
+ 0x01, 0xa7, 0x40, 0x3d, 0xbe, 0x03, 0x69, 0xbd, 0x5c, 0x79, 0x0e, 0xbe, 0xeb,
+ 0x87, 0x9f, 0xbd, 0x14, 0xa6, 0xad, 0x3c, 0x78, 0x6b, 0x25, 0x3d, 0xea, 0xa0,
+ 0xd7, 0x3d, 0x19, 0xb6, 0x22, 0xbd, 0xc6, 0xf6, 0xba, 0xbc, 0xe9, 0xd6, 0xe4,
+ 0x3c, 0x55, 0x68, 0x2a, 0xbd, 0xc0, 0x4c, 0xb0, 0xbc, 0xf5, 0xa5, 0x01, 0x3e,
+ 0x59, 0x9a, 0xd0, 0xbd, 0x4a, 0xb2, 0xfc, 0x3d, 0x3a, 0x59, 0x8f, 0x3d, 0x4a,
+ 0x0a, 0xb4, 0xbd, 0x7d, 0xc4, 0x63, 0x3d, 0xb6, 0xb8, 0xb9, 0x3d, 0xb0, 0x95,
+ 0x81, 0x3c, 0x2f, 0x7a, 0x32, 0x3d, 0x32, 0x87, 0xe4, 0xbc, 0xf0, 0xfc, 0xd5,
+ 0x3d, 0xfc, 0xe6, 0xf1, 0x3d, 0x04, 0x66, 0x98, 0x3c, 0x14, 0x23, 0x72, 0x3c,
+ 0xfe, 0x50, 0x95, 0x3d, 0xdf, 0xe6, 0x4c, 0x3d, 0x84, 0x80, 0x8e, 0x3d, 0x13,
+ 0xe8, 0x4c, 0xbd, 0xd4, 0xca, 0x83, 0xbd, 0x20, 0x86, 0xb0, 0xbd, 0xed, 0x66,
+ 0x89, 0x3c, 0x6a, 0x59, 0x19, 0xbd, 0xc2, 0x32, 0xc3, 0xbd, 0x04, 0x3f, 0x8d,
+ 0xbc, 0x51, 0xcc, 0x23, 0xbc, 0xb4, 0x4f, 0xa3, 0xbc, 0x30, 0x98, 0xc8, 0x3d,
+ 0x29, 0xaa, 0xd4, 0xbb, 0x5c, 0x7d, 0x88, 0xbd, 0x3a, 0xe9, 0xa9, 0xbd, 0xc3,
+ 0x4f, 0x40, 0xbd, 0x2d, 0x12, 0x49, 0xbd, 0x9e, 0x4e, 0x9a, 0xbd, 0xf1, 0xa9,
+ 0x84, 0xbd, 0x29, 0x09, 0x94, 0x3d, 0x98, 0x3c, 0xf0, 0x3d, 0x5f, 0xfe, 0x2a,
+ 0xbd, 0xd8, 0xa8, 0x46, 0xbd, 0xa1, 0xc8, 0x1c, 0xbb, 0x12, 0x3d, 0xbc, 0x3d,
+ 0x38, 0x39, 0x51, 0x3c, 0x3a, 0x00, 0x95, 0x3d, 0xd8, 0x2e, 0x67, 0x3c, 0x48,
+ 0x7e, 0xe0, 0xbd, 0x8c, 0x90, 0x79, 0x3c, 0xf2, 0x3d, 0x50, 0x3d, 0xbc, 0x2f,
+ 0xa1, 0x3c, 0xf9, 0xf0, 0x8a, 0x3d, 0x0e, 0x11, 0x30, 0x3c, 0x7c, 0xc8, 0xf8,
+ 0x3c, 0xe0, 0x88, 0x10, 0x3d, 0x4b, 0xaa, 0xbe, 0xbd, 0xa4, 0x0a, 0x5b, 0x3d,
+ 0xe2, 0x3c, 0x94, 0x3d, 0xdd, 0x36, 0x95, 0xbd, 0xc7, 0x70, 0x89, 0xbd, 0x95,
+ 0xe7, 0x89, 0x3d, 0x91, 0x0e, 0x23, 0x3c, 0xfe, 0x32, 0x4f, 0x3b, 0xd4, 0x79,
+ 0xc2, 0x3d, 0x52, 0xab, 0xb4, 0xbd, 0xb3, 0x98, 0xd2, 0x3d, 0xb8, 0x70, 0x88,
+ 0xbd, 0x2e, 0x3e, 0x77, 0x3d, 0xb5, 0x44, 0x00, 0x3d, 0xb4, 0xe9, 0x59, 0x3d,
+ 0xae, 0x3b, 0x9d, 0x3d, 0x3d, 0x89, 0x36, 0x3d, 0x22, 0x67, 0x9b, 0xbb, 0xca,
+ 0xca, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0xcf, 0x02,
+ 0xcf, 0x3d, 0x6b, 0xe2, 0x84, 0x3d, 0x62, 0xaa, 0xdc, 0x3d, 0xdf, 0x55, 0xef,
+ 0x3b, 0xc1, 0x2b, 0x41, 0xbd, 0x6e, 0x82, 0xb3, 0xbd, 0x08, 0xc0, 0x6c, 0xbd,
+ 0x7c, 0xb9, 0x10, 0xbe, 0x97, 0x76, 0xbb, 0xbc, 0xa3, 0x52, 0x00, 0xbe, 0xd9,
+ 0x90, 0x32, 0xbe, 0xac, 0x38, 0x62, 0x3d, 0x6c, 0xdc, 0xae, 0xbc, 0x2a, 0x7d,
+ 0x01, 0xbe, 0x2f, 0xf8, 0x30, 0xbd, 0x8f, 0x24, 0x45, 0xbe, 0x0c, 0x74, 0x1f,
+ 0xbe, 0x5e, 0x0b, 0x0f, 0xbd, 0xf7, 0xb6, 0xc5, 0x3d, 0xe9, 0x3c, 0xbb, 0xbd,
+ 0x61, 0x11, 0x19, 0x3d, 0x68, 0xf0, 0x44, 0x3e, 0x26, 0x64, 0x95, 0x3c, 0xa1,
+ 0xde, 0x54, 0x3d, 0x25, 0x8b, 0x14, 0x3e, 0x0f, 0xed, 0xfe, 0x3b, 0x1b, 0x37,
+ 0xf4, 0xbd, 0x9e, 0x28, 0xbd, 0x3d, 0x26, 0x5c, 0xca, 0x3d, 0xbb, 0xad, 0x02,
+ 0x3d, 0x1f, 0xc1, 0x25, 0x3e, 0x85, 0x0a, 0x39, 0xbe, 0xfa, 0xc3, 0xf7, 0xbd,
+ 0xda, 0x75, 0xc6, 0xbd, 0x06, 0x2d, 0x4a, 0x3c, 0x1a, 0xc1, 0x94, 0xbd, 0xb0,
+ 0x62, 0xa0, 0xbd, 0x63, 0x0c, 0x0e, 0xbe, 0xf3, 0x67, 0x01, 0xbe, 0xd9, 0x42,
+ 0x48, 0xbe, 0xaa, 0xf0, 0xf6, 0xbd, 0xc7, 0xa6, 0x39, 0xbe, 0xf6, 0xef, 0xb2,
+ 0x3d, 0xe6, 0x6f, 0xd7, 0xbd, 0x14, 0x4f, 0xfb, 0xbc, 0x7f, 0xb1, 0x86, 0x3d,
+ 0xcc, 0xca, 0xd9, 0xbd, 0x34, 0x6f, 0x3e, 0xbc, 0x90, 0x24, 0xe8, 0x3d, 0xda,
+ 0x5a, 0xf9, 0x3d, 0x78, 0xc9, 0xf0, 0xbd, 0x1e, 0x50, 0xa5, 0x3d, 0xce, 0xed,
+ 0x6d, 0xbd, 0x65, 0x3b, 0x62, 0xbd, 0x52, 0x36, 0x3d, 0xbd, 0xf8, 0x54, 0x70,
+ 0x3d, 0x01, 0x85, 0x39, 0x3c, 0x57, 0xf0, 0xa8, 0xbc, 0xf5, 0x69, 0xda, 0xbd,
+ 0xd5, 0x00, 0xda, 0x3d, 0x47, 0x0a, 0xe6, 0x3d, 0xf1, 0xed, 0xae, 0xbd, 0x1b,
+ 0x51, 0x93, 0x3d, 0x25, 0x8d, 0x1e, 0x3e, 0x65, 0x36, 0x24, 0x3e, 0xab, 0x4e,
+ 0x3b, 0xbe, 0x73, 0x91, 0x7b, 0x3d, 0x79, 0x2a, 0xa6, 0x3c, 0x6e, 0x13, 0x29,
+ 0x3e, 0xae, 0x98, 0x8b, 0x3d, 0x61, 0xec, 0x36, 0xbe, 0xee, 0xd9, 0x8a, 0x3d,
+ 0xe8, 0xd8, 0xff, 0xbd, 0x87, 0xae, 0x13, 0xbe, 0x45, 0x02, 0xae, 0x3d, 0xbc,
+ 0x03, 0x94, 0xbd, 0xf6, 0x5b, 0x17, 0xbe, 0x3c, 0x46, 0x15, 0x3e, 0x99, 0xe3,
+ 0x3b, 0x3e, 0x6c, 0x0a, 0x82, 0xbd, 0x67, 0xb1, 0xb4, 0x3c, 0x68, 0xc6, 0x0a,
+ 0x3e, 0x7f, 0xe1, 0xa5, 0x3d, 0x38, 0x5c, 0x61, 0x3e, 0x0d, 0x37, 0xdd, 0xbd,
+ 0x14, 0xae, 0xff, 0xbc, 0x00, 0xba, 0x97, 0x3d, 0x61, 0xf4, 0xd7, 0x3c, 0xb9,
+ 0x7e, 0x0b, 0xbe, 0x87, 0xa5, 0x59, 0xbc, 0x01, 0x95, 0x19, 0x3c, 0x3e, 0xf3,
+ 0x72, 0xbd, 0x8b, 0x32, 0x0e, 0xbe, 0x8e, 0x5c, 0x30, 0x3e, 0xd1, 0x09, 0x10,
+ 0x3e, 0xfb, 0xc9, 0x13, 0x3e, 0x82, 0x6f, 0xe2, 0x3d, 0x71, 0xd7, 0xc8, 0xbd,
+ 0x57, 0x14, 0xbb, 0xbd, 0x0f, 0x10, 0x40, 0x3d, 0xa6, 0x30, 0x1e, 0x3d, 0xc8,
+ 0x3f, 0x4a, 0x3e, 0x06, 0xe9, 0x15, 0xbd, 0x8a, 0x87, 0x11, 0x3e, 0xe2, 0xa4,
+ 0x0b, 0xbe, 0xe5, 0x96, 0x3d, 0x3e, 0x5e, 0x78, 0x0c, 0x3e, 0x32, 0x79, 0x7a,
+ 0xba, 0x24, 0x9f, 0x1f, 0xbe, 0xe1, 0x2d, 0xc3, 0xbc, 0xdf, 0x43, 0xb4, 0xbd,
+ 0xb1, 0x00, 0xde, 0x3d, 0x7e, 0x34, 0x4b, 0xbe, 0xeb, 0x21, 0xdd, 0xbd, 0xbe,
+ 0x43, 0xe2, 0xbd, 0x4b, 0x49, 0x9f, 0x3d, 0xa3, 0xd0, 0x8e, 0x3d, 0xdf, 0x84,
+ 0x17, 0xbe, 0x12, 0x0b, 0xc8, 0xbd, 0xcb, 0x0e, 0x64, 0xbd, 0xdd, 0x25, 0x83,
+ 0xbd, 0xa0, 0x78, 0x1b, 0x3e, 0x2e, 0x77, 0x1e, 0xbe, 0x94, 0x81, 0xc8, 0xbd,
+ 0x8d, 0x3e, 0xba, 0xbd, 0xff, 0xe9, 0x32, 0x3e, 0xb0, 0x76, 0xb9, 0xbd, 0xfd,
+ 0x8a, 0x71, 0xbd, 0xab, 0xf3, 0x4c, 0xbc, 0x0c, 0xa0, 0x0c, 0x3e, 0xa2, 0x36,
+ 0xb2, 0xbc, 0x1b, 0x34, 0xb2, 0xbd, 0x44, 0x18, 0x8c, 0xbd, 0xa3, 0xe3, 0x83,
+ 0xbd, 0x45, 0x8c, 0xae, 0xbd, 0x4e, 0x7d, 0x09, 0xbe, 0xdf, 0x58, 0x19, 0xbd,
+ 0xae, 0x8f, 0x5f, 0x3d, 0xa7, 0x36, 0x80, 0xbd, 0xfb, 0x12, 0x22, 0x3e, 0x25,
+ 0x11, 0x99, 0xbb, 0x51, 0xc9, 0x4a, 0x3d, 0x99, 0x68, 0x32, 0x3e, 0x44, 0xcc,
+ 0x7a, 0xbc, 0xa8, 0x46, 0xb7, 0x3d, 0x5f, 0xbb, 0x8a, 0xbd, 0xd3, 0xbb, 0x3a,
+ 0x3e, 0x46, 0x2c, 0x89, 0x3d, 0x26, 0xcb, 0x79, 0x3d, 0xe1, 0x45, 0x40, 0xbd,
+ 0x01, 0xc4, 0xe3, 0x3d, 0x42, 0x18, 0x24, 0x3e, 0x34, 0x73, 0x19, 0x3e, 0x00,
+ 0x53, 0xb7, 0x3d, 0x33, 0x6d, 0xf8, 0x3c, 0x2c, 0x5d, 0x3f, 0xbd, 0x85, 0xa9,
+ 0x1b, 0xbe, 0x18, 0xda, 0xb8, 0xbc, 0xaa, 0x92, 0xb4, 0x3d, 0x53, 0x65, 0x43,
+ 0x3e, 0x4f, 0xda, 0x03, 0xbd, 0xba, 0x8e, 0x40, 0xbe, 0xc1, 0x11, 0xb8, 0xbb,
+ 0x3e, 0x07, 0x66, 0x3e, 0xb8, 0x25, 0xe0, 0x3c, 0x7f, 0x4d, 0x0f, 0xbd, 0x35,
+ 0x57, 0xaa, 0xbd, 0xe5, 0x8b, 0xec, 0xbd, 0x70, 0xda, 0x08, 0xbc, 0x03, 0xc2,
+ 0xf5, 0xbb, 0xa5, 0x57, 0x83, 0xbd, 0xf1, 0x0b, 0x74, 0x3e, 0x9a, 0x63, 0x5a,
+ 0xbd, 0x8f, 0xb3, 0xa1, 0xbb, 0xe3, 0x0a, 0xd1, 0x3c, 0xa8, 0xc3, 0xfd, 0x3d,
+ 0x58, 0x80, 0x04, 0xbe, 0xfb, 0xca, 0xe0, 0x3d, 0x01, 0x75, 0x04, 0xbe, 0xbe,
+ 0xa9, 0x55, 0xbd, 0x59, 0x90, 0xff, 0xbd, 0x6a, 0xf0, 0x64, 0xbd, 0x89, 0xdc,
+ 0x1d, 0xbe, 0xb8, 0x8f, 0x26, 0xbd, 0x3b, 0x31, 0xc8, 0xbd, 0x2c, 0x3d, 0x88,
+ 0xbd, 0x48, 0xea, 0x0f, 0xbd, 0xce, 0x3f, 0x22, 0x3d, 0x8b, 0x31, 0xe7, 0x3d,
+ 0xa1, 0x13, 0x55, 0xbd, 0x2a, 0x96, 0xcc, 0x3d, 0xa1, 0xd9, 0xcf, 0x3d, 0x9f,
+ 0x0f, 0xcf, 0x3c, 0xac, 0x8b, 0xa4, 0xbc, 0x88, 0x69, 0xb6, 0x3d, 0x35, 0x40,
+ 0xc8, 0x3d, 0x5a, 0x6e, 0x23, 0xbe, 0x5f, 0xd9, 0x17, 0xbe, 0x4b, 0x8e, 0x9f,
+ 0xbd, 0x44, 0xeb, 0x15, 0xbe, 0xe9, 0x93, 0xba, 0x3d, 0x4b, 0x93, 0x08, 0xbe,
+ 0x79, 0x4d, 0x09, 0x3e, 0x5a, 0x98, 0x6d, 0xbd, 0x02, 0x95, 0x24, 0xbe, 0x80,
+ 0x67, 0x9d, 0xbd, 0xd2, 0x10, 0x1f, 0xbe, 0x64, 0xd2, 0x62, 0xbd, 0x01, 0x92,
+ 0x09, 0x3e, 0x96, 0x6e, 0xca, 0xbd, 0x62, 0x32, 0xf3, 0xbd, 0xe1, 0x10, 0x50,
+ 0x3d, 0x61, 0x3e, 0xdc, 0x3d, 0x7e, 0x6e, 0xd5, 0xbd, 0xf4, 0xea, 0x1f, 0x3e,
+ 0x2a, 0xd2, 0x10, 0xbd, 0x04, 0xa4, 0xdd, 0x3b, 0x7f, 0x19, 0x50, 0xbd, 0xad,
+ 0x49, 0x0e, 0x3e, 0x63, 0x14, 0xe3, 0x3d, 0x6f, 0x2d, 0x99, 0x3d, 0x4a, 0x0b,
+ 0x08, 0xbe, 0xd6, 0x54, 0xdd, 0xbd, 0xfb, 0x6b, 0x9e, 0xbd, 0xc0, 0x42, 0xe9,
+ 0xbd, 0xba, 0xef, 0x40, 0xbb, 0x9c, 0x44, 0xc5, 0x3d, 0x1e, 0x3a, 0xde, 0xbd,
+ 0xce, 0x6d, 0xef, 0x3d, 0x92, 0x4d, 0xf6, 0xbd, 0xa3, 0xc5, 0x0c, 0xbe, 0x74,
+ 0x63, 0xd8, 0xbd, 0xff, 0xd4, 0x11, 0x3e, 0x02, 0x10, 0x28, 0xbd, 0x86, 0xf5,
+ 0x4f, 0x3d, 0x6a, 0xfb, 0xc6, 0x3d, 0x6d, 0x29, 0x1f, 0xbe, 0xa4, 0x55, 0xab,
+ 0x3d, 0xaa, 0xc8, 0xc7, 0x3d, 0xf4, 0xec, 0x59, 0x3d, 0xd1, 0x44, 0x75, 0x3d,
+ 0xe6, 0x18, 0x3c, 0x3e, 0xd7, 0x83, 0xb5, 0x3d, 0xdc, 0xa3, 0xb1, 0xbd, 0xbb,
+ 0xa7, 0x73, 0xbd, 0x03, 0x00, 0x3c, 0x3d, 0x3b, 0x59, 0x8d, 0xbd, 0x27, 0x1f,
+ 0x07, 0xbe, 0x46, 0x5f, 0xcf, 0xbd, 0x5b, 0xf5, 0x13, 0xbe, 0xe9, 0xa9, 0x1b,
+ 0x3e, 0x05, 0x6e, 0x0e, 0x3e, 0xd2, 0xa7, 0xad, 0xbc, 0x55, 0xda, 0x12, 0x3e,
+ 0xd4, 0xd5, 0xcc, 0xbd, 0x5e, 0x0d, 0x33, 0xbe, 0x5f, 0xfa, 0x99, 0xbd, 0xa1,
+ 0xd4, 0x96, 0xbd, 0x7b, 0xec, 0x08, 0x3d, 0xf0, 0x43, 0x04, 0xbe, 0xd6, 0x6a,
+ 0x3e, 0x3d, 0x9c, 0x4c, 0xa5, 0xbd, 0xc1, 0x25, 0xeb, 0x3c, 0x00, 0x84, 0x7f,
+ 0xbd, 0x8e, 0x5b, 0x2d, 0xbd, 0x5a, 0x0d, 0x93, 0x3c, 0x14, 0x09, 0x5e, 0x3d,
+ 0x0e, 0x7c, 0x25, 0x3d, 0x4b, 0x3f, 0x0f, 0xbe, 0xad, 0x31, 0xd8, 0xbd, 0x81,
+ 0xa4, 0x66, 0xbd, 0x25, 0x37, 0x32, 0xbe, 0x64, 0x42, 0x6f, 0x3d, 0x9c, 0xdb,
+ 0xc2, 0x3d, 0x1f, 0x78, 0xcc, 0x3c, 0x45, 0xa8, 0x0c, 0x3e, 0xe8, 0x27, 0xe3,
+ 0x3d, 0xbf, 0xb1, 0xff, 0x3d, 0x3e, 0x13, 0xc6, 0x3d, 0xf2, 0x5b, 0x64, 0x3d,
+ 0xf1, 0xf8, 0x16, 0x3e, 0x24, 0x46, 0x40, 0x3d, 0xa1, 0x7e, 0x99, 0x3c, 0x6d,
+ 0x30, 0x1e, 0xbe, 0x04, 0xdd, 0x2a, 0xbe, 0x03, 0x25, 0x20, 0xbd, 0x07, 0xf4,
+ 0x74, 0xbc, 0xc8, 0x71, 0x03, 0xbd, 0x46, 0xf3, 0xd9, 0xbc, 0x33, 0x6d, 0xbb,
+ 0xbd, 0xbd, 0x8a, 0xd5, 0x3d, 0x68, 0xbd, 0x9e, 0xbc, 0x1c, 0x26, 0x09, 0xbe,
+ 0x0f, 0x3c, 0x9d, 0xbd, 0xde, 0x13, 0x53, 0xbd, 0x73, 0xe9, 0x90, 0x3d, 0xdc,
+ 0x50, 0xef, 0x3c, 0x6f, 0x00, 0x32, 0xbc, 0x42, 0x79, 0x18, 0x3e, 0xa8, 0xe4,
+ 0xb3, 0xbd, 0x04, 0x2f, 0x6e, 0xbd, 0x41, 0xb2, 0x51, 0x3e, 0x56, 0x54, 0xe7,
+ 0x3d, 0x0c, 0x44, 0xbb, 0xbd, 0xa4, 0xce, 0x8b, 0x3c, 0xad, 0x8a, 0xec, 0x3d,
+ 0xf7, 0xc9, 0x44, 0xbd, 0xc5, 0xdc, 0x2a, 0x3b, 0xde, 0x9e, 0xb6, 0x3d, 0x20,
+ 0x2c, 0x1c, 0xbe, 0x04, 0x0c, 0x9f, 0xbd, 0x41, 0x5f, 0xd4, 0xbd, 0x76, 0x92,
+ 0x06, 0xbe, 0x6a, 0x98, 0x30, 0xbe, 0xc4, 0xa0, 0xd3, 0x3c, 0x38, 0x33, 0xf5,
+ 0xbd, 0x94, 0x28, 0x0d, 0xbd, 0x42, 0x60, 0x1e, 0x3d, 0xfd, 0x72, 0xca, 0x3d,
+ 0xee, 0xf6, 0x0d, 0x3e, 0x35, 0xb3, 0x27, 0x3e, 0x15, 0xde, 0x08, 0xbe, 0x34,
+ 0xc4, 0x8b, 0xbd, 0x4a, 0x4f, 0x9a, 0x3d, 0x87, 0x8f, 0x06, 0xbc, 0x68, 0x43,
+ 0x10, 0xbd, 0x36, 0x40, 0xb6, 0xbc, 0xf2, 0xad, 0x82, 0xbd, 0xc5, 0xef, 0x13,
+ 0xbe, 0x4c, 0x38, 0xcd, 0xbd, 0x4a, 0xdf, 0x9d, 0x3c, 0x9d, 0xb0, 0x9a, 0x3d,
+ 0xe8, 0xf7, 0xd4, 0x3d, 0x9d, 0x50, 0x34, 0x3d, 0xc9, 0x92, 0xdf, 0x3d, 0x20,
+ 0x66, 0xeb, 0x3d, 0x54, 0x5c, 0x85, 0xbd, 0x2d, 0x0e, 0xc6, 0x3d, 0x90, 0xea,
+ 0x64, 0xbd, 0xcd, 0xa5, 0x5c, 0xbd, 0x77, 0x8d, 0x7b, 0x3d, 0xf7, 0xda, 0x98,
+ 0xbd, 0xc2, 0x98, 0xcb, 0x3d, 0x79, 0xa4, 0x2d, 0x3d, 0x52, 0x42, 0x15, 0x3e,
+ 0xc5, 0x68, 0x47, 0xbd, 0xbf, 0xa0, 0xe7, 0xbd, 0xbf, 0xa4, 0xbd, 0x3b, 0x6f,
+ 0xe3, 0x05, 0xbd, 0xd3, 0xda, 0xdb, 0xbd, 0x40, 0x3a, 0xa8, 0xbd, 0x87, 0x88,
+ 0x36, 0xbe, 0xaf, 0x1d, 0xe5, 0x3d, 0xf6, 0xe8, 0x2e, 0xbe, 0xbc, 0x78, 0x9b,
+ 0x3d, 0x8b, 0x27, 0xf6, 0xbd, 0x18, 0x45, 0xef, 0xbd, 0x8c, 0x3f, 0x3e, 0x3e,
+ 0x94, 0x69, 0x16, 0xbe, 0x4f, 0xce, 0x48, 0xbe, 0x0c, 0xfa, 0x0b, 0xbc, 0x01,
+ 0x50, 0x37, 0x3e, 0x87, 0x13, 0x0b, 0xbe, 0xd0, 0xb1, 0x38, 0x3e, 0x71, 0x2c,
+ 0xa1, 0x3d, 0x4a, 0x15, 0xb4, 0xbd, 0x80, 0x28, 0x2b, 0xbd, 0xc7, 0x3d, 0x7e,
+ 0x3c, 0xe5, 0xe1, 0xf1, 0x3d, 0x43, 0x56, 0x2c, 0x3d, 0x18, 0xba, 0x20, 0xbe,
+ 0x4e, 0x30, 0x8d, 0x3d, 0x0b, 0x52, 0x20, 0x3b, 0x2d, 0xbc, 0x48, 0xbd, 0xf8,
+ 0xff, 0xcf, 0xbb, 0x34, 0xb2, 0xaf, 0x3c, 0xea, 0xad, 0xf0, 0x3d, 0xed, 0xbd,
+ 0x8d, 0x3d, 0x41, 0x8c, 0xde, 0xbd, 0xb0, 0xb4, 0x32, 0x3e, 0xf8, 0x16, 0x2e,
+ 0xbe, 0x0c, 0x4a, 0x8c, 0x3d, 0x89, 0x92, 0x13, 0x3e, 0x8b, 0xd2, 0xbb, 0xbd,
+ 0xf5, 0xce, 0x0f, 0x3e, 0x31, 0x82, 0x7b, 0xbb, 0x7f, 0xac, 0x0e, 0x3e, 0x9f,
+ 0xe7, 0x0a, 0xbe, 0x5b, 0xef, 0x2b, 0x3d, 0xa9, 0x7f, 0x0d, 0x3e, 0xa4, 0xc0,
+ 0xde, 0x3d, 0xde, 0x0d, 0xbc, 0xbc, 0x59, 0x6f, 0x81, 0x3a, 0x46, 0x0c, 0x1b,
+ 0xbe, 0xd0, 0xba, 0xf5, 0xbc, 0xe5, 0x6d, 0x1d, 0x3e, 0x31, 0x08, 0x5a, 0x3d,
+ 0xab, 0x1c, 0xb5, 0xbc, 0xe7, 0xaa, 0x18, 0x3e, 0xaa, 0xcc, 0x14, 0x3e, 0x4e,
+ 0x1e, 0x08, 0xbd, 0xfc, 0x9f, 0xbe, 0xbd, 0x44, 0x7b, 0x2b, 0xbe, 0xf1, 0xfa,
+ 0x90, 0x3c, 0xa4, 0x75, 0x16, 0xbe, 0x27, 0x3b, 0x05, 0xbe, 0xf3, 0x41, 0xde,
+ 0xbd, 0xb9, 0x96, 0x10, 0xbd, 0xd0, 0x44, 0x6a, 0x3b, 0x5b, 0x04, 0x02, 0xbe,
+ 0x3c, 0xf7, 0x41, 0xbd, 0xe6, 0xaf, 0x06, 0xbe, 0x52, 0x74, 0x08, 0x3e, 0xda,
+ 0x81, 0x54, 0x3d, 0xcd, 0xe8, 0xbc, 0x3d, 0xf8, 0x07, 0xdc, 0x3d, 0x84, 0x6f,
+ 0xd8, 0xbd, 0xe0, 0x65, 0x2a, 0x3e, 0x04, 0xae, 0xe1, 0xbd, 0x34, 0xd5, 0x27,
+ 0xbd, 0x5c, 0xb4, 0x70, 0xbd, 0x0d, 0x68, 0xfa, 0x3d, 0x04, 0xb0, 0xc5, 0xbd,
+ 0xa0, 0xf7, 0x87, 0x3d, 0xdc, 0x08, 0x18, 0x3e, 0x86, 0xb9, 0x0f, 0xbe, 0x21,
+ 0x03, 0x75, 0x3d, 0x2b, 0x4f, 0x15, 0xbd, 0x3c, 0x86, 0x8e, 0xbc, 0xc7, 0xd0,
+ 0x73, 0x3d, 0xe0, 0x50, 0x37, 0x3c, 0xd6, 0x8d, 0xce, 0x3d, 0x3b, 0x42, 0x1b,
+ 0x3e, 0xa9, 0xfc, 0x29, 0x3e, 0xe4, 0x58, 0x1d, 0x3d, 0x5d, 0xab, 0x3b, 0xbe,
+ 0x28, 0x32, 0x07, 0xbd, 0x54, 0x37, 0x9c, 0x3d, 0xd4, 0xdd, 0x04, 0x3d, 0x28,
+ 0xe1, 0xad, 0xbc, 0x98, 0x0e, 0x13, 0x3e, 0xae, 0x57, 0x2a, 0xbe, 0xc4, 0xf0,
+ 0x70, 0xbd, 0xf9, 0x8d, 0x0d, 0xbe, 0x5e, 0x46, 0x17, 0xbe, 0x90, 0x6a, 0xbc,
+ 0x3d, 0x12, 0xa1, 0xf3, 0xbd, 0x0f, 0xf9, 0x88, 0xbd, 0x60, 0xd9, 0x2f, 0xbd,
+ 0x07, 0x99, 0xa2, 0xbd, 0x0b, 0xa5, 0x1b, 0xbc, 0x92, 0x9d, 0xaf, 0xbc, 0x37,
+ 0xf5, 0x5a, 0x3c, 0x88, 0xf0, 0xcf, 0x3d, 0x96, 0xdd, 0x54, 0x3d, 0x2f, 0xd2,
+ 0x0a, 0x3e, 0xe5, 0xbd, 0x46, 0x3c, 0xd2, 0x65, 0xcb, 0xbd, 0x19, 0x00, 0x0b,
+ 0xbe, 0xd6, 0xf6, 0xb0, 0x3d, 0x39, 0xc2, 0x14, 0x3e, 0x44, 0x63, 0x3f, 0x3e,
+ 0x4a, 0x6c, 0x1d, 0x3e, 0xf3, 0x6a, 0xe1, 0xbc, 0x31, 0xa5, 0x28, 0xbe, 0x54,
+ 0x4d, 0x49, 0xbd, 0xd4, 0xbf, 0x64, 0xbd, 0xec, 0x58, 0xbc, 0xbd, 0xff, 0xc6,
+ 0xd0, 0x3c, 0xb7, 0xf1, 0xa7, 0x3d, 0x55, 0x15, 0x26, 0xbd, 0xe6, 0x14, 0xe2,
+ 0x3c, 0x6b, 0x28, 0x05, 0x3e, 0x83, 0xaf, 0xbc, 0xbd, 0xc6, 0xb7, 0x6a, 0x3d,
+ 0x6f, 0xa9, 0x01, 0x3e, 0x93, 0x78, 0x62, 0xb9, 0x23, 0x46, 0x3f, 0xbd, 0x89,
+ 0xbd, 0x88, 0x3d, 0x4d, 0xeb, 0xa0, 0x3d, 0x5e, 0x68, 0x74, 0xbd, 0x3d, 0xe2,
+ 0x86, 0xbd, 0x11, 0x15, 0x62, 0xbd, 0x01, 0xde, 0xc8, 0xbd, 0xf0, 0x96, 0xc0,
+ 0xbd, 0xf4, 0x9d, 0xff, 0xbd, 0x04, 0xcb, 0x80, 0x3c, 0x4f, 0x43, 0x35, 0x3d,
+ 0x65, 0x45, 0x6c, 0x3d, 0x45, 0x55, 0xaa, 0xbc, 0xe1, 0x1a, 0x59, 0x3d, 0x4c,
+ 0x54, 0x20, 0xbe, 0x35, 0xaf, 0xe3, 0x3d, 0xd2, 0x5e, 0xae, 0xbd, 0xa7, 0xaa,
+ 0x15, 0x3e, 0xea, 0x3c, 0xe9, 0x3c, 0xa4, 0xc9, 0x08, 0xbe, 0xca, 0xec, 0x82,
+ 0x3b, 0x8b, 0x49, 0xfa, 0xbd, 0x9d, 0x1e, 0x8b, 0xbc, 0x1b, 0xb4, 0xed, 0xbd,
+ 0x1d, 0xbe, 0xc9, 0x3d, 0x8c, 0xdf, 0x2a, 0xbe, 0x8c, 0xba, 0xe3, 0x3d, 0x1f,
+ 0xa2, 0x14, 0x3d, 0x61, 0xf2, 0xcf, 0xba, 0xd5, 0x67, 0x88, 0xbd, 0xa7, 0xd0,
+ 0x5d, 0x3e, 0x71, 0x6e, 0xfd, 0x3d, 0xd5, 0xcf, 0x02, 0xbd, 0x0c, 0x25, 0xb5,
+ 0x3c, 0xa6, 0x27, 0x90, 0x3c, 0x86, 0x80, 0x1c, 0x3e, 0x41, 0x4f, 0x02, 0xbe,
+ 0xe1, 0x7a, 0x28, 0x3e, 0xef, 0xf7, 0x96, 0xbd, 0x0f, 0x11, 0xd3, 0x3d, 0xd9,
+ 0x11, 0x00, 0x3e, 0x77, 0x16, 0x98, 0x3d, 0x6a, 0xbc, 0x03, 0xbe, 0xbc, 0x2b,
+ 0xc9, 0xbd, 0xc0, 0xc5, 0x99, 0x3d, 0xf4, 0x17, 0xc9, 0x3d, 0x37, 0xc7, 0xea,
+ 0x3d, 0xd0, 0x01, 0x29, 0xbe, 0xae, 0xfd, 0x37, 0xbd, 0x7a, 0xce, 0xba, 0xbc,
+ 0x7d, 0x16, 0x19, 0x3e, 0x2b, 0x5f, 0x32, 0x3a, 0x54, 0x01, 0x96, 0xbd, 0xd6,
+ 0xb6, 0x73, 0x3c, 0x8f, 0x5c, 0xa9, 0x3c, 0x67, 0x4e, 0xac, 0x3d, 0x52, 0x49,
+ 0xab, 0x3d, 0x05, 0x07, 0x29, 0x3e, 0x43, 0x4c, 0x28, 0xbe, 0x0c, 0x1a, 0x12,
+ 0xbe, 0x05, 0x18, 0x3c, 0x3c, 0x29, 0x0f, 0x22, 0x3e, 0xf3, 0x49, 0x54, 0x3e,
+ 0xbf, 0xcd, 0x46, 0x3d, 0xea, 0x9f, 0x53, 0x3d, 0xf6, 0xcc, 0xb5, 0x3d, 0x80,
+ 0x51, 0x9e, 0x3d, 0xff, 0xc1, 0x69, 0x3d, 0x94, 0x19, 0x41, 0xbd, 0x7b, 0x33,
+ 0x75, 0x3c, 0x9e, 0x51, 0x2f, 0x3e, 0x58, 0x6e, 0x21, 0x3c, 0x46, 0x38, 0x22,
+ 0x3e, 0x73, 0xf9, 0x15, 0xbe, 0xfa, 0x12, 0x04, 0xbe, 0xaf, 0x1d, 0x1e, 0xbe,
+ 0xad, 0x03, 0x11, 0xbe, 0xb3, 0xa7, 0x07, 0x3d, 0x4b, 0x76, 0x58, 0xbd, 0x68,
+ 0xaa, 0x21, 0xbe, 0x18, 0xb3, 0x24, 0xbe, 0x59, 0xa7, 0x9d, 0xbd, 0x8a, 0x64,
+ 0x92, 0x3d, 0xf4, 0xe8, 0x00, 0xbe, 0xed, 0xd4, 0x85, 0x3c, 0x77, 0x84, 0xf0,
+ 0xbd, 0x3f, 0x0d, 0x37, 0x3e, 0x2c, 0x42, 0x64, 0x3c, 0x5b, 0x23, 0x27, 0x3e,
+ 0x3e, 0xc6, 0xb0, 0x3d, 0x1c, 0xba, 0xfe, 0xbc, 0xcf, 0xde, 0xb4, 0xbc, 0x97,
+ 0x05, 0x1c, 0xbd, 0x0d, 0xa5, 0x92, 0xbb, 0x6a, 0x79, 0x50, 0x3e, 0x62, 0x30,
+ 0x19, 0x3e, 0xd7, 0x23, 0x02, 0x3e, 0x9d, 0xc1, 0x7e, 0x3d, 0xb5, 0x03, 0x9c,
+ 0xbd, 0x7b, 0xc5, 0x72, 0x3d, 0xc3, 0xd4, 0x22, 0xbe, 0x55, 0x27, 0x63, 0x3d,
+ 0xb7, 0x8f, 0x2e, 0xbe, 0x18, 0xe1, 0xbd, 0xbd, 0xa9, 0x10, 0xf0, 0xbd, 0x51,
+ 0xd4, 0x4d, 0x3d, 0x62, 0x08, 0xe2, 0x3d, 0x3b, 0xf4, 0x5e, 0x3d, 0xa1, 0xeb,
+ 0xb4, 0x3d, 0xed, 0x6f, 0x72, 0x3d, 0x1c, 0x3b, 0xba, 0xbd, 0x56, 0xa6, 0xc8,
+ 0xbd, 0x1e, 0x39, 0x3b, 0xbe, 0x83, 0xc7, 0xb4, 0x3d, 0x04, 0xe6, 0xd6, 0x3d,
+ 0x2a, 0x2c, 0x91, 0x3d, 0x78, 0x72, 0x9f, 0x3d, 0x62, 0xf9, 0xdd, 0xbd, 0x21,
+ 0x97, 0x28, 0xbe, 0x52, 0xaa, 0x06, 0x3e, 0x55, 0x9e, 0x26, 0xbe, 0xb0, 0x2a,
+ 0x4f, 0xbd, 0x72, 0x66, 0xeb, 0x3c, 0xa8, 0x84, 0xed, 0x3d, 0x02, 0xca, 0xaf,
+ 0xbd, 0xbd, 0x90, 0x64, 0xbd, 0x91, 0xd5, 0x81, 0xbd, 0xcd, 0x4a, 0x24, 0x3e,
+ 0x57, 0x13, 0x44, 0xbd, 0x35, 0x93, 0x1b, 0xbb, 0x9e, 0x75, 0xe0, 0x3d, 0x86,
+ 0xfb, 0x25, 0xbe, 0x7a, 0xe1, 0xe5, 0x3d, 0x15, 0x97, 0x28, 0x3d, 0xa5, 0x78,
+ 0xe4, 0x3d, 0x22, 0xf8, 0x0d, 0x3d, 0x18, 0xbb, 0xcb, 0xbc, 0xfc, 0x53, 0x99,
+ 0xbd, 0xd5, 0x40, 0xcc, 0xbd, 0x2e, 0x47, 0xf6, 0x3d, 0xd0, 0x5c, 0x1c, 0xbb,
+ 0xac, 0x38, 0xb3, 0x3c, 0x25, 0xfd, 0x8e, 0x3c, 0xd0, 0xc9, 0x4c, 0xbd, 0x37,
+ 0xc4, 0xfe, 0xbd, 0x1d, 0xca, 0x17, 0xbe, 0x54, 0x50, 0x8f, 0xbd, 0xc1, 0xfb,
+ 0xed, 0xbd, 0xb9, 0x2f, 0x24, 0x3e, 0xc0, 0x6d, 0x1c, 0xbe, 0xe2, 0xd7, 0x95,
+ 0x3d, 0x21, 0xa6, 0x7c, 0x3d, 0x1b, 0x02, 0x3c, 0x3d, 0xc6, 0x73, 0x4b, 0x3d,
+ 0x28, 0x7a, 0xcf, 0x3d, 0x6c, 0x4f, 0xf5, 0x3c, 0x0a, 0x47, 0x88, 0xbd, 0xe1,
+ 0xc9, 0x39, 0xbe, 0x0d, 0x2d, 0x04, 0x3c, 0x80, 0xf8, 0xd7, 0xbb, 0x8e, 0xa6,
+ 0xf3, 0xbd, 0x10, 0x3c, 0xe1, 0x3d, 0xde, 0x10, 0xb2, 0xbd, 0x9c, 0x3f, 0x46,
+ 0xbd, 0xd4, 0x42, 0x01, 0x3e, 0x63, 0x0f, 0x82, 0x3d, 0xab, 0x71, 0xe9, 0xbd,
+ 0x06, 0xe4, 0x11, 0x3e, 0x12, 0x15, 0x0a, 0xbe, 0x46, 0x0a, 0x5a, 0xbd, 0x83,
+ 0xff, 0x9a, 0xbc, 0xe4, 0x96, 0xdc, 0xbd, 0xc7, 0xaf, 0x7a, 0x3d, 0x64, 0x84,
+ 0xbe, 0x3d, 0x90, 0x0c, 0x04, 0xbd, 0xb4, 0x26, 0xb1, 0xbc, 0x35, 0xf6, 0x23,
+ 0x3e, 0x81, 0x0c, 0x89, 0xbd, 0x8a, 0xe7, 0xd7, 0xbc, 0x3b, 0xce, 0xa5, 0x3d,
+ 0xc1, 0x40, 0x83, 0x3d, 0x44, 0x14, 0x9a, 0x3d, 0xeb, 0x57, 0xbe, 0x3c, 0xde,
+ 0x7c, 0x01, 0x3d, 0xa0, 0x13, 0xe4, 0xbc, 0x54, 0xae, 0xca, 0x3d, 0x9d, 0xd5,
+ 0xc7, 0x3b, 0x59, 0x7b, 0xfc, 0xbd, 0xae, 0x12, 0x00, 0x3e, 0x79, 0xac, 0x07,
+ 0x3e, 0x40, 0x9b, 0x83, 0xbd, 0x7b, 0xb9, 0xeb, 0xbb, 0x12, 0x58, 0xf6, 0x3d,
+ 0x10, 0x80, 0x8c, 0xbd, 0x73, 0x18, 0xc8, 0xbd, 0x5e, 0x85, 0xbc, 0xbd, 0xf4,
+ 0x7c, 0xd0, 0xbd, 0x3b, 0x06, 0x66, 0xbd, 0x88, 0xaf, 0x82, 0xbc, 0x43, 0x81,
+ 0x80, 0x3d, 0x03, 0x7a, 0x20, 0x3e, 0xc1, 0x44, 0xd1, 0x3c, 0x2f, 0xa0, 0x76,
+ 0x3d, 0x63, 0x3e, 0x06, 0x3c, 0x80, 0xb6, 0xa4, 0x3d, 0x6d, 0x3d, 0x20, 0x3e,
+ 0xee, 0xe4, 0xb3, 0x3d, 0x3f, 0xb3, 0xfc, 0x3c, 0x66, 0x46, 0x52, 0x3e, 0x93,
+ 0x86, 0x14, 0xbd, 0x1f, 0x77, 0x8e, 0xbd, 0x99, 0x66, 0x88, 0x3c, 0xbb, 0xb7,
+ 0xc1, 0x3d, 0x30, 0x43, 0xcd, 0xbd, 0xd6, 0x81, 0xbe, 0x39, 0x60, 0x9d, 0x21,
+ 0xbe, 0x77, 0xb4, 0x16, 0x3e, 0x50, 0x6b, 0x88, 0xbb, 0xbe, 0x2a, 0xe1, 0xbc,
+ 0x7e, 0xfb, 0x13, 0xbe, 0x04, 0xd2, 0x01, 0x3e, 0xd7, 0xf2, 0xfb, 0xbd, 0xa1,
+ 0x97, 0xa5, 0x3d, 0x51, 0xb1, 0x1d, 0x3e, 0xa6, 0xe9, 0x11, 0x3e, 0x28, 0xe3,
+ 0xb0, 0xbc, 0xd6, 0xd7, 0xcf, 0xbd, 0xf7, 0x89, 0x10, 0x3e, 0x2d, 0x9d, 0x0b,
+ 0xbe, 0x08, 0x0a, 0x0e, 0xbd, 0xc7, 0x1e, 0x08, 0x3d, 0x18, 0x40, 0xad, 0xbd,
+ 0xef, 0x48, 0x05, 0xbd, 0xf6, 0xc0, 0x23, 0xbe, 0xf6, 0x7d, 0xa6, 0x3d, 0x05,
+ 0xb5, 0x6c, 0x3d, 0x7f, 0x05, 0xd4, 0xbd, 0xd5, 0x2a, 0x1f, 0x3e, 0x60, 0x90,
+ 0xee, 0xbd, 0x82, 0x03, 0x26, 0xbd, 0x27, 0x9d, 0x05, 0xbd, 0x2d, 0x05, 0x9c,
+ 0x3c, 0xa0, 0x72, 0xef, 0x3d, 0x4a, 0xd9, 0xad, 0x3d, 0x9f, 0x2a, 0x46, 0xbd,
+ 0x47, 0x6e, 0xfb, 0xbc, 0x43, 0x4b, 0xde, 0xbd, 0xf0, 0x40, 0x97, 0x3d, 0xd9,
+ 0xf7, 0xe1, 0xbd, 0xbd, 0xae, 0xce, 0x3c, 0x79, 0xae, 0x8c, 0xbd, 0x34, 0xc9,
+ 0x34, 0xbe, 0x99, 0x0a, 0xae, 0xbd, 0xae, 0xe2, 0xe9, 0x3d, 0xe7, 0x97, 0xf7,
+ 0x3d, 0xd1, 0x30, 0x05, 0x3e, 0x14, 0xd3, 0x0c, 0x3d, 0xcd, 0x90, 0x63, 0x3d,
+ 0x50, 0xac, 0x27, 0xbd, 0x06, 0x6c, 0x30, 0xbe, 0x31, 0x20, 0xa1, 0xbd, 0xf3,
+ 0x98, 0x87, 0x3d, 0x31, 0x34, 0xac, 0xbd, 0x2e, 0xc3, 0xb3, 0xbb, 0xec, 0xb6,
+ 0x4d, 0xbd, 0x6f, 0x2c, 0x02, 0xbc, 0xcc, 0xcb, 0x80, 0xbd, 0x7b, 0x15, 0x29,
+ 0xbe, 0x8f, 0xb6, 0x8b, 0x3c, 0xca, 0x8b, 0x51, 0xbd, 0x64, 0x5f, 0x45, 0xbd,
+ 0x0f, 0xa3, 0xa4, 0x3d, 0xed, 0x79, 0x9c, 0xbd, 0x31, 0xa0, 0xbb, 0x3d, 0xe9,
+ 0x06, 0x26, 0x3e, 0x85, 0x78, 0x21, 0x3e, 0x81, 0x35, 0xcd, 0xbd, 0x05, 0x31,
+ 0x11, 0xbe, 0x9d, 0x19, 0xde, 0xbd, 0x9a, 0xd3, 0x11, 0xbe, 0x58, 0xa7, 0xff,
+ 0xbc, 0x9f, 0x4a, 0x29, 0x3d, 0xda, 0x56, 0x8c, 0xbc, 0xf6, 0xf9, 0x79, 0x3d,
+ 0x11, 0xbe, 0x82, 0x3d, 0xda, 0x43, 0x04, 0x3e, 0xed, 0xce, 0xe1, 0x3d, 0x3a,
+ 0x95, 0x3a, 0x3d, 0x56, 0x31, 0x4e, 0x3d, 0x82, 0x65, 0xbd, 0x3b, 0x4c, 0x6f,
+ 0xa8, 0xbc, 0xa4, 0xa1, 0x25, 0xbc, 0xad, 0x79, 0x2f, 0xbe, 0x73, 0xac, 0x2b,
+ 0x3e, 0x2d, 0x80, 0x3f, 0xbd, 0x97, 0xee, 0x80, 0xbd, 0xd8, 0x02, 0x77, 0x3d,
+ 0xb2, 0xcb, 0x9b, 0x3d, 0x7c, 0x94, 0xc9, 0xbd, 0xce, 0xd1, 0xdd, 0x3d, 0x12,
+ 0xef, 0x8b, 0x3d, 0x3a, 0xbe, 0x08, 0x3e, 0x73, 0x80, 0x1d, 0xbe, 0x2f, 0xdb,
+ 0x2d, 0xbe, 0x58, 0x7d, 0xd7, 0xbd, 0x44, 0x0f, 0xae, 0x3d, 0xd6, 0xe7, 0x3d,
+ 0x3e, 0xe0, 0x3a, 0xad, 0x3c, 0x7b, 0x10, 0x19, 0x3e, 0x1b, 0x4e, 0x78, 0xbd,
+ 0x3f, 0xf3, 0x07, 0xbe, 0x8c, 0xcc, 0xf7, 0xbd, 0x5a, 0x20, 0xb9, 0xbd, 0x53,
+ 0x04, 0x34, 0x3d, 0x6b, 0xcf, 0x24, 0x3e, 0x32, 0x1b, 0xc2, 0xbd, 0x92, 0x01,
+ 0xee, 0x3c, 0x79, 0x75, 0xd8, 0xbd, 0xdf, 0x4b, 0x0a, 0x3c, 0xf3, 0x93, 0xce,
+ 0x3d, 0x76, 0xf7, 0x31, 0xbd, 0xd7, 0x71, 0x17, 0xbe, 0xac, 0xed, 0x1f, 0xbe,
+ 0xb5, 0x4d, 0x46, 0x3d, 0xb0, 0xb9, 0x0b, 0xbe, 0x02, 0xb8, 0x9f, 0x3d, 0x7d,
+ 0x42, 0x28, 0xbe, 0x65, 0x07, 0xc7, 0x3d, 0xb2, 0xd4, 0xb5, 0x3d, 0x28, 0x07,
+ 0xd3, 0x3c, 0x55, 0x93, 0x2c, 0xbe, 0x79, 0x7c, 0x29, 0x3e, 0x59, 0x10, 0x0a,
+ 0xbe, 0x9d, 0x0a, 0x08, 0xbd, 0xa3, 0x61, 0x5d, 0x3d, 0xf8, 0xb5, 0xde, 0xbb,
+ 0x54, 0x24, 0xa7, 0x3d, 0xe3, 0xe4, 0x32, 0xbe, 0x20, 0x3b, 0x3d, 0xbe, 0x48,
+ 0x67, 0xc2, 0xbd, 0x3c, 0x7b, 0x2b, 0xbd, 0x69, 0xee, 0x56, 0xbd, 0xa9, 0x90,
+ 0xcb, 0x3d, 0xff, 0xf1, 0xa7, 0xbd, 0xa9, 0xd8, 0x43, 0xbd, 0xb8, 0xcd, 0xb7,
+ 0x3c, 0xcd, 0xfb, 0xbb, 0x3d, 0xd6, 0x26, 0x8a, 0xbd, 0x45, 0xa4, 0x81, 0x3d,
+ 0xd2, 0xc9, 0x29, 0x3e, 0xdb, 0xf4, 0xdd, 0xbd, 0x93, 0x95, 0xa9, 0x3d, 0x11,
+ 0xbb, 0x12, 0x3e, 0xdf, 0xf4, 0xcd, 0xbd, 0xb9, 0xde, 0x82, 0x3c, 0xdf, 0x26,
+ 0x76, 0x3d, 0xb6, 0x47, 0x32, 0xbe, 0x91, 0x0f, 0x6f, 0x3b, 0x56, 0x16, 0x4c,
+ 0xbe, 0x77, 0x77, 0x00, 0xbe, 0x2c, 0x1f, 0xd1, 0xbd, 0xf6, 0x43, 0x12, 0x3e,
+ 0xd8, 0x7c, 0x16, 0x3e, 0x26, 0xec, 0x0c, 0xbe, 0xaf, 0x69, 0xe0, 0x3d, 0x5a,
+ 0x3b, 0xdf, 0x3d, 0xbb, 0x0f, 0x99, 0x3d, 0xe2, 0x32, 0x2b, 0xbd, 0xf3, 0x1e,
+ 0x1d, 0x3e, 0x9e, 0xdc, 0xf3, 0x3c, 0x77, 0x8b, 0xf7, 0xbd, 0x46, 0xb5, 0x48,
+ 0xbc, 0x28, 0xce, 0xbd, 0x3c, 0x22, 0x68, 0x1a, 0x3e, 0x92, 0x40, 0xf0, 0x3c,
+ 0x35, 0xf1, 0xbe, 0xbd, 0x8d, 0xed, 0xd0, 0x3d, 0x93, 0x67, 0x5e, 0xbd, 0xc8,
+ 0xa3, 0xb0, 0xbd, 0x83, 0x61, 0x2f, 0x3d, 0x39, 0xce, 0x81, 0x3b, 0xa5, 0x87,
+ 0x1d, 0x3e, 0xe0, 0x8f, 0x38, 0x3c, 0xce, 0x6f, 0x26, 0x3d, 0x09, 0x7f, 0x9a,
+ 0x3d, 0x6c, 0x04, 0x8f, 0xbd, 0x31, 0x13, 0x9c, 0xbb, 0xab, 0xbc, 0x3f, 0xbd,
+ 0xe1, 0x11, 0xc2, 0xbd, 0x47, 0xa8, 0x3a, 0x3d, 0x76, 0xc5, 0x0b, 0xbe, 0x0d,
+ 0x71, 0xff, 0x3d, 0x30, 0x8e, 0x41, 0x3d, 0xdc, 0xf6, 0x2d, 0xbe, 0x1a, 0x84,
+ 0x1f, 0x3d, 0xe2, 0xd4, 0x09, 0x3e, 0xe7, 0x1f, 0x1d, 0xbd, 0x20, 0x25, 0x26,
+ 0x3d, 0x68, 0x8f, 0x61, 0x3d, 0xe7, 0xdf, 0x1f, 0xbe, 0xad, 0x57, 0x1b, 0xbe,
+ 0x3e, 0xec, 0x1b, 0xbe, 0x6f, 0xe4, 0x09, 0xbe, 0x87, 0x7d, 0xb5, 0xbc, 0xce,
+ 0x89, 0x07, 0x3d, 0x8a, 0x34, 0xbe, 0x3b, 0x7a, 0x7d, 0x24, 0x3e, 0xde, 0xc8,
+ 0xfa, 0x3d, 0xa4, 0xc7, 0x9e, 0xbd, 0x5b, 0x97, 0xf0, 0xbd, 0x16, 0xf7, 0x3b,
+ 0xbe, 0x91, 0xad, 0x27, 0x3e, 0x06, 0x69, 0xf3, 0xbd, 0x6d, 0xb9, 0xe6, 0xbd,
+ 0xfc, 0xa1, 0x33, 0x3e, 0x73, 0x47, 0xd4, 0xbd, 0xd1, 0x35, 0xc0, 0x3d, 0x74,
+ 0x47, 0x12, 0x3d, 0x2d, 0x04, 0x23, 0x3d, 0xfc, 0xc6, 0x1b, 0x3d, 0x75, 0x18,
+ 0x0e, 0xbe, 0xa5, 0x96, 0x55, 0x3c, 0xb8, 0x10, 0xad, 0xbc, 0x93, 0x9b, 0xde,
+ 0xbd, 0x9f, 0xa2, 0xf4, 0x3d, 0xb8, 0x21, 0xf6, 0xba, 0xd7, 0x96, 0x09, 0xbd,
+ 0x2a, 0x6c, 0xd9, 0xbd, 0xb1, 0x32, 0x45, 0x3d, 0xc0, 0x16, 0x94, 0xbd, 0x78,
+ 0xac, 0x97, 0xbd, 0x97, 0xd4, 0xdf, 0xbd, 0x68, 0x97, 0x36, 0xbd, 0x28, 0xce,
+ 0x2f, 0x3d, 0x12, 0x02, 0x3d, 0xbd, 0x5b, 0x8f, 0x23, 0x3d, 0xf5, 0xc3, 0xda,
+ 0xba, 0xa6, 0x72, 0x41, 0x3e, 0x27, 0xa9, 0xcd, 0xbd, 0x9c, 0x9a, 0x3c, 0x3d,
+ 0xf2, 0x7f, 0x45, 0x3e, 0x1c, 0x9f, 0x40, 0x3e, 0xa9, 0xdf, 0x74, 0x3c, 0x6a,
+ 0x72, 0x6e, 0xbd, 0x46, 0x83, 0xa5, 0x3d, 0x3b, 0x67, 0x6c, 0x3c, 0xfc, 0x84,
+ 0x2a, 0x3d, 0x3c, 0xf4, 0x35, 0x3e, 0xb4, 0x2c, 0x79, 0xbd, 0x43, 0xb9, 0xd6,
+ 0x3d, 0xe6, 0xae, 0x13, 0xbd, 0xeb, 0x77, 0xd0, 0xbd, 0x31, 0x51, 0xbe, 0x3d,
+ 0x5f, 0x2e, 0x23, 0x3c, 0x7a, 0xbe, 0x15, 0x3e, 0x4b, 0x59, 0xdc, 0xbd, 0xa0,
+ 0x8f, 0xe7, 0xbd, 0x76, 0xa8, 0xf3, 0xbd, 0x88, 0x1c, 0x74, 0x3d, 0x85, 0x4d,
+ 0xdd, 0xbd, 0x45, 0x96, 0x36, 0xbd, 0xe8, 0x39, 0x98, 0x3d, 0xbe, 0x82, 0xf9,
+ 0x3d, 0x1d, 0xdb, 0x2d, 0x3b, 0x6f, 0xac, 0x63, 0xbd, 0x8c, 0xc8, 0xe1, 0xbd,
+ 0xcf, 0x49, 0x73, 0xbd, 0x8a, 0xdd, 0xe3, 0xbd, 0xf8, 0x00, 0x19, 0xbd, 0x17,
+ 0xe8, 0xdf, 0xbd, 0xba, 0x22, 0x5b, 0x3c, 0xf1, 0x54, 0x21, 0xbe, 0x7b, 0x38,
+ 0x58, 0xbd, 0x48, 0x88, 0x67, 0xbd, 0x5e, 0xe2, 0x6c, 0x3d, 0xa5, 0x44, 0x20,
+ 0xbe, 0x69, 0x7f, 0xbf, 0xbc, 0x7c, 0xfa, 0x25, 0x3e, 0xc1, 0xd9, 0xd5, 0xbd,
+ 0x46, 0x87, 0x75, 0xbd, 0x13, 0x1c, 0x01, 0xbd, 0xe5, 0xc3, 0x19, 0xbb, 0x2d,
+ 0xc8, 0x30, 0xbe, 0xad, 0xd8, 0xf2, 0x3d, 0xd9, 0x37, 0x14, 0xbd, 0xd2, 0xb5,
+ 0x9a, 0x3d, 0xf4, 0x37, 0x8d, 0x3c, 0x2f, 0x8f, 0xc0, 0x3d, 0x8e, 0xe9, 0xc5,
+ 0xbd, 0xf5, 0x4d, 0x21, 0xbe, 0xfd, 0x9a, 0xaa, 0xbd, 0x91, 0xb6, 0x00, 0xbe,
+ 0xf0, 0x0d, 0xbf, 0x3c, 0xe4, 0x94, 0xed, 0x3d, 0x64, 0xbe, 0x8d, 0x3c, 0x27,
+ 0xcf, 0x2f, 0x3e, 0x22, 0xa5, 0xf1, 0x3d, 0x96, 0xf2, 0xbf, 0xbd, 0x62, 0xde,
+ 0xe5, 0xbd, 0x4b, 0x4a, 0x89, 0x3d, 0x7a, 0x3c, 0x1d, 0x3e, 0xfc, 0x83, 0xab,
+ 0xbc, 0x0f, 0x00, 0x2e, 0xbe, 0xd5, 0xd1, 0x93, 0x3d, 0x32, 0x51, 0xca, 0xbd,
+ 0x27, 0x77, 0x31, 0xbd, 0x6e, 0xe6, 0xe2, 0x3d, 0xdd, 0xb0, 0x03, 0xbe, 0xd7,
+ 0xec, 0xe5, 0xbd, 0x97, 0x8e, 0x82, 0x3b, 0x7b, 0xaf, 0x03, 0xbe, 0xbe, 0x24,
+ 0xc3, 0x3d, 0x1e, 0x4c, 0x51, 0x3e, 0x07, 0x32, 0x10, 0x3e, 0xac, 0xdb, 0x01,
+ 0xbe, 0xef, 0x14, 0x38, 0x3e, 0x1b, 0xbb, 0x73, 0x3d, 0x6a, 0x42, 0x35, 0xbd,
+ 0x79, 0x72, 0x13, 0xbe, 0x05, 0x8c, 0xe9, 0x3d, 0xc1, 0x57, 0xe5, 0x3b, 0x50,
+ 0x38, 0x71, 0x3d, 0x47, 0xb5, 0xe4, 0xbd, 0x0f, 0x18, 0x01, 0xbe, 0xd6, 0x1c,
+ 0x76, 0x3b, 0x99, 0x36, 0x1c, 0xbe, 0x6d, 0xee, 0x1a, 0x3d, 0x2d, 0xcb, 0x39,
+ 0xbd, 0xc0, 0x54, 0x24, 0x3e, 0xcb, 0x5b, 0xfb, 0x3c, 0x8d, 0xc8, 0x85, 0x3a,
+ 0x10, 0xcb, 0xd6, 0x3c, 0xfd, 0x81, 0xd8, 0x3c, 0xc7, 0xab, 0x1b, 0xba, 0xf5,
+ 0xe1, 0xb5, 0xbd, 0x7a, 0x09, 0xfc, 0x3d, 0x98, 0x7b, 0x6b, 0xbd, 0x31, 0x74,
+ 0x46, 0xbe, 0x13, 0x26, 0x02, 0x3e, 0x67, 0x37, 0x03, 0xbe, 0x68, 0x29, 0xc4,
+ 0xbd, 0x8a, 0xc5, 0x8b, 0xbd, 0x50, 0x23, 0x22, 0xbc, 0x6d, 0x99, 0xf5, 0x3d,
+ 0x01, 0x6c, 0xc5, 0xbd, 0xd6, 0xce, 0x14, 0xbe, 0x29, 0xd4, 0xef, 0xbd, 0x7c,
+ 0xe1, 0x8b, 0x3c, 0x8f, 0x04, 0xd6, 0xbc, 0x29, 0xf1, 0x60, 0x3c, 0x02, 0x1a,
+ 0x2c, 0x3b, 0x76, 0x21, 0x00, 0xbe, 0x16, 0x98, 0x66, 0xbd, 0x2a, 0x64, 0x3f,
+ 0xbd, 0xbf, 0x81, 0x24, 0x3d, 0x30, 0x34, 0x27, 0x3e, 0x90, 0xee, 0x9b, 0x3d,
+ 0xe1, 0x6c, 0xdd, 0x3c, 0x25, 0x40, 0x25, 0x3e, 0xc0, 0x85, 0x57, 0x3b, 0x16,
+ 0xa8, 0x4f, 0x3e, 0xa9, 0xfb, 0x48, 0xbd, 0x38, 0x1c, 0xf8, 0x3b, 0x7a, 0x4a,
+ 0xb0, 0xbd, 0x29, 0xe7, 0xf3, 0xbd, 0xa5, 0x5c, 0x42, 0x3d, 0xab, 0x54, 0x09,
+ 0x3e, 0x94, 0x68, 0x75, 0x3d, 0x24, 0x37, 0x03, 0xbe, 0x4e, 0xba, 0x09, 0x3e,
+ 0x16, 0xba, 0x09, 0x3e, 0xbd, 0x97, 0x00, 0xbe, 0x92, 0xe4, 0x95, 0xbd, 0x74,
+ 0xf5, 0x9f, 0xbd, 0x40, 0x16, 0x81, 0x3d, 0x83, 0x4c, 0x26, 0x3e, 0x61, 0xd1,
+ 0x25, 0x3e, 0xfb, 0x74, 0x1d, 0xbe, 0x9b, 0x9f, 0x0f, 0x3d, 0xe8, 0x7e, 0x10,
+ 0x3d, 0x9e, 0xb0, 0x15, 0x3d, 0x34, 0xe6, 0xee, 0x3d, 0xaf, 0xef, 0xf0, 0xbb,
+ 0xaa, 0x06, 0x24, 0xbe, 0x43, 0x5e, 0xdb, 0x3d, 0x10, 0xd8, 0xa4, 0x3d, 0x6e,
+ 0xc9, 0x0c, 0xbd, 0x1c, 0xfe, 0xa9, 0x3d, 0xf0, 0xf3, 0x31, 0x3d, 0x38, 0xf5,
+ 0x7e, 0xba, 0x24, 0x31, 0xe0, 0x3d, 0x6e, 0xf2, 0xa2, 0x3d, 0xbe, 0x8b, 0xd4,
+ 0xbd, 0x65, 0xc3, 0x25, 0x3c, 0xa3, 0xde, 0x67, 0xba, 0x41, 0xe9, 0x13, 0xbe,
+ 0x83, 0xd0, 0x02, 0xbd, 0x8b, 0x91, 0x3a, 0x3d, 0x29, 0x20, 0x4c, 0xbc, 0xfc,
+ 0x3f, 0xcd, 0xbd, 0x5a, 0x01, 0xae, 0xbd, 0x6c, 0x48, 0x1e, 0xbe, 0xe0, 0x29,
+ 0x80, 0x3d, 0x18, 0x74, 0xa0, 0xbd, 0x2a, 0xeb, 0xbd, 0x39, 0x28, 0xe6, 0x2e,
+ 0xbe, 0x4b, 0x70, 0x59, 0x3d, 0xd7, 0xcf, 0xd7, 0xbc, 0x34, 0x77, 0xa5, 0x3c,
+ 0xef, 0x6d, 0x58, 0xbb, 0x31, 0xcc, 0xde, 0xbb, 0xf6, 0xe6, 0xc2, 0xbd, 0x8b,
+ 0xee, 0x14, 0x3e, 0xf3, 0x70, 0x12, 0xbe, 0x88, 0x93, 0xae, 0xbd, 0x57, 0xd4,
+ 0xfc, 0x3d, 0x48, 0x74, 0x36, 0x3e, 0xb5, 0xcb, 0x08, 0xbe, 0x32, 0x08, 0xbe,
+ 0xbd, 0x95, 0xe2, 0x2e, 0xbd, 0x6c, 0xa0, 0xc3, 0x3d, 0x83, 0xdb, 0xc4, 0x3a,
+ 0xc8, 0x25, 0xf0, 0x3d, 0x8a, 0x78, 0x0f, 0x3e, 0xed, 0xd4, 0x02, 0xbc, 0xd4,
+ 0x18, 0xad, 0xbd, 0x70, 0x10, 0xbf, 0xbd, 0x9f, 0x8e, 0x1c, 0xbe, 0x41, 0xdf,
+ 0xf2, 0x3d, 0x20, 0x72, 0x45, 0x3d, 0x7f, 0x52, 0x16, 0xbe, 0xd7, 0xf4, 0x25,
+ 0xbe, 0x6d, 0x3f, 0x3d, 0x3e, 0xd4, 0xb0, 0x26, 0xbe, 0x23, 0x8c, 0x87, 0x3d,
+ 0x6c, 0x4e, 0xb9, 0xbc, 0x67, 0x6c, 0x44, 0x3c, 0x35, 0x7b, 0xde, 0x3d, 0x19,
+ 0x66, 0xd7, 0x3d, 0x1c, 0xc9, 0xc2, 0x3d, 0xf1, 0xee, 0xba, 0xbd, 0xa3, 0xe1,
+ 0xc8, 0x3d, 0xf5, 0xf9, 0x82, 0x3c, 0x3d, 0x0e, 0x81, 0x3d, 0xea, 0xc7, 0x5d,
+ 0x3d, 0x19, 0x63, 0x25, 0x3e, 0x59, 0x2f, 0x13, 0xbd, 0xf2, 0x44, 0xeb, 0x3d,
+ 0xf0, 0xb5, 0xf1, 0xbc, 0x85, 0x77, 0x03, 0x3d, 0xda, 0x66, 0x11, 0xbd, 0xef,
+ 0xae, 0x1b, 0x3d, 0xe1, 0x4f, 0x94, 0xbd, 0x25, 0x17, 0x56, 0xbd, 0x74, 0x34,
+ 0x0c, 0x3e, 0xf8, 0x12, 0x88, 0x3d, 0x96, 0x08, 0x97, 0xbd, 0x04, 0xb9, 0x75,
+ 0xbc, 0x72, 0x9f, 0x8e, 0x3d, 0x0d, 0xf3, 0x7d, 0xbd, 0x51, 0xe7, 0x56, 0xbc,
+ 0x93, 0x6d, 0x08, 0xbe, 0xa7, 0xd8, 0x09, 0x3e, 0x80, 0xd5, 0xa8, 0xbd, 0x40,
+ 0x03, 0xd1, 0x3c, 0xe2, 0x44, 0x1f, 0xbd, 0x3e, 0x1f, 0xd6, 0xbd, 0x9f, 0x62,
+ 0xe7, 0x3c, 0xf7, 0x6d, 0xae, 0xbd, 0xf4, 0x14, 0xf6, 0x3a, 0x54, 0x99, 0xea,
+ 0x3b, 0x9c, 0xab, 0xf7, 0xbd, 0x74, 0x21, 0xdd, 0x3d, 0x87, 0x18, 0x95, 0xbd,
+ 0x49, 0x55, 0x0c, 0xbe, 0xd6, 0xdc, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20,
+ 0x01, 0x00, 0x00, 0x5a, 0xd4, 0xee, 0x3d, 0x38, 0x39, 0x64, 0x3e, 0x55, 0xb4,
+ 0x79, 0x3d, 0x1d, 0xa3, 0xb9, 0x3d, 0xb9, 0x79, 0xe0, 0x3b, 0x30, 0xff, 0xd1,
+ 0x3d, 0x7a, 0x3b, 0x2d, 0xbd, 0x18, 0x51, 0x07, 0xbe, 0x5c, 0x31, 0x3d, 0x3e,
+ 0x46, 0x0f, 0x51, 0xbe, 0x29, 0x32, 0x13, 0x3e, 0x7c, 0x11, 0xf3, 0xbd, 0x3a,
+ 0xbd, 0x4a, 0xbd, 0x56, 0xb3, 0xce, 0xbd, 0x37, 0xd0, 0xf6, 0x3d, 0xd5, 0x9b,
+ 0xd8, 0x3d, 0xa8, 0xbc, 0x5a, 0xbe, 0x1b, 0x22, 0x0e, 0xbc, 0x03, 0x98, 0xf9,
+ 0x3d, 0x64, 0xf4, 0x47, 0x3e, 0xa2, 0xb5, 0x2f, 0xbe, 0x70, 0x7a, 0x89, 0xbe,
+ 0x9c, 0x58, 0x60, 0x3e, 0x71, 0xac, 0x25, 0xbe, 0x17, 0x1c, 0x01, 0x3e, 0x48,
+ 0x73, 0x93, 0xbd, 0x0d, 0x92, 0xa3, 0x3d, 0xf1, 0xff, 0x62, 0xbe, 0x56, 0xe9,
+ 0x71, 0xbe, 0x09, 0xf7, 0x96, 0xbe, 0x91, 0x7a, 0x0a, 0x3e, 0xc1, 0x6d, 0x88,
+ 0x3c, 0x6c, 0xd0, 0x4f, 0xbe, 0x71, 0x75, 0x99, 0xbd, 0x7d, 0x92, 0x01, 0xbe,
+ 0x35, 0x21, 0x96, 0xbe, 0xd9, 0x0e, 0x2d, 0x3e, 0x63, 0x17, 0x8b, 0x3d, 0x53,
+ 0x6d, 0xb7, 0x3c, 0xb9, 0x06, 0x20, 0x3d, 0xdf, 0x56, 0x11, 0x3e, 0xc4, 0xcd,
+ 0xa9, 0x3c, 0x7d, 0x0a, 0x3b, 0x3e, 0xd6, 0x23, 0x7f, 0xbc, 0xaf, 0x06, 0xc4,
+ 0xbc, 0xe0, 0xe3, 0x63, 0xbd, 0x34, 0x50, 0x2a, 0x3e, 0x1f, 0xff, 0x4c, 0x3e,
+ 0x34, 0x98, 0x79, 0xbe, 0x4c, 0xbd, 0x18, 0x3e, 0x5b, 0x8b, 0x0f, 0x3e, 0x33,
+ 0x44, 0x34, 0xbd, 0xd6, 0xd7, 0x90, 0xbe, 0x51, 0x5e, 0x55, 0x3d, 0x46, 0x2b,
+ 0x54, 0xbe, 0xd8, 0x49, 0x30, 0xbe, 0x45, 0xb3, 0x72, 0xbe, 0x93, 0x18, 0xcd,
+ 0x3d, 0x86, 0xe1, 0x73, 0xbd, 0x94, 0x56, 0xf3, 0x3d, 0x0a, 0x54, 0xd7, 0xbd,
+ 0x01, 0xd9, 0x98, 0x3e, 0xd5, 0x11, 0x01, 0xbb, 0x69, 0x07, 0x62, 0xbe, 0x81,
+ 0x33, 0x03, 0xbb, 0x98, 0xf9, 0x9f, 0x3c, 0xe8, 0x77, 0x96, 0x3e, 0x3a, 0xc2,
+ 0x73, 0x3e, 0xa1, 0x45, 0x35, 0xbe, 0xea, 0x1c, 0x86, 0xbc, 0xad, 0x90, 0x45,
+ 0xbe, 0x0b, 0xd2, 0x03, 0x3d, 0x02, 0xde, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00,
+ 0x80, 0x00, 0x00, 0x00, 0xa1, 0xc6, 0xcd, 0xbe, 0x46, 0xa7, 0xbd, 0x3e, 0x7c,
+ 0xe3, 0x00, 0x3f, 0x13, 0x8d, 0xb6, 0xbe, 0x21, 0x72, 0x8b, 0x3e, 0x16, 0x68,
+ 0x68, 0x3e, 0x05, 0xb7, 0xb6, 0xbe, 0xa0, 0xd3, 0xd4, 0x3e, 0x98, 0x82, 0x83,
+ 0xbd, 0x8c, 0xb1, 0xe2, 0x3d, 0xd6, 0x94, 0x82, 0x3e, 0x07, 0x6a, 0x70, 0xbe,
+ 0x6b, 0x74, 0x0b, 0x3f, 0xd8, 0xf5, 0x3d, 0x3e, 0xfb, 0xf3, 0x19, 0xbd, 0x2c,
+ 0x72, 0xbf, 0x3e, 0xff, 0x95, 0x49, 0x3d, 0xee, 0x70, 0x78, 0x3e, 0xb0, 0x3f,
+ 0x58, 0x3d, 0x78, 0xea, 0x9d, 0xbe, 0x53, 0x1d, 0x15, 0x3f, 0x0d, 0xfc, 0xbe,
+ 0xbe, 0xad, 0x10, 0x07, 0xbf, 0xb4, 0x11, 0x87, 0xbe, 0x20, 0x92, 0x62, 0x3e,
+ 0x58, 0x61, 0xbd, 0x3e, 0xea, 0x54, 0x4a, 0xbd, 0xbd, 0x55, 0xce, 0xbe, 0x12,
+ 0x48, 0xa2, 0x3e, 0xe0, 0x74, 0x90, 0x3d, 0xce, 0x80, 0xf5, 0x3e, 0xa5, 0xb7,
+ 0x15, 0x3f, 0x8e, 0xde, 0xfe, 0xff, 0x04, 0x00, 0x00, 0x00, 0x00, 0x20, 0x01,
+ 0x00, 0x2c, 0xcf, 0x79, 0xbd, 0x8c, 0x37, 0x5a, 0xbc, 0x00, 0x4c, 0x6f, 0x3c,
+ 0x14, 0x0b, 0x8e, 0x3d, 0xa8, 0xc3, 0x12, 0x3c, 0x10, 0x9f, 0xa5, 0xbb, 0xe8,
+ 0x7e, 0x17, 0xbd, 0x43, 0x60, 0x74, 0xbd, 0xc6, 0x62, 0x6f, 0x3d, 0x88, 0x83,
+ 0x6c, 0xbd, 0xf7, 0xf2, 0x36, 0xbd, 0xb7, 0x11, 0x81, 0xbd, 0x69, 0x1c, 0x30,
+ 0xbd, 0xde, 0xd0, 0x4e, 0x3c, 0xa4, 0x9f, 0x6e, 0xbc, 0x06, 0xd8, 0xd6, 0xbc,
+ 0x21, 0x75, 0x5f, 0xbd, 0x68, 0x6f, 0x0c, 0xbc, 0xbd, 0x21, 0xcf, 0xbb, 0x20,
+ 0x31, 0xb0, 0x3b, 0x88, 0xa3, 0x32, 0x3c, 0xa0, 0xec, 0x56, 0x3d, 0x19, 0xfd,
+ 0xf8, 0x3c, 0x99, 0xd1, 0x75, 0x3d, 0x99, 0x54, 0x3d, 0x3c, 0x4d, 0x0f, 0x12,
+ 0x3b, 0x34, 0xf2, 0x37, 0xbd, 0xaa, 0x3b, 0x85, 0xbb, 0x23, 0xfe, 0xde, 0xbb,
+ 0x8a, 0xe4, 0x21, 0x3c, 0xbd, 0x46, 0x8d, 0x3d, 0xd8, 0xf0, 0x03, 0x3d, 0xfa,
+ 0xb6, 0xb6, 0x3c, 0xb8, 0x2e, 0xc9, 0xbc, 0xac, 0x52, 0x4a, 0xbd, 0xd2, 0x5d,
+ 0x00, 0x3c, 0x7d, 0x64, 0x6f, 0xbd, 0xe6, 0x47, 0x77, 0x3d, 0xe0, 0x29, 0xbe,
+ 0x3b, 0x5a, 0xb3, 0xee, 0xbc, 0x40, 0x76, 0xe3, 0xbb, 0x18, 0xf0, 0x8b, 0x3c,
+ 0xbc, 0x5f, 0x3a, 0x3d, 0x47, 0xdd, 0x08, 0x3d, 0x0b, 0xae, 0x39, 0xbc, 0xa1,
+ 0xca, 0xd9, 0xbc, 0xf8, 0x6b, 0x92, 0xbc, 0xf8, 0x2b, 0x42, 0x3d, 0xef, 0x4c,
+ 0x14, 0xbd, 0x64, 0xd7, 0x4b, 0xbd, 0x22, 0x18, 0x18, 0x3c, 0x20, 0xf8, 0x29,
+ 0xbd, 0x00, 0x5d, 0xdd, 0x3a, 0x56, 0x0c, 0x5f, 0xbd, 0x47, 0x5d, 0x84, 0xbd,
+ 0x5e, 0xea, 0xa1, 0x3c, 0xc4, 0x53, 0x89, 0xbd, 0x53, 0xde, 0x4d, 0xbc, 0xe7,
+ 0xc7, 0x88, 0xbc, 0x35, 0xef, 0x56, 0x3d, 0x45, 0x2c, 0xb4, 0x3c, 0xd8, 0x97,
+ 0x7b, 0xbd, 0x17, 0xec, 0x89, 0x3d, 0xe1, 0x90, 0x45, 0x3d, 0x89, 0xf2, 0x3f,
+ 0xbd, 0xf1, 0x11, 0xff, 0xbb, 0x1b, 0x6f, 0x03, 0xbd, 0xf7, 0xf7, 0x3d, 0x3b,
+ 0xc4, 0x7d, 0x91, 0x3c, 0x44, 0x07, 0x0b, 0x3d, 0x4a, 0xc0, 0x6f, 0x3d, 0x79,
+ 0x51, 0x8f, 0x3d, 0x66, 0x5e, 0x41, 0x3d, 0xf1, 0x9b, 0x8c, 0xbd, 0x38, 0xb9,
+ 0xca, 0x3c, 0xe3, 0xf8, 0xe8, 0x3c, 0xcd, 0xce, 0x8f, 0xbb, 0xe4, 0xe9, 0x6b,
+ 0x3c, 0x92, 0xd8, 0x39, 0x3d, 0xbe, 0x6d, 0x52, 0xbd, 0x38, 0xed, 0x4a, 0xbd,
+ 0x68, 0xd4, 0x28, 0xbc, 0x6f, 0x16, 0x67, 0xbd, 0xd7, 0x55, 0x8a, 0x3d, 0xe0,
+ 0x69, 0xb0, 0xbb, 0xfa, 0x9c, 0x93, 0xbd, 0x14, 0xe4, 0x21, 0x3d, 0x96, 0x1c,
+ 0x7b, 0x3d, 0x4c, 0x31, 0x34, 0x3c, 0xa8, 0x41, 0x5c, 0x3c, 0x90, 0xe5, 0x8c,
+ 0x3d, 0x11, 0x9f, 0x98, 0x3c, 0xf0, 0x3d, 0x16, 0x3d, 0x53, 0xd1, 0x91, 0xbd,
+ 0x50, 0xc5, 0xef, 0x3c, 0x25, 0x52, 0x83, 0x3c, 0x9e, 0xce, 0x1f, 0x3c, 0x91,
+ 0xa7, 0x0c, 0xbd, 0xb8, 0x95, 0x03, 0x3c, 0x7a, 0x4c, 0x35, 0x3d, 0x8e, 0xc4,
+ 0x44, 0x3d, 0x1c, 0x66, 0x2c, 0x3d, 0x00, 0x89, 0x40, 0xba, 0xe1, 0xa3, 0x83,
+ 0x3d, 0x68, 0xf2, 0x2b, 0xbd, 0x30, 0xd4, 0xde, 0x3b, 0xcf, 0xa1, 0xbc, 0x3c,
+ 0x24, 0x79, 0x39, 0xbd, 0xe5, 0xf4, 0xb7, 0xbc, 0x79, 0x8d, 0x25, 0x3c, 0x95,
+ 0xb6, 0x38, 0x3d, 0xd8, 0xc2, 0x74, 0x3c, 0xaa, 0x8e, 0x80, 0xbd, 0x0d, 0x74,
+ 0xf3, 0x3c, 0x73, 0x5b, 0x98, 0xbc, 0x00, 0x64, 0x5e, 0xbc, 0x44, 0x82, 0xcb,
+ 0x3c, 0x5a, 0x25, 0x53, 0xbd, 0xe2, 0xd0, 0x93, 0xbd, 0x3b, 0x7a, 0x77, 0xbd,
+ 0x93, 0x3e, 0xd4, 0x3c, 0x39, 0x81, 0x28, 0xbd, 0x54, 0xd5, 0xef, 0x3c, 0x6c,
+ 0x29, 0xe1, 0x3c, 0x69, 0xc8, 0x09, 0x3d, 0x83, 0xb3, 0x36, 0xbd, 0x90, 0xe1,
+ 0xd4, 0xbb, 0x95, 0xa7, 0x1a, 0xbd, 0x39, 0xf5, 0x2b, 0xbc, 0x0c, 0xdf, 0x64,
+ 0xbd, 0x74, 0xec, 0xdc, 0xbc, 0x20, 0xc6, 0x3b, 0x3d, 0x40, 0x20, 0x46, 0x3c,
+ 0x18, 0x09, 0x3f, 0xbd, 0x96, 0x4c, 0xdc, 0xbc, 0x98, 0x98, 0x8d, 0xbd, 0xb4,
+ 0xdd, 0x27, 0xbd, 0x74, 0x45, 0xbb, 0x3c, 0x49, 0xd9, 0x08, 0xbd, 0x8e, 0x06,
+ 0xa8, 0x3b, 0x91, 0x10, 0xb4, 0x3c, 0xf8, 0x58, 0xf3, 0xbc, 0x06, 0xe9, 0x5e,
+ 0x3d, 0x14, 0xc8, 0x26, 0x3d, 0xc5, 0xf7, 0x20, 0xbb, 0x6b, 0x78, 0xc0, 0x3c,
+ 0xae, 0x64, 0x7f, 0x3c, 0xbb, 0xbf, 0x8b, 0x3c, 0x82, 0x4e, 0x0c, 0xbd, 0xb0,
+ 0xd0, 0xdf, 0xbc, 0xfe, 0x53, 0x97, 0xbc, 0x8a, 0x9e, 0x24, 0xbd, 0xdf, 0x79,
+ 0x84, 0x3d, 0x7e, 0xff, 0x8e, 0xbd, 0x66, 0x7b, 0xda, 0x3c, 0xb0, 0xdd, 0x8d,
+ 0xbd, 0xab, 0x91, 0xbb, 0xbc, 0x23, 0x20, 0xb0, 0xbc, 0xbe, 0x43, 0x3f, 0xbd,
+ 0x64, 0x80, 0xda, 0x3c, 0x32, 0x00, 0xde, 0x3c, 0xb2, 0x8a, 0x86, 0x3c, 0x68,
+ 0x45, 0x05, 0x3d, 0x8b, 0x7c, 0xd8, 0x3b, 0x68, 0x97, 0xe7, 0x3c, 0x82, 0x8d,
+ 0x6b, 0x3d, 0xa6, 0x53, 0x2d, 0x3d, 0xc0, 0x43, 0x23, 0x3c, 0xaa, 0xe6, 0x2d,
+ 0xbd, 0x34, 0x06, 0x57, 0xbc, 0xfc, 0x9f, 0x0c, 0xbd, 0x42, 0x77, 0xc6, 0x3c,
+ 0x51, 0x7a, 0x70, 0x3c, 0xe5, 0xe4, 0x7c, 0x3d, 0x86, 0x00, 0x67, 0xbd, 0x95,
+ 0xb8, 0x37, 0xbd, 0xdd, 0x7a, 0x8d, 0x3d, 0x97, 0x08, 0xa9, 0x3c, 0xfd, 0xb6,
+ 0x09, 0x3d, 0xdc, 0xb7, 0x81, 0x3d, 0xe0, 0x6c, 0x68, 0xbc, 0x79, 0x9b, 0x03,
+ 0xbd, 0xb8, 0xc7, 0x78, 0xbb, 0x94, 0x60, 0x0f, 0x3d, 0x3b, 0x0e, 0x80, 0x3d,
+ 0x11, 0xe6, 0x80, 0x3d, 0xb3, 0xab, 0x86, 0x3d, 0xed, 0xe6, 0x9d, 0xbc, 0xd8,
+ 0xeb, 0xd9, 0xbc, 0xaa, 0x62, 0x80, 0x3d, 0x12, 0xc5, 0x00, 0x3d, 0x2b, 0x4b,
+ 0x23, 0xbc, 0xc7, 0x31, 0xff, 0xbc, 0xe4, 0x95, 0xdb, 0x3b, 0xa7, 0x90, 0x66,
+ 0x3c, 0xd3, 0x65, 0xdb, 0xbc, 0x50, 0xe3, 0x47, 0x3d, 0xd4, 0x25, 0x84, 0xbd,
+ 0x5a, 0xd5, 0xae, 0xbc, 0x90, 0x5e, 0xba, 0x3c, 0x8c, 0x60, 0x90, 0xbd, 0xfc,
+ 0x57, 0x4c, 0x3d, 0x99, 0x08, 0x7d, 0xbd, 0x9f, 0xac, 0x3b, 0x3c, 0x1c, 0xb1,
+ 0x61, 0xbc, 0x6a, 0xb5, 0x33, 0xbc, 0x10, 0xb0, 0x28, 0x3c, 0x89, 0x5d, 0x9f,
+ 0x3c, 0xd2, 0x80, 0x84, 0xbc, 0xb4, 0xb1, 0xd5, 0xba, 0x41, 0x1e, 0xa0, 0x3c,
+ 0xd1, 0xd9, 0xd0, 0xbb, 0x04, 0xda, 0xd2, 0x3c, 0x58, 0x46, 0x90, 0xbc, 0xc1,
+ 0x5c, 0x19, 0xbc, 0x01, 0x66, 0x2c, 0xbd, 0xad, 0xdc, 0x88, 0xbd, 0x32, 0xab,
+ 0xb6, 0xbc, 0x14, 0x1f, 0x0b, 0x3d, 0x87, 0xf0, 0x69, 0x3d, 0x55, 0x30, 0x26,
+ 0xbd, 0x2e, 0x3a, 0x05, 0xbd, 0xda, 0x08, 0x0e, 0xbd, 0xef, 0x31, 0x57, 0xbd,
+ 0x0e, 0x44, 0x13, 0xbd, 0x53, 0x11, 0x29, 0xbd, 0x00, 0xd2, 0xea, 0x3a, 0x47,
+ 0x72, 0xae, 0xbc, 0x54, 0x4a, 0x4d, 0xbd, 0x8a, 0x13, 0x2b, 0xbd, 0xa3, 0xaf,
+ 0x92, 0x3d, 0x68, 0x15, 0x0d, 0x3c, 0x18, 0x17, 0x35, 0x3c, 0xb8, 0xf2, 0x6a,
+ 0x3c, 0x15, 0xf8, 0xb2, 0x3c, 0x1d, 0x9d, 0xcd, 0x3c, 0xd3, 0x90, 0x81, 0xbd,
+ 0x51, 0xe8, 0x21, 0x3d, 0x74, 0x43, 0xa9, 0x3c, 0x00, 0x0b, 0xa0, 0x3c, 0x8e,
+ 0x69, 0xfb, 0xba, 0x81, 0x27, 0xfa, 0x3c, 0x6b, 0x7c, 0xf5, 0xbc, 0x61, 0x68,
+ 0x84, 0x3d, 0xe4, 0x1a, 0x6b, 0xbd, 0xd0, 0xe9, 0xc8, 0x3c, 0x26, 0xff, 0x47,
+ 0xbd, 0x64, 0xb7, 0xe9, 0x3b, 0xf3, 0xad, 0x36, 0x3d, 0x8a, 0x00, 0x3f, 0xbd,
+ 0x94, 0x41, 0xcf, 0xbc, 0x01, 0xba, 0x55, 0x3d, 0x8c, 0x08, 0x36, 0xbd, 0xa4,
+ 0x6b, 0x1a, 0x3d, 0x59, 0xfd, 0x83, 0x3d, 0xcc, 0xdd, 0x60, 0xbd, 0x59, 0xc2,
+ 0xfe, 0xbc, 0xa6, 0x99, 0x2a, 0x3d, 0xbd, 0x45, 0x8b, 0x3d, 0xe2, 0x5e, 0x8c,
+ 0x3d, 0x18, 0x83, 0x87, 0xbc, 0x10, 0x63, 0xda, 0x3b, 0x58, 0xa1, 0xc2, 0x3c,
+ 0x78, 0xfa, 0x78, 0x3c, 0xfc, 0x33, 0xf0, 0x3c, 0xc4, 0xab, 0x5b, 0xbd, 0xde,
+ 0x4b, 0x07, 0x3d, 0x53, 0x76, 0x1b, 0xbd, 0xee, 0xd8, 0x86, 0x3d, 0x7f, 0xd6,
+ 0x7c, 0xbd, 0x68, 0xb5, 0x8e, 0x3c, 0x49, 0xdd, 0xd5, 0xbc, 0x83, 0x63, 0xed,
+ 0xbb, 0x4e, 0x00, 0x91, 0xbd, 0x69, 0xce, 0xd5, 0xbb, 0x2f, 0x57, 0x71, 0xbc,
+ 0x9a, 0xc3, 0x8f, 0xbd, 0x65, 0x27, 0x47, 0x3d, 0x2d, 0x6b, 0x77, 0xbd, 0xdd,
+ 0x54, 0x43, 0xbc, 0xf7, 0x1f, 0xe8, 0xbc, 0x12, 0x8f, 0x87, 0xbd, 0x4f, 0xcf,
+ 0x2f, 0x3d, 0x15, 0x51, 0x4b, 0xbd, 0x9d, 0x1f, 0x86, 0x3d, 0x68, 0x35, 0x58,
+ 0xbd, 0x16, 0xe4, 0x4e, 0xbd, 0xd0, 0x03, 0x91, 0xbd, 0x39, 0xc6, 0x90, 0x3c,
+ 0xdd, 0xbb, 0x0a, 0xbd, 0x58, 0x1b, 0x33, 0xbd, 0x55, 0x86, 0x91, 0xbd, 0x48,
+ 0xe7, 0x90, 0xbc, 0xf4, 0x14, 0x3f, 0xbc, 0xc0, 0x75, 0x9e, 0xba, 0x7e, 0x8f,
+ 0xa8, 0xbc, 0x8c, 0x2b, 0x55, 0x3d, 0x54, 0x4b, 0x70, 0xbd, 0x56, 0x74, 0x52,
+ 0x3d, 0x6d, 0xf4, 0x02, 0x3b, 0x7d, 0x46, 0x5c, 0x3b, 0x76, 0xf4, 0x0c, 0xbd,
+ 0xac, 0xa2, 0x1d, 0xbd, 0x5c, 0x63, 0xe2, 0xbc, 0x64, 0x4d, 0x31, 0x3c, 0xf9,
+ 0x3e, 0x3f, 0x3d, 0xed, 0x12, 0x2c, 0xbd, 0xc8, 0x12, 0xb0, 0xbc, 0x4d, 0x90,
+ 0x8f, 0x3d, 0x1d, 0xef, 0x89, 0x3d, 0xf0, 0x4f, 0x93, 0xbd, 0x88, 0x79, 0xd8,
+ 0x3c, 0x74, 0x42, 0x1f, 0xbd, 0xba, 0x43, 0x90, 0x3c, 0xd5, 0x7e, 0xe3, 0xbc,
+ 0x71, 0x49, 0x7b, 0xbd, 0x5d, 0x36, 0x16, 0x3d, 0x91, 0xb8, 0x22, 0xbd, 0xd4,
+ 0x0e, 0x1e, 0x3d, 0xaa, 0x17, 0x2d, 0x3c, 0xca, 0x4d, 0xb9, 0x3b, 0x8a, 0x9d,
+ 0x01, 0x3d, 0x60, 0xcf, 0xc3, 0xbb, 0xc4, 0xc0, 0x00, 0x3b, 0x6d, 0xeb, 0x09,
+ 0xbd, 0x88, 0x55, 0x9e, 0xbc, 0x04, 0x54, 0xc3, 0xbc, 0x00, 0x93, 0xf2, 0x3a,
+ 0xe2, 0x88, 0x6e, 0x3d, 0xa0, 0xdb, 0xd4, 0xbc, 0x12, 0x3b, 0xa4, 0x3b, 0x5d,
+ 0x20, 0x88, 0x3d, 0xb4, 0xe5, 0xdc, 0xbc, 0x93, 0xf0, 0x70, 0xbc, 0xf6, 0x1a,
+ 0x31, 0xbd, 0xe0, 0xc3, 0x75, 0x3c, 0xbc, 0x2b, 0x96, 0x3c, 0x5b, 0x81, 0x44,
+ 0xbd, 0x6e, 0x2f, 0xab, 0xbc, 0x4c, 0x4e, 0x82, 0x3d, 0x6c, 0x17, 0x9b, 0xbc,
+ 0x70, 0x5a, 0x16, 0xbc, 0x70, 0x5e, 0x10, 0x3c, 0x81, 0xf0, 0x7d, 0xbd, 0x55,
+ 0xca, 0x3d, 0x3d, 0xca, 0x75, 0xa2, 0xbc, 0x7f, 0xc2, 0xe2, 0xbb, 0xc4, 0x59,
+ 0x82, 0x3d, 0xbd, 0xde, 0xd0, 0xbc, 0xe6, 0x4c, 0x3a, 0x3d, 0x62, 0xc7, 0x62,
+ 0x3d, 0x3e, 0xd2, 0xc1, 0xba, 0xeb, 0xae, 0xb3, 0xbb, 0x39, 0xf0, 0xa2, 0x3c,
+ 0xd0, 0xa2, 0x18, 0xbd, 0x65, 0xea, 0x99, 0x3b, 0xd0, 0x01, 0x8d, 0xbc, 0x34,
+ 0x0c, 0x84, 0xbd, 0xc3, 0x10, 0x3f, 0xbd, 0xb0, 0x26, 0xc4, 0x3b, 0xde, 0xc4,
+ 0x2e, 0x3d, 0xb4, 0x3f, 0xe5, 0x3c, 0x80, 0x6d, 0xda, 0x3b, 0xd3, 0x01, 0x8f,
+ 0x3d, 0x7b, 0x2e, 0x70, 0x3b, 0x95, 0x55, 0x51, 0xbd, 0xc2, 0x13, 0x4a, 0x3d,
+ 0x70, 0xd8, 0x4a, 0x3d, 0x6d, 0xf3, 0xc7, 0xbb, 0x40, 0x46, 0xe8, 0x3c, 0x71,
+ 0x53, 0x85, 0x3a, 0xea, 0x87, 0xf9, 0x3c, 0xb0, 0xb0, 0xf5, 0x3c, 0xf2, 0x2a,
+ 0x58, 0x3d, 0xe8, 0xd7, 0xc4, 0x3c, 0x57, 0xd9, 0xc8, 0x3c, 0xf3, 0x05, 0x79,
+ 0xbd, 0x9c, 0x0e, 0xf5, 0xbb, 0xcd, 0xaa, 0x1b, 0xbc, 0x42, 0xa2, 0x22, 0x3d,
+ 0x3e, 0x81, 0xe3, 0x3c, 0x66, 0x13, 0x2a, 0xbd, 0x6d, 0xfd, 0x8f, 0x3d, 0xd3,
+ 0x64, 0xab, 0x3c, 0x1e, 0x94, 0xba, 0x3c, 0x68, 0x42, 0x45, 0xbd, 0x4c, 0x0e,
+ 0xaf, 0xbc, 0x90, 0xbf, 0x7e, 0x3d, 0x6f, 0x71, 0x91, 0x3d, 0xc3, 0xb6, 0x80,
+ 0x3d, 0x3a, 0xbd, 0x32, 0xbd, 0x08, 0x63, 0x11, 0xbc, 0xec, 0xf4, 0x08, 0x3d,
+ 0x60, 0x5c, 0xcc, 0x3b, 0x66, 0x5b, 0x59, 0xbd, 0xb9, 0xcb, 0x8d, 0xbd, 0xfd,
+ 0x30, 0x54, 0x3d, 0x2e, 0xaa, 0x0f, 0xbc, 0x80, 0x26, 0x1a, 0xbb, 0x47, 0x43,
+ 0x19, 0xbd, 0x2c, 0x5d, 0xb8, 0x3c, 0x6c, 0xa6, 0xe8, 0x3c, 0xec, 0x3c, 0xcb,
+ 0xbc, 0x61, 0x53, 0xa4, 0x3c, 0x68, 0xf1, 0x0a, 0x3c, 0x9c, 0x5f, 0x30, 0x3d,
+ 0x5b, 0x39, 0xb8, 0xbc, 0xd2, 0x8d, 0x99, 0xbc, 0xe7, 0x1e, 0x31, 0xbd, 0x61,
+ 0x4e, 0x2c, 0xbd, 0x11, 0xeb, 0xb3, 0xbc, 0x80, 0x2e, 0x0b, 0xbc, 0x57, 0xbf,
+ 0x75, 0x3c, 0xbb, 0xd3, 0x2b, 0x3d, 0xba, 0xc5, 0x1b, 0x3d, 0x43, 0x78, 0x80,
+ 0x3d, 0xeb, 0x30, 0x0a, 0x3c, 0xf7, 0xf8, 0x04, 0x3d, 0x1f, 0x88, 0x17, 0xbd,
+ 0x7c, 0x55, 0xf0, 0xbc, 0x4a, 0x93, 0x3c, 0x3d, 0x7a, 0x12, 0x5c, 0xbd, 0x54,
+ 0x6b, 0x42, 0xbd, 0xa0, 0x16, 0xd8, 0x3b, 0x20, 0x3e, 0x3b, 0x3b, 0x3c, 0xde,
+ 0x72, 0xbd, 0x68, 0x37, 0x68, 0xbd, 0x37, 0x55, 0x97, 0xbb, 0x19, 0x7b, 0x43,
+ 0xbd, 0x82, 0xce, 0x8a, 0xbd, 0xcf, 0xc2, 0x88, 0xbd, 0x30, 0xde, 0xd8, 0x3b,
+ 0xf1, 0xc1, 0xa9, 0x3c, 0x68, 0x51, 0x2d, 0x3d, 0x76, 0xd5, 0xac, 0x3c, 0xb8,
+ 0x4b, 0x78, 0xbb, 0x0f, 0x1c, 0x5d, 0xbd, 0xf7, 0x31, 0x25, 0xbd, 0x72, 0x4c,
+ 0x91, 0x3d, 0x6e, 0x4f, 0x51, 0x3d, 0xb4, 0x9b, 0x21, 0xbd, 0x03, 0x73, 0xdd,
+ 0xbc, 0x38, 0x49, 0x4f, 0x3c, 0xb8, 0xc7, 0x4f, 0x3d, 0x6a, 0x17, 0x0a, 0xba,
+ 0xf4, 0x4f, 0xcd, 0x3c, 0x93, 0x14, 0x86, 0xbd, 0xde, 0x1e, 0x31, 0x3c, 0x57,
+ 0x45, 0xf1, 0x3c, 0x53, 0xc3, 0x7c, 0x3d, 0xc8, 0x1a, 0xd8, 0x3c, 0x85, 0xf4,
+ 0x8d, 0x3d, 0xf2, 0xaa, 0x46, 0x3d, 0xa6, 0x5c, 0x73, 0x3d, 0xf8, 0x5a, 0x3c,
+ 0x3d, 0xd0, 0x85, 0xaf, 0x3c, 0x60, 0x1f, 0xa0, 0x3c, 0xef, 0xcb, 0x45, 0xbd,
+ 0x68, 0xc2, 0x24, 0x3d, 0x25, 0x65, 0x14, 0x3b, 0x0c, 0x01, 0x67, 0x3d, 0x43,
+ 0x57, 0x65, 0xbd, 0x50, 0x8f, 0xec, 0x3b, 0x88, 0xf5, 0x16, 0x3d, 0xde, 0xa3,
+ 0xe2, 0xbc, 0x92, 0x11, 0xfb, 0x3c, 0x35, 0x93, 0x26, 0x3d, 0x96, 0xe4, 0x70,
+ 0x3d, 0x30, 0xea, 0x40, 0x3c, 0x50, 0x65, 0x37, 0x3c, 0x56, 0xf8, 0x84, 0xbd,
+ 0x36, 0xc0, 0x8e, 0x3d, 0x58, 0x45, 0x6b, 0xbd, 0x46, 0xcc, 0x5e, 0xbc, 0x41,
+ 0x2a, 0x4f, 0xbd, 0x5f, 0xce, 0x80, 0xbb, 0xfb, 0x75, 0xae, 0xbc, 0x19, 0xe3,
+ 0x0b, 0xbd, 0x54, 0x3e, 0x8a, 0x3c, 0x41, 0x54, 0xb7, 0x39, 0x8f, 0xb4, 0x80,
+ 0x3d, 0xfb, 0x42, 0x00, 0x3d, 0x5e, 0x0b, 0x19, 0xbd, 0x5d, 0x03, 0xb5, 0x3c,
+ 0xd8, 0x30, 0x78, 0x3c, 0x3e, 0xef, 0x90, 0xbc, 0xe0, 0x2c, 0xdb, 0x3b, 0x0a,
+ 0x5a, 0xfc, 0xbc, 0x24, 0x7e, 0x90, 0xbd, 0x1a, 0xd4, 0x1b, 0x3d, 0x10, 0x0a,
+ 0x87, 0x3d, 0xa3, 0x8c, 0x3b, 0xbd, 0x3f, 0x54, 0xda, 0xbc, 0x0f, 0x59, 0xd8,
+ 0x3b, 0xbe, 0xea, 0xea, 0x3c, 0x39, 0x2d, 0x7e, 0xbd, 0x19, 0xa0, 0x73, 0xba,
+ 0x3c, 0xc5, 0x60, 0xbd, 0x57, 0x9e, 0x70, 0xbd, 0xdc, 0x65, 0xfb, 0x3b, 0xbc,
+ 0x13, 0x32, 0xbd, 0xa4, 0xd0, 0x81, 0xbd, 0x5f, 0x74, 0x85, 0x3d, 0x1a, 0xf5,
+ 0x58, 0x3d, 0xa3, 0x35, 0x7c, 0x3d, 0xb3, 0x3d, 0x87, 0x3c, 0x83, 0xc6, 0x6b,
+ 0x3d, 0xff, 0xe3, 0x8e, 0x3d, 0x97, 0xab, 0x01, 0xbd, 0x7c, 0xd4, 0x85, 0x3d,
+ 0xa0, 0xbd, 0x83, 0xbc, 0x04, 0x12, 0x41, 0x3d, 0x9e, 0x3d, 0x57, 0xbd, 0xa2,
+ 0x37, 0xc1, 0x3c, 0xf2, 0xa6, 0x81, 0xbd, 0xe0, 0xde, 0xe6, 0xbc, 0xa0, 0x4b,
+ 0xd4, 0xbb, 0xe8, 0x33, 0xd8, 0xbc, 0x9a, 0x4c, 0x55, 0x3d, 0x16, 0xc0, 0x91,
+ 0xbd, 0x28, 0xa0, 0x1e, 0x3c, 0xfc, 0xc7, 0x5f, 0xbc, 0xc1, 0x5e, 0x95, 0x3c,
+ 0xc4, 0x85, 0xa0, 0x3c, 0xf5, 0x01, 0xd7, 0xbc, 0xf3, 0x15, 0xcc, 0xbb, 0x52,
+ 0x0c, 0x2c, 0xbd, 0xea, 0xdf, 0x7b, 0x3d, 0x06, 0xe0, 0x26, 0xbc, 0x7a, 0x9a,
+ 0x8d, 0xbd, 0x9c, 0xdb, 0xac, 0x3c, 0x4b, 0xfa, 0x2f, 0x3d, 0xe4, 0x93, 0xf1,
+ 0x3c, 0x89, 0xe5, 0x91, 0xbd, 0xda, 0x41, 0x28, 0xbd, 0x52, 0x6f, 0x58, 0x3d,
+ 0x89, 0x2f, 0x43, 0xbd, 0x74, 0xe4, 0x00, 0xbd, 0x59, 0xd4, 0x26, 0xbd, 0x97,
+ 0x79, 0xa9, 0x3c, 0xb0, 0x62, 0x9f, 0xb9, 0xbc, 0xac, 0x04, 0x3d, 0x5c, 0xce,
+ 0x3d, 0xbd, 0x15, 0x58, 0x67, 0xbd, 0x0a, 0xce, 0xf4, 0xbc, 0x3a, 0x8f, 0x01,
+ 0xbd, 0x50, 0xd2, 0x73, 0xbc, 0x8e, 0x54, 0x16, 0xbc, 0xea, 0xd7, 0x3c, 0x3d,
+ 0xf0, 0xbe, 0xd7, 0x3c, 0x1a, 0x3d, 0x82, 0xbd, 0xba, 0x91, 0x2f, 0x3d, 0x10,
+ 0xb0, 0x92, 0xbd, 0xf8, 0x36, 0x1c, 0x3d, 0x50, 0x2a, 0x8f, 0xbd, 0xb0, 0x09,
+ 0x5e, 0x3d, 0x3b, 0xc8, 0x8f, 0xba, 0xf4, 0xce, 0x92, 0xbd, 0x38, 0xc4, 0x78,
+ 0xbd, 0xe0, 0x8c, 0x5c, 0xbc, 0x98, 0x6b, 0x8b, 0x3d, 0x16, 0x7f, 0x4a, 0x3d,
+ 0x18, 0xc0, 0xfe, 0xbc, 0x66, 0xbb, 0x4b, 0xbd, 0x90, 0xb6, 0xe1, 0x3b, 0x98,
+ 0xca, 0x8c, 0x3c, 0x05, 0xfe, 0xec, 0xbc, 0x58, 0x1c, 0x17, 0x3d, 0x37, 0x17,
+ 0x80, 0x3d, 0x41, 0x6e, 0x14, 0x3d, 0xee, 0x95, 0xcb, 0xbb, 0x1a, 0x56, 0x1f,
+ 0xbd, 0xae, 0xc7, 0x2c, 0x3c, 0x28, 0x3a, 0x80, 0x3b, 0x00, 0x13, 0x76, 0xbc,
+ 0x69, 0xaf, 0x5e, 0xbc, 0x80, 0xcc, 0x02, 0xbd, 0xa8, 0xea, 0x04, 0xba, 0xb8,
+ 0xae, 0x09, 0x3d, 0xb3, 0x0d, 0x8d, 0x3d, 0xc0, 0x22, 0x84, 0xba, 0x04, 0x62,
+ 0x5c, 0xbd, 0xd8, 0x28, 0x09, 0x3c, 0x68, 0xd3, 0x41, 0x3c, 0x62, 0x52, 0x1e,
+ 0x3d, 0x99, 0x42, 0x03, 0xbd, 0x3b, 0x4b, 0xd9, 0xba, 0x68, 0x5e, 0x32, 0xbd,
+ 0x8b, 0x9e, 0x26, 0xbb, 0x9c, 0xd7, 0xcd, 0x3c, 0x4e, 0xdc, 0x16, 0x3d, 0x42,
+ 0x1a, 0x07, 0x3d, 0xbb, 0xa6, 0x96, 0xbb, 0xf4, 0x47, 0x59, 0xbc, 0x13, 0xa3,
+ 0xa1, 0xbc, 0x8f, 0x58, 0x0f, 0xbc, 0x88, 0xd1, 0x1d, 0xbd, 0xe0, 0x0f, 0xfb,
+ 0x3c, 0x81, 0xd3, 0x90, 0x3d, 0xe0, 0x4b, 0x4f, 0xbd, 0x3f, 0x4a, 0x80, 0x3d,
+ 0x3a, 0x63, 0x67, 0x3d, 0xe2, 0xee, 0x1e, 0x3c, 0xf8, 0x65, 0xdd, 0x3b, 0x1c,
+ 0x30, 0x09, 0xbd, 0xe9, 0x2f, 0xdb, 0xbc, 0x94, 0x36, 0x55, 0xbd, 0x2c, 0xa4,
+ 0x95, 0x3a, 0x78, 0x24, 0x2f, 0x3d, 0xc7, 0x9c, 0x44, 0xbd, 0xb5, 0x09, 0x10,
+ 0xbd, 0x7d, 0x10, 0x49, 0xbd, 0x60, 0xd3, 0x43, 0x3c, 0xef, 0x67, 0x05, 0xbd,
+ 0x0a, 0x1d, 0x6c, 0x3d, 0xaa, 0x4d, 0x0c, 0x3d, 0x84, 0xfc, 0x8a, 0xbc, 0x0d,
+ 0xf7, 0x65, 0xbd, 0x5c, 0x71, 0x93, 0xbc, 0xd8, 0xe9, 0x2a, 0x3d, 0x1d, 0xd9,
+ 0xc6, 0xbc, 0xd6, 0xeb, 0x70, 0xbd, 0xef, 0x92, 0x41, 0xbd, 0x4a, 0xd3, 0x83,
+ 0xbd, 0x1e, 0xf1, 0x74, 0x3b, 0xa3, 0xb4, 0x1e, 0xbc, 0x4f, 0x0c, 0x12, 0x3d,
+ 0x69, 0xf6, 0x25, 0x3d, 0x5a, 0x52, 0x35, 0x3d, 0xb5, 0x14, 0x37, 0x3d, 0x2b,
+ 0xf9, 0x2d, 0xbd, 0xb8, 0xc6, 0x12, 0x3d, 0x2e, 0xeb, 0xf8, 0xbb, 0x31, 0xe0,
+ 0x43, 0xbd, 0x37, 0x68, 0xf4, 0x3b, 0x4e, 0xd7, 0x55, 0xbd, 0xf2, 0x8f, 0x06,
+ 0x3d, 0xa3, 0xe0, 0x8a, 0x3d, 0x47, 0xcb, 0x91, 0x3d, 0xc3, 0xaa, 0x1c, 0xbd,
+ 0x43, 0x44, 0x24, 0x3d, 0x5a, 0xcc, 0x30, 0xbd, 0x72, 0xbe, 0x27, 0x3c, 0xfc,
+ 0xd5, 0xbe, 0x3c, 0x34, 0x0e, 0x3f, 0x3d, 0xdc, 0x3d, 0x7b, 0xbc, 0x64, 0xe1,
+ 0xa9, 0x3c, 0x00, 0x61, 0x80, 0x3b, 0x19, 0xd4, 0x82, 0xbd, 0x41, 0xef, 0x8c,
+ 0x3d, 0x90, 0x50, 0x11, 0xbd, 0x0d, 0x32, 0x8d, 0x3d, 0x56, 0x78, 0x5f, 0x3c,
+ 0x71, 0x44, 0x6c, 0x3d, 0x21, 0xe4, 0x22, 0x3d, 0x31, 0xfd, 0xb4, 0xbb, 0xcc,
+ 0x10, 0x7e, 0x3c, 0x7a, 0xb4, 0x06, 0x3d, 0xc5, 0xde, 0x22, 0xbc, 0xd2, 0x57,
+ 0xfe, 0x3c, 0x30, 0x95, 0x81, 0xbd, 0x00, 0x6d, 0xde, 0x39, 0xfd, 0x2b, 0x3f,
+ 0x3d, 0x8f, 0xe7, 0xf4, 0x3b, 0x2b, 0xf8, 0xa3, 0xbc, 0xcf, 0x7c, 0x4e, 0x3d,
+ 0x86, 0xee, 0xf7, 0x3c, 0x20, 0x5a, 0x22, 0xbb, 0x1a, 0xa9, 0x62, 0xbd, 0x0f,
+ 0x24, 0x7f, 0x3d, 0x74, 0x7e, 0x00, 0x3d, 0x24, 0xd2, 0xcb, 0xbc, 0x06, 0xc6,
+ 0x44, 0xbd, 0xe1, 0x53, 0xa3, 0x3c, 0x7d, 0x24, 0x08, 0x3d, 0xf6, 0x9f, 0x23,
+ 0xbd, 0x3f, 0xb0, 0x84, 0xbd, 0xb0, 0xbb, 0xbc, 0x3c, 0x74, 0x6c, 0x22, 0xbc,
+ 0x0b, 0x32, 0x50, 0xbd, 0x81, 0x6f, 0x8b, 0x3d, 0x98, 0x37, 0xc3, 0x3c, 0xfd,
+ 0x30, 0x08, 0xbd, 0x11, 0x42, 0x01, 0xbd, 0xd6, 0x91, 0x16, 0x3c, 0x6e, 0xf1,
+ 0xc2, 0x3a, 0xed, 0x4b, 0x8c, 0xbd, 0x51, 0x70, 0x34, 0xbd, 0x2a, 0x7e, 0x1c,
+ 0x3b, 0x5a, 0x96, 0xcd, 0x37, 0x9a, 0x8e, 0xf8, 0x3c, 0xce, 0x8a, 0x6d, 0x3d,
+ 0x62, 0xb2, 0x38, 0x3d, 0x70, 0x0a, 0xbe, 0xbc, 0xd0, 0x3f, 0x66, 0xbc, 0xf4,
+ 0xfe, 0x24, 0x3d, 0xbe, 0xf9, 0x89, 0x3c, 0xa0, 0x2b, 0xc1, 0xbc, 0x02, 0x6d,
+ 0x41, 0x3c, 0xa4, 0x00, 0x14, 0xbd, 0xbc, 0xa1, 0xd1, 0x3b, 0xbc, 0x27, 0xa6,
+ 0x3c, 0xc8, 0x08, 0xfd, 0xbc, 0xa1, 0x0e, 0x9c, 0xbc, 0xa1, 0x28, 0x07, 0xbc,
+ 0x33, 0xf3, 0x71, 0x3c, 0x96, 0xed, 0x1f, 0x3d, 0xf6, 0x6d, 0x5e, 0xbd, 0x30,
+ 0x7c, 0x12, 0xbc, 0xf2, 0xaf, 0x7b, 0x3d, 0x56, 0xfa, 0x36, 0xbd, 0x7a, 0x6f,
+ 0x3a, 0x3d, 0x40, 0x65, 0x8f, 0x3c, 0x2c, 0xa1, 0x4f, 0xbc, 0x80, 0x0f, 0x7b,
+ 0x3b, 0xaf, 0xc3, 0xf2, 0x3c, 0xae, 0x39, 0x8a, 0xbd, 0xd5, 0xf6, 0x42, 0xbd,
+ 0x12, 0x9c, 0x33, 0x3d, 0x88, 0x27, 0x4d, 0x3d, 0x61, 0x05, 0x1e, 0xbd, 0x02,
+ 0xcd, 0x04, 0xbd, 0xe8, 0x6f, 0xe1, 0x3c, 0xf8, 0xd2, 0x73, 0x3d, 0xb9, 0xa3,
+ 0x61, 0xbd, 0x64, 0x01, 0x92, 0x3c, 0x4f, 0x8e, 0x21, 0xbc, 0x8b, 0xf5, 0x18,
+ 0x3d, 0xce, 0x3b, 0x77, 0x3d, 0x8d, 0x0e, 0x97, 0x3a, 0x30, 0xfc, 0x85, 0x3c,
+ 0x1f, 0x24, 0x8e, 0x3a, 0xca, 0xdd, 0x4e, 0x3d, 0x5f, 0x7c, 0xfe, 0x3b, 0x84,
+ 0xdf, 0x2d, 0x3d, 0x7a, 0x5c, 0x8c, 0x3d, 0x90, 0xf3, 0x79, 0xbc, 0x4f, 0x99,
+ 0x17, 0xbd, 0x30, 0xb1, 0xd2, 0xbb, 0x1c, 0x5a, 0x32, 0xbd, 0xd4, 0x8c, 0xd9,
+ 0x3c, 0x08, 0x56, 0xec, 0x3c, 0xf0, 0xcf, 0x64, 0xbd, 0xf0, 0x2a, 0xf1, 0xbb,
+ 0x28, 0x09, 0x0c, 0xbc, 0x0f, 0xf7, 0x8d, 0xbd, 0x86, 0x8f, 0x59, 0xbd, 0xfa,
+ 0xbf, 0x52, 0xbd, 0x76, 0x65, 0x4c, 0xbd, 0x79, 0xaa, 0x16, 0xbd, 0x9e, 0x6f,
+ 0xa7, 0xbc, 0xac, 0x9e, 0x8f, 0xbd, 0x5a, 0xfc, 0x7b, 0xbd, 0x90, 0xe3, 0x20,
+ 0x3d, 0xd0, 0x2b, 0x81, 0x3d, 0xc1, 0xbf, 0x85, 0x3d, 0x48, 0x79, 0x44, 0x3d,
+ 0x3e, 0x7b, 0x6d, 0x3d, 0x2b, 0x83, 0x11, 0x3d, 0x45, 0x84, 0x38, 0x3d, 0xbd,
+ 0x6d, 0x47, 0xb8, 0xe9, 0x7c, 0x29, 0xbd, 0x51, 0xd2, 0xc9, 0x3c, 0x77, 0x53,
+ 0xf0, 0x3b, 0xca, 0xc2, 0x17, 0xbd, 0xb2, 0xbc, 0x13, 0x3d, 0xbc, 0x58, 0xf9,
+ 0x3c, 0xed, 0x65, 0xed, 0x3c, 0x05, 0xdd, 0x8e, 0xbc, 0x0f, 0xa5, 0x96, 0xbc,
+ 0xd2, 0x96, 0x00, 0x3d, 0x90, 0xfe, 0x5c, 0x3d, 0x1f, 0x18, 0x90, 0xbd, 0x68,
+ 0xbb, 0xc8, 0x3c, 0x86, 0xae, 0xbb, 0xbc, 0x8a, 0x69, 0xea, 0xbc, 0x28, 0x6a,
+ 0x7c, 0x3c, 0x32, 0x5f, 0x70, 0x3d, 0xdd, 0x12, 0xd4, 0xba, 0xca, 0x54, 0x56,
+ 0xbd, 0x46, 0x94, 0x3f, 0xbd, 0x28, 0x3e, 0xa6, 0x3c, 0x93, 0x06, 0x43, 0xbd,
+ 0x58, 0xc7, 0xf0, 0x3c, 0x5d, 0x14, 0xa9, 0xbb, 0x58, 0x98, 0xc8, 0xbc, 0x89,
+ 0x34, 0x8d, 0x3d, 0x39, 0x90, 0x7b, 0x3d, 0x66, 0x18, 0x63, 0x3d, 0x60, 0x47,
+ 0x4d, 0x3b, 0x1d, 0x50, 0x6c, 0xbd, 0x55, 0x74, 0x27, 0x3d, 0x11, 0xf1, 0x66,
+ 0xbd, 0x14, 0xe6, 0x90, 0x3d, 0xdf, 0x99, 0x88, 0x3d, 0x9b, 0xc6, 0x67, 0x3d,
+ 0x16, 0xca, 0xd3, 0xbc, 0x79, 0xad, 0x87, 0x3d, 0x52, 0x56, 0x7b, 0x3d, 0x6e,
+ 0x19, 0x14, 0xbc, 0x12, 0x02, 0x26, 0x3d, 0xaf, 0x26, 0x1b, 0xbd, 0x5e, 0x09,
+ 0x8c, 0xbd, 0xa2, 0x3c, 0x5f, 0x3d, 0x60, 0x7e, 0x7d, 0xbd, 0x10, 0xc0, 0x85,
+ 0xbd, 0x70, 0x15, 0xc4, 0x3b, 0xe0, 0xfa, 0xf8, 0x3b, 0xe6, 0x2e, 0x00, 0x3d,
+ 0xf7, 0xd5, 0x1f, 0x3d, 0x48, 0x70, 0x60, 0x3d, 0x2a, 0x3a, 0xed, 0xbc, 0xfd,
+ 0x05, 0x26, 0xbc, 0x67, 0xf0, 0xee, 0x3a, 0x7e, 0x6e, 0x46, 0x3d, 0x57, 0x87,
+ 0x90, 0x3d, 0x22, 0xdb, 0x65, 0xbd, 0x70, 0xad, 0x7a, 0x3c, 0xa6, 0xb5, 0xc3,
+ 0x3c, 0xd4, 0xfa, 0x12, 0x3c, 0x4e, 0x84, 0x2f, 0xbd, 0x00, 0x37, 0x63, 0xbb,
+ 0xfb, 0x25, 0x41, 0xbc, 0x38, 0xa5, 0x84, 0x3d, 0x8a, 0xd7, 0x5a, 0xbd, 0x11,
+ 0xf7, 0xd6, 0xbb, 0xd1, 0x99, 0x22, 0xbd, 0xc8, 0xfc, 0x83, 0x3c, 0xd8, 0x91,
+ 0xd8, 0xbc, 0xa6, 0xf0, 0x3f, 0xbd, 0x08, 0x4d, 0x3b, 0x3d, 0xdd, 0x56, 0x4c,
+ 0xbd, 0xeb, 0x23, 0x8d, 0xbd, 0x23, 0x09, 0xcc, 0x3c, 0xbb, 0x3d, 0x8a, 0x3d,
+ 0x47, 0xb9, 0x75, 0xbd, 0x69, 0x75, 0x82, 0x3d, 0x30, 0x78, 0x86, 0x3c, 0x0c,
+ 0xc2, 0xd6, 0xbc, 0x2a, 0x22, 0x51, 0x3d, 0x9c, 0xfa, 0x3b, 0xbc, 0x00, 0x4b,
+ 0xbf, 0x39, 0x10, 0x58, 0xe6, 0xbb, 0x22, 0xa4, 0x47, 0x3d, 0x8b, 0xd1, 0x6f,
+ 0x3c, 0xf3, 0x8b, 0x23, 0xbd, 0xad, 0x67, 0x71, 0xbd, 0xa4, 0xbb, 0x71, 0xbc,
+ 0x68, 0x9d, 0x36, 0x3d, 0x79, 0xda, 0x00, 0x3d, 0x30, 0x88, 0x15, 0x3d, 0xc4,
+ 0x55, 0xab, 0x3c, 0xd0, 0xbe, 0x4f, 0x3d, 0x43, 0xa2, 0x8b, 0x3d, 0xc0, 0x0b,
+ 0x27, 0xbc, 0xfe, 0x35, 0x91, 0xbd, 0x27, 0x33, 0x5b, 0xbc, 0xc5, 0x00, 0x91,
+ 0xb9, 0x3e, 0x30, 0x74, 0xbd, 0x1c, 0x92, 0x70, 0xbd, 0xfe, 0x13, 0x56, 0xbb,
+ 0x63, 0x1b, 0x84, 0x3d, 0x24, 0x9a, 0xa1, 0x3c, 0x93, 0x78, 0x83, 0xbc, 0x29,
+ 0xb2, 0xce, 0x3c, 0x05, 0x6f, 0x8f, 0x3d, 0xe8, 0xb4, 0x3b, 0xbd, 0x12, 0x90,
+ 0x8e, 0x3d, 0x58, 0x6a, 0x76, 0xbd, 0xee, 0x8f, 0x90, 0xbd, 0x1e, 0x98, 0xde,
+ 0xbc, 0x88, 0x22, 0x40, 0x3d, 0x1b, 0x7f, 0x87, 0xbd, 0x3e, 0x25, 0x5e, 0x3d,
+ 0x38, 0xf3, 0x0c, 0xbc, 0x77, 0x6a, 0x8b, 0xbd, 0x0c, 0x98, 0x08, 0xbc, 0xbd,
+ 0x52, 0xf6, 0x3c, 0x2d, 0x2f, 0x03, 0xbd, 0x15, 0xbf, 0x91, 0x3d, 0xba, 0x41,
+ 0xef, 0xbc, 0xdf, 0x02, 0xab, 0xbc, 0xe4, 0xac, 0x7e, 0x3d, 0x9e, 0x8c, 0x51,
+ 0x3d, 0xcc, 0x12, 0x01, 0x3d, 0xfc, 0xfb, 0x1b, 0xbd, 0x75, 0x2b, 0x81, 0xbd,
+ 0x6a, 0xbf, 0x20, 0x3d, 0xbb, 0x3c, 0x77, 0xbd, 0xae, 0x2f, 0x74, 0xbd, 0x58,
+ 0x94, 0x53, 0xbd, 0xa0, 0xcf, 0xd4, 0x3c, 0x68, 0x51, 0xd1, 0x3c, 0x1c, 0x40,
+ 0x22, 0xbd, 0x86, 0x62, 0x04, 0x3d, 0x9c, 0x10, 0x02, 0xbd, 0x5d, 0x31, 0x49,
+ 0xbb, 0x5d, 0x8e, 0xf5, 0xbc, 0xb8, 0xef, 0x44, 0xbc, 0x06, 0xe5, 0x50, 0xbd,
+ 0xe6, 0x33, 0x40, 0xbd, 0x20, 0x2e, 0x39, 0x3b, 0x00, 0x2f, 0x96, 0xbb, 0x75,
+ 0x2e, 0x80, 0xbd, 0x2c, 0x9f, 0x4e, 0x3d, 0xd0, 0x40, 0xf6, 0x3b, 0x2e, 0x56,
+ 0x8e, 0x3d, 0xcf, 0x00, 0x15, 0x3d, 0xae, 0x5d, 0xc7, 0x3b, 0x44, 0x47, 0x05,
+ 0x3d, 0x80, 0x19, 0x71, 0xbb, 0x8c, 0xce, 0x87, 0xbd, 0xd2, 0x30, 0x78, 0xbd,
+ 0xcc, 0x7b, 0x14, 0xbd, 0xf4, 0xb8, 0x91, 0xbd, 0xbe, 0x76, 0x64, 0x3d, 0xf9,
+ 0x7e, 0x80, 0x3d, 0xda, 0xf8, 0x13, 0xbd, 0x92, 0xd0, 0x11, 0xbd, 0x03, 0x64,
+ 0x55, 0xbc, 0x50, 0x1a, 0xe8, 0xbc, 0x97, 0xeb, 0x5e, 0xbd, 0x7c, 0xf8, 0x90,
+ 0x3d, 0xc4, 0x26, 0x4b, 0x3d, 0xc2, 0x04, 0x7d, 0xbd, 0x25, 0x41, 0x14, 0x3b,
+ 0xac, 0xc2, 0xdf, 0x3c, 0xda, 0x60, 0xd3, 0xbc, 0x1b, 0x00, 0x45, 0xbd, 0x7e,
+ 0x09, 0xac, 0xbc, 0x28, 0x65, 0xcb, 0xbc, 0xe6, 0xd0, 0xb2, 0xbc, 0xb8, 0xdf,
+ 0xae, 0x3c, 0xc8, 0xb7, 0xca, 0x3c, 0x98, 0x50, 0xa1, 0x3c, 0x5c, 0xa2, 0xa0,
+ 0xbc, 0x8c, 0x18, 0x56, 0x3d, 0xea, 0x98, 0x8e, 0xbd, 0xb5, 0xba, 0x49, 0x3b,
+ 0xff, 0x2b, 0xaf, 0x3c, 0x91, 0xf6, 0x49, 0xbd, 0x0a, 0x19, 0x4d, 0x3d, 0xa1,
+ 0x7e, 0x69, 0xbd, 0x6c, 0x77, 0x3e, 0xbc, 0xa0, 0x00, 0x6e, 0x3d, 0x81, 0xc6,
+ 0xb1, 0x3b, 0x8b, 0xbf, 0x40, 0xbd, 0x5e, 0x71, 0xf5, 0xbc, 0x74, 0x2c, 0x96,
+ 0xbc, 0x3d, 0x0c, 0x8b, 0xbd, 0x45, 0x9a, 0x8a, 0xbd, 0xdb, 0x49, 0xcb, 0x3c,
+ 0x9b, 0x5b, 0x10, 0x3d, 0xf5, 0x79, 0x45, 0x3d, 0x5a, 0x50, 0x86, 0xbd, 0xf9,
+ 0x2f, 0x7c, 0xbd, 0xf6, 0x3d, 0x19, 0xbd, 0x54, 0x10, 0x0c, 0x3b, 0xaf, 0x59,
+ 0x27, 0xbd, 0x1f, 0x75, 0x78, 0x3d, 0x10, 0xb2, 0x9a, 0xbc, 0xc3, 0xb1, 0x99,
+ 0xbc, 0xb4, 0x08, 0xac, 0x3c, 0x15, 0x41, 0x86, 0x3d, 0xc0, 0x2d, 0x46, 0xbb,
+ 0xc4, 0x49, 0x56, 0xbc, 0xef, 0x2e, 0x7b, 0xbd, 0x6c, 0xee, 0x14, 0x3d, 0x70,
+ 0xe7, 0x9c, 0x3c, 0x78, 0x7e, 0xfb, 0xbc, 0xf7, 0x06, 0x51, 0xbd, 0x52, 0xd4,
+ 0x1a, 0xbd, 0xb0, 0x2b, 0xeb, 0xbc, 0xad, 0xad, 0x4e, 0xbd, 0xa4, 0x7c, 0xe3,
+ 0x3c, 0x18, 0xa1, 0xd8, 0xbc, 0x6e, 0xa6, 0x8f, 0xbd, 0x79, 0x0d, 0xb7, 0xba,
+ 0xb2, 0x10, 0x10, 0x3d, 0xe6, 0xcf, 0x52, 0x3d, 0x8e, 0x88, 0x35, 0x3d, 0xdd,
+ 0x92, 0x8d, 0x3d, 0x54, 0x69, 0x83, 0xbc, 0xab, 0xa9, 0x88, 0xbd, 0xe0, 0xa7,
+ 0x1c, 0xbb, 0x86, 0x10, 0x2c, 0xbd, 0x24, 0xde, 0x18, 0x3d, 0x4a, 0x04, 0x87,
+ 0xbd, 0x42, 0x3c, 0x16, 0xbd, 0x62, 0x25, 0x90, 0xbd, 0xce, 0x01, 0x64, 0xbd,
+ 0x2c, 0x76, 0x6f, 0xbd, 0xd2, 0x15, 0x0b, 0xbd, 0x45, 0x72, 0x73, 0x3b, 0xeb,
+ 0x46, 0x02, 0xbd, 0x05, 0x12, 0x1c, 0xbd, 0xb8, 0x16, 0x22, 0xbd, 0xe5, 0x22,
+ 0x89, 0x3d, 0x8c, 0x8a, 0xf4, 0x3c, 0x40, 0x6b, 0xe4, 0x3a, 0x5c, 0xe2, 0x70,
+ 0xbd, 0x56, 0x08, 0x67, 0xbd, 0x5b, 0xec, 0x4d, 0x3d, 0xba, 0x4d, 0x2a, 0xbd,
+ 0xb9, 0x55, 0xa4, 0xbc, 0xb7, 0xd7, 0x39, 0x3d, 0xa0, 0x88, 0xfe, 0x3c, 0xbf,
+ 0x7d, 0x6b, 0xbd, 0xcd, 0xdf, 0xe3, 0xbc, 0x26, 0xa0, 0x3e, 0x3d, 0x19, 0x4b,
+ 0x17, 0x3d, 0x54, 0x84, 0xa7, 0xbc, 0x78, 0x9a, 0x6a, 0xbd, 0x80, 0xcc, 0xa7,
+ 0x3c, 0x58, 0x48, 0x3a, 0x3d, 0xd9, 0x9a, 0xe3, 0xbc, 0xe0, 0xa2, 0xb8, 0x3c,
+ 0x3f, 0x32, 0x4d, 0x3d, 0x8e, 0xa6, 0x80, 0xbc, 0x0f, 0xfc, 0xd6, 0xbb, 0x40,
+ 0x70, 0x8b, 0xbd, 0xe3, 0xa3, 0xf6, 0xbb, 0x40, 0x26, 0x33, 0xbb, 0x43, 0xb2,
+ 0x01, 0xbd, 0x2e, 0xf9, 0x27, 0xbd, 0x6c, 0xcf, 0x54, 0x3c, 0xae, 0xca, 0x4d,
+ 0x3c, 0x6e, 0x2d, 0x1d, 0x3a, 0x04, 0xda, 0x94, 0xbc, 0x2c, 0x2b, 0xc6, 0x3c,
+ 0x59, 0xc8, 0x1a, 0xbd, 0x80, 0x56, 0xcb, 0x3b, 0xf4, 0xce, 0xa1, 0x3c, 0x84,
+ 0xdd, 0xeb, 0x3c, 0x95, 0x36, 0x83, 0xbd, 0x60, 0xeb, 0x47, 0x3d, 0x90, 0xf8,
+ 0x63, 0x3d, 0x8a, 0xc4, 0x6a, 0xbc, 0x40, 0x25, 0xa9, 0x3b, 0x7a, 0xfc, 0x65,
+ 0x3d, 0xe2, 0xcd, 0x33, 0x3d, 0x69, 0x80, 0xe5, 0xbc, 0xf7, 0xc5, 0x42, 0xbc,
+ 0x17, 0xf4, 0x31, 0xbd, 0xbe, 0xb3, 0x79, 0x3d, 0xff, 0xfc, 0x6c, 0x3d, 0xc5,
+ 0x04, 0x7d, 0xbc, 0xd9, 0x4f, 0x8e, 0x3d, 0xfe, 0xd3, 0x86, 0xbd, 0xcd, 0xeb,
+ 0x3f, 0x3d, 0xd8, 0x90, 0x2e, 0xbd, 0x56, 0x17, 0xbf, 0x3c, 0xbb, 0x23, 0x83,
+ 0xbd, 0x69, 0x4a, 0x43, 0x3d, 0x0a, 0x76, 0x5e, 0xbd, 0xee, 0x69, 0x8d, 0x3d,
+ 0x75, 0xda, 0x1c, 0x3c, 0xe8, 0xf7, 0xe0, 0xbc, 0x53, 0xbe, 0xda, 0xb8, 0xc2,
+ 0x03, 0x2e, 0xbd, 0xe4, 0xa0, 0x38, 0xbc, 0xbc, 0x5e, 0x3b, 0xbd, 0xfc, 0xfc,
+ 0xb7, 0x3c, 0xd4, 0xfb, 0x13, 0xbd, 0xf6, 0x8c, 0x44, 0x3d, 0x70, 0x13, 0x9d,
+ 0x3c, 0xf8, 0xb8, 0x11, 0xbc, 0xcc, 0x9b, 0x3b, 0xbd, 0xf7, 0x18, 0xe4, 0xbc,
+ 0x89, 0xc3, 0x31, 0x3d, 0xde, 0x7c, 0x32, 0xbd, 0x3c, 0xc7, 0x97, 0x3c, 0x2e,
+ 0xc0, 0xb8, 0xbc, 0xa2, 0xfe, 0x29, 0xbd, 0x17, 0xb2, 0x35, 0xbd, 0xaa, 0x83,
+ 0xdd, 0x3c, 0x1e, 0xfa, 0x83, 0x3d, 0xc6, 0x4c, 0x16, 0x3d, 0xfd, 0x0f, 0x29,
+ 0x3d, 0x2d, 0x90, 0xac, 0x3b, 0xfe, 0xe5, 0xc8, 0x3b, 0xac, 0x11, 0xc7, 0xbc,
+ 0x2d, 0xf3, 0xfa, 0x3c, 0x2a, 0x75, 0x81, 0xbd, 0x2d, 0x84, 0xb4, 0x3c, 0xfd,
+ 0xad, 0x66, 0xbc, 0xaa, 0x80, 0x2a, 0xbd, 0x58, 0x82, 0x8c, 0x3d, 0x75, 0x06,
+ 0x78, 0x3d, 0x1b, 0xdd, 0x21, 0xbc, 0x1c, 0x40, 0x38, 0x3d, 0xe0, 0xdc, 0x6e,
+ 0x3d, 0x50, 0xb8, 0x32, 0xbc, 0x80, 0x13, 0x4f, 0xbb, 0x32, 0x50, 0x6c, 0x3d,
+ 0xce, 0x1b, 0xf1, 0xbc, 0xd8, 0x20, 0x02, 0x3d, 0x43, 0x68, 0xa2, 0x3c, 0x9a,
+ 0x6c, 0x29, 0xbd, 0x8d, 0x90, 0x22, 0xbd, 0x14, 0xff, 0xe6, 0xbb, 0xb8, 0xcf,
+ 0xc1, 0x3c, 0xa6, 0x3b, 0x4a, 0x3d, 0xac, 0xad, 0x11, 0x3d, 0x60, 0x19, 0xc9,
+ 0x3c, 0x55, 0xae, 0xf1, 0xbc, 0x3d, 0xc0, 0x23, 0xbd, 0xa3, 0x00, 0xcd, 0xbb,
+ 0x44, 0x9e, 0x17, 0x3d, 0xc0, 0x31, 0xe2, 0x3a, 0x30, 0xdf, 0xf4, 0x3c, 0x31,
+ 0x09, 0x92, 0xbc, 0xa8, 0xbd, 0x66, 0x3c, 0xa5, 0x06, 0x4f, 0x3c, 0xdc, 0x2e,
+ 0x92, 0xbd, 0xfb, 0x54, 0x87, 0xb9, 0x9b, 0x34, 0x1f, 0x3d, 0xd8, 0xf7, 0xa7,
+ 0xbb, 0xff, 0x1d, 0x62, 0xbd, 0xe0, 0xf8, 0x3c, 0x3d, 0x85, 0x58, 0x8f, 0xbd,
+ 0x75, 0xf9, 0x62, 0xbd, 0xef, 0xf5, 0x7a, 0xbd, 0x58, 0x32, 0x86, 0x3d, 0x90,
+ 0x17, 0x29, 0x3c, 0x64, 0xcc, 0x4a, 0xbd, 0xf0, 0x07, 0xc1, 0xbc, 0x72, 0xdc,
+ 0x64, 0xbd, 0x68, 0x3e, 0x2e, 0x3c, 0x38, 0x6d, 0x60, 0xbd, 0x46, 0x1f, 0x59,
+ 0x3d, 0xd0, 0xa7, 0x3e, 0x3d, 0x77, 0x1d, 0x49, 0x3d, 0xcb, 0xed, 0x7f, 0xbd,
+ 0xd8, 0x47, 0x40, 0x3c, 0x00, 0xf0, 0xee, 0x39, 0xcc, 0xea, 0x57, 0x3d, 0x10,
+ 0x1d, 0x8a, 0xbd, 0xb9, 0x55, 0x5f, 0xbd, 0x17, 0x3c, 0x66, 0xbc, 0x02, 0xb8,
+ 0x06, 0xbd, 0x5f, 0xfb, 0x16, 0xbd, 0x58, 0x15, 0x8c, 0x3d, 0x18, 0x99, 0x5f,
+ 0x3d, 0x5f, 0x73, 0xb3, 0xbc, 0x61, 0x73, 0x63, 0x3d, 0x61, 0xf2, 0x7b, 0xbc,
+ 0xbd, 0x2b, 0xad, 0x3a, 0xda, 0x99, 0x5c, 0xbd, 0x81, 0xd1, 0xd0, 0x3c, 0xf0,
+ 0xf9, 0xb0, 0x3c, 0x84, 0x54, 0x68, 0x3c, 0x24, 0x10, 0x84, 0x3d, 0x4d, 0xec,
+ 0xa2, 0x3b, 0xd3, 0xab, 0x1e, 0xbd, 0xbd, 0x4d, 0x84, 0x3d, 0xd0, 0xd9, 0xb6,
+ 0x3c, 0x84, 0xdc, 0x71, 0xbd, 0x84, 0x4a, 0x03, 0x3d, 0x54, 0xb8, 0xc6, 0x3c,
+ 0x0a, 0x84, 0x0e, 0x3d, 0xdc, 0xfe, 0x64, 0xbd, 0xa6, 0xc2, 0x19, 0x3d, 0xd1,
+ 0x79, 0x4c, 0x3c, 0x7c, 0x16, 0xbd, 0x3c, 0xc1, 0x7d, 0x3c, 0xbc, 0xb2, 0xe7,
+ 0x94, 0xbc, 0xf0, 0x46, 0x69, 0xbc, 0x2d, 0x5f, 0x68, 0x3c, 0xbc, 0x78, 0x44,
+ 0xbd, 0xcf, 0x27, 0x97, 0xbd, 0x03, 0xfb, 0x4b, 0xbd, 0x0c, 0xc4, 0xcd, 0xbc,
+ 0xd7, 0xc5, 0x11, 0xbd, 0x6b, 0xe3, 0xf5, 0xbb, 0xda, 0x4d, 0x75, 0x3d, 0xb0,
+ 0xf1, 0x39, 0xbd, 0x02, 0x4e, 0x00, 0xbd, 0xcf, 0x22, 0x81, 0x3d, 0x48, 0x54,
+ 0x10, 0xbd, 0x93, 0x8c, 0x42, 0x3a, 0x62, 0x1e, 0x18, 0x3d, 0xb5, 0x1d, 0x8d,
+ 0x3d, 0xbe, 0x37, 0x54, 0xbc, 0x9e, 0xa3, 0x92, 0xbc, 0x6a, 0x91, 0x7b, 0x3d,
+ 0xc5, 0x13, 0x8c, 0xbb, 0x30, 0x93, 0x55, 0xbd, 0x01, 0x29, 0x2b, 0xbd, 0xd4,
+ 0x57, 0x3a, 0xbd, 0xaf, 0xbc, 0xed, 0x3c, 0x65, 0xfe, 0x66, 0xbd, 0x2c, 0x98,
+ 0x11, 0x3d, 0x6e, 0xcf, 0x7c, 0xbd, 0xbe, 0xb4, 0x49, 0x3d, 0x17, 0x7c, 0x4f,
+ 0xbc, 0x13, 0xfc, 0x28, 0x3d, 0x28, 0xca, 0x2b, 0xbd, 0xdf, 0x3e, 0xa3, 0x3b,
+ 0x7e, 0xf4, 0x99, 0xbd, 0x9d, 0x89, 0x35, 0xbc, 0x70, 0x4c, 0x8a, 0xbd, 0xf9,
+ 0x58, 0x3a, 0xbd, 0x6f, 0xa9, 0x4f, 0x3d, 0x30, 0xce, 0x59, 0xbc, 0x52, 0xd4,
+ 0x41, 0xbd, 0x0d, 0x88, 0x2d, 0xbd, 0x94, 0xe1, 0x30, 0x3d, 0x7a, 0x53, 0xcd,
+ 0xbb, 0x2d, 0xcc, 0x75, 0x3c, 0x18, 0x30, 0x24, 0x3d, 0xfb, 0xa8, 0x07, 0x3d,
+ 0xa8, 0x1f, 0x19, 0xbc, 0xdf, 0x0a, 0x1c, 0x3d, 0x76, 0x06, 0x31, 0x3d, 0x6c,
+ 0x40, 0x82, 0x3c, 0x72, 0xb0, 0x82, 0xbd, 0x10, 0xae, 0x67, 0x3d, 0x00, 0x02,
+ 0xb5, 0x3a, 0x0a, 0xcd, 0x29, 0x3d, 0x7a, 0xf4, 0x27, 0x3c, 0x9d, 0xe2, 0x75,
+ 0xbd, 0x1e, 0xcd, 0x09, 0x3c, 0xa7, 0x3e, 0x25, 0xbd, 0x90, 0xb7, 0x8b, 0xbd,
+ 0xac, 0x2e, 0x6c, 0x3c, 0x22, 0x59, 0x79, 0x3d, 0xaf, 0x3b, 0x02, 0xba, 0x40,
+ 0xb8, 0x2c, 0x3d, 0xe8, 0x48, 0x6e, 0x3d, 0x13, 0xdb, 0x2f, 0x3b, 0x89, 0x0e,
+ 0x82, 0x3c, 0xdf, 0xe9, 0xc4, 0xbc, 0xc9, 0x26, 0x19, 0xbc, 0x67, 0x6b, 0x50,
+ 0x3d, 0xc0, 0x4c, 0x10, 0xbd, 0x30, 0xa9, 0x40, 0x3c, 0x12, 0x2f, 0xb1, 0x3c,
+ 0x3e, 0x0e, 0x00, 0xbd, 0xe9, 0x1b, 0x6f, 0xbd, 0xe4, 0x4b, 0x81, 0xbd, 0x93,
+ 0xc1, 0x7f, 0x3d, 0xb7, 0x8d, 0x04, 0xbd, 0x68, 0x33, 0x29, 0xbc, 0xa4, 0x5e,
+ 0x60, 0x3d, 0x23, 0xc0, 0x0a, 0xbd, 0xf0, 0x22, 0x80, 0xbd, 0x79, 0xea, 0x47,
+ 0x3d, 0x10, 0x77, 0x87, 0x3d, 0xc1, 0xfb, 0x19, 0xbd, 0x9c, 0xf7, 0x7c, 0x3d,
+ 0x27, 0x74, 0xb9, 0xbc, 0xc6, 0xea, 0x25, 0x3d, 0x54, 0xbc, 0xa4, 0x3c, 0x88,
+ 0x18, 0x36, 0x3d, 0x74, 0xd5, 0xd3, 0x3c, 0x68, 0x6e, 0x24, 0x3d, 0x36, 0xb4,
+ 0x49, 0x3d, 0x3e, 0x98, 0x2c, 0xbd, 0x99, 0x3e, 0x47, 0xbd, 0x21, 0xac, 0x15,
+ 0x3d, 0xef, 0x4f, 0x26, 0xbd, 0xb4, 0x49, 0x3f, 0xbd, 0xf5, 0xbc, 0x0a, 0xbd,
+ 0x04, 0x05, 0x6f, 0x3d, 0xf1, 0x5f, 0x15, 0x3d, 0xca, 0x51, 0x3f, 0x3d, 0xc2,
+ 0x88, 0x3a, 0xbd, 0x40, 0xeb, 0xbf, 0x3c, 0x4c, 0x13, 0xb6, 0x3c, 0xe6, 0x26,
+ 0xfe, 0x3c, 0xda, 0xab, 0x95, 0xbd, 0xd8, 0xcf, 0x81, 0x3d, 0xa2, 0x19, 0x53,
+ 0xbd, 0x5d, 0x5e, 0x0d, 0xbd, 0xfe, 0x6b, 0x36, 0x3d, 0xfb, 0x27, 0x4c, 0xbd,
+ 0x36, 0x92, 0x43, 0xbd, 0x94, 0xee, 0x45, 0xbc, 0x8a, 0x6d, 0xe4, 0x3c, 0xa8,
+ 0xb1, 0x52, 0xbc, 0x1f, 0x82, 0x88, 0xbb, 0x73, 0x6b, 0x53, 0xbd, 0x56, 0xc3,
+ 0x6f, 0x3d, 0x78, 0x17, 0x4a, 0x3d, 0xf2, 0x2e, 0x77, 0xbd, 0x2e, 0xae, 0x2a,
+ 0x3d, 0xa0, 0xd4, 0xa8, 0x3c, 0xe0, 0xb4, 0xd8, 0x3c, 0x24, 0x6d, 0x6a, 0xbd,
+ 0x16, 0xd2, 0x58, 0xbd, 0x56, 0xf5, 0x5d, 0x3b, 0xae, 0xdb, 0x76, 0xbd, 0x16,
+ 0x9a, 0x9a, 0xbd, 0x7c, 0x79, 0x51, 0x3d, 0x72, 0x5b, 0xa7, 0xbc, 0xce, 0xbf,
+ 0x62, 0x3d, 0xab, 0xd8, 0x23, 0x3d, 0x7e, 0xfd, 0x23, 0x3d, 0x0c, 0x3d, 0x6b,
+ 0x3d, 0x6c, 0x2f, 0x87, 0x3c, 0x1e, 0x26, 0x00, 0xbc, 0xc3, 0x94, 0x6f, 0xbd,
+ 0xb3, 0x7d, 0x24, 0xbd, 0x2a, 0xfb, 0x71, 0x3d, 0xee, 0x5a, 0xeb, 0xbc, 0x6c,
+ 0x3e, 0x60, 0xbd, 0x6c, 0x46, 0xf5, 0x3c, 0x83, 0xe3, 0x17, 0x3b, 0xe6, 0x15,
+ 0x32, 0xbd, 0x45, 0xba, 0x05, 0xbd, 0x18, 0x9a, 0x72, 0x3d, 0x45, 0x9c, 0x83,
+ 0xbd, 0x08, 0x2b, 0x5e, 0x3d, 0x75, 0xea, 0xe8, 0xbc, 0x81, 0xb6, 0x84, 0x3b,
+ 0x4b, 0xf4, 0x16, 0xbd, 0x90, 0xf4, 0x16, 0x3d, 0x2b, 0x95, 0x53, 0xbc, 0x53,
+ 0x27, 0x4b, 0xbd, 0x00, 0x6c, 0xe7, 0x3b, 0x62, 0xbd, 0x83, 0xbd, 0xd8, 0x6f,
+ 0x87, 0x3c, 0x3c, 0x17, 0x65, 0x3c, 0x3b, 0x64, 0x7e, 0x3d, 0xbd, 0x05, 0x09,
+ 0xbd, 0x7f, 0x37, 0x88, 0xbd, 0x63, 0x0e, 0x98, 0xbd, 0x03, 0x67, 0x71, 0x3c,
+ 0x02, 0x06, 0xe5, 0x39, 0xe4, 0x9f, 0xe7, 0x3b, 0x93, 0x66, 0x93, 0xbd, 0xc6,
+ 0xcd, 0x7c, 0xbd, 0xde, 0xaf, 0x20, 0x3d, 0xd2, 0x18, 0x54, 0x3c, 0xac, 0xeb,
+ 0x62, 0xbd, 0x93, 0xf7, 0xa2, 0x3c, 0x4c, 0x4b, 0x00, 0x3d, 0x38, 0x67, 0x3d,
+ 0xbd, 0x81, 0xcb, 0xa2, 0x3c, 0x9b, 0xd5, 0x90, 0x3c, 0x35, 0x26, 0x0f, 0x3c,
+ 0xcb, 0x77, 0x45, 0xbd, 0x38, 0xe0, 0x48, 0xbd, 0x96, 0x9e, 0x1d, 0x3b, 0x7c,
+ 0x3f, 0xaf, 0xbc, 0xef, 0x49, 0xac, 0xbc, 0x07, 0x74, 0xcc, 0x3c, 0xc0, 0x22,
+ 0x42, 0xbb, 0x5b, 0x72, 0x62, 0x3d, 0xd0, 0x55, 0x95, 0xbd, 0xf7, 0x7d, 0x82,
+ 0x3d, 0x90, 0x79, 0xd9, 0x3b, 0xd0, 0xa1, 0x96, 0x3c, 0xbf, 0x32, 0x8a, 0x3d,
+ 0xbd, 0xf0, 0x57, 0x3d, 0x5f, 0xf9, 0x3b, 0x3c, 0x4f, 0xea, 0x86, 0x3d, 0xbb,
+ 0x72, 0xaa, 0x3c, 0x42, 0x3b, 0x4c, 0x3d, 0x86, 0x1d, 0x86, 0x3c, 0x90, 0xc6,
+ 0x2a, 0xbd, 0x4f, 0x86, 0x76, 0x3d, 0x92, 0x79, 0x3d, 0x3d, 0x0d, 0x95, 0x92,
+ 0x3d, 0xbf, 0x77, 0x4e, 0x3d, 0x8b, 0x45, 0x03, 0xbd, 0x95, 0x0c, 0xff, 0xbc,
+ 0x62, 0x35, 0x11, 0xbb, 0xbd, 0x74, 0x28, 0x3d, 0xaf, 0x87, 0x7f, 0xbd, 0x8e,
+ 0xb8, 0x06, 0xbd, 0x0f, 0xbd, 0x3e, 0x3d, 0xe6, 0xd4, 0x41, 0xbd, 0x80, 0x81,
+ 0xac, 0x3c, 0x7a, 0xec, 0x82, 0xbc, 0x01, 0xac, 0x93, 0xbd, 0xe8, 0xba, 0xb3,
+ 0xbb, 0xcf, 0x47, 0x8f, 0xbb, 0x11, 0x6f, 0x57, 0x3d, 0x74, 0xf5, 0x9d, 0x3c,
+ 0x67, 0x6e, 0x01, 0xbd, 0xa6, 0x8c, 0x8f, 0xbd, 0xe4, 0x48, 0x30, 0xbd, 0x80,
+ 0xa7, 0x88, 0xbb, 0x48, 0x69, 0xea, 0x3c, 0x20, 0x78, 0x14, 0x3b, 0x18, 0xc4,
+ 0xca, 0xbc, 0xd6, 0x83, 0xcb, 0x3c, 0x88, 0x63, 0xd1, 0x3c, 0x02, 0x3a, 0x1b,
+ 0xbc, 0x02, 0x15, 0x13, 0x3c, 0xbe, 0x71, 0xf0, 0xbb, 0xe1, 0x3c, 0x12, 0xbd,
+ 0xa6, 0x23, 0x33, 0x3c, 0xc8, 0x04, 0xee, 0x3c, 0x78, 0x7e, 0x4d, 0x3c, 0x7f,
+ 0xd1, 0x95, 0xbc, 0xa3, 0x48, 0x22, 0x3c, 0x6d, 0x33, 0x77, 0xbd, 0xfc, 0x4f,
+ 0xc7, 0xbc, 0x8c, 0x5c, 0x8c, 0xbd, 0x98, 0x32, 0x02, 0xbd, 0x5f, 0x37, 0x00,
+ 0x3d, 0x41, 0xea, 0x7f, 0x3d, 0x4b, 0x38, 0x77, 0xbc, 0x47, 0x90, 0x92, 0xbd,
+ 0x56, 0x10, 0x1f, 0xbd, 0x10, 0x70, 0x8e, 0xbb, 0x0a, 0x99, 0x7a, 0x3c, 0x46,
+ 0x4c, 0x7d, 0x3d, 0xc0, 0x71, 0x6d, 0x3d, 0xd8, 0x3f, 0x28, 0x3d, 0x84, 0xe3,
+ 0x2b, 0x3d, 0x31, 0xdc, 0x55, 0xbd, 0x6e, 0x0a, 0x34, 0x3d, 0x10, 0xff, 0x85,
+ 0x3c, 0x72, 0x7b, 0x1d, 0xbd, 0x7f, 0xf5, 0xb4, 0xbb, 0xfb, 0xef, 0x87, 0x3d,
+ 0xb5, 0x8a, 0x4f, 0x3c, 0x20, 0xd7, 0x40, 0xbd, 0x17, 0x2c, 0x38, 0xbd, 0xcb,
+ 0xd4, 0x6d, 0x3d, 0x3c, 0x24, 0x7a, 0xbd, 0xb3, 0x3d, 0x92, 0xbd, 0x18, 0xbe,
+ 0x99, 0xba, 0x29, 0xe3, 0x42, 0xbc, 0xf7, 0x2c, 0x8f, 0xbd, 0x34, 0xd9, 0xc7,
+ 0x3c, 0xac, 0x8c, 0x99, 0xbd, 0x40, 0xe4, 0xa5, 0x3c, 0x8d, 0xcf, 0x3d, 0x3d,
+ 0x81, 0xe9, 0x3e, 0x3d, 0x7a, 0xbb, 0x3f, 0x3d, 0xc7, 0x9b, 0x25, 0xbc, 0x84,
+ 0x26, 0xc3, 0xbb, 0x52, 0x3f, 0x7a, 0x3d, 0x7b, 0xdb, 0x69, 0xbd, 0x99, 0x0e,
+ 0x71, 0xbd, 0x4c, 0xb5, 0xa5, 0x3b, 0xcf, 0x2f, 0xfd, 0xbb, 0x6b, 0x5b, 0x0c,
+ 0x3b, 0x9e, 0xeb, 0x04, 0xbc, 0x00, 0x9d, 0xdc, 0xbb, 0x10, 0xc2, 0xc0, 0x3c,
+ 0x08, 0xa2, 0x31, 0xbd, 0xc0, 0x3c, 0xf9, 0x3a, 0xad, 0xd5, 0x55, 0xbd, 0x11,
+ 0xea, 0xf3, 0x3c, 0x80, 0x63, 0xfa, 0x3a, 0x30, 0x82, 0x48, 0x3b, 0x58, 0x5f,
+ 0x2c, 0xbd, 0xd4, 0x00, 0x83, 0xbd, 0x12, 0x38, 0x8a, 0xbd, 0xd2, 0xdf, 0x1e,
+ 0x3c, 0xd0, 0x71, 0x1b, 0x3d, 0x92, 0x5f, 0x56, 0xbd, 0x51, 0x29, 0x94, 0xbd,
+ 0x40, 0x81, 0x92, 0xbd, 0x04, 0x93, 0x82, 0xbd, 0x8c, 0xf7, 0x84, 0x3d, 0x8a,
+ 0x96, 0x85, 0xbd, 0x2a, 0x93, 0x3b, 0xba, 0xc7, 0x7c, 0x3b, 0xbd, 0xb0, 0x3d,
+ 0x50, 0x3d, 0xa0, 0xcb, 0x42, 0x3d, 0xad, 0x3c, 0x16, 0xbc, 0x59, 0xaa, 0x30,
+ 0xbd, 0xcd, 0x10, 0x91, 0xbc, 0xe8, 0xea, 0x35, 0xbd, 0x53, 0x63, 0x36, 0xbd,
+ 0xa9, 0x85, 0x82, 0x3c, 0x23, 0xbd, 0x36, 0xbd, 0x25, 0x81, 0xe9, 0x3c, 0x76,
+ 0x54, 0x6d, 0x3d, 0xc1, 0x4f, 0x69, 0xbd, 0x55, 0x6c, 0x8f, 0x3d, 0xd5, 0x0a,
+ 0x7d, 0xbd, 0x48, 0xbe, 0xd2, 0x3c, 0x5b, 0xce, 0x84, 0x3d, 0xaa, 0x8e, 0x46,
+ 0xbc, 0x9c, 0x93, 0xc9, 0x3c, 0x66, 0xb1, 0x45, 0x3d, 0xf1, 0xc0, 0x90, 0xbc,
+ 0x2d, 0x09, 0x22, 0x3d, 0xcc, 0x52, 0x20, 0x3d, 0xaa, 0xec, 0x70, 0x3d, 0x3a,
+ 0xbd, 0xac, 0xbb, 0x70, 0x69, 0x81, 0x3d, 0x43, 0x3f, 0x8b, 0xbc, 0x46, 0x6a,
+ 0x04, 0xbd, 0xac, 0x25, 0x5a, 0xbd, 0xc2, 0xb9, 0x74, 0xbd, 0x35, 0x78, 0xeb,
+ 0x3c, 0xe2, 0x31, 0x54, 0xbd, 0xa0, 0xb1, 0xfe, 0x3c, 0xaf, 0xd2, 0xf8, 0x3c,
+ 0x00, 0x44, 0x82, 0x3a, 0x70, 0xcc, 0x91, 0xbd, 0x82, 0x1f, 0x57, 0xbd, 0xc2,
+ 0xe4, 0x03, 0x3d, 0xd0, 0xbd, 0x80, 0xbd, 0x7a, 0xde, 0x41, 0xbd, 0xe9, 0xf4,
+ 0x3b, 0x3c, 0xf9, 0x96, 0x1a, 0xbd, 0xe2, 0x2e, 0x46, 0xbd, 0xae, 0xbd, 0x34,
+ 0xbd, 0xb4, 0xa2, 0x8c, 0xbc, 0xa8, 0x0e, 0x30, 0xbd, 0x56, 0xf8, 0x33, 0xbd,
+ 0xce, 0x69, 0x35, 0x3d, 0x52, 0x2f, 0xeb, 0xbc, 0x9f, 0xe0, 0x0f, 0xbd, 0xc9,
+ 0x34, 0x29, 0xbd, 0x43, 0x26, 0x1e, 0x3d, 0xc8, 0x03, 0x05, 0x3c, 0x0f, 0x46,
+ 0x97, 0x3c, 0x18, 0x4c, 0x0c, 0xbd, 0xb8, 0xf9, 0x1c, 0xbd, 0xbd, 0x84, 0x86,
+ 0xbd, 0xbe, 0x50, 0xb1, 0xbc, 0x26, 0x15, 0x57, 0x3c, 0xca, 0x9f, 0x77, 0xbc,
+ 0xc0, 0xea, 0xca, 0xba, 0x23, 0xde, 0x41, 0xbd, 0x9d, 0xb4, 0x5c, 0xbd, 0x46,
+ 0x03, 0x30, 0xbd, 0xd0, 0xb3, 0x37, 0x3d, 0xfd, 0xe6, 0x3e, 0x3d, 0x8a, 0x0e,
+ 0x6a, 0xbd, 0xf8, 0x91, 0x64, 0x3d, 0xb4, 0x0b, 0x76, 0x3d, 0xf2, 0x94, 0x5f,
+ 0x3d, 0x98, 0xe6, 0x78, 0x3c, 0xc4, 0xab, 0x1e, 0xbd, 0xdd, 0xb6, 0x77, 0xbd,
+ 0x56, 0x1e, 0x8c, 0x3d, 0x0f, 0xee, 0x15, 0xbd, 0x42, 0xb6, 0x92, 0xbd, 0x2c,
+ 0xea, 0x96, 0xbc, 0x90, 0xc4, 0x30, 0xbd, 0x2e, 0xdc, 0xc8, 0xbb, 0xe4, 0x79,
+ 0xb0, 0xbc, 0x2e, 0xe6, 0x08, 0x3d, 0x74, 0x81, 0x34, 0x3d, 0xc0, 0xd5, 0x48,
+ 0xbc, 0xd3, 0xf2, 0x3c, 0xbd, 0x34, 0x47, 0xef, 0x3c, 0x9a, 0xcb, 0xe5, 0x3c,
+ 0xe0, 0x94, 0xef, 0xba, 0x80, 0x36, 0x23, 0xbc, 0x08, 0xf9, 0x35, 0xbd, 0x0f,
+ 0x9d, 0x99, 0xbd, 0x71, 0xdf, 0x2e, 0xbd, 0xb5, 0xa6, 0x78, 0xbd, 0xfa, 0xa8,
+ 0x69, 0x3d, 0x97, 0xc3, 0xda, 0xbb, 0x37, 0x74, 0xdf, 0x3c, 0x7f, 0xc2, 0x88,
+ 0xbd, 0x53, 0x20, 0xbe, 0x3b, 0x9c, 0x7a, 0xd9, 0x3c, 0xa9, 0x4b, 0x01, 0xbd,
+ 0xfb, 0xf7, 0x00, 0xbd, 0xd5, 0xda, 0x41, 0x3d, 0x9d, 0x2a, 0x82, 0x3d, 0x9a,
+ 0x03, 0x01, 0x3d, 0x38, 0xa7, 0x1b, 0x3d, 0x40, 0x75, 0xef, 0x3c, 0x4a, 0xdc,
+ 0x1b, 0xbc, 0xd1, 0x1a, 0x41, 0x3d, 0x04, 0xee, 0x74, 0x3d, 0xdb, 0x3f, 0x71,
+ 0xbd, 0x86, 0xc4, 0x22, 0x3d, 0x99, 0x74, 0x78, 0xbc, 0x48, 0x90, 0x54, 0xbd,
+ 0x88, 0xae, 0xf9, 0x3c, 0x4f, 0xbe, 0x10, 0x3d, 0x7d, 0x35, 0x68, 0xbd, 0xb3,
+ 0xf9, 0x3d, 0x3d, 0x1b, 0x89, 0x85, 0xbb, 0x85, 0x05, 0xae, 0x3c, 0xfd, 0x18,
+ 0x5b, 0xbd, 0x2d, 0xfa, 0x7f, 0xbd, 0x6e, 0xad, 0x8c, 0xbd, 0x67, 0x72, 0x28,
+ 0x3d, 0x2c, 0x8b, 0x9a, 0x3c, 0xb3, 0x94, 0x57, 0xbd, 0xa4, 0x3e, 0xa8, 0xbc,
+ 0xa6, 0x6a, 0x06, 0x3d, 0xf8, 0x03, 0x33, 0x3d, 0x56, 0xb0, 0x7a, 0xbd, 0x47,
+ 0x97, 0x68, 0xbc, 0xd0, 0x17, 0x7a, 0xbd, 0xe8, 0xab, 0x7d, 0xbd, 0xec, 0x67,
+ 0xf9, 0xbb, 0x3d, 0x92, 0x83, 0xbd, 0x36, 0xa4, 0x00, 0xbd, 0x00, 0x1b, 0x45,
+ 0x3a, 0x39, 0x13, 0x88, 0xbd, 0x05, 0x63, 0x26, 0x3c, 0x53, 0x7b, 0xc9, 0x3c,
+ 0x67, 0x97, 0x7a, 0xbb, 0xfe, 0x71, 0xd6, 0xbc, 0x24, 0x84, 0x1e, 0xbd, 0x02,
+ 0xa3, 0x76, 0x3d, 0xff, 0x16, 0x69, 0x3d, 0x80, 0xf0, 0x21, 0x3d, 0x90, 0x11,
+ 0x48, 0xbd, 0xc8, 0xa9, 0x3f, 0xbd, 0xc8, 0x06, 0x25, 0xbd, 0xaa, 0xfe, 0x96,
+ 0xbd, 0xa4, 0xbe, 0x57, 0xbc, 0x6e, 0x82, 0x1d, 0x3d, 0xd6, 0xfa, 0x66, 0xbb,
+ 0x9a, 0x25, 0x20, 0x3d, 0xa3, 0x94, 0x27, 0xbb, 0x23, 0x2f, 0xcd, 0x3c, 0x5e,
+ 0xa4, 0x4e, 0x3d, 0x2a, 0x3b, 0x09, 0xbd, 0x4a, 0x40, 0x6f, 0x3d, 0xfe, 0xd8,
+ 0xe4, 0x3c, 0xab, 0xce, 0x56, 0xbd, 0x1d, 0x9a, 0x65, 0x3d, 0xb6, 0xf5, 0x76,
+ 0xbd, 0x88, 0x3d, 0x52, 0x3d, 0x0f, 0x1c, 0x50, 0xbd, 0x1d, 0x0d, 0x6a, 0x3d,
+ 0x99, 0x66, 0x98, 0xbd, 0x6e, 0xe2, 0xb9, 0x3c, 0x4c, 0x26, 0x82, 0xbd, 0xe2,
+ 0x3f, 0x65, 0xbd, 0x09, 0xa4, 0x8a, 0x3c, 0x19, 0x7d, 0x7d, 0xbd, 0xe6, 0xf8,
+ 0x1d, 0xbd, 0xfc, 0xe2, 0xee, 0xbc, 0x1d, 0xab, 0x89, 0x3d, 0x8e, 0xb4, 0xfe,
+ 0xbc, 0x68, 0x9c, 0x83, 0x3c, 0xf7, 0xa9, 0x0b, 0xbd, 0x3c, 0xed, 0x92, 0x3c,
+ 0x90, 0x72, 0xa5, 0x3c, 0x02, 0xd9, 0x69, 0xbd, 0xa9, 0x64, 0x2a, 0xbb, 0x6d,
+ 0x20, 0xf5, 0xbc, 0x0e, 0x44, 0x37, 0xbd, 0xc7, 0xf0, 0xde, 0x3c, 0xb6, 0xdb,
+ 0x71, 0x3d, 0xea, 0x6b, 0xda, 0xbc, 0xc8, 0x8f, 0x1d, 0xbd, 0xb9, 0x43, 0x05,
+ 0xbd, 0x6c, 0x4a, 0x78, 0xbc, 0xc0, 0xc3, 0x82, 0x3b, 0x4b, 0x41, 0x49, 0xbd,
+ 0xc1, 0xfc, 0xcb, 0x3b, 0x93, 0x21, 0x8d, 0xbd, 0xcf, 0x67, 0x7a, 0xbd, 0x58,
+ 0x9d, 0xdb, 0x3c, 0xd3, 0x71, 0x03, 0x3d, 0xaf, 0x55, 0x84, 0x3d, 0x71, 0x0c,
+ 0x5d, 0xbd, 0x4c, 0x19, 0x89, 0x3c, 0x7f, 0x29, 0x8b, 0x3d, 0xf6, 0xcd, 0xa9,
+ 0x3c, 0xaa, 0x00, 0x4c, 0x3d, 0x2b, 0xaa, 0x19, 0xbc, 0x93, 0xde, 0x16, 0xb9,
+ 0xda, 0xaf, 0x90, 0xbb, 0xf6, 0xde, 0x48, 0x3d, 0x00, 0x08, 0x29, 0x3b, 0xb2,
+ 0xe0, 0x82, 0xbc, 0x84, 0xf3, 0x40, 0xbc, 0xd4, 0x75, 0x08, 0x3d, 0x88, 0xe7,
+ 0x64, 0xbd, 0x68, 0xd6, 0x95, 0x3c, 0x1b, 0x70, 0x3f, 0x3d, 0x64, 0xfa, 0xfd,
+ 0xbc, 0xfc, 0x82, 0x61, 0x3d, 0x8e, 0x6e, 0x11, 0xbd, 0x0a, 0x0a, 0x9f, 0xbc,
+ 0xb5, 0x1d, 0x68, 0x3c, 0x7d, 0x9f, 0x86, 0x3d, 0xe6, 0x3f, 0x83, 0x3d, 0xf9,
+ 0xd6, 0xfe, 0x3c, 0x68, 0x0c, 0x61, 0xbd, 0x65, 0x33, 0x27, 0x3d, 0x2c, 0xcf,
+ 0x68, 0x3d, 0xb0, 0xc0, 0x14, 0xbd, 0xb0, 0xb2, 0x81, 0x3d, 0xc0, 0x9c, 0x89,
+ 0xbc, 0xae, 0x60, 0x8e, 0xbd, 0x92, 0xdd, 0x91, 0xbd, 0xc9, 0x0b, 0x85, 0x3d,
+ 0xa4, 0x00, 0xb1, 0xbc, 0x80, 0x9d, 0xf8, 0x3c, 0x1d, 0xc1, 0x98, 0xbd, 0x3e,
+ 0x88, 0xcd, 0x3c, 0x67, 0xc9, 0x66, 0x3c, 0x00, 0x46, 0x64, 0xba, 0x80, 0x3e,
+ 0x19, 0xbd, 0x18, 0xe0, 0x20, 0x3c, 0x50, 0xcb, 0xc0, 0x3b, 0xe3, 0xf3, 0x8c,
+ 0xbc, 0xac, 0x02, 0xd6, 0x3c, 0xca, 0x7a, 0x45, 0x3d, 0x95, 0xab, 0x47, 0xbd,
+ 0xe6, 0x14, 0x55, 0x3d, 0x88, 0x82, 0x09, 0x3d, 0x1c, 0x74, 0x91, 0x3c, 0xbf,
+ 0x00, 0x2f, 0x3c, 0x8c, 0xfc, 0x96, 0xbd, 0xcb, 0xa8, 0x9e, 0xbb, 0xb5, 0x6b,
+ 0x42, 0x3d, 0x0f, 0xed, 0x99, 0xbd, 0x6a, 0x9e, 0x45, 0xba, 0x50, 0xa3, 0x2d,
+ 0xbc, 0x6a, 0x95, 0x52, 0x3d, 0x18, 0x66, 0xd7, 0xbb, 0x65, 0x63, 0x7c, 0xbd,
+ 0xfe, 0xa8, 0xe1, 0xbc, 0x48, 0x89, 0x50, 0xbd, 0x64, 0x1d, 0xbe, 0x3c, 0x54,
+ 0xe9, 0x07, 0x3d, 0x2f, 0x27, 0x2b, 0x3d, 0x55, 0x02, 0x00, 0x3d, 0xb2, 0xbe,
+ 0x53, 0xbd, 0xd8, 0x03, 0x72, 0xbd, 0xd4, 0x63, 0x69, 0x3d, 0x1c, 0x9b, 0x7c,
+ 0xbd, 0x87, 0x6b, 0x83, 0xbd, 0xc8, 0x0e, 0x0f, 0xbd, 0xed, 0x88, 0x30, 0xbd,
+ 0xce, 0x02, 0x31, 0xbd, 0xae, 0xdd, 0x17, 0xbd, 0x03, 0x61, 0x43, 0xbd, 0xcf,
+ 0xd3, 0x03, 0xbd, 0x56, 0x0b, 0x57, 0xbd, 0x85, 0x33, 0x0d, 0xbd, 0x36, 0x8f,
+ 0x0b, 0xbd, 0x8e, 0x7d, 0x2c, 0xbc, 0x99, 0x21, 0x40, 0xbd, 0x9b, 0xf2, 0x62,
+ 0xbb, 0xcc, 0xaf, 0x3f, 0x3d, 0x3f, 0xc0, 0xab, 0x3c, 0xc1, 0x4d, 0x27, 0x3c,
+ 0x4b, 0x78, 0x30, 0x3d, 0x04, 0x65, 0xfe, 0x3b, 0xbe, 0x78, 0xb0, 0xbc, 0x9a,
+ 0xb9, 0xe8, 0xbc, 0x58, 0x9c, 0x5d, 0x3d, 0x95, 0x93, 0x65, 0x3d, 0xd9, 0xa8,
+ 0x41, 0xbd, 0x91, 0xb5, 0x36, 0x3d, 0x48, 0xc5, 0x84, 0xbd, 0xf8, 0x98, 0x3c,
+ 0x3c, 0x07, 0x2e, 0x96, 0xbd, 0xf2, 0xa1, 0x2b, 0xba, 0xdc, 0xa1, 0x10, 0xbd,
+ 0x3a, 0xa4, 0xdb, 0xbc, 0x03, 0x75, 0x63, 0xbd, 0x5f, 0x46, 0x3d, 0x3a, 0x75,
+ 0x7d, 0x56, 0x3d, 0x68, 0x12, 0xa8, 0xbc, 0x03, 0xf5, 0x98, 0xbd, 0xe0, 0x3c,
+ 0xe7, 0xbc, 0x90, 0xb6, 0xbb, 0xbb, 0x48, 0x0e, 0x08, 0x3d, 0x68, 0x30, 0x35,
+ 0x3c, 0xb4, 0x17, 0xcf, 0x3c, 0xf9, 0xd9, 0xf8, 0x3c, 0xc8, 0x7e, 0x09, 0xbc,
+ 0x84, 0xde, 0x45, 0xbd, 0xfe, 0xad, 0xf7, 0xbc, 0xdb, 0x10, 0x8b, 0xbd, 0x65,
+ 0xac, 0x40, 0x3d, 0x2f, 0xc7, 0x12, 0x3c, 0x60, 0x81, 0x62, 0x3d, 0x96, 0xbd,
+ 0xf6, 0x3c, 0xee, 0x7e, 0x80, 0x3d, 0x76, 0x78, 0x25, 0x3d, 0xec, 0x17, 0x1b,
+ 0xbc, 0x17, 0xa7, 0x2f, 0xbd, 0x5c, 0x17, 0x4e, 0x3d, 0x92, 0x4e, 0x99, 0xbb,
+ 0xe6, 0xec, 0x1d, 0xbd, 0xcf, 0xd4, 0x15, 0x3d, 0x36, 0x68, 0xcb, 0x3c, 0x05,
+ 0xd3, 0x68, 0x3c, 0x4d, 0x37, 0x96, 0x3c, 0x85, 0x4b, 0x98, 0x3b, 0x3e, 0xf9,
+ 0x6a, 0x3d, 0x42, 0xd5, 0x85, 0xbc, 0x35, 0xf1, 0x48, 0xbd, 0xae, 0x5a, 0x69,
+ 0x3b, 0xfc, 0xc3, 0x81, 0xbd, 0x3d, 0xe3, 0x71, 0xbd, 0xdb, 0x3b, 0x18, 0xbd,
+ 0x40, 0x90, 0x26, 0xbd, 0x5d, 0xef, 0x80, 0xbc, 0x94, 0x89, 0x9a, 0xbc, 0x96,
+ 0x7a, 0x33, 0xbd, 0x94, 0x61, 0x71, 0x3d, 0xe6, 0xaf, 0x5a, 0x3d, 0x5f, 0x3d,
+ 0x6a, 0x3b, 0x22, 0xcf, 0x23, 0xbc, 0xb1, 0x6f, 0x4b, 0xbb, 0x9a, 0x4b, 0xbe,
+ 0x3c, 0xd7, 0x02, 0x95, 0xbc, 0xb5, 0xfa, 0x4b, 0xbd, 0x8d, 0x7e, 0x85, 0xbc,
+ 0x12, 0x0b, 0x3c, 0x3d, 0xa5, 0x2c, 0xfc, 0xbb, 0xb0, 0xcc, 0xb2, 0xbb, 0xf2,
+ 0x03, 0x4a, 0xbd, 0x87, 0xe3, 0x1d, 0xbd, 0xcc, 0xd7, 0xed, 0x3c, 0x16, 0x63,
+ 0x73, 0xbc, 0x18, 0x4e, 0x47, 0x3d, 0x70, 0x95, 0x37, 0xbd, 0xfb, 0xdd, 0xc4,
+ 0x3c, 0x3d, 0x65, 0xfb, 0x3c, 0x96, 0xa0, 0x84, 0x3d, 0x60, 0x19, 0xff, 0xbb,
+ 0xa4, 0xbf, 0x4b, 0x3c, 0x5b, 0x63, 0x03, 0xbd, 0x8d, 0x86, 0xcb, 0xbb, 0x62,
+ 0xee, 0x76, 0xbd, 0x9c, 0x16, 0x73, 0x3d, 0x4f, 0xd8, 0x81, 0x3d, 0xe2, 0x7d,
+ 0xba, 0xbc, 0xd6, 0x7a, 0xb4, 0x3b, 0x61, 0x45, 0x87, 0x3d, 0xe1, 0x5e, 0x8a,
+ 0xbd, 0xfc, 0x1f, 0xc0, 0xbc, 0xc0, 0x87, 0x14, 0xbd, 0x3d, 0x53, 0x16, 0x3d,
+ 0x86, 0x91, 0x17, 0x3c, 0xa6, 0x1a, 0x71, 0xbc, 0xe7, 0x57, 0xf9, 0xbc, 0x27,
+ 0x13, 0x87, 0x3d, 0x98, 0x4e, 0x02, 0x3d, 0xe5, 0x9d, 0x13, 0x3d, 0x89, 0xbf,
+ 0x2e, 0x3c, 0xa0, 0x5f, 0x21, 0x3b, 0x80, 0xc1, 0xf4, 0x3b, 0x14, 0x22, 0x2a,
+ 0xbc, 0x33, 0xd3, 0x93, 0x3c, 0xd7, 0x3d, 0x6e, 0x3d, 0x2e, 0xcd, 0x81, 0xbd,
+ 0x71, 0xa3, 0x45, 0xbd, 0xde, 0xd6, 0x4f, 0x3d, 0xb7, 0xe7, 0x41, 0xbd, 0x27,
+ 0x86, 0xd6, 0x3c, 0x6b, 0x72, 0x85, 0x3d, 0x6d, 0x89, 0x11, 0xbd, 0x21, 0x7b,
+ 0x1a, 0xbd, 0x18, 0xf1, 0x38, 0xbd, 0xc3, 0xf7, 0xb1, 0x3c, 0xd7, 0xa0, 0x8e,
+ 0xbd, 0x6e, 0x16, 0x24, 0x3d, 0xc2, 0x2b, 0x2f, 0x3d, 0xc8, 0x1c, 0x82, 0x3c,
+ 0x53, 0x30, 0x24, 0xbc, 0xd9, 0x49, 0x1f, 0xbd, 0xea, 0x81, 0x3f, 0x3d, 0xc4,
+ 0xb7, 0x1a, 0x3d, 0xc3, 0x0a, 0x0b, 0xbd, 0x29, 0x5d, 0x88, 0x3d, 0x3f, 0xb6,
+ 0x9f, 0xbc, 0x97, 0x16, 0x72, 0xbd, 0x67, 0x40, 0xa4, 0xbc, 0x67, 0x64, 0x59,
+ 0xbc, 0xd0, 0x90, 0xfd, 0xbc, 0x48, 0xa3, 0x1b, 0xbd, 0x5f, 0x6c, 0xf2, 0x3c,
+ 0xe4, 0x81, 0x97, 0xbd, 0x2b, 0xe9, 0x86, 0x3d, 0x6c, 0xa1, 0x06, 0xbd, 0xa8,
+ 0x7c, 0x2a, 0x3c, 0x07, 0xca, 0x8d, 0x3b, 0x1f, 0x0c, 0x21, 0xbd, 0xb0, 0x7f,
+ 0x90, 0xbd, 0xe5, 0x3f, 0x17, 0x3d, 0x03, 0x58, 0x43, 0xbd, 0xe7, 0x24, 0x42,
+ 0xbd, 0xdd, 0xf2, 0x95, 0xbd, 0x58, 0xd0, 0xd9, 0x3c, 0xa9, 0xbe, 0x00, 0x3d,
+ 0x40, 0x4c, 0x97, 0xbd, 0x06, 0x0f, 0x63, 0xbd, 0x44, 0x04, 0x42, 0xbd, 0x69,
+ 0xfa, 0xd6, 0xbb, 0x40, 0x95, 0xca, 0xba, 0xba, 0x29, 0x80, 0xbd, 0x40, 0x04,
+ 0x8f, 0xbd, 0x9b, 0xd2, 0x71, 0xbd, 0x16, 0x0f, 0x36, 0xbd, 0xcf, 0xe9, 0x77,
+ 0x3d, 0x00, 0x20, 0xe2, 0xb8, 0x77, 0xed, 0x89, 0xba, 0x27, 0x9d, 0x7d, 0xbd,
+ 0x8b, 0x7d, 0xa1, 0x3c, 0xaf, 0x02, 0x41, 0xbd, 0x76, 0x0a, 0x80, 0xbd, 0xc5,
+ 0xbe, 0x0c, 0x3c, 0x65, 0xbc, 0x53, 0x3c, 0x23, 0x57, 0x71, 0x3d, 0x4c, 0x69,
+ 0xad, 0x3c, 0xe6, 0x35, 0x70, 0xbd, 0x4a, 0x71, 0x0f, 0x3d, 0x60, 0x74, 0x60,
+ 0xbd, 0x00, 0x21, 0xff, 0xbc, 0x2e, 0x9e, 0x15, 0xbd, 0x5b, 0xfa, 0xfb, 0xbc,
+ 0x70, 0x17, 0xe6, 0x3c, 0xb8, 0x5a, 0x03, 0x3d, 0x26, 0x71, 0x82, 0x3d, 0x40,
+ 0xf1, 0xe2, 0xbb, 0xad, 0xa1, 0x7d, 0xbd, 0xbb, 0x38, 0xb0, 0xbc, 0xa8, 0x2e,
+ 0x18, 0x3d, 0x29, 0xe4, 0x01, 0xbd, 0x3d, 0xed, 0x75, 0xbc, 0xc1, 0x90, 0x09,
+ 0x3d, 0x7a, 0x35, 0xf9, 0xbc, 0x0a, 0x1f, 0x8e, 0xbc, 0x7b, 0x9e, 0x05, 0xbc,
+ 0x00, 0xe1, 0x18, 0x3c, 0x90, 0xf1, 0xc1, 0xbc, 0xbc, 0xfc, 0x87, 0x3d, 0x28,
+ 0x2a, 0x48, 0x3c, 0xcf, 0x41, 0xf4, 0xbc, 0xa3, 0x20, 0x7a, 0xbd, 0x58, 0x65,
+ 0x0c, 0x3b, 0x5b, 0x8e, 0xd7, 0xbc, 0x09, 0x03, 0x87, 0x3d, 0xfa, 0xcf, 0xaa,
+ 0xbc, 0x12, 0x45, 0x83, 0xbd, 0x29, 0x24, 0x89, 0xbd, 0x77, 0x6e, 0x98, 0xbd,
+ 0x50, 0xf7, 0x91, 0xbb, 0x3e, 0x17, 0x86, 0x3c, 0xcf, 0x82, 0x54, 0x3d, 0x12,
+ 0x48, 0xff, 0xbb, 0xa8, 0x39, 0xa6, 0x3c, 0x57, 0xfc, 0xb4, 0xbc, 0xc5, 0x25,
+ 0x30, 0xbd, 0xcd, 0xbc, 0x04, 0xbd, 0x10, 0x87, 0xb4, 0xbc, 0x16, 0x7b, 0x6e,
+ 0xbd, 0xba, 0x00, 0x5f, 0xbd, 0xf8, 0x14, 0xac, 0x3c, 0xdf, 0x4d, 0x88, 0xbd,
+ 0x2e, 0xd2, 0xb6, 0xbc, 0x8e, 0x7a, 0x8e, 0xbd, 0xac, 0xdb, 0xe2, 0x3c, 0x7b,
+ 0x12, 0x8b, 0x3d, 0x03, 0xe2, 0x91, 0xbd, 0x43, 0xac, 0x3c, 0xbc, 0x5a, 0xc7,
+ 0x52, 0x3d, 0x5e, 0xec, 0x40, 0x3d, 0x1a, 0xb0, 0x1f, 0xbc, 0x1d, 0x9c, 0x92,
+ 0xbd, 0xd3, 0x03, 0xfd, 0x3c, 0xdd, 0x22, 0x0a, 0xbb, 0xe2, 0x2a, 0x89, 0x3d,
+ 0x94, 0xb6, 0xd4, 0xbb, 0x74, 0x26, 0xb8, 0xbc, 0xc6, 0x7a, 0x35, 0xbd, 0xa8,
+ 0xb7, 0x8e, 0xbd, 0xbe, 0x94, 0x36, 0xbd, 0x22, 0xc0, 0x03, 0xbd, 0x40, 0xb4,
+ 0xe5, 0x3a, 0x53, 0xb5, 0x14, 0xbc, 0xac, 0x00, 0x3a, 0xbc, 0xb3, 0xd9, 0xee,
+ 0x3c, 0xb5, 0x7c, 0xae, 0xbb, 0xd6, 0xb2, 0x75, 0x3c, 0x2f, 0x0e, 0x1a, 0xbd,
+ 0xf0, 0xb2, 0x47, 0xbd, 0xad, 0x36, 0x50, 0xbb, 0x19, 0x86, 0x36, 0xbd, 0xb4,
+ 0x02, 0xe4, 0xbc, 0xe2, 0x37, 0x10, 0x3d, 0x17, 0xcb, 0x86, 0xbd, 0x33, 0x35,
+ 0x5e, 0x3c, 0x63, 0xfe, 0x8f, 0x3d, 0x8e, 0x91, 0x6c, 0xbd, 0xf8, 0x55, 0x6f,
+ 0x3c, 0x60, 0xc0, 0xb6, 0x3c, 0x09, 0x23, 0x8d, 0xbd, 0x75, 0xae, 0x89, 0x3d,
+ 0x4e, 0xb2, 0x76, 0x3d, 0xbc, 0x52, 0x57, 0xbd, 0x5c, 0xf2, 0xde, 0xbc, 0x5a,
+ 0xc5, 0xc5, 0xbc, 0x01, 0xbf, 0x1a, 0xbd, 0xc4, 0x10, 0x37, 0xbd, 0xe9, 0xe5,
+ 0x7a, 0x3b, 0xa0, 0x03, 0x58, 0xbd, 0x4f, 0xe4, 0x66, 0x3d, 0xbd, 0xc0, 0xa8,
+ 0xbc, 0xd0, 0x05, 0xb9, 0x3c, 0xd3, 0xb7, 0xd9, 0x3c, 0xf2, 0x28, 0x2d, 0x3d,
+ 0x69, 0x78, 0x38, 0xbd, 0x55, 0x58, 0x49, 0xbc, 0xc5, 0x5b, 0xc2, 0x3c, 0x67,
+ 0x0d, 0x40, 0x3d, 0x02, 0xec, 0x2b, 0x3d, 0x60, 0x6a, 0xac, 0x3c, 0x6a, 0x9c,
+ 0x65, 0x3d, 0x19, 0x18, 0x4d, 0xbd, 0x05, 0xaf, 0xbd, 0xbc, 0x22, 0x2b, 0x54,
+ 0xbd, 0x1d, 0x0c, 0xd9, 0xbc, 0x0a, 0xf7, 0xfd, 0x3a, 0x5a, 0x18, 0x23, 0x3d,
+ 0xeb, 0xfc, 0x84, 0xbd, 0xaf, 0x71, 0x0c, 0xbc, 0x98, 0x72, 0x5e, 0x3c, 0x18,
+ 0x8b, 0x88, 0x3c, 0xa4, 0x1d, 0x8f, 0xbb, 0x3c, 0x3d, 0xbf, 0xbc, 0x18, 0x7a,
+ 0xc7, 0x3c, 0x2e, 0x1c, 0x77, 0xbd, 0x50, 0x47, 0x55, 0x3c, 0x5c, 0xa7, 0x23,
+ 0xbc, 0x0c, 0x4e, 0xda, 0x3c, 0x00, 0x25, 0x7f, 0x3d, 0xdc, 0xbd, 0x85, 0xbd,
+ 0xee, 0x84, 0x91, 0xbc, 0x0b, 0xcb, 0x81, 0x3d, 0x7a, 0x5f, 0x04, 0xbc, 0xde,
+ 0x3d, 0x7b, 0xbb, 0x05, 0xa9, 0x79, 0x3d, 0x6c, 0x47, 0x2e, 0xbd, 0x9a, 0x8c,
+ 0x7c, 0x3d, 0xee, 0xc6, 0x93, 0xbd, 0xaf, 0xd0, 0xd9, 0xbc, 0x33, 0x14, 0x3c,
+ 0xbd, 0xe3, 0x36, 0x6e, 0x3d, 0x0b, 0x9a, 0x55, 0xbc, 0xe9, 0x83, 0x84, 0x3d,
+ 0xd6, 0xb4, 0x6c, 0x3d, 0xc4, 0xea, 0xd4, 0x3c, 0x48, 0xb4, 0x20, 0x3d, 0x6e,
+ 0xc9, 0x53, 0x3d, 0x4e, 0x95, 0xbb, 0xbc, 0x15, 0x0c, 0x86, 0x3d, 0xdc, 0x7a,
+ 0x40, 0xbd, 0x98, 0x24, 0x6d, 0xbc, 0x2f, 0xea, 0x8a, 0xbd, 0x78, 0x00, 0xb4,
+ 0x3c, 0x8f, 0x53, 0x52, 0x3d, 0xc2, 0xfb, 0x11, 0x3d, 0x10, 0x7e, 0x81, 0x3c,
+ 0xae, 0xf3, 0x3e, 0x3d, 0x34, 0x8d, 0xeb, 0x3c, 0x72, 0x86, 0xd6, 0xbc, 0xd5,
+ 0x02, 0xad, 0x3b, 0x9d, 0x1c, 0x41, 0xbd, 0xda, 0x6b, 0x23, 0x3d, 0xaf, 0xa0,
+ 0x2b, 0x3d, 0x91, 0xd9, 0x5c, 0x3d, 0xce, 0x13, 0x4c, 0xbd, 0xa8, 0x7a, 0x4a,
+ 0x3d, 0xfd, 0xc5, 0x29, 0xbd, 0xff, 0xa6, 0x50, 0xbd, 0x9d, 0x04, 0x43, 0x3d,
+ 0x49, 0x9f, 0x82, 0xbd, 0xe0, 0x8c, 0x87, 0xbd, 0xb7, 0xb5, 0x64, 0xbd, 0x5e,
+ 0x55, 0x27, 0x3d, 0x8d, 0xde, 0x41, 0x3d, 0x19, 0x6b, 0x23, 0xbc, 0x6f, 0x71,
+ 0xf6, 0x3c, 0x04, 0x56, 0x24, 0x3d, 0xb8, 0x20, 0x3a, 0x3c, 0x97, 0xb4, 0x91,
+ 0xbd, 0x87, 0xf5, 0x6d, 0x3d, 0x80, 0x5b, 0x9d, 0x3c, 0x70, 0x4c, 0xad, 0x3b,
+ 0xff, 0x49, 0x81, 0x3d, 0x88, 0x14, 0x89, 0xbc, 0x72, 0xde, 0x25, 0xbd, 0x62,
+ 0xa9, 0x21, 0x3d, 0x94, 0x43, 0x59, 0xbc, 0xb1, 0x5a, 0x92, 0x3d, 0x9d, 0x57,
+ 0x6b, 0x3c, 0x5d, 0xa8, 0x8d, 0x3d, 0xd7, 0xf7, 0x08, 0x3d, 0x1c, 0x07, 0xe3,
+ 0xbc, 0xdd, 0xfc, 0xb5, 0xbc, 0xbc, 0xca, 0x84, 0x3d, 0x5c, 0x9e, 0x18, 0xbd,
+ 0xd5, 0x6d, 0x86, 0x3d, 0x42, 0x2b, 0x58, 0x3c, 0x0a, 0xc6, 0x33, 0x3d, 0x2c,
+ 0x1e, 0xf6, 0xbc, 0xb8, 0x48, 0x46, 0xbd, 0x26, 0xd6, 0x88, 0xbd, 0xd8, 0x45,
+ 0x2e, 0x3d, 0x7f, 0x28, 0x4f, 0x3d, 0x52, 0x42, 0x40, 0xbc, 0xad, 0xc8, 0x45,
+ 0xbd, 0xaa, 0x1c, 0x27, 0xbd, 0x32, 0x83, 0x72, 0xbb, 0xd2, 0xc5, 0x33, 0x3b,
+ 0x1e, 0x2f, 0x6f, 0x3d, 0x9e, 0x5c, 0x1c, 0x3d, 0x2d, 0xfb, 0xc5, 0xbc, 0x3d,
+ 0x12, 0x68, 0x3b, 0xb4, 0x98, 0xe9, 0x3c, 0xb9, 0xbd, 0xdf, 0x3a, 0xe0, 0xac,
+ 0x2c, 0x3d, 0x10, 0x5c, 0x87, 0x3c, 0x80, 0xd6, 0x2d, 0xba, 0x18, 0x73, 0x94,
+ 0x3c, 0xb8, 0x3c, 0x39, 0xbc, 0x48, 0x64, 0xda, 0x3c, 0x54, 0xdf, 0x05, 0x3d,
+ 0x04, 0x35, 0xdf, 0x3c, 0xdb, 0xf8, 0xfb, 0xba, 0xc3, 0x2d, 0xc1, 0xb8, 0x0e,
+ 0x8c, 0xd1, 0x3c, 0x4f, 0x12, 0x14, 0x3d, 0x50, 0xbc, 0x7d, 0xbc, 0xc7, 0x20,
+ 0x88, 0xbd, 0x79, 0x45, 0x2f, 0xbd, 0x77, 0x83, 0x55, 0xbc, 0x42, 0x7e, 0x95,
+ 0xbd, 0x9d, 0xfb, 0x4d, 0xbd, 0x92, 0xcc, 0x89, 0xbd, 0x84, 0x1d, 0x03, 0xbd,
+ 0x1f, 0xe1, 0x86, 0xbb, 0xca, 0xee, 0x4e, 0x3c, 0x15, 0x39, 0x55, 0xbd, 0x94,
+ 0x4b, 0x87, 0xbd, 0xf3, 0xf0, 0x0d, 0xbd, 0x4d, 0x17, 0x7b, 0x3d, 0xe5, 0x0b,
+ 0x95, 0xbc, 0x10, 0x50, 0x20, 0xbd, 0x60, 0x74, 0x7c, 0xbd, 0x50, 0x76, 0xad,
+ 0xbc, 0xdd, 0x59, 0x89, 0x3c, 0xa1, 0xcc, 0x10, 0x3d, 0x23, 0x4c, 0x37, 0x3c,
+ 0x50, 0x0e, 0xa6, 0x3c, 0x02, 0x0e, 0x24, 0xbd, 0x9d, 0x9f, 0x40, 0xbd, 0xba,
+ 0xe1, 0x51, 0xbd, 0x9e, 0xe5, 0x2a, 0xbd, 0x44, 0x07, 0xc8, 0x3c, 0xc0, 0x11,
+ 0x85, 0x3c, 0x1c, 0xde, 0x40, 0xbd, 0x34, 0xd3, 0xe3, 0x3c, 0xf1, 0xae, 0xdb,
+ 0xbc, 0xea, 0xbb, 0xf0, 0xbc, 0x32, 0x81, 0xb7, 0x3c, 0x1b, 0xe9, 0x4f, 0xbd,
+ 0x47, 0xd3, 0xb7, 0xbc, 0xc4, 0x4b, 0xe7, 0xbc, 0xf3, 0x52, 0x3b, 0x3d, 0x10,
+ 0xb8, 0xb6, 0x3b, 0x0b, 0xb8, 0x33, 0xbc, 0xb1, 0xba, 0x29, 0x3d, 0x93, 0xfc,
+ 0x00, 0xbd, 0xdf, 0x63, 0x30, 0xbd, 0xac, 0x1d, 0x1e, 0x3d, 0x52, 0xf7, 0x15,
+ 0xbd, 0x7f, 0xea, 0x53, 0xbd, 0x29, 0xe4, 0x2f, 0xbc, 0x5e, 0xf0, 0xb7, 0x3c,
+ 0xb1, 0xff, 0x09, 0xbd, 0xc9, 0x0f, 0xae, 0x3c, 0x5a, 0xc0, 0x06, 0xbd, 0x34,
+ 0x15, 0x10, 0xbd, 0x76, 0xea, 0x95, 0xbc, 0x60, 0xd8, 0x2d, 0x3c, 0x4c, 0x12,
+ 0x77, 0xbc, 0x2d, 0xb6, 0x88, 0x3d, 0x7f, 0x15, 0xe4, 0x3c, 0xb0, 0xef, 0xf0,
+ 0xbc, 0x79, 0x32, 0x1c, 0xbd, 0x4d, 0xbc, 0x4b, 0xbd, 0xae, 0x6d, 0x64, 0x3d,
+ 0x0c, 0x44, 0x82, 0xbc, 0x15, 0x4f, 0x3e, 0xbd, 0x86, 0x54, 0xab, 0xbc, 0x78,
+ 0xea, 0x0d, 0xbd, 0x73, 0xc6, 0x87, 0xbd, 0x06, 0xed, 0x32, 0xbd, 0xfd, 0x03,
+ 0x8a, 0xbd, 0x89, 0x8b, 0x30, 0xbd, 0x40, 0x73, 0x0d, 0xbd, 0xcf, 0x80, 0x84,
+ 0xbd, 0x3c, 0x00, 0x69, 0xbd, 0xeb, 0x8a, 0xf8, 0x3b, 0xc1, 0xa4, 0x93, 0xbd,
+ 0x25, 0x74, 0x69, 0xbd, 0x11, 0xe5, 0x00, 0x3d, 0x2d, 0xa0, 0x01, 0x3d, 0xf9,
+ 0x7d, 0x02, 0xbc, 0x55, 0x26, 0x30, 0x3d, 0xad, 0xf7, 0x50, 0x3c, 0xd6, 0xb1,
+ 0x68, 0x3d, 0xce, 0x49, 0x71, 0xbd, 0xcf, 0xde, 0xaa, 0x3b, 0x5d, 0x6e, 0x91,
+ 0xbd, 0xb4, 0xf1, 0x1a, 0xbd, 0xc7, 0xeb, 0xc2, 0x3c, 0x50, 0x74, 0xd4, 0xbb,
+ 0xe8, 0x25, 0x1f, 0x3d, 0xdb, 0x0a, 0x8e, 0xbc, 0x9d, 0x5d, 0x73, 0xbd, 0x70,
+ 0xce, 0x01, 0xbc, 0xc4, 0x22, 0x84, 0x3d, 0x80, 0x3b, 0x1d, 0x3c, 0x3d, 0xfa,
+ 0x15, 0xbd, 0x45, 0xd7, 0x9a, 0xbd, 0x4d, 0xa2, 0x4e, 0xbd, 0x41, 0x6e, 0x96,
+ 0xbc, 0xbf, 0xe4, 0x6c, 0x3d, 0x90, 0x3c, 0x21, 0x3d, 0x99, 0x76, 0x83, 0x3c,
+ 0xe1, 0xb9, 0x6f, 0x3d, 0x24, 0xb9, 0xcf, 0xbc, 0xc0, 0x33, 0xee, 0xbb, 0x8d,
+ 0xa6, 0xf0, 0xbc, 0x40, 0x81, 0x3f, 0x3d, 0x43, 0x82, 0x7e, 0x3c, 0xfa, 0x13,
+ 0x7a, 0x3d, 0x91, 0xcd, 0x0a, 0xbc, 0x80, 0x3e, 0x61, 0x3d, 0x65, 0xef, 0x56,
+ 0xbd, 0x44, 0x57, 0x90, 0xbd, 0xb4, 0x86, 0x7a, 0x3c, 0x70, 0xf5, 0xbd, 0x3c,
+ 0x90, 0x5c, 0xdc, 0x3c, 0x13, 0xe5, 0xeb, 0xbc, 0x30, 0x7a, 0x48, 0x3d, 0xfa,
+ 0x4c, 0xbe, 0x3c, 0x4d, 0x35, 0x2e, 0xbd, 0x32, 0x33, 0xdb, 0xbc, 0xab, 0x4c,
+ 0x0a, 0xbd, 0x12, 0x58, 0xad, 0xbc, 0x20, 0x07, 0x0c, 0x3c, 0xbc, 0xb5, 0xa6,
+ 0x3c, 0xb6, 0x70, 0x8f, 0xbd, 0xbc, 0x9a, 0x57, 0x3d, 0xb3, 0x6f, 0x82, 0xbd,
+ 0x52, 0xb9, 0x5c, 0x3c, 0x0d, 0x71, 0xd9, 0x3c, 0x18, 0x70, 0x0a, 0x3d, 0x80,
+ 0x7b, 0x0a, 0x3b, 0xee, 0x75, 0x27, 0xbc, 0x63, 0x74, 0x56, 0xbd, 0xf0, 0x20,
+ 0x5f, 0x3b, 0xfb, 0x77, 0x1e, 0xba, 0xb8, 0x6c, 0xee, 0x3c, 0x01, 0xd0, 0xef,
+ 0x3c, 0xb2, 0x68, 0x12, 0xbd, 0x51, 0xf6, 0x3c, 0xbd, 0x12, 0xb0, 0x2e, 0xbd,
+ 0x11, 0xfd, 0x5e, 0xbd, 0x48, 0xea, 0xb4, 0xbc, 0xce, 0xca, 0x88, 0x3d, 0x38,
+ 0x57, 0x40, 0x3d, 0x11, 0xfa, 0x8b, 0x3d, 0xc0, 0x34, 0x36, 0x3d, 0xe4, 0x82,
+ 0x8e, 0xbd, 0xbd, 0x95, 0x59, 0xbd, 0xf0, 0x8b, 0x43, 0xbd, 0x93, 0x9b, 0x0a,
+ 0xbc, 0xb7, 0x99, 0x4d, 0x3c, 0x46, 0x42, 0x1d, 0x3d, 0x00, 0x19, 0x3a, 0xbd,
+ 0x1c, 0xd3, 0x5a, 0xbd, 0xff, 0x09, 0x02, 0xbd, 0xa1, 0x01, 0x8e, 0x3d, 0xc3,
+ 0x9e, 0xd8, 0xbb, 0x28, 0xb5, 0x2d, 0x3d, 0x56, 0x9c, 0x16, 0x3d, 0x78, 0xe6,
+ 0x1e, 0xbc, 0x06, 0x56, 0x14, 0x3d, 0xbc, 0x3f, 0x88, 0xbd, 0x34, 0x45, 0x94,
+ 0xbc, 0xfb, 0xb1, 0x0a, 0xbd, 0x67, 0x87, 0x90, 0xbd, 0x4d, 0x75, 0x27, 0xbd,
+ 0x9f, 0xc8, 0x60, 0x3b, 0x02, 0xc4, 0xb0, 0xbc, 0x54, 0x5b, 0x5f, 0xbd, 0xe3,
+ 0x43, 0xff, 0xbc, 0xf6, 0xf7, 0x39, 0xbc, 0x99, 0x4c, 0x82, 0xbd, 0xda, 0x99,
+ 0xa9, 0x3b, 0x6a, 0xd5, 0xee, 0xbc, 0x1e, 0xc1, 0x93, 0xbd, 0xc2, 0x21, 0x52,
+ 0xbc, 0x52, 0xfc, 0x06, 0xbc, 0x70, 0x59, 0x85, 0xbd, 0x5d, 0xbd, 0x8a, 0xbd,
+ 0xe2, 0x10, 0x77, 0x3d, 0x36, 0x83, 0x90, 0xbd, 0x66, 0x9f, 0x90, 0xbc, 0x30,
+ 0x78, 0x4c, 0x3d, 0xd4, 0x2c, 0x8b, 0x3c, 0xe0, 0x8b, 0x4e, 0xbc, 0x31, 0x0f,
+ 0x80, 0xbd, 0x4a, 0xb7, 0x5b, 0xbd, 0x52, 0xd0, 0x1a, 0xbd, 0x5c, 0x20, 0xe3,
+ 0x3c, 0x5a, 0x77, 0x29, 0xbd, 0x90, 0x0b, 0x00, 0xbd, 0x62, 0x10, 0x4c, 0x3d,
+ 0x40, 0x52, 0x58, 0x3c, 0x18, 0x5e, 0x46, 0x3c, 0xc6, 0x6b, 0x37, 0x3d, 0x17,
+ 0x5c, 0x90, 0x3d, 0x28, 0x6c, 0xfd, 0xbc, 0x7e, 0x4b, 0x28, 0xbd, 0x86, 0x7b,
+ 0x1d, 0xbd, 0x2b, 0x78, 0x83, 0x3d, 0x48, 0x65, 0x53, 0x3d, 0x91, 0x41, 0x7b,
+ 0xbd, 0x0a, 0x32, 0x65, 0xbd, 0x80, 0xb5, 0x83, 0xbd, 0x93, 0x10, 0x8b, 0x3d,
+ 0x40, 0xc2, 0x9b, 0x3a, 0xe8, 0xe9, 0xcc, 0x3c, 0xb8, 0xf5, 0x00, 0x3d, 0x2a,
+ 0x60, 0x70, 0x3d, 0xbb, 0xa9, 0x18, 0xbd, 0xbf, 0xca, 0x76, 0xbd, 0xf4, 0x83,
+ 0xda, 0xbc, 0xcc, 0x89, 0xeb, 0x3c, 0xa0, 0x01, 0x27, 0xbb, 0x90, 0x98, 0x1e,
+ 0x3d, 0x2d, 0x7a, 0x91, 0xbd, 0x00, 0x8e, 0x71, 0xbd, 0xc7, 0x30, 0x1a, 0xbd,
+ 0x22, 0xe9, 0x3d, 0x3d, 0x1a, 0xb3, 0x46, 0x3d, 0xbe, 0x20, 0x5a, 0x3d, 0x02,
+ 0x34, 0x0b, 0xbd, 0x8d, 0x91, 0x5c, 0xbd, 0x84, 0xeb, 0xdc, 0xbc, 0xaa, 0x4b,
+ 0xd6, 0xbc, 0xab, 0xd1, 0x91, 0x3d, 0xb8, 0x2c, 0x95, 0x3c, 0x0c, 0xf7, 0x59,
+ 0x3d, 0xc9, 0xea, 0x8e, 0xbd, 0x23, 0xb1, 0x83, 0xbd, 0x27, 0x20, 0x85, 0xbd,
+ 0x40, 0xdb, 0xaa, 0x3a, 0x4c, 0x7b, 0x48, 0xbc, 0x00, 0x62, 0x9d, 0x3b, 0xaf,
+ 0xeb, 0x83, 0x3d, 0xe0, 0x4e, 0x1d, 0x3b, 0x90, 0xf9, 0xdc, 0xbc, 0xd6, 0x49,
+ 0x60, 0x3d, 0x4e, 0x96, 0x66, 0x3d, 0xbe, 0x9e, 0x9b, 0xbc, 0xec, 0x9e, 0xff,
+ 0x3c, 0xd0, 0xa1, 0x0b, 0x3d, 0xb4, 0x2d, 0x39, 0x3d, 0x28, 0x62, 0x9a, 0x3c,
+ 0xce, 0xdc, 0x67, 0x3d, 0xe8, 0xb6, 0x68, 0x3c, 0xb6, 0x37, 0x87, 0xbd, 0xee,
+ 0xd3, 0x67, 0x3d, 0x18, 0xfb, 0x31, 0x3c, 0x27, 0x89, 0x26, 0xbd, 0x30, 0x9e,
+ 0xc0, 0x3c, 0xd0, 0x5b, 0x30, 0xbd, 0x90, 0x96, 0x33, 0x3c, 0x1e, 0xf8, 0x20,
+ 0xbd, 0x48, 0xa2, 0xa2, 0x3c, 0x2e, 0x6b, 0x3f, 0xbd, 0x32, 0x37, 0x1e, 0x3d,
+ 0x10, 0x9e, 0x26, 0xbd, 0x1c, 0xd5, 0x60, 0xbd, 0xf5, 0x5f, 0x06, 0xbd, 0x87,
+ 0xff, 0x71, 0xbd, 0x1d, 0xba, 0x8c, 0xbd, 0x00, 0xe0, 0x8c, 0xba, 0x20, 0x94,
+ 0x0d, 0xbc, 0x5a, 0x15, 0x84, 0xbc, 0x36, 0x58, 0x50, 0x3d, 0x7a, 0x21, 0x5c,
+ 0x3d, 0x78, 0x57, 0x39, 0xbd, 0x8d, 0x3b, 0x59, 0xbd, 0x90, 0x90, 0x80, 0xbb,
+ 0xf0, 0x93, 0xbe, 0x3b, 0x50, 0x34, 0xe1, 0xbb, 0xc0, 0xac, 0xd3, 0xba, 0x42,
+ 0x75, 0xb4, 0xbc, 0x38, 0xaa, 0x30, 0xbd, 0xa6, 0x79, 0x49, 0x3d, 0xfc, 0xd2,
+ 0x37, 0xbc, 0xe0, 0x0d, 0xd6, 0xbb, 0xc1, 0x2d, 0x73, 0xbd, 0x4a, 0xf1, 0x5b,
+ 0xbd, 0xd4, 0x0c, 0x82, 0x3c, 0xce, 0x51, 0x0c, 0xbd, 0xe0, 0x9c, 0x4e, 0xbd,
+ 0x3e, 0x98, 0x6a, 0x3d, 0x7e, 0xbf, 0x27, 0x3d, 0x00, 0xb2, 0x6f, 0xbd, 0x0c,
+ 0xcd, 0x4d, 0x3d, 0xfa, 0x7b, 0x22, 0x3d, 0x18, 0x3f, 0x02, 0xbc, 0xa4, 0x1a,
+ 0xb7, 0xbc, 0xe2, 0xf5, 0x45, 0x3d, 0xf0, 0x66, 0xe6, 0xbb, 0xd2, 0x56, 0x54,
+ 0x3d, 0x72, 0xff, 0x64, 0x3d, 0x68, 0xbf, 0x41, 0x3d, 0x8c, 0xa8, 0x39, 0xbd,
+ 0x4b, 0x80, 0x88, 0x3d, 0x40, 0x05, 0x8f, 0x3c, 0x9a, 0x58, 0x6b, 0xbd, 0xb6,
+ 0xc7, 0x58, 0xbd, 0x66, 0x73, 0x12, 0x3d, 0x9c, 0x2b, 0x50, 0xbd, 0xc8, 0x47,
+ 0x7d, 0xbc, 0xb7, 0x6a, 0x04, 0xbd, 0xe6, 0x6a, 0x23, 0x3d, 0xdb, 0x11, 0x1f,
+ 0xbd, 0x60, 0x1d, 0x5e, 0xbc, 0x80, 0x70, 0x72, 0xbd, 0x08, 0xed, 0x51, 0x3c,
+ 0xb8, 0x35, 0x0c, 0xbc, 0x2e, 0xef, 0x47, 0x3d, 0xd0, 0xfb, 0xdf, 0x3b, 0xee,
+ 0xea, 0x5c, 0x3d, 0x52, 0xa6, 0x7f, 0x3d, 0x1c, 0xd4, 0x92, 0x3c, 0x0c, 0xe1,
+ 0xe3, 0x3c, 0x0b, 0x0e, 0x8b, 0x3d, 0x1e, 0x6f, 0x20, 0x3d, 0xee, 0xf3, 0x45,
+ 0xbd, 0x28, 0xef, 0xfc, 0x3c, 0x48, 0x19, 0x8c, 0xbd, 0x02, 0x87, 0x7f, 0xbd,
+ 0x6c, 0xc1, 0x4b, 0x3d, 0x30, 0x88, 0x72, 0xbc, 0x00, 0xb2, 0xce, 0x39, 0x68,
+ 0x2f, 0xf1, 0xbc, 0x00, 0xa0, 0x3b, 0xb8, 0x0c, 0x90, 0x7b, 0xbd, 0xd0, 0x97,
+ 0x45, 0xbd, 0xf6, 0xf5, 0x5d, 0x3d, 0x50, 0x0b, 0x0e, 0x3c, 0x48, 0x51, 0xf9,
+ 0x3c, 0xb7, 0xe4, 0x4d, 0xbd, 0xca, 0x8d, 0xcf, 0xbc, 0x49, 0x0d, 0x88, 0xbd,
+ 0xb1, 0x3c, 0x8f, 0x3d, 0xef, 0x72, 0x8a, 0x3d, 0x90, 0x23, 0x02, 0x3d, 0xe8,
+ 0x60, 0x05, 0x3c, 0xc0, 0x9f, 0xb6, 0xba, 0xd5, 0x57, 0x03, 0xbd, 0x22, 0xae,
+ 0x66, 0x3d, 0x61, 0x03, 0x8b, 0xbd, 0xcc, 0x23, 0xea, 0xbc, 0x80, 0x58, 0x4f,
+ 0x3c, 0x60, 0xea, 0xd0, 0x3b, 0xae, 0x19, 0x2e, 0xbd, 0x5e, 0xee, 0xb5, 0xbc,
+ 0x50, 0x19, 0x18, 0x3c, 0x6d, 0xd7, 0x78, 0xbd, 0x40, 0xcb, 0xe9, 0xbc, 0xea,
+ 0x76, 0x53, 0xbd, 0x2c, 0x0e, 0x6b, 0xbc, 0xd8, 0xd6, 0x6a, 0x3c, 0xe0, 0x3d,
+ 0x80, 0xbd, 0x80, 0x36, 0xf1, 0xba, 0x30, 0x30, 0x51, 0x3c, 0x40, 0x41, 0xa3,
+ 0xba, 0xc8, 0xe8, 0x80, 0xbd, 0x72, 0x33, 0x67, 0x3d, 0xdd, 0x7d, 0x0c, 0xbd,
+ 0x1c, 0xcf, 0xbe, 0x3c, 0x8c, 0x1d, 0x8f, 0xbd, 0x4c, 0x5a, 0x3a, 0x3d, 0xa0,
+ 0x35, 0xff, 0x3b, 0x50, 0xb8, 0xea, 0xbb, 0x58, 0x63, 0x26, 0xbc, 0x70, 0x33,
+ 0x0c, 0xbc, 0x58, 0xbb, 0x09, 0xbc, 0x1a, 0xd0, 0xf6, 0xbc, 0x02, 0xb0, 0x08,
+ 0x3d, 0x4c, 0x72, 0xa7, 0x3c, 0x10, 0xa0, 0xa7, 0x3b, 0x7c, 0xab, 0x3f, 0x3d,
+ 0x12, 0x95, 0xc6, 0xbc, 0x58, 0xe5, 0xac, 0xbc, 0x80, 0xbc, 0x56, 0x3b, 0x00,
+ 0xd2, 0xda, 0xbb, 0x26, 0xff, 0xaa, 0xbc, 0xf2, 0xdc, 0x71, 0x3d, 0x30, 0xaf,
+ 0x85, 0xbb, 0x88, 0xf9, 0x14, 0x3d, 0x50, 0x89, 0xc5, 0xbb, 0xc0, 0xd0, 0xf1,
+ 0x3b, 0x95, 0xf2, 0x7b, 0xbd, 0x66, 0x43, 0xfa, 0xbc, 0xa0, 0x68, 0xf3, 0xbb,
+ 0x60, 0xa0, 0xdc, 0x3c, 0x0e, 0x67, 0x6e, 0x3d, 0xdd, 0xec, 0x8a, 0xbd, 0xca,
+ 0x1e, 0x8f, 0xbd, 0x64, 0x84, 0x6c, 0xbd, 0xee, 0x7b, 0x7a, 0xbd, 0xd2, 0xdc,
+ 0x97, 0xbc, 0x84, 0x44, 0x77, 0xbd, 0xf8, 0xec, 0x0e, 0xbd, 0xea, 0x25, 0x03,
+ 0x3d, 0x8e, 0x42, 0x27, 0xbd, 0x31, 0x0b, 0x87, 0x3d, 0xba, 0x5e, 0x31, 0xbd,
+ 0x74, 0xee, 0xa5, 0x3c, 0xb5, 0xa1, 0x83, 0x3d, 0x48, 0x87, 0xad, 0x3c, 0x5c,
+ 0xc4, 0x04, 0xbd, 0xe6, 0xe7, 0x4e, 0x3d, 0x24, 0xa4, 0xb2, 0xbc, 0x02, 0x4a,
+ 0x8d, 0xbd, 0xfa, 0x96, 0x92, 0xbd, 0xf8, 0x1e, 0xaf, 0x3c, 0x80, 0xdb, 0xfe,
+ 0x3a, 0x20, 0x48, 0xff, 0xbb, 0xf2, 0xdd, 0x63, 0x3d, 0x2c, 0x12, 0xaf, 0x3c,
+ 0x8a, 0x05, 0xcf, 0xbc, 0xd8, 0x3a, 0x23, 0x3d, 0x2b, 0x32, 0x89, 0xbd, 0xd0,
+ 0xff, 0x8b, 0x3b, 0x58, 0xd1, 0x13, 0xbd, 0x00, 0xac, 0x96, 0x3a, 0x8a, 0x92,
+ 0x33, 0x3d, 0x1c, 0xdb, 0x2f, 0xbc, 0x8a, 0x30, 0x69, 0xbd, 0x80, 0xcc, 0x7a,
+ 0x3b, 0x88, 0xaa, 0x7b, 0xbd, 0x03, 0xda, 0x8e, 0xbd, 0x10, 0x40, 0xfe, 0x3b,
+ 0x74, 0x92, 0x0b, 0x3d, 0x54, 0x61, 0x7e, 0xbd, 0xdd, 0x2f, 0x75, 0xbd, 0xa8,
+ 0xcd, 0x52, 0x3c, 0x20, 0xf1, 0x57, 0x3d, 0x98, 0x18, 0x05, 0xbc, 0x86, 0x14,
+ 0x3a, 0x3d, 0xf0, 0xa5, 0x94, 0x3b, 0x13, 0xd7, 0x8b, 0x3d, 0xbe, 0x38, 0x1e,
+ 0x3d, 0xe6, 0xa2, 0x8d, 0xbc, 0xc0, 0x39, 0xdf, 0x3c, 0xf8, 0x3f, 0x8b, 0xbd,
+ 0xc9, 0x86, 0x8a, 0x3d, 0x51, 0xa4, 0x6d, 0xbd, 0x7b, 0xe0, 0x82, 0x3d, 0x50,
+ 0x6e, 0x6d, 0x3c, 0xd0, 0x15, 0x60, 0xbd, 0x46, 0xec, 0x06, 0xbd, 0x50, 0x8b,
+ 0x0f, 0x3d, 0x8e, 0x36, 0xab, 0xbc, 0x7f, 0x46, 0x74, 0xbd, 0x4e, 0x2b, 0x63,
+ 0xbd, 0x6e, 0xdf, 0x2c, 0x3d, 0xee, 0x87, 0x60, 0x3d, 0x4e, 0x24, 0x6e, 0xbd,
+ 0x06, 0xbf, 0x7d, 0x3d, 0x40, 0xf6, 0x25, 0x3c, 0xba, 0xea, 0x01, 0x3d, 0x29,
+ 0x4f, 0x8c, 0xbd, 0xf3, 0x02, 0x8b, 0xbd, 0x7c, 0x06, 0x30, 0xbd, 0xda, 0x97,
+ 0x1e, 0x3d, 0xad, 0x89, 0x8b, 0xbd, 0x90, 0x78, 0xd1, 0x3b, 0x2c, 0x75, 0xb5,
+ 0x3c, 0x41, 0x04, 0x40, 0xbd, 0x52, 0x9d, 0x08, 0x3d, 0xf4, 0x53, 0xbf, 0x3c,
+ 0x48, 0x82, 0x16, 0x3c, 0x3a, 0xa1, 0x72, 0x3d, 0xc8, 0x73, 0x32, 0x3d, 0x5a,
+ 0x20, 0x20, 0x3d, 0x08, 0xb1, 0x48, 0x3d, 0x46, 0x6e, 0x73, 0x3d, 0x59, 0x17,
+ 0x0f, 0xbd, 0xb8, 0xa7, 0x01, 0x3c, 0x10, 0x53, 0x46, 0x3c, 0x27, 0xc2, 0x3f,
+ 0xbd, 0x77, 0x6b, 0x91, 0x3d, 0xa8, 0x1c, 0xec, 0x3c, 0xfd, 0x09, 0x92, 0xbd,
+ 0x1c, 0x87, 0x89, 0xbd, 0x60, 0x10, 0xdc, 0xbb, 0x00, 0x40, 0xd1, 0x36, 0x48,
+ 0xb3, 0x28, 0x3c, 0xc8, 0xb3, 0x94, 0x3c, 0xfa, 0x6c, 0x8e, 0xbc, 0x98, 0x5b,
+ 0x68, 0xbc, 0x32, 0xc1, 0x3b, 0x3d, 0xb7, 0xd5, 0x81, 0x3d, 0x48, 0xb6, 0x10,
+ 0x3d, 0x5c, 0x95, 0x58, 0xbd, 0xf6, 0xb9, 0x00, 0xbd, 0xaa, 0xbe, 0x51, 0xbd,
+ 0x2e, 0xbc, 0x70, 0x3d, 0xc8, 0x89, 0x06, 0x3c, 0x00, 0x00, 0x41, 0xb9, 0x31,
+ 0x3e, 0x10, 0xbd, 0xf0, 0x26, 0x14, 0xbc, 0x98, 0xfc, 0xf2, 0x3c, 0xf3, 0x6d,
+ 0x27, 0xbd, 0xd0, 0xdd, 0x2e, 0xbc, 0xee, 0x5b, 0x92, 0xbd, 0xc6, 0x4c, 0x24,
+ 0x3d, 0x3c, 0x5e, 0x01, 0x3d, 0x6a, 0xe6, 0x26, 0xbd, 0x90, 0xd6, 0x1f, 0x3c,
+ 0xbc, 0x88, 0xcd, 0x3c, 0xb0, 0xad, 0xee, 0x3c, 0xd4, 0xc5, 0xdf, 0x3c, 0xa6,
+ 0x0f, 0xe7, 0xbc, 0x51, 0x99, 0x84, 0x3d, 0xc4, 0x84, 0x6a, 0xbc, 0xa8, 0xb6,
+ 0x5c, 0xbc, 0x00, 0xba, 0x3a, 0x39, 0x28, 0x4f, 0x59, 0x3d, 0x80, 0x55, 0x45,
+ 0xba, 0x48, 0x20, 0x84, 0xbc, 0x3f, 0xfd, 0x90, 0x3d, 0x74, 0x17, 0x82, 0xbd,
+ 0x93, 0xd5, 0x26, 0xbd, 0xc0, 0x02, 0xbf, 0xbc, 0x42, 0xdf, 0x24, 0x3d, 0x0e,
+ 0xac, 0xd5, 0xbc, 0x42, 0xcc, 0x7a, 0xbd, 0xd0, 0x21, 0xf6, 0x3b, 0x88, 0x2e,
+ 0x63, 0xbd, 0x08, 0xdd, 0xc4, 0xbc, 0x08, 0xa7, 0x6b, 0x3c, 0x17, 0x07, 0x83,
+ 0xbd, 0x31, 0xfd, 0x81, 0x3d, 0x68, 0xb0, 0x3f, 0x3c, 0xec, 0x78, 0xc0, 0xbc,
+ 0x40, 0x91, 0x3b, 0x3c, 0x80, 0x96, 0xbf, 0x3a, 0x94, 0xed, 0xa7, 0x3c, 0xb0,
+ 0xf7, 0x2a, 0x3c, 0x00, 0x90, 0xc6, 0x37, 0xb4, 0x0d, 0x89, 0xbd, 0xd0, 0x28,
+ 0xb0, 0xbb, 0xf0, 0x65, 0x06, 0x3c, 0xcd, 0xc8, 0x8d, 0x3d, 0x66, 0xa5, 0x6f,
+ 0x3d, 0x36, 0x46, 0x4c, 0x3d, 0x00, 0x80, 0x67, 0x36, 0xaf, 0x78, 0x20, 0xbd,
+ 0xce, 0x83, 0x08, 0x3d, 0x7f, 0x32, 0x84, 0xbd, 0x23, 0x80, 0x8e, 0x3d, 0xb4,
+ 0xa5, 0x56, 0x3d, 0xe4, 0xc2, 0x10, 0xbd, 0xc0, 0xf4, 0xe9, 0xba, 0xa6, 0x4e,
+ 0x6d, 0x3d, 0x04, 0x19, 0xad, 0xbc, 0x0c, 0xf2, 0x38, 0x3d, 0xc6, 0x2c, 0x29,
+ 0xbd, 0xba, 0x51, 0x5c, 0x3d, 0x20, 0x92, 0xae, 0x3c, 0x68, 0x55, 0xf7, 0x3c,
+ 0x40, 0x10, 0x08, 0x3d, 0x86, 0x95, 0x62, 0x3d, 0x36, 0xef, 0x80, 0xbd, 0xd8,
+ 0x21, 0x37, 0xbd, 0x28, 0x37, 0x93, 0xbc, 0x20, 0xb5, 0x35, 0x3b, 0x2f, 0x41,
+ 0x86, 0xbd, 0xf0, 0xf4, 0xfd, 0xbc, 0x3e, 0xa1, 0x8a, 0xbd, 0x38, 0xf3, 0x8f,
+ 0xbd, 0x15, 0xd9, 0x6e, 0xbd, 0xb8, 0xd9, 0x4b, 0x3d, 0x6e, 0x7c, 0x61, 0xbd,
+ 0x00, 0x0e, 0x4d, 0xbb, 0xf8, 0xa5, 0x58, 0xbc, 0x20, 0x15, 0xb6, 0x3b, 0xa0,
+ 0x58, 0x09, 0x3b, 0xed, 0x15, 0x72, 0xbd, 0x00, 0xc6, 0x1a, 0x3a, 0x90, 0xdf,
+ 0x44, 0x3d, 0x70, 0xb4, 0x28, 0xbd, 0x66, 0x55, 0x7d, 0xbd, 0x94, 0x94, 0x84,
+ 0x3c, 0x49, 0xde, 0x32, 0xbd, 0x32, 0x47, 0x13, 0x3d, 0x2e, 0x3b, 0x4a, 0xbd,
+ 0x8a, 0x6d, 0x53, 0xbd, 0x88, 0x9e, 0x8b, 0xbc, 0xfe, 0x9b, 0xd0, 0xbc, 0xf0,
+ 0xb2, 0x16, 0x3c, 0x8c, 0x8a, 0x85, 0x3c, 0xd5, 0x73, 0x8b, 0xbd, 0xd6, 0xd6,
+ 0x02, 0xbd, 0x70, 0x96, 0x22, 0x3d, 0x8a, 0x4b, 0x1c, 0x3d, 0x80, 0x91, 0xeb,
+ 0x3a, 0x80, 0x29, 0x95, 0x3c, 0x71, 0xf1, 0x8d, 0x3d, 0x3e, 0x5e, 0x5e, 0xbd,
+ 0xd2, 0x53, 0x63, 0x3d, 0x0b, 0xcb, 0x8d, 0xbd, 0x58, 0x76, 0x5f, 0xbc, 0xc2,
+ 0xe8, 0x02, 0x3d, 0x9c, 0x96, 0x99, 0x3c, 0xbc, 0xe8, 0x96, 0x3c, 0xff, 0x05,
+ 0x45, 0xbd, 0x48, 0xa6, 0x02, 0x3d, 0x83, 0x34, 0x87, 0xbd, 0xe4, 0x9a, 0x47,
+ 0x3d, 0xd8, 0x5f, 0xc5, 0x3c, 0x0c, 0x1c, 0xee, 0xbc, 0x3e, 0x65, 0x46, 0x3d,
+ 0xe5, 0xd2, 0x10, 0xbd, 0x00, 0x98, 0x9a, 0xbb, 0x06, 0x89, 0x8d, 0xbc, 0xb8,
+ 0x08, 0xc5, 0xbc, 0x9e, 0xeb, 0xbd, 0xbc, 0x98, 0x4b, 0x78, 0xbd, 0x7d, 0x8a,
+ 0x7d, 0xbd, 0x00, 0x70, 0xf6, 0x39, 0xe0, 0x0c, 0xba, 0x3b, 0xa2, 0xf4, 0xdf,
+ 0xbc, 0xca, 0x61, 0x79, 0xbd, 0x44, 0x6f, 0xa3, 0xbc, 0x3c, 0x56, 0xe1, 0x3c,
+ 0x90, 0xfd, 0x3c, 0xbd, 0x71, 0x08, 0x35, 0xbd, 0xde, 0x28, 0x6b, 0xbd, 0xae,
+ 0xe2, 0x36, 0x3d, 0xe7, 0x04, 0x1e, 0xbd, 0x94, 0x0b, 0x1a, 0x3d, 0x3a, 0x8f,
+ 0x26, 0x3d, 0x40, 0xbe, 0x07, 0xbc, 0x10, 0x36, 0x8d, 0xbd, 0x40, 0x7b, 0x06,
+ 0x3b, 0xd8, 0x7b, 0x2c, 0x3d, 0x4f, 0x09, 0x59, 0xbd, 0x28, 0xc9, 0xeb, 0x3c,
+ 0x1c, 0xee, 0x7c, 0xbc, 0xf0, 0x79, 0x19, 0x3c, 0xf8, 0x06, 0x72, 0x3c, 0xe0,
+ 0x83, 0xb5, 0x3b, 0xc8, 0xca, 0x47, 0x3c, 0x88, 0x99, 0x0c, 0x3d, 0xe6, 0x5f,
+ 0xaf, 0xbc, 0x14, 0x1b, 0x4f, 0xbc, 0x13, 0x70, 0x80, 0xbd, 0xdd, 0x13, 0x18,
+ 0xbd, 0x4e, 0xae, 0xe3, 0xbc, 0xaa, 0x98, 0x7d, 0x3d, 0x00, 0xf9, 0x2f, 0x3c,
+ 0xdd, 0xd1, 0x8c, 0x3d, 0x28, 0x5c, 0x3c, 0x3d, 0x90, 0x81, 0x38, 0x3d, 0x3a,
+ 0xf4, 0x5d, 0x3d, 0xc2, 0x24, 0x53, 0x3d, 0x00, 0x34, 0x42, 0xbb, 0x32, 0xc8,
+ 0x78, 0x3d, 0x7a, 0x94, 0xe6, 0xbc, 0x76, 0x8f, 0x80, 0xbc, 0x83, 0xca, 0x8b,
+ 0x3d, 0x62, 0xfb, 0x78, 0x3d, 0xe9, 0x00, 0x90, 0x3d, 0xe8, 0x9b, 0x1c, 0xbd,
+ 0x66, 0xd9, 0x8d, 0xbd, 0xa2, 0xe7, 0x73, 0x3d, 0xd8, 0xb6, 0xb9, 0xbc, 0xa0,
+ 0x55, 0x70, 0x3b, 0x08, 0x5b, 0x00, 0x3c, 0xb4, 0xd0, 0x58, 0xbd, 0xe4, 0x3b,
+ 0x52, 0xbd, 0xb0, 0x22, 0x3d, 0x3d, 0x4a, 0x4f, 0x81, 0xbd, 0x48, 0xf0, 0x6a,
+ 0x3c, 0x61, 0xf4, 0x65, 0xbd, 0x34, 0x4e, 0x00, 0x3d, 0xd1, 0x71, 0x3c, 0xbd,
+ 0x8e, 0x3e, 0x70, 0x3d, 0x55, 0x7a, 0x27, 0xbd, 0x68, 0x22, 0xd5, 0xbc, 0x59,
+ 0x71, 0x90, 0xbd, 0xc8, 0xb0, 0x60, 0x3c, 0x74, 0x5b, 0x36, 0xbd, 0xdc, 0x16,
+ 0xbf, 0x3c, 0x62, 0x7a, 0xe3, 0xbc, 0x00, 0x21, 0x8e, 0xba, 0x1e, 0x0d, 0x08,
+ 0xbd, 0xa3, 0x7a, 0x07, 0xbd, 0xb4, 0x92, 0xee, 0x3c, 0x8d, 0xd2, 0x81, 0x3d,
+ 0x40, 0xc6, 0x98, 0x3c, 0x78, 0xc1, 0x69, 0x3c, 0x36, 0x9a, 0x72, 0x3d, 0xd2,
+ 0xfa, 0xe3, 0xbc, 0x42, 0x4c, 0x0e, 0x3d, 0x97, 0x2c, 0x88, 0x3d, 0x78, 0x6f,
+ 0x13, 0xbc, 0x40, 0x90, 0x7a, 0x3b, 0x66, 0x40, 0x95, 0xbc, 0xb8, 0xe6, 0x33,
+ 0x3d, 0x64, 0x0c, 0xf1, 0x3c, 0xb3, 0xc0, 0x1f, 0xbd, 0x67, 0x03, 0x03, 0xbd,
+ 0xe4, 0x7c, 0xfb, 0x3c, 0x7e, 0x22, 0x0e, 0xbd, 0xd6, 0x60, 0x8d, 0xbd, 0xcc,
+ 0xa2, 0x2c, 0xbd, 0x00, 0xa4, 0xd6, 0x39, 0xf8, 0x7d, 0x8d, 0xbd, 0xe4, 0x27,
+ 0x9a, 0xbc, 0xd8, 0x19, 0x61, 0xbd, 0xb8, 0x49, 0x54, 0xbd, 0x70, 0xcb, 0xd3,
+ 0x3b, 0x49, 0xe1, 0x89, 0x3d, 0x06, 0x6c, 0x78, 0x3d, 0xc0, 0xbe, 0x82, 0x3c,
+ 0x4d, 0x99, 0x8f, 0x3d, 0xd8, 0x0d, 0xe6, 0x3c, 0x4e, 0x2d, 0x60, 0x3d, 0x1c,
+ 0xab, 0x99, 0x3c, 0x66, 0xc6, 0xcc, 0xbc, 0x28, 0x76, 0x0b, 0xbc, 0x7b, 0x6e,
+ 0x90, 0x3d, 0x3b, 0x2f, 0x1c, 0xbd, 0x60, 0x1e, 0x83, 0x3b, 0xc8, 0x88, 0xfd,
+ 0x3c, 0x00, 0x48, 0xa8, 0x3c, 0x40, 0x3d, 0xd4, 0x3b, 0xa4, 0x83, 0xfc, 0x3c,
+ 0x3c, 0xe7, 0xd8, 0x3c, 0xfe, 0xaa, 0x6f, 0x3d, 0xbb, 0x22, 0x90, 0xbd, 0xd6,
+ 0xf5, 0x29, 0x3d, 0x8e, 0x7e, 0x65, 0x3d, 0xae, 0x3b, 0xe4, 0xbc, 0xea, 0x04,
+ 0x54, 0x3d, 0x64, 0x22, 0x1f, 0x3d, 0x24, 0x95, 0x90, 0x3c, 0xcd, 0x7b, 0x21,
+ 0xbd, 0xd0, 0xf8, 0xb9, 0x3b, 0x26, 0xf8, 0x28, 0xbd, 0x6a, 0x37, 0x5b, 0x3d,
+ 0x6e, 0x7e, 0x70, 0x3d, 0xa0, 0x90, 0xec, 0x3c, 0x00, 0x8e, 0x0d, 0xbb, 0xe0,
+ 0xbe, 0x5b, 0xbb, 0x58, 0xf6, 0x9c, 0x3c, 0xbe, 0x59, 0xc0, 0xbc, 0x64, 0x78,
+ 0xa4, 0x3c, 0x79, 0xfb, 0x86, 0x3d, 0x60, 0x6c, 0x85, 0xbc, 0xba, 0x44, 0x18,
+ 0xbd, 0x5e, 0xea, 0x6a, 0xbd, 0x6c, 0xf4, 0x36, 0xbd, 0xee, 0xd4, 0x4c, 0xbd,
+ 0xa2, 0x17, 0x16, 0x3d, 0x98, 0x59, 0xb9, 0x3c, 0x90, 0x41, 0x3d, 0x3c, 0x66,
+ 0x14, 0x06, 0x3d, 0x40, 0xa2, 0x17, 0xbb, 0xdd, 0x83, 0x75, 0xbd, 0x2c, 0x19,
+ 0x8f, 0x3c, 0xfe, 0xde, 0x49, 0xbd, 0x57, 0x3d, 0x85, 0x3d, 0x1c, 0xb3, 0xef,
+ 0xbc, 0x58, 0xdb, 0x3f, 0xbd, 0x0e, 0x38, 0x20, 0x3d, 0x80, 0xbf, 0xa7, 0x3a,
+ 0xf0, 0xe2, 0x91, 0xbd, 0xcc, 0x0f, 0x0a, 0x3d, 0xc7, 0xad, 0x4d, 0xbd, 0x64,
+ 0x33, 0x69, 0xbd, 0xc0, 0xc0, 0xd7, 0xbb, 0xb0, 0x16, 0x83, 0xbd, 0xd0, 0xbf,
+ 0x3c, 0x3d, 0x11, 0x62, 0x87, 0x3d, 0x68, 0x04, 0x0f, 0x3d, 0x6e, 0xee, 0x2a,
+ 0x3d, 0xb8, 0x70, 0x37, 0xbc, 0x62, 0x76, 0x7e, 0x3d, 0x84, 0xbc, 0xa0, 0x3c,
+ 0xc0, 0xc9, 0x26, 0xbd, 0x82, 0x1a, 0x85, 0xbd, 0x80, 0x55, 0x8e, 0xbd, 0xe4,
+ 0xdb, 0x48, 0x3d, 0x60, 0xa5, 0xd6, 0x3b, 0x39, 0x18, 0x92, 0x3d, 0x36, 0x5a,
+ 0x6c, 0xbd, 0xe8, 0x77, 0xcb, 0x3c, 0x48, 0x9e, 0x12, 0x3d, 0x3b, 0x40, 0x91,
+ 0xbd, 0x00, 0xe0, 0xf6, 0x38, 0xd6, 0xa0, 0x2f, 0xbd, 0xe0, 0xe2, 0x0f, 0xbc,
+ 0xf4, 0x85, 0x50, 0x3d, 0x64, 0xf7, 0x9b, 0x3c, 0xdc, 0x72, 0x53, 0x3d, 0x28,
+ 0x0b, 0x45, 0xbc, 0x4e, 0xb5, 0x3f, 0xbd, 0x34, 0x7a, 0xea, 0x3c, 0x58, 0xe1,
+ 0x71, 0x3c, 0x60, 0x5b, 0xf8, 0xbc, 0xf8, 0x3d, 0x52, 0x3c, 0xd0, 0xdc, 0x67,
+ 0xbd, 0xee, 0x2d, 0x0c, 0x3d, 0x70, 0x47, 0xb0, 0x3c, 0x70, 0x7c, 0x29, 0x3d,
+ 0xf4, 0x97, 0xc9, 0x3c, 0x74, 0x63, 0x32, 0x3d, 0x6c, 0x17, 0x94, 0x3c, 0x87,
+ 0xdc, 0x7a, 0xbd, 0xb6, 0xf5, 0x7c, 0x3d, 0x62, 0xd2, 0xe7, 0xbc, 0x99, 0xa5,
+ 0x50, 0xbd, 0x4c, 0xa2, 0xb1, 0xbc, 0xf0, 0x38, 0xdd, 0xbb, 0xac, 0x44, 0x3f,
+ 0xbd, 0x34, 0xb7, 0x06, 0x3d, 0xf6, 0x65, 0x25, 0x3d, 0xdb, 0x01, 0x1e, 0xbd,
+ 0x68, 0xee, 0x19, 0xbc, 0x4c, 0xdd, 0x8a, 0x3c, 0xe0, 0xe4, 0x14, 0xbc, 0x9e,
+ 0x6f, 0x21, 0x3d, 0x18, 0xd1, 0x59, 0x3d, 0x0c, 0xdd, 0xe1, 0xbc, 0x84, 0xa1,
+ 0xe6, 0x3c, 0x5c, 0x56, 0xfa, 0x3c, 0xc4, 0x30, 0x8d, 0x3c, 0x9c, 0xba, 0x12,
+ 0xbd, 0xe0, 0x85, 0xbf, 0xbc, 0x00, 0x1d, 0x62, 0xbb, 0xe4, 0x7a, 0x13, 0x3d,
+ 0x36, 0x6c, 0x07, 0x3d, 0x88, 0xb1, 0x2a, 0x3c, 0x06, 0xba, 0x16, 0xbd, 0x24,
+ 0x12, 0xaf, 0x3c, 0x7c, 0x97, 0x3b, 0xbc, 0xe4, 0x3d, 0x2e, 0xbd, 0x8c, 0x86,
+ 0xa9, 0xbc, 0x6c, 0x70, 0x06, 0x3d, 0x0b, 0x2c, 0x76, 0xbd, 0x72, 0x24, 0xe8,
+ 0xbc, 0x22, 0xeb, 0x70, 0x3d, 0xf0, 0xfb, 0x7b, 0x3c, 0x62, 0x51, 0x08, 0xbd,
+ 0x52, 0x97, 0x88, 0xbd, 0x58, 0x8d, 0x76, 0x3c, 0x3c, 0x79, 0xf1, 0x3c, 0x6c,
+ 0x9b, 0xbd, 0xbc, 0xa4, 0xf4, 0xe9, 0x3c, 0x80, 0x4d, 0x22, 0x3a, 0x78, 0x12,
+ 0x81, 0x3c, 0x9a, 0xc5, 0x4a, 0x3d, 0xfa, 0x9b, 0x4a, 0x3d, 0x0c, 0x20, 0x7f,
+ 0xbd, 0x36, 0x46, 0x06, 0xbd, 0x60, 0x13, 0xbd, 0xbb, 0x8e, 0x08, 0x92, 0xbc,
+ 0xca, 0x25, 0x1c, 0x3d, 0xb2, 0x84, 0x3f, 0x3d, 0x98, 0x3f, 0x47, 0x3d, 0x58,
+ 0x18, 0x4b, 0x3d, 0x60, 0x91, 0x63, 0xbb, 0xa2, 0x5c, 0xea, 0xbc, 0xc4, 0x8e,
+ 0x86, 0x3c, 0x5c, 0x76, 0x91, 0xbd, 0x10, 0xa2, 0x1d, 0xbc, 0xe0, 0xcb, 0xb5,
+ 0xbb, 0x50, 0xd2, 0xe2, 0x3c, 0x98, 0xbd, 0x88, 0xbd, 0x00, 0xd8, 0x0f, 0x39,
+ 0x72, 0x33, 0x20, 0x3d, 0x00, 0x13, 0xbd, 0x39, 0xae, 0xc3, 0xd1, 0xbc, 0xec,
+ 0x7e, 0xb8, 0xbc, 0x78, 0xb4, 0x90, 0xbc, 0xc2, 0x01, 0x68, 0x3d, 0x40, 0x0a,
+ 0x4f, 0xbb, 0xb7, 0xe6, 0x87, 0x3d, 0x35, 0xe8, 0x85, 0x3d, 0x94, 0x2a, 0xe6,
+ 0x3c, 0xd8, 0x5c, 0x69, 0x3c, 0x20, 0x8e, 0xc2, 0xbb, 0x4c, 0xa2, 0x92, 0x3c,
+ 0xd6, 0xc7, 0x73, 0x3d, 0xf8, 0x0c, 0xb8, 0x3c, 0x40, 0x90, 0xb9, 0x3a, 0x2e,
+ 0x2b, 0x31, 0x3d, 0x18, 0xf5, 0x8a, 0x3c, 0x91, 0x95, 0x5b, 0xbd, 0xc0, 0xfa,
+ 0xc8, 0x3a, 0x72, 0xf1, 0xa9, 0xbc, 0x36, 0x77, 0x48, 0xbd, 0x73, 0x0d, 0x6c,
+ 0xbd, 0x70, 0x22, 0xe4, 0xbb, 0x88, 0x5c, 0x28, 0x3d, 0xc6, 0x18, 0x3e, 0x3d,
+ 0x94, 0x3c, 0xd1, 0xbc, 0x7f, 0x43, 0x15, 0xbd, 0xee, 0x0d, 0x9e, 0xbc, 0x62,
+ 0xff, 0x29, 0x3d, 0xf0, 0x56, 0xf2, 0x3b, 0x22, 0x3f, 0x4e, 0x3d, 0xb6, 0x94,
+ 0x39, 0xbd, 0x9e, 0xf1, 0x45, 0xbd, 0x87, 0xdb, 0x85, 0x3d, 0xd8, 0x35, 0x65,
+ 0x3c, 0xcc, 0x13, 0x8a, 0x3c, 0x44, 0x89, 0x64, 0xbc, 0xe6, 0xb5, 0x2a, 0xbd,
+ 0x28, 0x4f, 0x69, 0x3c, 0x36, 0x45, 0x53, 0x3d, 0x3a, 0xd2, 0xfe, 0xbc, 0xce,
+ 0xa8, 0xa2, 0xbc, 0x8a, 0x16, 0x7d, 0xbd, 0xc2, 0xd5, 0xd9, 0xbc, 0xa0, 0x4a,
+ 0x87, 0xbd, 0x9e, 0xc2, 0x2c, 0x3d, 0xfc, 0x3a, 0xaf, 0x3c, 0x9e, 0x10, 0x40,
+ 0xbd, 0xe0, 0x3a, 0x82, 0x3b, 0x0c, 0xe4, 0xfc, 0x3c, 0xd8, 0x07, 0x57, 0xbd,
+ 0xba, 0x34, 0x91, 0xbd, 0xc6, 0x42, 0x51, 0x3d, 0xc0, 0xe9, 0xe1, 0x3b, 0x9c,
+ 0x4a, 0x2a, 0xbc, 0xc6, 0x92, 0x7b, 0x3d, 0x12, 0x9f, 0x59, 0xbd, 0x0c, 0x62,
+ 0xfd, 0xbc, 0x6c, 0x1a, 0xe6, 0x3c, 0x72, 0x2c, 0x4b, 0x3d, 0x7a, 0xa5, 0x3b,
+ 0xbd, 0xfa, 0x37, 0x7b, 0x3d, 0xc0, 0xf0, 0x87, 0xbc, 0x28, 0xd1, 0x5a, 0x3c,
+ 0xd7, 0x35, 0x6b, 0xbd, 0x7e, 0x9c, 0x6f, 0x3d, 0x1a, 0xf6, 0x23, 0xbd, 0x66,
+ 0x3b, 0xa2, 0xbc, 0x00, 0xb5, 0x5d, 0xba, 0xbb, 0xc3, 0x52, 0xbd, 0x24, 0x0d,
+ 0x14, 0x3d, 0x6f, 0x6f, 0x7d, 0xbd, 0x74, 0x88, 0x90, 0xbd, 0xda, 0x8a, 0x68,
+ 0xbd, 0xb4, 0xe0, 0x5f, 0xbc, 0xb8, 0x32, 0x88, 0xbd, 0x13, 0xc0, 0x81, 0x3d,
+ 0x2c, 0x07, 0x2e, 0xbd, 0xd0, 0x8a, 0x8a, 0x3b, 0xe2, 0x9e, 0x8a, 0xbd, 0x60,
+ 0x09, 0x8a, 0x3b, 0xd5, 0x6b, 0x92, 0xbd, 0x90, 0x61, 0x50, 0x3d, 0x62, 0x32,
+ 0x0f, 0xbd, 0x9b, 0x7c, 0x6f, 0xbd, 0x10, 0x7c, 0xa3, 0x3c, 0x80, 0x22, 0xcc,
+ 0xbb, 0x20, 0xc6, 0x3a, 0x3d, 0x40, 0xcb, 0x3f, 0x3b, 0xca, 0xa4, 0xdd, 0xbc,
+ 0xc0, 0x36, 0xbf, 0x3c, 0x40, 0x4f, 0x85, 0x3b, 0x13, 0x52, 0x6c, 0xbd, 0x6b,
+ 0xa9, 0x6f, 0xbd, 0x58, 0x41, 0x5d, 0xbc, 0xa8, 0x0e, 0x82, 0x3c, 0x7c, 0x92,
+ 0xf5, 0x3c, 0xfa, 0xd8, 0x5a, 0xbd, 0xcc, 0x79, 0x54, 0x3d, 0xc4, 0x8f, 0x2a,
+ 0xbc, 0x78, 0xec, 0xdb, 0x3c, 0xf0, 0x95, 0xa9, 0x3b, 0x78, 0x9d, 0xf6, 0xbc,
+ 0x53, 0x59, 0x55, 0xbd, 0x08, 0x4e, 0xca, 0x3c, 0xcc, 0x95, 0xbb, 0x3c, 0xe4,
+ 0x91, 0xb4, 0xbc, 0xfb, 0x9d, 0x86, 0xbd, 0x08, 0x68, 0x3f, 0xbc, 0x5d, 0x1b,
+ 0x84, 0xbd, 0xd0, 0xc8, 0x83, 0x3b, 0x4a, 0x39, 0x54, 0x3d, 0x3c, 0x6e, 0xb6,
+ 0xbc, 0x70, 0xdd, 0x1b, 0x3c, 0xf4, 0xfc, 0x21, 0xbd, 0x68, 0x25, 0x5e, 0x3c,
+ 0x01, 0xfc, 0x8e, 0xbd, 0x60, 0xe5, 0x2a, 0x3b, 0x98, 0x51, 0x23, 0xbc, 0x00,
+ 0xef, 0x0a, 0xba, 0xfc, 0x95, 0x1f, 0xbc, 0xf4, 0x89, 0x55, 0x3d, 0x76, 0x2e,
+ 0x29, 0x3d, 0xdb, 0x02, 0x86, 0x3d, 0x64, 0xaa, 0x31, 0xbc, 0x7c, 0x3a, 0x9c,
+ 0xbc, 0x00, 0xf2, 0x64, 0xbd, 0x86, 0xf3, 0x51, 0xbd, 0xc0, 0x2f, 0x9a, 0x3a,
+ 0xf2, 0xf2, 0xd3, 0xbc, 0x1e, 0x43, 0xcb, 0xbc, 0x6d, 0x44, 0x92, 0x3d, 0x40,
+ 0xc6, 0x90, 0xba, 0xaa, 0xc9, 0x3e, 0xbd, 0x02, 0xc1, 0x5b, 0x3d, 0x66, 0xeb,
+ 0x1e, 0x3d, 0xf2, 0x34, 0x63, 0xbd, 0xea, 0xba, 0x66, 0x3d, 0xee, 0x8c, 0x1a,
+ 0x3d, 0x3b, 0xb9, 0x1e, 0xbd, 0x0a, 0xd2, 0x13, 0x3d, 0xa0, 0xaf, 0x3e, 0x3c,
+ 0xc0, 0x24, 0x83, 0x3c, 0x90, 0x69, 0xf0, 0xbb, 0x1f, 0x73, 0x86, 0x3d, 0x9d,
+ 0x21, 0x77, 0xbd, 0x45, 0x4f, 0x8c, 0x3d, 0x40, 0x6d, 0xfe, 0x3c, 0xcb, 0xa5,
+ 0x8d, 0xbd, 0x00, 0x8d, 0xe5, 0x39, 0x56, 0x9b, 0x55, 0x3d, 0x26, 0x49, 0x5a,
+ 0xbd, 0x66, 0x93, 0x7a, 0x3d, 0x80, 0x29, 0x4f, 0xba, 0xff, 0xff, 0x82, 0xbd,
+ 0x50, 0xf9, 0x65, 0x3c, 0x28, 0xa6, 0xb5, 0xbc, 0xdf, 0x70, 0x54, 0xbd, 0x17,
+ 0xd1, 0x8e, 0xbd, 0x00, 0x3a, 0xb9, 0x3b, 0x26, 0x45, 0x86, 0xbc, 0xad, 0x85,
+ 0x33, 0xbd, 0x94, 0x78, 0x32, 0x3d, 0x70, 0xcb, 0xa1, 0x3b, 0x40, 0xe5, 0x21,
+ 0x3d, 0x32, 0xd5, 0xc2, 0xbc, 0xf8, 0x3d, 0x27, 0x3d, 0x28, 0xc0, 0x39, 0xbc,
+ 0xac, 0xc8, 0x7a, 0xbc, 0xe6, 0xc2, 0xd4, 0xbc, 0x91, 0x81, 0x5c, 0xbd, 0xe1,
+ 0x6a, 0x90, 0xbd, 0xa9, 0xc8, 0x1d, 0xbd, 0x00, 0x94, 0xcb, 0xb9, 0xe0, 0x0d,
+ 0x31, 0x3c, 0x00, 0x2a, 0xbe, 0xbb, 0x9a, 0x1e, 0x2a, 0xbd, 0x06, 0xef, 0x7f,
+ 0x3d, 0xc0, 0xcc, 0x0d, 0x3c, 0xd6, 0x50, 0x74, 0xbd, 0x10, 0x24, 0xcd, 0x3b,
+ 0x22, 0x4f, 0x0c, 0xbd, 0xc8, 0xf2, 0xaa, 0x3c, 0x9e, 0x84, 0xc8, 0xbc, 0x80,
+ 0xf2, 0x4e, 0x3c, 0x0c, 0x38, 0x77, 0xbd, 0x6c, 0xab, 0x63, 0xbd, 0xb7, 0x31,
+ 0x11, 0xbd, 0x25, 0x39, 0x84, 0x3d, 0x31, 0x0b, 0x91, 0x3d, 0xe3, 0x1d, 0x08,
+ 0xbd, 0x92, 0xb6, 0x1b, 0xbd, 0x65, 0xca, 0x88, 0x3d, 0x1c, 0x62, 0x2c, 0xbd,
+ 0xda, 0x7b, 0x73, 0x3d, 0xff, 0xbb, 0x85, 0xbd, 0xc4, 0xc7, 0x51, 0x3d, 0x98,
+ 0xd2, 0x6f, 0xbd, 0x70, 0xa4, 0xe9, 0x3c, 0x74, 0x65, 0xd7, 0x3c, 0x18, 0xdd,
+ 0x5e, 0x3c, 0x78, 0x1d, 0x04, 0x3d, 0x2c, 0xef, 0x43, 0xbd, 0x48, 0x7d, 0x5e,
+ 0xbd, 0xd6, 0x02, 0x9f, 0xbc, 0x80, 0x29, 0xa1, 0x3c, 0x70, 0x64, 0x54, 0x3d,
+ 0x3e, 0xe0, 0x50, 0x3d, 0xd3, 0x7d, 0x2e, 0xbd, 0x64, 0xdf, 0x55, 0xbd, 0x72,
+ 0x47, 0x8c, 0xbd, 0xfb, 0x45, 0x12, 0xbd, 0xd6, 0x49, 0x9d, 0xbc, 0xca, 0xd5,
+ 0x67, 0x3d, 0x50, 0xb9, 0xf4, 0x3c, 0x93, 0xca, 0x1f, 0xbd, 0xa7, 0xe1, 0x8f,
+ 0xbd, 0xcc, 0x00, 0x52, 0x3d, 0x07, 0xd3, 0x20, 0xbd, 0xd0, 0x26, 0x82, 0xbc,
+ 0x2a, 0x6e, 0x69, 0x3d, 0x0c, 0x67, 0x70, 0xbd, 0xaa, 0x35, 0xe9, 0xbc, 0xae,
+ 0x97, 0xba, 0xbc, 0xea, 0x69, 0x3d, 0xbd, 0x28, 0xa0, 0x6f, 0xbc, 0x2a, 0x6a,
+ 0x67, 0x3d, 0x50, 0xd0, 0x6e, 0x3c, 0x16, 0x90, 0x06, 0x3d, 0x4a, 0xdf, 0x3f,
+ 0x3d, 0xa0, 0x4e, 0x07, 0x3d, 0x48, 0x0d, 0x55, 0xbd, 0x50, 0x0b, 0xc6, 0xbc,
+ 0xc4, 0xf3, 0x47, 0xbd, 0x90, 0x09, 0xb3, 0xbb, 0x20, 0xe9, 0x7f, 0xbd, 0xbf,
+ 0x2e, 0x86, 0xbd, 0xba, 0xcf, 0x74, 0x3d, 0x86, 0xd8, 0xf6, 0xbc, 0x20, 0x65,
+ 0x57, 0x3d, 0x82, 0xc5, 0x50, 0xbd, 0xac, 0x70, 0x41, 0x3d, 0x0e, 0xb0, 0x40,
+ 0xbd, 0x4c, 0x30, 0x39, 0xbd, 0x80, 0xa0, 0xe5, 0x3c, 0x20, 0xc2, 0x86, 0xbb,
+ 0xb8, 0x3d, 0x8c, 0x3c, 0xdf, 0x7e, 0x5f, 0xbd, 0xe0, 0xfd, 0x37, 0x3b, 0x0b,
+ 0x70, 0x15, 0xbd, 0x00, 0xc1, 0x97, 0xba, 0x9a, 0x38, 0x56, 0xbd, 0x32, 0x67,
+ 0xdb, 0xbc, 0x4a, 0x22, 0x38, 0x3d, 0x12, 0x1c, 0x7f, 0x3d, 0x88, 0x38, 0xee,
+ 0x3c, 0x0a, 0x76, 0x61, 0x3d, 0x6d, 0xd7, 0x0a, 0xbd, 0xba, 0xb0, 0x3c, 0x3d,
+ 0x28, 0xbe, 0x91, 0xbc, 0xa8, 0x3e, 0x0b, 0x3c, 0x54, 0x53, 0xb7, 0x3c, 0x50,
+ 0x41, 0x57, 0x3c, 0xb4, 0x5d, 0x9b, 0x3c, 0x04, 0xb9, 0x18, 0xbd, 0xa8, 0xd5,
+ 0x9c, 0xbc, 0x7c, 0x5f, 0x15, 0xbd, 0x64, 0xf3, 0x0d, 0x3d, 0x17, 0x85, 0x90,
+ 0x3d, 0x5d, 0xf4, 0x51, 0xbd, 0x97, 0x93, 0x30, 0xbd, 0x40, 0x65, 0xe6, 0xbb,
+ 0x20, 0xa7, 0xc3, 0x3c, 0x10, 0xb1, 0x90, 0x3c, 0xc8, 0x2f, 0x36, 0x3c, 0x6b,
+ 0x38, 0x8e, 0xbd, 0xd6, 0x6c, 0x62, 0x3d, 0x94, 0x52, 0x4b, 0xbd, 0x48, 0xe5,
+ 0x15, 0x3d, 0x48, 0x7a, 0x3f, 0x3d, 0x60, 0xb0, 0xdf, 0xbb, 0xc2, 0x53, 0x05,
+ 0xbd, 0xc0, 0xaa, 0x94, 0x3a, 0xf2, 0xef, 0x68, 0xbd, 0xb0, 0x4d, 0x46, 0xbc,
+ 0xa0, 0xdc, 0x0e, 0x3b, 0x9c, 0x99, 0x5d, 0xbd, 0xd0, 0x37, 0x63, 0xbd, 0x61,
+ 0x02, 0x03, 0xbd, 0x80, 0x26, 0x51, 0x3a, 0xa0, 0xab, 0xb5, 0xbb, 0x65, 0x1e,
+ 0x8d, 0x3d, 0xa0, 0x46, 0xc6, 0x3c, 0x00, 0x48, 0xa3, 0x3c, 0x4d, 0xdf, 0x84,
+ 0x3d, 0x1c, 0xf1, 0x34, 0xbd, 0x1a, 0xb0, 0x00, 0x3d, 0x86, 0x6e, 0x5a, 0x3d,
+ 0x02, 0xfe, 0x8b, 0xbd, 0x0e, 0x96, 0x32, 0x3d, 0xe6, 0x1e, 0x91, 0xbc, 0x8a,
+ 0xe9, 0x6b, 0xbd, 0x4c, 0x53, 0x38, 0x3d, 0x39, 0xf5, 0x90, 0xbd, 0x66, 0x81,
+ 0x7e, 0x3d, 0xec, 0x33, 0xaa, 0xbc, 0x3e, 0xc4, 0x5c, 0x3d, 0xd8, 0x19, 0x87,
+ 0xbc, 0x70, 0xd6, 0x52, 0x3d, 0x00, 0x6a, 0xab, 0x3a, 0xda, 0x41, 0x81, 0xbc,
+ 0xf0, 0xbd, 0xe3, 0x3c, 0x38, 0x66, 0x1e, 0x3c, 0x62, 0x7d, 0x8e, 0xbd, 0xa5,
+ 0x2a, 0x15, 0xbd, 0xf6, 0x6a, 0x72, 0x3d, 0x72, 0x22, 0x33, 0x3d, 0x8c, 0xb7,
+ 0x8e, 0xbd, 0xe2, 0xf8, 0x6a, 0xbd, 0x01, 0x40, 0x35, 0xbd, 0xb3, 0xe4, 0x79,
+ 0xbd, 0xdc, 0xb4, 0x65, 0xbc, 0x3d, 0x74, 0x91, 0x3d, 0x94, 0x0a, 0xe8, 0x3c,
+ 0x16, 0x25, 0x57, 0xbd, 0xd6, 0x05, 0x0b, 0x3d, 0x16, 0x2b, 0x5f, 0x3d, 0x38,
+ 0x59, 0xcd, 0xbc, 0x8c, 0x9f, 0x0e, 0x3d, 0xac, 0x67, 0x9c, 0x3c, 0x00, 0xe1,
+ 0xb3, 0x39, 0x1c, 0x2e, 0xf8, 0x3c, 0xed, 0xfd, 0x80, 0x3d, 0xc6, 0x8b, 0x2b,
+ 0xbd, 0x08, 0x4d, 0xe0, 0x3c, 0xff, 0x55, 0x85, 0x3d, 0x3c, 0xd0, 0xe9, 0x3c,
+ 0x30, 0x7c, 0x79, 0x3c, 0xd0, 0xf7, 0x8c, 0x3b, 0x82, 0xe9, 0x7d, 0xbd, 0x54,
+ 0x3f, 0x46, 0x3d, 0xb8, 0x88, 0xc0, 0x3c, 0xc8, 0xf4, 0x35, 0xbc, 0xe9, 0x19,
+ 0x85, 0x3d, 0x01, 0x5f, 0x62, 0xbd, 0xea, 0x7f, 0x0f, 0x3d, 0xf8, 0x73, 0x42,
+ 0xbd, 0x41, 0x97, 0x8f, 0x3d, 0x13, 0xec, 0x80, 0x3d, 0xe7, 0xa8, 0x40, 0xbd,
+ 0x08, 0x47, 0x4b, 0x3c, 0x80, 0xce, 0x77, 0xbc, 0xb6, 0x2d, 0x4f, 0xbd, 0xe0,
+ 0xa7, 0x0b, 0x3b, 0xda, 0xb6, 0x76, 0x3d, 0xc8, 0xce, 0x14, 0x3c, 0xe0, 0xbf,
+ 0x20, 0xbb, 0x10, 0xa1, 0x94, 0x3b, 0x02, 0x4e, 0x3f, 0x3d, 0xa0, 0xe9, 0x0c,
+ 0xbc, 0x6a, 0x57, 0x2b, 0xbd, 0x22, 0x09, 0x1d, 0xbd, 0xa8, 0xa6, 0x4c, 0x3c,
+ 0x21, 0x7d, 0x40, 0xbd, 0x91, 0xdf, 0x87, 0x3d, 0x65, 0xe4, 0x05, 0xbd, 0xdc,
+ 0xd6, 0x84, 0xbd, 0x22, 0x49, 0x79, 0x3d, 0xf4, 0xf7, 0x40, 0xbc, 0x2c, 0x16,
+ 0x86, 0xbc, 0xa8, 0x26, 0x40, 0x3d, 0xaa, 0x89, 0xa9, 0xbc, 0xc4, 0x74, 0xc5,
+ 0xbc, 0x3c, 0x76, 0x83, 0xbc, 0x2b, 0xf7, 0x90, 0x3d, 0xa8, 0x0c, 0x6f, 0xbc,
+ 0xdc, 0x96, 0x2c, 0x3d, 0xe0, 0x71, 0x88, 0x3c, 0x66, 0x9f, 0x2a, 0xbd, 0xf1,
+ 0x10, 0x82, 0x3d, 0x41, 0x73, 0x41, 0xbd, 0x7e, 0x2c, 0x21, 0xbd, 0xf0, 0xea,
+ 0x08, 0x3c, 0x54, 0xb4, 0x2a, 0xbc, 0xf6, 0xf5, 0x64, 0xbd, 0x46, 0xf9, 0x2a,
+ 0xbd, 0x54, 0xa4, 0x29, 0x3d, 0x1e, 0x79, 0xee, 0xbc, 0xf5, 0x8b, 0x83, 0x3d,
+ 0x30, 0x04, 0x10, 0x3d, 0x14, 0x83, 0x4e, 0x3d, 0x67, 0x9f, 0x62, 0xbd, 0x00,
+ 0x01, 0x10, 0xbd, 0x96, 0xc8, 0x2c, 0x3d, 0x3f, 0x58, 0x8e, 0x3d, 0x34, 0xeb,
+ 0xe1, 0x3c, 0x12, 0x5d, 0x87, 0xbc, 0x0b, 0x23, 0x80, 0x3d, 0x0a, 0x55, 0x81,
+ 0xbd, 0xc2, 0x80, 0x16, 0xbd, 0x58, 0xa6, 0x7a, 0x3c, 0xec, 0x9a, 0xf1, 0x3c,
+ 0xf0, 0x0e, 0xaa, 0x3c, 0xe2, 0x06, 0x9a, 0xbc, 0x20, 0x57, 0xec, 0xbb, 0xe8,
+ 0x5b, 0xc6, 0x3c, 0x40, 0x51, 0x3b, 0x3c, 0x47, 0xf6, 0x8e, 0x3d, 0x6e, 0xc5,
+ 0x06, 0xbd, 0xac, 0xf6, 0x2b, 0x3d, 0xec, 0x29, 0x05, 0x3d, 0x76, 0xd9, 0x2e,
+ 0x3d, 0x7c, 0x02, 0x40, 0xbc, 0x5e, 0x98, 0x8b, 0xbc, 0x20, 0xf8, 0x8b, 0x3c,
+ 0xcc, 0x04, 0x59, 0xbc, 0xd7, 0xfe, 0x8a, 0x3d, 0xda, 0xed, 0x1a, 0xbd, 0x82,
+ 0x45, 0x9b, 0xbc, 0xfc, 0xa0, 0x7b, 0xbc, 0x14, 0x19, 0x0a, 0x3d, 0x7c, 0x3a,
+ 0x7d, 0xbd, 0x46, 0x32, 0x91, 0xbd, 0xc0, 0xea, 0x8b, 0x3c, 0x0e, 0x44, 0x78,
+ 0x3d, 0x96, 0x53, 0x2a, 0x3d, 0x3a, 0xbb, 0x79, 0x3d, 0x1f, 0xe3, 0x19, 0xbd,
+ 0x56, 0xbb, 0x67, 0x3d, 0x44, 0x48, 0x86, 0x3c, 0x33, 0x5f, 0x8e, 0xbd, 0xc0,
+ 0x86, 0x8c, 0xbc, 0xb0, 0x2a, 0x8e, 0x3b, 0x20, 0xd2, 0x8f, 0xbd, 0x16, 0x08,
+ 0x67, 0x3d, 0x4a, 0xc7, 0x67, 0x3d, 0x50, 0x7c, 0xfd, 0xbc, 0xb0, 0xc1, 0x3f,
+ 0xbd, 0xc0, 0x77, 0xde, 0x3b, 0x98, 0x6b, 0x98, 0xbc, 0x10, 0x91, 0xa0, 0x3b,
+ 0x80, 0x9a, 0xed, 0x3c, 0xdd, 0xc9, 0x82, 0x3d, 0x2c, 0x20, 0x4d, 0x3d, 0x05,
+ 0xe9, 0x78, 0xbd, 0x44, 0xae, 0xcd, 0x3c, 0xd8, 0x92, 0x81, 0x3c, 0x57, 0xa3,
+ 0x77, 0xbd, 0xbe, 0x2e, 0x65, 0xbd, 0x74, 0xfc, 0x41, 0x3d, 0xa2, 0x99, 0x7b,
+ 0x3d, 0xe0, 0x55, 0x98, 0x3b, 0xe4, 0xdf, 0xa5, 0x3c, 0xcf, 0x0c, 0x16, 0xbd,
+ 0x68, 0x3f, 0x78, 0xbd, 0xbe, 0xe3, 0x4e, 0x3d, 0xf4, 0x7f, 0x4a, 0x3d, 0xaa,
+ 0x64, 0x3b, 0xbd, 0xa7, 0xe7, 0x83, 0xbd, 0xe0, 0x45, 0x60, 0x3b, 0x41, 0x1e,
+ 0x0c, 0xbd, 0x14, 0xa6, 0x90, 0xbd, 0x71, 0x37, 0x5f, 0xbd, 0x72, 0x90, 0xb8,
+ 0xbc, 0xc6, 0x6e, 0x3b, 0xbd, 0x4d, 0x5e, 0xe0, 0xbc, 0x40, 0x74, 0x5b, 0xbb,
+ 0xb2, 0x61, 0x06, 0x3d, 0xc8, 0xd6, 0xc1, 0x3c, 0xa9, 0x80, 0x85, 0xbd, 0x76,
+ 0xe9, 0x20, 0x3d, 0x1a, 0xcc, 0x80, 0x3d, 0x39, 0x17, 0xdf, 0xbc, 0xe1, 0x45,
+ 0x8c, 0x3c, 0x67, 0x35, 0x48, 0x3d, 0x9d, 0x17, 0x76, 0xbd, 0x38, 0xa6, 0xb2,
+ 0xba, 0xad, 0x55, 0xaf, 0x3c, 0xf4, 0x50, 0x5e, 0x3d, 0x02, 0x7b, 0xd9, 0xba,
+ 0x0a, 0x74, 0x0f, 0xbd, 0xa9, 0x69, 0x54, 0x3d, 0x3e, 0xa8, 0x6c, 0x3d, 0xcc,
+ 0xde, 0x27, 0xbd, 0x4f, 0x51, 0xa7, 0xbb, 0xbf, 0x78, 0x26, 0xbd, 0x66, 0xcc,
+ 0x84, 0xbd, 0xce, 0x30, 0xcd, 0xbc, 0xab, 0x28, 0x60, 0x3d, 0x97, 0xdb, 0x31,
+ 0xbd, 0x6f, 0x6f, 0xc3, 0x3b, 0xe0, 0x7e, 0x8c, 0xbd, 0x06, 0xe2, 0xc0, 0xbc,
+ 0xce, 0x5b, 0x7a, 0xbd, 0xa5, 0xfb, 0xe1, 0xbc, 0xbd, 0x3b, 0x44, 0xbd, 0x90,
+ 0xa1, 0xbd, 0x3b, 0xc9, 0xba, 0x34, 0xbc, 0x5f, 0xab, 0x08, 0xbd, 0xf8, 0x5a,
+ 0x5f, 0x3c, 0x23, 0xbe, 0x8c, 0x3d, 0xbc, 0x19, 0xad, 0xbc, 0xb1, 0xd8, 0x19,
+ 0xbd, 0x33, 0x7a, 0x85, 0x3d, 0xa5, 0x19, 0xc7, 0x3b, 0x83, 0x55, 0x83, 0xbc,
+ 0x9d, 0x63, 0x08, 0x3d, 0x36, 0x98, 0x1c, 0x3d, 0x20, 0x2d, 0x2d, 0xbc, 0x6b,
+ 0xc3, 0x68, 0xbd, 0xbc, 0x22, 0xb6, 0x3c, 0x93, 0xdb, 0xc0, 0x3a, 0x88, 0x17,
+ 0xdf, 0x3c, 0x0d, 0x0d, 0x2c, 0xbd, 0xc0, 0x40, 0x60, 0x3b, 0xea, 0xf9, 0x3f,
+ 0xbd, 0x0d, 0xd7, 0x03, 0xbd, 0x45, 0x08, 0x68, 0xbd, 0xb3, 0xa4, 0xe9, 0xbc,
+ 0xfd, 0xe9, 0x5f, 0x3d, 0x4c, 0x45, 0x0c, 0x3d, 0xff, 0xdb, 0xa3, 0xbc, 0x12,
+ 0x16, 0x88, 0xbd, 0x70, 0x42, 0xe5, 0xbc, 0x60, 0xda, 0x1c, 0x3c, 0x2b, 0x55,
+ 0xf8, 0x3b, 0x07, 0x82, 0x87, 0x3c, 0x08, 0x94, 0x83, 0xbd, 0x66, 0xf3, 0x44,
+ 0x3d, 0x0b, 0xed, 0x10, 0x3c, 0x1b, 0x7e, 0x8f, 0xbd, 0xbe, 0x4c, 0xb5, 0xbc,
+ 0xc4, 0x84, 0x26, 0x3d, 0x80, 0x5f, 0x6a, 0xbc, 0xb8, 0x41, 0x29, 0x3d, 0xfa,
+ 0xbc, 0x4a, 0x3d, 0xbe, 0x44, 0x47, 0xbc, 0xc1, 0x9b, 0x21, 0x3d, 0x33, 0xb8,
+ 0xd7, 0xbc, 0x54, 0xe6, 0x53, 0x3d, 0xd8, 0x95, 0x3d, 0xbd, 0x2b, 0x4d, 0x90,
+ 0x3d, 0x0c, 0x3c, 0x3a, 0xbc, 0x6c, 0x41, 0x24, 0xbd, 0x31, 0xfd, 0x66, 0xbd,
+ 0x43, 0x29, 0x4a, 0x3d, 0x00, 0x8d, 0xc3, 0xb9, 0x20, 0xd6, 0xe2, 0xbb, 0xb7,
+ 0xf6, 0x22, 0xbd, 0xe9, 0xd7, 0x3f, 0x3d, 0x8d, 0xb7, 0xf7, 0x3c, 0x2b, 0x56,
+ 0x8b, 0x3d, 0xa6, 0xa7, 0x70, 0xbd, 0xdf, 0x62, 0x56, 0x3d, 0xe9, 0x4b, 0xb0,
+ 0x3c, 0x40, 0xb6, 0x04, 0x3c, 0x34, 0x8c, 0x04, 0xbd, 0xb9, 0x1a, 0x1b, 0x3d,
+ 0x25, 0xbc, 0x05, 0xbd, 0x3d, 0x10, 0x1c, 0xbd, 0x77, 0x24, 0x8c, 0xbd, 0x53,
+ 0x9b, 0xdf, 0x3b, 0x80, 0xc9, 0x53, 0x3d, 0x40, 0xc7, 0x6c, 0xbc, 0x00, 0xb3,
+ 0xbe, 0xba, 0xe5, 0xe9, 0x89, 0x3d, 0xb0, 0x72, 0x88, 0xbd, 0xcd, 0x2d, 0x0c,
+ 0xbd, 0x27, 0x35, 0x07, 0xbd, 0x6b, 0x6a, 0x49, 0xbd, 0x99, 0x9b, 0x51, 0xbd,
+ 0x1c, 0x94, 0x51, 0x3c, 0x78, 0x26, 0x6a, 0xbd, 0xc2, 0x3e, 0x04, 0x3d, 0xf3,
+ 0x19, 0x16, 0xbd, 0x9c, 0xb7, 0x0b, 0xbd, 0xb8, 0x3d, 0xf9, 0x3c, 0x69, 0xdb,
+ 0x14, 0x3d, 0x0a, 0xe3, 0x0f, 0xbd, 0x1a, 0xd5, 0x80, 0xbd, 0xed, 0x79, 0x8d,
+ 0x3c, 0x1b, 0x21, 0x00, 0xbb, 0x9a, 0x88, 0x0e, 0x3d, 0xc0, 0x1c, 0x66, 0x3d,
+ 0x60, 0x74, 0x82, 0xbd, 0x7b, 0x96, 0x1c, 0x3d, 0x53, 0x16, 0x49, 0x3d, 0xeb,
+ 0xfc, 0x8d, 0x3d, 0xb0, 0x52, 0x32, 0x3c, 0xa0, 0xa5, 0x5a, 0xbd, 0xfe, 0xf7,
+ 0x9c, 0xbc, 0x19, 0x78, 0x4a, 0x3c, 0x78, 0xd1, 0xc2, 0x3c, 0xb4, 0x51, 0x91,
+ 0xbd, 0x47, 0x08, 0x76, 0xbd, 0x7e, 0x70, 0x02, 0x3d, 0x8b, 0x90, 0x80, 0xbd,
+ 0xc0, 0xad, 0x10, 0xbd, 0xc6, 0x2e, 0x4d, 0xbd, 0x0e, 0xe4, 0x0b, 0x3d, 0x9e,
+ 0x8e, 0x8f, 0x3b, 0xd6, 0x81, 0x8a, 0xbd, 0xb9, 0x43, 0x05, 0xbd, 0xfd, 0xb4,
+ 0x3d, 0xbd, 0x69, 0x1b, 0xa9, 0xbb, 0x0b, 0xb6, 0x88, 0xbd, 0xe3, 0x8f, 0x64,
+ 0x3d, 0xd9, 0xda, 0x4d, 0x3c, 0xa8, 0xa9, 0x66, 0xbd, 0x87, 0x10, 0x23, 0x3d,
+ 0xf6, 0x03, 0x3b, 0x3d, 0xa4, 0xcb, 0x83, 0x3c, 0x36, 0xd0, 0x2a, 0xbd, 0x22,
+ 0x31, 0x27, 0x3d, 0xf0, 0xfb, 0x18, 0x3d, 0x8e, 0xa1, 0x04, 0x3d, 0x67, 0x0e,
+ 0x67, 0xbc, 0x77, 0x07, 0x90, 0x3d, 0xaf, 0x11, 0x72, 0x3d, 0x7b, 0xdd, 0x80,
+ 0x3d, 0x18, 0xd2, 0x6e, 0xbc, 0x0c, 0xfa, 0x5e, 0xbd, 0xe8, 0x92, 0xaf, 0xbc,
+ 0x8f, 0x89, 0xe9, 0x3c, 0x15, 0x06, 0x1d, 0x3c, 0x02, 0x7f, 0x81, 0x3d, 0x88,
+ 0xe0, 0x0f, 0xbd, 0x16, 0x6a, 0xab, 0xbc, 0xc4, 0x1f, 0xdf, 0x3c, 0x38, 0xab,
+ 0x4b, 0x3c, 0x40, 0xfd, 0x83, 0x3b, 0x71, 0x9a, 0x52, 0xbd, 0x90, 0x3f, 0x04,
+ 0xbd, 0xe4, 0x23, 0x81, 0x3d, 0x4a, 0xaa, 0x39, 0xbd, 0xc1, 0xb6, 0x7c, 0x3d,
+ 0xa4, 0xb4, 0x2d, 0x3d, 0x3c, 0x8b, 0xea, 0x3b, 0xf3, 0x93, 0x8e, 0x3d, 0x9b,
+ 0xea, 0x87, 0xbc, 0x25, 0x22, 0x91, 0xbd, 0xeb, 0x03, 0x1a, 0x3d, 0xde, 0xb3,
+ 0x41, 0x3d, 0xb3, 0x03, 0x59, 0xbd, 0x98, 0xea, 0x1d, 0xbd, 0xaf, 0x46, 0xd9,
+ 0xbc, 0xc0, 0x55, 0x3e, 0xbd, 0x4d, 0xe2, 0x45, 0x3d, 0x85, 0xa0, 0x44, 0x3c,
+ 0x00, 0xe5, 0x3e, 0xbd, 0x6f, 0x4e, 0x4b, 0xbb, 0xe1, 0xcd, 0x86, 0x3c, 0x90,
+ 0xaa, 0x08, 0xbd, 0xb6, 0xb9, 0x7a, 0x3d, 0x45, 0x80, 0x5c, 0x3d, 0xda, 0x7b,
+ 0x28, 0xbd, 0x4e, 0x73, 0xc1, 0xbc, 0x8b, 0xff, 0x1b, 0x3d, 0xe0, 0xad, 0x71,
+ 0xbc, 0x5c, 0xa3, 0xd3, 0xbc, 0x93, 0x08, 0x85, 0x3d, 0xce, 0x42, 0x3a, 0x3d,
+ 0x31, 0x10, 0x86, 0x3d, 0x28, 0x95, 0x86, 0x3a, 0x81, 0x0e, 0x39, 0xbd, 0xa6,
+ 0xb2, 0x57, 0x3d, 0x97, 0xab, 0xf8, 0xbc, 0x53, 0x5b, 0x9f, 0xbc, 0x79, 0x78,
+ 0x54, 0x3d, 0xdc, 0x5b, 0x8b, 0x3d, 0xf5, 0xe7, 0x2d, 0x3d, 0xe7, 0x23, 0xa4,
+ 0xbc, 0x6a, 0xff, 0x83, 0x3d, 0x53, 0xe7, 0x48, 0x3d, 0x27, 0x3c, 0x8c, 0x3d,
+ 0x44, 0xdf, 0x74, 0xbd, 0x58, 0xe8, 0xf3, 0xbc, 0x4c, 0x9f, 0x57, 0x3c, 0x6c,
+ 0xb6, 0x95, 0x3c, 0xbd, 0x8e, 0x65, 0x3d, 0x11, 0x3e, 0xcb, 0x3c, 0x88, 0x0e,
+ 0x02, 0xbd, 0x68, 0x1c, 0x8d, 0xbb, 0xe9, 0xaa, 0x81, 0x3d, 0x00, 0xcc, 0x35,
+ 0xbd, 0x4f, 0x0b, 0x8f, 0xbd, 0xa4, 0xaa, 0x40, 0xbc, 0x0a, 0x00, 0xac, 0xbc,
+ 0xe2, 0x2a, 0x40, 0xbd, 0xc3, 0xff, 0x05, 0xbd, 0x09, 0xbe, 0x65, 0xbd, 0xe6,
+ 0xde, 0x7e, 0xbd, 0x30, 0x36, 0x17, 0x3c, 0x50, 0x30, 0x0e, 0xbc, 0x64, 0x36,
+ 0xfa, 0x3c, 0x9d, 0x5a, 0x85, 0xbb, 0x50, 0x2c, 0x65, 0xbc, 0x90, 0x5a, 0xae,
+ 0xbb, 0x37, 0xe6, 0x41, 0xbd, 0xfd, 0x21, 0xf7, 0xbc, 0xb5, 0x91, 0x8b, 0xbb,
+ 0x15, 0xaa, 0xbe, 0x3c, 0x86, 0x46, 0x78, 0xbd, 0xd4, 0x41, 0xf8, 0xbc, 0xf2,
+ 0xb7, 0xe4, 0x3c, 0x1b, 0x84, 0x5a, 0x3c, 0x5a, 0xc8, 0x5e, 0x3d, 0x74, 0xad,
+ 0xa8, 0x3c, 0x71, 0xbe, 0xa0, 0xbc, 0x9b, 0xaf, 0x2b, 0x3d, 0x43, 0x1b, 0x69,
+ 0xbd, 0xb3, 0xe7, 0x88, 0x3d, 0xbd, 0xe2, 0x5c, 0x3d, 0x6b, 0xa4, 0x35, 0xbd,
+ 0xe9, 0xbc, 0x8f, 0xbd, 0x16, 0xc0, 0x74, 0x3d, 0x92, 0xb9, 0x4c, 0x3d, 0x5d,
+ 0xee, 0x91, 0x3c, 0x74, 0xda, 0x1d, 0xbd, 0xda, 0x42, 0x5a, 0xbb, 0x70, 0x1b,
+ 0xbc, 0x3c, 0xc3, 0x23, 0xd9, 0xba, 0x6c, 0xf4, 0xa4, 0x3c, 0x9c, 0x95, 0x0a,
+ 0x3d, 0xb8, 0x03, 0x9e, 0x3c, 0x05, 0x7b, 0x84, 0x3d, 0x88, 0x24, 0x29, 0x3d,
+ 0x6e, 0xb3, 0x72, 0x3d, 0x36, 0x31, 0x62, 0x3c, 0xea, 0x27, 0x24, 0xbd, 0x6d,
+ 0xf3, 0xe5, 0x3c, 0x2e, 0x24, 0x1f, 0x3d, 0x69, 0x95, 0x6b, 0xbd, 0xa6, 0xdf,
+ 0x42, 0xba, 0xdd, 0x6e, 0x90, 0xbd, 0xb3, 0x52, 0x00, 0xbd, 0xbe, 0x22, 0x02,
+ 0x3d, 0xbf, 0x61, 0x80, 0xbd, 0x8d, 0xde, 0x82, 0x3d, 0xf4, 0x40, 0x28, 0x3d,
+ 0x7b, 0xeb, 0xb7, 0xba, 0xe1, 0x73, 0x94, 0x3c, 0xae, 0x7f, 0x12, 0xba, 0x02,
+ 0xf0, 0x40, 0xbb, 0xf1, 0xb7, 0x05, 0x3d, 0x0d, 0xbb, 0x6b, 0xbd, 0xe2, 0x4f,
+ 0x12, 0xbd, 0x0a, 0x66, 0x09, 0xbd, 0xb7, 0xe9, 0x8f, 0x3d, 0x0d, 0x7c, 0x14,
+ 0x3d, 0x11, 0xf4, 0xbe, 0xba, 0x09, 0x4d, 0x38, 0xbd, 0x80, 0x94, 0x41, 0x3a,
+ 0xd3, 0x89, 0xc2, 0x3c, 0xd8, 0x3a, 0x3d, 0x3c, 0x28, 0x00, 0x5f, 0xbc, 0xc4,
+ 0x2a, 0x91, 0xbc, 0x50, 0x98, 0xe6, 0xbc, 0xfa, 0x52, 0x16, 0x3d, 0x3c, 0xb5,
+ 0x87, 0x3d, 0xed, 0xcf, 0x70, 0x3c, 0x78, 0x9e, 0x72, 0xbb, 0x93, 0x6b, 0x23,
+ 0x3d, 0xf0, 0xaf, 0x64, 0xbd, 0xce, 0xd7, 0x5e, 0xbd, 0x6c, 0x20, 0x7b, 0xbc,
+ 0xd0, 0x7a, 0xe0, 0xbb, 0x60, 0xfd, 0xef, 0x3b, 0x95, 0xe5, 0x5f, 0xbd, 0xdf,
+ 0x49, 0x33, 0x3c, 0x11, 0x3d, 0x80, 0x3d, 0xd4, 0x04, 0xc8, 0x3c, 0x58, 0xc0,
+ 0x41, 0xbd, 0x50, 0x35, 0x63, 0x3d, 0xd2, 0x8a, 0xc8, 0xbc, 0x67, 0xf0, 0x8b,
+ 0xbd, 0x69, 0x02, 0x55, 0x3d, 0x0c, 0xa1, 0x76, 0xbd, 0xa8, 0x5e, 0x05, 0xbb,
+ 0xd0, 0xc3, 0x16, 0x3d, 0x78, 0x7f, 0x23, 0xbc, 0x59, 0x25, 0x5c, 0xbd, 0xb4,
+ 0xaf, 0x36, 0xbd, 0x26, 0xc1, 0xd0, 0xb9, 0xa3, 0xb9, 0x54, 0x3d, 0xd3, 0x99,
+ 0xea, 0xbc, 0x56, 0x87, 0xfc, 0xbc, 0x86, 0x17, 0x16, 0xbd, 0x80, 0x75, 0x17,
+ 0xbd, 0xe9, 0xe9, 0x26, 0xbd, 0x73, 0xd9, 0x7f, 0xbd, 0x78, 0xf7, 0x08, 0x3d,
+ 0xb4, 0x6e, 0x24, 0x3d, 0xdb, 0x78, 0x04, 0x3d, 0x91, 0x4e, 0x5e, 0x3d, 0x93,
+ 0x73, 0x86, 0x3d, 0xd5, 0xc8, 0x41, 0xbd, 0x18, 0x68, 0x79, 0x3d, 0x1e, 0x5e,
+ 0x74, 0xbd, 0x05, 0x92, 0x43, 0x3d, 0xed, 0xd7, 0xcb, 0x3c, 0x90, 0x04, 0x48,
+ 0xbd, 0x2a, 0x81, 0x59, 0xbd, 0xa6, 0xf8, 0x8f, 0xbd, 0x21, 0x1b, 0x82, 0x3d,
+ 0x47, 0x2f, 0x03, 0xbd, 0x49, 0x8a, 0xea, 0x3b, 0x82, 0x20, 0x29, 0x3d, 0x3e,
+ 0x06, 0x0a, 0x3b, 0x0d, 0xe3, 0x93, 0x3c, 0x3f, 0xb2, 0x83, 0x3d, 0x57, 0x42,
+ 0xe4, 0x3b, 0x02, 0x82, 0xde, 0xbc, 0x75, 0x96, 0x0a, 0xbd, 0x66, 0xb5, 0x0a,
+ 0x3d, 0x11, 0xed, 0x8d, 0xbd, 0xc5, 0x7c, 0x61, 0xbd, 0x85, 0xde, 0x56, 0xbc,
+ 0x2f, 0x3e, 0x41, 0xbd, 0x65, 0x92, 0x70, 0x3d, 0x10, 0x6d, 0xd8, 0xbb, 0x6e,
+ 0x7b, 0x45, 0x3d, 0xe0, 0xcd, 0x58, 0x3d, 0x5a, 0xa0, 0x6c, 0xbd, 0x25, 0x13,
+ 0x2f, 0xbd, 0x95, 0xcf, 0x6b, 0xbd, 0x42, 0x36, 0x20, 0xbc, 0x3c, 0x82, 0x47,
+ 0x3c, 0x71, 0xef, 0x16, 0x3c, 0x50, 0xa2, 0xb8, 0xba, 0x7e, 0xc4, 0x61, 0x3c,
+ 0xa6, 0xc5, 0x78, 0xbd, 0xb9, 0x33, 0x32, 0xbd, 0x47, 0x60, 0x81, 0x3d, 0x58,
+ 0xd9, 0x16, 0x3d, 0x3a, 0x50, 0x7a, 0xbd, 0x47, 0xc7, 0x15, 0x3d, 0x00, 0xca,
+ 0x8a, 0xbd, 0x6f, 0x8f, 0x83, 0xbd, 0x7b, 0x4f, 0x58, 0xba, 0x30, 0x8f, 0x43,
+ 0xbd, 0xd1, 0x28, 0xd6, 0xbb, 0x20, 0x94, 0xf7, 0xbc, 0x84, 0xef, 0x25, 0xbd,
+ 0x06, 0x79, 0x6f, 0x3d, 0xdb, 0x3e, 0xcd, 0x3c, 0xc7, 0xce, 0x79, 0x3d, 0x23,
+ 0x71, 0x97, 0xbc, 0x5c, 0x5c, 0x38, 0x3d, 0xc8, 0xb6, 0x03, 0xbd, 0xd6, 0x31,
+ 0xc6, 0xbc, 0x33, 0xe1, 0xd0, 0xbb, 0x66, 0xf2, 0xd5, 0xbc, 0xe2, 0x07, 0x49,
+ 0x3d, 0x2c, 0x67, 0xc9, 0xbc, 0x71, 0xd2, 0x41, 0xbd, 0x1a, 0xb4, 0x81, 0x3c,
+ 0xf0, 0x27, 0x7d, 0x3d, 0xca, 0xcc, 0xd5, 0xbc, 0x3f, 0x3e, 0x30, 0xbd, 0x50,
+ 0xe1, 0x26, 0xba, 0x53, 0x7d, 0x00, 0x3d, 0x8e, 0x75, 0x4d, 0x3b, 0x0a, 0x56,
+ 0x20, 0x3d, 0x61, 0xaf, 0xf4, 0xbc, 0x55, 0x41, 0x98, 0xbc, 0x16, 0x66, 0x13,
+ 0x3d, 0x40, 0x96, 0x67, 0xbd, 0x40, 0x3a, 0x0b, 0xbd, 0xbe, 0x16, 0x88, 0xbc,
+ 0x54, 0xd1, 0x56, 0xbd, 0xd5, 0xa2, 0xba, 0xbb, 0x97, 0x30, 0x1f, 0xbb, 0x37,
+ 0x2d, 0x18, 0xbd, 0xe7, 0xe3, 0x8e, 0xbd, 0x82, 0x9b, 0x29, 0x3c, 0x8f, 0x41,
+ 0x24, 0xbd, 0xa2, 0x55, 0x8f, 0x3b, 0x25, 0xa4, 0x18, 0x3c, 0xb6, 0xee, 0xe7,
+ 0x3c, 0x3a, 0x0b, 0x12, 0xbd, 0x27, 0xfb, 0xb4, 0xb9, 0x70, 0x41, 0x0a, 0xbc,
+ 0xe8, 0x8b, 0x62, 0xbd, 0x04, 0x95, 0xc5, 0x3c, 0xa4, 0x51, 0x46, 0xbd, 0x42,
+ 0x1e, 0x65, 0xbd, 0x4f, 0x3d, 0x4a, 0x3d, 0x6f, 0x9d, 0x19, 0x3d, 0xb8, 0xdb,
+ 0x8c, 0xbd, 0x9a, 0xfe, 0x23, 0x3c, 0x0c, 0x8a, 0x58, 0x3d, 0xe2, 0x61, 0x62,
+ 0xbd, 0x1f, 0xee, 0x64, 0x3c, 0x0c, 0xb0, 0x9a, 0x3b, 0xe8, 0x9f, 0xf7, 0xbc,
+ 0x54, 0xf9, 0xef, 0xbc, 0xbb, 0x3b, 0x57, 0x3a, 0xcc, 0x92, 0xa6, 0x3c, 0xfa,
+ 0x7f, 0xf0, 0x3c, 0x92, 0x0c, 0x03, 0x3d, 0xc4, 0xa7, 0x0b, 0xbd, 0x3d, 0xf1,
+ 0x8b, 0xbd, 0x6a, 0x7a, 0x4c, 0xbd, 0xfe, 0x96, 0xdc, 0x3c, 0xf8, 0x93, 0x99,
+ 0x3b, 0xe4, 0xd7, 0x70, 0x3d, 0x72, 0x25, 0x4f, 0x3d, 0xc0, 0xa1, 0x80, 0xbd,
+ 0xb8, 0xac, 0x50, 0x3d, 0x87, 0x18, 0x87, 0xbc, 0xcc, 0xe2, 0x01, 0xbd, 0x70,
+ 0x67, 0xfb, 0xbb, 0xda, 0x29, 0x7c, 0x3d, 0xe6, 0xf0, 0x67, 0x3d, 0x98, 0xd8,
+ 0x0e, 0x3d, 0xe8, 0xf6, 0x45, 0xbd, 0xcc, 0x76, 0x57, 0xbd, 0x12, 0xec, 0x02,
+ 0x3d, 0x02, 0x73, 0xbf, 0x3c, 0xea, 0x67, 0x9e, 0x3a, 0x29, 0x29, 0x1f, 0x3d,
+ 0x19, 0x65, 0x2a, 0x3d, 0x9c, 0x3a, 0x86, 0x3d, 0xd8, 0xcd, 0x15, 0xbd, 0xf3,
+ 0xed, 0x75, 0xbd, 0xa6, 0x30, 0xff, 0xbc, 0x87, 0x2e, 0xc7, 0x3c, 0xe6, 0x41,
+ 0xb9, 0x3c, 0x38, 0xf9, 0xb0, 0x3c, 0x49, 0x88, 0x8c, 0xbd, 0xf2, 0x2b, 0x70,
+ 0x3d, 0x3d, 0x58, 0xec, 0x3b, 0xa2, 0x59, 0x3a, 0x3c, 0x3f, 0x5f, 0x3a, 0x3d,
+ 0x5f, 0xb9, 0x48, 0xbd, 0x09, 0x9a, 0xc5, 0x3b, 0x12, 0x63, 0x84, 0xbd, 0x11,
+ 0x76, 0x5e, 0x3d, 0x4f, 0xa0, 0x84, 0x3d, 0x90, 0x8b, 0x29, 0xbd, 0x03, 0xcc,
+ 0x2c, 0xbd, 0xbe, 0x89, 0x8f, 0xbd, 0xa5, 0x7a, 0x81, 0x3d, 0x54, 0xa8, 0xd0,
+ 0x3c, 0x54, 0x70, 0x9d, 0xbb, 0x4a, 0xe4, 0xb9, 0xbc, 0x94, 0x65, 0xfe, 0xbc,
+ 0x3c, 0xef, 0xac, 0x3c, 0x4c, 0x87, 0x16, 0xbd, 0x0a, 0xda, 0x85, 0xbc, 0x89,
+ 0x04, 0x88, 0x3d, 0xb6, 0xe7, 0x19, 0x3d, 0x38, 0x06, 0x08, 0xbd, 0x37, 0x6c,
+ 0x3d, 0xbd, 0x75, 0x70, 0x09, 0x3d, 0x13, 0x5c, 0x7f, 0xbd, 0xe2, 0x25, 0xfb,
+ 0x3c, 0x74, 0xe4, 0x06, 0x3d, 0xd8, 0xcb, 0x82, 0x3d, 0xbc, 0xa0, 0xeb, 0xbc,
+ 0xaf, 0xb1, 0x8e, 0xbd, 0x30, 0x53, 0xdc, 0x3b, 0x4b, 0x94, 0x84, 0x3d, 0xc9,
+ 0x6d, 0xcd, 0x3c, 0xd1, 0x47, 0x8e, 0x3d, 0x5e, 0x1a, 0x15, 0xbc, 0x0b, 0xe3,
+ 0xb2, 0x3c, 0x4c, 0x7f, 0xfb, 0x3c, 0x6e, 0x6d, 0x53, 0x3d, 0xdc, 0xa5, 0x8d,
+ 0x3d, 0x71, 0x25, 0x85, 0xbd, 0xc8, 0xa9, 0x17, 0xbc, 0xe1, 0xcd, 0xf3, 0xbc,
+ 0xbd, 0xc5, 0x5f, 0xbd, 0xde, 0xbc, 0x07, 0x3d, 0x2a, 0x50, 0x91, 0x3c, 0x12,
+ 0x64, 0x9a, 0x3b, 0x54, 0x8b, 0x02, 0x3d, 0x2d, 0x77, 0x8b, 0xbd, 0x83, 0x37,
+ 0x82, 0x3d, 0x5f, 0xdb, 0x50, 0xbd, 0xba, 0xe6, 0x63, 0x3d, 0x2d, 0x97, 0x21,
+ 0x3d, 0xfe, 0xba, 0x80, 0x3d, 0xe4, 0xc2, 0x39, 0xbd, 0x8d, 0x37, 0x94, 0x3c,
+ 0x8d, 0xe8, 0xb0, 0xbc, 0x0e, 0xbc, 0xa9, 0xbc, 0xbb, 0xfb, 0xb1, 0xbb, 0xff,
+ 0xdb, 0x13, 0xbd, 0x15, 0x1e, 0x1f, 0xbd, 0xe6, 0x81, 0x51, 0xbd, 0xf1, 0x39,
+ 0xaf, 0xbc, 0x86, 0x69, 0x68, 0xbd, 0x33, 0x5c, 0xe8, 0x3c, 0x25, 0xd3, 0x5d,
+ 0xbd, 0x77, 0xf4, 0x0e, 0xbd, 0x5f, 0x4b, 0xec, 0x3c, 0xc4, 0x6c, 0xfc, 0x3c,
+ 0x39, 0x1e, 0xc9, 0x3c, 0x2c, 0xdc, 0x6f, 0xbd, 0xf0, 0xdd, 0x5b, 0x3c, 0xba,
+ 0x58, 0x63, 0x3d, 0x20, 0xb8, 0x9c, 0x3b, 0x58, 0x4e, 0xb6, 0xbc, 0x47, 0x2d,
+ 0xc4, 0xbc, 0x0c, 0x5b, 0x6b, 0x3d, 0x00, 0x18, 0xed, 0xb9, 0x96, 0xa9, 0x9e,
+ 0x3c, 0x42, 0x5c, 0x4a, 0xbb, 0x94, 0x9f, 0x85, 0xbd, 0x10, 0xdd, 0xcd, 0x3c,
+ 0x47, 0x98, 0x8c, 0xbd, 0x28, 0x33, 0x6f, 0xbd, 0x6c, 0x52, 0x21, 0x3d, 0x41,
+ 0x5c, 0x45, 0x3c, 0xf7, 0x7c, 0x36, 0xbd, 0x6d, 0xf5, 0xdb, 0xbc, 0x30, 0x95,
+ 0x87, 0x3d, 0xed, 0x8a, 0x8f, 0xbd, 0x79, 0x78, 0x88, 0xbd, 0x0c, 0x54, 0x1c,
+ 0xbc, 0x82, 0xa3, 0xa7, 0x3b, 0x1f, 0xcf, 0x76, 0xbd, 0x71, 0x23, 0x8b, 0x3c,
+ 0x01, 0xc3, 0x87, 0x3d, 0x54, 0xb5, 0xe5, 0x3c, 0x3e, 0x2f, 0x17, 0xbd, 0x99,
+ 0xb5, 0x13, 0x3d, 0x69, 0xf7, 0xad, 0x3c, 0xb1, 0x19, 0x13, 0xbc, 0x0e, 0xf8,
+ 0x5b, 0xbd, 0x74, 0x52, 0x82, 0x3d, 0x7a, 0x5f, 0xfd, 0xbb, 0x2b, 0x17, 0x15,
+ 0xbd, 0x05, 0x3c, 0x72, 0xbd, 0x18, 0xbd, 0xb9, 0xba, 0xaf, 0x8e, 0xc5, 0xbc,
+ 0x7a, 0x8f, 0xc3, 0xbb, 0xd9, 0x64, 0x14, 0xbd, 0x97, 0xdf, 0x55, 0x3d, 0x99,
+ 0x96, 0xac, 0xba, 0x4f, 0x5c, 0x84, 0x3d, 0xa4, 0x57, 0x27, 0x3d, 0xf8, 0x8e,
+ 0x81, 0xbd, 0xf8, 0xef, 0x55, 0x3c, 0x0e, 0x2d, 0x59, 0xbd, 0xf1, 0xeb, 0x52,
+ 0x3a, 0x06, 0xde, 0x94, 0x3c, 0x53, 0x8e, 0x17, 0xbd, 0x5d, 0x25, 0x86, 0x3c,
+ 0x1c, 0x8c, 0x8b, 0xbc, 0x32, 0xa0, 0x1c, 0x3d, 0x2e, 0xb3, 0x53, 0x3d, 0x2e,
+ 0x1c, 0x3f, 0x3d, 0x38, 0xb0, 0xf1, 0x3c, 0x95, 0xc2, 0x55, 0xbb, 0x74, 0x05,
+ 0x39, 0xbd, 0x4a, 0xa6, 0x27, 0x3b, 0xb3, 0x63, 0xd8, 0x3c, 0xd6, 0x03, 0x83,
+ 0x3d, 0x24, 0x65, 0x49, 0xbd, 0x18, 0x9e, 0xee, 0x3c, 0x26, 0xf0, 0x85, 0xbd,
+ 0xfc, 0xd0, 0x67, 0xbd, 0x43, 0xca, 0x12, 0xbd, 0xb1, 0xec, 0x03, 0x3d, 0x00,
+ 0x1e, 0x74, 0x3c, 0xb5, 0x32, 0xa6, 0xbc, 0x3d, 0x56, 0x65, 0x3d, 0x8b, 0x0e,
+ 0xa9, 0xbc, 0x03, 0x1e, 0x91, 0x3d, 0x64, 0x8f, 0x88, 0x3d, 0x1c, 0x50, 0xb5,
+ 0xbc, 0xe4, 0xb3, 0x05, 0xbd, 0x2c, 0x4f, 0x59, 0xbd, 0x29, 0x30, 0x23, 0xbd,
+ 0x0c, 0x23, 0x56, 0xbd, 0x7d, 0x77, 0x82, 0xbc, 0x45, 0x1a, 0xa4, 0x3c, 0xb7,
+ 0x9c, 0x0f, 0xbc, 0xc5, 0x76, 0xd8, 0xbc, 0x7f, 0x4f, 0x78, 0xbd, 0xb4, 0x07,
+ 0x82, 0x3c, 0x56, 0xcc, 0x6a, 0xbd, 0xc3, 0x11, 0x29, 0x3c, 0xa5, 0xf6, 0x7a,
+ 0x3d, 0x8a, 0x88, 0xc4, 0x3c, 0x00, 0xf8, 0xa2, 0xbc, 0x30, 0x08, 0x50, 0xbd,
+ 0x59, 0xcf, 0xb1, 0xbc, 0xd1, 0xba, 0x52, 0xbd, 0xc0, 0xe8, 0xbe, 0x3b, 0xc3,
+ 0xb8, 0xfe, 0xbc, 0x22, 0xc5, 0x84, 0xbd, 0xef, 0x51, 0xbd, 0x3a, 0x75, 0x42,
+ 0xc8, 0xbc, 0x1a, 0x32, 0x88, 0x3d, 0x2a, 0x26, 0xc2, 0xbc, 0x66, 0x17, 0x2a,
+ 0xbd, 0x1d, 0x0f, 0x7f, 0x3d, 0x55, 0x2f, 0x8f, 0x3b, 0x01, 0x47, 0x8c, 0x3d,
+ 0x3a, 0x01, 0x18, 0x3d, 0xca, 0xa0, 0xea, 0xbc, 0x3e, 0x16, 0x34, 0xbd, 0xe8,
+ 0xf7, 0x75, 0x3c, 0x20, 0xee, 0x49, 0x3c, 0x6a, 0xc1, 0x3b, 0xbd, 0xa0, 0x98,
+ 0x5c, 0xbd, 0x60, 0x8e, 0x94, 0x3b, 0xa2, 0x9b, 0x8a, 0x3d, 0x10, 0x4d, 0x4f,
+ 0x3d, 0x87, 0xe4, 0x45, 0xbd, 0xb6, 0x17, 0xdd, 0x3b, 0xee, 0x06, 0x71, 0xbd,
+ 0xca, 0xb4, 0xe0, 0x3c, 0xd4, 0x9d, 0x0b, 0xbd, 0xba, 0x3a, 0x21, 0x3d, 0x6c,
+ 0xfd, 0xaa, 0x3c, 0x35, 0x20, 0x61, 0xbd, 0x20, 0x51, 0x52, 0x3d, 0x96, 0xcc,
+ 0x29, 0xbd, 0x9f, 0x99, 0x22, 0x3d, 0x06, 0x2d, 0xdb, 0xba, 0xdb, 0xf1, 0x90,
+ 0x3c, 0xf9, 0x05, 0x06, 0x3d, 0xdf, 0x02, 0xcb, 0x3c, 0x02, 0xb8, 0xf8, 0xbc,
+ 0x70, 0x14, 0x50, 0xbd, 0x51, 0xdc, 0x88, 0x3d, 0xa8, 0xa5, 0xd6, 0xbc, 0x69,
+ 0xd7, 0x8e, 0x3d, 0xbe, 0x91, 0x86, 0xbd, 0x5d, 0x93, 0x12, 0xbd, 0x7c, 0x23,
+ 0x60, 0xbd, 0xb2, 0x55, 0xb7, 0x3c, 0x38, 0xb8, 0x0e, 0x3d, 0x88, 0x86, 0x0e,
+ 0x3c, 0x9a, 0x4b, 0x0d, 0x3d, 0x00, 0xfa, 0x1a, 0x3b, 0xb8, 0x59, 0xbf, 0x3c,
+ 0xbe, 0xa8, 0xea, 0x3c, 0xfc, 0xf4, 0xf3, 0x3c, 0xbf, 0x69, 0x17, 0x3d, 0x82,
+ 0xe6, 0x84, 0xbd, 0x9d, 0xde, 0x3e, 0xbd, 0x3a, 0x02, 0x5b, 0xbd, 0x04, 0x34,
+ 0x8b, 0xbd, 0x83, 0x26, 0xc5, 0x3c, 0x71, 0x0c, 0x17, 0x3d, 0x44, 0x33, 0x5a,
+ 0xbd, 0xe0, 0x15, 0xe4, 0x3b, 0xd9, 0x25, 0x80, 0xbd, 0xbb, 0xac, 0x56, 0xbd,
+ 0x54, 0x26, 0x6f, 0xbd, 0x30, 0x23, 0xa2, 0x3b, 0x08, 0x7c, 0x27, 0xbd, 0xba,
+ 0x00, 0xde, 0xbc, 0x80, 0x47, 0x8f, 0xbd, 0xca, 0x52, 0x17, 0xbd, 0xf0, 0x9a,
+ 0x0a, 0x3d, 0xe9, 0x6a, 0xea, 0x3b, 0x12, 0xaa, 0x65, 0x3d, 0x3e, 0x1a, 0x49,
+ 0x3d, 0x3b, 0x68, 0x30, 0xbd, 0xfb, 0x34, 0x3d, 0x3d, 0x0c, 0x21, 0xe3, 0x3c,
+ 0x13, 0x68, 0x67, 0xbb, 0xe5, 0xaf, 0x8b, 0xbd, 0xfe, 0x2b, 0x00, 0xbd, 0x5e,
+ 0x1e, 0x4a, 0xbd, 0xb2, 0x94, 0x70, 0x3d, 0xa0, 0x7e, 0x47, 0x3b, 0xde, 0xa9,
+ 0xef, 0xbc, 0x84, 0x2f, 0x1a, 0x3a, 0x26, 0xb6, 0xf8, 0x3c, 0xe4, 0xab, 0xd9,
+ 0xbc, 0xa8, 0x0b, 0x87, 0xbd, 0x70, 0x2c, 0xbd, 0x3c, 0x32, 0xb2, 0x8c, 0x3c,
+ 0xce, 0x0f, 0x34, 0xba, 0xc7, 0xc9, 0x3b, 0xbd, 0x22, 0xdb, 0xf3, 0xbc, 0x8d,
+ 0x4e, 0x48, 0xbd, 0xf0, 0x63, 0x53, 0x3d, 0x04, 0xd6, 0xc7, 0x3b, 0xfa, 0x40,
+ 0x6c, 0xbd, 0x22, 0xfb, 0x80, 0x38, 0xe9, 0x8c, 0x0e, 0x3c, 0xc4, 0x60, 0x27,
+ 0x3d, 0xaa, 0xcf, 0x60, 0x3d, 0xfe, 0x59, 0x08, 0x3d, 0x6e, 0x69, 0x43, 0xbd,
+ 0xcb, 0xa1, 0x03, 0xbd, 0x16, 0x47, 0x72, 0x3d, 0xc1, 0x37, 0x5d, 0x3d, 0x53,
+ 0x6f, 0x8b, 0xbd, 0x50, 0x99, 0x18, 0x3d, 0x65, 0x92, 0x89, 0x3d, 0x12, 0x80,
+ 0x94, 0xbd, 0x8d, 0x1d, 0x21, 0xbd, 0x6e, 0xc6, 0x69, 0x3d, 0x18, 0x1d, 0x23,
+ 0x3d, 0x3e, 0x2b, 0x00, 0x3d, 0xe4, 0x71, 0x4f, 0xbd, 0xfb, 0xc5, 0x0e, 0xbd,
+ 0x6e, 0x24, 0x47, 0x3d, 0x34, 0xf0, 0x50, 0x3c, 0x3f, 0x38, 0x89, 0x3d, 0xb5,
+ 0x84, 0x41, 0xbc, 0xb8, 0xdc, 0x56, 0x3d, 0x3b, 0x56, 0x60, 0xbc, 0x5a, 0x3b,
+ 0x58, 0x3d, 0x86, 0x56, 0x6d, 0xbd, 0x4f, 0x33, 0x43, 0x3d, 0x7e, 0x6c, 0x7d,
+ 0x3c, 0xb9, 0x4c, 0x8b, 0x3d, 0x00, 0x88, 0x3f, 0x3a, 0x3a, 0xb8, 0xc1, 0x3c,
+ 0x02, 0x18, 0x30, 0x3d, 0x6b, 0xb4, 0x4c, 0xbd, 0x0d, 0xd8, 0x3c, 0x3d, 0x9a,
+ 0x25, 0x61, 0xbd, 0x87, 0x7b, 0xa7, 0xbc, 0x76, 0x8e, 0x06, 0xbb, 0x47, 0xf9,
+ 0x73, 0xbd, 0x80, 0xfa, 0x28, 0xbb, 0xd4, 0xd1, 0x76, 0xbd, 0x9a, 0xcb, 0x29,
+ 0xbd, 0xf6, 0x0f, 0xe5, 0xbc, 0x6d, 0xeb, 0x4f, 0xbd, 0x46, 0xe8, 0x69, 0xbc,
+ 0x9a, 0x72, 0x69, 0x3d, 0x55, 0x19, 0x86, 0xbd, 0xba, 0x77, 0x0f, 0x3d, 0x4d,
+ 0xf6, 0x64, 0x3d, 0xf4, 0xf6, 0x19, 0x3d, 0xc3, 0x53, 0x4a, 0x3d, 0x83, 0xc4,
+ 0x7f, 0x3c, 0xb6, 0xcb, 0x53, 0xbd, 0xc5, 0x99, 0x83, 0xbd, 0xa9, 0xcb, 0x4e,
+ 0xbd, 0xbc, 0xc0, 0xf3, 0x3c, 0xc3, 0x45, 0x2c, 0x3d, 0x6a, 0x2f, 0x93, 0xbd,
+ 0x8d, 0x05, 0x67, 0x3d, 0xec, 0x6f, 0x3a, 0x3d, 0xf5, 0x47, 0x5a, 0x3d, 0xca,
+ 0xa6, 0x79, 0x3d, 0x16, 0x97, 0x7d, 0xbd, 0x53, 0x30, 0x52, 0x3d, 0x07, 0x81,
+ 0x52, 0x3d, 0xf7, 0xae, 0xa6, 0xbc, 0xa3, 0xc2, 0xa4, 0xbc, 0x5c, 0xd8, 0x23,
+ 0xbd, 0xc5, 0x77, 0x50, 0x3d, 0x28, 0x78, 0x47, 0x3c, 0xe7, 0xe2, 0x04, 0xbd,
+ 0xcc, 0x6f, 0x83, 0xbd, 0x4c, 0x2b, 0xfc, 0xbc, 0x42, 0xf8, 0xf6, 0x3c, 0x03,
+ 0x7c, 0x87, 0x3d, 0x2d, 0x4d, 0x80, 0xbd, 0x08, 0x59, 0x65, 0x3d, 0x2b, 0x4a,
+ 0x3a, 0xbd, 0xae, 0xec, 0x68, 0x3d, 0x1e, 0x42, 0x85, 0xbd, 0xd6, 0x06, 0x6a,
+ 0x3d, 0x6e, 0xfe, 0x65, 0xbd, 0x77, 0xef, 0xb0, 0x3c, 0x81, 0xb1, 0x48, 0x3c,
+ 0x86, 0x4b, 0x57, 0xbd, 0x1e, 0x45, 0x82, 0x3c, 0x9b, 0x6c, 0x0f, 0xbd, 0xeb,
+ 0x5f, 0x1c, 0xbd, 0xc3, 0x49, 0x3b, 0x3d, 0x5b, 0x31, 0x7b, 0xbd, 0xee, 0xcb,
+ 0x0c, 0xbd, 0x49, 0xa6, 0xa7, 0x3c, 0x89, 0x96, 0x73, 0xbd, 0x4d, 0xcf, 0x89,
+ 0x3d, 0xec, 0x73, 0xe1, 0x3b, 0x0e, 0x74, 0x0b, 0x3c, 0xc4, 0x52, 0xe1, 0xbc,
+ 0xf9, 0x15, 0x5f, 0x3d, 0x4a, 0x6c, 0x6c, 0xbd, 0x1d, 0x1d, 0xc7, 0xbb, 0xa2,
+ 0x11, 0x26, 0x3d, 0x92, 0xa6, 0x00, 0xbd, 0xe8, 0x29, 0x52, 0x3d, 0x6c, 0x9f,
+ 0xc3, 0x3c, 0xa9, 0xf6, 0xea, 0xbc, 0x0b, 0xce, 0x84, 0x3d, 0x3a, 0x7a, 0x83,
+ 0x3d, 0x95, 0x99, 0xff, 0x3c, 0x26, 0xc1, 0xae, 0xbc, 0x4c, 0x73, 0xab, 0x3c,
+ 0x10, 0x47, 0x5f, 0xbd, 0x6c, 0x99, 0xab, 0x3c, 0x40, 0x91, 0xee, 0x3a, 0x30,
+ 0xe9, 0x43, 0xbd, 0xd8, 0xdf, 0xed, 0x3c, 0x93, 0xd4, 0x98, 0xbc, 0x05, 0xf8,
+ 0x8c, 0x3d, 0x8d, 0x54, 0x89, 0xbd, 0x29, 0x6a, 0x5a, 0xbd, 0x54, 0x2f, 0x2d,
+ 0xbd, 0x11, 0x76, 0x90, 0xbd, 0x62, 0x24, 0xdf, 0x3c, 0x1f, 0x0c, 0x92, 0xbd,
+ 0x87, 0xb7, 0x06, 0xbd, 0x28, 0x1b, 0x92, 0xbd, 0x41, 0xb6, 0x19, 0xbd, 0x90,
+ 0xa9, 0xc8, 0xbc, 0x10, 0x06, 0xa2, 0x3c, 0x9b, 0x59, 0x72, 0x3d, 0x9f, 0x9b,
+ 0xc4, 0x3c, 0xc2, 0x44, 0xb9, 0xbb, 0xe4, 0x46, 0x90, 0x3d, 0xe9, 0x54, 0x40,
+ 0xbd, 0x18, 0xdd, 0xc8, 0xbc, 0xff, 0x78, 0x44, 0xbd, 0x6e, 0xaa, 0x92, 0xbc,
+ 0x76, 0xaa, 0x31, 0x3c, 0x37, 0x94, 0xe8, 0xbc, 0x2b, 0x84, 0xf6, 0x3c, 0xce,
+ 0x29, 0x8f, 0xbc, 0x37, 0xdc, 0xaf, 0x3c, 0x40, 0x76, 0xbd, 0x3c, 0xd6, 0x49,
+ 0x50, 0x3d, 0x48, 0x72, 0x36, 0xbd, 0xc7, 0x51, 0x63, 0xbd, 0x04, 0x47, 0x70,
+ 0xbc, 0x02, 0x99, 0x7c, 0xbc, 0x83, 0xb4, 0x44, 0xbd, 0x1d, 0x3b, 0x83, 0xbd,
+ 0x55, 0xe3, 0x41, 0x3d, 0x2c, 0x05, 0xcf, 0x3a, 0x52, 0x65, 0x2f, 0x3d, 0x8e,
+ 0x0d, 0x2d, 0x3d, 0x59, 0x13, 0x43, 0xbd, 0xe6, 0x6e, 0xf3, 0x3c, 0xc3, 0xfc,
+ 0xac, 0x3c, 0x82, 0x9e, 0x5f, 0xbc, 0x07, 0xd9, 0x6f, 0xbd, 0xf0, 0xf1, 0x9d,
+ 0x3b, 0x09, 0xcd, 0x07, 0xbd, 0x99, 0xc1, 0x87, 0x3d, 0xfa, 0xef, 0x73, 0x3d,
+ 0xe5, 0x18, 0xfc, 0x3c, 0xbc, 0x08, 0x06, 0x3d, 0x5e, 0x91, 0x90, 0xbd, 0x9c,
+ 0x69, 0xf7, 0x3b, 0x71, 0x14, 0xef, 0xbc, 0x90, 0x77, 0xf9, 0x3c, 0x4c, 0x17,
+ 0x6e, 0xbd, 0x59, 0x66, 0xe5, 0xbb, 0x6d, 0x0b, 0x5f, 0xbc, 0x8a, 0xde, 0x57,
+ 0x3d, 0xdf, 0x37, 0x84, 0xbd, 0x6a, 0x62, 0x7b, 0x3d, 0x19, 0x4c, 0xc5, 0xbc,
+ 0xf0, 0x81, 0x2b, 0x3d, 0x0c, 0xe8, 0x3f, 0xbd, 0x2c, 0xac, 0x36, 0xbd, 0x2a,
+ 0x6a, 0x2e, 0x3d, 0x90, 0xcc, 0x94, 0xbb, 0x07, 0xfd, 0x28, 0xbd, 0x5e, 0x9f,
+ 0xb7, 0x3b, 0xcc, 0xf7, 0x83, 0xbd, 0x2e, 0x4f, 0xa0, 0xbc, 0x06, 0x60, 0xcc,
+ 0x3c, 0xc6, 0xbf, 0x5d, 0x3c, 0x48, 0x40, 0x6b, 0xbd, 0x69, 0x48, 0x03, 0x3d,
+ 0x75, 0x47, 0x48, 0x3d, 0xc4, 0x2f, 0x0f, 0x3d, 0x2d, 0xa5, 0x6e, 0xbd, 0x5a,
+ 0x05, 0x41, 0xbd, 0x7c, 0x10, 0xff, 0x3c, 0x2c, 0x2e, 0x78, 0xbd, 0x16, 0x4f,
+ 0x7d, 0x3d, 0xcf, 0x20, 0x5f, 0x3d, 0xd7, 0x5c, 0x87, 0xbd, 0x96, 0x63, 0x1e,
+ 0xbc, 0x2b, 0xf3, 0x8c, 0xbc, 0x6e, 0x52, 0x00, 0xbd, 0xb0, 0xb0, 0x47, 0x3d,
+ 0x6e, 0x8c, 0xa2, 0xbc, 0x26, 0xa4, 0xbd, 0x3c, 0x50, 0xfb, 0xc4, 0xbc, 0x16,
+ 0xc5, 0xe2, 0x3c, 0x34, 0xbe, 0xba, 0xbc, 0x58, 0x77, 0x06, 0xbc, 0xb6, 0x0f,
+ 0x02, 0x3d, 0x00, 0xc0, 0x67, 0xbd, 0x19, 0x7b, 0x0f, 0xbd, 0xdf, 0xca, 0x42,
+ 0xbd, 0x28, 0x6b, 0x5d, 0xbd, 0xe8, 0x7b, 0x0b, 0x3d, 0x0f, 0xd3, 0x9b, 0xbc,
+ 0x0e, 0x94, 0x3c, 0x3d, 0x56, 0xcd, 0x32, 0xbd, 0x39, 0x73, 0x82, 0xbd, 0x32,
+ 0x4b, 0x06, 0xbd, 0x77, 0xbe, 0x35, 0xbd, 0x4f, 0x03, 0x0b, 0x3d, 0x40, 0x14,
+ 0x8b, 0x3d, 0xe0, 0x32, 0x60, 0xbd, 0x4f, 0xd0, 0x85, 0x3d, 0x0f, 0xfc, 0x74,
+ 0xbc, 0xa1, 0xfc, 0xfa, 0xbb, 0x83, 0x11, 0x49, 0x3b, 0x48, 0x21, 0x1b, 0xbc,
+ 0x4d, 0x36, 0xe6, 0xbc, 0x27, 0x47, 0x6c, 0xbc, 0x6f, 0x04, 0x37, 0xbd, 0xc6,
+ 0x57, 0x6a, 0x3d, 0xa0, 0x16, 0x4d, 0x3b, 0x1a, 0xeb, 0x55, 0x3d, 0x6e, 0x5f,
+ 0x2d, 0xbd, 0xde, 0xff, 0x65, 0xbd, 0x68, 0x46, 0x49, 0x3c, 0x3c, 0x27, 0x3c,
+ 0xbd, 0xfd, 0xdc, 0x0e, 0xbd, 0xb9, 0xff, 0x24, 0xbd, 0xf0, 0x8f, 0x5c, 0xbd,
+ 0xa8, 0x9d, 0x32, 0x3d, 0x5c, 0x6d, 0x4d, 0xbd, 0x0d, 0xc2, 0x47, 0x3d, 0xf5,
+ 0xe0, 0x8b, 0x3c, 0x4e, 0xd4, 0xfb, 0xbc, 0x2f, 0xef, 0x7d, 0x3d, 0x0d, 0xbf,
+ 0x03, 0x3d, 0x54, 0x6e, 0x16, 0x3d, 0x51, 0x8b, 0x85, 0xbd, 0xac, 0x6b, 0x19,
+ 0xbb, 0x2e, 0x99, 0x9e, 0x3c, 0xd9, 0xa5, 0x35, 0x3d, 0x90, 0x56, 0x59, 0x3d,
+ 0xda, 0xee, 0x7c, 0x3d, 0x63, 0x87, 0x1b, 0xbb, 0x12, 0x90, 0x39, 0xbd, 0x4b,
+ 0xb8, 0x39, 0x3d, 0x3f, 0x49, 0x94, 0xbc, 0xeb, 0x8f, 0x80, 0x3d, 0x8a, 0x9f,
+ 0x81, 0xbd, 0xdb, 0x11, 0x0c, 0x3d, 0x13, 0x28, 0x29, 0x3d, 0x70, 0x84, 0xfc,
+ 0xbc, 0x48, 0x74, 0x10, 0x3c, 0xcc, 0xb3, 0x30, 0xbd, 0x48, 0x07, 0x16, 0x3c,
+ 0x5d, 0x4f, 0x19, 0xbd, 0x2b, 0x80, 0xf7, 0xbb, 0x16, 0x87, 0x08, 0xbd, 0x07,
+ 0x00, 0x88, 0x3d, 0x12, 0x69, 0x44, 0x3d, 0x18, 0x31, 0x0d, 0x3c, 0x57, 0xd3,
+ 0x06, 0x3d, 0x24, 0x3d, 0x07, 0x3d, 0xcc, 0x07, 0x7f, 0x3d, 0xab, 0x2a, 0x79,
+ 0xbd, 0x7e, 0x3c, 0x79, 0xbd, 0xa9, 0x22, 0xfb, 0xbc, 0x3d, 0xa3, 0x3f, 0x3d,
+ 0x9b, 0x63, 0x40, 0x3c, 0x8f, 0xd5, 0x9b, 0x3c, 0x38, 0x24, 0x2b, 0x3d, 0x73,
+ 0x53, 0x02, 0x3d, 0xf4, 0xe3, 0xfb, 0x3c, 0xab, 0x4b, 0x81, 0x3d, 0x6c, 0x44,
+ 0x17, 0x3d, 0xe9, 0xbe, 0x8e, 0x3d, 0x79, 0xc1, 0x23, 0x3c, 0x19, 0xfd, 0x91,
+ 0x3c, 0xf9, 0xea, 0x83, 0x3c, 0x5a, 0xee, 0x86, 0x3c, 0xa7, 0x51, 0x2f, 0xbd,
+ 0x4a, 0xa1, 0x43, 0x3d, 0xf7, 0xc3, 0xdd, 0x3b, 0x41, 0x5d, 0x48, 0xbd, 0x91,
+ 0x94, 0x92, 0xbd, 0x76, 0xb0, 0x87, 0x3d, 0xad, 0x39, 0x8e, 0x3d, 0xa0, 0x5a,
+ 0xc3, 0xbb, 0x13, 0xd2, 0x42, 0xbd, 0x93, 0x32, 0x41, 0xbc, 0x02, 0x56, 0x91,
+ 0xbd, 0x6e, 0x37, 0x12, 0xbd, 0x70, 0x73, 0xe7, 0x3b, 0x85, 0xd7, 0x78, 0x3b,
+ 0xb0, 0xfb, 0x3f, 0xbd, 0x44, 0xb8, 0x2e, 0xbd, 0xcd, 0x1c, 0x92, 0xbd, 0x78,
+ 0xee, 0xe1, 0xbc, 0xb4, 0x56, 0x52, 0xbd, 0xa6, 0xbd, 0x62, 0x3d, 0xdc, 0x38,
+ 0xe8, 0xbc, 0x30, 0xaf, 0x68, 0x3c, 0xe0, 0x72, 0x05, 0xbc, 0x06, 0xad, 0xd5,
+ 0x3b, 0xd9, 0x62, 0x23, 0x3d, 0xf8, 0xa2, 0xee, 0xbc, 0x44, 0x13, 0x07, 0x3d,
+ 0x04, 0xcc, 0xf2, 0x3a, 0xce, 0x3f, 0x2c, 0x3d, 0x25, 0x8b, 0x28, 0x3c, 0x55,
+ 0xd2, 0x7a, 0xbc, 0x19, 0x6f, 0x83, 0x3d, 0x62, 0xaa, 0x32, 0xbd, 0xf2, 0x19,
+ 0x1c, 0xbc, 0x54, 0xc3, 0x8b, 0xbd, 0xdd, 0xeb, 0x52, 0x3c, 0x2a, 0xc7, 0x7c,
+ 0x3d, 0x04, 0xf0, 0xb9, 0x3b, 0xe8, 0x91, 0x84, 0x3d, 0x8d, 0xa2, 0xa3, 0x3c,
+ 0x01, 0xde, 0x7d, 0xbd, 0x14, 0xf3, 0x25, 0xbd, 0xde, 0x87, 0x8e, 0xbd, 0x6b,
+ 0x3b, 0x85, 0x3d, 0x02, 0x85, 0x84, 0xbd, 0x6b, 0x77, 0x6d, 0xbc, 0xb6, 0x9a,
+ 0x53, 0x3d, 0x0f, 0xb3, 0xaa, 0xbb, 0x13, 0x69, 0x55, 0xbd, 0x65, 0x98, 0x57,
+ 0xbd, 0xef, 0x9c, 0xb2, 0xbc, 0xd2, 0x02, 0xd4, 0x3c, 0x8e, 0xca, 0x27, 0x3d,
+ 0x64, 0xc8, 0x42, 0xbd, 0xca, 0x34, 0x39, 0xbd, 0xec, 0x45, 0x78, 0xbc, 0xe3,
+ 0xe3, 0x15, 0xbd, 0xad, 0x80, 0x30, 0x3d, 0xa3, 0xc8, 0x12, 0xbd, 0x11, 0x8e,
+ 0x40, 0x3d, 0x9a, 0x5f, 0x29, 0xbc, 0xbe, 0xc0, 0x8e, 0xbd, 0x2e, 0x01, 0x05,
+ 0xba, 0xde, 0x16, 0x2d, 0x3d, 0xce, 0xc7, 0x68, 0x3d, 0x08, 0x78, 0x4b, 0x3d,
+ 0xb9, 0xc7, 0x8f, 0xbd, 0x99, 0x7d, 0x71, 0x3d, 0x20, 0x52, 0x85, 0x3b, 0x8e,
+ 0x86, 0xcc, 0xbc, 0x18, 0x1e, 0x1e, 0x3d, 0x06, 0x84, 0x35, 0x3d, 0xd8, 0x65,
+ 0x71, 0xbd, 0xb1, 0x95, 0x1e, 0x3d, 0xa8, 0x12, 0x4f, 0x3d, 0xf0, 0x82, 0x6b,
+ 0x3c, 0x82, 0x05, 0x05, 0xbd, 0x78, 0x40, 0xef, 0x3c, 0xea, 0xf1, 0x91, 0xbd,
+ 0x06, 0x99, 0x82, 0x3d, 0x65, 0x80, 0x81, 0xbc, 0xc7, 0xd2, 0x98, 0xbc, 0x1b,
+ 0xab, 0x8c, 0x3b, 0x8d, 0xe6, 0xa2, 0x3c, 0x5a, 0xb0, 0xe8, 0xbc, 0x74, 0x5c,
+ 0x65, 0x3c, 0x53, 0x81, 0x88, 0x3d, 0x77, 0xe4, 0x83, 0xbd, 0x05, 0x68, 0x3f,
+ 0xbd, 0x7f, 0xa0, 0x34, 0xbd, 0x23, 0xc6, 0x57, 0xbd, 0xe8, 0x03, 0x4c, 0xbd,
+ 0xef, 0x5a, 0x91, 0x3c, 0x85, 0x78, 0x46, 0xbd, 0xc3, 0x5f, 0x2e, 0xbd, 0x38,
+ 0x74, 0x09, 0x3d, 0x71, 0x8d, 0x2a, 0xbd, 0x7c, 0xb3, 0x40, 0x3d, 0x26, 0xf6,
+ 0x72, 0xbd, 0x84, 0xfa, 0x4f, 0xbd, 0x34, 0x53, 0xa7, 0x3c, 0x2c, 0x63, 0x6f,
+ 0x3d, 0xe4, 0xa4, 0x29, 0xbd, 0x00, 0x17, 0x21, 0xbb, 0x82, 0x9e, 0x6f, 0x3d,
+ 0x8a, 0x61, 0x8d, 0xbd, 0xc4, 0xd7, 0x45, 0x3d, 0x20, 0x1a, 0xce, 0x3c, 0x86,
+ 0x39, 0x27, 0xbd, 0xf1, 0x45, 0x1f, 0xbd, 0xe0, 0x3e, 0xd4, 0x3c, 0x8a, 0x80,
+ 0x70, 0xbc, 0x80, 0xae, 0xd4, 0x3c, 0x04, 0x93, 0x0a, 0x3d, 0xff, 0x3c, 0x78,
+ 0x3d, 0x31, 0x0e, 0x48, 0x3c, 0x20, 0xa8, 0x89, 0xbd, 0x98, 0x75, 0x07, 0xbc,
+ 0x68, 0xa1, 0x71, 0x3d, 0xe0, 0xe8, 0x8e, 0xbc, 0xe9, 0x29, 0x19, 0x3d, 0x79,
+ 0x7c, 0x4f, 0xbc, 0x90, 0x98, 0xd5, 0x3c, 0x3b, 0xec, 0x1c, 0xbd, 0x36, 0x46,
+ 0x84, 0xb9, 0x18, 0x09, 0x8a, 0xbc, 0x84, 0xce, 0x0d, 0xbc, 0xb8, 0x2c, 0xa8,
+ 0x3c, 0x20, 0x84, 0x18, 0xbc, 0xa0, 0x54, 0x72, 0xbd, 0x5f, 0xd9, 0x82, 0xbd,
+ 0xe7, 0x32, 0x69, 0xbc, 0x58, 0xf3, 0x30, 0xbc, 0x12, 0xff, 0x89, 0x3b, 0x38,
+ 0xb3, 0x50, 0x3c, 0x5c, 0xf7, 0x48, 0x3c, 0x40, 0xb3, 0xb9, 0x3c, 0x08, 0x01,
+ 0x2b, 0x3d, 0xcb, 0x34, 0xc0, 0xbc, 0x9c, 0x64, 0x51, 0xbd, 0x58, 0x1a, 0x2f,
+ 0xbd, 0x4a, 0x45, 0x8a, 0xbc, 0x6a, 0x88, 0xe3, 0x3b, 0xf2, 0xe0, 0x74, 0x3d,
+ 0x08, 0xa7, 0x2d, 0xbd, 0x73, 0x61, 0x17, 0xbd, 0xf0, 0xee, 0xce, 0xbc, 0xda,
+ 0xbc, 0x20, 0xbd, 0x57, 0x27, 0xc6, 0x3c, 0x3c, 0xfc, 0xb2, 0x3d, 0xf9, 0x52,
+ 0x72, 0x3d, 0x98, 0x21, 0x23, 0x3a, 0x64, 0x0e, 0x39, 0xbd, 0x3c, 0x50, 0xff,
+ 0xbd, 0xf0, 0xb9, 0x36, 0xbd, 0xff, 0xe2, 0xa3, 0x3d, 0x1c, 0xad, 0x24, 0xbd,
+ 0x17, 0x26, 0x4b, 0x3d, 0x32, 0xdb, 0xca, 0x3b, 0xc6, 0x04, 0x3c, 0x3d, 0x3c,
+ 0x98, 0x9c, 0x3d, 0xd7, 0xd3, 0x80, 0xbc, 0x30, 0x4e, 0xd9, 0x3c, 0xff, 0xc1,
+ 0x21, 0x3d, 0x66, 0xcc, 0xa5, 0xbc, 0x61, 0x87, 0x98, 0x3d, 0x98, 0x20, 0x32,
+ 0x3d, 0xec, 0xf1, 0x87, 0xbd, 0x40, 0x73, 0xb9, 0xbd, 0xed, 0x67, 0x98, 0x3d,
+ 0x82, 0xde, 0x83, 0x3c, 0xef, 0xb3, 0xe9, 0x3c, 0xf6, 0xd1, 0x2f, 0x3d, 0xb6,
+ 0xa2, 0x6c, 0xbd, 0xfa, 0x55, 0x87, 0xbd, 0x5e, 0x0d, 0x4b, 0xbd, 0x52, 0x83,
+ 0x1b, 0x3d, 0x38, 0xa3, 0x32, 0xbd, 0x68, 0xa3, 0xd0, 0x3c, 0x6b, 0x9b, 0x0e,
+ 0xbd, 0xe8, 0x58, 0x83, 0x3b, 0xac, 0xf2, 0x1d, 0x3d, 0xdc, 0x01, 0xfe, 0xbb,
+ 0x45, 0xd1, 0x37, 0x3d, 0x7d, 0x74, 0x10, 0x3d, 0x39, 0x6f, 0x42, 0xbd, 0x1f,
+ 0x11, 0xd3, 0xbc, 0x58, 0x36, 0x98, 0x3d, 0xe6, 0x99, 0x19, 0xbd, 0x2e, 0x3f,
+ 0x44, 0x3c, 0x04, 0xd0, 0x08, 0xbd, 0x9e, 0x8c, 0x74, 0xbc, 0x73, 0x43, 0xeb,
+ 0xbc, 0xa2, 0x01, 0x9b, 0xbd, 0x30, 0x8a, 0x29, 0xbd, 0x4d, 0xe1, 0x50, 0xbd,
+ 0xc8, 0x2a, 0x1d, 0x3d, 0x2d, 0x12, 0x7d, 0x3d, 0xdd, 0x75, 0x24, 0xbc, 0xd7,
+ 0x2b, 0x48, 0x3c, 0x84, 0x77, 0xf0, 0x3c, 0xf8, 0x69, 0x8a, 0x3d, 0x0d, 0x62,
+ 0x23, 0x3d, 0x8d, 0x2a, 0x65, 0x3d, 0x33, 0xc6, 0xce, 0x3b, 0x34, 0xb9, 0x97,
+ 0x3b, 0xf3, 0x86, 0xe2, 0xbb, 0x5d, 0x2a, 0x53, 0xbd, 0xea, 0x2b, 0x9a, 0xba,
+ 0xbf, 0xd8, 0x91, 0xbc, 0x3d, 0x5f, 0xfa, 0xbc, 0x04, 0x71, 0x82, 0x3d, 0x02,
+ 0x09, 0xbe, 0x3d, 0xa2, 0xb3, 0xad, 0x3c, 0x6c, 0x47, 0x28, 0xbd, 0xce, 0xd6,
+ 0x16, 0xbd, 0x95, 0x44, 0xff, 0x3c, 0x6c, 0x62, 0x82, 0x3d, 0x2a, 0x15, 0xba,
+ 0xbc, 0xc1, 0xa7, 0x83, 0xbb, 0x69, 0x42, 0x7c, 0xbd, 0x03, 0x6e, 0x01, 0x3d,
+ 0xd9, 0x8c, 0x1b, 0xbd, 0xc7, 0x85, 0xdc, 0x3c, 0x76, 0x04, 0x4d, 0x3d, 0x99,
+ 0x3b, 0x69, 0x3c, 0xee, 0x8a, 0x6f, 0x3d, 0x2c, 0xb5, 0x34, 0xbd, 0x95, 0xc2,
+ 0x32, 0xbd, 0x34, 0x5b, 0x8a, 0x3c, 0x0d, 0x52, 0x44, 0xbb, 0xe8, 0xfd, 0xe3,
+ 0xbc, 0x6c, 0x8f, 0x6c, 0x3d, 0x22, 0xe9, 0xce, 0xbc, 0x38, 0x1d, 0xa4, 0x3d,
+ 0x37, 0xb9, 0xcc, 0xbb, 0x58, 0x8e, 0xbb, 0xbc, 0x13, 0x85, 0x8d, 0x3d, 0x7b,
+ 0x10, 0x9d, 0xbd, 0xb0, 0x74, 0x20, 0xbd, 0xbf, 0x6b, 0x24, 0xbc, 0x0b, 0xb2,
+ 0x6f, 0xbd, 0xbe, 0x9c, 0xae, 0x3d, 0x64, 0xfc, 0x34, 0x3d, 0x84, 0x44, 0x59,
+ 0x3b, 0xc5, 0x97, 0xb6, 0xbc, 0x25, 0x1b, 0x42, 0xbd, 0x1c, 0x64, 0x59, 0x3d,
+ 0x00, 0x12, 0x82, 0x3d, 0x64, 0xac, 0x91, 0x3b, 0x3b, 0xae, 0x6b, 0xbd, 0x18,
+ 0x6c, 0xd0, 0x3d, 0x9e, 0xea, 0x60, 0x3d, 0xf3, 0xf6, 0x49, 0xbd, 0xd3, 0xfc,
+ 0x5b, 0xbc, 0xe5, 0x37, 0x64, 0x3c, 0xbe, 0x33, 0x9c, 0xbc, 0x0e, 0x7a, 0x70,
+ 0xbd, 0xf7, 0x19, 0x32, 0xbd, 0x7a, 0x54, 0xac, 0xbd, 0x94, 0x9a, 0x45, 0xbc,
+ 0xb6, 0xa0, 0x55, 0x3d, 0x72, 0x8b, 0x81, 0x3d, 0xec, 0xf7, 0x1d, 0x3c, 0x7c,
+ 0xc0, 0x65, 0xbd, 0x21, 0x3d, 0xa8, 0x3d, 0xfe, 0x98, 0x91, 0xbc, 0xfc, 0x4e,
+ 0x99, 0xbd, 0xd5, 0x77, 0xa0, 0xbd, 0x9a, 0xec, 0x0b, 0x3d, 0xc2, 0xc5, 0x2e,
+ 0xbd, 0x58, 0x39, 0x9b, 0x3d, 0x1a, 0x19, 0x4e, 0xbd, 0x32, 0x1e, 0x11, 0xbd,
+ 0xe2, 0x81, 0x2f, 0xbd, 0x72, 0x93, 0x82, 0x3d, 0xb5, 0x33, 0x96, 0x3d, 0xfd,
+ 0x32, 0x31, 0xbd, 0xf0, 0x5e, 0x7b, 0xbd, 0x37, 0x76, 0x4d, 0xbd, 0x5e, 0xa1,
+ 0x9a, 0x3d, 0x58, 0xb2, 0x89, 0xbd, 0xc0, 0x61, 0x93, 0x3a, 0x12, 0xf4, 0x7a,
+ 0x3d, 0xad, 0xe5, 0x32, 0xba, 0xf3, 0xfe, 0x75, 0x3d, 0xbd, 0xec, 0x57, 0xbd,
+ 0x4d, 0x5b, 0x09, 0x3d, 0x27, 0x1d, 0x1b, 0xbd, 0x26, 0x5e, 0x77, 0xbc, 0x33,
+ 0xd7, 0x30, 0xbd, 0x93, 0xde, 0x6d, 0xbd, 0xfe, 0xdd, 0x6f, 0x3d, 0x07, 0x21,
+ 0xad, 0x3d, 0xb6, 0xfb, 0x77, 0x3d, 0xc7, 0xd4, 0x12, 0x3d, 0xee, 0xd1, 0x1a,
+ 0x3b, 0x57, 0x6a, 0xdf, 0xbc, 0x9a, 0x69, 0x98, 0xbd, 0x18, 0xb5, 0x8b, 0xbd,
+ 0x3f, 0x2a, 0x1b, 0xbc, 0xba, 0x61, 0x4e, 0x3d, 0xf7, 0xfc, 0x15, 0x3d, 0x15,
+ 0x6a, 0x89, 0x3d, 0x0c, 0x26, 0x12, 0xbd, 0x3c, 0x56, 0x75, 0x3d, 0x31, 0x95,
+ 0x49, 0x3c, 0x80, 0x89, 0x27, 0xbd, 0xc5, 0xc8, 0x2d, 0xba, 0xd4, 0xb2, 0x99,
+ 0x3d, 0xbd, 0xfe, 0x19, 0xbd, 0x88, 0x62, 0x88, 0x3d, 0x1a, 0xea, 0xb6, 0x3d,
+ 0x06, 0xc5, 0x95, 0xbd, 0xbe, 0x0c, 0x2d, 0xbd, 0x09, 0x1b, 0x59, 0x3d, 0xf7,
+ 0xd4, 0xbe, 0xba, 0x23, 0x7e, 0x0d, 0xbd, 0x3f, 0x6a, 0x9f, 0x3c, 0x29, 0x6c,
+ 0x86, 0x3c, 0x50, 0x53, 0xad, 0xbc, 0x4d, 0x7e, 0xd5, 0xbd, 0xd2, 0xac, 0x6b,
+ 0x3d, 0xfd, 0xc0, 0x8d, 0xbd, 0x96, 0xc2, 0x3f, 0x3d, 0xc7, 0x50, 0x9d, 0xbc,
+ 0xf8, 0x74, 0xa7, 0xbc, 0x20, 0xcb, 0xbe, 0xbd, 0x39, 0xaa, 0x5d, 0x3d, 0x53,
+ 0x49, 0x99, 0xbc, 0xfe, 0x92, 0xca, 0xbd, 0xf2, 0x46, 0x75, 0xbd, 0x71, 0xfe,
+ 0x6e, 0xbd, 0x9f, 0x2f, 0x59, 0xbd, 0x0b, 0xe7, 0x3f, 0xbc, 0xad, 0x3f, 0x80,
+ 0x3d, 0xec, 0x4d, 0x81, 0xbd, 0x53, 0x8f, 0x8a, 0x3d, 0xfb, 0x2c, 0x54, 0x3d,
+ 0x20, 0x2c, 0x57, 0xbd, 0xc1, 0xeb, 0xe2, 0xba, 0x98, 0xed, 0x46, 0x3d, 0x6a,
+ 0x20, 0xc1, 0x3c, 0x54, 0x95, 0x2c, 0xbd, 0xac, 0xc1, 0x2b, 0x3c, 0x29, 0x2a,
+ 0xf8, 0xbd, 0x4e, 0x69, 0x7f, 0x3d, 0x17, 0x04, 0x29, 0xbd, 0xf2, 0xbb, 0xeb,
+ 0xbb, 0xf1, 0x49, 0x40, 0x3d, 0x00, 0x69, 0x01, 0x3d, 0x8d, 0x53, 0x64, 0x3d,
+ 0xb7, 0x21, 0x0b, 0xbd, 0x43, 0xc5, 0xc7, 0xbd, 0x1b, 0xa3, 0x48, 0x3d, 0xcb,
+ 0x7c, 0x09, 0xbd, 0x20, 0xcb, 0x6e, 0xbb, 0x94, 0x3f, 0x2e, 0x3d, 0xf7, 0x32,
+ 0x72, 0xbd, 0x9a, 0x1e, 0x40, 0xbd, 0x5b, 0xf3, 0x47, 0x3d, 0x02, 0xea, 0x77,
+ 0xba, 0x63, 0xf3, 0xe8, 0x3c, 0xac, 0x35, 0x06, 0xbd, 0xbd, 0x03, 0x4c, 0xbd,
+ 0x11, 0xf6, 0x92, 0x3d, 0x1b, 0x1a, 0x64, 0x3d, 0x51, 0x88, 0x58, 0xbc, 0x61,
+ 0xbf, 0x83, 0xbd, 0xdd, 0x44, 0x73, 0xbd, 0xe7, 0xe5, 0xd0, 0x3c, 0xc9, 0x5f,
+ 0x87, 0x3d, 0xec, 0x20, 0xbe, 0x3d, 0xd9, 0x21, 0x0f, 0x3d, 0xf9, 0xdd, 0xe7,
+ 0xbc, 0xf3, 0x32, 0x91, 0xbd, 0x71, 0xb6, 0x4a, 0x3d, 0x29, 0x35, 0x86, 0x3d,
+ 0xba, 0xf4, 0x40, 0xbd, 0x1c, 0x2b, 0x17, 0xbd, 0x70, 0xfb, 0x3c, 0xbd, 0xed,
+ 0x3e, 0xdf, 0xbc, 0x60, 0xf1, 0x3d, 0x3d, 0x53, 0x6e, 0x87, 0xbd, 0x0f, 0x52,
+ 0x3d, 0x3d, 0x58, 0xd1, 0x47, 0xbd, 0xab, 0x7f, 0xc3, 0x3c, 0x3d, 0x5d, 0xa8,
+ 0xbd, 0xe9, 0x7f, 0x11, 0xbd, 0x88, 0x93, 0x50, 0xbd, 0xf2, 0xd2, 0x0f, 0x3d,
+ 0x24, 0x59, 0x90, 0x3a, 0x99, 0x86, 0x8b, 0xbd, 0x27, 0x21, 0x5f, 0xbd, 0xf4,
+ 0xa1, 0x80, 0x3d, 0x0b, 0xbb, 0x89, 0x3c, 0xbc, 0xda, 0x79, 0x3d, 0xe8, 0x9b,
+ 0x56, 0xbc, 0x42, 0xca, 0xf1, 0x3c, 0x74, 0xe2, 0x86, 0x3c, 0xe4, 0x85, 0x0f,
+ 0x3d, 0x07, 0x57, 0x2e, 0x3d, 0x41, 0x24, 0x85, 0x3d, 0x48, 0x7e, 0x08, 0xbd,
+ 0x91, 0xa8, 0xdd, 0x3c, 0x8c, 0xe1, 0xb7, 0xbc, 0x04, 0xae, 0x2f, 0x3d, 0xe4,
+ 0x63, 0xa2, 0x3c, 0x6e, 0x28, 0x06, 0xbc, 0x8d, 0xd9, 0x67, 0xbd, 0x88, 0x14,
+ 0x43, 0x3d, 0xe5, 0x9a, 0xde, 0x3c, 0x45, 0x3e, 0x9d, 0x3d, 0x03, 0x22, 0xcb,
+ 0xbc, 0x71, 0x92, 0x7c, 0x3d, 0xf7, 0xc6, 0x0d, 0x3d, 0xfb, 0x47, 0xa4, 0x3d,
+ 0x45, 0x18, 0x91, 0xbd, 0xda, 0x0b, 0x79, 0xbc, 0x18, 0x17, 0x71, 0xbd, 0xa2,
+ 0x74, 0x4e, 0xbd, 0xd7, 0xdb, 0x46, 0x3d, 0x35, 0x53, 0xbb, 0x3c, 0x0c, 0x62,
+ 0x0f, 0xbc, 0xe9, 0x2d, 0xdf, 0xbd, 0x33, 0xc7, 0x60, 0x3c, 0x18, 0x74, 0xa8,
+ 0x3c, 0xa3, 0x75, 0x87, 0xbd, 0x7b, 0x58, 0xf3, 0xbd, 0x30, 0xcd, 0xfa, 0x3c,
+ 0x35, 0xbd, 0x9c, 0xbd, 0x93, 0xcf, 0xdb, 0xbc, 0xc2, 0x35, 0xd9, 0xbc, 0x5e,
+ 0x5a, 0x06, 0x3d, 0x3d, 0x8b, 0x39, 0xbd, 0xb7, 0x5d, 0x33, 0xbc, 0x50, 0xca,
+ 0xb8, 0x3c, 0x8b, 0x71, 0xfb, 0x3c, 0x80, 0x8e, 0x2a, 0x3d, 0xa0, 0x72, 0x80,
+ 0xbc, 0x08, 0x4a, 0x00, 0xbd, 0x9b, 0x6f, 0xd2, 0x3b, 0xda, 0x83, 0xf9, 0xbc,
+ 0xed, 0x0c, 0x0b, 0x3c, 0x5d, 0x80, 0x40, 0xbc, 0x84, 0x40, 0x25, 0xbd, 0x52,
+ 0x1e, 0x03, 0x3d, 0x53, 0xd4, 0x54, 0x3c, 0x0b, 0x6b, 0xda, 0x3c, 0xcc, 0x67,
+ 0x17, 0x3b, 0x58, 0x05, 0xe5, 0xba, 0x63, 0x8d, 0x95, 0x3c, 0xc6, 0xa5, 0x5a,
+ 0x3d, 0xdf, 0x29, 0x23, 0xbd, 0x4b, 0x72, 0x9b, 0x3d, 0xef, 0x78, 0x4b, 0xbd,
+ 0xa5, 0x08, 0xb7, 0xbd, 0x9c, 0xb5, 0x78, 0xbc, 0xdf, 0x0c, 0x88, 0x3d, 0x07,
+ 0xab, 0x19, 0x3d, 0xdc, 0xad, 0xc9, 0xbd, 0x5e, 0x37, 0x4f, 0x3d, 0xe6, 0x99,
+ 0x77, 0xbd, 0x12, 0x5f, 0x48, 0xbc, 0x89, 0x82, 0xf2, 0x3b, 0x86, 0x89, 0x44,
+ 0x3c, 0x66, 0x1b, 0xb7, 0xbc, 0x2f, 0x07, 0xd0, 0x3b, 0xb5, 0x85, 0x76, 0xb9,
+ 0xb2, 0xc4, 0x11, 0xbd, 0x5b, 0x02, 0x30, 0xbd, 0xed, 0xed, 0xee, 0x3c, 0x77,
+ 0xbd, 0x24, 0xbb, 0x36, 0xe9, 0x97, 0xbd, 0x2a, 0xe1, 0x6d, 0x3d, 0x75, 0x29,
+ 0xaf, 0x3d, 0xff, 0x38, 0xac, 0xbb, 0x76, 0x6d, 0xe4, 0xbc, 0xf8, 0x03, 0x15,
+ 0xbd, 0x6f, 0x3d, 0x9a, 0xbc, 0x6b, 0x64, 0x1f, 0x3d, 0xa6, 0x7c, 0x6f, 0xbd,
+ 0xa7, 0x60, 0x83, 0x3c, 0xe1, 0xa5, 0x53, 0xbd, 0x04, 0x4f, 0xb6, 0xbc, 0xe7,
+ 0x0b, 0x28, 0x3d, 0x4c, 0x15, 0xa9, 0xbc, 0x68, 0x90, 0x73, 0xbb, 0x77, 0x3e,
+ 0x8e, 0x3c, 0xdd, 0x42, 0x0c, 0xbd, 0x07, 0x7d, 0x22, 0xbd, 0x35, 0x15, 0x82,
+ 0xbd, 0xed, 0x56, 0xe0, 0x3c, 0xfa, 0x8d, 0x7e, 0x3d, 0xab, 0xb5, 0x85, 0xbd,
+ 0x8c, 0x4b, 0xa4, 0xbc, 0xe5, 0xee, 0x53, 0xbc, 0x9e, 0x26, 0x4f, 0xbd, 0xaa,
+ 0xdf, 0x63, 0xbd, 0xd2, 0x48, 0x11, 0x3c, 0xd6, 0x9c, 0x58, 0x3d, 0xa9, 0x90,
+ 0x00, 0x3d, 0x9b, 0xfa, 0x8c, 0x3b, 0x2a, 0x97, 0x1d, 0x3d, 0x37, 0xe9, 0x3e,
+ 0xbd, 0x51, 0xd8, 0xf0, 0xbd, 0x92, 0x65, 0x2b, 0xbd, 0x06, 0x73, 0x21, 0x3c,
+ 0x85, 0x89, 0xad, 0x3d, 0x50, 0x07, 0x60, 0x3d, 0x01, 0x61, 0x9a, 0x3d, 0xcf,
+ 0xba, 0x9c, 0x3d, 0x7c, 0x6f, 0x69, 0x3d, 0x20, 0x79, 0x71, 0xbd, 0xc8, 0x59,
+ 0xd1, 0xbc, 0x2f, 0x68, 0x1e, 0xbd, 0xb2, 0xed, 0x87, 0xbd, 0x3e, 0xe7, 0xa0,
+ 0xba, 0xb1, 0xf0, 0xd0, 0x3c, 0x1c, 0xf1, 0xdd, 0xbc, 0xb0, 0x4a, 0x83, 0xbb,
+ 0xb5, 0x00, 0x55, 0xbc, 0xc6, 0x63, 0x0b, 0x3d, 0xa8, 0x88, 0x2f, 0x3d, 0x3c,
+ 0x6e, 0xd7, 0x3c, 0x68, 0x1d, 0x14, 0xbc, 0xac, 0xd1, 0x37, 0x3d, 0x7f, 0xb7,
+ 0x66, 0x3d, 0xca, 0xd0, 0xc7, 0xbb, 0x72, 0x5a, 0x91, 0x3d, 0x64, 0x09, 0xaf,
+ 0x3c, 0xea, 0x7a, 0x0d, 0xbb, 0x87, 0xd8, 0x4f, 0xbb, 0x88, 0xdf, 0xa5, 0x3c,
+ 0x1a, 0xd5, 0x73, 0xbc, 0x55, 0x5b, 0xce, 0x3a, 0xff, 0x62, 0x16, 0x3d, 0xb9,
+ 0x06, 0xa8, 0xbd, 0xbc, 0x96, 0xc0, 0xbc, 0x77, 0x06, 0x17, 0xbc, 0xe9, 0xdf,
+ 0x7e, 0xba, 0x94, 0x5f, 0xcd, 0x3b, 0x7b, 0x66, 0xf2, 0xbc, 0xc3, 0xdf, 0x7d,
+ 0xbd, 0x9c, 0x07, 0x0e, 0xbd, 0xaa, 0x4e, 0x0a, 0xbd, 0x42, 0x2d, 0x7f, 0x3c,
+ 0x6f, 0x45, 0xb9, 0x3c, 0x6a, 0xf4, 0x2c, 0xbd, 0x66, 0x01, 0x23, 0xbd, 0x5a,
+ 0x2e, 0x12, 0xbc, 0x00, 0x0c, 0xc4, 0xbd, 0x56, 0xf3, 0xd9, 0xbc, 0x57, 0x20,
+ 0x14, 0xbd, 0x8f, 0xae, 0xbd, 0x3c, 0x0a, 0x85, 0xbb, 0xbd, 0x51, 0x63, 0x28,
+ 0xbd, 0xc3, 0x45, 0x19, 0xbd, 0x1a, 0xc0, 0x66, 0x3d, 0x58, 0xac, 0x77, 0xbd,
+ 0x2e, 0xb6, 0xdc, 0xbc, 0xaa, 0x45, 0xe6, 0xbc, 0x06, 0xba, 0x43, 0xbd, 0x71,
+ 0x36, 0xac, 0x3d, 0xf5, 0xcb, 0x96, 0x3d, 0x5b, 0x32, 0x58, 0xba, 0x6a, 0xe8,
+ 0xe0, 0xb9, 0x39, 0xb6, 0xbe, 0x3c, 0x56, 0xcc, 0xc5, 0x3b, 0x6b, 0xde, 0xad,
+ 0xbc, 0x6c, 0xd9, 0xf4, 0xbc, 0xb2, 0xe9, 0x43, 0x3d, 0xf9, 0xd2, 0x1b, 0xbc,
+ 0xb1, 0x0f, 0x19, 0x3d, 0xb3, 0xe0, 0x05, 0x3b, 0xdd, 0x85, 0xa8, 0x3d, 0x92,
+ 0x70, 0xc0, 0xbc, 0xaf, 0xa0, 0x22, 0xbd, 0x9f, 0x05, 0x33, 0xbd, 0x4a, 0xe4,
+ 0xa8, 0x3c, 0x80, 0xf3, 0xc9, 0xba, 0x9f, 0x4c, 0x31, 0xbd, 0x5e, 0x75, 0xa4,
+ 0xbc, 0x4e, 0xa3, 0x73, 0xbd, 0x32, 0x14, 0x96, 0xbd, 0xf1, 0xc8, 0xb1, 0x3c,
+ 0xa6, 0x72, 0x15, 0xbd, 0x06, 0xbc, 0x4c, 0x3d, 0xd6, 0x84, 0x96, 0x3b, 0xbd,
+ 0x95, 0x27, 0x3d, 0x89, 0x66, 0xd8, 0x3c, 0x14, 0xc8, 0xf8, 0xbc, 0x48, 0xc6,
+ 0x2a, 0x3d, 0x68, 0x7c, 0xa4, 0x3d, 0x0b, 0xfe, 0x48, 0x3d, 0x03, 0x4e, 0xa0,
+ 0x3c, 0x14, 0xeb, 0x9e, 0x3d, 0x54, 0x79, 0x17, 0xbd, 0x8d, 0xe5, 0x44, 0x3c,
+ 0x89, 0xb2, 0x14, 0xbc, 0x37, 0x64, 0x98, 0x3d, 0xd5, 0x7d, 0x54, 0xbd, 0x82,
+ 0x97, 0x92, 0xbd, 0x97, 0x4c, 0x7c, 0x3b, 0xf8, 0x3f, 0x2b, 0x3d, 0xa2, 0x52,
+ 0xc8, 0x3c, 0x67, 0x7b, 0x49, 0xbd, 0x8b, 0xdc, 0x84, 0xbc, 0xfc, 0xd2, 0x1c,
+ 0xbd, 0x50, 0x53, 0x8d, 0xbb, 0xa7, 0x93, 0xfe, 0xbc, 0xab, 0xb3, 0xff, 0xbc,
+ 0xb0, 0x0d, 0x12, 0x3c, 0x90, 0xde, 0x69, 0x3d, 0x19, 0x4a, 0x31, 0x3d, 0xba,
+ 0x86, 0xbe, 0xbd, 0xf0, 0xd1, 0x6f, 0xbd, 0x2a, 0x37, 0xa2, 0x3c, 0xba, 0x72,
+ 0x91, 0xbc, 0x69, 0xfe, 0x8f, 0xbb, 0xb4, 0xe0, 0x26, 0x3d, 0x9e, 0x8e, 0x6f,
+ 0x3d, 0x28, 0x1c, 0xa4, 0xbc, 0xeb, 0x11, 0x0b, 0x3d, 0xd3, 0x1a, 0x27, 0x3c,
+ 0x89, 0x93, 0xa3, 0x3d, 0x22, 0xbf, 0x46, 0x3d, 0xe2, 0x27, 0xe5, 0xbc, 0xa1,
+ 0x10, 0x8a, 0xbc, 0xe9, 0x93, 0x65, 0xbd, 0xef, 0x81, 0xce, 0x3c, 0x0c, 0x10,
+ 0x44, 0x3c, 0xdc, 0x0d, 0x15, 0xbd, 0x8d, 0x3b, 0x09, 0x3d, 0xc2, 0xe2, 0x35,
+ 0xbd, 0xc3, 0xde, 0x09, 0x3c, 0x68, 0xc5, 0x8f, 0x3d, 0xa2, 0xb3, 0x38, 0x3d,
+ 0x94, 0xa6, 0x66, 0x3c, 0x5f, 0x15, 0x79, 0x3d, 0x74, 0x80, 0x7e, 0x3d, 0x00,
+ 0xb6, 0xb0, 0xbb, 0xdb, 0xb6, 0x98, 0xbb, 0x8c, 0x1a, 0xb7, 0xbc, 0xa0, 0xf9,
+ 0x7e, 0x3c, 0x66, 0x95, 0x47, 0x3d, 0xca, 0x33, 0xf0, 0xbc, 0xde, 0x00, 0xfa,
+ 0x3b, 0x57, 0x05, 0xfb, 0xbb, 0xfc, 0x7f, 0xcb, 0xbc, 0x31, 0x1c, 0x11, 0x3d,
+ 0x16, 0xe4, 0xfd, 0x3b, 0x3d, 0xd5, 0xb5, 0x3c, 0x8c, 0xd4, 0x69, 0xbd, 0x40,
+ 0x7f, 0x87, 0xbb, 0x26, 0x9d, 0x77, 0xbc, 0x6b, 0xa7, 0xde, 0x3c, 0xf4, 0xd2,
+ 0x00, 0x3c, 0xff, 0x0d, 0xbc, 0x3c, 0xab, 0xfb, 0x6f, 0x3d, 0x5a, 0x15, 0x8b,
+ 0x3b, 0x05, 0x27, 0x77, 0x3d, 0xd8, 0xa8, 0x54, 0x3d, 0xa7, 0xf2, 0x01, 0x3d,
+ 0x20, 0x41, 0x70, 0x3c, 0x19, 0x99, 0xfd, 0xbc, 0xc0, 0xea, 0x48, 0x3d, 0xd7,
+ 0x09, 0x26, 0x3b, 0x79, 0x58, 0x6b, 0x3d, 0x2b, 0x43, 0x2e, 0xbd, 0x58, 0x06,
+ 0x76, 0x3c, 0xc3, 0x4a, 0x8c, 0x3d, 0x4b, 0x5b, 0x62, 0x3d, 0xb2, 0xff, 0x1f,
+ 0xbd, 0xeb, 0x73, 0x08, 0x3d, 0x39, 0xd4, 0x77, 0xbd, 0xfc, 0x94, 0x83, 0xbc,
+ 0x0e, 0x0d, 0x6c, 0x3d, 0x5c, 0x29, 0x73, 0x3d, 0x96, 0xc4, 0x92, 0xba, 0x00,
+ 0x64, 0x97, 0xbd, 0x3b, 0x52, 0x3a, 0xbd, 0x3a, 0x2d, 0x91, 0xbd, 0x62, 0x65,
+ 0x97, 0xbd, 0x72, 0xde, 0xd2, 0xbd, 0x1d, 0x30, 0x00, 0xbd, 0x74, 0x93, 0x95,
+ 0xbd, 0xae, 0x2c, 0xd7, 0xbc, 0xe3, 0xae, 0x27, 0x3d, 0x67, 0x7f, 0x0b, 0x3c,
+ 0xfc, 0xcf, 0x74, 0xbc, 0x7f, 0x2b, 0x74, 0x3d, 0x00, 0x49, 0xa2, 0xba, 0x13,
+ 0xfa, 0x0e, 0xbd, 0x7e, 0xfe, 0x9f, 0xbc, 0xa6, 0x05, 0xc7, 0xbb, 0xc2, 0xa7,
+ 0x2a, 0xbc, 0xb3, 0x63, 0x9b, 0x3a, 0x9c, 0x14, 0x0e, 0x3d, 0x82, 0xc6, 0xb0,
+ 0xbc, 0xc1, 0x25, 0xc0, 0x3c, 0x03, 0x95, 0x45, 0xbd, 0x61, 0xb6, 0x50, 0xbd,
+ 0xf8, 0x77, 0xea, 0x3a, 0x9d, 0xa7, 0xaa, 0x3a, 0xf2, 0x18, 0x1d, 0xbd, 0x42,
+ 0x15, 0x94, 0x3d, 0x7e, 0x0e, 0x47, 0xbd, 0xa5, 0x82, 0x84, 0x3d, 0xed, 0xbe,
+ 0x3b, 0x3d, 0x3b, 0xdc, 0x2e, 0xbd, 0x5c, 0x8c, 0x4b, 0xbd, 0x37, 0xbc, 0x99,
+ 0xbb, 0xb7, 0x55, 0x54, 0x3d, 0x8e, 0x6d, 0xa8, 0xbd, 0x09, 0x3c, 0x3f, 0x3d,
+ 0x83, 0x0e, 0x3a, 0xbd, 0x8f, 0x1f, 0x91, 0x3d, 0x8b, 0x2b, 0x33, 0xbd, 0x92,
+ 0x57, 0x58, 0x3d, 0x71, 0xcd, 0x27, 0xbd, 0xcf, 0x53, 0x30, 0x3d, 0x20, 0x81,
+ 0x64, 0x3d, 0x50, 0x82, 0x60, 0xbd, 0x98, 0x46, 0x2f, 0x3d, 0x32, 0x95, 0x28,
+ 0xbd, 0x70, 0xf5, 0x71, 0x3c, 0x9d, 0x96, 0xb0, 0xbc, 0x5b, 0x59, 0x56, 0xbd,
+ 0x10, 0x59, 0x90, 0x3d, 0xc0, 0x1e, 0xbb, 0x3c, 0x5c, 0x37, 0x9d, 0x3d, 0xbd,
+ 0x75, 0x61, 0x3d, 0xcf, 0x8b, 0x84, 0xbc, 0xb2, 0x23, 0x46, 0x3d, 0x0a, 0x82,
+ 0x02, 0x3d, 0xaf, 0xd4, 0x8e, 0xbb, 0x60, 0x87, 0xca, 0x3c, 0xdb, 0x73, 0x1a,
+ 0xbd, 0x52, 0xa2, 0x09, 0x3d, 0xa2, 0x5b, 0x4a, 0xbd, 0x1d, 0x5d, 0xa0, 0xbb,
+ 0x30, 0x20, 0x7e, 0xbd, 0x84, 0x2a, 0x78, 0xbd, 0x74, 0x5f, 0x6a, 0xbd, 0xa5,
+ 0x1a, 0xa5, 0xbd, 0xa8, 0x46, 0x92, 0x3c, 0xe5, 0x7e, 0x50, 0xbd, 0xc1, 0x19,
+ 0x4b, 0x3c, 0x1a, 0x20, 0x71, 0x3d, 0xa1, 0xa7, 0x48, 0xbc, 0xc3, 0xa7, 0xeb,
+ 0x3c, 0xd4, 0x58, 0x6c, 0xbd, 0x06, 0x40, 0x08, 0x3d, 0x07, 0x97, 0x93, 0x3d,
+ 0x36, 0xb8, 0x5c, 0xbd, 0x69, 0x31, 0xc4, 0x3d, 0x5d, 0x20, 0x62, 0xbc, 0x73,
+ 0x3a, 0xbf, 0xbc, 0xea, 0xff, 0x3f, 0x3d, 0x39, 0x07, 0xec, 0x3c, 0xeb, 0x30,
+ 0xb4, 0xbb, 0x0b, 0x38, 0x72, 0xbd, 0x12, 0x71, 0xfd, 0xbc, 0xc5, 0x09, 0x82,
+ 0x3b, 0x5d, 0x51, 0x84, 0xbd, 0xff, 0x16, 0x49, 0xbd, 0x5e, 0xd1, 0x13, 0xbd,
+ 0xd8, 0xaf, 0x96, 0x3c, 0xea, 0x7c, 0x7e, 0xbd, 0x9b, 0x71, 0x1c, 0x3d, 0xe0,
+ 0xff, 0xaf, 0xbc, 0xac, 0x24, 0x57, 0x3d, 0x8a, 0xf8, 0x49, 0x3d, 0x24, 0xfd,
+ 0xbc, 0xbc, 0x46, 0x2c, 0xac, 0xbd, 0xc8, 0xdf, 0x63, 0xbc, 0x61, 0xc6, 0x2e,
+ 0xbd, 0x9d, 0xec, 0xd9, 0xbc, 0xb1, 0x44, 0x86, 0xbd, 0x85, 0x38, 0x47, 0x3d,
+ 0x7b, 0x49, 0x5a, 0xbd, 0xb0, 0x9c, 0xee, 0xbc, 0x03, 0x6f, 0x33, 0xbd, 0x55,
+ 0x8c, 0x23, 0xbc, 0xd5, 0xcc, 0x82, 0xbc, 0x82, 0xc2, 0xcc, 0xbc, 0xac, 0x00,
+ 0x85, 0x3c, 0xf6, 0xf5, 0x70, 0x3d, 0xb0, 0x0f, 0x03, 0x37, 0xa3, 0xfd, 0x5a,
+ 0xbd, 0x13, 0x57, 0x38, 0x3c, 0x25, 0xe4, 0xea, 0xbc, 0x1a, 0xb8, 0x0e, 0x3c,
+ 0x80, 0x95, 0x20, 0xbb, 0x84, 0x35, 0x36, 0x3d, 0x27, 0x0c, 0x1f, 0xbd, 0x4e,
+ 0x46, 0x8d, 0x3d, 0xa4, 0xb0, 0xef, 0x3c, 0xe1, 0xf5, 0xce, 0xbc, 0x34, 0x54,
+ 0x9d, 0xbc, 0x9f, 0x03, 0xd9, 0x3b, 0x22, 0xe9, 0xed, 0xbc, 0xd3, 0x7d, 0x30,
+ 0xbd, 0xb8, 0x86, 0x1f, 0xbc, 0xed, 0xc3, 0x44, 0x3d, 0xbf, 0x32, 0xa1, 0x39,
+ 0x74, 0xe5, 0x38, 0xbd, 0xa3, 0xe4, 0x6c, 0xbd, 0x56, 0x19, 0x33, 0xbd, 0x17,
+ 0x60, 0xbd, 0xbc, 0xd5, 0xec, 0x4a, 0x3c, 0xa2, 0x27, 0xa4, 0x3d, 0x50, 0xea,
+ 0x77, 0xbd, 0x5a, 0xb3, 0x91, 0x39, 0xf3, 0xc2, 0x19, 0x3d, 0xd2, 0xb9, 0x4f,
+ 0xbd, 0x60, 0x90, 0x81, 0x3d, 0xbf, 0x14, 0x60, 0xbd, 0x7a, 0xdd, 0x62, 0x3c,
+ 0x43, 0x4c, 0xa5, 0xbb, 0xad, 0x1c, 0xe1, 0xbc, 0xc8, 0x0b, 0x15, 0x3d, 0xe1,
+ 0xbd, 0x0f, 0x3d, 0xc6, 0x1f, 0x92, 0x3d, 0xdf, 0x9a, 0x86, 0xbd, 0x08, 0x1a,
+ 0xed, 0x3c, 0xfa, 0x1f, 0x00, 0x3c, 0x90, 0x94, 0x1b, 0x3d, 0x4a, 0x1c, 0x25,
+ 0xbd, 0x79, 0xe4, 0xff, 0xbc, 0xdf, 0xeb, 0x91, 0x3d, 0x43, 0x22, 0x81, 0x3d,
+ 0x1f, 0x1c, 0xa2, 0xbd, 0x54, 0xaf, 0x48, 0xbd, 0xbb, 0x7d, 0x4a, 0x3c, 0x32,
+ 0xcd, 0x6a, 0x3d, 0xc0, 0x75, 0x8b, 0x3d, 0x9a, 0xad, 0x67, 0x3c, 0xd1, 0xe6,
+ 0x30, 0xbd, 0x85, 0x2b, 0x33, 0x3c, 0xee, 0x90, 0x69, 0x3b, 0x7b, 0xdc, 0x96,
+ 0xbd, 0x38, 0x29, 0xad, 0x3b, 0xd8, 0x2b, 0xff, 0xbb, 0x72, 0x62, 0x57, 0x3c,
+ 0x55, 0x29, 0x86, 0x3d, 0xc7, 0x7c, 0x90, 0xbd, 0xfa, 0xa6, 0x71, 0xbd, 0x7f,
+ 0x51, 0x15, 0x3c, 0x7a, 0x11, 0x61, 0xbd, 0xd8, 0xd1, 0x64, 0x3b, 0xbc, 0x7e,
+ 0x8e, 0x3c, 0x06, 0x60, 0xe6, 0x3b, 0x1a, 0xd8, 0x43, 0x3d, 0x9b, 0xa8, 0x99,
+ 0xbd, 0x30, 0x98, 0x17, 0x3d, 0x82, 0xd8, 0x7a, 0xbd, 0xca, 0x23, 0x14, 0x3d,
+ 0x45, 0x6d, 0x18, 0xbd, 0x0d, 0x33, 0x8d, 0x3c, 0xd9, 0x88, 0xb5, 0xbc, 0x9c,
+ 0x01, 0xc6, 0x3b, 0xc2, 0x52, 0xe5, 0x3c, 0xc6, 0xbf, 0x5a, 0x3d, 0xa8, 0x06,
+ 0x1f, 0xbd, 0x1f, 0xaf, 0x4e, 0x3d, 0x84, 0x35, 0xca, 0xbd, 0x50, 0xc8, 0xee,
+ 0x3c, 0x64, 0xe8, 0x35, 0xbd, 0xbc, 0x23, 0x31, 0x3d, 0x36, 0x1d, 0xbf, 0xbd,
+ 0x7c, 0x88, 0x94, 0xbc, 0x0f, 0x8f, 0x1b, 0x3d, 0x08, 0x54, 0x81, 0x3c, 0x12,
+ 0x2f, 0x8a, 0xbd, 0xd7, 0x70, 0x3c, 0xbc, 0xb8, 0x2a, 0x50, 0x3d, 0xc8, 0xed,
+ 0x0e, 0xbd, 0xb7, 0xa3, 0x54, 0x3d, 0xc9, 0x64, 0x6c, 0xbc, 0x89, 0x83, 0x25,
+ 0xbd, 0xef, 0x72, 0x3b, 0x3b, 0xeb, 0xf8, 0xec, 0x3b, 0xe6, 0x5e, 0x0b, 0xbc,
+ 0xd4, 0xc0, 0xf5, 0xbc, 0x8a, 0x04, 0x92, 0x3d, 0xe8, 0x04, 0x39, 0xbd, 0x0f,
+ 0x74, 0xea, 0x3c, 0xfc, 0x8b, 0x01, 0xbc, 0xb2, 0xe0, 0x73, 0x3d, 0xc8, 0xa1,
+ 0xea, 0x3c, 0x99, 0xfe, 0x4f, 0x3d, 0xde, 0x4f, 0x36, 0xbd, 0x73, 0xe5, 0x76,
+ 0xbd, 0x8b, 0xd2, 0xdb, 0x3b, 0x96, 0x72, 0x79, 0x3c, 0xd0, 0x9b, 0x14, 0x3d,
+ 0x3d, 0x6f, 0x6a, 0x3d, 0x21, 0x55, 0x16, 0x3d, 0xeb, 0x2a, 0x91, 0x3d, 0x8c,
+ 0xd0, 0x33, 0xbd, 0x45, 0xdd, 0x54, 0xbd, 0x7e, 0x94, 0x90, 0xbc, 0xd4, 0x4c,
+ 0x8b, 0x3c, 0x4a, 0x6b, 0x19, 0x3d, 0x9e, 0x42, 0xeb, 0x3c, 0x7d, 0xf2, 0x4f,
+ 0x3d, 0x17, 0x4f, 0xab, 0x3c, 0x28, 0x37, 0xa1, 0x3c, 0x6d, 0xb8, 0x88, 0xbd,
+ 0xc1, 0xe3, 0x1e, 0xbd, 0x8f, 0x8c, 0x60, 0x3d, 0xe9, 0x88, 0x93, 0x3c, 0x54,
+ 0x12, 0x8e, 0x3d, 0x04, 0x68, 0xcb, 0xbc, 0x6e, 0xbf, 0xb0, 0xb9, 0xba, 0x8b,
+ 0x16, 0x3d, 0x3a, 0x30, 0xd5, 0x39, 0x89, 0x43, 0x89, 0x3c, 0x89, 0x8c, 0xc0,
+ 0x3b, 0x93, 0x98, 0xd9, 0xbd, 0xc5, 0x26, 0x3e, 0xbd, 0x2a, 0x4f, 0xa9, 0xbb,
+ 0x35, 0xa6, 0xe6, 0xbc, 0xeb, 0x89, 0x1f, 0x3d, 0xea, 0x85, 0xb7, 0xbc, 0xa7,
+ 0x52, 0xbb, 0xbc, 0x02, 0xda, 0x86, 0x3d, 0x82, 0xad, 0xfd, 0xba, 0x01, 0x20,
+ 0x2f, 0xbd, 0xb8, 0x8c, 0x9d, 0xbd, 0x9c, 0xbd, 0x1b, 0x3d, 0x1d, 0xad, 0xe6,
+ 0x3c, 0xac, 0x48, 0x6b, 0x3c, 0xdd, 0x13, 0xcb, 0xbd, 0xee, 0xcd, 0x8a, 0xbd,
+ 0x8b, 0x33, 0x7c, 0x3d, 0xc5, 0x0a, 0x2a, 0x3d, 0x13, 0x49, 0x77, 0x3d, 0x7e,
+ 0x78, 0xd1, 0xbd, 0xd3, 0x18, 0x3c, 0x3c, 0xb7, 0xaa, 0xb1, 0xbc, 0x54, 0x3a,
+ 0xce, 0xbc, 0x86, 0x08, 0x97, 0xbd, 0x04, 0x21, 0x01, 0xbc, 0x72, 0xa8, 0x65,
+ 0x3d, 0x71, 0x0b, 0xf3, 0x3b, 0x14, 0x9e, 0x88, 0x3c, 0x9c, 0xc6, 0x90, 0x3d,
+ 0x1d, 0xdb, 0x37, 0xbd, 0x8e, 0x9e, 0x59, 0x3c, 0xf6, 0xa9, 0x1a, 0xbd, 0xfd,
+ 0xec, 0x19, 0x3d, 0xa3, 0x01, 0x5a, 0xbd, 0xcc, 0xe7, 0x15, 0xbd, 0x26, 0xe6,
+ 0x51, 0x3d, 0xeb, 0x5f, 0x8d, 0x3d, 0x93, 0x7a, 0x73, 0x3c, 0x94, 0x02, 0x10,
+ 0x3d, 0x5d, 0x7e, 0xa7, 0x3c, 0x52, 0x78, 0x12, 0xbd, 0xe2, 0xfb, 0x44, 0x3d,
+ 0xb8, 0xdf, 0xa4, 0x3c, 0x84, 0x3d, 0x0e, 0xbd, 0xad, 0xae, 0x0e, 0x3c, 0x52,
+ 0xda, 0x1e, 0x3d, 0xfe, 0x93, 0x92, 0xbd, 0xe8, 0xe3, 0xde, 0xbd, 0x7a, 0xdc,
+ 0xd9, 0xbc, 0xc3, 0xb0, 0x68, 0x3d, 0x58, 0x56, 0x25, 0xbd, 0x3a, 0x61, 0xdc,
+ 0xbc, 0x71, 0xa2, 0xbc, 0x3c, 0x1b, 0xab, 0x30, 0x3d, 0x2a, 0x68, 0xbd, 0xbb,
+ 0x5e, 0xaf, 0x8b, 0xbd, 0xb4, 0x4d, 0x30, 0x3d, 0xa0, 0x46, 0x72, 0x3d, 0x4e,
+ 0xd2, 0x10, 0x3d, 0x71, 0x47, 0x4e, 0xbd, 0xe5, 0xd4, 0xe6, 0xbc, 0x25, 0x05,
+ 0x87, 0x3c, 0x33, 0x85, 0xec, 0x3c, 0x84, 0x58, 0x5f, 0xbd, 0xb0, 0xfa, 0xc0,
+ 0xbd, 0xc0, 0xdb, 0x87, 0xba, 0xa0, 0x30, 0x13, 0x3d, 0x84, 0x01, 0xe2, 0xbc,
+ 0xee, 0x8d, 0xa1, 0x3c, 0xc8, 0x8c, 0x24, 0x3c, 0x2b, 0x33, 0xf0, 0x3c, 0xc5,
+ 0xdd, 0x55, 0x3c, 0x89, 0x7c, 0xa5, 0xbc, 0x3b, 0x39, 0x19, 0xbd, 0xed, 0x0d,
+ 0x74, 0x3d, 0x98, 0xdf, 0x24, 0xbc, 0xdd, 0xdc, 0x38, 0xbd, 0xab, 0x9f, 0x75,
+ 0x3b, 0xd7, 0x20, 0xf3, 0x3c, 0x96, 0xa3, 0x78, 0x3c, 0x58, 0x44, 0x90, 0xbd,
+ 0x21, 0xcb, 0xf2, 0x3b, 0x18, 0x22, 0x58, 0xbd, 0x7c, 0x1c, 0x1b, 0xbd, 0xdc,
+ 0x4d, 0x19, 0xbd, 0xff, 0x68, 0x35, 0xbb, 0x34, 0xc5, 0x5e, 0x3c, 0x48, 0x3a,
+ 0x90, 0xbd, 0xa1, 0x84, 0xa7, 0x3c, 0x96, 0xc6, 0x46, 0xbd, 0x20, 0x22, 0xb3,
+ 0xbc, 0x16, 0x95, 0x18, 0x3d, 0x84, 0xa2, 0x5e, 0x3d, 0x78, 0x3a, 0x29, 0xbd,
+ 0x37, 0x9a, 0x5a, 0xbd, 0x93, 0x8b, 0x80, 0x3d, 0x25, 0xff, 0x49, 0xbd, 0xf0,
+ 0x1e, 0x8c, 0xbb, 0xde, 0xa1, 0x48, 0x3d, 0x58, 0x67, 0x2d, 0x3d, 0x09, 0x18,
+ 0x26, 0x3d, 0x37, 0x68, 0x85, 0x3d, 0xa0, 0x28, 0x70, 0x3d, 0x33, 0xf5, 0x9f,
+ 0xbc, 0x81, 0xcc, 0x97, 0xbd, 0x75, 0x24, 0x45, 0xbd, 0x60, 0x45, 0x29, 0x3d,
+ 0x6b, 0x87, 0x25, 0xbd, 0x67, 0xd9, 0xb5, 0xbc, 0x15, 0xcb, 0x01, 0xbd, 0x39,
+ 0xa5, 0xc6, 0xbd, 0xd2, 0xbe, 0xb9, 0xbd, 0x7c, 0x53, 0x20, 0xbd, 0x1a, 0x64,
+ 0xb4, 0xbd, 0x5a, 0xc1, 0x1d, 0x3d, 0xdf, 0xdd, 0x50, 0xbc, 0x8e, 0x86, 0x2b,
+ 0x3d, 0x20, 0xeb, 0x4d, 0x3d, 0x9a, 0xf8, 0x88, 0x3d, 0x92, 0xf1, 0x5e, 0xbd,
+ 0x24, 0xb3, 0xd8, 0xbb, 0x19, 0xbc, 0xd9, 0xbc, 0x8d, 0x97, 0x8f, 0xbd, 0x6d,
+ 0xf5, 0x7b, 0x3c, 0xfe, 0x33, 0x66, 0xbc, 0x35, 0x64, 0xfa, 0x3b, 0xe6, 0x00,
+ 0x9d, 0xbc, 0xd6, 0x9c, 0x63, 0xbd, 0x02, 0xff, 0x8e, 0xbd, 0x10, 0xa1, 0x23,
+ 0xbd, 0x93, 0x33, 0x0f, 0xbd, 0x59, 0xfc, 0x1b, 0x3d, 0x43, 0x0c, 0x7f, 0x3d,
+ 0x06, 0xbd, 0x96, 0x3d, 0xe1, 0x5b, 0x9f, 0xbc, 0x44, 0x05, 0xf8, 0x3c, 0x1c,
+ 0x60, 0xec, 0xbd, 0x33, 0x7f, 0x8c, 0xbd, 0x93, 0xcb, 0x0c, 0xbc, 0xc0, 0x8d,
+ 0x0e, 0xbb, 0x16, 0x45, 0x65, 0xbd, 0x76, 0x93, 0x88, 0xbd, 0x49, 0xd0, 0xb3,
+ 0xbd, 0xeb, 0x0e, 0x56, 0xbd, 0x8f, 0x1a, 0xab, 0x3d, 0x30, 0xde, 0x72, 0xb8,
+ 0xcf, 0xc7, 0x1d, 0xbd, 0x12, 0xc3, 0x31, 0xbd, 0x6e, 0x1d, 0x47, 0xbd, 0xb3,
+ 0x0f, 0x8c, 0x3d, 0x31, 0x82, 0x80, 0x3d, 0x44, 0xc4, 0x6b, 0xbc, 0x07, 0x28,
+ 0x5a, 0x3d, 0xa3, 0x3c, 0x3d, 0xbd, 0x13, 0x5c, 0x6a, 0x3d, 0x1c, 0x3f, 0x11,
+ 0x3d, 0x50, 0xac, 0xb5, 0xbc, 0x9f, 0x0e, 0xd9, 0x3c, 0x55, 0xfb, 0xde, 0xbc,
+ 0x6b, 0x4f, 0x6a, 0xbd, 0x38, 0x5f, 0x3f, 0x3b, 0x5a, 0x26, 0x98, 0xbc, 0x32,
+ 0x8c, 0x36, 0x3d, 0x78, 0x0a, 0x73, 0x3c, 0x7f, 0xd4, 0x51, 0x3d, 0x69, 0xdb,
+ 0x97, 0x3d, 0x52, 0x37, 0x80, 0x3d, 0x9b, 0x10, 0x88, 0xbd, 0xc0, 0xbf, 0x90,
+ 0xbd, 0x43, 0x84, 0x44, 0x3d, 0x12, 0x73, 0xc8, 0xbc, 0x84, 0xe0, 0x42, 0x3d,
+ 0xf5, 0x79, 0xd2, 0xbc, 0x88, 0x3b, 0x05, 0x3d, 0xf6, 0x10, 0xf3, 0x3b, 0x73,
+ 0x77, 0x8d, 0x3d, 0x92, 0xf0, 0x77, 0x3d, 0xd4, 0xcd, 0x55, 0xbd, 0x44, 0x7c,
+ 0x88, 0xbd, 0x3b, 0xe3, 0x5f, 0xbd, 0x0c, 0x35, 0x87, 0x3c, 0x09, 0x68, 0xf0,
+ 0x3c, 0x60, 0x3e, 0x47, 0x3a, 0xf6, 0x12, 0xb2, 0xbd, 0x2b, 0xe9, 0x9d, 0x3d,
+ 0x8e, 0x7c, 0x97, 0xbc, 0xb1, 0x05, 0x2e, 0xbc, 0x99, 0x6b, 0x14, 0xbd, 0xb2,
+ 0xa1, 0x85, 0x3d, 0x1c, 0xd1, 0x31, 0x3d, 0x18, 0xe6, 0xf5, 0x3c, 0xa7, 0x25,
+ 0x5a, 0x3c, 0xe0, 0x75, 0x9e, 0xbd, 0x1b, 0xe1, 0x69, 0xbd, 0x1b, 0x22, 0xc0,
+ 0x3d, 0xc4, 0x04, 0x8e, 0x3d, 0x92, 0x7f, 0x9d, 0x3d, 0xd3, 0xf3, 0x80, 0xbb,
+ 0x69, 0x7a, 0x58, 0x3c, 0xd5, 0xc2, 0x92, 0xbc, 0x26, 0x08, 0xa2, 0xbd, 0x9f,
+ 0xe8, 0x45, 0x3d, 0x10, 0xc9, 0x44, 0x3d, 0x7e, 0xac, 0x61, 0x3d, 0x88, 0xa8,
+ 0xf1, 0x3c, 0xa2, 0xd1, 0x87, 0xbd, 0x8c, 0xa7, 0xd1, 0xbc, 0x77, 0x21, 0x86,
+ 0xbd, 0x3b, 0x5a, 0xaa, 0x3d, 0x27, 0x8b, 0xb7, 0x3d, 0xe2, 0x8c, 0x39, 0x3d,
+ 0x16, 0x70, 0xc0, 0xbc, 0x45, 0xcc, 0x81, 0xbd, 0xfd, 0x54, 0x09, 0x3d, 0x7f,
+ 0x19, 0x0d, 0x3c, 0x0a, 0xfe, 0x39, 0xbd, 0xaf, 0x91, 0x66, 0xbd, 0x1c, 0xf9,
+ 0xa3, 0x3d, 0x6d, 0xfa, 0xa7, 0x3b, 0x55, 0x1d, 0xa2, 0x3d, 0xd4, 0x1c, 0x8a,
+ 0x3d, 0x21, 0xeb, 0xbd, 0xbc, 0xd7, 0x77, 0x45, 0xbc, 0x2b, 0xb9, 0x37, 0xbd,
+ 0x7b, 0x7c, 0xbd, 0xbd, 0x59, 0xa0, 0x92, 0xbd, 0xb9, 0x28, 0x2f, 0xbd, 0x1c,
+ 0xb6, 0x8c, 0xbc, 0x48, 0x52, 0x58, 0xbd, 0x90, 0x67, 0xa3, 0x3b, 0x92, 0xff,
+ 0x79, 0x3d, 0x55, 0x80, 0x9d, 0x3c, 0x68, 0x54, 0x98, 0xbd, 0xc6, 0xff, 0xbc,
+ 0xbc, 0x76, 0xb5, 0x72, 0xbd, 0x00, 0x62, 0x86, 0xbd, 0x6b, 0x01, 0xe3, 0xbc,
+ 0x42, 0x03, 0x6e, 0xbd, 0xd6, 0xe1, 0x7d, 0xbd, 0xcd, 0xed, 0x8b, 0x3c, 0x67,
+ 0x9d, 0x49, 0x3d, 0x6a, 0xe8, 0x31, 0x3d, 0xfd, 0x25, 0x4c, 0x3d, 0x87, 0x12,
+ 0xe8, 0xbb, 0x31, 0x54, 0x92, 0xbc, 0xbe, 0xab, 0x98, 0xbb, 0x85, 0x6c, 0xf7,
+ 0x3b, 0xb8, 0x0e, 0xbc, 0xbc, 0xf8, 0xea, 0x9a, 0x3d, 0x36, 0x13, 0xe2, 0xbc,
+ 0x9f, 0xd7, 0x6d, 0x3d, 0x4f, 0x0a, 0xb1, 0x3d, 0xba, 0x5c, 0x6b, 0xbd, 0xae,
+ 0x73, 0x60, 0xbc, 0x61, 0xf2, 0x8b, 0x3c, 0x90, 0x4c, 0x7b, 0xbd, 0x50, 0xef,
+ 0xe9, 0xbd, 0x54, 0x83, 0x99, 0xbc, 0x8f, 0xd5, 0x4d, 0x3d, 0x6b, 0x02, 0x37,
+ 0x3d, 0xc8, 0xe7, 0x84, 0x3d, 0x4e, 0x73, 0x87, 0x3d, 0x7a, 0xcc, 0xaa, 0x3c,
+ 0x0e, 0xde, 0x26, 0xbd, 0xef, 0xfb, 0xc8, 0xbd, 0x96, 0xe9, 0x11, 0xbd, 0xd2,
+ 0xd6, 0x26, 0xbc, 0x01, 0xea, 0x72, 0xbd, 0xf4, 0xb7, 0xad, 0xbb, 0x5b, 0xe7,
+ 0x9e, 0x3d, 0xe6, 0xa1, 0x06, 0xbe, 0x4d, 0xa9, 0xd4, 0x3c, 0x83, 0xc9, 0xdf,
+ 0x3c, 0x31, 0x26, 0x85, 0x3c, 0x4d, 0x25, 0xcf, 0xbb, 0x6c, 0xea, 0x91, 0x3d,
+ 0xb3, 0x55, 0x5d, 0x3c, 0x7f, 0x1d, 0x70, 0xbd, 0x0d, 0x6f, 0x85, 0x3d, 0xbe,
+ 0xe6, 0x35, 0xbd, 0x0f, 0x5b, 0x02, 0xbc, 0x1e, 0xad, 0x60, 0xbd, 0xeb, 0x48,
+ 0x4c, 0x3d, 0x73, 0x67, 0xaf, 0x3c, 0xda, 0x33, 0x03, 0x3d, 0xd9, 0xa3, 0x0d,
+ 0xbb, 0x6e, 0x31, 0x11, 0x3d, 0xb3, 0x7e, 0xfc, 0x3c, 0xc4, 0x86, 0x49, 0x3c,
+ 0x0a, 0x52, 0x0b, 0x3d, 0x68, 0x25, 0xae, 0x3d, 0xe0, 0x16, 0x02, 0x3d, 0xc0,
+ 0x47, 0x3f, 0xbd, 0x98, 0x55, 0x70, 0x3c, 0x1a, 0xbb, 0x38, 0x3d, 0xcf, 0x31,
+ 0xe4, 0xbc, 0xe0, 0x45, 0x39, 0xbd, 0x7c, 0xa1, 0x3f, 0xbd, 0xcc, 0x5b, 0x91,
+ 0xbd, 0x55, 0x28, 0x59, 0x3a, 0x75, 0xdc, 0x02, 0xbd, 0xd8, 0x0d, 0xfe, 0xbb,
+ 0x38, 0x7f, 0x92, 0xbd, 0x0f, 0xeb, 0x83, 0xbc, 0xcf, 0xe7, 0x0c, 0xbd, 0xb5,
+ 0xf8, 0x59, 0x3d, 0xfc, 0xd4, 0xcf, 0xbb, 0xa3, 0x75, 0x8a, 0x3d, 0xac, 0xe9,
+ 0x8e, 0xbd, 0x4a, 0xf9, 0x71, 0x3d, 0xee, 0x83, 0x32, 0xbc, 0x7c, 0x78, 0xa0,
+ 0xbd, 0x87, 0x86, 0x6a, 0xbd, 0x1a, 0x3c, 0xe4, 0xbc, 0x89, 0x4a, 0xa1, 0x3d,
+ 0xa0, 0x39, 0xdd, 0x3c, 0x93, 0xa3, 0x93, 0x3c, 0xdd, 0x08, 0xa2, 0x3d, 0x9a,
+ 0x87, 0x98, 0xbd, 0xe6, 0x5a, 0x32, 0xbd, 0xeb, 0x4d, 0xea, 0xbb, 0x48, 0xda,
+ 0x6b, 0x3c, 0x36, 0x23, 0x82, 0x3d, 0x80, 0x78, 0x90, 0x3d, 0x0e, 0x4c, 0x1b,
+ 0xbd, 0xb9, 0x3c, 0x54, 0x3d, 0x5f, 0x8b, 0xf5, 0xbb, 0x54, 0x40, 0x54, 0xbd,
+ 0x35, 0x04, 0x8e, 0xbc, 0x38, 0xcf, 0xe0, 0x3b, 0x2f, 0xf6, 0x55, 0xbd, 0xe0,
+ 0xed, 0x7e, 0x3c, 0x84, 0x12, 0x9c, 0x3d, 0x74, 0x34, 0xfb, 0xbc, 0x02, 0xd9,
+ 0x93, 0xbd, 0xff, 0x27, 0xa8, 0xbd, 0x83, 0xf3, 0xaf, 0xbb, 0x99, 0x16, 0x7d,
+ 0x3d, 0xc6, 0xd9, 0x32, 0xbd, 0xb1, 0xa4, 0xbd, 0xbc, 0xd2, 0x1c, 0x5b, 0x3d,
+ 0xb3, 0xdb, 0x31, 0x3d, 0xe4, 0x10, 0x03, 0x3c, 0x29, 0xb0, 0x0b, 0xbd, 0x16,
+ 0x47, 0x9b, 0x3d, 0x75, 0x6b, 0xfd, 0xbc, 0x09, 0x92, 0xac, 0x3c, 0x12, 0x2c,
+ 0x07, 0x3d, 0x5a, 0xb3, 0xa0, 0x3c, 0xc9, 0x3d, 0x21, 0xbd, 0xc1, 0x80, 0x6d,
+ 0xbd, 0xa9, 0x20, 0x9c, 0x3d, 0xf5, 0x5b, 0x07, 0xbe, 0x9a, 0x76, 0x6f, 0xbd,
+ 0xd5, 0x11, 0xff, 0x3d, 0x58, 0xda, 0xd4, 0x3c, 0x18, 0x2f, 0xb9, 0x3d, 0xd4,
+ 0xa0, 0x6c, 0xbd, 0x4d, 0xe5, 0x2b, 0xbc, 0x97, 0x9d, 0x5f, 0xbc, 0x55, 0xe6,
+ 0x9b, 0xbd, 0x61, 0xee, 0xb3, 0x3c, 0x24, 0x06, 0xbf, 0x3c, 0xc2, 0x90, 0x09,
+ 0xbd, 0x91, 0xaf, 0x63, 0x3d, 0xde, 0x86, 0x7b, 0x3c, 0xca, 0x42, 0x0d, 0x3c,
+ 0x5f, 0xda, 0xcd, 0xbc, 0x7b, 0x27, 0x13, 0x3d, 0xf9, 0xd1, 0x14, 0x3c, 0xb6,
+ 0x83, 0x4a, 0x3d, 0x37, 0x74, 0x63, 0xbd, 0xbb, 0x85, 0x40, 0xbd, 0x3e, 0x15,
+ 0x13, 0x3d, 0x00, 0xe1, 0x22, 0xbd, 0xef, 0xdd, 0x63, 0xbd, 0x95, 0xdb, 0xa6,
+ 0x3c, 0xf4, 0xc1, 0x86, 0xbd, 0xfd, 0xf0, 0xe5, 0x3c, 0x84, 0xc1, 0x69, 0xbd,
+ 0xe4, 0x85, 0xf5, 0x3c, 0x18, 0xfa, 0x79, 0xbd, 0xe3, 0xd5, 0x2e, 0xbd, 0x32,
+ 0x90, 0x8f, 0xbc, 0x40, 0xfa, 0x08, 0xbc, 0xa4, 0x5f, 0xcb, 0xbc, 0x5a, 0xa7,
+ 0x3f, 0x3d, 0x09, 0x40, 0x23, 0x3d, 0x7b, 0x17, 0x0e, 0xbd, 0x6e, 0x70, 0xb9,
+ 0x3b, 0xc7, 0x3d, 0x4d, 0xbd, 0xe9, 0x57, 0x5d, 0x3d, 0x5c, 0x02, 0x91, 0x3c,
+ 0xc8, 0x08, 0x31, 0xbd, 0x09, 0xea, 0xe3, 0x3c, 0x14, 0x23, 0xf6, 0x3c, 0x95,
+ 0xd1, 0x22, 0xbd, 0xba, 0x27, 0xce, 0x3c, 0xb2, 0x59, 0x42, 0xbd, 0x29, 0x50,
+ 0x6d, 0x3d, 0x20, 0xe5, 0x10, 0xbd, 0xc2, 0x68, 0x5a, 0xbd, 0x04, 0x6e, 0x81,
+ 0xbd, 0xd6, 0xc7, 0xa4, 0xbc, 0x16, 0x22, 0x33, 0x3d, 0x80, 0xbf, 0x70, 0x3c,
+ 0xbf, 0x62, 0x02, 0xbd, 0xdd, 0x19, 0x28, 0xbd, 0x8d, 0x5c, 0x60, 0x3d, 0x96,
+ 0xb4, 0x24, 0xbd, 0x9a, 0xb5, 0x6e, 0xbd, 0x52, 0xb5, 0x81, 0x3d, 0xf3, 0x49,
+ 0x85, 0xbd, 0x4a, 0x65, 0xcc, 0x3c, 0x06, 0xca, 0x13, 0xbd, 0x18, 0x94, 0x07,
+ 0x3d, 0xde, 0x60, 0x45, 0x3c, 0x7a, 0x2d, 0x69, 0x3d, 0x7e, 0xc6, 0xba, 0xbc,
+ 0xff, 0xcf, 0x64, 0x3d, 0x3e, 0x22, 0x98, 0xbd, 0xe1, 0x87, 0xc8, 0x3c, 0xec,
+ 0x54, 0x90, 0xbd, 0x60, 0x0b, 0x09, 0x3d, 0x5e, 0xc7, 0x95, 0x3c, 0x54, 0x1c,
+ 0x5b, 0x3b, 0xac, 0x77, 0xfe, 0x3c, 0x4c, 0x43, 0xea, 0xbc, 0xe4, 0x4d, 0xb3,
+ 0x3c, 0xab, 0x96, 0x20, 0xbd, 0xf7, 0x8a, 0x48, 0xbd, 0xcc, 0xcb, 0x70, 0x3d,
+ 0x25, 0x01, 0x91, 0xbc, 0x9c, 0x9a, 0x96, 0x3c, 0x9c, 0x7d, 0x56, 0x3d, 0x3e,
+ 0x2b, 0x47, 0xbd, 0x44, 0x48, 0x15, 0xbd, 0x38, 0x4e, 0xc1, 0x3c, 0x9e, 0x72,
+ 0x05, 0x3d, 0xe9, 0xbd, 0x44, 0xbc, 0x96, 0xdd, 0x6f, 0x3d, 0x17, 0x2b, 0x4e,
+ 0x3c, 0x21, 0x91, 0x4c, 0x3d, 0x2f, 0x87, 0x8e, 0xbd, 0xf2, 0xd2, 0x31, 0x3d,
+ 0x47, 0x07, 0xad, 0xbc, 0x41, 0x54, 0x89, 0x3c, 0xee, 0xa9, 0x4d, 0x3d, 0xf2,
+ 0xb1, 0x80, 0x3d, 0x6a, 0xd9, 0x78, 0xbd, 0x55, 0x4a, 0x32, 0xbd, 0xd1, 0xd8,
+ 0x44, 0x3d, 0xda, 0x72, 0x7d, 0x3d, 0xa1, 0xd1, 0xbc, 0x3b, 0x7a, 0xf4, 0x32,
+ 0xbd, 0xf0, 0x44, 0x84, 0x3d, 0xd3, 0x0b, 0x8c, 0x3d, 0xd9, 0xc8, 0x58, 0xbd,
+ 0xdd, 0x2c, 0x7c, 0x3d, 0x49, 0x3e, 0x8f, 0x3d, 0x39, 0xbd, 0x95, 0xbd, 0x99,
+ 0x46, 0x25, 0x3d, 0x63, 0xfe, 0x20, 0xbd, 0x0a, 0x1d, 0x62, 0xbc, 0x4b, 0xae,
+ 0x3b, 0xbc, 0x3c, 0x28, 0x84, 0xbc, 0x79, 0x24, 0x25, 0xbd, 0x62, 0x6b, 0x56,
+ 0xbd, 0xe9, 0x9a, 0x88, 0x3d, 0xd6, 0x9f, 0x85, 0xbc, 0xad, 0xf6, 0x51, 0xbd,
+ 0xc2, 0x72, 0x85, 0x3d, 0xf6, 0x0d, 0x89, 0xbd, 0x3e, 0x76, 0xca, 0x39, 0x90,
+ 0x96, 0x89, 0x3d, 0xa1, 0x6e, 0x25, 0xbd, 0x4b, 0xbd, 0x18, 0x3c, 0x0e, 0x05,
+ 0x69, 0xbc, 0x03, 0x9e, 0x76, 0x3d, 0xa3, 0xae, 0x67, 0x3d, 0xc4, 0x38, 0x5a,
+ 0x3d, 0x8c, 0x9d, 0x53, 0xbd, 0x35, 0x24, 0x42, 0xbd, 0x36, 0xfa, 0xcf, 0x3c,
+ 0xe8, 0x09, 0x0f, 0xbd, 0xe9, 0x6e, 0x15, 0xbd, 0x51, 0x03, 0x1b, 0xbd, 0xf7,
+ 0x1d, 0x32, 0x3d, 0x08, 0xfc, 0x2f, 0xbd, 0x9d, 0x4c, 0x65, 0x3d, 0x9d, 0xf0,
+ 0x98, 0xbb, 0xb0, 0xba, 0x0d, 0xbc, 0x64, 0xee, 0x03, 0xbb, 0x92, 0x82, 0x16,
+ 0xbc, 0xa5, 0xa0, 0x94, 0xbd, 0xd0, 0x1f, 0xf1, 0x3c, 0xeb, 0x06, 0x8c, 0xbb,
+ 0xb5, 0xc2, 0x64, 0x3c, 0x7e, 0x30, 0x55, 0x3c, 0x68, 0x89, 0x64, 0x3c, 0xec,
+ 0x1e, 0x9e, 0x3c, 0xf0, 0xc9, 0x57, 0x3d, 0xfe, 0x25, 0x0c, 0xbd, 0x2f, 0xb4,
+ 0x0b, 0x3c, 0x32, 0x76, 0x7a, 0xbd, 0xd2, 0x15, 0xea, 0xba, 0xc0, 0xc9, 0x45,
+ 0xbd, 0xb7, 0xda, 0x48, 0xbc, 0x5e, 0x85, 0x6c, 0x3c, 0xbc, 0xda, 0x84, 0xbc,
+ 0xc6, 0x56, 0x35, 0xbd, 0x21, 0xfd, 0x7d, 0x3d, 0xbf, 0x0c, 0x0f, 0x3b, 0xc2,
+ 0x28, 0xa4, 0xbc, 0xad, 0xa3, 0xe7, 0xbb, 0x77, 0xd9, 0x55, 0x3d, 0x6d, 0x5a,
+ 0x21, 0xbc, 0x3f, 0xa0, 0xd9, 0xbc, 0x1b, 0x86, 0x85, 0x3d, 0x38, 0x2f, 0x1f,
+ 0xbd, 0xd5, 0xa5, 0x43, 0x3d, 0xdb, 0x04, 0x8d, 0xbd, 0xbc, 0x0d, 0x25, 0x3d,
+ 0xf5, 0x71, 0x86, 0x3d, 0xa8, 0x4e, 0x88, 0xbd, 0xca, 0xab, 0x24, 0x3c, 0x8d,
+ 0x03, 0xda, 0x3c, 0xad, 0x77, 0x19, 0xbc, 0x2e, 0x7c, 0xf5, 0x3c, 0x75, 0x45,
+ 0x6e, 0x3d, 0x9b, 0x9f, 0x80, 0xbd, 0x1d, 0xce, 0x85, 0x3d, 0xb6, 0xbe, 0x86,
+ 0xbc, 0xc0, 0x1c, 0x55, 0xbb, 0xd0, 0xc7, 0x5c, 0xbd, 0x1f, 0x60, 0x64, 0x3c,
+ 0x4f, 0x04, 0x60, 0xbd, 0x04, 0xc9, 0x64, 0x3d, 0x0a, 0xbb, 0x10, 0x3b, 0x08,
+ 0x41, 0x92, 0xbd, 0xac, 0x5b, 0x15, 0xbd, 0x44, 0xe8, 0x27, 0x3b, 0x9c, 0x98,
+ 0x0c, 0x3d, 0x09, 0x52, 0x7a, 0x3d, 0x33, 0xe4, 0xcd, 0xbc, 0xda, 0x48, 0x17,
+ 0xbd, 0x26, 0xe5, 0x5d, 0xbb, 0x2f, 0xfc, 0x69, 0xbd, 0x9f, 0xfd, 0x54, 0x3d,
+ 0x1d, 0x45, 0x07, 0xbd, 0x86, 0x69, 0x91, 0x3c, 0x9e, 0x1a, 0xbe, 0xbc, 0xfa,
+ 0xf4, 0x5e, 0x3d, 0xb5, 0x9d, 0x00, 0xbd, 0xe0, 0xfd, 0x90, 0x3c, 0x3a, 0xac,
+ 0xc9, 0xbc, 0x11, 0xa7, 0xb0, 0xbb, 0x3e, 0x18, 0xa8, 0x3c, 0x79, 0x2e, 0x55,
+ 0xbd, 0xe0, 0xb2, 0xfd, 0xbb, 0x72, 0xb0, 0x5d, 0xbc, 0xe1, 0xd9, 0x6f, 0x3d,
+ 0xd5, 0x3a, 0x9f, 0xbc, 0xc8, 0x8f, 0x1a, 0xbd, 0x18, 0x60, 0x3b, 0x3c, 0xc0,
+ 0x90, 0x24, 0xbc, 0x78, 0xb6, 0x50, 0x3d, 0x84, 0xc6, 0x81, 0xbd, 0x98, 0x2d,
+ 0x46, 0x3d, 0x7f, 0x8a, 0x3b, 0x3d, 0x03, 0xd9, 0x7f, 0x3d, 0x50, 0x04, 0xae,
+ 0x3c, 0xaf, 0xae, 0x6b, 0xbd, 0xcd, 0x34, 0x48, 0xbd, 0xbd, 0x05, 0xa8, 0x3c,
+ 0x84, 0xc8, 0x3f, 0xbd, 0xcb, 0x46, 0x89, 0x3d, 0x92, 0x2b, 0x16, 0x3d, 0x98,
+ 0xfb, 0xcd, 0xbc, 0x80, 0x5b, 0x43, 0xbd, 0xac, 0x5e, 0x78, 0x3c, 0xd6, 0xbf,
+ 0x7e, 0x3b, 0x32, 0xec, 0x81, 0x3b, 0xce, 0xab, 0xf1, 0x3b, 0xb2, 0xd7, 0x86,
+ 0xbc, 0xb1, 0xe3, 0x09, 0x3d, 0x4f, 0xc6, 0xa5, 0xbc, 0x4c, 0x1b, 0x89, 0x3c,
+ 0xd6, 0x09, 0x2b, 0x3d, 0x61, 0x67, 0x4a, 0xbc, 0x7a, 0x5e, 0x87, 0xbc, 0x6c,
+ 0x32, 0x55, 0x3c, 0x6b, 0xe0, 0xa7, 0xba, 0x41, 0xc8, 0xb5, 0xbc, 0x94, 0x54,
+ 0x64, 0xbc, 0x81, 0xb6, 0x33, 0x3d, 0x3a, 0x05, 0x59, 0x3d, 0x42, 0x25, 0x46,
+ 0xbd, 0xfc, 0xda, 0x8c, 0xbd, 0x17, 0x64, 0x87, 0x3d, 0x55, 0x39, 0x61, 0x3d,
+ 0x4f, 0xcf, 0x25, 0xbd, 0xfc, 0x4d, 0x26, 0x3c, 0x7c, 0x18, 0xd8, 0x3c, 0x4f,
+ 0x1b, 0x5c, 0x3d, 0x3a, 0x09, 0xcd, 0x3c, 0x27, 0x4a, 0x00, 0x3d, 0x1c, 0xb7,
+ 0xb7, 0xbc, 0x0a, 0x1b, 0x38, 0xbc, 0x88, 0x6d, 0x2f, 0x3d, 0x96, 0xdf, 0x6a,
+ 0xbd, 0x7e, 0x7e, 0xa0, 0xb9, 0x10, 0x23, 0x10, 0xbc, 0xec, 0x6b, 0xbf, 0x3c,
+ 0x1a, 0x8e, 0x7a, 0xbc, 0x68, 0xb1, 0x7c, 0x3d, 0xb0, 0xcc, 0x30, 0xbd, 0xec,
+ 0x59, 0xef, 0x3c, 0x8d, 0xd5, 0x41, 0x3b, 0x82, 0xa1, 0xec, 0xbc, 0x29, 0x35,
+ 0x51, 0xbd, 0x6e, 0x6e, 0x91, 0xbc, 0xf9, 0x6d, 0x2a, 0x3d, 0x5d, 0x97, 0x17,
+ 0x3d, 0xcb, 0xad, 0x29, 0x3c, 0xc4, 0x47, 0x41, 0x3d, 0x40, 0x7c, 0x6a, 0xbc,
+ 0xa6, 0x09, 0x1e, 0x3d, 0x14, 0x9c, 0xf2, 0xbc, 0x70, 0x31, 0x5d, 0x3c, 0xd1,
+ 0x54, 0x70, 0xbc, 0xd8, 0x58, 0xdd, 0x3a, 0x65, 0x21, 0x6a, 0xbd, 0x64, 0x81,
+ 0x99, 0xbd, 0x51, 0x5a, 0x64, 0x3c, 0x8c, 0xa6, 0x90, 0x3c, 0xe6, 0xb6, 0x2a,
+ 0xbd, 0x3d, 0x2a, 0x15, 0xbd, 0x82, 0xbe, 0x8d, 0xbc, 0x65, 0x32, 0x68, 0xbd,
+ 0x0a, 0x5d, 0x6d, 0xbc, 0x24, 0x8c, 0xd6, 0xbc, 0x70, 0x4d, 0xe7, 0x3c, 0x06,
+ 0x58, 0x01, 0x3c, 0x22, 0xd2, 0x58, 0x3d, 0x62, 0x60, 0x88, 0x3c, 0xfc, 0xe6,
+ 0x12, 0x3d, 0x31, 0x59, 0xdb, 0x3c, 0x5d, 0xfb, 0x96, 0xbc, 0xb6, 0x50, 0x7f,
+ 0x3b, 0xd7, 0x01, 0x37, 0x3d, 0x6a, 0x71, 0xc4, 0xbc, 0x8d, 0x28, 0xc9, 0x3c,
+ 0x33, 0x39, 0x4f, 0xbb, 0x14, 0x14, 0x1b, 0x3d, 0x32, 0x36, 0x62, 0xbd, 0xa7,
+ 0xf1, 0x89, 0x3d, 0xc4, 0x12, 0x13, 0x3d, 0xf3, 0x79, 0xde, 0x3c, 0xc0, 0x39,
+ 0xb3, 0xbb, 0x36, 0xb5, 0x54, 0xbd, 0x04, 0xf2, 0xcc, 0xbc, 0x45, 0x14, 0xf8,
+ 0x3a, 0x4b, 0x1d, 0x55, 0xbd, 0x13, 0x35, 0xc6, 0xbc, 0x7a, 0x92, 0x1b, 0xbd,
+ 0x71, 0xb0, 0x3b, 0xbd, 0xfe, 0x84, 0x2f, 0xbd, 0xd4, 0x64, 0x60, 0x3d, 0xa7,
+ 0x0b, 0xb7, 0xbb, 0xd1, 0xc7, 0x8a, 0xbd, 0x21, 0x20, 0x78, 0x3d, 0x1b, 0x25,
+ 0x77, 0x3d, 0x5e, 0x06, 0x20, 0xbd, 0x7d, 0xfa, 0xe0, 0xbc, 0x5b, 0x2b, 0x38,
+ 0x3d, 0x8c, 0x10, 0x90, 0xbd, 0xbe, 0xc0, 0xb2, 0x3c, 0x5a, 0x88, 0x94, 0xbd,
+ 0x80, 0x87, 0x94, 0x3c, 0x73, 0xed, 0x81, 0xbd, 0x73, 0x42, 0x3f, 0xba, 0xdc,
+ 0xf8, 0x4e, 0x3d, 0x9a, 0xd4, 0x8d, 0xbc, 0x3a, 0x6f, 0x72, 0xbc, 0x37, 0xe8,
+ 0x06, 0x3d, 0xbb, 0x35, 0x61, 0x3d, 0x64, 0xc6, 0x4a, 0x3d, 0xee, 0x94, 0x13,
+ 0xb9, 0xc0, 0x4b, 0xaf, 0xba, 0x60, 0x4b, 0x42, 0x3d, 0x40, 0x88, 0xb1, 0x3c,
+ 0xc6, 0x61, 0x6c, 0x3d, 0x92, 0xd0, 0x40, 0x3d, 0x32, 0xc0, 0x8d, 0xbd, 0x90,
+ 0x66, 0xc2, 0xbc, 0x52, 0x1f, 0x14, 0xbd, 0x03, 0x9d, 0x23, 0x3d, 0x81, 0x60,
+ 0xe1, 0x3c, 0xe3, 0x31, 0x5f, 0x3d, 0x38, 0xbc, 0x52, 0x3d, 0x23, 0x3e, 0x3b,
+ 0xbd, 0xf6, 0x53, 0x8e, 0xbd, 0xc9, 0xb1, 0x88, 0xbd, 0x02, 0x0c, 0xc6, 0xbc,
+ 0x2e, 0x6d, 0x26, 0xbd, 0xe2, 0x88, 0x87, 0xbd, 0x45, 0x45, 0x28, 0x3d, 0xbc,
+ 0x73, 0xd7, 0xba, 0x17, 0x1e, 0x15, 0xbc, 0xa6, 0x0c, 0x9c, 0xbc, 0x5a, 0x74,
+ 0x63, 0x3d, 0x05, 0x28, 0xf6, 0x3c, 0xe5, 0xda, 0x4d, 0xbd, 0x02, 0x69, 0x42,
+ 0xbd, 0x8a, 0xb0, 0x2c, 0x3d, 0x27, 0x22, 0x07, 0x3d, 0x6a, 0x7a, 0x08, 0x3b,
+ 0x88, 0xb6, 0x03, 0x3d, 0x80, 0xad, 0xac, 0xbb, 0xc9, 0x67, 0x6d, 0xbb, 0x80,
+ 0xf0, 0x8d, 0xbd, 0x53, 0x78, 0x85, 0x3d, 0x14, 0x99, 0x24, 0xbb, 0x86, 0x7c,
+ 0x0c, 0x3d, 0xbe, 0xff, 0x79, 0x3d, 0x01, 0x39, 0xb4, 0x3c, 0x19, 0x42, 0x52,
+ 0x3c, 0x4d, 0x8b, 0x73, 0x3d, 0xb4, 0x6b, 0xf1, 0x3a, 0x6e, 0x53, 0xb4, 0xbc,
+ 0x09, 0x88, 0x11, 0xbd, 0xdf, 0x5e, 0x86, 0xbd, 0x10, 0xdc, 0x5a, 0xbd, 0x6b,
+ 0xb3, 0x3a, 0xbd, 0x7e, 0x23, 0x84, 0xbd, 0x95, 0x50, 0x8c, 0xbd, 0xd1, 0x50,
+ 0x93, 0x3c, 0x5f, 0x43, 0x67, 0x3a, 0x92, 0xc2, 0x91, 0xbd, 0xbe, 0xb0, 0x4e,
+ 0xbd, 0x8c, 0xeb, 0x36, 0xbd, 0x4e, 0x0e, 0x82, 0xbd, 0xc5, 0x15, 0x0b, 0xbd,
+ 0x1c, 0x66, 0x5a, 0xbd, 0xf6, 0xe4, 0x19, 0x3b, 0x4d, 0x1c, 0x07, 0x3d, 0x70,
+ 0x1f, 0x24, 0x3d, 0x59, 0x80, 0x3b, 0xbd, 0x8e, 0x9e, 0xae, 0xbb, 0x11, 0x6f,
+ 0x8f, 0x3b, 0x5f, 0xc9, 0x74, 0xbd, 0x36, 0x65, 0x2b, 0x3c, 0x43, 0xb4, 0xcf,
+ 0x3c, 0x7f, 0xbf, 0x18, 0x3d, 0x91, 0x58, 0x16, 0xbd, 0x72, 0xc4, 0xf3, 0xbc,
+ 0x80, 0xd3, 0x8a, 0x3b, 0x95, 0x0e, 0xe7, 0x3c, 0xdd, 0x17, 0x1d, 0x3d, 0x55,
+ 0x74, 0x98, 0xbd, 0x5c, 0x6b, 0x1e, 0xbc, 0x02, 0x65, 0x61, 0xba, 0x01, 0x7f,
+ 0x81, 0xbc, 0x97, 0x95, 0x73, 0xbd, 0xd8, 0x60, 0xfd, 0xbc, 0xd4, 0x64, 0x8a,
+ 0x3a, 0xe5, 0x81, 0x24, 0x3c, 0xfd, 0x2b, 0x14, 0x3d, 0x60, 0x49, 0xff, 0x3b,
+ 0x6f, 0x63, 0x33, 0xbd, 0xe0, 0x83, 0x4b, 0xbd, 0xed, 0x7a, 0x10, 0x3d, 0x5b,
+ 0x26, 0x33, 0x3d, 0x03, 0xff, 0x2d, 0x3d, 0xcd, 0xca, 0x42, 0xbd, 0x4c, 0x09,
+ 0x3f, 0x3d, 0xcb, 0xcb, 0x95, 0xbc, 0xff, 0x04, 0x18, 0x3c, 0x99, 0x48, 0x6c,
+ 0xbd, 0xb6, 0x3f, 0x04, 0x3a, 0x68, 0x3d, 0x67, 0x3c, 0x71, 0xd9, 0x7a, 0xbc,
+ 0x88, 0x7d, 0x02, 0x3c, 0x0f, 0xfa, 0x3b, 0xbd, 0x78, 0x64, 0xfc, 0x3c, 0xab,
+ 0x8c, 0x37, 0x3d, 0x08, 0x19, 0xcf, 0xbc, 0x03, 0xe0, 0x85, 0xbd, 0x1b, 0xaf,
+ 0x79, 0xbd, 0x92, 0x9e, 0x67, 0x3d, 0x31, 0x3e, 0x94, 0xbd, 0xe8, 0xd1, 0x1f,
+ 0xbd, 0x4d, 0xa1, 0xcb, 0x3c, 0x9f, 0xc0, 0xf7, 0x3c, 0xa8, 0x88, 0xe1, 0xbc,
+ 0xf7, 0x13, 0x8b, 0x3c, 0x77, 0x1b, 0xfe, 0xbc, 0x11, 0xf0, 0x4d, 0x3d, 0x02,
+ 0x73, 0xff, 0xbc, 0x20, 0x4b, 0x2f, 0x3d, 0x50, 0x14, 0x28, 0x3c, 0xa2, 0x0a,
+ 0xc1, 0xbc, 0xb3, 0xf6, 0xe1, 0xbc, 0x32, 0x98, 0xa1, 0x3c, 0x3f, 0xef, 0xcc,
+ 0x3b, 0xd6, 0xbf, 0x37, 0xbd, 0x4e, 0x0a, 0x15, 0x3d, 0xfd, 0x81, 0x24, 0xbd,
+ 0x62, 0x05, 0x43, 0x3d, 0x4b, 0x8d, 0xb5, 0xbc, 0x0e, 0xe7, 0x7c, 0x3d, 0xd1,
+ 0x64, 0x88, 0xbd, 0xca, 0x03, 0xd3, 0xbb, 0xc9, 0xaa, 0x9f, 0xbb, 0xb5, 0x0e,
+ 0xbf, 0xbc, 0x48, 0x82, 0xe7, 0x3c, 0xa1, 0x4b, 0x10, 0x3d, 0x40, 0x51, 0x68,
+ 0xbb, 0xc0, 0x36, 0xc4, 0x3c, 0xcc, 0xd9, 0x37, 0xbc, 0xec, 0x40, 0xcf, 0x3c,
+ 0xb2, 0x38, 0x52, 0xbd, 0x15, 0xe7, 0x0c, 0xbd, 0x52, 0xea, 0x59, 0x3c, 0xcf,
+ 0xe3, 0xd1, 0xbc, 0x9e, 0xb7, 0x94, 0xbc, 0x1a, 0x13, 0xc8, 0x3c, 0x04, 0x51,
+ 0xa0, 0x3b, 0x7f, 0xb4, 0x32, 0x3d, 0x5e, 0x43, 0x5a, 0x3d, 0x8b, 0x6d, 0x98,
+ 0xba, 0xa4, 0x70, 0x47, 0x3d, 0xe6, 0x23, 0x60, 0x3d, 0x48, 0xf3, 0x8b, 0xbc,
+ 0x85, 0xfe, 0x60, 0x3d, 0x33, 0x94, 0xc7, 0xbc, 0xdd, 0xbf, 0x80, 0xbd, 0x31,
+ 0x98, 0xbb, 0x3b, 0x76, 0x70, 0x8a, 0x3c, 0x72, 0xc5, 0x4e, 0x3c, 0x31, 0x53,
+ 0x20, 0x3d, 0xcd, 0xda, 0x03, 0x3b, 0x8c, 0xc0, 0x3d, 0x3d, 0x9c, 0xaa, 0x90,
+ 0xbd, 0xb5, 0x9f, 0xab, 0x3c, 0x45, 0x77, 0x31, 0xbd, 0xea, 0x85, 0x8e, 0xbd,
+ 0x15, 0x6d, 0x8b, 0xbc, 0xb9, 0x98, 0xb1, 0xbc, 0x09, 0x9b, 0xff, 0x3c, 0x1e,
+ 0xcf, 0x3c, 0x3d, 0x3c, 0xe3, 0x2a, 0xbd, 0x2a, 0xff, 0x20, 0x3d, 0xbb, 0x1c,
+ 0x4a, 0x3b, 0x8f, 0x19, 0x83, 0xbd, 0xad, 0x9f, 0xe5, 0x3c, 0x43, 0x3d, 0x44,
+ 0x3d, 0xaa, 0xb9, 0xe3, 0x3c, 0x8c, 0xd1, 0x86, 0x3d, 0xfa, 0x93, 0x7c, 0x3d,
+ 0x31, 0xe5, 0x67, 0xbc, 0x3f, 0x25, 0x8a, 0xbd, 0x90, 0x91, 0x5e, 0x3b, 0xbf,
+ 0xd8, 0xfe, 0xbc, 0x68, 0xaa, 0x85, 0x3c, 0xb3, 0xb6, 0x07, 0xbd, 0x6f, 0x51,
+ 0x91, 0xbd, 0x3c, 0x5d, 0xc8, 0xbc, 0xba, 0xf5, 0xd3, 0xbb, 0x8d, 0x90, 0xd5,
+ 0xbc, 0x02, 0x78, 0x2f, 0xbc, 0x12, 0x94, 0x10, 0x3d, 0xb2, 0x26, 0x82, 0xbd,
+ 0x49, 0x2a, 0x70, 0x3d, 0x9c, 0xf4, 0x67, 0xbd, 0x8d, 0x33, 0xf3, 0xbc, 0x22,
+ 0xa0, 0xc3, 0x3c, 0x38, 0xb2, 0x31, 0x3d, 0x71, 0xe9, 0x87, 0xbd, 0x7c, 0xc5,
+ 0x96, 0xbd, 0x5b, 0x13, 0xa5, 0xbc, 0x2d, 0x8a, 0x8a, 0x3d, 0x80, 0xc2, 0x24,
+ 0x3d, 0x1e, 0xc5, 0x74, 0x3d, 0xec, 0x3a, 0xca, 0x3c, 0x37, 0xb4, 0x00, 0xbc,
+ 0x29, 0xe2, 0x0c, 0x3d, 0xbc, 0x36, 0x20, 0x3d, 0x58, 0x3a, 0x5f, 0x3d, 0x8a,
+ 0xe4, 0x24, 0xbd, 0x22, 0x99, 0x45, 0xbd, 0xbe, 0xef, 0x0d, 0xbd, 0xbe, 0xae,
+ 0x0f, 0xbc, 0xe1, 0xe9, 0x4e, 0x3c, 0xd2, 0xed, 0x54, 0xbd, 0x62, 0xcb, 0x7d,
+ 0x3c, 0xc8, 0xe4, 0x0d, 0xbc, 0x61, 0xaa, 0xa8, 0x3b, 0x68, 0x56, 0x92, 0xbb,
+ 0x83, 0xb3, 0x25, 0xbd, 0x0a, 0x28, 0x39, 0xbd, 0x9d, 0xd4, 0x13, 0x3c, 0x5c,
+ 0x3c, 0x27, 0x3d, 0x34, 0x21, 0x30, 0x3d, 0x9d, 0xac, 0x54, 0xbd, 0xaa, 0xe8,
+ 0x60, 0x3d, 0xb4, 0xaf, 0xe5, 0x3c, 0xb0, 0x22, 0x1d, 0x3d, 0x9c, 0x7e, 0x64,
+ 0x3d, 0x3e, 0xd9, 0x7b, 0x3d, 0x55, 0x9e, 0x46, 0x3d, 0x47, 0xf9, 0xfe, 0x3a,
+ 0x00, 0xf0, 0x79, 0xbc, 0x49, 0x93, 0xd5, 0xbb, 0x98, 0x75, 0x29, 0xbc, 0xfb,
+ 0xdc, 0x37, 0xbd, 0x9a, 0x0e, 0x65, 0x3d, 0x7a, 0x74, 0x93, 0xbd, 0x39, 0x83,
+ 0xba, 0x3c, 0x20, 0xa3, 0x94, 0xbd, 0xbf, 0x32, 0x18, 0xbc, 0xbd, 0x90, 0x19,
+ 0x3c, 0x31, 0xbe, 0x94, 0xbd, 0x1f, 0xd5, 0x9b, 0x3a, 0x09, 0xa3, 0x44, 0xbd,
+ 0xe4, 0x91, 0xae, 0xbc, 0x98, 0x84, 0x73, 0xbd, 0xe6, 0x64, 0x70, 0x3d, 0xcc,
+ 0x0d, 0x01, 0xbd, 0xb0, 0xd6, 0xce, 0x3c, 0x2a, 0x8b, 0x78, 0xbd, 0x51, 0x8a,
+ 0xcd, 0x3c, 0x76, 0x3b, 0x0b, 0x3b, 0x85, 0xe3, 0x76, 0xbd, 0xad, 0x98, 0x6f,
+ 0x3d, 0xf8, 0xa1, 0x92, 0xbd, 0x22, 0xb9, 0x24, 0xbd, 0x81, 0xf4, 0x62, 0xbd,
+ 0xeb, 0x97, 0x83, 0x3d, 0x0d, 0xa9, 0x91, 0x3a, 0x62, 0x88, 0x0c, 0xbc, 0x99,
+ 0x64, 0x48, 0x3d, 0x0b, 0x11, 0x80, 0xba, 0x94, 0xe3, 0x70, 0xbc, 0xa3, 0x42,
+ 0x56, 0x3c, 0x1c, 0x41, 0xec, 0x3c, 0x68, 0x56, 0x29, 0x3c, 0x50, 0x4a, 0x05,
+ 0x3d, 0xfa, 0x33, 0x37, 0x3d, 0x5d, 0x7c, 0x8d, 0x3d, 0xa8, 0x02, 0x3f, 0x3c,
+ 0xa6, 0x1d, 0x68, 0x3d, 0x41, 0x3b, 0x76, 0x3d, 0x29, 0xa1, 0x56, 0xbd, 0xbd,
+ 0x90, 0x7c, 0x3b, 0xd9, 0x96, 0x62, 0xbd, 0xf2, 0x15, 0xd8, 0xbc, 0xad, 0x62,
+ 0x38, 0x3d, 0x19, 0xc7, 0x0d, 0x3d, 0xda, 0xcc, 0xf8, 0x3b, 0x63, 0xaf, 0x84,
+ 0xbd, 0x42, 0x94, 0x3f, 0xbc, 0x60, 0x67, 0x83, 0x3d, 0x13, 0xdb, 0xa8, 0x3c,
+ 0x8f, 0xcb, 0x5e, 0x3d, 0x97, 0x69, 0x14, 0xbd, 0xd5, 0x52, 0x97, 0x3c, 0x28,
+ 0xb2, 0x09, 0xbb, 0xd0, 0x5c, 0x0f, 0x3d, 0x08, 0x01, 0x38, 0xbd, 0x2a, 0xd1,
+ 0x75, 0xbd, 0xb6, 0x48, 0x5e, 0xbd, 0xe6, 0x3a, 0x40, 0x3d, 0x91, 0x52, 0xb5,
+ 0x3c, 0xe6, 0xe6, 0x2f, 0x3d, 0x7b, 0x0a, 0x0b, 0x3d, 0x05, 0xa6, 0xf1, 0xbb,
+ 0xe5, 0x14, 0x12, 0x3c, 0x70, 0x4a, 0x61, 0xbd, 0xc0, 0xd5, 0x77, 0x3c, 0xea,
+ 0x92, 0x4e, 0x3d, 0xe8, 0xea, 0x7a, 0x3c, 0x85, 0xec, 0x8d, 0xbc, 0x1f, 0x06,
+ 0x3a, 0x3d, 0x24, 0x7d, 0x43, 0x3c, 0x3b, 0xfb, 0x4e, 0x3d, 0x10, 0xdb, 0x26,
+ 0xbc, 0x3c, 0xe4, 0x44, 0x3d, 0x5f, 0x54, 0xe6, 0x3c, 0x32, 0x15, 0xdf, 0xbc,
+ 0x07, 0x77, 0x1f, 0x3d, 0x68, 0x58, 0xea, 0x3c, 0xbe, 0x48, 0x90, 0xbc, 0x42,
+ 0x47, 0x35, 0x3d, 0x21, 0x06, 0x7d, 0xbd, 0x96, 0xd4, 0x67, 0x3c, 0x17, 0x5e,
+ 0x79, 0x3b, 0xd0, 0x09, 0x93, 0xbd, 0xaf, 0x34, 0x3d, 0x3d, 0xc6, 0xd3, 0x8f,
+ 0xbc, 0xae, 0x06, 0x0c, 0x3c, 0x84, 0xeb, 0x04, 0xbd, 0x44, 0xf4, 0x2e, 0xbd,
+ 0xad, 0x8d, 0x61, 0x3c, 0xb0, 0x1e, 0xaf, 0xb9, 0xb6, 0xd3, 0x57, 0xbc, 0x78,
+ 0x89, 0x97, 0x3c, 0x39, 0xa2, 0x41, 0xbd, 0x1c, 0xb3, 0x30, 0xbd, 0x44, 0xc4,
+ 0x90, 0x3c, 0xa3, 0x43, 0x03, 0xbd, 0xe0, 0xe2, 0xc4, 0xbb, 0xf0, 0xf3, 0x4d,
+ 0x3c, 0x6c, 0xf3, 0x85, 0x3d, 0x8f, 0xa9, 0x56, 0xbd, 0x36, 0x75, 0x5c, 0x3d,
+ 0x7e, 0x57, 0x89, 0x3c, 0x3a, 0xb8, 0x29, 0x3c, 0x2c, 0x10, 0x40, 0xbd, 0x5f,
+ 0x74, 0x32, 0xbd, 0xaf, 0x9e, 0x09, 0xbd, 0x60, 0xe4, 0x4b, 0xbd, 0x49, 0xb4,
+ 0xd7, 0x3c, 0xa0, 0x1f, 0x31, 0xbd, 0xd6, 0x5e, 0xde, 0x3c, 0x4e, 0xb1, 0xdb,
+ 0xbc, 0x98, 0x5a, 0x1e, 0x3d, 0x03, 0xe2, 0xa0, 0xba, 0x76, 0xc1, 0x63, 0xbd,
+ 0xbd, 0x03, 0xcf, 0x3c, 0xde, 0x4d, 0x22, 0x3d, 0x6a, 0x58, 0x5c, 0xbb, 0xc3,
+ 0xb8, 0x19, 0xbd, 0xf3, 0x01, 0x8f, 0x3d, 0x40, 0x62, 0xdc, 0x3b, 0x58, 0x64,
+ 0xa0, 0xbc, 0xdc, 0xd4, 0x6d, 0x3d, 0x62, 0x98, 0x1d, 0xbd, 0x96, 0x88, 0x4d,
+ 0x3b, 0x0e, 0xab, 0x46, 0x3d, 0xcb, 0xee, 0xce, 0x3b, 0xc5, 0x27, 0xe2, 0xbb,
+ 0xe4, 0xe4, 0x1c, 0x3d, 0x75, 0x86, 0x08, 0xbd, 0xf0, 0xce, 0x1c, 0x3d, 0xcb,
+ 0x9d, 0x7a, 0x3d, 0x24, 0x56, 0x42, 0xbc, 0x3a, 0x7f, 0xc4, 0xbc, 0x6e, 0xfd,
+ 0x6e, 0x3d, 0xa1, 0x3f, 0x80, 0x3d, 0xfb, 0x13, 0xc9, 0xbc, 0x5f, 0x8f, 0xb9,
+ 0x3c, 0xe3, 0xde, 0x94, 0xbd, 0x9f, 0x88, 0x88, 0xbd, 0x79, 0x27, 0x71, 0x3d,
+ 0xeb, 0xc8, 0x36, 0x3d, 0xe7, 0x2c, 0x9e, 0xbc, 0xb1, 0x19, 0x4d, 0xbd, 0x1e,
+ 0x82, 0x79, 0x3d, 0x75, 0xfe, 0x94, 0xbd, 0xdc, 0xd7, 0x96, 0xbd, 0x3a, 0x57,
+ 0x84, 0x3d, 0x70, 0xcd, 0x09, 0xbd, 0x08, 0xd9, 0x01, 0xbd, 0xa6, 0x1a, 0x85,
+ 0x3d, 0x5e, 0x34, 0xec, 0xbc, 0x3c, 0x0f, 0xa6, 0xbc, 0x0a, 0xc2, 0x6f, 0x3d,
+ 0x72, 0x1c, 0x89, 0x3d, 0xb0, 0x55, 0x12, 0xbd, 0x71, 0x87, 0x1f, 0x3d, 0x03,
+ 0xf0, 0x07, 0x3c, 0x52, 0x7d, 0x29, 0x3d, 0xe0, 0x13, 0x55, 0xbc, 0xe0, 0xac,
+ 0xbb, 0x3c, 0x36, 0x1f, 0x58, 0x3d, 0x34, 0x2f, 0xe3, 0x3c, 0xb5, 0xb7, 0x89,
+ 0xbc, 0x06, 0xfa, 0x93, 0xbd, 0xe7, 0x2e, 0x20, 0xbc, 0xc8, 0x71, 0x4c, 0x3d,
+ 0x03, 0x3b, 0xf6, 0xbb, 0x1c, 0xf7, 0x24, 0x3d, 0x88, 0x07, 0x09, 0x3d, 0xa6,
+ 0x16, 0xde, 0xbc, 0xd4, 0xfa, 0xf5, 0xbc, 0x2e, 0x35, 0x3f, 0x3d, 0x22, 0x36,
+ 0x5c, 0xbd, 0x99, 0xea, 0x90, 0x3d, 0x7c, 0xfd, 0xe6, 0x3c, 0xda, 0x89, 0x2e,
+ 0x3d, 0xea, 0x83, 0x39, 0x3c, 0xe2, 0x35, 0x12, 0x3d, 0xa6, 0xee, 0x46, 0x3d,
+ 0x7b, 0x4e, 0x36, 0xbd, 0x0a, 0x6d, 0xd1, 0x3b, 0x90, 0x59, 0x08, 0xbc, 0x3e,
+ 0xee, 0x86, 0x3b, 0x18, 0x92, 0x13, 0x3d, 0x71, 0xd5, 0x69, 0x3c, 0x5f, 0xc2,
+ 0x8d, 0xbd, 0xb0, 0x51, 0x81, 0x3c, 0x5a, 0x81, 0x9e, 0x3c, 0xcf, 0xae, 0x13,
+ 0x3d, 0xa4, 0x0d, 0x54, 0x3d, 0xb6, 0x82, 0x77, 0x3d, 0x6a, 0x20, 0xf7, 0xbc,
+ 0x60, 0xcc, 0x56, 0xbd, 0x45, 0x8f, 0x23, 0xbd, 0x92, 0x5c, 0x69, 0xbc, 0x8d,
+ 0xb5, 0x5d, 0xbd, 0x39, 0x60, 0x29, 0xbc, 0x06, 0x25, 0x6b, 0x3c, 0xad, 0x40,
+ 0x32, 0xbd, 0xcd, 0xbe, 0xf3, 0xbc, 0x7e, 0xd6, 0x74, 0x3d, 0x2e, 0x72, 0x63,
+ 0x3d, 0xc3, 0xaa, 0x0c, 0xbd, 0x74, 0xfc, 0x6a, 0xbd, 0xff, 0xa6, 0x7b, 0x3d,
+ 0xa8, 0x4f, 0xec, 0xbc, 0x8a, 0x91, 0x39, 0xbd, 0xd1, 0xa4, 0x7b, 0x3d, 0xff,
+ 0x3a, 0x99, 0x3b, 0xe9, 0xd2, 0x4e, 0xbd, 0xc6, 0x84, 0x1e, 0x3d, 0xe7, 0x73,
+ 0xdf, 0xbc, 0x88, 0xfb, 0x08, 0x3d, 0xf9, 0x98, 0xa2, 0xbc, 0x41, 0x1d, 0x8d,
+ 0x3d, 0xe6, 0x32, 0x38, 0x3d, 0x5f, 0xea, 0x1a, 0xbd, 0xce, 0x8f, 0x92, 0xbd,
+ 0xea, 0x1f, 0x69, 0x3d, 0x5b, 0x6e, 0x58, 0xbc, 0x6d, 0xfc, 0x2d, 0x3d, 0xa9,
+ 0x01, 0x83, 0x3d, 0xbc, 0xdb, 0x53, 0x3d, 0x70, 0xea, 0x72, 0xbd, 0xa4, 0xc0,
+ 0xae, 0xbc, 0x80, 0x8a, 0x54, 0x3a, 0x4a, 0x00, 0x80, 0xbc, 0x4a, 0x66, 0x78,
+ 0xbc, 0xbe, 0x62, 0x79, 0xbd, 0xe8, 0x24, 0x84, 0xbc, 0x0d, 0xef, 0x0f, 0x3d,
+ 0xa9, 0xa6, 0x26, 0x3d, 0xb8, 0x68, 0x83, 0xbd, 0xe2, 0x7b, 0x27, 0xbd, 0xdc,
+ 0xda, 0x80, 0xbd, 0x5e, 0x50, 0x88, 0xbd, 0x76, 0x41, 0x8d, 0x3d, 0xee, 0x0a,
+ 0x95, 0xbc, 0xc4, 0x0b, 0x41, 0x3c, 0x6e, 0x16, 0xe0, 0xbc, 0xb2, 0x34, 0x58,
+ 0x3d, 0x65, 0xd4, 0x06, 0x3d, 0x8a, 0x8a, 0x18, 0xbd, 0x99, 0xdd, 0x47, 0x3d,
+ 0x2b, 0xec, 0x00, 0x3d, 0xc3, 0xb1, 0xad, 0xb9, 0xf9, 0x57, 0x77, 0x3c, 0xae,
+ 0xc6, 0x8a, 0xbd, 0x55, 0x51, 0x43, 0x3d, 0x34, 0xd3, 0x1b, 0xbd, 0xda, 0x9e,
+ 0x47, 0x3d, 0xe5, 0x3a, 0x1f, 0x3d, 0x6d, 0xf2, 0x59, 0x3d, 0x14, 0x27, 0xb7,
+ 0xbc, 0xb0, 0x72, 0x8f, 0x3d, 0xbe, 0x91, 0x83, 0xbd, 0xbb, 0x8f, 0x39, 0xbd,
+ 0x40, 0x7f, 0x7e, 0xbd, 0x2d, 0x3e, 0x86, 0x3b, 0xca, 0x43, 0x29, 0xbc, 0xe2,
+ 0xb8, 0x4d, 0x3d, 0x48, 0x31, 0x85, 0xbd, 0xcb, 0x54, 0x1b, 0x3d, 0xb4, 0xc8,
+ 0x56, 0x3d, 0x09, 0x2f, 0x1d, 0x3d, 0xca, 0x8f, 0x10, 0x3d, 0xe1, 0x8d, 0x4c,
+ 0x3a, 0xdb, 0x4d, 0xd2, 0xbc, 0x4a, 0xc7, 0xd1, 0xbc, 0xc8, 0x03, 0xfa, 0x3c,
+ 0x4e, 0x3f, 0xa4, 0xbc, 0x5f, 0x9e, 0x90, 0xbd, 0x13, 0x82, 0xc0, 0x3c, 0x59,
+ 0x55, 0x54, 0x3c, 0xb6, 0x95, 0xa5, 0xbb, 0xef, 0x59, 0xa4, 0x3b, 0x7e, 0x93,
+ 0x1e, 0xbd, 0xaf, 0x49, 0x81, 0xbc, 0xe7, 0xd1, 0xc6, 0xbb, 0xc0, 0xa3, 0xc9,
+ 0x3b, 0x53, 0xa9, 0x77, 0xbb, 0xfa, 0x26, 0x74, 0xbc, 0x06, 0x1b, 0x63, 0x3d,
+ 0xe4, 0x90, 0x0a, 0xbd, 0x64, 0x50, 0x31, 0x3d, 0xff, 0x66, 0x82, 0x3d, 0x9d,
+ 0x1c, 0x06, 0xbd, 0x38, 0x29, 0x40, 0xbd, 0x6f, 0xea, 0x89, 0x3d, 0xdc, 0x8a,
+ 0x3f, 0xbd, 0xd1, 0x88, 0x02, 0x3d, 0x2f, 0x23, 0x27, 0x3c, 0x9c, 0x85, 0x56,
+ 0x3d, 0x41, 0xc7, 0x41, 0xbd, 0x67, 0x51, 0x49, 0x3c, 0x5f, 0x41, 0xf9, 0xbb,
+ 0x15, 0x37, 0xdb, 0xbc, 0x51, 0x7a, 0xd9, 0x3a, 0x05, 0xc0, 0x90, 0xbd, 0x8f,
+ 0xdb, 0x84, 0xbd, 0x3a, 0xc1, 0x48, 0xb9, 0x22, 0x3c, 0xfb, 0x3c, 0x7d, 0xf5,
+ 0x14, 0xbd, 0x26, 0xe6, 0x53, 0xbc, 0xde, 0x94, 0xa0, 0xbc, 0xd9, 0xc4, 0x5e,
+ 0x3d, 0xd4, 0xcf, 0xa6, 0xba, 0xfa, 0x43, 0x18, 0xbd, 0xee, 0x62, 0x19, 0xbd,
+ 0xfb, 0x61, 0x66, 0xbb, 0x1e, 0x8b, 0x82, 0xbd, 0x26, 0xec, 0x87, 0xbd, 0xc2,
+ 0xf6, 0x04, 0x3d, 0x2b, 0x2e, 0xe4, 0xbc, 0x60, 0xa6, 0x4e, 0x3d, 0x21, 0x99,
+ 0x5c, 0x3d, 0xdd, 0xde, 0x37, 0x3d, 0x8e, 0xfc, 0xf5, 0x3c, 0x6d, 0x33, 0xc2,
+ 0x39, 0x48, 0xea, 0x34, 0x3d, 0x79, 0x3e, 0x85, 0xbd, 0x20, 0xb1, 0x3d, 0xbb,
+ 0xdc, 0xe9, 0x64, 0xbc, 0xd2, 0xac, 0x4a, 0xbd, 0x1a, 0x4a, 0x8d, 0xbd, 0xb5,
+ 0xa2, 0xf3, 0x3c, 0xcd, 0x54, 0xb6, 0xbc, 0xc1, 0x9b, 0x2c, 0x3c, 0xd0, 0xea,
+ 0xad, 0xbc, 0x3f, 0xbc, 0x7f, 0x3c, 0xde, 0xe3, 0xe9, 0xbc, 0x1e, 0x28, 0x6f,
+ 0xbc, 0xd1, 0xce, 0xfe, 0xbc, 0xcc, 0x16, 0x21, 0x3d, 0x2a, 0x10, 0x18, 0xbd,
+ 0x5e, 0x73, 0xe9, 0xbb, 0xb3, 0x67, 0xa1, 0xbb, 0x94, 0x7d, 0x0d, 0x3c, 0x1d,
+ 0x67, 0x3b, 0xbd, 0xa9, 0xb9, 0x84, 0x3c, 0xe1, 0xc1, 0x89, 0xba, 0x49, 0x7f,
+ 0x91, 0xbd, 0x47, 0xf8, 0x57, 0xbc, 0x00, 0x6a, 0x24, 0x3d, 0x61, 0x71, 0x6f,
+ 0x3c, 0xd7, 0x6e, 0x4e, 0xbc, 0x07, 0xda, 0x60, 0xbb, 0x2d, 0xd9, 0x8e, 0x3d,
+ 0x0d, 0x9d, 0xc5, 0x3b, 0x50, 0x74, 0xe2, 0xbc, 0xaf, 0x90, 0x2d, 0xbd, 0xce,
+ 0x93, 0x2a, 0x3d, 0x56, 0xee, 0xee, 0xbc, 0x62, 0x58, 0x0a, 0x3d, 0x25, 0x7c,
+ 0x64, 0x3d, 0x23, 0x8d, 0x80, 0x3d, 0x3b, 0xfd, 0x55, 0xbd, 0x8f, 0x71, 0xe2,
+ 0xbc, 0x9c, 0xae, 0x07, 0x3d, 0x0e, 0xe4, 0xdd, 0xbc, 0x93, 0xc9, 0xd7, 0x3c,
+ 0x87, 0x9c, 0xe5, 0xbb, 0xa3, 0xd5, 0x5d, 0x3d, 0x23, 0xdb, 0x3a, 0xbd, 0x67,
+ 0xb3, 0x1a, 0x3d, 0x9e, 0xa1, 0x6b, 0x3d, 0x93, 0x17, 0xc2, 0xbc, 0x0c, 0xb7,
+ 0x33, 0xbd, 0xc0, 0xba, 0xeb, 0xbc, 0x16, 0x2c, 0x4d, 0xbd, 0xed, 0x60, 0x78,
+ 0x3c, 0x54, 0xa3, 0x93, 0xbd, 0x62, 0xa6, 0x8a, 0xbd, 0xdc, 0x16, 0x25, 0xbd,
+ 0xa9, 0xaf, 0x76, 0xbd, 0xab, 0x3c, 0x5d, 0xbd, 0xcf, 0x78, 0x9c, 0x3c, 0x74,
+ 0xf2, 0x97, 0x3c, 0xaa, 0x5d, 0x3b, 0x3d, 0x9c, 0xd2, 0xef, 0x3c, 0xd8, 0x6a,
+ 0x37, 0x3c, 0x44, 0xd2, 0xb9, 0xbc, 0x41, 0x5d, 0x7e, 0x3d, 0x74, 0x3c, 0x7d,
+ 0xbd, 0x40, 0x08, 0x0c, 0xbd, 0xbb, 0xc3, 0x04, 0xbd, 0xd7, 0xd3, 0x5d, 0xbd,
+ 0x41, 0xe7, 0x7c, 0x3d, 0x65, 0x20, 0x6f, 0x3b, 0x4e, 0xef, 0x81, 0x3a, 0xae,
+ 0xe0, 0x5d, 0xbd, 0x3f, 0xfb, 0x82, 0xbd, 0xf1, 0xc5, 0x58, 0xbd, 0x96, 0xab,
+ 0x45, 0x3b, 0x97, 0x5f, 0xcd, 0x3b, 0x39, 0x48, 0x5b, 0x3b, 0x6d, 0xf0, 0x28,
+ 0xbd, 0x08, 0xcc, 0x9f, 0x3c, 0x21, 0xd5, 0x2b, 0xbd, 0xc1, 0xe3, 0x1c, 0x3d,
+ 0x86, 0x52, 0xb4, 0x3c, 0x02, 0xd4, 0xc6, 0xbc, 0xbe, 0xab, 0x27, 0xbd, 0x18,
+ 0x8f, 0x84, 0x3c, 0x7d, 0x47, 0x2e, 0x3d, 0x0a, 0x58, 0x9c, 0x3b, 0x52, 0x72,
+ 0xe4, 0xbc, 0x98, 0x57, 0x5e, 0x3c, 0x24, 0xf1, 0x04, 0xbc, 0x3b, 0xec, 0x0f,
+ 0xbd, 0xf5, 0x54, 0x13, 0x3d, 0x6f, 0xf9, 0x80, 0x3c, 0x80, 0x19, 0xa2, 0xbc,
+ 0xfa, 0x89, 0x35, 0x3d, 0xd8, 0x61, 0x82, 0x3c, 0x21, 0x81, 0x8b, 0x3d, 0x40,
+ 0x2d, 0x65, 0xbc, 0xc6, 0x21, 0x61, 0x3d, 0x51, 0x3d, 0xa9, 0xbc, 0x47, 0x12,
+ 0x55, 0x3d, 0x7e, 0x85, 0x71, 0xbd, 0x22, 0x14, 0x05, 0x3d, 0x94, 0x35, 0x97,
+ 0xbd, 0x3c, 0x00, 0x86, 0xbd, 0x3a, 0x46, 0x5f, 0x3d, 0x18, 0x14, 0x06, 0xbd,
+ 0xb4, 0xea, 0x8c, 0xbd, 0xdc, 0x2e, 0xfe, 0x3b, 0x21, 0x96, 0x3d, 0xbd, 0x3a,
+ 0xf6, 0x8b, 0xbc, 0x3a, 0x3b, 0x6d, 0xbb, 0x39, 0x87, 0x13, 0x3c, 0x15, 0xbc,
+ 0x92, 0xbd, 0x24, 0xb7, 0x13, 0x3d, 0x9c, 0x66, 0x7a, 0xbd, 0x6b, 0xf2, 0x41,
+ 0xbd, 0x1d, 0x15, 0x6a, 0xbc, 0x20, 0x2a, 0x73, 0x3d, 0x25, 0x95, 0x40, 0x3d,
+ 0x23, 0x8f, 0x90, 0xbd, 0xd6, 0x95, 0xa7, 0xbc, 0xbe, 0xce, 0x4f, 0x3d, 0xaf,
+ 0xe0, 0x3f, 0x3d, 0x1b, 0x9f, 0x47, 0x3c, 0x57, 0x37, 0x14, 0x3d, 0x33, 0x06,
+ 0x86, 0x3d, 0xe5, 0x3c, 0x77, 0x3d, 0x60, 0x46, 0x95, 0x3b, 0xee, 0xd2, 0x97,
+ 0xbc, 0x38, 0x20, 0x9c, 0x3c, 0xe6, 0x90, 0xdf, 0xba, 0x77, 0x4f, 0x30, 0x3d,
+ 0x54, 0x87, 0x03, 0x3d, 0x86, 0x7c, 0x25, 0x3d, 0xdb, 0x5a, 0x18, 0x3d, 0x60,
+ 0x84, 0xf9, 0xbc, 0x84, 0x3c, 0xd0, 0xbc, 0xe9, 0x8c, 0x87, 0xbb, 0x39, 0xb9,
+ 0x81, 0x3d, 0x2e, 0x3e, 0x67, 0x3d, 0x5d, 0x57, 0xf8, 0xba, 0x60, 0x31, 0x38,
+ 0x3c, 0xf4, 0x31, 0x02, 0xbd, 0x31, 0x10, 0x98, 0x3c, 0x85, 0x28, 0x16, 0x3d,
+ 0xc5, 0xcd, 0xef, 0x3c, 0x92, 0x8d, 0x59, 0x3d, 0x6a, 0x54, 0x27, 0xbc, 0x72,
+ 0x4a, 0xf7, 0xbc, 0x0d, 0x8d, 0x81, 0x3d, 0xbd, 0x74, 0x8f, 0xbd, 0x80, 0xed,
+ 0x5c, 0x3b, 0xbe, 0x52, 0x7e, 0x3d, 0x49, 0x3f, 0x28, 0xbd, 0xcc, 0xc5, 0xea,
+ 0xbc, 0x2f, 0x46, 0x6b, 0xbd, 0x05, 0xd4, 0x0c, 0xbc, 0x41, 0x09, 0x02, 0x3d,
+ 0x2e, 0xa8, 0x53, 0xbc, 0xc7, 0x56, 0x56, 0xbd, 0xc2, 0x01, 0x88, 0xbd, 0x7a,
+ 0x9c, 0x6f, 0x3d, 0x3c, 0x49, 0x1c, 0x3d, 0x2b, 0x80, 0xe3, 0x3b, 0x43, 0x27,
+ 0x7d, 0x3d, 0x91, 0xa0, 0x58, 0x3d, 0xdb, 0x70, 0x76, 0xbc, 0xc4, 0xfa, 0x04,
+ 0xbd, 0x5e, 0x76, 0xcc, 0x3b, 0x0a, 0xcf, 0xc0, 0xbc, 0xfa, 0x3f, 0x08, 0xbd,
+ 0x26, 0x65, 0xaa, 0x3c, 0x2f, 0xec, 0x37, 0x3d, 0xa0, 0xae, 0x51, 0x3d, 0xbd,
+ 0x0e, 0x4e, 0x3d, 0x4d, 0x36, 0xae, 0xbc, 0xf1, 0xc8, 0x3f, 0xbd, 0x79, 0xe5,
+ 0x84, 0xbc, 0xac, 0x19, 0xf7, 0x3b, 0x5f, 0x52, 0x70, 0xbd, 0x46, 0x15, 0x01,
+ 0xbd, 0x17, 0xb1, 0xb1, 0x3c, 0x2e, 0x19, 0x87, 0xbd, 0x0c, 0xe6, 0x98, 0x3c,
+ 0x35, 0xd0, 0x22, 0xbd, 0xe3, 0x8f, 0x8a, 0xbd, 0x23, 0x8b, 0xfa, 0x3c, 0x01,
+ 0x67, 0x80, 0x3d, 0x6c, 0x9e, 0xb2, 0x3a, 0x6b, 0xbe, 0x8b, 0x3d, 0x74, 0x68,
+ 0xdb, 0x3c, 0x4c, 0x13, 0xae, 0xbc, 0x94, 0xfe, 0x50, 0xbd, 0xdc, 0x7e, 0x2f,
+ 0x3d, 0x78, 0x0a, 0x6e, 0xbc, 0x0e, 0x2b, 0xe9, 0xbc, 0x3b, 0x4b, 0x08, 0x3d,
+ 0x4d, 0x1a, 0x3d, 0xbd, 0x55, 0x7e, 0x51, 0xbb, 0x15, 0xa6, 0xb4, 0xbc, 0xac,
+ 0x1b, 0x86, 0xbb, 0x8a, 0x27, 0x22, 0x3d, 0x39, 0xc8, 0x34, 0xbc, 0x65, 0x0e,
+ 0x1a, 0xbb, 0x4c, 0x08, 0xdb, 0x3b, 0x60, 0x75, 0x2d, 0xbc, 0x25, 0xba, 0x64,
+ 0xbc, 0x8c, 0x05, 0x70, 0x3d, 0x0e, 0xdc, 0xaa, 0xbc, 0x63, 0x17, 0x03, 0x3d,
+ 0x03, 0x9d, 0x36, 0x3c, 0xe3, 0xf5, 0x6e, 0x3d, 0x01, 0xf8, 0x12, 0xbd, 0x15,
+ 0x62, 0xb3, 0x3c, 0xe1, 0x20, 0x1f, 0x3d, 0xbd, 0x41, 0x8d, 0x3d, 0x7b, 0x02,
+ 0x47, 0x3d, 0x8e, 0x9c, 0x93, 0xbc, 0x82, 0xa1, 0x81, 0xbd, 0xb9, 0x59, 0x6e,
+ 0x3c, 0xc6, 0x93, 0x07, 0xbd, 0x4c, 0x87, 0x44, 0x3d, 0x6a, 0x66, 0x49, 0xbd,
+ 0x80, 0xd5, 0x4b, 0xbb, 0x70, 0xd5, 0x09, 0x3c, 0x20, 0x85, 0x06, 0x3c, 0x7e,
+ 0xd6, 0x42, 0x3d, 0x5d, 0x10, 0x01, 0x3c, 0x71, 0xbe, 0x6c, 0xbc, 0xcc, 0xba,
+ 0x2d, 0xbd, 0xbf, 0xf6, 0x90, 0xbd, 0x59, 0xb8, 0x8c, 0x3d, 0x4a, 0xe8, 0x87,
+ 0xbc, 0xee, 0xd3, 0xd1, 0x3c, 0xde, 0xdd, 0xa6, 0xbb, 0x26, 0x06, 0x6a, 0xbc,
+ 0x1f, 0xa2, 0x88, 0xbd, 0x00, 0x6c, 0x24, 0xbb, 0x36, 0xf0, 0x00, 0x3c, 0x1e,
+ 0x54, 0x86, 0xbb, 0x55, 0x5e, 0x01, 0xbc, 0x3e, 0x0e, 0xe8, 0x3c, 0xbd, 0x02,
+ 0x70, 0xbb, 0x8e, 0xb9, 0x85, 0x3d, 0x8e, 0x8a, 0x5d, 0xbb, 0xa4, 0x21, 0x13,
+ 0x3d, 0xd1, 0x77, 0x16, 0xbc, 0x40, 0x95, 0x1d, 0x3c, 0x58, 0x2f, 0xbb, 0x3c,
+ 0xf5, 0x88, 0x86, 0xbb, 0xa0, 0x02, 0x83, 0xbd, 0x93, 0xb8, 0x0a, 0x3c, 0xfd,
+ 0x65, 0xe2, 0xbb, 0x24, 0x21, 0x11, 0x3d, 0xc6, 0x89, 0x8c, 0xbd, 0xc3, 0xa9,
+ 0x7a, 0xbd, 0x43, 0xcf, 0x81, 0xbd, 0xde, 0x81, 0x58, 0xbd, 0x3d, 0x35, 0x23,
+ 0x3d, 0xbe, 0x81, 0x90, 0xbd, 0xd3, 0xd2, 0xbb, 0x3c, 0x60, 0x68, 0xe5, 0xbc,
+ 0x25, 0x64, 0xa8, 0xbb, 0x8e, 0x5e, 0x4e, 0xbd, 0xc3, 0xa4, 0xd3, 0xbc, 0xb0,
+ 0x99, 0xf7, 0xbc, 0x2d, 0x56, 0x17, 0xbd, 0x44, 0x65, 0x2b, 0x3d, 0xa7, 0x80,
+ 0x05, 0xbd, 0xfc, 0xe1, 0x02, 0x3d, 0x65, 0xa7, 0x68, 0x3d, 0x52, 0x5d, 0x8b,
+ 0xbd, 0x6a, 0x9e, 0x83, 0xbd, 0xd4, 0xac, 0x1a, 0xbc, 0x3e, 0x6b, 0x7d, 0xbc,
+ 0xeb, 0xff, 0x40, 0xbd, 0xcd, 0xd2, 0x21, 0x3d, 0x7e, 0xf1, 0x70, 0xbd, 0x9b,
+ 0xc6, 0x6a, 0xbb, 0x1e, 0xb9, 0x20, 0x3d, 0xfd, 0x9b, 0x61, 0xbd, 0x57, 0xf3,
+ 0x5a, 0xbd, 0x5d, 0xbe, 0xbb, 0x3b, 0xd3, 0xc8, 0x50, 0xbd, 0x38, 0x8a, 0x5e,
+ 0xbd, 0x86, 0x65, 0x57, 0x3d, 0x02, 0xc7, 0x85, 0xbd, 0x95, 0x0a, 0x80, 0x3d,
+ 0x08, 0xcd, 0x66, 0x3c, 0x68, 0x38, 0x3d, 0x3c, 0xad, 0x64, 0x12, 0xbd, 0x20,
+ 0x0d, 0xcc, 0x3c, 0x63, 0x2c, 0x3f, 0x3d, 0xf6, 0xe1, 0xdc, 0x3c, 0x5f, 0xa6,
+ 0x35, 0x3d, 0x7b, 0xf6, 0x68, 0xbd, 0x9e, 0x65, 0xd2, 0x3c, 0x13, 0x63, 0x9d,
+ 0xbb, 0xd6, 0x42, 0x51, 0xbc, 0xa2, 0xc5, 0x52, 0xbc, 0x6a, 0x3d, 0x3f, 0x3d,
+ 0xa6, 0xde, 0xf8, 0xbc, 0x01, 0xa1, 0x5b, 0x3d, 0x8d, 0xdf, 0x16, 0xbd, 0x62,
+ 0x4d, 0x35, 0xba, 0x22, 0xca, 0x30, 0xbd, 0x50, 0x22, 0x72, 0xbc, 0xf1, 0xaa,
+ 0x96, 0xbd, 0x52, 0xf4, 0xd9, 0x3c, 0x08, 0x89, 0x6d, 0x3d, 0x90, 0x97, 0xa9,
+ 0x3c, 0x20, 0x9d, 0x0b, 0x3c, 0x47, 0x97, 0xf5, 0xbc, 0x7f, 0xc1, 0x3c, 0x3d,
+ 0x77, 0xa7, 0xeb, 0x3b, 0xe2, 0x0c, 0x77, 0x3d, 0xca, 0x57, 0x3e, 0x3d, 0x16,
+ 0x46, 0x38, 0xbd, 0x15, 0xde, 0x87, 0x3d, 0x10, 0x09, 0x0a, 0xbd, 0xa0, 0xfa,
+ 0x56, 0x3b, 0xba, 0x6c, 0x2f, 0x3d, 0x0f, 0xb9, 0x70, 0x3c, 0x35, 0xb8, 0x8c,
+ 0xbd, 0x88, 0xad, 0xc5, 0xbc, 0xb2, 0x0b, 0x40, 0xbd, 0x63, 0x62, 0x80, 0xbd,
+ 0xb4, 0xd9, 0x78, 0x3c, 0x91, 0x49, 0x8a, 0xbd, 0x59, 0x3c, 0x47, 0x3d, 0xb1,
+ 0xb7, 0x3a, 0xbd, 0x0f, 0x07, 0xea, 0x3b, 0xca, 0x89, 0x50, 0xbd, 0xf6, 0x2c,
+ 0x27, 0xbd, 0x3f, 0xf7, 0x37, 0x3c, 0x1c, 0x12, 0x23, 0x3c, 0x6d, 0x88, 0x97,
+ 0xbd, 0x06, 0x09, 0x66, 0x3d, 0x40, 0xac, 0x80, 0xbc, 0xac, 0xea, 0x7c, 0xbd,
+ 0x7e, 0xfb, 0x1a, 0x3d, 0x11, 0xd1, 0x65, 0x3d, 0x56, 0x13, 0xee, 0xbc, 0xa5,
+ 0xe1, 0x69, 0xbd, 0x47, 0xff, 0x45, 0xbc, 0x20, 0xba, 0x2e, 0xbd, 0xff, 0x15,
+ 0x48, 0xbc, 0x01, 0xd5, 0x8f, 0x3d, 0x42, 0x0f, 0x37, 0x3c, 0x68, 0xbc, 0xcc,
+ 0x3c, 0xf4, 0x1e, 0x39, 0xbd, 0x00, 0x6c, 0x07, 0xb9, 0xe4, 0x6e, 0xb2, 0x3c,
+ 0x9b, 0x53, 0x88, 0xbd, 0x20, 0xf2, 0xef, 0xbc, 0xd3, 0xf3, 0x8e, 0x3d, 0xbc,
+ 0xe9, 0xa6, 0xbc, 0xa3, 0xb6, 0x6b, 0xbc, 0x73, 0xeb, 0xdd, 0xbc, 0xdf, 0xa3,
+ 0x04, 0xbd, 0x1a, 0x9f, 0x21, 0x3c, 0x1d, 0xb7, 0x89, 0xbb, 0x28, 0x66, 0x85,
+ 0xbc, 0xf9, 0x7f, 0x95, 0xbd, 0x4c, 0x07, 0xfa, 0xbc, 0x52, 0x7d, 0x29, 0x3d,
+ 0x66, 0x78, 0x24, 0xbc, 0xd4, 0x70, 0xfa, 0xbc, 0x20, 0xdb, 0x02, 0xbd, 0x51,
+ 0x27, 0x09, 0xbd, 0xb6, 0xb6, 0x42, 0x3d, 0x37, 0xa4, 0x3f, 0xbd, 0xfc, 0x30,
+ 0xb2, 0xbb, 0x2b, 0xa7, 0xb7, 0x3c, 0x77, 0xf6, 0x2e, 0x3d, 0x4e, 0x18, 0x6c,
+ 0x3d, 0xb0, 0xb9, 0xe4, 0x3c, 0xa6, 0xce, 0x89, 0xbd, 0x18, 0x9a, 0xc2, 0x3c,
+ 0x8d, 0xdc, 0x51, 0xbd, 0x50, 0x09, 0x0a, 0x3d, 0xd8, 0x90, 0x6c, 0xbc, 0x28,
+ 0x48, 0x96, 0xbc, 0x50, 0x5f, 0x62, 0xbc, 0x8b, 0xbc, 0x82, 0xbd, 0xb0, 0x24,
+ 0xce, 0x3b, 0x54, 0xb0, 0x4b, 0x3c, 0xd8, 0x02, 0x59, 0x3c, 0x0b, 0x7d, 0xa0,
+ 0x3c, 0x2a, 0x6f, 0xfa, 0xbc, 0x51, 0xf4, 0x0a, 0xbd, 0xe5, 0xdd, 0x45, 0x3d,
+ 0x69, 0xcb, 0x5f, 0x3d, 0x59, 0xee, 0x1b, 0x3d, 0x15, 0x0c, 0x6d, 0x3d, 0xb4,
+ 0xe8, 0x3a, 0x3c, 0xd6, 0x4c, 0x71, 0x3d, 0x2c, 0x6c, 0x5f, 0xbc, 0x23, 0xc7,
+ 0x96, 0x3c, 0x90, 0xfd, 0xef, 0xb9, 0x80, 0x9a, 0xce, 0xbc, 0xc8, 0xa7, 0xfa,
+ 0xbc, 0x3f, 0x84, 0x4d, 0xbc, 0xb9, 0x1e, 0x63, 0x3d, 0x91, 0xff, 0x16, 0xbd,
+ 0xe4, 0x6d, 0x65, 0xbc, 0xbb, 0x19, 0x69, 0xbc, 0xf0, 0xba, 0xfe, 0xbc, 0xbb,
+ 0xe6, 0x30, 0x3d, 0x12, 0x3a, 0x4d, 0x3d, 0x08, 0xa7, 0x79, 0x3d, 0x37, 0x6c,
+ 0x88, 0x3d, 0xb4, 0x66, 0xf1, 0xba, 0xb8, 0x48, 0xcc, 0xbc, 0x61, 0xb9, 0x1d,
+ 0xbd, 0x8a, 0x51, 0x45, 0xbd, 0x2e, 0x8a, 0x59, 0x3d, 0x88, 0xe0, 0x7d, 0xbd,
+ 0x53, 0xc6, 0x8e, 0xbd, 0x0e, 0x7b, 0x5a, 0x3d, 0x13, 0xc2, 0xcb, 0xbc, 0x57,
+ 0xcd, 0x8b, 0xbd, 0x60, 0x8c, 0x4e, 0xbd, 0xe2, 0x03, 0x07, 0x3d, 0x5f, 0x0d,
+ 0x80, 0x3c, 0x5f, 0xc8, 0x3d, 0x3d, 0x89, 0x06, 0xc8, 0x3c, 0x17, 0x2b, 0x88,
+ 0x3d, 0xf6, 0x31, 0x63, 0x3d, 0x51, 0x2b, 0x60, 0xbd, 0xc9, 0x26, 0x67, 0xbd,
+ 0x02, 0x8e, 0x4f, 0xbd, 0xbd, 0x67, 0x20, 0x3d, 0x53, 0xfa, 0x64, 0xbb, 0x27,
+ 0x16, 0x28, 0xbd, 0x45, 0x52, 0xfb, 0xbb, 0x66, 0x53, 0x8d, 0x3c, 0x0c, 0x18,
+ 0x74, 0xbc, 0x60, 0x98, 0x19, 0x3d, 0xd2, 0x7c, 0x3c, 0x3d, 0x77, 0x65, 0x90,
+ 0xbc, 0x69, 0x1e, 0x3e, 0xbd, 0x04, 0x22, 0x7f, 0xbc, 0x7c, 0x5d, 0x2c, 0xbc,
+ 0x51, 0xb3, 0x1f, 0xbc, 0xc4, 0xaf, 0xbf, 0xbc, 0xa8, 0xc5, 0x59, 0x3c, 0xfe,
+ 0x08, 0x62, 0x3d, 0x7c, 0x3a, 0x56, 0x3d, 0x4a, 0xaf, 0x38, 0x3d, 0xd9, 0x9e,
+ 0x26, 0xbd, 0x48, 0xc2, 0x16, 0xbc, 0x6e, 0xcc, 0xec, 0xbc, 0x05, 0x78, 0x0e,
+ 0xbc, 0xd2, 0x5c, 0x51, 0xbd, 0x44, 0x63, 0x6b, 0x3d, 0x7c, 0xfd, 0xca, 0xbb,
+ 0x62, 0xda, 0x30, 0x3c, 0xc4, 0xcc, 0x61, 0x3d, 0xdc, 0xa6, 0x34, 0xbd, 0xff,
+ 0x8f, 0x24, 0xbc, 0x68, 0x37, 0xf6, 0xbc, 0xd1, 0x4d, 0x25, 0xbd, 0x33, 0x6e,
+ 0x91, 0x3c, 0x60, 0x57, 0x6b, 0x3d, 0x04, 0xf7, 0x34, 0xbd, 0x90, 0xe7, 0x30,
+ 0x3d, 0x8e, 0x22, 0x65, 0xbd, 0x62, 0xcf, 0xb6, 0x3c, 0xce, 0x5d, 0x9f, 0x3c,
+ 0xa0, 0x0a, 0x43, 0xbd, 0x1e, 0x7b, 0x56, 0xbd, 0x1f, 0x6a, 0x93, 0xbd, 0x60,
+ 0x5e, 0x39, 0x3d, 0x4d, 0x17, 0x8e, 0xbd, 0x28, 0x00, 0xad, 0x3c, 0x79, 0xd0,
+ 0xab, 0xbb, 0x15, 0xf3, 0x1a, 0xbd, 0x28, 0x13, 0x05, 0x3c, 0x90, 0x55, 0x20,
+ 0x3d, 0x98, 0x9b, 0xc4, 0x3c, 0x32, 0x5f, 0x86, 0xbd, 0x6d, 0xf8, 0x52, 0xbd,
+ 0xcc, 0x28, 0xae, 0x3c, 0x96, 0xc7, 0x81, 0x3d, 0x04, 0x2e, 0x5b, 0xbc, 0xdd,
+ 0xce, 0xb2, 0x3c, 0x14, 0x5d, 0x67, 0x3d, 0x74, 0xe8, 0x77, 0x3d, 0x2e, 0xf5,
+ 0x51, 0x3d, 0x21, 0x78, 0x7a, 0xbd, 0x62, 0xea, 0x6a, 0xbd, 0x36, 0x1c, 0xf4,
+ 0xbc, 0xd0, 0x98, 0xda, 0x3b, 0x26, 0x14, 0x8a, 0xbd, 0xf2, 0xa4, 0x67, 0xbd,
+ 0xb2, 0xa7, 0x39, 0xbd, 0x93, 0xa6, 0xd6, 0x3c, 0xe1, 0xa9, 0xe4, 0x3b, 0x49,
+ 0xca, 0x3f, 0x3d, 0x07, 0xe3, 0x64, 0x3d, 0x1e, 0xf5, 0x4d, 0xbd, 0x4e, 0xc3,
+ 0x8a, 0xbd, 0x88, 0xf9, 0xf8, 0x3c, 0xc6, 0x2a, 0xba, 0xbc, 0x56, 0xd7, 0xb1,
+ 0xbc, 0xbd, 0xff, 0x10, 0x3c, 0xfe, 0x3d, 0x16, 0xbd, 0x88, 0xdd, 0x5f, 0x3c,
+ 0x66, 0xd4, 0x50, 0xbd, 0xe2, 0x59, 0x62, 0x3d, 0x1c, 0xdf, 0xac, 0x3c, 0xc2,
+ 0x72, 0xb7, 0xbc, 0xe2, 0x19, 0x4d, 0xbd, 0xc1, 0xbb, 0xa1, 0x3c, 0xf2, 0x8f,
+ 0x24, 0x3d, 0x2f, 0xb1, 0xeb, 0xbc, 0xa7, 0xe6, 0x13, 0xbd, 0x4c, 0x51, 0x7c,
+ 0xbd, 0x23, 0x87, 0x3e, 0xbd, 0x65, 0x03, 0x86, 0x3b, 0x5d, 0x13, 0x15, 0x3d,
+ 0x44, 0x77, 0x96, 0xba, 0xe9, 0x74, 0x0a, 0x3d, 0xb4, 0xd0, 0x59, 0xbd, 0x4c,
+ 0x9a, 0x22, 0x3d, 0x82, 0x1b, 0x85, 0x3d, 0x09, 0x1e, 0xf9, 0x3c, 0x20, 0xcf,
+ 0x97, 0xbd, 0xf9, 0x46, 0x0e, 0xbd, 0xba, 0x0d, 0x82, 0x3d, 0xf6, 0xf1, 0xd7,
+ 0x3c, 0x8e, 0x08, 0xf8, 0xbc, 0x4d, 0xbf, 0x22, 0xbd, 0xd0, 0x25, 0x8a, 0x3c,
+ 0xa8, 0x71, 0x2e, 0xbd, 0xd9, 0xaa, 0x24, 0x3a, 0x48, 0x85, 0x6c, 0xbd, 0x90,
+ 0x0e, 0x8c, 0x3c, 0x3c, 0x45, 0x50, 0x3d, 0x71, 0xab, 0x65, 0x3d, 0x60, 0x38,
+ 0xdb, 0x3b, 0x9b, 0x94, 0x81, 0xbd, 0xc0, 0xaa, 0xb3, 0xbc, 0xc8, 0x46, 0x93,
+ 0xbc, 0x3a, 0x19, 0xea, 0xbc, 0x16, 0xab, 0x36, 0xbc, 0x20, 0x52, 0x74, 0xbd,
+ 0xbd, 0x3b, 0x75, 0x3d, 0xea, 0xef, 0xc3, 0xbc, 0x54, 0xbe, 0x26, 0xbd, 0x88,
+ 0x03, 0x6c, 0x3d, 0xa0, 0x3e, 0x4a, 0x3d, 0x46, 0x60, 0x0a, 0x3d, 0xf9, 0x88,
+ 0x59, 0x3d, 0xa2, 0x8a, 0x87, 0xbd, 0xde, 0x60, 0x48, 0x3d, 0xc6, 0x87, 0x60,
+ 0x3d, 0x05, 0x18, 0x3d, 0xbc, 0xa8, 0x15, 0x01, 0x3d, 0x68, 0x46, 0x41, 0xbd,
+ 0x7f, 0x8e, 0x58, 0x3d, 0xc6, 0xa4, 0xf6, 0x3c, 0x22, 0xbc, 0x73, 0x3d, 0xe8,
+ 0x2d, 0x83, 0x3c, 0x97, 0x7f, 0x8b, 0xbb, 0xe6, 0x83, 0x81, 0xbc, 0x42, 0x79,
+ 0x5b, 0x3d, 0x62, 0xfb, 0xd4, 0x3b, 0xf3, 0x51, 0x06, 0xbd, 0xb0, 0x65, 0x79,
+ 0x3d, 0xbc, 0x83, 0xdc, 0x3c, 0xbe, 0xbd, 0x8c, 0x3d, 0x64, 0xdf, 0x13, 0x3d,
+ 0x1f, 0xa8, 0x44, 0xbd, 0x1e, 0x7f, 0x87, 0xbc, 0x15, 0x05, 0x6c, 0xbd, 0x43,
+ 0x6b, 0x75, 0xbd, 0x38, 0x5a, 0x64, 0x3d, 0xb8, 0x35, 0x2c, 0x3c, 0x93, 0x41,
+ 0xd5, 0xb9, 0xf4, 0x66, 0x79, 0xbc, 0xd9, 0xda, 0xae, 0xbc, 0xd6, 0x82, 0xd4,
+ 0x3b, 0x48, 0x9e, 0x3e, 0xbd, 0x0c, 0x2c, 0xb7, 0xbc, 0xba, 0x9c, 0x2f, 0xbd,
+ 0x9c, 0x53, 0x4f, 0x3d, 0xf5, 0x5f, 0xe6, 0x3c, 0x60, 0x8e, 0x1f, 0x3b, 0xa6,
+ 0x27, 0x4a, 0xbd, 0xe5, 0x82, 0x9b, 0x3c, 0xb7, 0xe1, 0x84, 0x3d, 0x13, 0x34,
+ 0x34, 0xbc, 0x58, 0xca, 0x09, 0x3d, 0xe2, 0x9f, 0x70, 0x3d, 0x7b, 0x73, 0xa1,
+ 0xbc, 0xdb, 0x26, 0x08, 0xbd, 0xc0, 0x46, 0xce, 0xba, 0xfc, 0xde, 0xe1, 0x3c,
+ 0xf5, 0xd5, 0xbc, 0x3c, 0x03, 0x9b, 0x16, 0x3d, 0x61, 0xda, 0x16, 0xbd, 0x9c,
+ 0x34, 0x15, 0xbd, 0x6c, 0xae, 0x50, 0xbd, 0xc0, 0x47, 0x89, 0xbd, 0xf0, 0xff,
+ 0x52, 0x3d, 0xa2, 0xf2, 0x01, 0x3d, 0x7c, 0x68, 0x1a, 0x3d, 0x70, 0x77, 0x58,
+ 0xbd, 0x62, 0xb8, 0xb3, 0x3c, 0xd8, 0x2e, 0x07, 0xbc, 0xe6, 0x32, 0x8b, 0x3d,
+ 0x6b, 0xa2, 0x53, 0x3d, 0x12, 0xfa, 0x55, 0xbd, 0x7d, 0x83, 0x28, 0x3d, 0x92,
+ 0xa8, 0x73, 0xbd, 0xd5, 0xd5, 0x9c, 0x3c, 0xe5, 0x93, 0x83, 0x3c, 0xf9, 0xc8,
+ 0xb3, 0xbc, 0xfb, 0x27, 0x78, 0xbd, 0xa6, 0x7d, 0x5b, 0x3d, 0x9c, 0x51, 0x4d,
+ 0x3d, 0x25, 0x60, 0x4b, 0x3d, 0xba, 0x91, 0x96, 0xb9, 0xd7, 0xaf, 0xc3, 0x3c,
+ 0x34, 0x25, 0x3c, 0x3d, 0x3a, 0x04, 0x3a, 0x3d, 0x86, 0xb2, 0x30, 0x3c, 0x90,
+ 0xcf, 0x46, 0x3d, 0x96, 0xee, 0xe2, 0xbc, 0x9c, 0x30, 0xa7, 0x3c, 0x56, 0xe3,
+ 0x5a, 0xbd, 0x2f, 0xb6, 0x23, 0x3d, 0xda, 0x3e, 0x3c, 0xbd, 0x6e, 0xa0, 0x5c,
+ 0x3d, 0x28, 0xe0, 0x6e, 0xbd, 0x1a, 0x52, 0x34, 0x3d, 0xb8, 0xcd, 0x27, 0xbc,
+ 0x4a, 0xb4, 0x22, 0x3d, 0x1c, 0xd7, 0x64, 0xbc, 0x8f, 0xd9, 0x1d, 0xbd, 0xa2,
+ 0x1e, 0x17, 0x3d, 0x78, 0xed, 0xe2, 0x3c, 0x82, 0x5e, 0x0d, 0x3c, 0x93, 0x9d,
+ 0x58, 0xbd, 0x35, 0x43, 0x8a, 0xbd, 0xbd, 0xa6, 0xdf, 0x3c, 0x11, 0xc3, 0x3b,
+ 0x3d, 0x6c, 0xad, 0x58, 0xbd, 0x2e, 0x39, 0x1f, 0x3d, 0x45, 0x7d, 0x00, 0x3a,
+ 0xa9, 0xb2, 0x5b, 0x3d, 0x00, 0x38, 0x81, 0x38, 0xaa, 0x9f, 0xc9, 0x3a, 0xaa,
+ 0x79, 0x73, 0xbd, 0x39, 0x7b, 0xf7, 0x3b, 0xc4, 0x9f, 0x4e, 0xbd, 0xa1, 0x0c,
+ 0x64, 0x3a, 0x9b, 0x06, 0x5f, 0xbd, 0x32, 0x21, 0x6d, 0xbd, 0xbe, 0x94, 0x4e,
+ 0x3d, 0x7c, 0x40, 0xf9, 0x3c, 0xc8, 0xac, 0xca, 0x3c, 0x30, 0x76, 0x50, 0xbd,
+ 0x08, 0x66, 0x93, 0xbd, 0x0b, 0x4c, 0xb9, 0x3c, 0x8e, 0xef, 0x26, 0x3d, 0xe3,
+ 0x00, 0x68, 0x3d, 0x51, 0x3a, 0x84, 0xbd, 0x54, 0xac, 0xb3, 0xbc, 0x95, 0x17,
+ 0x91, 0xbd, 0x04, 0xf2, 0x31, 0x3d, 0x48, 0xbb, 0x20, 0x3c, 0xf3, 0x82, 0x88,
+ 0xbd, 0xdd, 0x5e, 0x4e, 0xbd, 0x95, 0x9e, 0x45, 0xbd, 0x62, 0xce, 0x51, 0xbd,
+ 0xa3, 0x8b, 0x3b, 0x3d, 0x40, 0xdb, 0x85, 0x3d, 0x33, 0xdc, 0xc1, 0xbc, 0xa7,
+ 0xb6, 0x7d, 0xbd, 0xd3, 0x99, 0x40, 0xbc, 0x6b, 0x63, 0x18, 0x3d, 0x73, 0x2f,
+ 0x63, 0xbc, 0xf8, 0xa2, 0x4a, 0xbc, 0xa5, 0x0b, 0x76, 0x3d, 0xd5, 0x88, 0x79,
+ 0x3d, 0x97, 0x41, 0x98, 0x3c, 0xe8, 0x20, 0x16, 0x3d, 0xcc, 0x47, 0x78, 0xbd,
+ 0xfd, 0x9a, 0xae, 0x3c, 0xf2, 0xe2, 0x8a, 0xbd, 0x07, 0xd1, 0x19, 0x3d, 0xd4,
+ 0xef, 0x68, 0xbc, 0x82, 0x5d, 0x51, 0x3d, 0x0c, 0x61, 0xc8, 0xba, 0xc1, 0xd5,
+ 0x36, 0xbd, 0xf2, 0x3c, 0x1d, 0x3d, 0x86, 0xdf, 0x65, 0x3d, 0x04, 0x4c, 0x87,
+ 0x3d, 0xe9, 0x46, 0x91, 0x3d, 0xc0, 0x63, 0x33, 0xbc, 0x7c, 0xd0, 0xbf, 0x3c,
+ 0xe8, 0xfe, 0x55, 0xbd, 0x18, 0x50, 0x53, 0x3c, 0x51, 0x99, 0xb0, 0xbb, 0x50,
+ 0x90, 0xec, 0x3b, 0x3d, 0x3a, 0x69, 0xbd, 0x6e, 0x49, 0x09, 0xbc, 0x74, 0x12,
+ 0xde, 0xbc, 0xad, 0x0c, 0x87, 0x3c, 0x35, 0x8f, 0x41, 0x3d, 0x5e, 0xa8, 0x3b,
+ 0xbd, 0x28, 0x85, 0x61, 0x3d, 0xfe, 0xb2, 0xe1, 0x3b, 0xec, 0xbb, 0x0e, 0x3d,
+ 0x04, 0xe3, 0x05, 0x3d, 0x10, 0xeb, 0x07, 0xbd, 0x63, 0x3a, 0x68, 0x3d, 0x55,
+ 0x9c, 0x49, 0x3b, 0x58, 0xdc, 0x62, 0x3d, 0x33, 0x78, 0x03, 0x3d, 0x0f, 0xc8,
+ 0x7a, 0xbd, 0xa3, 0x94, 0x83, 0xbd, 0xf7, 0x86, 0x5d, 0xbd, 0xcb, 0xd6, 0x82,
+ 0x3d, 0xcb, 0x78, 0x82, 0xbd, 0xcb, 0x8b, 0x46, 0xbc, 0x44, 0xff, 0x75, 0xbd,
+ 0x63, 0xc6, 0x48, 0x3d, 0x50, 0x1b, 0x14, 0xbc, 0x57, 0xd1, 0xe1, 0x3c, 0x60,
+ 0xa8, 0xe2, 0x3c, 0x00, 0xa0, 0xf8, 0xb9, 0x9c, 0x9f, 0x24, 0x3d, 0x10, 0x2c,
+ 0x4a, 0x3c, 0x90, 0xdf, 0xbc, 0xbc, 0x9e, 0xae, 0xa4, 0xbc, 0xf7, 0x31, 0x66,
+ 0xbd, 0x1e, 0x83, 0x14, 0x3c, 0x9b, 0xaa, 0x91, 0x3b, 0x91, 0x24, 0x11, 0xbd,
+ 0x54, 0x0b, 0x90, 0x3b, 0x30, 0xa4, 0x64, 0x3d, 0x69, 0xa8, 0x81, 0x3d, 0x5e,
+ 0x35, 0x03, 0xbb, 0xcc, 0xce, 0xa6, 0x3c, 0x2f, 0x18, 0xfd, 0xbc, 0x50, 0x81,
+ 0xe2, 0xbb, 0x40, 0x4b, 0x16, 0x3d, 0xc0, 0x66, 0x63, 0xbd, 0x5f, 0xcd, 0x9b,
+ 0xbc, 0x2f, 0xf8, 0x25, 0xbd, 0xa0, 0x4d, 0x7a, 0x3c, 0x81, 0x0c, 0x5a, 0xbd,
+ 0x54, 0xa9, 0x6a, 0x3d, 0xc0, 0x3b, 0x3c, 0xbd, 0xb4, 0x63, 0xfb, 0x3c, 0x26,
+ 0x9c, 0x11, 0x3d, 0x06, 0xea, 0xa3, 0xbc, 0x3f, 0x44, 0x92, 0xbc, 0x00, 0x88,
+ 0x6f, 0x3b, 0xd8, 0x6f, 0x36, 0xbd, 0xe0, 0xad, 0x89, 0x3d, 0x52, 0xfb, 0x72,
+ 0x3d, 0x64, 0x05, 0x64, 0xbc, 0xd7, 0x2a, 0x57, 0xbd, 0x02, 0x49, 0xad, 0xbc,
+ 0x38, 0xf1, 0x2d, 0xbd, 0x8a, 0x2e, 0x8b, 0x3d, 0x39, 0x44, 0x12, 0xbd, 0xfc,
+ 0xa0, 0xb8, 0xbc, 0x32, 0x17, 0x8a, 0xbd, 0x7e, 0xbf, 0x6b, 0x3d, 0x32, 0x76,
+ 0xad, 0xbc, 0xb0, 0x21, 0x58, 0x3d, 0x62, 0xf5, 0x59, 0x3d, 0xb3, 0x5f, 0x98,
+ 0x3c, 0xa4, 0x02, 0x2c, 0x3b, 0x59, 0x69, 0x97, 0xbd, 0x70, 0xcf, 0x91, 0x3b,
+ 0x6b, 0xc3, 0x47, 0xbd, 0x10, 0xfe, 0xd4, 0xbc, 0x08, 0x93, 0xd1, 0x3b, 0xf5,
+ 0xe9, 0x14, 0xbd, 0x9a, 0x9c, 0x7b, 0x3d, 0x15, 0x75, 0x54, 0x3d, 0x09, 0xbf,
+ 0x57, 0xbc, 0xbf, 0x09, 0x29, 0xbb, 0xf5, 0x6d, 0x91, 0xbd, 0xb8, 0x41, 0xbd,
+ 0x3c, 0x80, 0x60, 0x6e, 0x3c, 0xab, 0xf2, 0x4f, 0xbd, 0x81, 0x36, 0x79, 0x3d,
+ 0x6a, 0x5a, 0x85, 0xbd, 0xf2, 0xac, 0x36, 0x3d, 0x92, 0x7c, 0xc0, 0xbc, 0x00,
+ 0x12, 0x06, 0x3c, 0xfe, 0x9c, 0x66, 0x3d, 0xa0, 0xf3, 0xbb, 0xbb, 0x37, 0xb0,
+ 0x74, 0xbd, 0x18, 0xb1, 0x10, 0xbd, 0x82, 0xd7, 0xe2, 0xbc, 0x87, 0xee, 0x14,
+ 0x3d, 0xe9, 0x2a, 0x40, 0xbd, 0xe3, 0x0d, 0x53, 0x3c, 0x5c, 0x02, 0x93, 0x3c,
+ 0x25, 0x0f, 0x49, 0xbd, 0x88, 0xd8, 0x3f, 0x3d, 0x58, 0xf0, 0x39, 0xbd, 0xe3,
+ 0x0a, 0x3b, 0xbd, 0xeb, 0x61, 0x01, 0x3d, 0xb4, 0xa0, 0x6b, 0xbd, 0x1d, 0x4b,
+ 0x90, 0xbd, 0xb2, 0x31, 0x34, 0xbd, 0xaa, 0x20, 0xad, 0x3a, 0xd5, 0x1e, 0x3a,
+ 0xbd, 0xf4, 0x05, 0x38, 0x3d, 0x1b, 0xb2, 0x46, 0xbc, 0x2c, 0xd7, 0x3e, 0x3d,
+ 0xec, 0x98, 0xc7, 0x3c, 0xe7, 0xd3, 0x21, 0xbd, 0x07, 0x35, 0x60, 0xbd, 0x2b,
+ 0xb9, 0xfd, 0xbc, 0x9b, 0x69, 0x36, 0x3d, 0xdf, 0xdf, 0x6f, 0xbd, 0x5a, 0x80,
+ 0x81, 0xbd, 0x9b, 0x67, 0xf2, 0x3b, 0x20, 0x94, 0xde, 0xbb, 0xc5, 0xfc, 0x29,
+ 0xbd, 0x0c, 0x34, 0x30, 0xbd, 0x50, 0xbb, 0xc9, 0xbc, 0x92, 0x32, 0x93, 0xbc,
+ 0x12, 0xf9, 0x69, 0xbd, 0x1c, 0x84, 0x3a, 0xbc, 0x88, 0x93, 0x84, 0xbd, 0x07,
+ 0x7e, 0xb5, 0x3c, 0xe6, 0xb8, 0x4a, 0x3d, 0xde, 0x7c, 0x55, 0x3d, 0x16, 0x69,
+ 0xf0, 0xbc, 0x91, 0x57, 0x5b, 0xbd, 0xa2, 0x4a, 0x26, 0x3d, 0x5b, 0xdc, 0xaf,
+ 0xba, 0xe8, 0x30, 0xe1, 0xbc, 0xf8, 0x97, 0x21, 0x3d, 0x00, 0x3e, 0x11, 0x3c,
+ 0x92, 0x1c, 0xb1, 0xbc, 0xce, 0x5f, 0xa3, 0x3c, 0x2d, 0x13, 0x88, 0xbd, 0xbc,
+ 0x64, 0xbc, 0x3c, 0xd1, 0x47, 0x97, 0xbb, 0xf2, 0x46, 0x55, 0x3d, 0x70, 0x6e,
+ 0x09, 0x3d, 0x6b, 0x66, 0x93, 0xbd, 0x26, 0xf4, 0xcb, 0xbc, 0x59, 0xb5, 0x84,
+ 0xbc, 0x13, 0x19, 0x8d, 0x3d, 0x35, 0xf3, 0x3e, 0xbc, 0x9d, 0xf8, 0x78, 0x3d,
+ 0x75, 0x6d, 0x4f, 0x3d, 0xd4, 0x8a, 0xd7, 0x3c, 0x74, 0x49, 0x0d, 0xbd, 0x40,
+ 0x3d, 0xcd, 0x3a, 0xa2, 0xb6, 0x64, 0x3d, 0x73, 0xc5, 0x90, 0x3d, 0x5b, 0x4e,
+ 0x85, 0xbd, 0xf6, 0x1b, 0x64, 0x3d, 0x15, 0x44, 0xbf, 0xbc, 0x4c, 0xb6, 0x0e,
+ 0x3d, 0xaf, 0x91, 0x06, 0xbc, 0xa0, 0xc6, 0xdf, 0x3c, 0xb7, 0xb5, 0x66, 0x3d,
+ 0x23, 0x0d, 0x68, 0xbd, 0xcf, 0x9f, 0xe9, 0xbc, 0xcd, 0xa5, 0x1f, 0xbd, 0x92,
+ 0x3c, 0x5b, 0x3d, 0x0c, 0x92, 0x57, 0x3d, 0x73, 0xa2, 0x2e, 0xbd, 0x4a, 0xeb,
+ 0x23, 0xbc, 0x6b, 0xa1, 0x3c, 0xba, 0xd2, 0x19, 0xbb, 0xbc, 0x44, 0x55, 0x29,
+ 0xbd, 0xcd, 0x07, 0x34, 0xbd, 0xbf, 0xaa, 0xf9, 0xba, 0x18, 0x7b, 0x8a, 0xbc,
+ 0x4a, 0xe1, 0x5d, 0x3d, 0x28, 0x1b, 0x38, 0x3c, 0xfd, 0x1b, 0xd0, 0x3b, 0xdd,
+ 0x1c, 0x92, 0xbb, 0xf4, 0x64, 0x31, 0x3c, 0x82, 0x22, 0x44, 0x3d, 0x22, 0xd5,
+ 0x0c, 0xbd, 0x63, 0x1f, 0x24, 0xbd, 0xd0, 0xe3, 0x03, 0x3c, 0xfc, 0x32, 0x22,
+ 0xbc, 0x26, 0x4e, 0xba, 0xbc, 0xf2, 0x18, 0xa8, 0xbc, 0x1d, 0xb1, 0x43, 0xbc,
+ 0x4b, 0x52, 0x17, 0xbd, 0xe1, 0xf7, 0x05, 0x3d, 0xdb, 0xfb, 0xd9, 0x3c, 0x0b,
+ 0x58, 0x8e, 0xbc, 0xc1, 0x1f, 0x81, 0x3d, 0xa0, 0x6f, 0x36, 0xbd, 0x52, 0xec,
+ 0x57, 0xbd, 0x6a, 0x3b, 0x06, 0xbd, 0xb5, 0x5b, 0x9c, 0xbc, 0x08, 0xb1, 0x32,
+ 0xbc, 0xc0, 0xde, 0x85, 0xbd, 0x2d, 0xd5, 0xd2, 0x3c, 0xa6, 0x1d, 0x14, 0xbc,
+ 0x8d, 0x5e, 0xd8, 0x3c, 0x83, 0x8e, 0xcf, 0xbc, 0xa0, 0xc2, 0x83, 0xbd, 0xce,
+ 0x5f, 0x3b, 0xbd, 0x60, 0xbc, 0x7d, 0xbc, 0x8e, 0x9c, 0x7f, 0xbd, 0xb3, 0x61,
+ 0x0b, 0xbd, 0x1c, 0x2b, 0xc9, 0x3c, 0xbc, 0xb7, 0x6f, 0x3c, 0x61, 0x58, 0xda,
+ 0xbc, 0xcc, 0x72, 0x23, 0x3c, 0x28, 0x64, 0x61, 0x3c, 0x5a, 0x19, 0x42, 0x3d,
+ 0xb0, 0x39, 0x13, 0x3c, 0xe6, 0x3a, 0xf7, 0xbc, 0xc4, 0xaf, 0xc4, 0x3c, 0xd2,
+ 0x14, 0xd0, 0xbc, 0x1a, 0x00, 0xb8, 0xbc, 0xf9, 0x9e, 0x23, 0xbd, 0xdf, 0x82,
+ 0x6a, 0xbd, 0x7a, 0xc2, 0x18, 0xbc, 0xbf, 0xb0, 0x11, 0xbc, 0x2d, 0x48, 0x5b,
+ 0xbd, 0xff, 0xff, 0x46, 0x3c, 0x6c, 0x6c, 0x36, 0x3c, 0xec, 0x21, 0x8a, 0xbd,
+ 0x02, 0x85, 0xe0, 0x3c, 0xdf, 0x2e, 0x42, 0xbd, 0xf0, 0xa5, 0x24, 0x3d, 0x0a,
+ 0xd1, 0x00, 0x3d, 0x58, 0x44, 0xb3, 0x3c, 0xc9, 0xe4, 0x33, 0x39, 0xba, 0x0f,
+ 0xb9, 0xbc, 0xba, 0x18, 0x64, 0x3c, 0x9e, 0xc4, 0x50, 0xbc, 0x5f, 0x96, 0x4c,
+ 0x3d, 0xbc, 0xdc, 0x61, 0x3d, 0xba, 0xaf, 0x38, 0x3d, 0xf1, 0x21, 0x89, 0x3d,
+ 0x60, 0x95, 0x05, 0x3c, 0xc6, 0xb2, 0x6e, 0xbc, 0x5f, 0x2d, 0x21, 0xbd, 0xee,
+ 0x52, 0x23, 0x3d, 0x3c, 0xc0, 0x1d, 0xbc, 0x3e, 0xcd, 0x84, 0x3d, 0x00, 0xc5,
+ 0xa8, 0x39, 0x06, 0x5b, 0x4a, 0xbd, 0xec, 0x4b, 0x1b, 0xbd, 0x05, 0x4c, 0x17,
+ 0xbd, 0x18, 0x01, 0x56, 0x3c, 0xcd, 0x05, 0x87, 0xbd, 0xe4, 0x37, 0x41, 0xbc,
+ 0xdc, 0x36, 0x84, 0x3d, 0xa1, 0xd7, 0x09, 0x3d, 0x44, 0xf4, 0x63, 0xbd, 0x56,
+ 0x62, 0x78, 0xbd, 0x12, 0x57, 0x3b, 0xbd, 0x43, 0xcd, 0x71, 0xbb, 0xa3, 0xf6,
+ 0x10, 0x3d, 0x3a, 0x9f, 0xff, 0xbc, 0x6f, 0xdd, 0x8d, 0x3d, 0xb3, 0xd7, 0x08,
+ 0xbd, 0x3e, 0x97, 0x76, 0x3d, 0x99, 0x60, 0x02, 0xbd, 0x08, 0x27, 0x8d, 0x3d,
+ 0xf1, 0x51, 0x29, 0x3d, 0x48, 0x9d, 0xfe, 0x3c, 0x97, 0xb9, 0x72, 0xbd, 0x35,
+ 0x21, 0xab, 0xbc, 0xc3, 0x96, 0x69, 0x3c, 0x05, 0x44, 0x05, 0x3d, 0x80, 0x79,
+ 0x75, 0x3a, 0x94, 0x62, 0xfe, 0x3b, 0x47, 0xb4, 0x64, 0x3c, 0xbb, 0x50, 0x29,
+ 0xbd, 0xe9, 0xb8, 0x6e, 0xbd, 0x2e, 0xab, 0x26, 0xbc, 0x54, 0x42, 0xb6, 0xbc,
+ 0x08, 0xdb, 0x22, 0xbd, 0xae, 0x42, 0x78, 0x3d, 0x3c, 0xba, 0x2c, 0xbc, 0x46,
+ 0xf1, 0x6e, 0x3d, 0xed, 0xb1, 0x88, 0xbd, 0x96, 0x2c, 0x75, 0x3d, 0x26, 0x69,
+ 0x90, 0xbd, 0x9b, 0x7b, 0x77, 0xbc, 0x9a, 0xbc, 0x05, 0xbd, 0x85, 0xb1, 0x19,
+ 0xbd, 0xb8, 0x33, 0x8b, 0xbd, 0xfa, 0xa3, 0x8b, 0xbc, 0xc6, 0x36, 0xf2, 0x3c,
+ 0x4e, 0x81, 0xa2, 0xbc, 0xa7, 0x85, 0x73, 0xbd, 0xca, 0xe5, 0x93, 0xbc, 0xc8,
+ 0x3d, 0x0e, 0x3d, 0x75, 0x3c, 0x00, 0xbd, 0x28, 0x32, 0x0e, 0x3d, 0x8f, 0x29,
+ 0x04, 0xbc, 0x0c, 0x29, 0x37, 0xbd, 0x47, 0x11, 0x83, 0xbd, 0x82, 0x57, 0x2a,
+ 0xbd, 0x45, 0x1f, 0x6b, 0xbc, 0x66, 0xaf, 0x7d, 0xbd, 0xa8, 0x5a, 0x25, 0xbd,
+ 0x96, 0xc0, 0x14, 0x3b, 0xba, 0xf0, 0x1b, 0xbd, 0xe0, 0x71, 0x44, 0xbb, 0x9c,
+ 0x09, 0xb9, 0xbc, 0x45, 0xda, 0x77, 0x3c, 0x2b, 0x5d, 0x80, 0x3d, 0xaa, 0xf0,
+ 0x21, 0x3d, 0xa0, 0x25, 0x31, 0x3d, 0x34, 0xc8, 0x3b, 0xbd, 0x90, 0x50, 0xf6,
+ 0xbc, 0x53, 0xed, 0x04, 0x3a, 0x26, 0xf8, 0x6e, 0x3d, 0x6d, 0x73, 0x0f, 0x3d,
+ 0xe8, 0xac, 0x43, 0x3d, 0xf1, 0x03, 0x8a, 0x3c, 0xc4, 0x94, 0x3d, 0x3d, 0x3c,
+ 0x89, 0x8b, 0x3d, 0x62, 0x99, 0x0f, 0x3d, 0xb6, 0x30, 0x8d, 0x3c, 0xfa, 0x8f,
+ 0x25, 0x3c, 0x4c, 0x45, 0xd2, 0xbc, 0x00, 0x5d, 0xc0, 0x3c, 0xae, 0x8d, 0x6c,
+ 0xbd, 0xcb, 0xa3, 0x92, 0xbd, 0xc4, 0x1e, 0xbb, 0xbc, 0x63, 0xf8, 0xaa, 0x3c,
+ 0xd7, 0x7c, 0x81, 0x3d, 0xbf, 0x33, 0x41, 0x3c, 0x80, 0x59, 0x69, 0xbb, 0x0a,
+ 0x75, 0x37, 0xbd, 0x29, 0xdc, 0x1b, 0xbd, 0x10, 0x1f, 0x46, 0xbd, 0xee, 0xb4,
+ 0x5d, 0x3d, 0xfa, 0x40, 0x95, 0xbd, 0x02, 0xd8, 0x19, 0xbd, 0xa8, 0xd0, 0xf0,
+ 0xbc, 0x0a, 0xb8, 0xc4, 0x3c, 0x68, 0xa8, 0x11, 0xbd, 0x24, 0x4f, 0x3e, 0x3d,
+ 0x39, 0x99, 0x90, 0xbd, 0x7c, 0x43, 0x13, 0xbd, 0x86, 0xe5, 0x8f, 0xbd, 0xa4,
+ 0x16, 0xb4, 0xbc, 0xa0, 0xe9, 0xf2, 0x3c, 0x91, 0x68, 0x5d, 0xbd, 0x51, 0x92,
+ 0x85, 0x3d, 0xd2, 0x4d, 0x35, 0xbd, 0xc7, 0x44, 0x3e, 0xbd, 0x20, 0xf6, 0xe0,
+ 0x3c, 0x6b, 0x38, 0x35, 0x3d, 0xd2, 0x2b, 0x2a, 0xbb, 0xc8, 0xbf, 0x0c, 0xbd,
+ 0xec, 0xd6, 0xfc, 0x3b, 0x1c, 0xae, 0xa9, 0xbc, 0x28, 0x65, 0xb3, 0x3c, 0xdf,
+ 0x29, 0x98, 0xbc, 0x11, 0x52, 0xbd, 0x3c, 0x4d, 0x7d, 0xac, 0x3c, 0x95, 0xcb,
+ 0x09, 0xbc, 0xc5, 0xc5, 0xf8, 0xbc, 0xe6, 0x99, 0x3f, 0x3c, 0xb0, 0x51, 0xfd,
+ 0xbc, 0x88, 0x6b, 0xe0, 0xbc, 0xaa, 0x84, 0x83, 0xbd, 0x98, 0x79, 0x8d, 0x3c,
+ 0xda, 0x5f, 0xf2, 0x3c, 0xb3, 0xcc, 0x7a, 0x3d, 0xc9, 0x55, 0x08, 0x3d, 0xd1,
+ 0x83, 0x33, 0x3d, 0x6c, 0xc1, 0x66, 0xbc, 0x80, 0xf9, 0x62, 0xba, 0xe4, 0xd5,
+ 0x88, 0xbd, 0x60, 0x31, 0xd2, 0xbc, 0x2b, 0x89, 0x86, 0x3d, 0x1b, 0x1e, 0x53,
+ 0xbd, 0xfa, 0x0c, 0x07, 0xbd, 0x50, 0xe8, 0xb5, 0xbc, 0x4f, 0xc6, 0x65, 0xbd,
+ 0xef, 0x09, 0x75, 0xbd, 0xd5, 0x47, 0x0c, 0xbd, 0xcc, 0x4e, 0x89, 0xbd, 0x9c,
+ 0x69, 0xe3, 0x3c, 0x52, 0xea, 0x9d, 0xbc, 0x01, 0x0e, 0x86, 0xbc, 0x2a, 0x61,
+ 0x72, 0xbd, 0x85, 0xbc, 0x87, 0x3d, 0x21, 0xf7, 0x42, 0x3d, 0x0b, 0x60, 0x23,
+ 0xbd, 0x0f, 0x0f, 0xed, 0xbc, 0x7d, 0x05, 0xd2, 0xbc, 0x6e, 0x5e, 0x5f, 0xbd,
+ 0x36, 0x52, 0x92, 0xbd, 0x7e, 0x96, 0x05, 0xbb, 0x6e, 0x51, 0x98, 0x3a, 0xe5,
+ 0x11, 0x19, 0xbd, 0x00, 0xcf, 0x84, 0xbb, 0x61, 0x5e, 0xed, 0x3c, 0x60, 0xcf,
+ 0x50, 0xbb, 0xce, 0xbe, 0x07, 0x3c, 0x5c, 0x81, 0x20, 0x3d, 0x45, 0x85, 0xf6,
+ 0xbc, 0x1d, 0xb7, 0x91, 0x3d, 0x38, 0x08, 0x59, 0x3c, 0x28, 0x93, 0x4b, 0x3d,
+ 0x3a, 0xc4, 0x87, 0xbd, 0x44, 0x7f, 0x04, 0xbd, 0xdd, 0x17, 0x81, 0x3d, 0xbe,
+ 0x94, 0x48, 0x3d, 0x88, 0x6a, 0xce, 0xba, 0x93, 0x5b, 0x20, 0x3d, 0xab, 0x05,
+ 0x90, 0xbd, 0xf9, 0x71, 0xc4, 0x3c, 0x6c, 0xd4, 0x7a, 0x3d, 0x4a, 0x2d, 0x20,
+ 0x3d, 0x94, 0xd7, 0x88, 0x3d, 0x82, 0xb5, 0x87, 0xbd, 0x55, 0x15, 0xec, 0x3b,
+ 0xc0, 0x09, 0xe4, 0xba, 0x31, 0x50, 0xfc, 0x3c, 0x25, 0x49, 0x6e, 0x3c, 0x5c,
+ 0x79, 0x92, 0xbc, 0xed, 0xab, 0x14, 0xbd, 0x24, 0x3e, 0xaa, 0x3c, 0x98, 0x43,
+ 0x58, 0x3d, 0x2f, 0x00, 0x62, 0x3d, 0x3c, 0x09, 0x2d, 0x3d, 0xe3, 0x27, 0x85,
+ 0x3c, 0x7a, 0x37, 0x06, 0x3d, 0x49, 0xe6, 0x62, 0xbd, 0x71, 0x53, 0x94, 0xbd,
+ 0xc4, 0xeb, 0xd0, 0xbb, 0xd8, 0xed, 0x11, 0x3c, 0xfe, 0x75, 0x8c, 0xbc, 0xc4,
+ 0xeb, 0x16, 0xbd, 0xb8, 0xb8, 0xf7, 0x3c, 0x30, 0x85, 0xaa, 0xbb, 0xcb, 0x9f,
+ 0x16, 0xbd, 0x1d, 0xed, 0x8d, 0x3d, 0x0f, 0xf3, 0x08, 0xbd, 0x8e, 0x3c, 0x13,
+ 0x3d, 0xc4, 0x04, 0x74, 0x3d, 0x60, 0xeb, 0x35, 0xbd, 0xe7, 0xcf, 0x38, 0x3d,
+ 0x12, 0xde, 0xaf, 0x3c, 0xca, 0x71, 0x04, 0x3d, 0x1c, 0xd8, 0xeb, 0x3c, 0xc6,
+ 0xfc, 0xb3, 0x3c, 0xa0, 0x37, 0x5a, 0x3d, 0xbe, 0xcc, 0x59, 0x3c, 0x4c, 0x95,
+ 0x9a, 0xbc, 0xa6, 0xff, 0xa8, 0x3b, 0xcd, 0x7d, 0x7d, 0xbd, 0x5c, 0xe7, 0xba,
+ 0x3c, 0xf9, 0x97, 0x02, 0xbd, 0x3a, 0xd3, 0x80, 0xbd, 0xcd, 0xbe, 0x97, 0xbd,
+ 0x3b, 0x0d, 0x35, 0xba, 0x76, 0x27, 0x44, 0x3d, 0x63, 0xae, 0x8a, 0x3d, 0x03,
+ 0x4c, 0x68, 0xbd, 0xe5, 0x9d, 0x0f, 0xbc, 0x6f, 0x5d, 0x45, 0xbb, 0x48, 0x3a,
+ 0x74, 0x3d, 0x85, 0xfa, 0x37, 0xbd, 0x31, 0xf5, 0x1c, 0x3d, 0x0b, 0x19, 0x52,
+ 0xbd, 0x00, 0xcd, 0x9e, 0xb9, 0xdb, 0xe5, 0x84, 0xbd, 0x83, 0xf1, 0x7f, 0xbd,
+ 0xb7, 0x44, 0x63, 0xbd, 0x44, 0x0a, 0x98, 0xbd, 0x60, 0xd8, 0x23, 0xbb, 0xd1,
+ 0x69, 0x61, 0xbd, 0x71, 0x41, 0x5a, 0xbd, 0x2f, 0xd9, 0x70, 0xbd, 0xc3, 0xb8,
+ 0xd3, 0x3c, 0x38, 0xa7, 0x99, 0x3c, 0xe0, 0xa0, 0x21, 0xbd, 0xd2, 0x90, 0xa8,
+ 0xb8, 0xff, 0xae, 0x32, 0x3c, 0x65, 0x1a, 0x0d, 0x3d, 0xa6, 0xd0, 0x39, 0xbd,
+ 0xdd, 0xb4, 0x18, 0xbd, 0xb0, 0xa0, 0xbc, 0x3c, 0xa0, 0xe4, 0x8b, 0x3d, 0x90,
+ 0xe6, 0x25, 0x3d, 0x7c, 0x20, 0x5d, 0x3d, 0x74, 0x50, 0xda, 0xbb, 0x4a, 0xe0,
+ 0x70, 0x3d, 0x02, 0x36, 0x13, 0x3d, 0xaa, 0xab, 0x05, 0xbd, 0xec, 0xda, 0x10,
+ 0xbd, 0xd1, 0x40, 0x35, 0xbd, 0xd2, 0x14, 0x3a, 0xbd, 0xd6, 0x7f, 0x06, 0xbd,
+ 0x55, 0xf8, 0x31, 0x3d, 0xea, 0xc4, 0x5c, 0x3d, 0xd6, 0x89, 0x52, 0x3d, 0x68,
+ 0xe6, 0x44, 0x3d, 0xd5, 0x64, 0x20, 0xbd, 0x18, 0x41, 0xc8, 0x3c, 0x10, 0xfa,
+ 0x44, 0x3d, 0x30, 0x39, 0x20, 0xbc, 0x27, 0x26, 0x85, 0x3d, 0x9e, 0x02, 0x48,
+ 0x3d, 0x59, 0xbb, 0xad, 0xbc, 0x67, 0x3c, 0xe3, 0xbc, 0xcc, 0x6e, 0x4b, 0xbd,
+ 0x08, 0xf9, 0x1c, 0xbd, 0x50, 0x02, 0xa8, 0x3c, 0x77, 0x8c, 0x21, 0xbd, 0x1b,
+ 0x8e, 0x0c, 0x3c, 0x0a, 0xe3, 0x76, 0x3d, 0x60, 0xa0, 0xa6, 0xbc, 0x30, 0x1d,
+ 0x2c, 0x3d, 0x89, 0xab, 0x57, 0xbd, 0x39, 0xdf, 0x8e, 0x3b, 0x4e, 0xd0, 0x81,
+ 0x3d, 0x6f, 0xc7, 0x0c, 0x3d, 0xb8, 0x21, 0x12, 0x3d, 0x32, 0xe6, 0x5a, 0x3d,
+ 0x26, 0xbf, 0x64, 0x3c, 0xa8, 0xaf, 0x35, 0x3d, 0x0e, 0x6e, 0xb4, 0xbc, 0x78,
+ 0x59, 0xa8, 0x3c, 0xd1, 0xca, 0x5c, 0xbd, 0x3a, 0x40, 0x53, 0x3d, 0x30, 0x50,
+ 0x0c, 0xbc, 0x11, 0xd3, 0x35, 0xbd, 0x06, 0x5b, 0x89, 0xbd, 0x2e, 0xe3, 0x63,
+ 0x3d, 0xc5, 0xdc, 0x0e, 0xbd, 0x60, 0x04, 0x2d, 0xbb, 0xae, 0xfb, 0x42, 0x3d,
+ 0x83, 0x52, 0xcd, 0xbc, 0x20, 0x53, 0x06, 0x3d, 0xd5, 0xc6, 0x38, 0x3c, 0xa7,
+ 0xa9, 0xf4, 0xbc, 0x9b, 0x2d, 0x89, 0x3d, 0x70, 0x74, 0x83, 0x3c, 0x06, 0x87,
+ 0xe7, 0x3b, 0x97, 0xa3, 0x92, 0x3c, 0x38, 0x5f, 0xf7, 0x3c, 0xdf, 0x71, 0x3b,
+ 0xbd, 0xfe, 0x14, 0x4d, 0x3d, 0x0a, 0x42, 0xb8, 0xbc, 0xb4, 0xf6, 0x2f, 0x3c,
+ 0x33, 0xe6, 0x94, 0xbd, 0x26, 0x39, 0x71, 0xbd, 0x10, 0xf4, 0x6e, 0xbd, 0xe4,
+ 0x3f, 0x09, 0xbd, 0x35, 0xe6, 0xb7, 0x3c, 0x9b, 0x3a, 0x10, 0xbd, 0x4d, 0x58,
+ 0x43, 0xbd, 0x3e, 0x25, 0x2c, 0xbd, 0x38, 0xdc, 0x4f, 0x3c, 0x06, 0xf5, 0xff,
+ 0xbc, 0x33, 0x3e, 0x81, 0xbd, 0x27, 0x99, 0x8e, 0xbb, 0x27, 0xc9, 0x68, 0xbd,
+ 0xce, 0x6c, 0x81, 0x3c, 0x0e, 0xab, 0x67, 0xbd, 0x50, 0x8a, 0x2f, 0x3c, 0x30,
+ 0x32, 0x37, 0x3d, 0x49, 0xd1, 0x0e, 0xbd, 0x60, 0xe2, 0x38, 0x3d, 0xf8, 0xd0,
+ 0x9f, 0x3c, 0x3e, 0x8a, 0x0d, 0x3d, 0x7e, 0x2f, 0x6a, 0xbd, 0xe8, 0x0f, 0xab,
+ 0x3b, 0x6e, 0x3d, 0x49, 0xbd, 0xba, 0xdd, 0x00, 0x3d, 0x80, 0x40, 0xdc, 0x3b,
+ 0x18, 0x06, 0x76, 0x3d, 0x48, 0xe5, 0x6d, 0x3d, 0xca, 0xcf, 0xa9, 0xbc, 0x3c,
+ 0xb8, 0x50, 0xbc, 0x70, 0xbf, 0x76, 0x3c, 0x0c, 0xbc, 0x1c, 0x3d, 0x59, 0x70,
+ 0xf3, 0xbc, 0x21, 0xaa, 0x83, 0xbc, 0xf6, 0x67, 0x4f, 0xbd, 0x86, 0xa6, 0x71,
+ 0x3c, 0x69, 0xd6, 0x48, 0x3c, 0x50, 0x60, 0x56, 0x3d, 0x9c, 0x25, 0x50, 0xbd,
+ 0x10, 0x27, 0x76, 0x3c, 0x98, 0x24, 0x7b, 0xbd, 0x6c, 0xb9, 0x01, 0xbc, 0xe6,
+ 0xea, 0x85, 0x3d, 0x0e, 0xa0, 0xf5, 0x3b, 0xb4, 0xb3, 0x0e, 0x3d, 0xe2, 0xc0,
+ 0xa1, 0x3c, 0x4c, 0x2c, 0xf6, 0xbc, 0xc8, 0x58, 0x25, 0x3c, 0xd0, 0x2c, 0xeb,
+ 0x3c, 0xa8, 0x0f, 0xfa, 0x3c, 0x50, 0xc1, 0xd6, 0xbb, 0x42, 0x81, 0x4d, 0xbd,
+ 0x37, 0x4c, 0x88, 0xbd, 0xf4, 0x1a, 0xd2, 0xbc, 0x94, 0xb7, 0xaf, 0xbb, 0xaf,
+ 0xeb, 0x0f, 0x3d, 0xed, 0x56, 0xa3, 0x3c, 0x5e, 0x0a, 0x87, 0x3d, 0x5c, 0x4a,
+ 0x64, 0xbc, 0x37, 0x90, 0x62, 0x3c, 0x57, 0xcd, 0xbb, 0x3b, 0x50, 0x0c, 0x76,
+ 0xbd, 0x1c, 0x48, 0x87, 0xbc, 0x38, 0x8a, 0x4e, 0x3c, 0xda, 0x2b, 0x3a, 0x3d,
+ 0xba, 0x1a, 0x81, 0xbc, 0x29, 0xca, 0xba, 0x3c, 0x78, 0x39, 0x2b, 0xbd, 0xd4,
+ 0x80, 0xe2, 0xbb, 0x08, 0x96, 0x95, 0x3c, 0x55, 0x08, 0x50, 0x3c, 0xbd, 0xed,
+ 0x15, 0xbd, 0xd0, 0xeb, 0xe5, 0xbb, 0xa5, 0x5a, 0x22, 0xbc, 0x6c, 0xe7, 0x8f,
+ 0xbc, 0x63, 0x73, 0xb2, 0x3c, 0xc0, 0xae, 0x13, 0x3c, 0x54, 0xbd, 0x6f, 0xbd,
+ 0x9e, 0x5a, 0x60, 0x3d, 0x62, 0xe8, 0x34, 0x3d, 0x38, 0x91, 0x24, 0x3d, 0x10,
+ 0xac, 0x03, 0x3c, 0x04, 0xc0, 0x83, 0xbd, 0x16, 0x48, 0x7e, 0xbd, 0x64, 0x7a,
+ 0x40, 0xbc, 0x52, 0xcf, 0x4a, 0x3d, 0xa1, 0x54, 0x1f, 0xb9, 0x61, 0x19, 0x8c,
+ 0x3d, 0x08, 0xfa, 0x5a, 0xbd, 0x2a, 0xf5, 0x67, 0x3d, 0xb3, 0xcc, 0x12, 0xbd,
+ 0xc3, 0x2a, 0x65, 0x3d, 0x06, 0xbb, 0x41, 0xbd, 0xfc, 0xc0, 0x09, 0xbd, 0x2c,
+ 0xdf, 0xa7, 0xbc, 0xb7, 0xfe, 0x5d, 0xbd, 0xcb, 0x10, 0xa3, 0xbb, 0x75, 0xc3,
+ 0xcd, 0x3c, 0x2b, 0xd5, 0x0e, 0x3d, 0x11, 0x1c, 0x83, 0x3d, 0x71, 0xdc, 0xb2,
+ 0xbc, 0xda, 0xe1, 0x86, 0xbd, 0x39, 0xf2, 0x50, 0x3c, 0x40, 0x25, 0x50, 0x3b,
+ 0x18, 0x17, 0x43, 0xbc, 0x6b, 0xa6, 0x88, 0x3c, 0x60, 0x10, 0x5d, 0xbd, 0x0e,
+ 0x88, 0xa1, 0x3c, 0xa6, 0xd3, 0xe4, 0xbc, 0x11, 0x76, 0x88, 0xbc, 0x1e, 0x07,
+ 0x6c, 0x3d, 0xa6, 0x6e, 0x1b, 0x3d, 0xc0, 0x30, 0x30, 0x3d, 0xf2, 0x34, 0x8d,
+ 0xbd, 0xc0, 0xe2, 0x18, 0x3b, 0xce, 0xef, 0x83, 0xbc, 0xe7, 0x31, 0x0e, 0xbd,
+ 0xd1, 0xf1, 0x8b, 0xbd, 0xba, 0x6e, 0x3e, 0xbc, 0xc7, 0x45, 0x08, 0xbd, 0x57,
+ 0x7e, 0x56, 0x3d, 0x6d, 0xaf, 0x68, 0xbd, 0xef, 0x94, 0x28, 0xbd, 0x65, 0xf5,
+ 0xa5, 0x3c, 0xea, 0x2c, 0x43, 0xbd, 0x5c, 0xc6, 0x5d, 0x3c, 0x3e, 0x7e, 0x3f,
+ 0xbd, 0xd4, 0xa5, 0x7c, 0xbd, 0x14, 0x39, 0x35, 0xbd, 0xc5, 0x8a, 0x08, 0xbd,
+ 0x7e, 0xc0, 0x0c, 0x3d, 0x45, 0xbb, 0x84, 0x3c, 0x0d, 0x10, 0x6f, 0x39, 0x81,
+ 0x04, 0x4b, 0x3c, 0x5b, 0x45, 0xff, 0x3c, 0xab, 0xd1, 0x74, 0xbd, 0x98, 0x8a,
+ 0x38, 0x3c, 0xe3, 0xc7, 0xa9, 0x3c, 0x8b, 0x12, 0x7f, 0xbd, 0x6f, 0xb7, 0xc5,
+ 0x3a, 0x95, 0x7e, 0xaf, 0x3c, 0x50, 0xc8, 0xc5, 0x3b, 0xf9, 0x02, 0x89, 0xbd,
+ 0x6e, 0x63, 0xa2, 0xbc, 0x0c, 0x74, 0x32, 0x3d, 0xea, 0x32, 0x79, 0x3d, 0x0e,
+ 0x34, 0x91, 0xbd, 0xa1, 0x87, 0xec, 0xbc, 0x1c, 0xd4, 0x17, 0x3d, 0xe1, 0xb0,
+ 0x74, 0x3d, 0xe9, 0x8e, 0xc6, 0x3c, 0x8a, 0x62, 0x55, 0xbc, 0x51, 0x37, 0x95,
+ 0xbd, 0x2b, 0xc8, 0xbd, 0xbc, 0x8e, 0xe4, 0xef, 0xbc, 0x11, 0x49, 0x0d, 0x3d,
+ 0xe8, 0xcc, 0x16, 0x3d, 0xc6, 0xa8, 0xc8, 0x3c, 0x98, 0x01, 0x88, 0x3c, 0xbd,
+ 0x8e, 0x46, 0xbd, 0xab, 0x7d, 0xd4, 0xbc, 0x7a, 0xde, 0xb6, 0xbc, 0xf9, 0x44,
+ 0xcd, 0xbc, 0xad, 0xae, 0x13, 0xbc, 0x8d, 0xb5, 0x21, 0xbd, 0x48, 0xfb, 0x05,
+ 0xbc, 0x1d, 0x6d, 0x84, 0x3d, 0x4c, 0x32, 0x8a, 0x3c, 0xa8, 0xe9, 0x69, 0x3c,
+ 0xa6, 0xba, 0x1b, 0xbd, 0xe5, 0xfa, 0x12, 0x3d, 0xea, 0xea, 0x11, 0x3d, 0xa4,
+ 0xa1, 0x10, 0xbd, 0x0c, 0x0e, 0xad, 0x3d, 0x04, 0xeb, 0x1c, 0xbd, 0xe5, 0x6d,
+ 0x0f, 0xbd, 0x1e, 0x40, 0xea, 0x3d, 0xfa, 0xc5, 0x36, 0x3d, 0x7a, 0xd3, 0x34,
+ 0xbd, 0xe2, 0xe5, 0x4b, 0xbd, 0x27, 0x35, 0xf0, 0xbd, 0x60, 0x53, 0xc6, 0xbc,
+ 0xb4, 0x7c, 0x0b, 0xbd, 0x0c, 0xc1, 0xbd, 0x39, 0x4b, 0xfb, 0x67, 0x3c, 0x4c,
+ 0x65, 0xc4, 0x3c, 0x23, 0x9d, 0x88, 0x3c, 0x7c, 0x7e, 0xa0, 0x3b, 0x7f, 0xd2,
+ 0x94, 0x3b, 0x45, 0xd2, 0x24, 0x3d, 0x00, 0xd4, 0xf5, 0xbb, 0x13, 0xf0, 0x99,
+ 0x3d, 0xd6, 0x36, 0xa0, 0x3a, 0x28, 0xb0, 0x5d, 0x3d, 0x9f, 0xf9, 0x81, 0xbd,
+ 0x42, 0x4b, 0x98, 0x3d, 0x29, 0x10, 0x7d, 0x3d, 0x8e, 0xe9, 0xf5, 0xbc, 0xfb,
+ 0xc1, 0x91, 0xbc, 0x71, 0xda, 0xe2, 0xbc, 0x1e, 0x75, 0x3b, 0xbd, 0xbe, 0x22,
+ 0x2f, 0x3d, 0xfa, 0xb6, 0x27, 0xba, 0x8c, 0x36, 0x86, 0x3c, 0x45, 0x63, 0xcf,
+ 0xbc, 0x13, 0x05, 0x5e, 0xbc, 0xba, 0xc5, 0x24, 0xbd, 0xcd, 0x6d, 0x0b, 0x3c,
+ 0x5d, 0xe6, 0x00, 0x3b, 0x82, 0xbb, 0xcf, 0xbc, 0xdb, 0x1f, 0x31, 0xbd, 0x91,
+ 0x32, 0x95, 0xbc, 0x81, 0xff, 0x0b, 0xba, 0xa7, 0xe4, 0x0f, 0x3d, 0x50, 0xd4,
+ 0x2c, 0x3d, 0x4c, 0x82, 0x27, 0x3c, 0x54, 0x76, 0x69, 0x3c, 0xef, 0x41, 0x53,
+ 0xbb, 0x7b, 0x88, 0x26, 0xbd, 0xfa, 0x19, 0x51, 0x3d, 0x83, 0xe9, 0x89, 0xbd,
+ 0x96, 0xa7, 0x4a, 0x3d, 0x87, 0xf0, 0xe6, 0xbc, 0x2b, 0x59, 0x61, 0xbc, 0x4a,
+ 0x9a, 0x7d, 0x3d, 0x7c, 0x95, 0x54, 0x38, 0xa6, 0x6e, 0x69, 0x3d, 0xf3, 0x84,
+ 0x27, 0xbd, 0x84, 0x7f, 0x26, 0x3c, 0xc3, 0xe1, 0x58, 0x3b, 0xa7, 0x2d, 0xa5,
+ 0x3d, 0x13, 0x70, 0x2a, 0xbd, 0xae, 0x66, 0x1f, 0x3d, 0x6d, 0x44, 0xff, 0xbc,
+ 0x66, 0x10, 0xb2, 0x3c, 0x94, 0xd5, 0x98, 0xb9, 0x00, 0xc8, 0xef, 0x3d, 0x5c,
+ 0x00, 0x2f, 0xbc, 0xd7, 0xb1, 0xf6, 0x3c, 0x1b, 0xdb, 0xe1, 0x3c, 0xaa, 0x78,
+ 0xe0, 0x3c, 0xb5, 0xe8, 0xd1, 0x3c, 0xda, 0x9e, 0x39, 0xbc, 0xe4, 0x90, 0x84,
+ 0xbc, 0x42, 0x92, 0x6f, 0xbd, 0xdd, 0xd7, 0x8a, 0x3d, 0xd3, 0x62, 0x90, 0x3c,
+ 0x1c, 0x20, 0x52, 0x3d, 0x1e, 0x29, 0x72, 0xbd, 0xf4, 0x8e, 0x1c, 0x3d, 0xd9,
+ 0xda, 0xaf, 0xbc, 0x60, 0x11, 0x8e, 0xbb, 0x71, 0xc1, 0xbf, 0xbc, 0xec, 0x7f,
+ 0x3d, 0x3c, 0xe5, 0x10, 0x3d, 0xbd, 0x1a, 0xbf, 0x69, 0x3d, 0x3f, 0x56, 0x0b,
+ 0xbb, 0x19, 0x64, 0x9d, 0x3c, 0xe1, 0x00, 0x05, 0x3d, 0x4f, 0x77, 0x8e, 0x3d,
+ 0x0f, 0x4d, 0x35, 0x3d, 0xe5, 0x6d, 0x4d, 0xbd, 0x9d, 0xb6, 0x58, 0x3c, 0x64,
+ 0x44, 0x30, 0xba, 0x08, 0xe8, 0xaa, 0x3c, 0x73, 0xe7, 0x0b, 0x3d, 0x71, 0x00,
+ 0x8c, 0x3d, 0x1a, 0xd9, 0xeb, 0x3c, 0xde, 0x78, 0xf2, 0xbb, 0xe5, 0x50, 0xcb,
+ 0x3d, 0x03, 0x80, 0x7f, 0x3b, 0xb4, 0xf7, 0x1a, 0x3d, 0x32, 0xf5, 0xb0, 0x3d,
+ 0x1c, 0x38, 0xe5, 0x3c, 0xb1, 0x72, 0x05, 0x3d, 0xc3, 0x92, 0xcf, 0x3c, 0xdc,
+ 0x7b, 0x0c, 0xbe, 0x95, 0x0b, 0xfc, 0x3c, 0x5f, 0x34, 0x18, 0x3d, 0xc2, 0x08,
+ 0x19, 0xbd, 0x25, 0xd4, 0x7b, 0x3d, 0x1e, 0xca, 0x88, 0xbd, 0x57, 0x5f, 0x9a,
+ 0x3d, 0x57, 0x98, 0x80, 0x3d, 0x20, 0x7d, 0xdd, 0x3c, 0xdf, 0xb3, 0x65, 0x3d,
+ 0x88, 0xde, 0x8d, 0xbd, 0x45, 0x90, 0x9d, 0x3d, 0x8a, 0xf8, 0xfa, 0xbc, 0xdf,
+ 0xe2, 0xef, 0xb9, 0x21, 0x8d, 0x5a, 0xbc, 0x3e, 0x45, 0x17, 0x3c, 0x11, 0x8d,
+ 0x8d, 0xbd, 0xb9, 0xd3, 0x2b, 0xb9, 0xd1, 0x2b, 0x24, 0xbc, 0x7e, 0x0e, 0x00,
+ 0x3b, 0xfd, 0xc2, 0x2e, 0xbd, 0x80, 0x7d, 0x0d, 0x3d, 0x91, 0x8a, 0x49, 0x3d,
+ 0xba, 0x7e, 0x10, 0x3d, 0xc3, 0x56, 0x2a, 0x3d, 0x1a, 0x4d, 0x6e, 0x3d, 0x20,
+ 0x44, 0x90, 0x3c, 0x2f, 0xd8, 0x79, 0x3d, 0x7b, 0x5c, 0xab, 0x3d, 0x64, 0xa5,
+ 0xe1, 0x3c, 0x26, 0x94, 0x31, 0x3d, 0xcc, 0xaf, 0xec, 0xbd, 0xc0, 0x25, 0x4b,
+ 0xbd, 0xd1, 0x06, 0x87, 0x3d, 0x97, 0x3c, 0x44, 0xbd, 0x9c, 0x81, 0xc2, 0xbc,
+ 0x0a, 0xd3, 0x1a, 0xbd, 0x0d, 0xe3, 0x00, 0xbd, 0x08, 0x6e, 0x53, 0xbd, 0x67,
+ 0x84, 0x1a, 0x3d, 0xeb, 0xd0, 0x2f, 0x3d, 0x76, 0xea, 0x46, 0x3b, 0x3e, 0x6e,
+ 0xbe, 0xbc, 0xf3, 0x6a, 0x11, 0x3d, 0x13, 0xed, 0xb8, 0x3c, 0xc1, 0x4f, 0x9a,
+ 0x3d, 0xd6, 0x9a, 0x31, 0xbd, 0xcc, 0x51, 0x0e, 0x3d, 0x60, 0x8c, 0x89, 0x3d,
+ 0x66, 0xc1, 0x41, 0xbd, 0x75, 0x80, 0xa2, 0x3d, 0x40, 0xbb, 0x5c, 0x3b, 0x6f,
+ 0xb6, 0x90, 0x3d, 0xb7, 0x62, 0x02, 0x3c, 0x54, 0x75, 0x78, 0x3d, 0x3d, 0x29,
+ 0xaf, 0x3d, 0x53, 0x5f, 0x97, 0x3d, 0xaf, 0x83, 0x91, 0xbc, 0xc9, 0x29, 0x55,
+ 0x3d, 0xda, 0x00, 0x82, 0xbb, 0x8d, 0xcd, 0x2e, 0x3d, 0x9d, 0xcb, 0x88, 0xbd,
+ 0x4d, 0x93, 0x3d, 0xbd, 0x55, 0xb8, 0x66, 0xbd, 0x98, 0xf2, 0x4e, 0xbc, 0xf9,
+ 0xe0, 0x28, 0xbc, 0x6f, 0x30, 0x2d, 0x3d, 0xd8, 0xe6, 0x9e, 0x3d, 0x81, 0xcf,
+ 0x31, 0xbd, 0x31, 0x50, 0x45, 0xbd, 0x90, 0x9e, 0x2f, 0xbd, 0x4b, 0x9a, 0x9a,
+ 0x3d, 0x2f, 0x1a, 0xb3, 0xbc, 0x05, 0x59, 0x9b, 0xbc, 0xa6, 0x4f, 0x9b, 0xbc,
+ 0x24, 0x10, 0x9e, 0xbd, 0x91, 0x8e, 0xa5, 0x3c, 0x0c, 0x2a, 0x43, 0x3d, 0x85,
+ 0x85, 0x87, 0xbd, 0x00, 0x61, 0x36, 0xbd, 0x10, 0xb9, 0x43, 0xbc, 0x58, 0x2c,
+ 0x24, 0x3b, 0xb7, 0x4f, 0x80, 0x3d, 0x46, 0x0f, 0x29, 0xbd, 0x76, 0x68, 0x44,
+ 0xbd, 0x57, 0xcf, 0x18, 0xbd, 0x24, 0x15, 0x94, 0x3d, 0x13, 0x57, 0x98, 0x3d,
+ 0x5e, 0xd6, 0x9c, 0x3d, 0xa0, 0x16, 0x9e, 0x3d, 0x66, 0x87, 0x83, 0xbd, 0x19,
+ 0x6d, 0x8b, 0x3d, 0x24, 0x60, 0x9a, 0xbc, 0x00, 0x60, 0xea, 0xbb, 0xba, 0x09,
+ 0x5f, 0xbd, 0xdc, 0xdd, 0xaa, 0x3b, 0x95, 0x08, 0xe9, 0xbc, 0x82, 0x0c, 0xc6,
+ 0x3c, 0x19, 0xb1, 0xda, 0xbc, 0x80, 0x2e, 0x4b, 0x3c, 0xed, 0xab, 0x29, 0x3d,
+ 0x17, 0x38, 0x51, 0x3d, 0x52, 0xa3, 0xef, 0x3c, 0xfd, 0x1c, 0x88, 0xbc, 0x40,
+ 0x9f, 0x3a, 0x3c, 0x87, 0x8a, 0xbe, 0xbc, 0xe5, 0xf4, 0x2a, 0xbd, 0x01, 0x1f,
+ 0x32, 0x3d, 0x2c, 0xbf, 0x3d, 0xbc, 0x33, 0xd3, 0xf9, 0xbb, 0xc4, 0x58, 0x2d,
+ 0xbd, 0x5d, 0xa3, 0x8f, 0x3d, 0x27, 0x5d, 0x90, 0xbc, 0xcf, 0x00, 0x82, 0x3d,
+ 0x0b, 0x65, 0xa7, 0x3d, 0x52, 0x11, 0xff, 0xbc, 0x37, 0xca, 0x18, 0xbd, 0xb9,
+ 0x2f, 0x9d, 0x3c, 0x36, 0x90, 0x68, 0x3d, 0x85, 0x61, 0x6b, 0x3d, 0x27, 0xb0,
+ 0x89, 0xbc, 0xcb, 0xb5, 0xac, 0xbb, 0xf4, 0x4b, 0x79, 0xbc, 0x34, 0x73, 0xe7,
+ 0xbc, 0x81, 0x9b, 0x86, 0x3c, 0x58, 0xc2, 0xce, 0x3c, 0x0a, 0x63, 0x2c, 0xbd,
+ 0xf6, 0xd3, 0xcf, 0xbd, 0xea, 0xf1, 0x01, 0xbd, 0x7a, 0x64, 0xe0, 0xbc, 0x12,
+ 0x3a, 0x28, 0x3d, 0x98, 0xe9, 0x98, 0x3d, 0x95, 0xf1, 0xa8, 0xbc, 0x88, 0xb4,
+ 0x2a, 0x3d, 0x81, 0xdf, 0xc4, 0xbc, 0x62, 0xb8, 0xfb, 0xbc, 0x46, 0xd2, 0x90,
+ 0xbd, 0x74, 0x0a, 0xc4, 0x3c, 0x8e, 0x57, 0x6f, 0x3d, 0xf9, 0xea, 0x78, 0x3d,
+ 0xdc, 0x6e, 0x62, 0xbd, 0x46, 0xe2, 0x16, 0xbd, 0xa6, 0x36, 0x37, 0xbd, 0xf5,
+ 0x36, 0x35, 0xbd, 0x9a, 0x4f, 0xb8, 0xbc, 0xf2, 0xab, 0x15, 0x3c, 0xee, 0x55,
+ 0xd7, 0x3b, 0xfa, 0xd0, 0x1c, 0xbd, 0xd4, 0x6b, 0x97, 0xbc, 0x91, 0x57, 0x51,
+ 0xbd, 0x7c, 0xc9, 0x64, 0x3d, 0xf8, 0x29, 0xcd, 0xbc, 0x75, 0x65, 0x67, 0x3d,
+ 0xaa, 0xd9, 0xa3, 0x3c, 0x55, 0xff, 0x8f, 0x3c, 0x7c, 0x18, 0x46, 0xbd, 0x92,
+ 0x18, 0x2c, 0x3d, 0x3a, 0x9f, 0x8a, 0xbc, 0xee, 0xd4, 0x05, 0x3d, 0x37, 0x03,
+ 0xaa, 0xbd, 0xe9, 0x50, 0x07, 0xbe, 0x1a, 0x94, 0x18, 0x3d, 0x79, 0x69, 0x03,
+ 0xbd, 0x7f, 0xc8, 0xd4, 0xbc, 0x25, 0xa7, 0x86, 0x3a, 0x17, 0xf1, 0x00, 0x3c,
+ 0xfd, 0x40, 0x10, 0x3d, 0x6e, 0x29, 0xf7, 0x3c, 0x05, 0xb0, 0x38, 0xbd, 0x7e,
+ 0x44, 0x5a, 0xbc, 0x0e, 0xdf, 0x66, 0x3d, 0x08, 0x9d, 0x10, 0xbc, 0xff, 0x12,
+ 0x8e, 0xbb, 0x01, 0x3f, 0x67, 0xbc, 0x6e, 0xa6, 0x4f, 0x3d, 0xca, 0x07, 0x63,
+ 0xbd, 0x97, 0x61, 0x4b, 0x3d, 0x71, 0x21, 0x34, 0x3d, 0x4f, 0xa2, 0x6d, 0x3d,
+ 0x8f, 0xf5, 0xe8, 0xbd, 0x72, 0x55, 0x4b, 0xbd, 0xee, 0xb2, 0xe9, 0xbc, 0xf2,
+ 0x49, 0xa7, 0x3d, 0x89, 0x22, 0xf5, 0x3c, 0xd8, 0x73, 0xcb, 0x3d, 0xbb, 0x15,
+ 0x81, 0x3d, 0x33, 0xf1, 0x5c, 0x3d, 0xa7, 0x30, 0x96, 0xbd, 0x4b, 0x2c, 0x58,
+ 0xbd, 0x34, 0x05, 0x00, 0x3d, 0xbd, 0x81, 0x92, 0x3d, 0x67, 0x5b, 0x5f, 0xbc,
+ 0xb4, 0x1e, 0xe6, 0xbd, 0x7c, 0x56, 0x00, 0x3c, 0x7c, 0x6d, 0xa8, 0x3c, 0x9b,
+ 0x21, 0xbd, 0xbb, 0x71, 0xf4, 0x48, 0xbd, 0xf8, 0xe1, 0x87, 0xbd, 0xd7, 0x4f,
+ 0xaf, 0xbc, 0x08, 0xef, 0xd9, 0x3c, 0x3e, 0x7b, 0x24, 0x3c, 0xa8, 0xcc, 0xe7,
+ 0x3c, 0xf0, 0xa0, 0x4a, 0xbd, 0x45, 0xbf, 0x39, 0xbd, 0x4e, 0xb6, 0xd6, 0x3c,
+ 0xfb, 0xfb, 0x49, 0x3d, 0xdd, 0x90, 0x4e, 0x3c, 0x0c, 0xb0, 0x83, 0x3d, 0x2d,
+ 0x83, 0x42, 0x3c, 0x1f, 0x45, 0xeb, 0xbb, 0xd3, 0x7e, 0xf2, 0x3b, 0x4d, 0x22,
+ 0xa6, 0xbd, 0x40, 0x45, 0x5c, 0xbb, 0x8c, 0xa5, 0x1c, 0xbd, 0x57, 0xd9, 0x86,
+ 0x3d, 0x45, 0xfc, 0x4e, 0x3d, 0xc5, 0x64, 0x24, 0x3d, 0xc9, 0xf4, 0x27, 0x3c,
+ 0xc7, 0x86, 0x08, 0x3d, 0x9c, 0x3c, 0x13, 0x3b, 0xab, 0x69, 0x12, 0x3d, 0x0d,
+ 0xfa, 0x80, 0x3d, 0x6b, 0x86, 0x15, 0xbd, 0x93, 0x11, 0x1e, 0xbd, 0x70, 0x3b,
+ 0x02, 0x3b, 0x50, 0x75, 0x06, 0xbd, 0x61, 0xe8, 0x7b, 0xbc, 0x5a, 0x15, 0xa7,
+ 0x3d, 0x47, 0x26, 0x0b, 0x3c, 0xb8, 0x03, 0x98, 0x3c, 0xce, 0xcc, 0x8e, 0x3d,
+ 0x12, 0x6c, 0xba, 0xbc, 0xca, 0x74, 0x5f, 0xbd, 0x84, 0x45, 0xd6, 0x3d, 0x2a,
+ 0xc6, 0xb3, 0xbc, 0x75, 0x88, 0x53, 0x3d, 0x44, 0xc0, 0x37, 0x3c, 0x69, 0x7c,
+ 0x59, 0x3d, 0xc1, 0xa5, 0xe5, 0xbc, 0x61, 0xc0, 0x9f, 0x3c, 0xbc, 0x7d, 0x7e,
+ 0xbc, 0x9c, 0x18, 0x79, 0xbd, 0x09, 0x70, 0x16, 0x3d, 0xdd, 0x36, 0x0b, 0x3d,
+ 0xcc, 0xba, 0xc8, 0x3c, 0xe6, 0xae, 0x18, 0xbc, 0xd6, 0x1a, 0x20, 0xbd, 0x43,
+ 0x22, 0x24, 0xbc, 0xcc, 0x3e, 0xd4, 0x3c, 0xe2, 0x43, 0x1a, 0xbb, 0x02, 0x94,
+ 0xd5, 0x3c, 0x24, 0x73, 0x3d, 0x3d, 0x4d, 0x1c, 0xce, 0x3c, 0x94, 0xea, 0x4a,
+ 0x3d, 0x33, 0x7a, 0x09, 0x3d, 0xf4, 0xcc, 0x66, 0xbd, 0x13, 0xb9, 0x9e, 0xbd,
+ 0x98, 0xbe, 0xb4, 0xbc, 0x19, 0x14, 0x21, 0x3d, 0x97, 0xca, 0x50, 0x3d, 0x8f,
+ 0x3f, 0x2f, 0xbc, 0x69, 0x98, 0x25, 0x3d, 0x55, 0x13, 0x80, 0xbc, 0xef, 0x2e,
+ 0x82, 0x3d, 0x24, 0xea, 0x71, 0xbd, 0x84, 0x97, 0x32, 0xbd, 0xb0, 0xaa, 0xaf,
+ 0x3c, 0xfa, 0x13, 0x9b, 0x3d, 0x56, 0xa5, 0x2b, 0x3d, 0x03, 0x06, 0x2d, 0xbc,
+ 0x6c, 0x24, 0x39, 0xbd, 0x46, 0x80, 0x29, 0x3d, 0x64, 0xdb, 0x61, 0xbb, 0x85,
+ 0x2a, 0x22, 0xbd, 0x9f, 0x47, 0xc1, 0x3d, 0x71, 0xc5, 0x85, 0xbd, 0x00, 0x31,
+ 0x9c, 0xb9, 0xc4, 0xd0, 0x2e, 0xbd, 0x08, 0x5d, 0x36, 0x3d, 0x41, 0x70, 0x3f,
+ 0xbd, 0x01, 0xc0, 0x87, 0x3c, 0x05, 0xf1, 0x37, 0xbc, 0xaf, 0x5d, 0xd4, 0xbb,
+ 0x10, 0xa9, 0x1c, 0x3d, 0xb8, 0xa9, 0x62, 0xba, 0xae, 0x29, 0x71, 0x3d, 0x51,
+ 0x57, 0x73, 0xbc, 0x05, 0x0a, 0xb8, 0xbd, 0xe3, 0x38, 0xa1, 0xbd, 0x3d, 0x08,
+ 0x13, 0x3d, 0x54, 0x69, 0x80, 0xbd, 0xe9, 0x65, 0x60, 0xbd, 0x2e, 0x02, 0x88,
+ 0x3d, 0x00, 0xdf, 0x58, 0xbb, 0xde, 0x06, 0x35, 0xbd, 0x1e, 0x3f, 0x0a, 0xbd,
+ 0x35, 0xe2, 0x15, 0xbd, 0xa6, 0xe3, 0x99, 0x3d, 0x42, 0x8e, 0x2e, 0xbd, 0x9b,
+ 0x10, 0x97, 0xbd, 0xd9, 0x36, 0xca, 0x3b, 0x27, 0x9f, 0x5c, 0xbd, 0xb8, 0x0c,
+ 0x25, 0xbd, 0x61, 0xe3, 0x8e, 0x3d, 0x8b, 0x23, 0xa5, 0xbc, 0xf4, 0xda, 0x47,
+ 0xbd, 0x30, 0x95, 0xac, 0x3c, 0xe1, 0xb0, 0xab, 0xbd, 0xb0, 0x5a, 0x15, 0x3d,
+ 0x58, 0x7e, 0x35, 0x3d, 0x13, 0xeb, 0x48, 0xbc, 0x00, 0xe6, 0x80, 0x3c, 0x39,
+ 0x59, 0x21, 0xbb, 0xca, 0xf7, 0xbe, 0x3d, 0x2a, 0xb9, 0x37, 0x3d, 0x26, 0x13,
+ 0x80, 0x3d, 0x9e, 0xbd, 0xc7, 0x3c, 0xb6, 0xd6, 0x50, 0xbd, 0xa6, 0x52, 0x82,
+ 0x3d, 0x39, 0xa3, 0x81, 0xb9, 0xe3, 0xb2, 0xf8, 0xbd, 0xc5, 0x84, 0x54, 0xbd,
+ 0xba, 0xea, 0x27, 0x3d, 0x1e, 0xce, 0xcf, 0x3c, 0x0d, 0xd3, 0x6f, 0x3c, 0xa7,
+ 0xce, 0x87, 0xbc, 0x67, 0xe3, 0x5e, 0xbd, 0xf6, 0xdc, 0x3b, 0x3d, 0xca, 0x8f,
+ 0x23, 0xbd, 0x69, 0x20, 0x9e, 0x3b, 0x32, 0x59, 0x2e, 0x3d, 0x12, 0x32, 0x09,
+ 0xbd, 0xa1, 0xc3, 0x2a, 0x3c, 0x68, 0x2a, 0x6b, 0xbc, 0xf7, 0xbf, 0x92, 0xbc,
+ 0x97, 0x8c, 0x97, 0x3d, 0x8e, 0xc6, 0x74, 0x3c, 0x04, 0x01, 0x47, 0x3c, 0x6b,
+ 0x51, 0xf0, 0x3d, 0x0e, 0xf6, 0x3b, 0x3b, 0xee, 0xeb, 0x5d, 0x3d, 0x98, 0x69,
+ 0x9b, 0x3c, 0xb5, 0x47, 0xfc, 0xbc, 0x5e, 0x56, 0x40, 0xbc, 0x15, 0x4e, 0xad,
+ 0xbb, 0x84, 0xcf, 0x96, 0x3c, 0xe3, 0x32, 0xbe, 0xbc, 0x36, 0xcd, 0xc8, 0x3d,
+ 0x70, 0xb8, 0x97, 0x3d, 0xd9, 0xc3, 0x28, 0xbd, 0x6c, 0xec, 0x7b, 0x3d, 0xbf,
+ 0x32, 0xc6, 0xbd, 0x98, 0x0d, 0x0f, 0xbe, 0x32, 0xaa, 0x95, 0x3d, 0x6e, 0x2c,
+ 0xfd, 0xbc, 0x10, 0x45, 0xc1, 0xbb, 0x4d, 0x8b, 0x03, 0x3d, 0xe4, 0x05, 0xde,
+ 0xbc, 0x0d, 0x7c, 0xbe, 0x3c, 0x07, 0x24, 0x77, 0x3d, 0x98, 0xb0, 0x2a, 0x3c,
+ 0x21, 0xc9, 0xa3, 0x3c, 0x1a, 0x6d, 0x69, 0x3d, 0x33, 0xf6, 0xeb, 0xbc, 0x40,
+ 0x77, 0x90, 0x3d, 0x6c, 0xf5, 0x99, 0x3c, 0x42, 0x69, 0x08, 0x3d, 0x9b, 0x3f,
+ 0xde, 0xbc, 0xe0, 0x71, 0x04, 0xbd, 0x6a, 0xcd, 0xfe, 0xbb, 0x77, 0xd6, 0xb3,
+ 0x3d, 0xf9, 0xb4, 0xcc, 0x3b, 0x6a, 0x1c, 0x70, 0x3d, 0x10, 0x34, 0x15, 0xbc,
+ 0x82, 0x15, 0x3a, 0x3d, 0xa8, 0xa6, 0x02, 0x3d, 0x06, 0x03, 0xaa, 0x3d, 0x15,
+ 0x2c, 0xe6, 0xbc, 0xac, 0xf0, 0xdc, 0x3c, 0xa7, 0x3b, 0xef, 0xbc, 0x7a, 0xa7,
+ 0x93, 0x3d, 0xaf, 0x46, 0x87, 0x3c, 0xf9, 0x13, 0x76, 0xbb, 0x30, 0x99, 0x15,
+ 0xbd, 0x36, 0xd1, 0x8f, 0xbc, 0xc9, 0x26, 0xaf, 0x3d, 0xc0, 0xa3, 0x5b, 0x3c,
+ 0x69, 0x65, 0x84, 0xbd, 0x1e, 0x30, 0x81, 0x3d, 0xb4, 0xbc, 0x22, 0x3d, 0x16,
+ 0x60, 0x52, 0x3d, 0x5e, 0xfe, 0x6a, 0xbc, 0x16, 0x65, 0x34, 0xbd, 0xfe, 0xab,
+ 0xf0, 0x3c, 0xe1, 0xfd, 0x90, 0x3d, 0xd4, 0x61, 0x6a, 0xbd, 0x55, 0xd1, 0x85,
+ 0xbd, 0x87, 0x6f, 0x66, 0xbd, 0x29, 0x4a, 0x8d, 0x3a, 0xec, 0x8f, 0x91, 0x3d,
+ 0x07, 0x75, 0x5a, 0x3b, 0x95, 0x09, 0x27, 0x3b, 0x25, 0x10, 0xd3, 0x3d, 0xde,
+ 0xfe, 0x0b, 0xbd, 0xe8, 0xd4, 0xc4, 0x3c, 0x4e, 0xda, 0x7d, 0x3c, 0x54, 0xb5,
+ 0xe8, 0xba, 0x69, 0x46, 0x40, 0x3d, 0xd1, 0xd6, 0x48, 0x3c, 0xfa, 0xb9, 0x87,
+ 0x39, 0x5a, 0x17, 0x20, 0xbc, 0xd5, 0x9b, 0x66, 0x3d, 0x19, 0x23, 0xac, 0x3c,
+ 0x56, 0x76, 0x5a, 0xbd, 0x7e, 0x50, 0x3c, 0xbc, 0x02, 0x8b, 0x17, 0xbd, 0x42,
+ 0x85, 0xc6, 0xbd, 0x06, 0x12, 0x9f, 0x3d, 0xad, 0x96, 0xc7, 0xbb, 0xd9, 0xfc,
+ 0xff, 0xbb, 0xb9, 0x86, 0x71, 0x3c, 0xc7, 0xf6, 0x3f, 0xbd, 0xc2, 0x39, 0xf7,
+ 0x3a, 0x25, 0xcb, 0xf0, 0x3c, 0xfe, 0x25, 0xb0, 0xbb, 0xd3, 0x39, 0x02, 0x3d,
+ 0xf8, 0xa3, 0x08, 0xbd, 0xba, 0xf2, 0x4e, 0xbd, 0x53, 0x83, 0x46, 0xbd, 0xae,
+ 0x06, 0x06, 0x3d, 0x69, 0xf3, 0x8f, 0x3d, 0xd3, 0x57, 0x35, 0x3c, 0x05, 0x92,
+ 0xb9, 0x3c, 0x60, 0x8e, 0x5b, 0x3b, 0xab, 0x7a, 0x8d, 0xbc, 0xf6, 0xdf, 0x87,
+ 0xbd, 0x0d, 0xc5, 0x81, 0x3d, 0xec, 0x93, 0x5f, 0x3d, 0xf6, 0x54, 0x85, 0x3d,
+ 0x86, 0xb3, 0x16, 0xbc, 0x7d, 0x95, 0x97, 0x3d, 0xff, 0xd8, 0x0c, 0x3d, 0x21,
+ 0x38, 0x6e, 0xbd, 0x68, 0xfc, 0x83, 0x3d, 0x5c, 0x54, 0x1b, 0xbc, 0x26, 0x1d,
+ 0x03, 0x3d, 0xd8, 0xaa, 0x90, 0xbd, 0xa9, 0x58, 0x0b, 0x3b, 0x02, 0x4e, 0x40,
+ 0xbd, 0xdc, 0x76, 0xe0, 0xbb, 0x14, 0x2e, 0x24, 0x3d, 0xbb, 0x6b, 0xfe, 0x3b,
+ 0xfd, 0xb5, 0x99, 0xbd, 0x4b, 0x2b, 0x0e, 0xbd, 0x2f, 0xc8, 0x69, 0xbd, 0xff,
+ 0xf0, 0x04, 0x3d, 0x46, 0x9c, 0x13, 0x3c, 0x74, 0x89, 0x2e, 0x3d, 0xbe, 0x6e,
+ 0x52, 0xbd, 0x59, 0x23, 0x34, 0x3d, 0x72, 0x3a, 0x3e, 0xbd, 0xf8, 0x03, 0x7a,
+ 0x3d, 0x8e, 0xab, 0x74, 0x3c, 0x6e, 0x5e, 0x82, 0x3d, 0x16, 0x5b, 0x25, 0x3c,
+ 0x56, 0x2c, 0xe7, 0xbd, 0x19, 0x4d, 0xc0, 0x3d, 0x8a, 0xb3, 0xdb, 0xbd, 0x34,
+ 0xe5, 0x67, 0xbc, 0x0f, 0x5d, 0x35, 0x3d, 0xad, 0xad, 0x94, 0x3d, 0xa5, 0xc3,
+ 0xba, 0xba, 0xb4, 0x7f, 0x02, 0x3e, 0xde, 0xcd, 0x8d, 0x3d, 0xc3, 0xa4, 0xa4,
+ 0xbd, 0x7e, 0x1b, 0x37, 0x3d, 0xde, 0xb4, 0x91, 0xbd, 0x78, 0xf2, 0x62, 0xbd,
+ 0x25, 0x4f, 0x60, 0xbd, 0x4e, 0xd2, 0x25, 0xbd, 0xd3, 0xc3, 0xe8, 0xbb, 0x7f,
+ 0x00, 0x68, 0x3d, 0x7a, 0x9c, 0x1e, 0xbd, 0x17, 0x70, 0x81, 0x3c, 0xda, 0xb3,
+ 0x68, 0x3d, 0xab, 0xf3, 0xb4, 0xbc, 0x46, 0x70, 0x16, 0xbd, 0x22, 0xe5, 0x82,
+ 0x3d, 0x75, 0x02, 0x5a, 0x3d, 0xb5, 0xce, 0x86, 0xbd, 0x20, 0x29, 0xa8, 0xbb,
+ 0xe5, 0x29, 0x95, 0xbd, 0x63, 0x0c, 0x5f, 0xbd, 0x42, 0x39, 0x99, 0xbc, 0x27,
+ 0xd6, 0x82, 0xbb, 0x33, 0x1c, 0xda, 0xbc, 0x93, 0x96, 0x76, 0x3d, 0xd3, 0x8c,
+ 0xd3, 0xbd, 0x75, 0x39, 0xe1, 0x3d, 0x42, 0x5b, 0x98, 0xbd, 0x5a, 0xc4, 0x4f,
+ 0x3d, 0x3b, 0xb0, 0x14, 0xbd, 0xfc, 0x99, 0x4b, 0xbc, 0xd4, 0x88, 0x13, 0xbb,
+ 0x6c, 0xca, 0xc4, 0x3d, 0xd4, 0xdc, 0xb1, 0x3d, 0x62, 0x2a, 0x8d, 0x3c, 0xd8,
+ 0x1b, 0xb7, 0x3c, 0x0b, 0x8d, 0xba, 0xbb, 0x78, 0x25, 0x5c, 0xbd, 0xb9, 0xc6,
+ 0xbb, 0xba, 0x26, 0x58, 0xc5, 0xbd, 0x5d, 0x48, 0xb7, 0xbd, 0x71, 0x0d, 0x0e,
+ 0x3d, 0xa8, 0xa7, 0x54, 0xbd, 0x88, 0xfe, 0x84, 0xbc, 0x0b, 0x64, 0x1b, 0xbc,
+ 0xba, 0xaa, 0x8e, 0x3c, 0x89, 0x54, 0xa5, 0xbc, 0xde, 0x32, 0x9c, 0x3c, 0x90,
+ 0x13, 0x66, 0xbd, 0xb2, 0x5e, 0x11, 0xbd, 0xd0, 0x5e, 0xfb, 0xbb, 0x2e, 0x6c,
+ 0x8c, 0xbd, 0x09, 0x4b, 0x2f, 0xbc, 0xa8, 0x5d, 0x27, 0xbd, 0xad, 0xd8, 0x2e,
+ 0x3d, 0x78, 0x5e, 0xf0, 0x3c, 0x8e, 0xc0, 0x12, 0x3d, 0x49, 0xb5, 0xca, 0xbd,
+ 0x1b, 0x2e, 0xb0, 0x3d, 0xeb, 0x3c, 0x8b, 0xbd, 0xe2, 0x4b, 0xd6, 0xbc, 0x14,
+ 0xdf, 0xc3, 0x3c, 0x42, 0x9c, 0x87, 0x3c, 0xb7, 0x90, 0x18, 0x3d, 0xcb, 0x8a,
+ 0xd8, 0x3d, 0xc1, 0x0c, 0x97, 0x3d, 0x35, 0xe8, 0xd3, 0x3c, 0xb1, 0x05, 0x28,
+ 0x3d, 0x03, 0xd2, 0xbc, 0x3d, 0x56, 0xce, 0x44, 0x3d, 0x9f, 0xbf, 0x24, 0x3d,
+ 0x21, 0x81, 0x81, 0xbd, 0xc0, 0xa2, 0xda, 0xbd, 0x50, 0x42, 0x27, 0x3d, 0x5f,
+ 0xb2, 0xb9, 0x3c, 0x04, 0x67, 0x6c, 0x3d, 0xce, 0x89, 0x2c, 0xbd, 0x08, 0x2d,
+ 0x4b, 0x3c, 0x88, 0x86, 0xf7, 0x3c, 0xcd, 0x8e, 0x94, 0x3d, 0x5a, 0x47, 0x6f,
+ 0x3d, 0x67, 0xf4, 0xa2, 0xbd, 0xe3, 0x50, 0x91, 0xbd, 0xde, 0x9e, 0x84, 0x3d,
+ 0xb3, 0x05, 0xbf, 0x3c, 0x10, 0x17, 0x34, 0x3d, 0xf4, 0x1f, 0x0e, 0xbd, 0x47,
+ 0xb9, 0x49, 0x3d, 0xb1, 0x61, 0x10, 0x3d, 0x2a, 0x64, 0x90, 0xbd, 0x1e, 0xc9,
+ 0xb8, 0x3c, 0x7d, 0x23, 0xb8, 0xbd, 0x19, 0x60, 0x85, 0x3d, 0x44, 0xb5, 0x4d,
+ 0xbd, 0x05, 0x79, 0xec, 0x3b, 0xea, 0x1e, 0x21, 0xbd, 0xeb, 0x34, 0x59, 0x3d,
+ 0x50, 0xa9, 0x00, 0x3d, 0x72, 0xf1, 0x4c, 0xb9, 0x98, 0x35, 0xc1, 0x3d, 0xbb,
+ 0x18, 0x36, 0x3d, 0x19, 0x70, 0x62, 0xbd, 0xc5, 0xae, 0x75, 0x3d, 0x27, 0x77,
+ 0xec, 0xbc, 0xab, 0x6d, 0xe1, 0xbd, 0x75, 0x4a, 0xae, 0x3c, 0x2d, 0xea, 0x18,
+ 0xbb, 0xdc, 0x0e, 0x7b, 0x3d, 0xb2, 0x28, 0x24, 0xbd, 0x69, 0xd2, 0x78, 0xbd,
+ 0xed, 0x29, 0x5f, 0xbc, 0xd9, 0x6e, 0x44, 0x3d, 0x3c, 0x6c, 0x87, 0xbd, 0xa5,
+ 0xdf, 0x96, 0xbc, 0x1c, 0x4c, 0x35, 0x3d, 0x54, 0x97, 0x57, 0xbd, 0xe9, 0x88,
+ 0x40, 0xbd, 0x6d, 0x9d, 0x71, 0x3c, 0x3f, 0x74, 0xaf, 0xbb, 0x41, 0xfa, 0x4b,
+ 0x3d, 0x20, 0xe8, 0x7a, 0xbc, 0xe4, 0x37, 0xbe, 0xbd, 0xfa, 0xa2, 0x44, 0xbc,
+ 0x2a, 0x3c, 0x61, 0xbd, 0xec, 0x0f, 0x0c, 0x3d, 0xd7, 0xef, 0x82, 0xbd, 0x0b,
+ 0xe4, 0xd2, 0xbc, 0xd2, 0x57, 0x04, 0x3c, 0xa8, 0x6e, 0xce, 0x3d, 0x3c, 0xd8,
+ 0xa4, 0x3b, 0x1d, 0x19, 0x45, 0xbd, 0xd6, 0x4d, 0x70, 0x3c, 0xed, 0x12, 0xf0,
+ 0xbc, 0x1f, 0xc6, 0x4c, 0x3c, 0xeb, 0x27, 0x8e, 0xbc, 0x6a, 0xf8, 0x4f, 0x3d,
+ 0xcf, 0x2c, 0xe3, 0xbd, 0x3b, 0xc9, 0x05, 0xbb, 0xe0, 0xfa, 0xfd, 0x3c, 0xfe,
+ 0xb8, 0xfb, 0xbc, 0x84, 0xd9, 0x8b, 0x3d, 0xad, 0x88, 0x00, 0x3d, 0x21, 0xfa,
+ 0x47, 0x3d, 0xf6, 0x17, 0x0d, 0xbd, 0xc5, 0x0c, 0xf1, 0x3c, 0xec, 0x3c, 0x13,
+ 0xbd, 0x1a, 0x06, 0x4b, 0xbd, 0x76, 0x04, 0xa4, 0xbc, 0x89, 0x87, 0x92, 0x3d,
+ 0xd2, 0xc6, 0xaf, 0x3d, 0xb1, 0xb1, 0x12, 0x3d, 0x99, 0xa4, 0x23, 0x3d, 0x25,
+ 0x73, 0x75, 0x3b, 0x18, 0x34, 0xa1, 0xbd, 0xc0, 0x90, 0xa5, 0x3d, 0xaa, 0xa8,
+ 0x14, 0xbd, 0x6c, 0xbc, 0xf3, 0x3c, 0x8a, 0x47, 0x51, 0xbc, 0xab, 0xfc, 0x2a,
+ 0x3d, 0xc8, 0xb7, 0x68, 0x3d, 0xff, 0xbf, 0x72, 0x3d, 0x38, 0x39, 0x95, 0x3d,
+ 0xdc, 0x49, 0x94, 0xbc, 0xbd, 0xce, 0x90, 0x3c, 0xcd, 0x13, 0x35, 0x3d, 0xd4,
+ 0xd9, 0x51, 0xbd, 0x16, 0xde, 0xfb, 0xbc, 0xc7, 0x00, 0xb9, 0xbd, 0x38, 0x8e,
+ 0x2e, 0xbc, 0xcb, 0xce, 0x5e, 0x3d, 0x44, 0x22, 0x7a, 0x3c, 0x70, 0x0a, 0x93,
+ 0x3d, 0x9c, 0x88, 0x81, 0x3a, 0x02, 0x89, 0x01, 0xbd, 0x52, 0x9b, 0x50, 0xbc,
+ 0xc7, 0x6f, 0x46, 0x3c, 0x41, 0xb4, 0x57, 0x3d, 0x79, 0x89, 0xd2, 0x3b, 0x20,
+ 0xab, 0x75, 0x3b, 0x40, 0xf2, 0xea, 0x3c, 0x8f, 0x29, 0x8c, 0x3d, 0xb0, 0x20,
+ 0x45, 0xbd, 0xf4, 0x67, 0x8c, 0x3d, 0xbf, 0x3f, 0x9d, 0x3c, 0xa7, 0x71, 0x01,
+ 0xbd, 0x37, 0x6b, 0x02, 0xbc, 0x68, 0xc4, 0x2a, 0x3d, 0x43, 0x60, 0x9b, 0xbc,
+ 0x72, 0xb9, 0x73, 0xbd, 0x90, 0xc4, 0x13, 0x3c, 0xba, 0xbf, 0x50, 0xbb, 0x86,
+ 0x75, 0x78, 0xbd, 0x2e, 0xaf, 0x69, 0xbc, 0xdb, 0x89, 0xbc, 0x3d, 0x05, 0x7f,
+ 0xa8, 0xbd, 0x42, 0x5f, 0x02, 0x3d, 0xe1, 0x3c, 0x12, 0xbd, 0xfd, 0xdf, 0x41,
+ 0x3d, 0x2e, 0xda, 0xe3, 0xbb, 0x80, 0x3c, 0x5f, 0xbd, 0x26, 0x2b, 0x1f, 0xbd,
+ 0xa8, 0xed, 0xd5, 0x3c, 0xa6, 0x84, 0xf1, 0x3c, 0xbe, 0xd2, 0x9a, 0xbb, 0x5b,
+ 0x04, 0x61, 0x3d, 0x2b, 0xe5, 0x06, 0xbd, 0xc9, 0xb8, 0x85, 0x3c, 0x64, 0x7a,
+ 0xc7, 0x3d, 0x4c, 0x12, 0xc9, 0x3c, 0x69, 0x12, 0x63, 0xbd, 0x88, 0x73, 0xbf,
+ 0x3c, 0xfc, 0x66, 0x50, 0xbb, 0x64, 0x31, 0x9a, 0xbd, 0xeb, 0x81, 0x8d, 0x3d,
+ 0x7e, 0x4e, 0xc5, 0x3c, 0x15, 0x80, 0x96, 0x3d, 0xb9, 0x1f, 0x65, 0xbd, 0xe3,
+ 0x99, 0xda, 0xbd, 0x94, 0x02, 0x4a, 0x3c, 0xbf, 0x7b, 0x26, 0x3d, 0x20, 0xae,
+ 0x9d, 0xbb, 0x84, 0x49, 0x1e, 0x3d, 0x88, 0x11, 0x17, 0x3d, 0x45, 0x77, 0x73,
+ 0x3c, 0x76, 0x33, 0xaa, 0x3c, 0x28, 0x4d, 0x4b, 0x3d, 0x49, 0x89, 0x37, 0x3c,
+ 0x3f, 0xe6, 0x92, 0xbd, 0xc8, 0x39, 0xa0, 0x3c, 0xd6, 0xff, 0x0a, 0x3b, 0xb4,
+ 0xef, 0xad, 0xbd, 0xdb, 0x17, 0x19, 0x3c, 0x9a, 0x54, 0x7c, 0xbd, 0xe7, 0x50,
+ 0xcc, 0x3c, 0x91, 0xeb, 0x75, 0xbd, 0x9a, 0x45, 0xac, 0x3d, 0xd3, 0x80, 0x4d,
+ 0xbd, 0x17, 0x6c, 0x19, 0x3c, 0x47, 0xb1, 0x1f, 0xbd, 0xef, 0x17, 0x1d, 0xbd,
+ 0xa2, 0xc8, 0x58, 0xbc, 0xf9, 0xc6, 0x81, 0xbb, 0x70, 0xfc, 0xa1, 0x3b, 0x70,
+ 0x74, 0x38, 0x3d, 0xb9, 0x93, 0x6c, 0x3d, 0xb5, 0x22, 0x89, 0x3d, 0xa8, 0x15,
+ 0xed, 0xbb, 0xee, 0x0c, 0xac, 0xbc, 0xbf, 0xca, 0xbe, 0xbc, 0x8e, 0x0d, 0xbf,
+ 0xbd, 0xfb, 0x0c, 0x92, 0x3c, 0x3d, 0x1e, 0x61, 0xbd, 0xe1, 0xb2, 0x08, 0xbd,
+ 0xcd, 0xab, 0x75, 0xbb, 0xc5, 0x1a, 0x2f, 0x3d, 0x4f, 0x02, 0x92, 0x3c, 0x8f,
+ 0x47, 0x20, 0x3d, 0x33, 0xac, 0xc3, 0x3d, 0xc9, 0xdc, 0xbd, 0xbc, 0x68, 0x6e,
+ 0xb4, 0x3b, 0x32, 0x32, 0xdc, 0x3d, 0xd8, 0xff, 0x92, 0x3d, 0xb3, 0xa4, 0x6f,
+ 0xbd, 0xf0, 0xbe, 0x13, 0xbd, 0xff, 0xf5, 0xdf, 0xbd, 0x67, 0xeb, 0x94, 0x3c,
+ 0xb2, 0xe8, 0x57, 0xbb, 0x92, 0x3f, 0xdc, 0xbb, 0xe3, 0x5f, 0x6b, 0x3c, 0x02,
+ 0xcc, 0x6c, 0xbd, 0x25, 0xa1, 0x57, 0xbd, 0x22, 0x01, 0x82, 0x3d, 0xc3, 0xcf,
+ 0xb2, 0x3c, 0xed, 0x35, 0x56, 0xbb, 0xe3, 0xf0, 0x8c, 0x3d, 0xdb, 0xf1, 0xb1,
+ 0xbc, 0xaa, 0xe4, 0xc2, 0x3b, 0x53, 0x9c, 0xf6, 0xbc, 0x15, 0x86, 0x92, 0x3d,
+ 0xe4, 0xf9, 0x39, 0x3d, 0x09, 0xa5, 0xa8, 0xbc, 0x6e, 0x89, 0xd1, 0xbc, 0x47,
+ 0xd4, 0x7b, 0x3c, 0x7b, 0xff, 0xab, 0x3c, 0x15, 0x58, 0x8d, 0xbd, 0x7b, 0x21,
+ 0xac, 0x3c, 0xda, 0xe5, 0xad, 0xbc, 0x8b, 0xfc, 0xd8, 0xbc, 0x8c, 0xe1, 0x0e,
+ 0xbc, 0x36, 0x43, 0xc6, 0x3d, 0xfa, 0x15, 0x8b, 0xbc, 0xb8, 0xd0, 0x07, 0x3d,
+ 0xd9, 0x12, 0x9c, 0x3c, 0x81, 0x20, 0x4f, 0xbd, 0xd8, 0x7f, 0x18, 0x3b, 0x38,
+ 0xd4, 0x33, 0xbc, 0x00, 0x0f, 0xe2, 0xbd, 0x25, 0xa8, 0xf2, 0x3c, 0x87, 0xa6,
+ 0x96, 0xbd, 0x84, 0xc3, 0xa8, 0x3c, 0xf4, 0x7a, 0x8b, 0x3c, 0xfd, 0xbd, 0x55,
+ 0xbc, 0x45, 0x00, 0x97, 0xbd, 0x81, 0x3a, 0xbd, 0x3b, 0x21, 0x43, 0x30, 0xbd,
+ 0x94, 0x58, 0xa5, 0x3b, 0x30, 0x2f, 0x12, 0xbd, 0xcb, 0xd3, 0x32, 0x3d, 0x36,
+ 0xd2, 0x7c, 0xbd, 0xf2, 0x77, 0x49, 0x3d, 0x87, 0xdd, 0x87, 0xbc, 0x3d, 0x1a,
+ 0x02, 0x3d, 0x5a, 0x1b, 0xc1, 0x3c, 0x04, 0xaf, 0x33, 0xbd, 0x84, 0x02, 0x1d,
+ 0x3d, 0x47, 0x7d, 0x21, 0xbd, 0x46, 0xc4, 0x24, 0x3d, 0x8f, 0x16, 0x27, 0x3d,
+ 0xce, 0x48, 0x22, 0x3d, 0xd9, 0x6b, 0xa3, 0x3c, 0x31, 0x91, 0xbb, 0x3c, 0xef,
+ 0x24, 0x88, 0xbb, 0x1e, 0x6e, 0x41, 0xbd, 0x81, 0xea, 0x80, 0x3d, 0xa6, 0xa7,
+ 0xf2, 0x3d, 0x74, 0xcf, 0xd7, 0x3c, 0x4c, 0x85, 0xf6, 0xbc, 0x57, 0xac, 0x0f,
+ 0x3c, 0x1c, 0x44, 0x53, 0xbd, 0x44, 0x55, 0x35, 0x3d, 0x14, 0x45, 0x11, 0x3d,
+ 0x0d, 0xfa, 0xff, 0xbc, 0xe0, 0xef, 0x32, 0x3d, 0x6c, 0x60, 0xac, 0x3b, 0xd2,
+ 0xe0, 0xab, 0xbb, 0x77, 0x02, 0x3f, 0xbd, 0xcd, 0x77, 0x44, 0x3d, 0x4f, 0x8c,
+ 0x3e, 0xbd, 0x74, 0xd6, 0x5a, 0xbd, 0x33, 0xb6, 0xf2, 0xbc, 0x94, 0xe4, 0x0e,
+ 0x3b, 0x6c, 0x9b, 0xa9, 0x3a, 0x61, 0xd7, 0xea, 0xbc, 0xf6, 0x70, 0xe9, 0x3c,
+ 0x06, 0x81, 0xeb, 0xbc, 0x51, 0x88, 0x47, 0xbb, 0x6c, 0xfb, 0x6d, 0x3d, 0x0a,
+ 0x9d, 0x29, 0xbb, 0xa0, 0x45, 0x36, 0x3c, 0xe5, 0xd9, 0xb8, 0x3c, 0x09, 0xf4,
+ 0x09, 0xbd, 0x2a, 0x13, 0x54, 0xbc, 0xad, 0xb0, 0xa3, 0x3d, 0x5a, 0x07, 0xff,
+ 0x3c, 0x18, 0x10, 0xc9, 0x3c, 0x15, 0xf6, 0x07, 0xbd, 0x05, 0x70, 0x60, 0x3d,
+ 0xb5, 0xbd, 0x50, 0x3d, 0xeb, 0xe1, 0x11, 0x3d, 0xdf, 0x70, 0x40, 0xbd, 0x51,
+ 0x6f, 0x67, 0xbd, 0x61, 0xbf, 0xd0, 0x3c, 0x39, 0x5e, 0x14, 0xbd, 0xae, 0x58,
+ 0xa1, 0x3d, 0xa2, 0x03, 0x88, 0x3d, 0x85, 0x40, 0x89, 0xbd, 0x3e, 0x4f, 0x21,
+ 0x3c, 0x8b, 0x40, 0xcf, 0x3c, 0xa8, 0x0d, 0x76, 0x3d, 0x2f, 0x57, 0xf4, 0x3b,
+ 0x78, 0x71, 0x8f, 0x3c, 0x15, 0x80, 0x72, 0x3d, 0x35, 0xc6, 0xe6, 0xbc, 0x1e,
+ 0xdb, 0x8d, 0x3d, 0xc1, 0x52, 0x58, 0x3d, 0x1e, 0x0c, 0x37, 0x3d, 0x68, 0xdd,
+ 0x25, 0x3d, 0x1a, 0x65, 0x59, 0xbc, 0x22, 0xe3, 0x8b, 0x3d, 0x29, 0xb2, 0x44,
+ 0xbd, 0x56, 0x71, 0x34, 0xbd, 0x1c, 0x3f, 0x7c, 0xbb, 0x88, 0x17, 0x72, 0xbc,
+ 0xbb, 0xb5, 0xae, 0x3c, 0xdd, 0x7b, 0xd5, 0x3c, 0xd3, 0x2f, 0x93, 0x3d, 0x07,
+ 0x46, 0x38, 0x3d, 0x55, 0x2b, 0x47, 0x3d, 0xd2, 0x5c, 0xda, 0x3d, 0xa4, 0x8e,
+ 0x80, 0x3d, 0xe6, 0xdb, 0xc9, 0x3c, 0xf3, 0x2d, 0x3f, 0xbd, 0x66, 0x10, 0xd1,
+ 0xbd, 0xde, 0xa5, 0xda, 0x3c, 0xab, 0x8c, 0xe4, 0x3c, 0x85, 0x1c, 0xc0, 0x3c,
+ 0xba, 0xe5, 0x95, 0xbd, 0x25, 0x50, 0x92, 0x3c, 0x25, 0x15, 0xc9, 0xba, 0x43,
+ 0xdc, 0x63, 0xbc, 0x65, 0xd6, 0x07, 0x3d, 0x87, 0x8c, 0x0e, 0xbc, 0x0d, 0x90,
+ 0x87, 0x3d, 0x9a, 0x0e, 0x4a, 0x3d, 0x67, 0x54, 0x4a, 0x3d, 0x63, 0x8b, 0x24,
+ 0xbd, 0x56, 0x2c, 0xcf, 0xbc, 0x28, 0x2a, 0x23, 0x3d, 0xc6, 0x80, 0xa3, 0xbc,
+ 0x66, 0xe5, 0x09, 0xbd, 0x69, 0xdb, 0x93, 0x3d, 0x00, 0xc7, 0x7e, 0xbd, 0xe0,
+ 0x18, 0x06, 0x3d, 0x02, 0xb9, 0x77, 0xbd, 0x43, 0x60, 0x55, 0x3c, 0x46, 0x45,
+ 0xa4, 0x3d, 0xb1, 0x0a, 0xac, 0x3c, 0x8a, 0xc5, 0x8e, 0x3d, 0xf6, 0x60, 0x31,
+ 0xbc, 0x9b, 0x2d, 0xb0, 0x3a, 0xc3, 0xc4, 0x4a, 0xbd, 0x96, 0x31, 0x82, 0xbd,
+ 0x4e, 0x50, 0x59, 0x3c, 0x2f, 0xf7, 0xd4, 0xbd, 0x18, 0xc1, 0x2b, 0xbd, 0xb8,
+ 0x26, 0x9d, 0x3c, 0xd6, 0x9c, 0x3b, 0xbd, 0xb6, 0xdd, 0x11, 0xbd, 0x4e, 0x51,
+ 0xd9, 0x3b, 0xbd, 0xfd, 0x3b, 0xbd, 0xe2, 0xe9, 0x35, 0xbc, 0x0d, 0xb1, 0x9c,
+ 0x3c, 0x02, 0x6e, 0xab, 0x3c, 0xc9, 0x70, 0x25, 0x3c, 0xae, 0xe4, 0x60, 0xbd,
+ 0x11, 0xc2, 0x49, 0x3d, 0x9b, 0x09, 0xaf, 0xbc, 0xbc, 0x74, 0x75, 0x3c, 0x38,
+ 0x61, 0x16, 0x3d, 0x0c, 0x99, 0x94, 0x3d, 0x01, 0x83, 0x03, 0xbb, 0xc5, 0x45,
+ 0x1b, 0x3d, 0x82, 0xab, 0x6f, 0x3c, 0xe1, 0x41, 0xce, 0x3c, 0x86, 0xd5, 0x79,
+ 0xbd, 0x0e, 0x6c, 0x69, 0x3d, 0xcf, 0xbb, 0x87, 0x3d, 0x65, 0x17, 0xb4, 0xbc,
+ 0xca, 0x64, 0x07, 0x3e, 0x7d, 0x34, 0xca, 0x3d, 0x40, 0x0d, 0xfb, 0x3c, 0x0e,
+ 0xea, 0xc2, 0x3c, 0x06, 0x26, 0x88, 0xbc, 0xed, 0x76, 0x84, 0x3d, 0xca, 0x92,
+ 0xa4, 0xbc, 0x4c, 0x98, 0x74, 0xbd, 0x62, 0x77, 0xdb, 0xbd, 0x97, 0xba, 0x87,
+ 0x3d, 0xe9, 0x05, 0x95, 0xbd, 0xcc, 0xfd, 0x99, 0x3d, 0x36, 0x01, 0x0b, 0xbd,
+ 0x23, 0x33, 0x7d, 0x3d, 0x2f, 0xba, 0x5c, 0x3d, 0xaa, 0xed, 0xb2, 0xbc, 0xfc,
+ 0xe7, 0x97, 0x3d, 0xaa, 0x40, 0x7d, 0x3d, 0x2a, 0x5f, 0x5e, 0x3d, 0x51, 0x91,
+ 0x7d, 0xbd, 0xc8, 0xf8, 0x2a, 0x3d, 0x7b, 0x8c, 0x2f, 0x3d, 0x35, 0xe0, 0xb9,
+ 0xbb, 0xc4, 0x0b, 0x56, 0xbd, 0xcf, 0xd0, 0xb8, 0x3c, 0xf7, 0xef, 0x61, 0x3d,
+ 0xf5, 0x33, 0x9a, 0x3d, 0x07, 0xd8, 0xf0, 0xbc, 0x34, 0x49, 0x61, 0xbd, 0x7c,
+ 0x0c, 0x74, 0xbd, 0x0c, 0x85, 0xf7, 0xbc, 0xeb, 0x13, 0xdd, 0xbc, 0x70, 0x3a,
+ 0xd1, 0x3c, 0xd0, 0x31, 0xe1, 0x3d, 0xbf, 0xb4, 0x90, 0xbd, 0x6c, 0x8a, 0x4f,
+ 0xbc, 0x89, 0x66, 0x29, 0xbc, 0x5d, 0x8a, 0x18, 0xbd, 0xa4, 0x2b, 0x91, 0xbd,
+ 0x6a, 0x8d, 0x2b, 0xb9, 0x44, 0x9f, 0xf1, 0xbd, 0xe3, 0x9a, 0x87, 0x3c, 0x3c,
+ 0x77, 0x5c, 0x3d, 0x1b, 0x6f, 0x50, 0xbd, 0x43, 0x9e, 0x41, 0xbd, 0x13, 0x6f,
+ 0x5d, 0x3d, 0x44, 0x7f, 0x67, 0x3c, 0xf5, 0x9e, 0x31, 0x3c, 0xc0, 0x48, 0x8b,
+ 0x3d, 0x48, 0xc4, 0xd0, 0xbc, 0x80, 0x20, 0x17, 0x3a, 0x4c, 0x44, 0x42, 0x3b,
+ 0xcd, 0x50, 0x0e, 0x3d, 0xf8, 0xdd, 0x6a, 0x3d, 0xa7, 0xa4, 0x57, 0x3c, 0x5c,
+ 0x60, 0x94, 0x3c, 0xd4, 0x6e, 0x34, 0xbc, 0xa3, 0xa2, 0x8e, 0xbd, 0x88, 0xe0,
+ 0xad, 0x3d, 0xdb, 0xd6, 0x9f, 0xbd, 0x14, 0xcb, 0x61, 0xbd, 0x02, 0x50, 0x7f,
+ 0xbd, 0xb9, 0x4c, 0x9d, 0x3d, 0x0d, 0x5a, 0x88, 0x3d, 0x8b, 0x0a, 0x06, 0x3c,
+ 0xdf, 0x17, 0x8e, 0x3d, 0x75, 0x07, 0x0c, 0x3d, 0x5d, 0xd3, 0x52, 0xbd, 0x22,
+ 0x56, 0x0b, 0x3a, 0x62, 0x34, 0xcb, 0xbc, 0x55, 0x58, 0xaa, 0x3c, 0x72, 0x28,
+ 0xa3, 0xbd, 0x60, 0x8d, 0x3f, 0xbc, 0x5b, 0xaa, 0x51, 0xbb, 0xa8, 0x60, 0x31,
+ 0xbd, 0x8c, 0xc5, 0xfb, 0x3c, 0x90, 0x97, 0x3f, 0xbc, 0x94, 0x3a, 0x45, 0xbd,
+ 0xb5, 0xc1, 0x8d, 0xbd, 0x07, 0xd0, 0x08, 0x3d, 0x47, 0x05, 0xe2, 0xbb, 0x69,
+ 0x2e, 0x16, 0x3d, 0xd0, 0x2d, 0x50, 0xbd, 0xd3, 0x88, 0x9e, 0x3d, 0x2f, 0x19,
+ 0xbb, 0xbc, 0x20, 0x1f, 0xa4, 0x3d, 0x38, 0x4e, 0x9c, 0xbc, 0x71, 0x5a, 0x6e,
+ 0x3c, 0x47, 0x9a, 0x49, 0x3d, 0x7a, 0x7b, 0x07, 0x3a, 0x54, 0xf5, 0xcd, 0x3d,
+ 0x54, 0xb0, 0xde, 0x3c, 0xb0, 0xbd, 0x1b, 0x3c, 0x31, 0x85, 0x2c, 0xbd, 0xda,
+ 0x03, 0xe4, 0xbb, 0x9e, 0xf5, 0x87, 0x3d, 0xef, 0x15, 0x41, 0x3d, 0x82, 0x56,
+ 0xa3, 0x3d, 0xfa, 0x31, 0x5e, 0xbd, 0xf2, 0x5e, 0x5f, 0xbb, 0x1c, 0xda, 0x9f,
+ 0x3d, 0x45, 0x09, 0x71, 0xbc, 0x37, 0x80, 0x9a, 0x3b, 0x5a, 0x7a, 0xfd, 0xbc,
+ 0x37, 0x4f, 0x1a, 0xbe, 0xfa, 0x30, 0xeb, 0xbc, 0xa9, 0xd5, 0x74, 0xbd, 0x18,
+ 0xad, 0x9b, 0xbc, 0x00, 0xc4, 0xce, 0x3a, 0x98, 0x58, 0x19, 0x3c, 0xf0, 0x22,
+ 0xa1, 0x3b, 0x84, 0xfa, 0x08, 0xbd, 0x6f, 0xfe, 0x96, 0x3d, 0xe3, 0xc4, 0x90,
+ 0x3d, 0xa0, 0xc8, 0x5a, 0xbc, 0x97, 0x7f, 0xc2, 0xbc, 0xea, 0xcc, 0xcc, 0x3c,
+ 0xae, 0xb0, 0x9c, 0xbc, 0x49, 0xdf, 0x97, 0xbc, 0xdd, 0x01, 0x18, 0xbd, 0x66,
+ 0x26, 0xa7, 0xbc, 0x2a, 0x3d, 0x59, 0xbd, 0x93, 0x1b, 0x1a, 0x3d, 0xd9, 0x46,
+ 0xcc, 0x3c, 0x00, 0xf0, 0x34, 0x3a, 0x99, 0x3d, 0xc0, 0xbc, 0x08, 0xb1, 0x09,
+ 0x3c, 0xbe, 0xfb, 0x79, 0x3d, 0xa9, 0x90, 0x86, 0xbd, 0xa2, 0x17, 0x8f, 0xbd,
+ 0x30, 0x94, 0x8a, 0xbb, 0xd9, 0xd7, 0x82, 0x3d, 0xe4, 0xea, 0x2f, 0xbd, 0x7e,
+ 0x59, 0x73, 0xbd, 0x46, 0x73, 0xe2, 0xbc, 0xe0, 0xd4, 0x42, 0xbc, 0x3c, 0x6c,
+ 0xdf, 0x3c, 0x08, 0xce, 0xf9, 0x3c, 0xfc, 0xe4, 0x79, 0xbd, 0xac, 0x5c, 0x4f,
+ 0xbd, 0x60, 0x67, 0x12, 0xbb, 0xb2, 0xcf, 0xbf, 0xbc, 0xe2, 0x7c, 0x31, 0xbd,
+ 0xb6, 0xc7, 0x18, 0x3d, 0xdc, 0x89, 0x90, 0xbd, 0x0c, 0xf7, 0x99, 0xbc, 0xa0,
+ 0x2a, 0x3c, 0xbd, 0x92, 0x1b, 0x38, 0x3d, 0x34, 0xe9, 0x86, 0xbd, 0x69, 0x76,
+ 0x6d, 0xbd, 0x76, 0x2b, 0x6e, 0x3d, 0x70, 0x53, 0x3f, 0x3d, 0x22, 0xe5, 0x4c,
+ 0x3d, 0x52, 0x57, 0xfc, 0xbc, 0xf8, 0x6b, 0x31, 0xbd, 0xb4, 0xb1, 0xa3, 0x3c,
+ 0x10, 0x0c, 0x60, 0x3c, 0xbc, 0x80, 0x85, 0xbd, 0xe6, 0x9f, 0x78, 0xbd, 0x00,
+ 0x20, 0x90, 0xba, 0xbc, 0x54, 0x5d, 0xbd, 0x6c, 0xd7, 0xc5, 0xbc, 0x87, 0x6b,
+ 0x87, 0x3d, 0x0a, 0x34, 0x0c, 0x3d, 0x44, 0xe5, 0x47, 0xbd, 0xe0, 0xd3, 0x05,
+ 0x3b, 0x23, 0x83, 0x11, 0xbd, 0xab, 0x22, 0x8c, 0xbd, 0x48, 0x17, 0xe9, 0x3c,
+ 0xbd, 0x8a, 0x89, 0x3d, 0xc0, 0x3a, 0x71, 0x3b, 0x08, 0x52, 0x61, 0x3c, 0x40,
+ 0xb4, 0x6d, 0x3c, 0xa0, 0x6a, 0xa0, 0x3b, 0x00, 0xc4, 0xb9, 0x39, 0x74, 0x71,
+ 0xa8, 0x3c, 0x13, 0xa7, 0x90, 0xbd, 0x04, 0xb5, 0xb4, 0xbc, 0x70, 0x36, 0x31,
+ 0x3c, 0x28, 0x25, 0x0f, 0x3c, 0xfc, 0x08, 0x46, 0xbd, 0x80, 0xa0, 0xa5, 0xba,
+ 0xe2, 0x11, 0x6f, 0xbd, 0x39, 0xf0, 0x31, 0xbd, 0xd8, 0xbe, 0x2f, 0xbd, 0x68,
+ 0x21, 0x4d, 0xbd, 0x64, 0x1b, 0x8e, 0xbd, 0x80, 0xd4, 0x78, 0xba, 0x92, 0x81,
+ 0x5a, 0xbd, 0xf4, 0xf9, 0x57, 0xbd, 0x80, 0x59, 0xa2, 0x3c, 0x22, 0xe6, 0xde,
+ 0xbc, 0x91, 0xdf, 0x87, 0xbd, 0x3a, 0xea, 0x22, 0xbd, 0xba, 0xf7, 0x75, 0x3d,
+ 0xba, 0x8a, 0x0c, 0x3d, 0x81, 0xa7, 0x8d, 0xbd, 0x90, 0xee, 0x50, 0xbd, 0x14,
+ 0xa3, 0x90, 0xbd, 0xdc, 0xdf, 0x81, 0x3c, 0x4a, 0xb5, 0x66, 0xbd, 0x10, 0xa0,
+ 0x94, 0x3b, 0x9a, 0x12, 0x2d, 0xbd, 0xda, 0x60, 0x42, 0xbd, 0xea, 0x9f, 0xb0,
+ 0xbc, 0x38, 0xfc, 0x02, 0x3d, 0xa6, 0x08, 0x04, 0x3d, 0x23, 0xf6, 0x03, 0xbd,
+ 0xa2, 0x7a, 0x63, 0x3d, 0x26, 0xca, 0x36, 0x3d, 0x96, 0xd3, 0x0d, 0x3d, 0x3f,
+ 0xfd, 0x89, 0x3d, 0x08, 0xa3, 0x24, 0xbd, 0x28, 0x10, 0x57, 0xbc, 0xbb, 0xb9,
+ 0x83, 0x3d, 0x50, 0x2b, 0xb5, 0x3b, 0x9c, 0x94, 0x19, 0xbc, 0xc4, 0x4d, 0x9a,
+ 0xbc, 0x91, 0xf8, 0x0d, 0xbd, 0x63, 0x13, 0x7d, 0xbd, 0xed, 0xd0, 0x02, 0xbd,
+ 0x1c, 0x10, 0x85, 0xbd, 0x00, 0xca, 0x36, 0x3c, 0xc8, 0x17, 0x7a, 0x3c, 0x24,
+ 0x32, 0xc7, 0xbc, 0x88, 0x75, 0xa5, 0x3c, 0x2e, 0x18, 0x39, 0xbd, 0xd4, 0xa9,
+ 0xfb, 0x3c, 0x8c, 0x61, 0x48, 0x3d, 0x40, 0x34, 0xb1, 0xba, 0xb7, 0xec, 0x83,
+ 0x3d, 0x7c, 0x1d, 0x5a, 0x3d, 0x30, 0x5c, 0x91, 0x3c, 0xcb, 0x9d, 0x85, 0x3d,
+ 0x74, 0xa8, 0x35, 0x3d, 0x93, 0x54, 0x76, 0xbd, 0xa3, 0xb8, 0x8c, 0xbd, 0xf3,
+ 0x38, 0x8d, 0xbd, 0x45, 0x41, 0x8d, 0xbd, 0xb0, 0x35, 0x2c, 0x3d, 0x79, 0x2f,
+ 0x91, 0x3d, 0x1c, 0xa0, 0xde, 0xbc, 0x26, 0xd7, 0x53, 0xbd, 0xec, 0x6e, 0x11,
+ 0x3d, 0x1c, 0x44, 0x8f, 0x3c, 0x2b, 0x97, 0x2b, 0xbd, 0x78, 0x4e, 0x62, 0xbc,
+ 0x4a, 0x20, 0xe3, 0xbc, 0x2e, 0x7e, 0xd5, 0xbc, 0x34, 0xe0, 0xcc, 0xbc, 0x00,
+ 0xd9, 0x05, 0x3d, 0x6e, 0xe3, 0xd8, 0xbc, 0x32, 0x01, 0x51, 0x3d, 0x57, 0x4a,
+ 0x83, 0x3d, 0x98, 0x90, 0x4c, 0xbd, 0x0d, 0x8e, 0x8b, 0x3d, 0x76, 0x2c, 0x32,
+ 0x3d, 0x6a, 0x76, 0x91, 0xbd, 0xc8, 0xf9, 0x85, 0x3c, 0x40, 0x2b, 0x80, 0x3a,
+ 0xe0, 0x00, 0xe3, 0xbb, 0x00, 0x06, 0x79, 0xb9, 0x27, 0xbd, 0x8f, 0x3d, 0xce,
+ 0x76, 0x2c, 0x3d, 0x56, 0x63, 0xd7, 0xbc, 0x30, 0x52, 0xf0, 0xbb, 0x69, 0x1f,
+ 0x85, 0xbd, 0x7e, 0xdb, 0x64, 0xbd, 0x85, 0xd6, 0x87, 0x3d, 0x92, 0xc0, 0x70,
+ 0x3d, 0x4c, 0x7a, 0x78, 0xbc, 0x6c, 0x7d, 0x2b, 0xbd, 0x6f, 0x2b, 0x85, 0x3d,
+ 0x98, 0x48, 0x39, 0xbd, 0x8c, 0x9d, 0xce, 0x3c, 0x08, 0xf9, 0x5c, 0xbc, 0xe8,
+ 0x5a, 0xcd, 0x3c, 0x88, 0xb0, 0x3c, 0x3d, 0xf8, 0x88, 0x4e, 0xbd, 0x30, 0x8f,
+ 0x38, 0x3c, 0xba, 0xa1, 0xc9, 0xbc, 0xba, 0xdc, 0x6d, 0x3d, 0xc0, 0x39, 0x5a,
+ 0xbb, 0xa6, 0x2d, 0x1d, 0x3d, 0x04, 0xde, 0xe4, 0x3c, 0x24, 0x67, 0x4f, 0xbd,
+ 0xde, 0xc0, 0x7c, 0x3d, 0x31, 0x68, 0x09, 0xbd, 0x01, 0x59, 0x80, 0xbd, 0x13,
+ 0x09, 0x91, 0x3d, 0xc8, 0xdd, 0x18, 0x3d, 0x2b, 0x88, 0x91, 0x3d, 0x50, 0xef,
+ 0x80, 0x3c, 0xec, 0x4a, 0x65, 0xbc, 0xb0, 0xca, 0x0a, 0x3d, 0x48, 0x1f, 0x29,
+ 0xbd, 0x56, 0xe9, 0x3a, 0x3d, 0xd0, 0x9c, 0x67, 0xbc, 0xe0, 0x47, 0xdb, 0xbc,
+ 0xd8, 0x70, 0x4a, 0xbd, 0x86, 0x63, 0x39, 0xbd, 0xfb, 0x2a, 0x10, 0xbd, 0xbc,
+ 0xfb, 0x42, 0xbd, 0xdc, 0x59, 0xe4, 0xbc, 0x2e, 0x08, 0x5f, 0xbd, 0x34, 0xb6,
+ 0xe1, 0x3c, 0x76, 0x68, 0x22, 0x3d, 0x18, 0x3d, 0x14, 0x3c, 0xa5, 0xa2, 0x8b,
+ 0xbd, 0x9c, 0x97, 0x87, 0xbd, 0xbd, 0x22, 0x87, 0x3d, 0x20, 0x18, 0x57, 0x3c,
+ 0xb6, 0x45, 0x5e, 0x3d, 0xa4, 0x1e, 0x63, 0xbd, 0x88, 0x1f, 0x68, 0x3c, 0xe0,
+ 0x00, 0x4f, 0x3d, 0x34, 0xe0, 0x5a, 0xbc, 0xd4, 0xd3, 0x61, 0xbc, 0x40, 0x8f,
+ 0x14, 0xbb, 0xae, 0x4e, 0x94, 0xbc, 0x8d, 0x80, 0x61, 0xbd, 0x11, 0xcc, 0x85,
+ 0x3d, 0xb4, 0x7b, 0x24, 0xbd, 0x3e, 0x81, 0x15, 0x3d, 0xaa, 0xe5, 0x85, 0xbd,
+ 0xa0, 0xa4, 0x2c, 0xbb, 0x02, 0x5e, 0x25, 0x3d, 0x5d, 0x8b, 0x37, 0xbd, 0xa1,
+ 0xb0, 0x25, 0xbd, 0x4a, 0xa5, 0x6b, 0x3d, 0xd3, 0x4a, 0x92, 0x3d, 0x40, 0x57,
+ 0x06, 0x3d, 0x20, 0xdd, 0x30, 0x3b, 0xb0, 0x9e, 0xd3, 0x3c, 0x62, 0xb5, 0xd8,
+ 0xbc, 0xa0, 0xec, 0x93, 0xbb, 0x20, 0xc4, 0x7a, 0x3b, 0xc0, 0x64, 0xfe, 0x3b,
+ 0xcb, 0xb4, 0x90, 0x3d, 0x3f, 0x87, 0x8c, 0x3d, 0xfa, 0x94, 0x21, 0x3d, 0x9c,
+ 0xc3, 0x03, 0x3d, 0xc2, 0x4f, 0x8d, 0xbc, 0x22, 0x1e, 0xd2, 0xbc, 0xa0, 0xd5,
+ 0x66, 0xbc, 0xba, 0xf8, 0xcd, 0xbc, 0x7f, 0x26, 0x60, 0xbd, 0x6c, 0x27, 0x90,
+ 0x3c, 0xf4, 0xd5, 0x85, 0x3c, 0xc0, 0x88, 0x3c, 0xbb, 0x8e, 0x17, 0x9d, 0xbc,
+ 0x34, 0xb8, 0xef, 0x3c, 0x78, 0x16, 0xbd, 0x3c, 0x41, 0x5e, 0x90, 0xbd, 0x3e,
+ 0x1c, 0x40, 0x3d, 0xeb, 0xf2, 0x8c, 0x3d, 0xd4, 0xb2, 0xa8, 0xbc, 0x0a, 0xae,
+ 0x29, 0x3d, 0x40, 0x78, 0x1c, 0xbb, 0x60, 0xfb, 0xd1, 0x3c, 0x9d, 0xd0, 0x84,
+ 0x3d, 0x8a, 0xcc, 0x08, 0x3d, 0x72, 0x4d, 0x41, 0x3d, 0xa9, 0x49, 0x50, 0xbd,
+ 0x92, 0x44, 0x1c, 0x3d, 0xc8, 0x15, 0x5f, 0xbd, 0x1a, 0xda, 0xb6, 0xbc, 0xb4,
+ 0x03, 0xd1, 0x3c, 0xdc, 0x8e, 0xb0, 0x3c, 0x88, 0x61, 0x7a, 0xbc, 0xb0, 0xab,
+ 0xc4, 0xbb, 0xa2, 0x9f, 0x35, 0xbd, 0xac, 0xc1, 0x1e, 0xbd, 0x78, 0xd0, 0x54,
+ 0x3d, 0x22, 0x03, 0xa9, 0xbc, 0x00, 0x71, 0x30, 0xbb, 0x30, 0xaa, 0xc8, 0x3b,
+ 0xa9, 0x9c, 0x35, 0xbd, 0x00, 0xb3, 0x09, 0xbb, 0x40, 0x51, 0x2e, 0x3c, 0xc8,
+ 0xb4, 0x23, 0x3c, 0x6d, 0xf4, 0x06, 0xbd, 0xaa, 0x77, 0x6f, 0x3d, 0xce, 0xc4,
+ 0xb1, 0xbc, 0x6f, 0x91, 0x8b, 0x3d, 0x5f, 0xc4, 0x8a, 0x3d, 0xe4, 0x1f, 0xac,
+ 0x3c, 0x4c, 0xc1, 0x89, 0x3c, 0x4c, 0x09, 0x5d, 0xbd, 0x38, 0x91, 0x3e, 0x3c,
+ 0xe0, 0x15, 0x30, 0xbd, 0x60, 0x09, 0xd2, 0x3c, 0xe0, 0x4f, 0x35, 0xbb, 0xe8,
+ 0xf2, 0xdf, 0xbc, 0x40, 0xa5, 0xcc, 0xba, 0x28, 0xaa, 0x04, 0xbc, 0xb4, 0x3b,
+ 0x3d, 0xbc, 0xa8, 0xbc, 0x9d, 0x3c, 0x22, 0x77, 0x51, 0x3d, 0xd3, 0x53, 0x48,
+ 0xbd, 0x80, 0x2a, 0x2c, 0x3b, 0x4e, 0x95, 0x79, 0x3d, 0x9c, 0x2c, 0x52, 0xbd,
+ 0xac, 0x7e, 0xd9, 0x3c, 0x76, 0xd7, 0x78, 0x3d, 0x00, 0xe8, 0x78, 0xbd, 0x2e,
+ 0x63, 0x0f, 0x3d, 0xeb, 0x59, 0x14, 0xbd, 0x84, 0xd4, 0x1c, 0xbc, 0x1d, 0x54,
+ 0x1a, 0xbd, 0xe0, 0x16, 0x5c, 0xbb, 0x5c, 0xf1, 0x48, 0x3d, 0x94, 0x95, 0x59,
+ 0xbc, 0x48, 0x14, 0x37, 0xbd, 0x3e, 0x60, 0x76, 0x3d, 0xb4, 0x88, 0xdb, 0x3c,
+ 0x24, 0xf3, 0x8b, 0xbc, 0xb8, 0x6e, 0x0f, 0x3d, 0x00, 0x2c, 0xda, 0x3a, 0x79,
+ 0x80, 0x88, 0x3d, 0x58, 0xf7, 0x26, 0x3c, 0x10, 0x19, 0x45, 0x3d, 0xf9, 0xba,
+ 0x6a, 0xbd, 0x0e, 0x30, 0x43, 0x3d, 0xe0, 0x09, 0x68, 0x3b, 0x51, 0x84, 0x8f,
+ 0xbd, 0x6a, 0xa1, 0x7a, 0xbd, 0xbc, 0x1c, 0x72, 0xbd, 0x94, 0xf7, 0x75, 0xbd,
+ 0xc8, 0x32, 0x69, 0xbd, 0xf5, 0x29, 0x1e, 0xbd, 0x00, 0xe7, 0x59, 0x3a, 0x90,
+ 0x9c, 0x84, 0xbd, 0x5c, 0x5f, 0x2f, 0xbd, 0x50, 0x8c, 0x95, 0xbb, 0x00, 0x13,
+ 0x85, 0xbd, 0x26, 0xab, 0x7f, 0xbd, 0xc8, 0x91, 0x2a, 0xbc, 0x34, 0xda, 0xd2,
+ 0xbc, 0x2c, 0xb7, 0x4b, 0x3d, 0x73, 0xe4, 0x2b, 0xbd, 0x48, 0x46, 0x8f, 0xbd,
+ 0x0c, 0xa7, 0x36, 0xbd, 0x58, 0x23, 0x9f, 0x3c, 0xec, 0x5b, 0x2e, 0x3d, 0x28,
+ 0xde, 0x34, 0xbd, 0x00, 0xd5, 0x8e, 0x3b, 0x76, 0xa2, 0x76, 0x3d, 0x64, 0xe8,
+ 0x4d, 0x3d, 0x47, 0xc2, 0x82, 0xbd, 0x90, 0x0c, 0x8b, 0xbd, 0x9c, 0x98, 0x1a,
+ 0x3d, 0x74, 0xd4, 0xd1, 0xbc, 0xd6, 0x3b, 0x78, 0x3d, 0x88, 0xad, 0x04, 0xbd,
+ 0x5c, 0x4e, 0xbf, 0x3c, 0x20, 0xd8, 0x5b, 0x3c, 0x68, 0x77, 0x0e, 0xbc, 0xc0,
+ 0x8a, 0xc8, 0x3b, 0x00, 0x68, 0x5d, 0xba, 0x4c, 0x05, 0x30, 0x3d, 0x20, 0xb7,
+ 0x56, 0x3d, 0xa0, 0x6e, 0xef, 0x3c, 0xb4, 0x50, 0x1c, 0x3d, 0x5c, 0x0f, 0x68,
+ 0xbd, 0xf7, 0x3c, 0x53, 0xbd, 0x96, 0xa5, 0x0c, 0x3d, 0x3a, 0x6c, 0x07, 0x3d,
+ 0xa0, 0x60, 0x2c, 0xbd, 0x20, 0xaf, 0xbf, 0xbc, 0x00, 0x2d, 0x05, 0xbb, 0xe0,
+ 0x97, 0x4b, 0x3b, 0x32, 0xdc, 0x37, 0x3d, 0xe2, 0x39, 0x54, 0xbd, 0x2a, 0xde,
+ 0xeb, 0xbc, 0x1e, 0x8b, 0x6d, 0x3d, 0x0c, 0x92, 0xd6, 0xbc, 0xec, 0x48, 0x19,
+ 0xbc, 0x23, 0xd9, 0x90, 0xbd, 0x84, 0x8b, 0x83, 0xbd, 0xc8, 0x8c, 0x7c, 0x3c,
+ 0xfe, 0xca, 0x7d, 0xbd, 0x06, 0xb7, 0x69, 0x3d, 0x34, 0x35, 0xb0, 0x3c, 0x52,
+ 0x14, 0x56, 0xbd, 0xf4, 0xf3, 0x43, 0xbd, 0x34, 0x5e, 0xbf, 0xbc, 0x9c, 0x32,
+ 0x1e, 0x3d, 0xa0, 0x4d, 0xe0, 0x3b, 0x00, 0x68, 0x5d, 0xb8, 0x9e, 0x47, 0x7b,
+ 0x3d, 0xe1, 0xcd, 0x8b, 0x3d, 0xb8, 0x10, 0x8f, 0xbc, 0xc8, 0x30, 0x28, 0x3c,
+ 0xec, 0x42, 0x28, 0x3d, 0xfe, 0xea, 0x8a, 0xbd, 0x36, 0x76, 0x1a, 0xbd, 0xfa,
+ 0x9c, 0xca, 0xbc, 0x10, 0xe9, 0x82, 0xbd, 0x72, 0x8b, 0x7b, 0x3d, 0x46, 0x75,
+ 0x1c, 0xbd, 0x5a, 0xb9, 0x06, 0xbd, 0x6c, 0xa7, 0x25, 0xbc, 0x6a, 0x37, 0xd3,
+ 0xbc, 0xbc, 0x78, 0x85, 0x3c, 0x98, 0xb7, 0x01, 0x3d, 0x3c, 0xb7, 0x0d, 0x3d,
+ 0x3c, 0x57, 0x21, 0xbc, 0x28, 0xfb, 0xa7, 0x3c, 0x18, 0x3f, 0x49, 0x3c, 0x81,
+ 0x34, 0x8d, 0xbd, 0xb4, 0xfb, 0x6e, 0xbd, 0x60, 0x97, 0x95, 0x3c, 0xac, 0xdd,
+ 0x86, 0xbc, 0xd8, 0x6e, 0xda, 0x3c, 0xd8, 0xd9, 0x3d, 0x3d, 0x90, 0xa6, 0xea,
+ 0x3c, 0x40, 0x67, 0x3f, 0x3d, 0x3a, 0x43, 0x69, 0x3d, 0x0a, 0x20, 0x5e, 0x3d,
+ 0x33, 0x91, 0x12, 0xbd, 0xb4, 0xc5, 0x31, 0xbd, 0x0e, 0x96, 0x45, 0x3d, 0xc6,
+ 0x22, 0x37, 0xbd, 0x7c, 0x12, 0x44, 0x3d, 0xc9, 0x61, 0x8a, 0x3d, 0x1c, 0x66,
+ 0x44, 0x3d, 0xa2, 0x51, 0x30, 0x3d, 0xc8, 0xdb, 0xd9, 0x3c, 0xd3, 0xfb, 0x8e,
+ 0xbd, 0x08, 0x6a, 0x91, 0xbd, 0xea, 0x2e, 0x48, 0xbd, 0x60, 0x5b, 0x22, 0xbb,
+ 0x06, 0x39, 0x53, 0x3d, 0x84, 0xb4, 0x0b, 0xbd, 0xa0, 0x77, 0xfa, 0x3b, 0x84,
+ 0xaf, 0xaa, 0x3c, 0x47, 0xd2, 0x86, 0xbd, 0xe3, 0xef, 0x43, 0xbd, 0x36, 0x8d,
+ 0x16, 0x3d, 0x85, 0xa6, 0x85, 0x3d, 0x8e, 0xda, 0xa0, 0xbc, 0xc3, 0x58, 0x80,
+ 0xbd, 0x93, 0x30, 0x0f, 0xbd, 0x0c, 0x85, 0xcf, 0xbc, 0xc0, 0x8c, 0x2a, 0x3c,
+ 0x02, 0xe2, 0x0d, 0xbd, 0xe9, 0xf8, 0x8c, 0xbd, 0x15, 0x8d, 0x8b, 0x3d, 0xf3,
+ 0x1f, 0x8b, 0xbd, 0x0f, 0xa0, 0x80, 0xbd, 0xee, 0x04, 0x63, 0x3d, 0xb4, 0x7a,
+ 0xf6, 0xbc, 0x60, 0x5b, 0x2e, 0xbc, 0x04, 0x6d, 0x42, 0x3d, 0x8a, 0xfc, 0x1c,
+ 0x3d, 0x52, 0xb0, 0x27, 0x3d, 0xe8, 0xf9, 0x35, 0xbd, 0xd4, 0xc2, 0x1b, 0x3d,
+ 0x00, 0x3a, 0x0b, 0xbb, 0x80, 0x7e, 0x4b, 0x3c, 0x06, 0xba, 0x3e, 0xbd, 0x70,
+ 0xc9, 0x35, 0xbd, 0xe0, 0x8b, 0x9d, 0xbb, 0x16, 0x05, 0x2f, 0xbd, 0xa0, 0xeb,
+ 0x03, 0x3c, 0x40, 0x3e, 0x95, 0xbc, 0xea, 0x76, 0x73, 0xbd, 0x90, 0xb0, 0xe8,
+ 0x3c, 0x3e, 0x61, 0x42, 0xbd, 0x17, 0x02, 0x8d, 0xbd, 0x42, 0x66, 0x1d, 0x3d,
+ 0xfe, 0x31, 0x68, 0x3d, 0x52, 0x8e, 0x30, 0xbd, 0x6b, 0xca, 0x10, 0xbd, 0xbd,
+ 0xcc, 0x80, 0xbd, 0x38, 0x91, 0x53, 0xbd, 0x90, 0xd7, 0xd3, 0x3c, 0x00, 0x0c,
+ 0xf4, 0x3b, 0x82, 0xf5, 0x3f, 0xbd, 0xb2, 0xa9, 0x04, 0x3d, 0x62, 0x67, 0x5c,
+ 0x3d, 0x86, 0xab, 0x91, 0xbc, 0xc2, 0x2b, 0xe8, 0xbc, 0x3a, 0x8a, 0x67, 0xbd,
+ 0xcc, 0x83, 0xdb, 0x3c, 0xf0, 0x8a, 0x03, 0x3c, 0x94, 0x78, 0x53, 0x3d, 0x9c,
+ 0x1b, 0xd4, 0x3c, 0xdb, 0xf9, 0x89, 0x3d, 0x40, 0xa5, 0x10, 0x3b, 0x89, 0xed,
+ 0x80, 0xbd, 0x6e, 0xb8, 0x57, 0xbd, 0x12, 0xc2, 0xcf, 0xbc, 0x44, 0x32, 0xb1,
+ 0x3c, 0xd5, 0xed, 0x34, 0xbd, 0x5e, 0x6c, 0x5c, 0xbd, 0x68, 0x69, 0x85, 0x3c,
+ 0x30, 0xdb, 0xb6, 0xbb, 0x00, 0x7f, 0xe0, 0x3c, 0x80, 0x24, 0x1e, 0x3b, 0x78,
+ 0x6f, 0x81, 0xbc, 0x3a, 0x27, 0x1b, 0x3d, 0x7f, 0xb5, 0x8a, 0xbd, 0xbb, 0xc1,
+ 0x8e, 0x3d, 0xa8, 0x7e, 0x69, 0x3c, 0x00, 0x80, 0x47, 0xbb, 0x21, 0xb9, 0x15,
+ 0xbd, 0x14, 0x0b, 0x8e, 0x3c, 0xa2, 0x1b, 0x55, 0x3d, 0x28, 0xea, 0x5b, 0xbd,
+ 0x10, 0x9a, 0x43, 0x3d, 0x40, 0xf6, 0x8a, 0x3a, 0x58, 0xb1, 0x92, 0xbc, 0x5c,
+ 0x0a, 0x4e, 0xbd, 0x10, 0xec, 0x1f, 0xbd, 0xa8, 0x31, 0xa7, 0x3c, 0x60, 0xfa,
+ 0x9f, 0xbb, 0xf0, 0x04, 0xa3, 0xbb, 0xc4, 0xd8, 0x5f, 0xbd, 0xba, 0x5f, 0x66,
+ 0xbd, 0x52, 0x94, 0x97, 0xbc, 0x1a, 0x9b, 0x22, 0xbd, 0xaa, 0x28, 0x59, 0x3d,
+ 0xaa, 0x06, 0x64, 0xbd, 0xe7, 0xc2, 0x83, 0xbd, 0xd0, 0x3d, 0xd0, 0xbc, 0x00,
+ 0x8c, 0xa3, 0x39, 0xd0, 0x27, 0x0c, 0xbc, 0x40, 0x8f, 0x79, 0xbc, 0x9e, 0x32,
+ 0x7f, 0x3d, 0xac, 0x9b, 0xfd, 0xbc, 0xb1, 0x17, 0x91, 0x3d, 0xa8, 0xca, 0x4e,
+ 0x3d, 0x40, 0xc3, 0xb7, 0x3a, 0xc0, 0x8e, 0x78, 0xbb, 0x3f, 0x3c, 0x83, 0x3d,
+ 0x47, 0xdc, 0x81, 0xbd, 0x5b, 0xe6, 0x1c, 0xbd, 0x70, 0xe3, 0xc8, 0xbc, 0x70,
+ 0x12, 0xd6, 0xbb, 0x0c, 0xb6, 0xe3, 0x3c, 0x88, 0x2a, 0x22, 0x3c, 0xd6, 0xbf,
+ 0x8d, 0xbd, 0xde, 0x15, 0x20, 0x3d, 0x76, 0x83, 0x3e, 0xbd, 0x85, 0x35, 0x80,
+ 0x3d, 0xc1, 0x0b, 0x87, 0x3d, 0xbf, 0x64, 0x18, 0xbd, 0x80, 0x22, 0x68, 0x3b,
+ 0xc4, 0xb0, 0xb0, 0x3c, 0xa2, 0xf2, 0x4f, 0xbd, 0xb6, 0x63, 0x04, 0x3d, 0xc0,
+ 0x4a, 0xc9, 0x3c, 0x36, 0x66, 0xc0, 0xbc, 0x64, 0x7a, 0x4c, 0x3d, 0xc1, 0x5b,
+ 0x8c, 0x3d, 0xae, 0xa2, 0x41, 0x3d, 0x66, 0x93, 0x01, 0x3d, 0x6c, 0xb7, 0x37,
+ 0xbd, 0x8c, 0x03, 0x28, 0xbd, 0x7c, 0xf6, 0x69, 0xbd, 0xa2, 0xe7, 0x0d, 0xbd,
+ 0xb0, 0xf3, 0x41, 0x3d, 0xc0, 0xbf, 0xc4, 0x3b, 0xe2, 0x58, 0x46, 0xbd, 0x02,
+ 0xb4, 0x60, 0x3d, 0xa2, 0xf8, 0x29, 0x3d, 0x90, 0xf7, 0xc8, 0x3b, 0xee, 0xad,
+ 0x43, 0x3d, 0x1b, 0x51, 0x12, 0xbd, 0xee, 0xc3, 0x91, 0xbd, 0x20, 0xad, 0x58,
+ 0x3c, 0xc6, 0x54, 0x3a, 0x3d, 0xea, 0xba, 0x60, 0xbd, 0x7e, 0x31, 0x22, 0x3d,
+ 0x98, 0xe6, 0x80, 0xbd, 0x00, 0x41, 0x29, 0x3b, 0x85, 0xec, 0x8c, 0x3d, 0x7a,
+ 0x8e, 0x3e, 0x3d, 0x42, 0x31, 0xfc, 0xbc, 0x58, 0x3c, 0x08, 0x3c, 0xdc, 0x04,
+ 0xb5, 0xbc, 0x9e, 0xbf, 0x0f, 0xbd, 0x70, 0xad, 0x2a, 0xbc, 0x6c, 0x83, 0x8c,
+ 0xbc, 0x6a, 0xd4, 0x6c, 0xbd, 0x62, 0x1b, 0x8e, 0xbc, 0x94, 0x48, 0x1f, 0xbd,
+ 0x35, 0xe0, 0x3d, 0xbd, 0x60, 0x91, 0x88, 0x3b, 0x6c, 0x16, 0x07, 0x3d, 0x30,
+ 0xa0, 0x93, 0x3b, 0x3c, 0xec, 0x5e, 0xbc, 0x66, 0xbf, 0x51, 0xbd, 0xfc, 0x42,
+ 0x47, 0x3d, 0x78, 0x73, 0x71, 0x3c, 0x62, 0x96, 0x89, 0xbd, 0x50, 0x2b, 0xca,
+ 0x3c, 0x98, 0xc5, 0x21, 0x3c, 0xbb, 0x4b, 0x19, 0xbd, 0x36, 0x22, 0x75, 0x3d,
+ 0x44, 0x6e, 0x7d, 0xbd, 0xec, 0x88, 0x8d, 0x3c, 0xa8, 0x57, 0x0e, 0x3c, 0x96,
+ 0x97, 0x01, 0x3d, 0x1c, 0x9c, 0x59, 0x3d, 0xc4, 0x0b, 0x31, 0x3d, 0x60, 0xf0,
+ 0x6c, 0xbc, 0xb8, 0xa9, 0xb4, 0x3c, 0xd8, 0xbb, 0x33, 0xbc, 0x98, 0x35, 0x99,
+ 0x3c, 0xd2, 0x49, 0x3d, 0xbd, 0xe6, 0xc9, 0x5b, 0x3d, 0x42, 0xf7, 0x41, 0x3d,
+ 0xda, 0x13, 0x37, 0xbd, 0x96, 0x91, 0x94, 0xbc, 0xb8, 0xde, 0x89, 0x3c, 0xda,
+ 0x37, 0x08, 0xbd, 0x20, 0xda, 0x3e, 0x3c, 0xda, 0xe8, 0x61, 0xbd, 0x70, 0x8a,
+ 0x29, 0x3d, 0x18, 0xa4, 0x8f, 0xbd, 0x20, 0xee, 0x56, 0x3c, 0x70, 0xc3, 0xc8,
+ 0xbc, 0x5c, 0xf4, 0x99, 0x3c, 0x54, 0xd5, 0x4b, 0xbd, 0x88, 0xcf, 0x6a, 0x3c,
+ 0xa5, 0xc7, 0x1c, 0xbd, 0x10, 0x98, 0xb3, 0xbb, 0x9a, 0xe0, 0x86, 0xbd, 0x3e,
+ 0x34, 0x87, 0xbd, 0xfa, 0x36, 0x7d, 0x3d, 0x40, 0x64, 0xfe, 0xbc, 0xd0, 0x4f,
+ 0x67, 0xbd, 0x21, 0xda, 0x72, 0xbd, 0x2e, 0x02, 0x38, 0xbd, 0xc6, 0xd9, 0xff,
+ 0xbc, 0x1a, 0x30, 0xb9, 0xbc, 0x58, 0xea, 0x58, 0x3c, 0xb1, 0xb7, 0x03, 0xbd,
+ 0x80, 0x5b, 0xfc, 0x3a, 0x43, 0x60, 0x80, 0x3d, 0xa8, 0x67, 0x4a, 0xbd, 0x68,
+ 0xd8, 0x3e, 0x3c, 0xf0, 0xe8, 0x2a, 0x3c, 0x68, 0x26, 0x3f, 0xbd, 0x28, 0x26,
+ 0x73, 0xbd, 0x38, 0xe5, 0x24, 0x3d, 0x00, 0xb0, 0xa1, 0xba, 0x7e, 0x0f, 0x18,
+ 0xbd, 0x35, 0x0d, 0x7c, 0xbd, 0x14, 0xa7, 0x3f, 0x3d, 0x16, 0x49, 0x0e, 0x3d,
+ 0x2e, 0xd8, 0x90, 0xbd, 0x50, 0xc3, 0x21, 0xbd, 0xd4, 0x13, 0x44, 0x3d, 0x70,
+ 0x10, 0xfd, 0x3b, 0x7b, 0x43, 0x87, 0x3d, 0x64, 0xb7, 0xf9, 0x3c, 0xd6, 0xc6,
+ 0xb7, 0xbc, 0x00, 0xd8, 0xbb, 0x3b, 0xe0, 0x1b, 0x42, 0xbb, 0x68, 0x5c, 0xcf,
+ 0xbc, 0xea, 0xfb, 0x8e, 0xbd, 0xdc, 0x09, 0x33, 0x3d, 0x80, 0xef, 0xb9, 0x3c,
+ 0x00, 0xde, 0x92, 0xb9, 0x31, 0x42, 0x08, 0xbd, 0x80, 0x6d, 0x40, 0x3b, 0x80,
+ 0xab, 0x20, 0x3d, 0xc0, 0x60, 0xc3, 0xba, 0x0b, 0xb6, 0x5e, 0xbd, 0xd4, 0x28,
+ 0x3e, 0xbd, 0x47, 0x7b, 0x87, 0x3d, 0x81, 0x52, 0x84, 0x3d, 0x90, 0x8e, 0xc2,
+ 0x3c, 0x04, 0x5b, 0xf3, 0xbc, 0x70, 0xa9, 0xea, 0x3c, 0x55, 0x55, 0x4d, 0xbd,
+ 0x52, 0x8b, 0x59, 0xbd, 0xf2, 0xeb, 0x56, 0x3d, 0x1e, 0xc7, 0x3f, 0x3d, 0xe0,
+ 0x52, 0xa3, 0x3b, 0x16, 0x93, 0x9d, 0xbc, 0x28, 0xeb, 0x36, 0x3d, 0x70, 0x4c,
+ 0x1d, 0x3d, 0x8d, 0x81, 0x14, 0xbd, 0xb0, 0x22, 0xa0, 0xbb, 0x50, 0xfa, 0x87,
+ 0x3c, 0x33, 0xc6, 0x2d, 0xbd, 0xd3, 0xd8, 0x85, 0x3d, 0xe8, 0xfd, 0x15, 0x3c,
+ 0x20, 0x79, 0xe4, 0x3b, 0xb0, 0xd4, 0x4f, 0xbd, 0x24, 0xe9, 0xb5, 0x3c, 0xba,
+ 0x47, 0x27, 0x3d, 0x23, 0xef, 0x02, 0xbd, 0xf0, 0xac, 0x31, 0x3d, 0x62, 0xde,
+ 0xdd, 0xbc, 0x2c, 0xa0, 0x29, 0x3d, 0xa5, 0xec, 0x85, 0x3d, 0xa9, 0x1b, 0x8d,
+ 0x3d, 0x2c, 0x6c, 0xa2, 0xbc, 0xf0, 0xc7, 0x37, 0xbc, 0x6c, 0xf7, 0xc5, 0xbc,
+ 0xf4, 0x1d, 0x1c, 0xbc, 0x20, 0x3c, 0xc9, 0x3b, 0x9d, 0xff, 0x0b, 0xbd, 0x10,
+ 0xa3, 0x53, 0x3d, 0x64, 0xbb, 0xc9, 0xbc, 0xfc, 0x8d, 0xe8, 0xbc, 0x20, 0x1f,
+ 0x5a, 0x3c, 0x11, 0xe2, 0x17, 0xbd, 0xe0, 0x37, 0x97, 0x3b, 0x88, 0x44, 0x2a,
+ 0xbd, 0x88, 0x79, 0x4c, 0xbd, 0xa8, 0x9e, 0x0d, 0x3c, 0x15, 0x54, 0x8c, 0x3d,
+ 0xcb, 0x9b, 0x87, 0x3d, 0x18, 0xdd, 0x07, 0xbd, 0x2b, 0x33, 0x81, 0xbd, 0xb2,
+ 0x57, 0x2e, 0xbd, 0x18, 0xc5, 0x2b, 0xbd, 0x88, 0x10, 0x91, 0xbd, 0x66, 0x69,
+ 0x15, 0x3d, 0x98, 0x6c, 0xf7, 0x3c, 0x10, 0x05, 0x07, 0xbc, 0x44, 0x3b, 0xc6,
+ 0xbc, 0x30, 0x43, 0xa8, 0x3b, 0x5b, 0xd8, 0x38, 0xbd, 0x66, 0x01, 0xe8, 0xbc,
+ 0x36, 0xef, 0xaf, 0xbc, 0x88, 0x76, 0x24, 0x3c, 0x3a, 0x71, 0x5d, 0x3d, 0x30,
+ 0xa0, 0x38, 0xbc, 0x04, 0x86, 0xf5, 0xbc, 0x30, 0xdc, 0x7c, 0x3c, 0x0c, 0x37,
+ 0x2f, 0xbd, 0x80, 0xa4, 0x1f, 0xba, 0x2c, 0xa1, 0x2f, 0xbd, 0xb0, 0xb7, 0xa0,
+ 0x3c, 0x37, 0xb1, 0x14, 0xbd, 0xb6, 0x07, 0x54, 0xbd, 0xb0, 0xbf, 0xd7, 0xbc,
+ 0x6c, 0xc8, 0x2c, 0x3d, 0x2c, 0x09, 0x31, 0x3d, 0x04, 0x69, 0xe4, 0xbc, 0xa0,
+ 0x5e, 0x7a, 0xbb, 0x90, 0x52, 0xb3, 0x3c, 0x4e, 0x6b, 0x84, 0xbd, 0xcc, 0x7e,
+ 0x25, 0x3d, 0x30, 0x08, 0x99, 0xbb, 0x00, 0x08, 0xfc, 0x3b, 0xaa, 0xf0, 0x66,
+ 0x3d, 0x13, 0xa5, 0x8a, 0x3d, 0xc8, 0x1c, 0xad, 0xbc, 0xf1, 0x48, 0x82, 0x3d,
+ 0x7d, 0x18, 0x80, 0xbd, 0x14, 0x52, 0xa6, 0x3c, 0x10, 0x21, 0x9c, 0xbb, 0xfc,
+ 0xda, 0x31, 0xbc, 0x0e, 0x65, 0xd2, 0xbc, 0x74, 0x2a, 0xcd, 0xbc, 0xb6, 0xb6,
+ 0x64, 0x3d, 0x24, 0x32, 0x55, 0x3d, 0x8e, 0xc7, 0xbc, 0xbc, 0x94, 0x15, 0x89,
+ 0x3c, 0x72, 0x1e, 0x3b, 0x3d, 0xb0, 0x0e, 0x25, 0x3c, 0xf8, 0x00, 0xad, 0x3c,
+ 0xc1, 0xb3, 0x92, 0xbd, 0xce, 0xcf, 0x33, 0x3d, 0xe8, 0xec, 0x6a, 0x3c, 0x9e,
+ 0x76, 0x9c, 0xbc, 0x4e, 0x5f, 0x29, 0xbd, 0x7c, 0xa7, 0x88, 0x3c, 0x00, 0xf3,
+ 0xbf, 0x3c, 0x10, 0x12, 0x26, 0x3c, 0xf4, 0x7c, 0x4b, 0x3d, 0x90, 0x83, 0xec,
+ 0xbb, 0xb6, 0x48, 0x92, 0xbd, 0x5c, 0x63, 0x47, 0x3d, 0x3f, 0xb2, 0x71, 0xbd,
+ 0x60, 0x1f, 0x7e, 0xbc, 0xbc, 0xff, 0x9a, 0xbc, 0x96, 0x17, 0xb2, 0xbc, 0x78,
+ 0x09, 0x0a, 0x3c, 0xa5, 0xbb, 0x8d, 0x3d, 0x80, 0x7e, 0xbd, 0x3a, 0x8c, 0x61,
+ 0x8f, 0xbd, 0x70, 0x44, 0x19, 0x3d, 0xde, 0x63, 0x4b, 0x3d, 0x00, 0x61, 0x0b,
+ 0xbb, 0x36, 0x70, 0x32, 0xbd, 0xc6, 0x8f, 0x71, 0x3d, 0xf0, 0xf7, 0xa0, 0xbc,
+ 0x00, 0x80, 0x01, 0xb8, 0xe4, 0xc6, 0x93, 0x3c, 0x08, 0xd4, 0x3b, 0x3c, 0x96,
+ 0x32, 0x40, 0x3d, 0xb8, 0x22, 0x31, 0x3d, 0x4a, 0xd9, 0x6f, 0x3d, 0x28, 0x10,
+ 0x2c, 0xbc, 0x94, 0x4b, 0x9c, 0xbc, 0x90, 0x38, 0x57, 0x3d, 0xa4, 0x0d, 0x81,
+ 0xbc, 0x90, 0xa5, 0xb6, 0x3c, 0x9d, 0xfe, 0x78, 0xbd, 0x3c, 0x24, 0x19, 0x3d,
+ 0xa8, 0x56, 0x0c, 0x3d, 0x6b, 0xec, 0x54, 0xbd, 0x10, 0x49, 0x94, 0xbb, 0x80,
+ 0x25, 0xe9, 0x3c, 0xe4, 0xb5, 0xe2, 0xbc, 0x68, 0xb2, 0x10, 0x3d, 0x6a, 0x13,
+ 0xe0, 0xbc, 0x3a, 0x69, 0x44, 0xbd, 0x18, 0x3f, 0xfc, 0x3c, 0x6e, 0x08, 0x60,
+ 0x3d, 0x5e, 0x5b, 0xa2, 0xbc, 0x7c, 0xbd, 0x81, 0xbd, 0xf0, 0xf9, 0xd6, 0x3b,
+ 0xfa, 0x80, 0x14, 0xbd, 0xdb, 0xb0, 0x8d, 0xbd, 0xb0, 0x41, 0xe5, 0x3b, 0xe0,
+ 0x03, 0xe3, 0x3c, 0xf4, 0x88, 0x07, 0xbd, 0x52, 0x89, 0xd0, 0xbc, 0x90, 0x90,
+ 0x10, 0x3d, 0x9c, 0xc3, 0x3e, 0x3d, 0x2f, 0x07, 0x09, 0xbd, 0x7e, 0x67, 0xf6,
+ 0xbc, 0xde, 0x88, 0xe1, 0xbc, 0xbe, 0x4b, 0x08, 0xbd, 0xac, 0xc1, 0x24, 0x3d,
+ 0x5e, 0xd5, 0x3c, 0x3d, 0x80, 0x9e, 0x01, 0xbc, 0xa6, 0xdb, 0xc7, 0xbc, 0xbb,
+ 0x37, 0x83, 0xbd, 0x34, 0x71, 0x50, 0x3d, 0x10, 0x46, 0x2d, 0xbd, 0x71, 0x50,
+ 0x67, 0xbd, 0x20, 0x2e, 0x15, 0xbb, 0xaa, 0x05, 0x74, 0x3d, 0xc1, 0xb5, 0x79,
+ 0xbd, 0x21, 0xaa, 0x44, 0xbd, 0xda, 0xbd, 0x0c, 0xbd, 0xb1, 0xee, 0x8c, 0x3d,
+ 0x54, 0x83, 0x83, 0xbd, 0x5e, 0xe5, 0x75, 0x3d, 0x52, 0x3d, 0x73, 0x3d, 0x40,
+ 0xf3, 0xd4, 0x3c, 0x9a, 0x1a, 0x78, 0x3d, 0x85, 0x49, 0x62, 0xbd, 0x6b, 0x57,
+ 0x91, 0x3d, 0x30, 0xd7, 0x3f, 0x3d, 0xed, 0x16, 0x3f, 0xbd, 0xd0, 0xf4, 0x85,
+ 0xbb, 0x47, 0x5e, 0x1e, 0xbd, 0x70, 0xe9, 0x87, 0x3c, 0x87, 0x5d, 0x80, 0xbd,
+ 0xa0, 0x7a, 0xb6, 0xbb, 0x03, 0x86, 0x84, 0xbd, 0x50, 0x4c, 0x74, 0x3c, 0x85,
+ 0x86, 0x80, 0x3d, 0x00, 0xe2, 0x56, 0xbb, 0x7e, 0xb0, 0x16, 0xbd, 0x10, 0xa9,
+ 0x80, 0xbd, 0xe0, 0x8b, 0x47, 0x3d, 0x19, 0x07, 0x68, 0xbd, 0x4e, 0xd8, 0x70,
+ 0x3d, 0xa8, 0x10, 0x2a, 0x3d, 0x22, 0x23, 0x96, 0xbc, 0x92, 0xe3, 0x72, 0xbd,
+ 0xb8, 0x0f, 0x13, 0x3d, 0x16, 0xc3, 0x53, 0x3d, 0xa4, 0x95, 0x41, 0x3d, 0x02,
+ 0xc3, 0x6f, 0x3d, 0x48, 0x02, 0xac, 0xbc, 0x40, 0x53, 0x6d, 0x3b, 0xf4, 0x2a,
+ 0x19, 0xbc, 0x10, 0x1f, 0xc2, 0xbb, 0x21, 0xb8, 0x69, 0xbd, 0x97, 0x8c, 0x8a,
+ 0x3d, 0x38, 0x13, 0xb4, 0x3c, 0xf1, 0x0d, 0x8d, 0x3d, 0x00, 0x69, 0x30, 0x3d,
+ 0x38, 0x92, 0xf9, 0x3c, 0xb5, 0xff, 0x8a, 0x3d, 0x15, 0x27, 0x91, 0x3d, 0x96,
+ 0xd4, 0x00, 0x3d, 0x66, 0xde, 0x1c, 0x3d, 0x7c, 0x48, 0x40, 0x3d, 0x08, 0x06,
+ 0xf2, 0x3c, 0x8e, 0xfe, 0x71, 0x3d, 0x90, 0xa1, 0xc6, 0xbb, 0x88, 0x57, 0x05,
+ 0x3c, 0x80, 0x92, 0x6d, 0x3a, 0x80, 0x99, 0xc9, 0xba, 0x0f, 0x0f, 0x33, 0xbd,
+ 0x76, 0xfc, 0x31, 0x3d, 0xd8, 0x9f, 0x23, 0xbd, 0x8c, 0x07, 0x07, 0xbd, 0x68,
+ 0x38, 0x5e, 0x3c, 0xf0, 0x39, 0xbf, 0xbc, 0x6c, 0x16, 0xfc, 0x3c, 0x94, 0xf2,
+ 0xb4, 0xbc, 0x20, 0x52, 0xc4, 0xbb, 0xb7, 0x3f, 0x02, 0xbd, 0x78, 0x48, 0x61,
+ 0xbd, 0x48, 0xad, 0x6b, 0xbd, 0xcd, 0xb1, 0x8c, 0x3d, 0x20, 0x28, 0xcd, 0x3c,
+ 0xb4, 0x49, 0x53, 0x3d, 0x30, 0x59, 0x06, 0x3c, 0xda, 0xea, 0x83, 0xbd, 0xf8,
+ 0xe2, 0x16, 0xbd, 0x96, 0xc3, 0x77, 0x3d, 0x2c, 0x90, 0xf6, 0x3c, 0x94, 0x78,
+ 0x4d, 0xbc, 0x75, 0x0d, 0x2f, 0xbd, 0xa2, 0x00, 0xa7, 0xbc, 0x32, 0xec, 0x7c,
+ 0x3d, 0x6c, 0x7a, 0x5a, 0xbc, 0x7e, 0x59, 0x58, 0x3d, 0x60, 0x65, 0x91, 0x3b,
+ 0x28, 0x8b, 0x75, 0xbd, 0x22, 0xa7, 0x7b, 0x3d, 0xc4, 0xdd, 0x39, 0x3d, 0xe4,
+ 0x54, 0xa3, 0xbc, 0xb6, 0x39, 0x30, 0x3d, 0x38, 0x91, 0x35, 0x3c, 0xd0, 0xb9,
+ 0x10, 0x3c, 0x4c, 0x8a, 0xab, 0x3c, 0x04, 0x8d, 0x0e, 0xbd, 0x20, 0xc2, 0xcb,
+ 0x3b, 0x32, 0xbe, 0x58, 0xbd, 0xec, 0x4e, 0x03, 0x3d, 0xf0, 0x59, 0xee, 0x3c,
+ 0x18, 0x48, 0x0d, 0xbc, 0xa0, 0xfd, 0xe6, 0xbb, 0x8c, 0x9c, 0x4b, 0x3d, 0xa8,
+ 0xe8, 0x13, 0x3c, 0x14, 0xb9, 0x4e, 0xbd, 0xe6, 0xbf, 0x03, 0x3d, 0xf0, 0x7a,
+ 0xdd, 0xbc, 0xc8, 0x1b, 0x91, 0xbc, 0x9b, 0x2a, 0x24, 0xbd, 0x98, 0x93, 0x01,
+ 0xbc, 0x1a, 0x0c, 0x34, 0x3d, 0xfe, 0xfa, 0xa3, 0xbc, 0x7c, 0x82, 0xbd, 0x3c,
+ 0x70, 0x96, 0xe8, 0x3c, 0xa6, 0x08, 0x67, 0x3d, 0x48, 0x11, 0x68, 0xbc, 0x90,
+ 0xfb, 0x58, 0xbd, 0x91, 0x9e, 0x8b, 0xbd, 0x4b, 0xd8, 0x87, 0xbd, 0x6a, 0x90,
+ 0x63, 0x3d, 0x36, 0xa5, 0x20, 0x3d, 0x30, 0x61, 0x3d, 0x3d, 0x56, 0x99, 0x11,
+ 0xbd, 0xce, 0xff, 0x70, 0x3d, 0xd5, 0x52, 0x3d, 0xbd, 0x44, 0x1e, 0x92, 0x3c,
+ 0x6e, 0xb4, 0x44, 0xbd, 0x42, 0xeb, 0xec, 0xbc, 0xa2, 0xea, 0x85, 0xbc, 0x40,
+ 0x48, 0x01, 0x3b, 0x52, 0xcd, 0x75, 0x3d, 0xe9, 0xa7, 0x08, 0xbd, 0x61, 0x2e,
+ 0x0c, 0xbd, 0x06, 0xda, 0x24, 0x3d, 0xce, 0xfc, 0xf7, 0xbc, 0x62, 0xab, 0x7d,
+ 0x3d, 0x2f, 0x02, 0x89, 0xbd, 0xea, 0x05, 0x48, 0xbd, 0xea, 0x7c, 0x7b, 0xbd,
+ 0x80, 0x05, 0x8c, 0xba, 0xba, 0x77, 0x3d, 0xbd, 0xfa, 0xee, 0x34, 0xbd, 0xd2,
+ 0x24, 0x28, 0x3d, 0x30, 0xb2, 0x40, 0xbd, 0x52, 0x8b, 0x18, 0x3d, 0xe3, 0xfc,
+ 0x8b, 0x3d, 0x58, 0x86, 0x65, 0xbc, 0x64, 0x1e, 0xa8, 0xbc, 0xba, 0xc7, 0x75,
+ 0x3d, 0xdb, 0xb4, 0x80, 0x3d, 0x07, 0x16, 0x67, 0xbd, 0x84, 0x95, 0x6d, 0xbc,
+ 0x11, 0xb3, 0x1e, 0xbd, 0x40, 0x9b, 0x56, 0xbb, 0x7e, 0x66, 0x57, 0x3d, 0xca,
+ 0x1c, 0x5e, 0x3d, 0x20, 0xef, 0xe5, 0x3b, 0xd3, 0x0f, 0x2e, 0xbd, 0x8a, 0xdf,
+ 0x81, 0xbd, 0x58, 0xc9, 0x0f, 0x3d, 0xbc, 0x54, 0x63, 0xbd, 0x60, 0x24, 0x85,
+ 0xbd, 0x5a, 0xa5, 0xda, 0xbc, 0x12, 0x87, 0x01, 0x3d, 0xf6, 0xc0, 0x96, 0xbc,
+ 0x78, 0x46, 0x1d, 0x3d, 0xb6, 0x90, 0x62, 0xbd, 0xc0, 0x43, 0x94, 0x3b, 0xf0,
+ 0xed, 0xce, 0xbb, 0xb8, 0x25, 0x14, 0xbc, 0xf4, 0x5c, 0x20, 0xbc, 0xd8, 0x5b,
+ 0x1c, 0x3d, 0x44, 0xcb, 0x4c, 0xbc, 0x2e, 0xf6, 0x36, 0x3d, 0x94, 0xa7, 0xe6,
+ 0xbc, 0xd8, 0xac, 0x4f, 0x3c, 0x06, 0x78, 0x11, 0x3d, 0xe6, 0x53, 0x14, 0x3d,
+ 0x3b, 0x4b, 0x25, 0xbd, 0x03, 0xb6, 0x88, 0xbd, 0xd0, 0xc2, 0x2b, 0x3c, 0xc5,
+ 0xf9, 0x12, 0xbd, 0x78, 0x6f, 0xf5, 0x3c, 0xc6, 0xc0, 0x63, 0x3d, 0x60, 0xd4,
+ 0xa9, 0x3c, 0x1b, 0x87, 0x92, 0x3d, 0x70, 0x70, 0x35, 0xbd, 0xb8, 0xaa, 0x17,
+ 0x3d, 0xec, 0x13, 0xde, 0xbc, 0x04, 0xc8, 0x8c, 0x3c, 0x3c, 0xcd, 0xf4, 0x3c,
+ 0x66, 0x81, 0x4b, 0x3d, 0x3e, 0x59, 0x8b, 0xbd, 0xb8, 0xab, 0x04, 0x3c, 0xdc,
+ 0x9a, 0xd8, 0x3c, 0x00, 0x22, 0x4d, 0x3d, 0x08, 0x10, 0x93, 0x3c, 0x64, 0x64,
+ 0x7e, 0xbc, 0x32, 0xd1, 0x00, 0x3d, 0xfc, 0x6a, 0x2a, 0xbd, 0x04, 0x05, 0xa8,
+ 0x3c, 0x4c, 0xb2, 0xc3, 0x3c, 0x57, 0x68, 0x0d, 0xbd, 0x18, 0x0f, 0x6e, 0xbd,
+ 0x31, 0x3c, 0x0d, 0xbd, 0xa0, 0xef, 0xe0, 0xbb, 0x5a, 0xa3, 0xf2, 0xbc, 0xb3,
+ 0xcd, 0x88, 0x3d, 0x0c, 0x86, 0x6e, 0xbc, 0x78, 0x6a, 0x14, 0xbc, 0x51, 0x9b,
+ 0x2e, 0xbd, 0x45, 0x0b, 0x22, 0xbd, 0xf0, 0x38, 0x9e, 0x3c, 0x53, 0x6c, 0x87,
+ 0x3d, 0x00, 0x20, 0x2d, 0x3a, 0x40, 0xea, 0xd2, 0xba, 0xcd, 0x35, 0x88, 0xbd,
+ 0xb2, 0xad, 0x62, 0x3d, 0xf6, 0x83, 0xb9, 0xbc, 0x92, 0xb4, 0x4b, 0x3d, 0xe6,
+ 0x0e, 0x86, 0xbc, 0x55, 0x4e, 0x85, 0x3d, 0x7e, 0x89, 0x05, 0x3d, 0xa1, 0xb1,
+ 0x83, 0x3d, 0x7c, 0x7c, 0xf5, 0x3c, 0xdb, 0x2e, 0x8c, 0xbd, 0x98, 0x94, 0x5c,
+ 0xbd, 0x0c, 0xfd, 0xb9, 0xbc, 0x40, 0x7e, 0xa5, 0x3c, 0xc0, 0x1e, 0xd6, 0x3a,
+ 0x88, 0x80, 0x1d, 0x3c, 0x48, 0x6f, 0xfe, 0x3c, 0x2a, 0x7a, 0xde, 0xbc, 0x9c,
+ 0x7d, 0x1a, 0xbd, 0x70, 0xd8, 0x1b, 0x3c, 0xa8, 0x27, 0x75, 0xbd, 0x92, 0x9a,
+ 0x53, 0x3d, 0xb3, 0x0a, 0x8b, 0x3d, 0xd0, 0xe2, 0x10, 0x3c, 0xb0, 0x82, 0x9d,
+ 0x3b, 0x38, 0x23, 0x10, 0x3c, 0xc0, 0xfb, 0xab, 0xbb, 0x7a, 0xff, 0x77, 0xbd,
+ 0x3f, 0x50, 0x91, 0x3d, 0x30, 0x33, 0x01, 0x3c, 0x48, 0x28, 0x43, 0x3d, 0xd4,
+ 0x59, 0xac, 0xbc, 0xa3, 0xa9, 0x0d, 0xbd, 0x1c, 0x90, 0x52, 0xbd, 0x40, 0xa7,
+ 0x57, 0x3c, 0x94, 0x79, 0x28, 0xbd, 0xf0, 0x27, 0x9b, 0x3c, 0x02, 0x37, 0x7d,
+ 0x3d, 0x14, 0x5b, 0x94, 0xbc, 0xde, 0x3f, 0x2c, 0xbd, 0x06, 0xe5, 0x2b, 0xbd,
+ 0x58, 0x3a, 0x01, 0xbd, 0xda, 0x88, 0xa5, 0xbc, 0x27, 0x42, 0x08, 0xbd, 0x30,
+ 0x39, 0xd1, 0x3b, 0xdc, 0xf2, 0xb6, 0xbc, 0x78, 0xe4, 0xe9, 0x3c, 0x56, 0xdd,
+ 0x8c, 0xbc, 0x20, 0xbf, 0x17, 0x3d, 0x8a, 0x7a, 0x5e, 0xbd, 0x6a, 0x3e, 0xac,
+ 0xbc, 0xb2, 0x0d, 0x7b, 0x3d, 0x02, 0x11, 0xae, 0xbc, 0x8c, 0x5a, 0x14, 0x3d,
+ 0xba, 0x7e, 0xa6, 0xbc, 0xdc, 0x76, 0x0c, 0x3d, 0xfc, 0x09, 0x5a, 0x3d, 0x4e,
+ 0x8d, 0x8b, 0xbd, 0xd4, 0x0c, 0xa3, 0xbc, 0x7f, 0x0e, 0x8f, 0xbd, 0x20, 0x38,
+ 0x62, 0xbb, 0xe0, 0x57, 0xf8, 0xbb, 0x00, 0x7b, 0x12, 0xba, 0x5c, 0x6f, 0xbe,
+ 0x3c, 0x40, 0xc3, 0x2a, 0x3b, 0xf4, 0xe3, 0xb4, 0x3c, 0xda, 0x17, 0x4d, 0x3d,
+ 0xd0, 0xca, 0x1e, 0x3d, 0x80, 0x09, 0xaa, 0x3c, 0xce, 0x89, 0x5d, 0x3d, 0x24,
+ 0x5d, 0x0f, 0x3d, 0xa0, 0x6d, 0x44, 0x3c, 0x0e, 0x09, 0x92, 0xbc, 0x00, 0xde,
+ 0x57, 0x3c, 0x91, 0x01, 0x73, 0xbd, 0x5e, 0x90, 0x1a, 0x3d, 0x4c, 0xf8, 0xd6,
+ 0x3c, 0xf8, 0x9a, 0x91, 0xbd, 0xe2, 0x1c, 0x5d, 0xbd, 0x80, 0xde, 0x76, 0x3b,
+ 0xd6, 0x26, 0x2c, 0x3d, 0x00, 0xd0, 0x39, 0xbc, 0xfc, 0x5d, 0xee, 0xbc, 0x7a,
+ 0xdc, 0x83, 0xbc, 0x3b, 0x14, 0x81, 0x3d, 0x30, 0x85, 0xf3, 0x3c, 0x0e, 0x0d,
+ 0x85, 0xbd, 0x86, 0x9f, 0xcf, 0xbc, 0x32, 0xf9, 0xfa, 0xbc, 0xdc, 0x92, 0x8e,
+ 0xbd, 0xf0, 0xf2, 0x45, 0x3c, 0xb2, 0xcd, 0x31, 0xbd, 0x40, 0x13, 0xcc, 0xba,
+ 0x81, 0x90, 0x0b, 0xbd, 0xf5, 0xd9, 0x7d, 0xbd, 0x74, 0xf2, 0xc1, 0xbc, 0x8e,
+ 0xb9, 0x2b, 0x3d, 0xb0, 0xef, 0x7e, 0xbd, 0x00, 0x57, 0x81, 0x3c, 0xc2, 0x40,
+ 0x76, 0xbd, 0xaf, 0xe7, 0x08, 0xbd, 0x02, 0x79, 0x26, 0x3d, 0x77, 0x1f, 0x2f,
+ 0xbd, 0x20, 0x66, 0x1c, 0x3c, 0x28, 0x56, 0xc2, 0x3c, 0xe8, 0x78, 0x0e, 0x3c,
+ 0xb8, 0x4e, 0x2c, 0xbc, 0xd0, 0x97, 0x26, 0xbc, 0x5e, 0x8f, 0x3b, 0x3d, 0x30,
+ 0xff, 0x28, 0x3c, 0x91, 0x25, 0x92, 0x3d, 0x20, 0xd1, 0x20, 0xbc, 0x24, 0xb8,
+ 0x23, 0xbd, 0xfc, 0xca, 0x55, 0xbc, 0xf8, 0x46, 0xf0, 0x3c, 0xf7, 0x15, 0x88,
+ 0x3d, 0x96, 0x4a, 0x78, 0x3d, 0x40, 0xdb, 0xce, 0xba, 0x50, 0x38, 0xed, 0x3b,
+ 0x3a, 0xfd, 0x00, 0x3d, 0x40, 0x1d, 0x3d, 0xbb, 0x8a, 0xd6, 0xae, 0xbc, 0x10,
+ 0x55, 0x7a, 0xbd, 0x91, 0x66, 0x59, 0x3d, 0x40, 0x74, 0xd5, 0xbc, 0x76, 0x92,
+ 0xb9, 0xbc, 0xa0, 0x5c, 0x4d, 0x3d, 0x59, 0xd0, 0x4a, 0x3d, 0x65, 0xa7, 0x5e,
+ 0xbd, 0x45, 0x6b, 0xea, 0x3d, 0x2b, 0x08, 0xdf, 0x3c, 0xb3, 0x37, 0x6e, 0x3d,
+ 0xfa, 0xad, 0xe0, 0xbc, 0xc3, 0xd2, 0x01, 0xbe, 0x24, 0x15, 0x90, 0x3d, 0x42,
+ 0xd3, 0xc4, 0x3c, 0x2b, 0xd6, 0x00, 0x3c, 0x9b, 0xf7, 0xcc, 0x3d, 0x7c, 0xc1,
+ 0x37, 0x3d, 0x4c, 0x98, 0xb6, 0x3d, 0x65, 0xac, 0x04, 0x3d, 0xbe, 0x0d, 0xf6,
+ 0x3c, 0x0a, 0x47, 0xb9, 0xbd, 0xa0, 0x2d, 0x4f, 0x3b, 0x44, 0x5d, 0xd1, 0xbc,
+ 0x3c, 0x8b, 0x82, 0x3d, 0xf8, 0xf9, 0x02, 0xbd, 0x21, 0xa7, 0x39, 0xbd, 0xa2,
+ 0x22, 0x82, 0x3d, 0xda, 0x8a, 0xb9, 0xbd, 0x6c, 0x42, 0x95, 0xbc, 0x98, 0x7b,
+ 0x9a, 0x3d, 0x1d, 0x34, 0x40, 0xbd, 0x68, 0xfa, 0x6f, 0x3c, 0xd6, 0x23, 0xa0,
+ 0x3d, 0x5a, 0xe0, 0x71, 0x3d, 0xda, 0xb5, 0x20, 0xbd, 0x0d, 0x43, 0xe0, 0x3c,
+ 0x77, 0xeb, 0x0c, 0x3d, 0x97, 0x10, 0xf9, 0x3c, 0xdb, 0xd9, 0xe6, 0x3a, 0xcb,
+ 0xff, 0x63, 0xbd, 0x75, 0x4f, 0xbf, 0xb9, 0x69, 0x4a, 0x20, 0xbd, 0xa2, 0xbf,
+ 0x56, 0x3d, 0xcc, 0xfe, 0x0e, 0xbe, 0xbe, 0xe9, 0x2e, 0x3d, 0x32, 0x25, 0x5d,
+ 0xbd, 0x77, 0x8a, 0x43, 0xbd, 0xc8, 0x8d, 0x4d, 0x3d, 0xd7, 0x87, 0xe4, 0x3c,
+ 0xc4, 0xf1, 0x50, 0x3d, 0x1a, 0xb6, 0x1a, 0x3d, 0x70, 0x13, 0x0f, 0x3c, 0xeb,
+ 0x1e, 0x6f, 0xbc, 0x4a, 0x22, 0x12, 0x3d, 0x7b, 0xe9, 0xcd, 0x3c, 0x1a, 0x2d,
+ 0x93, 0xbd, 0x21, 0xcd, 0x4b, 0xbd, 0x52, 0x94, 0x21, 0x3d, 0x1c, 0xb7, 0x0e,
+ 0xbd, 0x15, 0xea, 0x0c, 0xbd, 0x55, 0x60, 0xb0, 0x3b, 0xb4, 0x1d, 0xd0, 0x3d,
+ 0x43, 0xa2, 0x7b, 0xbd, 0xc9, 0x7b, 0x12, 0xbd, 0x64, 0x4f, 0x87, 0xbd, 0xea,
+ 0x0f, 0x8c, 0x3d, 0x07, 0x3a, 0xbb, 0xbd, 0xa8, 0xb6, 0x62, 0xbd, 0x74, 0xe8,
+ 0x84, 0x3d, 0xc2, 0x72, 0x6a, 0x3d, 0x58, 0xba, 0x67, 0xbb, 0x31, 0xf4, 0xb2,
+ 0x3d, 0x04, 0x0e, 0x92, 0xbd, 0xd4, 0x9f, 0x7a, 0x3d, 0x81, 0xd4, 0x89, 0xbc,
+ 0xe5, 0xe2, 0xe7, 0xbd, 0xb2, 0xd7, 0x51, 0xbd, 0x64, 0x57, 0x52, 0xbd, 0xb4,
+ 0x3f, 0x73, 0xbc, 0x22, 0x15, 0x4e, 0x3d, 0xe9, 0xf0, 0x4c, 0x3d, 0x05, 0x9b,
+ 0xfa, 0xbc, 0x28, 0xc4, 0xa1, 0x3d, 0xd2, 0x16, 0x51, 0x3d, 0xa0, 0x9f, 0x8f,
+ 0xbb, 0xc9, 0x02, 0x82, 0x3d, 0x13, 0x45, 0x84, 0x3c, 0x0a, 0x79, 0xc9, 0x3c,
+ 0xb9, 0x89, 0x19, 0xbd, 0x57, 0x1f, 0x86, 0xbb, 0xaa, 0xfa, 0xa0, 0x3d, 0x27,
+ 0x94, 0x00, 0xbd, 0x95, 0xf0, 0x86, 0xbd, 0x70, 0x37, 0x81, 0xbc, 0x0a, 0x32,
+ 0x09, 0x3d, 0x18, 0x6d, 0x18, 0xbd, 0x16, 0x40, 0x7e, 0x3d, 0x69, 0xfb, 0xaa,
+ 0xbc, 0x31, 0x93, 0x17, 0xbd, 0x3e, 0xc6, 0x59, 0xbc, 0x17, 0xc8, 0xe7, 0x3c,
+ 0x9e, 0x08, 0xc3, 0x3c, 0x79, 0x41, 0x12, 0x3d, 0xc8, 0xc2, 0x37, 0xbc, 0x3f,
+ 0xc1, 0x8f, 0xbd, 0xd9, 0x75, 0x94, 0xbd, 0x8c, 0xc3, 0x97, 0x3d, 0x36, 0xad,
+ 0x1b, 0xbe, 0x28, 0x9f, 0x80, 0xbc, 0x79, 0x5c, 0x84, 0xbc, 0x20, 0x29, 0x6b,
+ 0x3d, 0xe1, 0xad, 0xd1, 0xbb, 0xa4, 0x2c, 0x08, 0x3d, 0x6e, 0x13, 0x52, 0xbd,
+ 0x4c, 0x51, 0x60, 0x3d, 0xc0, 0xae, 0x92, 0x3d, 0xd3, 0x90, 0x35, 0xbd, 0x04,
+ 0x9e, 0x5f, 0xbd, 0x8c, 0xad, 0xee, 0xbc, 0x6f, 0x0b, 0x3e, 0x3d, 0xfb, 0x15,
+ 0x1c, 0x3c, 0x2f, 0x67, 0x98, 0xbb, 0x90, 0x7f, 0x9f, 0x3d, 0x21, 0x97, 0x2a,
+ 0xbc, 0xa0, 0x67, 0x9d, 0xbd, 0x5d, 0x64, 0x18, 0x3d, 0xaf, 0x36, 0xd9, 0x3b,
+ 0xe0, 0x06, 0xdc, 0x3c, 0xd0, 0x51, 0x8e, 0x3c, 0x48, 0x40, 0x56, 0x3d, 0xac,
+ 0x63, 0xb2, 0xbc, 0x63, 0x31, 0xf6, 0xbc, 0x48, 0x65, 0x07, 0x3d, 0x9c, 0x92,
+ 0x8d, 0xbd, 0x5c, 0xbb, 0x96, 0xbc, 0xa7, 0xdc, 0x07, 0x3c, 0xc4, 0xe5, 0xd8,
+ 0x3c, 0xb9, 0xea, 0x11, 0x3c, 0x10, 0x39, 0x13, 0x3a, 0x18, 0x34, 0x28, 0xbd,
+ 0xf4, 0x41, 0x6c, 0x3c, 0x25, 0x46, 0x12, 0xbd, 0xf9, 0x23, 0x3f, 0x3d, 0xfc,
+ 0x1d, 0xd9, 0x3d, 0x68, 0xc6, 0xa9, 0xbc, 0x97, 0x32, 0x1c, 0xbd, 0x3f, 0x51,
+ 0xbf, 0x3d, 0x7e, 0xd5, 0x3c, 0x3c, 0xda, 0x77, 0xcb, 0xbd, 0x10, 0x52, 0xb6,
+ 0xbc, 0xd8, 0xbd, 0x9b, 0x3d, 0x43, 0xd7, 0x7c, 0x3d, 0x4c, 0x78, 0xb2, 0xbc,
+ 0x7c, 0xda, 0xc9, 0xbc, 0x31, 0x8c, 0x4d, 0x3d, 0x82, 0x0e, 0xcb, 0xbc, 0xed,
+ 0xf9, 0xe8, 0x3b, 0xa8, 0x08, 0x4b, 0x3d, 0x38, 0x3c, 0x4a, 0xbd, 0x1d, 0xd9,
+ 0x0f, 0xbd, 0xd6, 0x17, 0x86, 0x3b, 0xa1, 0x90, 0xab, 0x3d, 0x91, 0xcc, 0x8f,
+ 0xbd, 0x07, 0xfa, 0x39, 0x3d, 0x11, 0x95, 0x03, 0x3d, 0x29, 0x0f, 0x31, 0xbc,
+ 0x87, 0xab, 0x3c, 0x3d, 0xc8, 0xe5, 0x5c, 0xb9, 0x44, 0x79, 0x44, 0xbd, 0x6d,
+ 0x4c, 0x90, 0xbc, 0x86, 0x90, 0xa5, 0xbc, 0x47, 0x61, 0x39, 0xbe, 0xf9, 0xeb,
+ 0x17, 0x3b, 0xea, 0x28, 0xe4, 0xbc, 0x79, 0x88, 0x12, 0xbc, 0x7a, 0x61, 0xdd,
+ 0x3d, 0x7f, 0xfe, 0x49, 0x3d, 0x78, 0x92, 0x5c, 0xbd, 0x6d, 0xe2, 0xa4, 0x3b,
+ 0x68, 0x57, 0x27, 0xbd, 0x61, 0x22, 0xaf, 0x3c, 0x02, 0x98, 0x6e, 0x3d, 0x74,
+ 0x02, 0xbb, 0x3d, 0x33, 0x4d, 0x24, 0xbd, 0x3e, 0x93, 0x81, 0xbc, 0xb2, 0x1e,
+ 0x1f, 0x3d, 0xb5, 0x79, 0x64, 0x3b, 0xbc, 0xfb, 0xf6, 0xbc, 0x61, 0x0c, 0xcd,
+ 0xbd, 0xc1, 0x64, 0x08, 0x3c, 0x6f, 0x3d, 0x27, 0xbd, 0x10, 0xd3, 0xdb, 0xbc,
+ 0xe4, 0xb6, 0xd2, 0x3b, 0x51, 0x12, 0x81, 0x3d, 0x37, 0xee, 0x87, 0xbc, 0xdd,
+ 0x80, 0xaf, 0x39, 0x90, 0x85, 0xaf, 0x3d, 0x80, 0x5f, 0x12, 0xbc, 0xcb, 0x3c,
+ 0x63, 0xbd, 0x81, 0x3c, 0x85, 0x3d, 0x10, 0xe7, 0x54, 0xbc, 0xa6, 0xb7, 0x98,
+ 0xbc, 0x07, 0x98, 0x2f, 0x3d, 0x70, 0x80, 0x28, 0xbe, 0x7a, 0xe5, 0x77, 0x3d,
+ 0x0b, 0x81, 0x51, 0xbd, 0xb1, 0xdf, 0x35, 0xbc, 0xd2, 0xf7, 0x0b, 0x3d, 0xbe,
+ 0x9e, 0x02, 0xbd, 0xa2, 0xc0, 0x03, 0x3d, 0x97, 0xf5, 0x2f, 0xbb, 0xc6, 0x6b,
+ 0x13, 0xbd, 0x81, 0xbc, 0xe8, 0xbb, 0x2a, 0x57, 0x63, 0x3d, 0x49, 0x18, 0x51,
+ 0xbc, 0xd7, 0x9e, 0x44, 0xbd, 0x51, 0x59, 0xb8, 0x3b, 0x5b, 0x9b, 0x86, 0x3c,
+ 0x1d, 0x63, 0x8a, 0x3d, 0x15, 0xc7, 0x94, 0xbd, 0x43, 0xc8, 0x05, 0xbd, 0x7b,
+ 0xc8, 0x26, 0x3d, 0xdc, 0x03, 0xbd, 0x3c, 0xa0, 0x16, 0x2b, 0xbd, 0x33, 0x15,
+ 0xfa, 0x3c, 0xfe, 0xce, 0x91, 0xbc, 0x0f, 0x1e, 0xe3, 0x3b, 0x01, 0x19, 0x2b,
+ 0xbd, 0x26, 0xff, 0x53, 0x3c, 0x4f, 0x22, 0x91, 0xbb, 0xf6, 0x4f, 0x84, 0xbd,
+ 0xc5, 0xf6, 0x8a, 0x3d, 0x76, 0xcf, 0x90, 0xbd, 0x4d, 0x0e, 0xb7, 0x3d, 0x90,
+ 0x1f, 0xd0, 0xbc, 0xd8, 0xa6, 0x7c, 0xbd, 0x39, 0xa0, 0x70, 0x3c, 0x33, 0x14,
+ 0x91, 0xbd, 0xa4, 0x66, 0x12, 0xbb, 0xfd, 0x3b, 0x4e, 0x3d, 0x87, 0x72, 0x0c,
+ 0x3d, 0xa1, 0x1b, 0x7b, 0xbc, 0xe0, 0x0f, 0xb5, 0xbc, 0x74, 0x49, 0x42, 0xbd,
+ 0x61, 0x8f, 0x34, 0x3d, 0x40, 0x4a, 0xb0, 0xbc, 0x19, 0xf3, 0x14, 0x3d, 0x5c,
+ 0xd5, 0x8a, 0x3d, 0x4e, 0xd1, 0x54, 0x3d, 0xd8, 0x0b, 0x0d, 0x3d, 0x04, 0x61,
+ 0x85, 0x3d, 0x7e, 0x9e, 0x33, 0x3d, 0xd7, 0x75, 0xcb, 0x3b, 0x71, 0x7a, 0x89,
+ 0xbb, 0xb5, 0x56, 0x62, 0xbd, 0x00, 0xe5, 0x87, 0xbc, 0x84, 0x92, 0xca, 0xbc,
+ 0xf4, 0x15, 0xbb, 0xbc, 0xe7, 0xae, 0xc5, 0x3a, 0x8a, 0x96, 0x98, 0x3c, 0x55,
+ 0xb6, 0x9a, 0xbc, 0x59, 0x6f, 0x2c, 0x3d, 0x5b, 0x3b, 0x14, 0x3c, 0xd7, 0xb4,
+ 0xa6, 0x3b, 0x3f, 0x09, 0x21, 0x3d, 0x64, 0xfc, 0x54, 0x3c, 0x03, 0xd5, 0xf4,
+ 0xbc, 0x06, 0x74, 0xb6, 0xbd, 0xd5, 0x70, 0x0b, 0xbd, 0xa6, 0xf8, 0x4b, 0x3c,
+ 0xea, 0x46, 0x32, 0xbd, 0xb4, 0x06, 0x3b, 0x3c, 0xc2, 0xa8, 0x0d, 0xbb, 0x12,
+ 0x60, 0x6f, 0x3c, 0x20, 0xca, 0x10, 0x3c, 0x05, 0xcc, 0xa6, 0xbc, 0x7a, 0xdd,
+ 0xdf, 0xbb, 0xcc, 0x65, 0x9e, 0x3c, 0x02, 0x81, 0xe3, 0x3c, 0x58, 0x15, 0x90,
+ 0x3d, 0x80, 0x4a, 0xb2, 0xbd, 0xd3, 0x92, 0x8d, 0x3d, 0xc8, 0x03, 0xd9, 0xbc,
+ 0xc9, 0xce, 0x49, 0xbd, 0x57, 0xb1, 0x87, 0xbc, 0xf8, 0xc8, 0xb9, 0x3d, 0xb5,
+ 0x6a, 0x02, 0xbd, 0x60, 0xe3, 0x24, 0x3d, 0xb3, 0xdd, 0x4d, 0x3d, 0x87, 0x6d,
+ 0x0e, 0xbd, 0xea, 0x2d, 0x67, 0xbd, 0x62, 0x3b, 0xa9, 0xbc, 0xd1, 0x23, 0x79,
+ 0x3d, 0x27, 0x90, 0x1a, 0x3d, 0xfa, 0xf4, 0xa3, 0x3c, 0x88, 0xf8, 0x76, 0xbd,
+ 0x48, 0x27, 0x4e, 0xbd, 0xad, 0xe7, 0x6d, 0x3c, 0xbd, 0x3f, 0xba, 0x3d, 0x6a,
+ 0x30, 0xb8, 0xbd, 0x2e, 0x5c, 0xc7, 0xbb, 0x76, 0x8f, 0x85, 0xbc, 0x9d, 0x0f,
+ 0x48, 0x3d, 0xae, 0x8b, 0xa4, 0x3d, 0x72, 0xca, 0x36, 0x3d, 0xcd, 0xab, 0xad,
+ 0xbc, 0xf4, 0x68, 0x11, 0xbd, 0xe4, 0xf0, 0x20, 0x39, 0x85, 0x8d, 0x52, 0xbd,
+ 0x73, 0x80, 0x89, 0x3d, 0x3e, 0x97, 0x11, 0xbd, 0x44, 0xe7, 0x13, 0x3d, 0x25,
+ 0xc3, 0x68, 0x3d, 0x4f, 0x88, 0x1c, 0x3d, 0x51, 0x5f, 0x86, 0xbc, 0xce, 0x97,
+ 0xfb, 0xbc, 0x0e, 0x5c, 0x11, 0xbd, 0x00, 0x0f, 0x05, 0x3d, 0x8c, 0x5a, 0xe2,
+ 0x3c, 0xdb, 0x30, 0x8c, 0x3d, 0x69, 0xac, 0xd6, 0x3c, 0xb6, 0x26, 0x22, 0x3d,
+ 0x11, 0x74, 0x72, 0xbd, 0x85, 0xc5, 0x4e, 0x3b, 0x9c, 0x72, 0x9e, 0x3d, 0xa6,
+ 0x49, 0x25, 0xbd, 0x9e, 0x77, 0x23, 0x3c, 0x01, 0xbf, 0x35, 0xbc, 0xf9, 0x0a,
+ 0x06, 0xbd, 0x66, 0xc8, 0x70, 0xbd, 0xb9, 0x54, 0x80, 0x3d, 0x70, 0x83, 0xd1,
+ 0xbc, 0x7b, 0x7a, 0xd5, 0xbc, 0x72, 0x5e, 0x1e, 0xbd, 0x7d, 0xb0, 0x24, 0x3d,
+ 0x88, 0x95, 0x3b, 0x3d, 0xb9, 0xc0, 0x4f, 0xbc, 0xf6, 0xf0, 0xcc, 0x3c, 0x6e,
+ 0x8d, 0x20, 0x3c, 0x0e, 0xe0, 0x8f, 0xbd, 0xfe, 0xd6, 0x2f, 0xbe, 0x40, 0x5e,
+ 0x05, 0x3c, 0x43, 0x3c, 0x1f, 0x3d, 0x2b, 0xfe, 0x63, 0xbd, 0xac, 0xfc, 0x78,
+ 0x3d, 0x89, 0xc7, 0x7b, 0xbd, 0xf8, 0x57, 0x38, 0xbd, 0x27, 0xf8, 0x9f, 0x3c,
+ 0xfe, 0xbe, 0x93, 0xbc, 0xa7, 0x0b, 0x52, 0xbc, 0xf9, 0xc1, 0xae, 0x3c, 0x84,
+ 0xf4, 0x6a, 0xbc, 0x3c, 0xcf, 0xf6, 0xba, 0x16, 0x08, 0x95, 0xbc, 0xcf, 0xf0,
+ 0x57, 0xbd, 0x5e, 0x93, 0x98, 0xbd, 0x84, 0x6a, 0xb4, 0x3d, 0xf6, 0x01, 0xe7,
+ 0xbc, 0x52, 0x9a, 0x85, 0xbc, 0x25, 0x22, 0x99, 0x3d, 0x00, 0xa0, 0x87, 0xbb,
+ 0xf8, 0xb5, 0x0e, 0xbc, 0xcd, 0xd6, 0x3d, 0x3d, 0x01, 0x80, 0x2d, 0xbe, 0xf5,
+ 0xcb, 0x94, 0x3d, 0x65, 0x93, 0x7f, 0xbc, 0x90, 0x42, 0x98, 0x3c, 0x1c, 0x10,
+ 0x13, 0x3d, 0xed, 0xb4, 0x8e, 0x3d, 0xdb, 0xd9, 0x01, 0xbd, 0x18, 0xe6, 0x8b,
+ 0x3c, 0x64, 0x69, 0x60, 0x3b, 0x63, 0x00, 0x1c, 0xbd, 0xe4, 0x57, 0x43, 0x3d,
+ 0xac, 0x16, 0xdc, 0x3d, 0x3d, 0x41, 0x3d, 0xbd, 0x18, 0xcb, 0x34, 0xbd, 0x28,
+ 0x93, 0x06, 0x3b, 0xf2, 0x17, 0x02, 0xbd, 0x2d, 0x29, 0x07, 0xbd, 0xde, 0xd1,
+ 0x88, 0xbc, 0xd8, 0x1e, 0x86, 0x3d, 0xda, 0xd2, 0xe3, 0xbb, 0xb6, 0xd8, 0x66,
+ 0xbd, 0xe9, 0xbd, 0x91, 0x3d, 0xd2, 0xf8, 0xa1, 0x3d, 0xce, 0x41, 0x1f, 0x3d,
+ 0x33, 0x84, 0xfa, 0xbc, 0xa7, 0x81, 0x8f, 0x3c, 0xe2, 0xf0, 0xda, 0xbc, 0x8d,
+ 0x67, 0x2a, 0x3d, 0xee, 0x5c, 0xef, 0x3d, 0x00, 0xf6, 0x3c, 0xbb, 0xcd, 0xa3,
+ 0x70, 0x3d, 0x3a, 0x58, 0x89, 0x3d, 0x03, 0xe3, 0x15, 0xbe, 0xfc, 0x75, 0x10,
+ 0x3c, 0xcc, 0xc4, 0x23, 0xbc, 0xd8, 0x48, 0x1f, 0x3c, 0xb2, 0x7c, 0xa1, 0x3a,
+ 0x7f, 0x0b, 0xda, 0x3d, 0x0d, 0xd0, 0x03, 0x3d, 0xf3, 0xca, 0xd9, 0x3b, 0x72,
+ 0x97, 0x1a, 0x3c, 0x5c, 0x19, 0xfa, 0xbd, 0xaa, 0x5d, 0x12, 0x3d, 0x75, 0xda,
+ 0x58, 0x3d, 0xec, 0x05, 0xb1, 0x3c, 0x6a, 0x21, 0xd9, 0xbc, 0x1d, 0x2c, 0x8c,
+ 0x3c, 0xfa, 0x2f, 0x1e, 0xbd, 0x93, 0x81, 0x98, 0xba, 0x42, 0x27, 0x62, 0xbd,
+ 0x1a, 0xe3, 0xa5, 0x3d, 0x17, 0x24, 0x18, 0xbc, 0x73, 0x8a, 0x24, 0xbd, 0xea,
+ 0x88, 0x92, 0xbc, 0x9d, 0x8d, 0xf7, 0xbc, 0xb4, 0xa6, 0xc8, 0xbd, 0xa0, 0xdd,
+ 0x8e, 0xbd, 0x4c, 0x81, 0x72, 0x3d, 0x59, 0x67, 0x48, 0xbd, 0x23, 0x21, 0xb3,
+ 0x3c, 0x6a, 0xc5, 0x43, 0x3d, 0x13, 0x50, 0x85, 0x3d, 0x0a, 0xd5, 0xb9, 0x3c,
+ 0xf3, 0xe6, 0x2b, 0xbd, 0x32, 0x6c, 0xe6, 0xbc, 0x11, 0x7c, 0x05, 0x3d, 0x99,
+ 0xeb, 0x48, 0xbc, 0x7d, 0x87, 0x35, 0xbd, 0x8b, 0x42, 0x5f, 0x3d, 0xae, 0x56,
+ 0x10, 0x3d, 0x02, 0x1e, 0x96, 0x3d, 0xf7, 0x64, 0xab, 0x3d, 0x66, 0xc3, 0xa2,
+ 0x3c, 0xe6, 0x36, 0xd8, 0xbc, 0x8c, 0xaa, 0x29, 0x3d, 0x52, 0x0b, 0x8b, 0xbc,
+ 0xce, 0x93, 0xef, 0xbc, 0xd9, 0x9b, 0x2c, 0xbd, 0x4a, 0x7a, 0xe6, 0x3c, 0xa1,
+ 0xdb, 0xaa, 0x3d, 0xfe, 0xac, 0x77, 0x3c, 0xd0, 0x02, 0xe2, 0xbc, 0x1c, 0xec,
+ 0xef, 0xbc, 0xe0, 0x92, 0xad, 0xbd, 0x46, 0xe8, 0x02, 0x3d, 0xd0, 0x99, 0x45,
+ 0x3b, 0x8a, 0xbc, 0x3f, 0xbd, 0x02, 0x86, 0x84, 0xbd, 0x34, 0xfb, 0xc3, 0xbd,
+ 0x71, 0xb4, 0xb7, 0x3d, 0xc0, 0x74, 0x42, 0xbb, 0xba, 0xef, 0x5d, 0xbc, 0x2b,
+ 0xd3, 0x21, 0x3c, 0x5a, 0xa2, 0xe4, 0xbc, 0x9f, 0xa9, 0x80, 0xbd, 0xa0, 0x48,
+ 0xb3, 0x3d, 0x39, 0xbb, 0xa4, 0xbd, 0xa9, 0x25, 0xb4, 0x3d, 0xb7, 0x12, 0xf3,
+ 0xbc, 0x25, 0x61, 0x37, 0xbd, 0xb9, 0x66, 0x80, 0x3d, 0xcd, 0xce, 0xcf, 0x3d,
+ 0x9f, 0xd0, 0x90, 0xbc, 0xd7, 0xbd, 0xf4, 0x3c, 0x20, 0x96, 0x8e, 0xbd, 0xd9,
+ 0xdf, 0x00, 0xbe, 0x8c, 0xf9, 0x5d, 0xbc, 0x58, 0xf0, 0x1e, 0x3d, 0xee, 0xec,
+ 0x2f, 0xbd, 0x32, 0x6b, 0x46, 0xbd, 0x72, 0x10, 0x2e, 0x3d, 0x33, 0x5a, 0x09,
+ 0xbd, 0x43, 0x78, 0x14, 0x3d, 0x33, 0xde, 0xa1, 0xbd, 0xcd, 0x6e, 0x35, 0x3c,
+ 0x05, 0x48, 0x22, 0xbd, 0x5b, 0x57, 0x80, 0x3d, 0x66, 0x64, 0xd7, 0x3b, 0x26,
+ 0xf1, 0x1a, 0x3c, 0x81, 0x24, 0x8a, 0xbd, 0x00, 0x84, 0x5e, 0xbd, 0xbc, 0xc0,
+ 0xdc, 0x3b, 0x74, 0x77, 0xa3, 0x3d, 0x8a, 0x55, 0xe3, 0x3c, 0x84, 0x75, 0x2e,
+ 0x3d, 0x45, 0x17, 0x3c, 0x3d, 0xcf, 0xd9, 0x62, 0xbd, 0x6e, 0x1c, 0xd2, 0x3c,
+ 0x6e, 0xe1, 0x21, 0xbe, 0x36, 0xf2, 0x95, 0x3d, 0x44, 0x50, 0x00, 0xba, 0x87,
+ 0x5b, 0xc8, 0xbc, 0xeb, 0xe0, 0xbd, 0x3d, 0x92, 0x7c, 0xff, 0x3c, 0x34, 0x97,
+ 0x32, 0x3d, 0x8f, 0x57, 0x73, 0x3d, 0x70, 0xfe, 0x5b, 0x3c, 0xba, 0x43, 0xee,
+ 0xbc, 0xa8, 0x7b, 0x06, 0x3c, 0xfc, 0x87, 0x8f, 0x3d, 0xf2, 0xd6, 0x43, 0xbd,
+ 0x18, 0x3c, 0x11, 0xbc, 0x1e, 0xc3, 0x62, 0x3c, 0x46, 0x98, 0x9e, 0x3c, 0x5a,
+ 0x90, 0xc4, 0xbc, 0xe6, 0x6b, 0x72, 0xbd, 0xce, 0x30, 0xa7, 0x3d, 0x81, 0xa2,
+ 0x10, 0xbd, 0x4e, 0x75, 0x24, 0x3d, 0xff, 0x9d, 0xea, 0xbc, 0x25, 0x08, 0x92,
+ 0x3c, 0x50, 0x0a, 0xf0, 0xbb, 0xf0, 0x91, 0x8d, 0xbc, 0x4c, 0xd8, 0xc8, 0x3c,
+ 0x16, 0xbb, 0x5d, 0xbd, 0x24, 0x8d, 0x32, 0x3d, 0x75, 0x67, 0x64, 0x3d, 0xe0,
+ 0x67, 0x46, 0x3b, 0xbc, 0x93, 0xbb, 0x3c, 0xd2, 0x74, 0x17, 0xbd, 0x45, 0x88,
+ 0x21, 0xbe, 0x4d, 0x15, 0x95, 0x3d, 0x41, 0x5c, 0xe7, 0xbb, 0xc9, 0x97, 0xfd,
+ 0xbc, 0x3b, 0xe2, 0x0f, 0xbd, 0x57, 0x38, 0xab, 0x3d, 0x13, 0x12, 0xeb, 0x3c,
+ 0x92, 0x5d, 0x4f, 0x3d, 0xf0, 0x1f, 0xbf, 0xbc, 0x37, 0x63, 0xf7, 0xbc, 0xa8,
+ 0x76, 0x32, 0x3c, 0x97, 0xd3, 0xc9, 0xbc, 0x28, 0x83, 0x5b, 0x3d, 0xe2, 0x0f,
+ 0x90, 0xbd, 0x31, 0x0b, 0x8a, 0xbd, 0x04, 0x7c, 0xd5, 0xbc, 0x16, 0x5d, 0xa7,
+ 0x3a, 0x54, 0x36, 0x4f, 0xbd, 0x4d, 0xae, 0x64, 0x3d, 0xfd, 0x4c, 0x94, 0xbc,
+ 0x72, 0x3f, 0x96, 0xbc, 0x41, 0xd7, 0xfa, 0x3b, 0x52, 0x45, 0x03, 0xbc, 0x1f,
+ 0x50, 0xa6, 0xbd, 0x28, 0xb9, 0x78, 0x3c, 0x16, 0xa5, 0x77, 0x3c, 0xf2, 0x4e,
+ 0xa1, 0x3c, 0x84, 0xb6, 0x84, 0xbd, 0xc5, 0x78, 0xdc, 0x3c, 0xb4, 0xd1, 0x27,
+ 0xbd, 0x04, 0x20, 0x8d, 0xbd, 0xa0, 0x12, 0x36, 0x3c, 0xce, 0xb5, 0x31, 0xbe,
+ 0x4b, 0xfd, 0x44, 0xbc, 0xe3, 0x38, 0x00, 0xbd, 0xca, 0x35, 0x60, 0x3c, 0xc6,
+ 0xe4, 0x93, 0xb6, 0xc9, 0x84, 0xc0, 0x3a, 0xb3, 0x53, 0x88, 0x3d, 0x08, 0x37,
+ 0x0b, 0x3c, 0xd9, 0x6d, 0x00, 0xbb, 0x54, 0x22, 0xcc, 0xbb, 0x3c, 0x72, 0xa7,
+ 0xbc, 0x39, 0xbd, 0xc0, 0x3d, 0xc7, 0xb5, 0x0a, 0x3b, 0xe3, 0xbc, 0x38, 0xbc,
+ 0x0d, 0x1c, 0x1f, 0xbc, 0xbc, 0x5b, 0x42, 0xbc, 0xf3, 0x43, 0xb2, 0x3c, 0x5e,
+ 0x7e, 0xc3, 0xbc, 0x40, 0xbf, 0x47, 0x3c, 0xe7, 0x7d, 0x3e, 0xbc, 0x30, 0xf4,
+ 0x13, 0xbc, 0x5f, 0x8d, 0xd1, 0x3c, 0xe1, 0x93, 0xe7, 0xbc, 0x73, 0x12, 0x87,
+ 0xbc, 0x52, 0xb6, 0x9d, 0x3b, 0xf6, 0xda, 0x8d, 0x3d, 0x6b, 0xb8, 0x03, 0x3c,
+ 0x58, 0x8e, 0x25, 0xbd, 0x7b, 0xaa, 0x8a, 0xbc, 0x75, 0xd1, 0x84, 0x3d, 0x0e,
+ 0x90, 0xcd, 0xbc, 0x17, 0x0e, 0x8b, 0x3d, 0x87, 0x5e, 0x04, 0xbd, 0xe5, 0x99,
+ 0x9b, 0xbc, 0x0a, 0xdd, 0x3b, 0x3d, 0x22, 0xc9, 0x83, 0xbc, 0xb8, 0x42, 0x3f,
+ 0x3d, 0x86, 0x99, 0x90, 0x3d, 0x41, 0x4e, 0xa2, 0x3d, 0xf0, 0x89, 0x4f, 0xbd,
+ 0xa6, 0x28, 0x75, 0xbd, 0xea, 0xf1, 0x56, 0xbd, 0x96, 0xb0, 0x9b, 0xbc, 0x01,
+ 0x85, 0xb5, 0x3d, 0xcf, 0x71, 0x4c, 0x3d, 0x98, 0xf9, 0x6d, 0xbc, 0xc8, 0x59,
+ 0x38, 0xbd, 0x12, 0x6f, 0x7b, 0x3d, 0x61, 0xac, 0xf1, 0xbb, 0xd4, 0x32, 0x4a,
+ 0x3d, 0x92, 0x25, 0x45, 0x3d, 0x53, 0x88, 0x6d, 0xbd, 0xa0, 0x69, 0xda, 0xbb,
+ 0xf2, 0xf2, 0xda, 0x3b, 0xf3, 0x4d, 0x84, 0xbc, 0x61, 0x96, 0xda, 0x3c, 0xa3,
+ 0x9c, 0x9a, 0x3b, 0x70, 0x04, 0x93, 0xbb, 0x11, 0x0f, 0xe7, 0xbc, 0x06, 0x52,
+ 0x86, 0xbd, 0x0f, 0xf5, 0x6c, 0xbd, 0xe1, 0x4c, 0x8d, 0x3d, 0x59, 0x20, 0xa0,
+ 0xbd, 0xf8, 0x29, 0x94, 0x3d, 0x3f, 0x89, 0x86, 0xbd, 0x15, 0x66, 0x15, 0xbd,
+ 0xad, 0x80, 0xdf, 0x3c, 0x5b, 0xd4, 0x6c, 0xbc, 0x2c, 0x5f, 0x60, 0x3c, 0x2b,
+ 0x82, 0xd5, 0x3c, 0x3f, 0x7e, 0x14, 0xbd, 0x6c, 0xe8, 0xaf, 0xbb, 0xee, 0x8b,
+ 0x27, 0xbd, 0xa0, 0xa8, 0x20, 0xbd, 0xe8, 0x39, 0x54, 0xbc, 0x9b, 0x57, 0xb7,
+ 0x3d, 0x6a, 0x42, 0x81, 0x3d, 0xd3, 0x09, 0x10, 0xbd, 0x95, 0xd4, 0x3a, 0x3d,
+ 0x48, 0xe1, 0xb8, 0xbc, 0xf4, 0x91, 0xa0, 0xbd, 0x8e, 0x67, 0x5e, 0xbd, 0x3b,
+ 0x3d, 0xa0, 0x3d, 0x82, 0x2e, 0x85, 0x3d, 0x10, 0x91, 0x8c, 0xbb, 0x63, 0xb7,
+ 0x75, 0xbd, 0xf5, 0xd8, 0x35, 0xbd, 0xea, 0x58, 0x11, 0xbb, 0xc4, 0x87, 0xe5,
+ 0xbc, 0xb4, 0x14, 0xce, 0x3d, 0x86, 0x00, 0x0b, 0x3c, 0x91, 0x4b, 0xb2, 0xbd,
+ 0xa9, 0x2e, 0x93, 0x3d, 0xc3, 0x3a, 0xc3, 0xbb, 0x7c, 0x8a, 0x83, 0xbd, 0xd2,
+ 0xb1, 0x2e, 0xbd, 0xbb, 0x27, 0xa9, 0xbd, 0xa7, 0x9f, 0x41, 0x3d, 0x0a, 0x47,
+ 0x15, 0xbd, 0xeb, 0x11, 0xca, 0x3c, 0xfe, 0x0d, 0xef, 0xbc, 0x71, 0x53, 0x52,
+ 0x3d, 0x0b, 0x4b, 0x44, 0x3c, 0x9d, 0xbf, 0x10, 0xbb, 0xf9, 0x31, 0xe6, 0x3c,
+ 0x97, 0x60, 0xbd, 0xbd, 0x8c, 0x40, 0x87, 0x3c, 0x30, 0x66, 0x18, 0x3d, 0x1a,
+ 0x2b, 0xcd, 0x3c, 0x52, 0x92, 0x7e, 0xbd, 0x58, 0xee, 0x02, 0x3d, 0x0a, 0x85,
+ 0xf7, 0xbc, 0x76, 0x75, 0x7f, 0xbd, 0xff, 0x11, 0xde, 0x3b, 0x5b, 0x43, 0x4b,
+ 0x3d, 0xa2, 0x53, 0x3f, 0xbd, 0x90, 0xf3, 0x42, 0xbd, 0x5b, 0xb9, 0x1e, 0x3d,
+ 0x43, 0x66, 0x46, 0xbc, 0x3e, 0x79, 0x7f, 0xbd, 0x24, 0xa8, 0xa0, 0xbd, 0xd5,
+ 0xb2, 0xd2, 0x3c, 0xf6, 0x82, 0x7d, 0x3b, 0x52, 0x09, 0x4e, 0xbd, 0x23, 0x30,
+ 0xfa, 0x3d, 0x62, 0xb4, 0x72, 0x3d, 0xa6, 0x3c, 0x98, 0x3c, 0x20, 0x3f, 0xdd,
+ 0xbb, 0xb0, 0xfa, 0x4f, 0xbd, 0x0f, 0x36, 0x24, 0xbb, 0x19, 0xbc, 0x7d, 0xbd,
+ 0x8d, 0xab, 0x2e, 0x3d, 0x1e, 0x67, 0x61, 0x3d, 0x8a, 0x39, 0x61, 0xbb, 0xb1,
+ 0xa0, 0x01, 0xbc, 0x0d, 0x75, 0x64, 0xbc, 0x89, 0xd7, 0x84, 0xbd, 0x1f, 0x26,
+ 0xa6, 0xbd, 0x7a, 0x67, 0x62, 0x3d, 0x3d, 0x4d, 0x06, 0xbb, 0xff, 0xe4, 0x92,
+ 0x3d, 0x32, 0x12, 0x95, 0xbc, 0x4b, 0x2e, 0x8b, 0xbc, 0x8b, 0x4a, 0x14, 0x3c,
+ 0xea, 0x08, 0x81, 0xbd, 0xb3, 0x3e, 0xb3, 0xbd, 0x96, 0x40, 0xef, 0x3c, 0xc6,
+ 0xf4, 0x83, 0xbd, 0x70, 0x8a, 0xad, 0xbc, 0x28, 0x6d, 0x26, 0xbd, 0x0e, 0x8f,
+ 0x89, 0x3a, 0xbc, 0x30, 0xc8, 0xbd, 0x81, 0x3c, 0x22, 0xbd, 0x19, 0x06, 0xb4,
+ 0x3d, 0x2a, 0xbf, 0x2a, 0x3d, 0xc9, 0xd4, 0x00, 0xbd, 0x74, 0x7d, 0x9b, 0x3b,
+ 0xc5, 0x7a, 0x13, 0xbd, 0xbf, 0x24, 0x18, 0xbc, 0x63, 0x21, 0xfd, 0x3c, 0x8f,
+ 0x45, 0xf6, 0xbd, 0xf6, 0xb7, 0x85, 0x3c, 0x49, 0xc7, 0xee, 0xbb, 0x31, 0x16,
+ 0x9c, 0x3d, 0x86, 0x9e, 0x44, 0x3d, 0x97, 0x25, 0x99, 0x3d, 0x33, 0x23, 0xa6,
+ 0x3d, 0x7f, 0x66, 0x2b, 0x3d, 0xbd, 0xe9, 0x43, 0x3d, 0x11, 0x56, 0x76, 0xbc,
+ 0x30, 0x7c, 0x87, 0xbb, 0xfe, 0xae, 0xfb, 0xb8, 0x4c, 0x48, 0x47, 0xbd, 0x74,
+ 0x13, 0x8b, 0xbd, 0x26, 0x22, 0x87, 0x3d, 0x22, 0xb0, 0x87, 0x3d, 0x9f, 0xc6,
+ 0x74, 0xbd, 0x7a, 0x47, 0x70, 0x3c, 0xe0, 0x41, 0x8b, 0x3d, 0xfb, 0xa2, 0x43,
+ 0xbc, 0x63, 0x0d, 0x21, 0xbd, 0x8a, 0x60, 0x36, 0xbb, 0x54, 0xe8, 0x59, 0x3c,
+ 0x21, 0xd4, 0xa9, 0x3b, 0x00, 0x5b, 0x20, 0x3d, 0x61, 0x25, 0x72, 0x3d, 0x39,
+ 0x8d, 0x3b, 0x3d, 0x5e, 0xcd, 0x4f, 0x3d, 0xa0, 0x47, 0x0c, 0xbd, 0x34, 0xc9,
+ 0x09, 0x3d, 0xb8, 0x59, 0xa2, 0xbc, 0x9a, 0xa3, 0x82, 0x3d, 0x1b, 0xd4, 0x1f,
+ 0xbe, 0xa4, 0x45, 0x9d, 0x3d, 0x9e, 0x03, 0xc6, 0x3c, 0x0c, 0x23, 0x30, 0x3d,
+ 0x9c, 0xb4, 0xec, 0xbb, 0xf8, 0x66, 0x9c, 0xbc, 0x6c, 0x32, 0x7e, 0x3d, 0x4b,
+ 0x32, 0x51, 0x3d, 0x64, 0x32, 0x75, 0x3d, 0x1b, 0xc9, 0xd1, 0x3c, 0x98, 0xac,
+ 0x05, 0x3d, 0x4a, 0x99, 0x74, 0x3b, 0x40, 0x86, 0x41, 0xbd, 0xf6, 0xa7, 0x03,
+ 0xbd, 0x95, 0x47, 0x23, 0x3c, 0x78, 0xf3, 0x0c, 0x3d, 0xf4, 0x66, 0xdc, 0x3b,
+ 0x4d, 0x45, 0xbf, 0xbb, 0x65, 0x4b, 0x73, 0xbc, 0x51, 0x10, 0x8c, 0x3c, 0x5e,
+ 0x5a, 0x67, 0x3d, 0xd7, 0x47, 0x82, 0x3d, 0xdc, 0x32, 0x9c, 0xbc, 0xe4, 0xa5,
+ 0x87, 0xbd, 0xc2, 0xd2, 0xc4, 0xbd, 0x08, 0xbe, 0x6e, 0x3d, 0xa8, 0x8b, 0xf1,
+ 0x3c, 0x10, 0xc0, 0xb1, 0xbc, 0x12, 0x09, 0x88, 0x3d, 0x3f, 0x54, 0x25, 0x3d,
+ 0x11, 0x70, 0x26, 0x3b, 0xdd, 0x48, 0x18, 0x3c, 0x01, 0x3c, 0xee, 0xbd, 0x4f,
+ 0x63, 0x36, 0xbc, 0xea, 0x7e, 0x3f, 0x3d, 0x86, 0x4d, 0x45, 0x3d, 0x4b, 0x63,
+ 0x70, 0xbc, 0x32, 0xdf, 0xc0, 0x3d, 0x50, 0x3c, 0x13, 0x3c, 0x0e, 0x61, 0xa3,
+ 0x3d, 0xe8, 0xc5, 0x37, 0xbd, 0x3b, 0xd7, 0x01, 0xbd, 0x20, 0x1b, 0x89, 0xbc,
+ 0x70, 0x18, 0xee, 0xbc, 0x3e, 0xeb, 0xfa, 0xbb, 0x18, 0xda, 0xda, 0x3c, 0xd6,
+ 0x82, 0x19, 0xbd, 0xf1, 0x7e, 0x88, 0xbd, 0x39, 0x1d, 0xb8, 0xbb, 0x67, 0x98,
+ 0x1c, 0x3d, 0x72, 0x83, 0x90, 0x3d, 0xd3, 0x17, 0x6b, 0xbd, 0xcc, 0x55, 0xa8,
+ 0x3c, 0x18, 0x2e, 0x2c, 0xbd, 0x08, 0xc4, 0x34, 0x3c, 0xf8, 0x8f, 0x51, 0xbd,
+ 0x88, 0x62, 0xfe, 0x3c, 0xbc, 0xe0, 0xb1, 0xbc, 0x09, 0x93, 0x88, 0xbb, 0x95,
+ 0x9c, 0xda, 0x3c, 0x83, 0xda, 0x3a, 0xbd, 0xb8, 0x82, 0x81, 0x3c, 0x39, 0xa8,
+ 0x8a, 0xbd, 0x8b, 0xb0, 0x31, 0xbb, 0x4a, 0x2c, 0x07, 0xbe, 0xec, 0x84, 0x9b,
+ 0x3c, 0xc9, 0x97, 0x56, 0x3d, 0x3d, 0xce, 0x97, 0xbd, 0xa6, 0xe3, 0xbc, 0x3d,
+ 0x91, 0xc4, 0x0f, 0x3d, 0x35, 0xe9, 0xd1, 0xbc, 0x10, 0x48, 0x17, 0x3c, 0x9a,
+ 0x86, 0x86, 0xbd, 0x08, 0x63, 0xf9, 0xbc, 0xb0, 0xb0, 0x98, 0x3c, 0x3e, 0x7e,
+ 0x4e, 0x3d, 0xe0, 0x6f, 0x73, 0xbc, 0xa5, 0x9e, 0x03, 0xbd, 0x7c, 0x39, 0x53,
+ 0x39, 0x6d, 0x86, 0x40, 0xba, 0x1d, 0x71, 0x86, 0x3d, 0x62, 0xec, 0x9d, 0x3c,
+ 0x03, 0x1e, 0x29, 0x3d, 0xbd, 0xbf, 0xd2, 0xbd, 0xce, 0x1c, 0x0c, 0x3d, 0x7f,
+ 0xb3, 0x9c, 0x3d, 0x93, 0xa6, 0xa1, 0xbc, 0xb9, 0xf4, 0x6b, 0xbd, 0x17, 0xce,
+ 0x40, 0xbd, 0x33, 0x15, 0x00, 0x3d, 0xd3, 0x33, 0x9c, 0x3d, 0x01, 0xc6, 0xec,
+ 0x3c, 0x65, 0x42, 0xba, 0x3c, 0x33, 0x73, 0xec, 0xbc, 0x47, 0xf8, 0x00, 0x3d,
+ 0xd1, 0x1b, 0x66, 0x3d, 0x10, 0x9b, 0x0b, 0xbe, 0xe6, 0x45, 0x48, 0xbd, 0x90,
+ 0x46, 0xbd, 0x3c, 0x29, 0xe0, 0xb5, 0xbc, 0x50, 0x42, 0x6a, 0x3d, 0x00, 0x37,
+ 0x9e, 0x3d, 0xc1, 0x54, 0xa0, 0x3c, 0x00, 0x3c, 0x2f, 0xbb, 0x05, 0x4f, 0xa7,
+ 0xbc, 0x3d, 0x86, 0x68, 0xbd, 0x24, 0x65, 0x51, 0xbc, 0xff, 0x74, 0x21, 0x3d,
+ 0x81, 0x5d, 0x25, 0x3d, 0x5d, 0xd0, 0x7a, 0xbd, 0x37, 0xb1, 0x40, 0xbd, 0xf0,
+ 0xfd, 0x3d, 0x3d, 0x1e, 0xb2, 0x2a, 0xbc, 0x62, 0x35, 0x9e, 0xbd, 0xeb, 0x65,
+ 0x51, 0xbc, 0x6f, 0xf6, 0x9a, 0xbd, 0x82, 0x5b, 0x81, 0xbc, 0xd7, 0x8a, 0x29,
+ 0x3d, 0x5a, 0x89, 0x81, 0xbb, 0x6d, 0xf8, 0xe0, 0x3c, 0xa6, 0x56, 0x3c, 0x3d,
+ 0x9d, 0xc6, 0x49, 0xbc, 0xdf, 0x38, 0x79, 0x3c, 0x51, 0x74, 0x4e, 0x3d, 0x02,
+ 0xb4, 0x2e, 0xbd, 0x6e, 0x2c, 0x52, 0xbd, 0x98, 0x05, 0x96, 0x3c, 0x5e, 0xef,
+ 0x12, 0x3d, 0xa9, 0x44, 0x29, 0xbd, 0x29, 0xcf, 0x47, 0x3d, 0x08, 0x33, 0xa3,
+ 0xbd, 0xc7, 0xe5, 0x26, 0x3c, 0x16, 0xf0, 0xc7, 0xbc, 0x89, 0xde, 0xa2, 0x3a,
+ 0x57, 0x77, 0xb9, 0x3b, 0xa0, 0x30, 0x9d, 0x3c, 0xd9, 0xf8, 0x91, 0xbc, 0xdc,
+ 0xac, 0x41, 0x3c, 0xc9, 0xe5, 0x1a, 0xbd, 0x66, 0xcc, 0x89, 0x3d, 0xae, 0x83,
+ 0x95, 0xbd, 0xf6, 0x92, 0xd3, 0x3c, 0x6a, 0x9a, 0xf7, 0x3c, 0xb4, 0xf9, 0x7c,
+ 0xbb, 0x79, 0xd8, 0x99, 0xbc, 0x82, 0x88, 0xb6, 0xbc, 0xf7, 0xdf, 0xb3, 0x3d,
+ 0x57, 0xa6, 0xa7, 0xbd, 0x2e, 0x22, 0xd9, 0xbc, 0xd6, 0x67, 0x91, 0xbc, 0x54,
+ 0x25, 0x32, 0x3d, 0xc3, 0x91, 0x93, 0xbd, 0x1d, 0x77, 0x33, 0x3b, 0x56, 0xc9,
+ 0x8b, 0x3d, 0xbf, 0xe2, 0x21, 0x3c, 0xf5, 0x88, 0x80, 0xbd, 0xee, 0x4f, 0xd8,
+ 0xbc, 0xbf, 0x1c, 0x83, 0xbd, 0xa4, 0x91, 0x61, 0x3d, 0xdc, 0xc1, 0x74, 0x3d,
+ 0xb4, 0x4d, 0x90, 0xbd, 0x80, 0x3d, 0xbb, 0x3c, 0x27, 0x03, 0xa2, 0xbb, 0x7e,
+ 0x7e, 0xd9, 0x3c, 0xf4, 0x18, 0x5f, 0xbc, 0xb1, 0xde, 0x83, 0x3d, 0xd5, 0xee,
+ 0x20, 0xbd, 0xbe, 0xa8, 0x7a, 0xbc, 0x01, 0x94, 0x03, 0xbd, 0x27, 0xa8, 0xfc,
+ 0xbd, 0x72, 0x14, 0x56, 0x3d, 0x79, 0x46, 0x0d, 0xbc, 0x69, 0x23, 0xd1, 0x3c,
+ 0x3b, 0x33, 0x49, 0x3d, 0x8d, 0xef, 0x18, 0x3b, 0xe9, 0xe1, 0x8f, 0xbd, 0x4f,
+ 0x45, 0x05, 0x3d, 0x28, 0x80, 0x49, 0x3c, 0xbd, 0x49, 0x18, 0x3d, 0xfd, 0xd4,
+ 0x86, 0x3c, 0xcc, 0x56, 0xa6, 0x3c, 0x37, 0x8e, 0xef, 0x3a, 0x57, 0x1e, 0x5f,
+ 0x3d, 0xc2, 0xef, 0x68, 0xbc, 0x24, 0xc0, 0xbe, 0xbd, 0x9c, 0xfd, 0xa0, 0x3b,
+ 0x48, 0x3b, 0x5d, 0x3d, 0xcf, 0xe0, 0x2c, 0xbd, 0x49, 0x51, 0xa7, 0x3d, 0x65,
+ 0xcf, 0x7a, 0xbc, 0x27, 0x68, 0x4c, 0xbd, 0x00, 0xed, 0x99, 0xbc, 0x2a, 0xac,
+ 0x5d, 0xbd, 0x6b, 0x5c, 0x9a, 0x3c, 0x71, 0xb7, 0x51, 0x3c, 0x1a, 0x04, 0x60,
+ 0xbd, 0x4b, 0xb8, 0x42, 0x3d, 0xf6, 0x92, 0x4f, 0x3d, 0xcb, 0x7a, 0xc4, 0x3c,
+ 0xc2, 0x1f, 0x85, 0x3d, 0xbf, 0x4c, 0x3b, 0x3b, 0x52, 0x04, 0x9a, 0xbd, 0x3a,
+ 0x5c, 0x29, 0x3d, 0x5f, 0x4e, 0xb1, 0x3d, 0xfc, 0x4e, 0x87, 0xbc, 0x59, 0x10,
+ 0xaa, 0x3d, 0x99, 0xff, 0x43, 0x3d, 0x20, 0x80, 0x8e, 0x3c, 0x79, 0x81, 0x3e,
+ 0xbd, 0xfe, 0x38, 0xab, 0xbd, 0x3d, 0x72, 0xad, 0x3d, 0x18, 0xa1, 0x64, 0xbd,
+ 0xa0, 0x6e, 0xb0, 0xbb, 0x19, 0x6b, 0x00, 0x3d, 0x6b, 0x7b, 0x15, 0xbc, 0x45,
+ 0xb5, 0xa6, 0xbd, 0xef, 0x81, 0x05, 0xbd, 0x9f, 0xe8, 0x37, 0x3d, 0x71, 0xbe,
+ 0xb6, 0xbc, 0x22, 0x55, 0xd6, 0xbc, 0x0d, 0x9b, 0xcf, 0x3c, 0x47, 0xa3, 0x92,
+ 0x3d, 0xfd, 0x13, 0x74, 0x3d, 0x4f, 0xef, 0x53, 0x3d, 0x8b, 0xeb, 0x0f, 0xbd,
+ 0xf9, 0x86, 0x00, 0x3d, 0xb8, 0xd1, 0x68, 0xbc, 0x68, 0xa4, 0x1c, 0xbd, 0x96,
+ 0x27, 0x01, 0x3d, 0x28, 0x65, 0x4a, 0x3d, 0xef, 0xa3, 0x41, 0xbd, 0xdd, 0xd4,
+ 0xac, 0x3c, 0x24, 0x42, 0x48, 0x3d, 0x55, 0x49, 0x99, 0x39, 0x7a, 0x2f, 0xde,
+ 0xbc, 0x7f, 0xff, 0x94, 0x3d, 0x76, 0x44, 0x14, 0xbd, 0xea, 0xa9, 0x05, 0x3d,
+ 0xd1, 0xa5, 0x2c, 0x3d, 0xfa, 0x4f, 0x0c, 0xbd, 0xda, 0x0a, 0x6d, 0xbd, 0x52,
+ 0x92, 0x47, 0x3d, 0x8b, 0x87, 0x8b, 0x3d, 0xd0, 0x89, 0x48, 0xbd, 0xaa, 0xbe,
+ 0x03, 0x3d, 0xa0, 0x14, 0x6d, 0xbd, 0x20, 0x3a, 0x80, 0x3d, 0x08, 0x2f, 0x86,
+ 0xbd, 0xf9, 0xfd, 0xa4, 0xbd, 0xde, 0xd5, 0x92, 0xbc, 0xcd, 0x8a, 0x64, 0x3d,
+ 0x48, 0xd0, 0x6c, 0x3d, 0x6a, 0xa3, 0xfa, 0xbc, 0xc3, 0xc7, 0x36, 0xbd, 0xb1,
+ 0x87, 0x2e, 0xbd, 0x3b, 0x6c, 0x9e, 0x3d, 0x56, 0x18, 0x1a, 0xbe, 0x9e, 0xd1,
+ 0xf5, 0x3c, 0xb9, 0xfe, 0xc3, 0xbc, 0x46, 0xbc, 0x40, 0xbd, 0x94, 0x3a, 0x48,
+ 0x3d, 0xbc, 0x4e, 0xbb, 0x3d, 0xa0, 0x7b, 0x94, 0xbc, 0xd8, 0xeb, 0x91, 0x3d,
+ 0x95, 0xa1, 0x99, 0xbd, 0xf4, 0x73, 0x9c, 0x3b, 0x23, 0x2d, 0x8e, 0x3d, 0x46,
+ 0x9c, 0xa5, 0xbb, 0x61, 0x13, 0x50, 0xbd, 0xad, 0x99, 0xf8, 0x3c, 0xd2, 0xac,
+ 0x7d, 0xbd, 0xc1, 0xb2, 0x6d, 0xbc, 0xf7, 0xde, 0x9f, 0xbd, 0x60, 0x72, 0x15,
+ 0x3d, 0x69, 0xaf, 0xa2, 0x3d, 0xfd, 0x72, 0x79, 0x3d, 0xd0, 0xc0, 0xa1, 0xbb,
+ 0x80, 0x21, 0x4f, 0x3d, 0xbc, 0x91, 0x0a, 0xbc, 0x23, 0xa3, 0xee, 0xbc, 0xd0,
+ 0x1a, 0xbb, 0xbd, 0x2a, 0x71, 0x35, 0x3d, 0x21, 0x26, 0x66, 0x3d, 0xb4, 0x17,
+ 0x89, 0xbb, 0x54, 0x4f, 0x80, 0xbc, 0x47, 0x10, 0xf3, 0xbc, 0x22, 0x75, 0x6c,
+ 0x3d, 0xb1, 0x75, 0x00, 0x3d, 0xe2, 0xf4, 0xf5, 0xbd, 0xbe, 0xbc, 0x7b, 0x3d,
+ 0xe3, 0x01, 0xc1, 0xbc, 0x05, 0x25, 0x82, 0xbb, 0x3f, 0x02, 0x5d, 0xbb, 0xa9,
+ 0xc1, 0x5a, 0x3d, 0xea, 0xe4, 0x5e, 0x3c, 0x96, 0xd6, 0xa5, 0x3c, 0xcb, 0x77,
+ 0xa4, 0x3c, 0xb2, 0x4f, 0x06, 0xbd, 0x84, 0xc3, 0x2c, 0xbd, 0x48, 0xdc, 0x9d,
+ 0x3b, 0xdb, 0xd6, 0xbb, 0xbc, 0xc8, 0xdf, 0x98, 0xbc, 0x29, 0x14, 0x31, 0x3d,
+ 0x6f, 0xfa, 0x4f, 0xbd, 0x7c, 0xb4, 0xaa, 0xbd, 0xe0, 0xeb, 0x2e, 0xbd, 0x53,
+ 0x3f, 0xc4, 0x3d, 0xbc, 0xcb, 0x38, 0x3d, 0x30, 0x45, 0x30, 0x3c, 0xf0, 0xc1,
+ 0x0c, 0xbd, 0xb3, 0x20, 0x39, 0xbd, 0x80, 0xe2, 0x8b, 0x3b, 0x35, 0x31, 0x05,
+ 0xbd, 0xf5, 0xaa, 0x49, 0xbc, 0x7d, 0x08, 0x0a, 0x3d, 0xdd, 0x96, 0x84, 0xbc,
+ 0x0f, 0xb9, 0x4c, 0x3d, 0x49, 0xea, 0x86, 0x3d, 0xc9, 0xd0, 0x75, 0xbb, 0xcd,
+ 0x9b, 0xd1, 0x3d, 0x7a, 0x5e, 0x6f, 0xbd, 0x4a, 0x2e, 0xc0, 0xba, 0x3b, 0x7d,
+ 0x7d, 0xbd, 0x2b, 0x8f, 0xfe, 0xbb, 0x2a, 0xf4, 0xce, 0x3d, 0xf6, 0xfc, 0x06,
+ 0xbc, 0xdd, 0x02, 0x4a, 0x3c, 0x71, 0x3c, 0x03, 0xbd, 0x03, 0x9a, 0x90, 0xbd,
+ 0x76, 0xb7, 0xb3, 0xbd, 0xa2, 0xd1, 0x47, 0xbd, 0xc1, 0x56, 0x6e, 0x3d, 0xff,
+ 0x97, 0x57, 0x3d, 0x50, 0x57, 0xe6, 0xbc, 0x8f, 0xb3, 0x3d, 0xbd, 0x75, 0x8e,
+ 0x80, 0xbd, 0xc7, 0x6c, 0x43, 0xbc, 0xaa, 0xe3, 0x9d, 0xbd, 0x6f, 0xe4, 0x1d,
+ 0x3d, 0x3a, 0x57, 0x98, 0x3c, 0x6c, 0x08, 0x5c, 0x3d, 0xeb, 0xd2, 0xa5, 0xbb,
+ 0xf7, 0x60, 0x08, 0xbc, 0x72, 0x03, 0x3b, 0xbd, 0xe7, 0xc1, 0x8f, 0x3d, 0xb6,
+ 0x1f, 0x98, 0x3d, 0x59, 0xff, 0x88, 0x3d, 0x51, 0xe9, 0x73, 0xbc, 0x1f, 0x91,
+ 0xa5, 0x3d, 0x3b, 0x64, 0x17, 0xbd, 0x5b, 0xa5, 0x80, 0x3d, 0x03, 0x38, 0x85,
+ 0x3d, 0xbe, 0x27, 0x90, 0xbd, 0x4e, 0x87, 0xa3, 0xbc, 0xc1, 0xbb, 0x22, 0xbc,
+ 0x8b, 0x25, 0xd0, 0xbb, 0x6a, 0x2f, 0x1d, 0x3d, 0x0a, 0xdd, 0x48, 0x3d, 0x0b,
+ 0x37, 0x37, 0x3d, 0x2a, 0x68, 0x1a, 0x3d, 0xc8, 0x85, 0x4a, 0x3d, 0x0a, 0xa5,
+ 0x03, 0x3c, 0xd2, 0x41, 0x12, 0x3d, 0x25, 0xc3, 0x24, 0x3b, 0x1a, 0x95, 0x33,
+ 0x3d, 0xbf, 0xfd, 0xd7, 0x3c, 0xce, 0xff, 0x6e, 0xbc, 0x91, 0xc5, 0x0f, 0x3c,
+ 0x7e, 0x5f, 0x64, 0xbd, 0x64, 0x7d, 0x1c, 0xbd, 0x42, 0x2d, 0xba, 0x3d, 0x99,
+ 0x69, 0xa5, 0x3c, 0x39, 0x7d, 0x72, 0xbd, 0x6a, 0xbf, 0x8f, 0x3b, 0xaa, 0x43,
+ 0x02, 0x3d, 0xb7, 0xb7, 0x35, 0xbd, 0x97, 0xaf, 0x6c, 0x3c, 0x62, 0x39, 0xd6,
+ 0xbc, 0x33, 0xd6, 0x85, 0x3d, 0x4c, 0x50, 0x47, 0x3d, 0x26, 0x4b, 0x57, 0x3d,
+ 0xf8, 0x80, 0x15, 0x3c, 0x9e, 0x69, 0x05, 0xbc, 0xa4, 0x13, 0xb5, 0x3d, 0x41,
+ 0x17, 0xda, 0xbd, 0x48, 0x79, 0x2b, 0xbb, 0xb4, 0x86, 0xcc, 0xbb, 0xad, 0x20,
+ 0x95, 0xbd, 0x20, 0xf5, 0x01, 0x3e, 0x23, 0x9e, 0x9b, 0x3d, 0xdb, 0xfe, 0x38,
+ 0x3b, 0x23, 0x42, 0x57, 0x3b, 0x42, 0x99, 0x59, 0x3d, 0xf2, 0x9d, 0xba, 0xbd,
+ 0x92, 0xe5, 0x5d, 0x3d, 0x20, 0x17, 0x07, 0xbb, 0xf0, 0x57, 0x08, 0x3d, 0x7d,
+ 0xed, 0x91, 0xbc, 0x2e, 0xc4, 0x8d, 0xbd, 0xdb, 0x15, 0xc2, 0x3c, 0xaa, 0xc3,
+ 0xe6, 0xbb, 0x90, 0x5d, 0xb4, 0xbc, 0xee, 0xaa, 0x9a, 0x3d, 0x74, 0x6d, 0x22,
+ 0xbb, 0x00, 0x65, 0xc2, 0xb9, 0x37, 0x30, 0x07, 0xbd, 0x85, 0xbd, 0x60, 0xbb,
+ 0x2b, 0x40, 0xd7, 0x3c, 0xca, 0x82, 0x33, 0xbd, 0x29, 0xb2, 0x81, 0x3d, 0x08,
+ 0xee, 0xd5, 0x3c, 0x28, 0x34, 0xdf, 0x3c, 0x3d, 0x41, 0x67, 0xbd, 0x0c, 0x1e,
+ 0xf7, 0x3c, 0x9c, 0x86, 0xe4, 0x3c, 0x36, 0x7c, 0x07, 0x3d, 0xc7, 0x27, 0x04,
+ 0xbd, 0x45, 0xcb, 0x77, 0x3d, 0xcf, 0x66, 0x14, 0xbd, 0x29, 0xae, 0x3f, 0xbd,
+ 0x70, 0x86, 0x25, 0xbc, 0x08, 0xc9, 0xa6, 0x3c, 0x70, 0xa3, 0xa8, 0xbb, 0xbe,
+ 0x82, 0x49, 0x3d, 0x13, 0xa1, 0x73, 0xbd, 0xd5, 0x6c, 0x35, 0xbd, 0x98, 0xfa,
+ 0x3a, 0x3c, 0xff, 0x0c, 0xe2, 0xb9, 0x37, 0xe9, 0xf2, 0xbb, 0x78, 0x2d, 0x89,
+ 0xbd, 0xec, 0x2c, 0x88, 0xbc, 0x97, 0x7f, 0x2e, 0x3d, 0x9e, 0x32, 0x88, 0xbd,
+ 0x17, 0xdb, 0x20, 0xbd, 0xde, 0xbd, 0xc7, 0x3b, 0x30, 0x01, 0xf4, 0x3c, 0xf8,
+ 0x47, 0x05, 0xbd, 0xab, 0x0c, 0xdf, 0x3c, 0x8b, 0xdc, 0xa5, 0x3c, 0x62, 0x53,
+ 0x78, 0xbd, 0xf1, 0x6e, 0x56, 0x3d, 0x1e, 0xf2, 0x79, 0x3d, 0x0a, 0xce, 0x9b,
+ 0xbc, 0x18, 0xed, 0xaf, 0x3c, 0xd1, 0x1d, 0x8a, 0x3d, 0x78, 0xe8, 0x6e, 0x3c,
+ 0x1d, 0x2a, 0x84, 0x3d, 0x90, 0xb3, 0x80, 0x3d, 0x26, 0x1f, 0x74, 0x3d, 0x14,
+ 0xc6, 0x79, 0xbb, 0x37, 0x9d, 0x18, 0x3d, 0x1a, 0x28, 0x86, 0x3d, 0x8b, 0x8e,
+ 0x0f, 0xbd, 0x50, 0x3e, 0x82, 0xbc, 0x6f, 0x35, 0x70, 0xbd, 0xa5, 0xa6, 0x88,
+ 0x3d, 0xb6, 0xe7, 0x2a, 0xbd, 0x57, 0x46, 0x0a, 0x3d, 0xd6, 0xba, 0x34, 0xbd,
+ 0xc2, 0xf8, 0xc1, 0xbc, 0x2e, 0xe5, 0x30, 0xbd, 0xd5, 0x76, 0x85, 0x3d, 0xb4,
+ 0xeb, 0x88, 0xbd, 0xb5, 0x44, 0x40, 0x3d, 0x08, 0x9a, 0x8f, 0xbd, 0xe4, 0xa2,
+ 0xdf, 0x3c, 0x40, 0x83, 0xaf, 0x3a, 0xe0, 0xfb, 0x20, 0x3b, 0x84, 0xc3, 0xf1,
+ 0x3c, 0x13, 0x24, 0x88, 0xbd, 0x03, 0x21, 0x4a, 0xbd, 0xd6, 0x14, 0x39, 0x3d,
+ 0x10, 0x2c, 0x84, 0xbd, 0x47, 0xe0, 0xed, 0xbc, 0x8e, 0xfd, 0x91, 0xbc, 0x0e,
+ 0x42, 0x93, 0xbc, 0xe4, 0x43, 0x6b, 0x3d, 0x96, 0xc7, 0x36, 0x3d, 0xb0, 0xc2,
+ 0xac, 0xbb, 0x28, 0x29, 0x74, 0x3d, 0xf0, 0x10, 0xb5, 0xbb, 0x09, 0x5e, 0x6c,
+ 0x3d, 0xc3, 0xa9, 0x97, 0x3c, 0x4f, 0xc1, 0x9c, 0x3c, 0x4e, 0xc4, 0xf0, 0x3c,
+ 0x4e, 0x42, 0xfa, 0xbc, 0x9a, 0x53, 0x79, 0x3c, 0x9e, 0xc3, 0xd8, 0xbc, 0xfe,
+ 0x1e, 0x57, 0x3c, 0xa2, 0xec, 0x3f, 0xba, 0xfa, 0x34, 0x12, 0x3d, 0x43, 0x1c,
+ 0xd4, 0x3c, 0xf3, 0x3f, 0xa5, 0x3a, 0xda, 0xa7, 0x96, 0xbd, 0x6a, 0x5f, 0x2a,
+ 0x3d, 0xbd, 0x83, 0xd3, 0xbb, 0xb8, 0x9c, 0x5b, 0xbd, 0x67, 0xbb, 0x2d, 0x3c,
+ 0x44, 0x9a, 0xb0, 0xbc, 0x5c, 0x1b, 0xe6, 0x3c, 0x10, 0xfd, 0x67, 0xbd, 0x3b,
+ 0x8e, 0x94, 0xbd, 0xf3, 0x97, 0xca, 0xbb, 0x3a, 0xae, 0x3f, 0x3c, 0xd2, 0xbe,
+ 0x81, 0x3d, 0xd7, 0x2c, 0x86, 0xbd, 0x48, 0xc8, 0xbf, 0xbc, 0x00, 0x15, 0x5e,
+ 0xbc, 0x43, 0x09, 0x1d, 0x3d, 0x3d, 0xe7, 0x75, 0xbd, 0x38, 0xe4, 0x5f, 0x3c,
+ 0x8f, 0xe1, 0x09, 0x3d, 0xab, 0xa4, 0x16, 0xbd, 0x69, 0x15, 0x35, 0x3d, 0x6d,
+ 0x6a, 0x20, 0xbd, 0xa1, 0xd2, 0x9b, 0xbb, 0x89, 0xfb, 0xd1, 0x3c, 0x91, 0x05,
+ 0x82, 0x3d, 0x5c, 0x10, 0x3c, 0xbd, 0x7e, 0x4d, 0x5d, 0x3d, 0x5a, 0xac, 0x44,
+ 0xbc, 0xe5, 0x82, 0xfd, 0xbc, 0xd7, 0xc2, 0x82, 0xbd, 0xe7, 0xd3, 0x5f, 0x3d,
+ 0x3e, 0x16, 0x1e, 0x3d, 0x72, 0xcf, 0x9c, 0xbd, 0xf9, 0x44, 0xa2, 0xbc, 0x1c,
+ 0x64, 0x69, 0xba, 0x9e, 0xc1, 0x01, 0x3c, 0x07, 0xc9, 0x81, 0xbd, 0x18, 0x75,
+ 0x25, 0xbd, 0x12, 0x0b, 0xfd, 0xbc, 0x00, 0x54, 0xd5, 0x38, 0x73, 0x47, 0x85,
+ 0xbd, 0xaa, 0x08, 0x68, 0x3d, 0xa5, 0xf5, 0xa8, 0xbc, 0xd7, 0xea, 0x16, 0x3d,
+ 0x38, 0x81, 0x2a, 0xbd, 0xb0, 0x44, 0x45, 0x3d, 0xe6, 0x66, 0x71, 0x3d, 0x39,
+ 0x4d, 0x58, 0xbc, 0x6c, 0xd5, 0xbc, 0xbc, 0x40, 0x65, 0xab, 0x3c, 0x92, 0x4f,
+ 0x83, 0x3d, 0x46, 0xb4, 0x83, 0x3d, 0xf3, 0x7b, 0x5e, 0xbd, 0x8f, 0x77, 0x98,
+ 0xbc, 0x28, 0xd3, 0xe2, 0xbc, 0xa8, 0x94, 0xdc, 0xbc, 0xdc, 0x3a, 0x03, 0x39,
+ 0x6e, 0xd2, 0x81, 0x3c, 0x49, 0x64, 0xb8, 0xbc, 0xdb, 0x96, 0x03, 0xbd, 0xeb,
+ 0x90, 0x4c, 0x3d, 0xcc, 0xc7, 0x45, 0xbc, 0xca, 0xbc, 0x4a, 0xbd, 0xcc, 0xf4,
+ 0x90, 0x3c, 0x1e, 0x78, 0x93, 0x3b, 0xe8, 0x46, 0x68, 0xbd, 0x02, 0xe7, 0x78,
+ 0xbc, 0x95, 0x12, 0x48, 0xbd, 0x36, 0xd3, 0x60, 0xbd, 0x0b, 0x6a, 0x1c, 0x3d,
+ 0x9c, 0xa6, 0xb4, 0x3c, 0x20, 0xe6, 0xca, 0x3c, 0x52, 0x5e, 0x97, 0xbd, 0xe8,
+ 0x0f, 0x10, 0xbd, 0x01, 0xe8, 0x51, 0xbd, 0xf1, 0x2a, 0x0e, 0xbd, 0x1d, 0x03,
+ 0x85, 0x3a, 0x00, 0x7f, 0x50, 0x3d, 0x5a, 0x91, 0xd7, 0xbc, 0xc5, 0x55, 0x3b,
+ 0x3d, 0xd6, 0x47, 0x8a, 0xbd, 0x2d, 0x40, 0x80, 0x3d, 0x49, 0x84, 0xd9, 0xbb,
+ 0x2c, 0x7d, 0x5a, 0x3d, 0x94, 0x2d, 0xcd, 0x3c, 0x84, 0xe9, 0x90, 0xbd, 0x67,
+ 0xf2, 0x95, 0xbd, 0xf6, 0x29, 0x12, 0xbd, 0x7b, 0x2e, 0x64, 0x3d, 0xf5, 0x42,
+ 0x01, 0xbd, 0x42, 0x57, 0x2b, 0x3d, 0x0d, 0xd5, 0x99, 0xbd, 0xdf, 0xd5, 0x4b,
+ 0xbd, 0xc4, 0x97, 0x4a, 0xbd, 0xb1, 0xb5, 0xa0, 0x3c, 0x97, 0xa5, 0x13, 0xbb,
+ 0xda, 0x02, 0x11, 0x3d, 0x6e, 0x22, 0xce, 0xbb, 0x9f, 0x3e, 0xf0, 0x3c, 0x92,
+ 0x5d, 0xb5, 0xbc, 0xda, 0x5e, 0x45, 0x3d, 0x53, 0x93, 0x0a, 0x3d, 0xa4, 0xf0,
+ 0x8b, 0x3c, 0x4a, 0x4c, 0x04, 0x3d, 0x76, 0xc7, 0x8e, 0x3c, 0x55, 0xba, 0x39,
+ 0x3c, 0xa5, 0xed, 0x8c, 0xbd, 0x16, 0x33, 0x80, 0xbd, 0x32, 0xd7, 0x3b, 0x3d,
+ 0x07, 0xe9, 0x62, 0xbd, 0x6e, 0x01, 0x76, 0x3d, 0x42, 0x8b, 0x5e, 0xbd, 0x30,
+ 0x56, 0x07, 0x3d, 0x2c, 0x8b, 0xdb, 0xbc, 0xaf, 0xff, 0x8f, 0xbd, 0xf3, 0x4a,
+ 0x5d, 0xbd, 0xb0, 0x52, 0xb7, 0x3b, 0x29, 0x47, 0x9c, 0xbc, 0x5a, 0x8d, 0x30,
+ 0xbd, 0x71, 0xf8, 0x07, 0x3d, 0xc0, 0x46, 0x27, 0xbd, 0x93, 0x7d, 0x89, 0xbc,
+ 0xd2, 0x61, 0x39, 0x3d, 0x8d, 0x18, 0x69, 0x3c, 0x43, 0xd6, 0x18, 0xbc, 0x00,
+ 0x37, 0x0f, 0xba, 0x68, 0x4c, 0x4a, 0x3d, 0x4a, 0x6d, 0x6c, 0xbd, 0x63, 0x4a,
+ 0x7c, 0xbc, 0x0e, 0xed, 0x6b, 0xbd, 0x43, 0xc3, 0x97, 0xbd, 0xd0, 0x48, 0xa4,
+ 0xbb, 0xb4, 0x48, 0xa0, 0x3c, 0x89, 0x3c, 0x89, 0xbd, 0x00, 0xa7, 0xb4, 0x39,
+ 0xe2, 0xd3, 0x5e, 0x3d, 0x19, 0x2b, 0x10, 0xbc, 0x46, 0xef, 0x9a, 0xbd, 0x1c,
+ 0x32, 0xac, 0x3c, 0xe2, 0x57, 0x4b, 0x3d, 0xf7, 0x44, 0x41, 0x3d, 0x84, 0x06,
+ 0x89, 0xbc, 0x20, 0xf0, 0xb7, 0x3b, 0x3a, 0x7b, 0x50, 0x3d, 0xc0, 0xe4, 0x59,
+ 0xbd, 0x06, 0x58, 0x19, 0x3d, 0x80, 0x23, 0xe1, 0x3b, 0xe2, 0xdc, 0x8c, 0xbd,
+ 0xdc, 0x0a, 0x84, 0x3d, 0x96, 0xfe, 0x23, 0xbb, 0x45, 0x27, 0x40, 0xbd, 0x5d,
+ 0xc4, 0x0f, 0x3d, 0xcc, 0xe2, 0xab, 0xbc, 0x64, 0xec, 0xf8, 0xbc, 0x5e, 0x9d,
+ 0x1f, 0xbd, 0xa4, 0x84, 0x16, 0xbd, 0x26, 0x34, 0x99, 0xbd, 0xeb, 0x94, 0x91,
+ 0x3d, 0xae, 0x2b, 0x25, 0x3d, 0x7d, 0x8a, 0x2c, 0x3d, 0x65, 0xdb, 0xa1, 0xbc,
+ 0xb9, 0x5c, 0x2a, 0x3d, 0xe4, 0x06, 0x1d, 0xbb, 0xb6, 0xca, 0x17, 0x3d, 0xc8,
+ 0xd8, 0x12, 0x3d, 0x5c, 0xf3, 0x28, 0xbd, 0x44, 0x6b, 0x85, 0xbc, 0xa0, 0x1c,
+ 0x05, 0x3b, 0x1e, 0x13, 0x49, 0x3d, 0xd0, 0xbc, 0x07, 0x3d, 0xe4, 0xe8, 0x33,
+ 0x3c, 0xe1, 0xbe, 0x4c, 0x3d, 0xcf, 0xa9, 0x0d, 0x3c, 0x52, 0x61, 0x62, 0x3d,
+ 0x2e, 0x19, 0x63, 0x3d, 0xbe, 0x72, 0x86, 0x3d, 0x20, 0x7b, 0x34, 0x3c, 0xa0,
+ 0x1b, 0x6d, 0xbb, 0xbe, 0xdf, 0xd9, 0x3a, 0x6b, 0xae, 0x4e, 0x3d, 0x3b, 0x38,
+ 0x7d, 0xbd, 0xa1, 0xee, 0x3b, 0x3d, 0x51, 0x91, 0x37, 0x3b, 0x26, 0x34, 0xe4,
+ 0xbc, 0x13, 0x50, 0x8c, 0xbd, 0x5b, 0x2d, 0x52, 0xbd, 0xb3, 0xf6, 0x5d, 0xbc,
+ 0x82, 0x69, 0x3f, 0xbb, 0xf3, 0x6b, 0x14, 0x3d, 0xe8, 0x54, 0x9a, 0x3c, 0x42,
+ 0xa5, 0x35, 0x3d, 0x99, 0x10, 0x0b, 0xbc, 0x87, 0x55, 0x2d, 0xbd, 0x1f, 0x1a,
+ 0x16, 0xbd, 0x99, 0xaa, 0x16, 0xbc, 0x1a, 0x04, 0x3e, 0xbd, 0x62, 0x5f, 0x12,
+ 0x3d, 0xea, 0x90, 0x18, 0x3d, 0x32, 0x9f, 0x17, 0x3d, 0x1c, 0x6f, 0xba, 0x3c,
+ 0xce, 0xe2, 0x13, 0x3d, 0x47, 0xa2, 0xdb, 0xbc, 0xf7, 0x85, 0x4f, 0xbd, 0x24,
+ 0x60, 0xc8, 0xbc, 0xea, 0x00, 0x5e, 0xbd, 0x08, 0x73, 0x58, 0x3d, 0xf3, 0x42,
+ 0x85, 0xbd, 0x0e, 0xcd, 0x91, 0xbd, 0x3c, 0xba, 0xb1, 0xbc, 0x48, 0x41, 0x01,
+ 0x3d, 0xb1, 0xcf, 0x64, 0x3d, 0x6f, 0x25, 0x9a, 0xbc, 0xda, 0xaa, 0xce, 0x3c,
+ 0x22, 0x5f, 0x62, 0x3d, 0xf9, 0x36, 0x9b, 0xbd, 0x85, 0x6f, 0x81, 0x3d, 0x22,
+ 0xd8, 0x2e, 0xbd, 0x72, 0x49, 0x19, 0xbd, 0x21, 0x3c, 0xb9, 0xba, 0xc5, 0x69,
+ 0x8a, 0xbd, 0x68, 0xec, 0x08, 0xbd, 0xd9, 0x7e, 0x06, 0xbd, 0x0e, 0xa4, 0x36,
+ 0x3d, 0x9e, 0xbb, 0x65, 0xbd, 0xaf, 0x04, 0x81, 0x3d, 0x07, 0xa0, 0x7b, 0xbd,
+ 0xa7, 0x30, 0x51, 0xbd, 0x15, 0x8e, 0x05, 0x3c, 0xe0, 0x7a, 0x7c, 0x3c, 0x43,
+ 0x90, 0x04, 0x3d, 0x00, 0xf1, 0x4b, 0xbb, 0xe0, 0xe9, 0x29, 0x3b, 0x6f, 0x91,
+ 0x1d, 0xbd, 0xff, 0xc5, 0xd0, 0x3c, 0x6b, 0x02, 0xe3, 0x3c, 0xba, 0x1f, 0x53,
+ 0xbc, 0x0e, 0xd5, 0x7e, 0x3d, 0x54, 0xe0, 0x97, 0xbc, 0x00, 0x7a, 0xf2, 0xb9,
+ 0x66, 0x00, 0x84, 0x3d, 0x62, 0x17, 0x08, 0xbd, 0x5a, 0x30, 0x46, 0x3d, 0x75,
+ 0xb1, 0x37, 0xbd, 0x6f, 0x28, 0x55, 0x3c, 0xe0, 0xc4, 0x82, 0xbd, 0xfc, 0xf5,
+ 0xb2, 0xbc, 0x96, 0xdc, 0x0a, 0xbb, 0x83, 0x2a, 0x91, 0x3c, 0x29, 0x21, 0x40,
+ 0x3d, 0xff, 0x1f, 0x9c, 0xbd, 0x82, 0xb2, 0x5d, 0x3d, 0x8e, 0x14, 0x2c, 0x3d,
+ 0xec, 0xb2, 0xed, 0xbc, 0xb8, 0xa0, 0x3a, 0xbc, 0x66, 0x70, 0x11, 0xbc, 0x49,
+ 0xa6, 0xd0, 0xbc, 0x55, 0x34, 0x14, 0xbc, 0xb4, 0x65, 0x80, 0x3d, 0x76, 0x98,
+ 0x87, 0xbd, 0x23, 0x3d, 0xa2, 0x3c, 0xaa, 0xc5, 0x7e, 0x3d, 0xb7, 0x41, 0x91,
+ 0xbd, 0x9f, 0xe6, 0x80, 0xbd, 0x20, 0x0a, 0x13, 0x3c, 0xc8, 0xa0, 0xf3, 0x3c,
+ 0x51, 0xf3, 0x04, 0x3d, 0x61, 0x7e, 0x0c, 0x3d, 0xbe, 0x25, 0x47, 0x3d, 0x25,
+ 0x2b, 0x2b, 0x3d, 0xa9, 0x7a, 0x3f, 0xbd, 0xc2, 0xd4, 0xe3, 0xbc, 0x67, 0xc5,
+ 0x79, 0x3d, 0x10, 0x4b, 0xb0, 0x3c, 0xb8, 0xd1, 0x87, 0x3c, 0xd3, 0x7b, 0x54,
+ 0xbd, 0x81, 0x81, 0xcc, 0x3c, 0x85, 0x81, 0x15, 0x3d, 0xaa, 0xa8, 0xb0, 0x3b,
+ 0x4b, 0x90, 0xae, 0x3c, 0xaa, 0x38, 0x0f, 0x3d, 0x92, 0x82, 0x0a, 0xbd, 0xfd,
+ 0x99, 0x51, 0x3d, 0x90, 0x87, 0x0b, 0xbd, 0xc6, 0x71, 0x58, 0xbd, 0x4f, 0x17,
+ 0x86, 0x38, 0x03, 0x9a, 0x00, 0xbd, 0xeb, 0xae, 0x34, 0xbd, 0xab, 0x28, 0x19,
+ 0x3b, 0xc5, 0x48, 0x6c, 0xbd, 0x4a, 0xa3, 0x7c, 0xbd, 0x1f, 0xe7, 0x00, 0x3c,
+ 0xf4, 0xd8, 0xd8, 0x3c, 0xbc, 0x01, 0x59, 0xbd, 0xa9, 0x77, 0xb5, 0xbb, 0x67,
+ 0xc3, 0x82, 0x3d, 0x37, 0xd8, 0x8c, 0x3d, 0xea, 0x92, 0x59, 0x3d, 0x30, 0x97,
+ 0x31, 0x3d, 0x36, 0xb9, 0x23, 0xbb, 0x98, 0x99, 0x7f, 0xbd, 0x0b, 0xfd, 0x8e,
+ 0xbc, 0x80, 0xc6, 0x5c, 0xbd, 0xb2, 0xf0, 0x76, 0x3d, 0x7e, 0x01, 0xe5, 0xbc,
+ 0x0a, 0x94, 0x08, 0x3d, 0xb2, 0x9b, 0x7b, 0xbd, 0xdc, 0x27, 0x6b, 0xbd, 0x32,
+ 0x1e, 0x41, 0x3d, 0x4b, 0xd8, 0x8a, 0xbd, 0xe6, 0xdc, 0xd5, 0x3c, 0x72, 0xfd,
+ 0x09, 0xbd, 0x33, 0x80, 0xc5, 0xba, 0xbc, 0xdd, 0xc0, 0x3b, 0xf4, 0x31, 0x9a,
+ 0xbd, 0x29, 0x45, 0xd9, 0x3c, 0x02, 0x33, 0xd8, 0xbc, 0x97, 0x48, 0x73, 0x3d,
+ 0x7f, 0x13, 0x88, 0xbd, 0x9b, 0xed, 0x40, 0xbd, 0xae, 0x86, 0x7d, 0xbd, 0xea,
+ 0xa5, 0x4a, 0x3b, 0x8d, 0xd4, 0xd8, 0x3c, 0x57, 0xc1, 0x28, 0xbc, 0x6a, 0xb8,
+ 0x15, 0x3d, 0x30, 0xb0, 0xdc, 0xbb, 0x71, 0x34, 0x05, 0xbd, 0x39, 0x9c, 0x8a,
+ 0x3d, 0x98, 0xdd, 0x45, 0xbc, 0xf1, 0xcc, 0xcb, 0xbc, 0xe1, 0xf6, 0xd8, 0x3c,
+ 0xae, 0xb9, 0x18, 0xbb, 0x67, 0x50, 0x82, 0x3d, 0x20, 0x71, 0x82, 0x3d, 0x0e,
+ 0x45, 0x4a, 0xbd, 0x30, 0x86, 0xbe, 0xbb, 0x60, 0xc7, 0x07, 0x3d, 0xdb, 0xf7,
+ 0x04, 0xbd, 0x9a, 0xc3, 0xb2, 0xbc, 0xe0, 0x58, 0xf5, 0xbc, 0x12, 0x0a, 0x48,
+ 0x3d, 0xf7, 0x85, 0x2e, 0x3d, 0xab, 0x2b, 0xe6, 0x3b, 0xed, 0x4c, 0x15, 0xbc,
+ 0x99, 0x4b, 0xb1, 0xbc, 0xa1, 0x82, 0x09, 0x3d, 0x8b, 0x84, 0x09, 0xbd, 0x85,
+ 0x5a, 0x38, 0xbb, 0x83, 0xc7, 0x80, 0xbd, 0xfe, 0xf3, 0x67, 0xbd, 0x6e, 0x25,
+ 0x6f, 0x3d, 0x00, 0xa4, 0xf8, 0xbc, 0x3a, 0x24, 0x17, 0xbc, 0xb2, 0x0d, 0x8a,
+ 0x3c, 0x87, 0xac, 0x69, 0x3d, 0xcd, 0x5f, 0x89, 0xbc, 0x9e, 0x08, 0x7d, 0xbd,
+ 0x4c, 0xa4, 0xa0, 0xbc, 0x63, 0x21, 0x2c, 0x3d, 0x5a, 0x78, 0x71, 0xbd, 0xa2,
+ 0xe8, 0x71, 0x3d, 0x2b, 0xc9, 0xc1, 0xbb, 0x6f, 0x4f, 0x78, 0xbd, 0xa9, 0xee,
+ 0xdf, 0x3c, 0x3c, 0xe2, 0xb3, 0xbc, 0x64, 0xa2, 0x7d, 0xbc, 0xcc, 0x2c, 0x35,
+ 0x3d, 0xfd, 0x8c, 0x86, 0x3d, 0xe9, 0x57, 0xf3, 0x3c, 0xc1, 0x84, 0x82, 0x3d,
+ 0x8e, 0x7a, 0x6c, 0xbd, 0xf1, 0x40, 0x04, 0x3d, 0x7e, 0x17, 0x5b, 0x3d, 0x74,
+ 0xba, 0x83, 0x3a, 0x6f, 0x01, 0x86, 0xbd, 0x62, 0x58, 0x69, 0xbd, 0x33, 0xcd,
+ 0x07, 0x3d, 0x6e, 0xc5, 0x8c, 0xbd, 0x5a, 0x4c, 0x99, 0x3c, 0x87, 0xb8, 0xf0,
+ 0x3c, 0xc1, 0x64, 0x8a, 0x3c, 0x4c, 0x69, 0x23, 0xbd, 0x93, 0x75, 0x80, 0x3d,
+ 0x54, 0x27, 0x87, 0xbd, 0xdc, 0x3e, 0x62, 0x3d, 0x9e, 0xdb, 0x43, 0xbc, 0x03,
+ 0xd4, 0x65, 0xbd, 0x4c, 0xb6, 0x59, 0x3d, 0xc4, 0xa1, 0xe8, 0xbc, 0xf3, 0xdc,
+ 0x87, 0x3d, 0xf5, 0x34, 0x82, 0xbc, 0x4e, 0x2d, 0xe2, 0x3b, 0xd6, 0x1e, 0x3d,
+ 0xbd, 0xea, 0x0c, 0x83, 0x3d, 0x34, 0x3e, 0x20, 0xbd, 0xb6, 0x87, 0x77, 0x3c,
+ 0x9c, 0x9a, 0xe4, 0xba, 0x48, 0x21, 0xa5, 0xbc, 0xb3, 0x81, 0x89, 0x3d, 0xf4,
+ 0x2c, 0x49, 0x3d, 0x98, 0xb5, 0xd6, 0xbc, 0x88, 0xdb, 0x30, 0xbd, 0xa4, 0x2f,
+ 0x88, 0xbc, 0x67, 0xc1, 0xb6, 0xbc, 0x8e, 0xba, 0xb8, 0xbc, 0xdd, 0x22, 0xc2,
+ 0x3c, 0xaf, 0x08, 0x8f, 0x3b, 0xa5, 0x85, 0xcb, 0xbc, 0x26, 0x24, 0x2c, 0x3d,
+ 0x2c, 0x73, 0x35, 0x3c, 0xf9, 0xb2, 0xaf, 0xbb, 0xf2, 0x50, 0x2f, 0xbd, 0x15,
+ 0x10, 0x31, 0x3c, 0x75, 0xdb, 0x67, 0x3d, 0x5c, 0xe2, 0xfe, 0x3c, 0x51, 0xe0,
+ 0x8d, 0x3d, 0x1c, 0x25, 0xb9, 0x3c, 0xcf, 0x20, 0x80, 0x3d, 0x5c, 0x61, 0xdf,
+ 0x3c, 0x9a, 0x2e, 0x5d, 0x3d, 0x4d, 0x63, 0xd8, 0x3c, 0x23, 0x0e, 0x32, 0xbc,
+ 0x6a, 0xaa, 0x61, 0x3d, 0xa3, 0x74, 0x86, 0xbd, 0x60, 0x32, 0x73, 0x3b, 0xe3,
+ 0x8b, 0x73, 0xbc, 0x6d, 0x26, 0x40, 0x3d, 0x8c, 0xbb, 0xbf, 0xbb, 0x4f, 0x89,
+ 0xf9, 0x3c, 0x6a, 0xfe, 0x0b, 0x3d, 0x43, 0x89, 0x3f, 0xbd, 0xe6, 0x1f, 0xda,
+ 0xbc, 0xdf, 0x48, 0x36, 0xbd, 0xd8, 0x5a, 0x8f, 0xbd, 0x58, 0x20, 0xfc, 0x3c,
+ 0xec, 0xc0, 0x69, 0x3d, 0xc9, 0x17, 0x06, 0xbd, 0xc1, 0x2b, 0xd9, 0x3b, 0xba,
+ 0x7f, 0x73, 0x3a, 0xde, 0xd4, 0xbd, 0xbc, 0x9f, 0x94, 0xd6, 0x3c, 0xfe, 0xb3,
+ 0x56, 0x3c, 0xbd, 0xda, 0xd0, 0xbc, 0x9c, 0x13, 0x6c, 0xbc, 0x10, 0x12, 0xab,
+ 0x3c, 0x94, 0x9f, 0x1d, 0xbd, 0x78, 0xbb, 0x9d, 0x3c, 0x6c, 0xca, 0x00, 0xbd,
+ 0x4c, 0xb7, 0xb8, 0x3c, 0x09, 0x38, 0xd3, 0x3c, 0x4c, 0x70, 0x91, 0x3c, 0xe9,
+ 0x6b, 0x26, 0xbc, 0x57, 0x19, 0xa4, 0x3c, 0xd2, 0xf7, 0x54, 0x3d, 0x0f, 0x9a,
+ 0x48, 0x3d, 0xd0, 0xe2, 0x8f, 0x3b, 0x58, 0x63, 0x13, 0x3c, 0x81, 0xda, 0x1b,
+ 0xbd, 0x77, 0x24, 0x83, 0x3c, 0xd7, 0x64, 0xc7, 0x3b, 0xb0, 0xf6, 0x6b, 0xbc,
+ 0x8a, 0xaa, 0x62, 0x3d, 0xa4, 0x13, 0xbb, 0xbc, 0xe8, 0x06, 0xb3, 0x3c, 0xb1,
+ 0x41, 0x77, 0x3d, 0x1c, 0xac, 0xe0, 0x3c, 0x40, 0x0f, 0x25, 0x3c, 0x89, 0xc0,
+ 0x54, 0x3c, 0xec, 0x1d, 0x7a, 0x3d, 0x41, 0x1e, 0x31, 0x3d, 0x51, 0x3e, 0x26,
+ 0x3d, 0x00, 0x55, 0x39, 0xbd, 0x2e, 0x9d, 0x7f, 0x3d, 0x2f, 0xe9, 0x4d, 0xbd,
+ 0x46, 0x85, 0x35, 0xbd, 0xa2, 0x67, 0xf8, 0x3c, 0x16, 0x0f, 0x82, 0xbd, 0xcd,
+ 0x48, 0x9a, 0x3b, 0x62, 0xd9, 0x08, 0x3d, 0x67, 0x0f, 0x5a, 0xbc, 0xd0, 0x09,
+ 0x56, 0xbc, 0x31, 0x38, 0xda, 0xbc, 0x67, 0xf7, 0xa1, 0xbc, 0x8c, 0x2a, 0x79,
+ 0xbd, 0xb3, 0xf5, 0xb1, 0xbc, 0xe8, 0xf4, 0x8b, 0xbd, 0x5f, 0x45, 0x11, 0xbd,
+ 0x9f, 0x79, 0x1e, 0xbd, 0xf5, 0xbf, 0x86, 0x3d, 0x4e, 0xd8, 0xed, 0xbc, 0xcd,
+ 0x66, 0x5b, 0x3c, 0x4a, 0x74, 0x8f, 0x3b, 0xe3, 0x98, 0x4f, 0x3d, 0x0d, 0x54,
+ 0x91, 0xbb, 0x24, 0xb6, 0x1b, 0x3d, 0xd8, 0x0d, 0xb7, 0xbc, 0x04, 0x76, 0x31,
+ 0xbd, 0x10, 0x43, 0x11, 0xbd, 0x0e, 0xc2, 0x02, 0xbd, 0x88, 0x66, 0x43, 0x3c,
+ 0xb5, 0xda, 0x95, 0xbb, 0x07, 0x09, 0x28, 0xbd, 0x22, 0xcc, 0x19, 0xbd, 0xf0,
+ 0x47, 0xfe, 0x3c, 0x10, 0x43, 0xfb, 0xbc, 0x5f, 0x5f, 0x2c, 0x3d, 0xfb, 0xce,
+ 0x18, 0xbc, 0xcd, 0x87, 0x6a, 0x3d, 0xee, 0xf6, 0x61, 0xbd, 0x37, 0x86, 0x12,
+ 0x3d, 0x4c, 0x01, 0xb7, 0x3c, 0x8c, 0x44, 0x19, 0xbd, 0xc1, 0x3d, 0xa6, 0x3c,
+ 0xcd, 0xf1, 0x5e, 0xbb, 0x9e, 0xe0, 0x41, 0x3d, 0x8c, 0xfb, 0x95, 0xbd, 0xa7,
+ 0x04, 0xc1, 0xbb, 0xcc, 0xf0, 0x25, 0xbd, 0x1c, 0x72, 0x81, 0x3c, 0x76, 0xf2,
+ 0x6d, 0x3d, 0x3b, 0xf9, 0x86, 0x3d, 0xc2, 0xbe, 0x4a, 0x3d, 0x5d, 0x80, 0x5a,
+ 0xbd, 0x63, 0x28, 0x3b, 0xbd, 0xb4, 0xb7, 0x5e, 0x3d, 0x04, 0x5b, 0x57, 0x3d,
+ 0x64, 0xac, 0x56, 0xbd, 0xb6, 0x67, 0x35, 0xbd, 0xb1, 0xc7, 0x0b, 0x3d, 0x0c,
+ 0xae, 0x2d, 0x3d, 0xcc, 0x4c, 0x7d, 0xbc, 0x2f, 0x01, 0x34, 0x3d, 0xa8, 0x4e,
+ 0x63, 0x3d, 0xa3, 0xad, 0xb8, 0xbc, 0x32, 0x0c, 0x25, 0xbd, 0x66, 0x15, 0xab,
+ 0xbc, 0x8a, 0x1a, 0x10, 0x3d, 0xca, 0xcb, 0x46, 0x3d, 0x4a, 0xe5, 0xfe, 0x3c,
+ 0x4a, 0xcc, 0xa6, 0x3c, 0x2e, 0x05, 0x4f, 0xbb, 0x31, 0xef, 0x62, 0xbc, 0xa0,
+ 0xeb, 0x7c, 0xbd, 0x49, 0x9b, 0x13, 0x3d, 0x07, 0x55, 0x82, 0x3d, 0xca, 0x81,
+ 0x1d, 0xbd, 0x67, 0xc0, 0x52, 0x3b, 0xae, 0xd6, 0x0d, 0x3d, 0x53, 0x79, 0x70,
+ 0xbd, 0x9c, 0x93, 0xa8, 0xbc, 0x5b, 0xbb, 0x58, 0x3d, 0x73, 0x1d, 0x0b, 0xbd,
+ 0xe8, 0xe9, 0x0f, 0x3d, 0x3b, 0xda, 0xbd, 0xbb, 0x66, 0x91, 0x80, 0x3d, 0x46,
+ 0xcc, 0xe8, 0xbc, 0x86, 0xe3, 0x32, 0x3d, 0x37, 0x9f, 0x5f, 0xbc, 0x9a, 0x06,
+ 0x19, 0xbd, 0xec, 0xb6, 0x78, 0xbd, 0xd9, 0xd5, 0x49, 0xbd, 0xe8, 0xf9, 0x59,
+ 0x3c, 0x48, 0x30, 0x8c, 0x3c, 0x03, 0x1d, 0x8a, 0x3d, 0x4d, 0x47, 0xc6, 0x3c,
+ 0x77, 0x88, 0x9d, 0xbd, 0x3e, 0xf0, 0x63, 0xbd, 0x83, 0x92, 0x2b, 0xbd, 0x9a,
+ 0xb0, 0x05, 0x3d, 0xee, 0x10, 0x86, 0x3c, 0xf1, 0xb2, 0x92, 0xbd, 0x2a, 0x0e,
+ 0x3f, 0xbd, 0x6c, 0xfc, 0xbb, 0xbb, 0x62, 0xee, 0x16, 0x3a, 0xf8, 0xdb, 0xa1,
+ 0x3c, 0x1c, 0xce, 0x43, 0xbd, 0xd3, 0xbf, 0x64, 0xbd, 0xe6, 0xb9, 0xc4, 0x3c,
+ 0x43, 0x6b, 0x63, 0x3c, 0xe8, 0xbd, 0x87, 0x3c, 0x95, 0x2d, 0x29, 0x3d, 0x10,
+ 0xbd, 0x7a, 0xbc, 0x26, 0xe3, 0x8e, 0xbd, 0xa1, 0x64, 0x70, 0xbd, 0xf7, 0x22,
+ 0x8f, 0x3d, 0x68, 0x73, 0x95, 0xbc, 0x33, 0x1c, 0xdb, 0xbc, 0x95, 0x44, 0x11,
+ 0x3d, 0xc5, 0x6c, 0x86, 0xbd, 0xf8, 0x9b, 0x8a, 0xbd, 0x48, 0xba, 0x13, 0x3c,
+ 0x6a, 0x54, 0x28, 0xbd, 0xd0, 0xaa, 0x15, 0xbd, 0x32, 0x4e, 0x56, 0x3d, 0x8e,
+ 0x65, 0x4b, 0x3d, 0x62, 0x4d, 0x76, 0xbc, 0x65, 0x5f, 0x05, 0x3d, 0x40, 0xb5,
+ 0xb5, 0xbb, 0x1a, 0xd6, 0x83, 0x3d, 0x9d, 0xea, 0xa7, 0x3b, 0x73, 0x19, 0x59,
+ 0x3c, 0xb2, 0x83, 0x25, 0xbd, 0x38, 0x93, 0x9e, 0x3c, 0x95, 0xe2, 0x7a, 0x3c,
+ 0xc6, 0x09, 0x95, 0xbd, 0xfe, 0x8a, 0x84, 0x3d, 0x09, 0x99, 0x8c, 0x3d, 0x3d,
+ 0xb5, 0x0e, 0xbd, 0x1e, 0x91, 0x8c, 0xbd, 0xc1, 0x52, 0xce, 0x3c, 0xc2, 0xa5,
+ 0x88, 0xbd, 0x9c, 0x3f, 0x97, 0xbd, 0x79, 0x5b, 0xd3, 0x3c, 0x20, 0xf6, 0xfd,
+ 0x3c, 0xcf, 0x37, 0x5f, 0x3c, 0x41, 0xc8, 0x6e, 0xbd, 0xa4, 0xde, 0xf8, 0x3c,
+ 0xe6, 0x88, 0x19, 0xbc, 0xe3, 0x00, 0x01, 0x3d, 0xa7, 0x4e, 0x1e, 0xbd, 0xb8,
+ 0xa1, 0x65, 0xbd, 0xbf, 0xfd, 0x81, 0xbd, 0xf0, 0x80, 0xe8, 0xbb, 0x3c, 0x62,
+ 0xdc, 0x3c, 0x02, 0x96, 0x70, 0x3d, 0x05, 0x55, 0x7d, 0xbd, 0x66, 0xb3, 0x15,
+ 0x3d, 0xa7, 0x8e, 0x16, 0xbd, 0xf5, 0xcf, 0x06, 0x3d, 0x5b, 0x78, 0xdf, 0xbc,
+ 0x54, 0xcc, 0x2c, 0xbd, 0xdc, 0x15, 0xc6, 0xbc, 0xeb, 0xaf, 0x87, 0x3d, 0x3b,
+ 0x65, 0x95, 0xbd, 0x52, 0x02, 0x65, 0x3d, 0x0a, 0x99, 0x0a, 0xbc, 0x6a, 0xfd,
+ 0x67, 0x3d, 0x00, 0x53, 0x3e, 0xbd, 0xa0, 0xbe, 0xe4, 0xbc, 0xaa, 0x76, 0xf4,
+ 0x3c, 0xd9, 0x22, 0x3c, 0xbd, 0x28, 0xa2, 0x3b, 0x3b, 0x44, 0x27, 0x7e, 0xbd,
+ 0xb3, 0xd4, 0xa8, 0x3c, 0xb3, 0x30, 0x29, 0x3b, 0xd0, 0x0f, 0x3b, 0x3b, 0x74,
+ 0x3e, 0x8a, 0xbd, 0x2f, 0x61, 0x1f, 0xbd, 0x58, 0x65, 0x4a, 0xbd, 0xd7, 0xb7,
+ 0xf8, 0xbc, 0xfd, 0x91, 0x25, 0xbd, 0xfd, 0xd2, 0x39, 0xbd, 0x49, 0xa6, 0x82,
+ 0x3d, 0xd8, 0x60, 0x04, 0x3d, 0xf8, 0x76, 0xac, 0x3c, 0x18, 0x61, 0x2d, 0xbc,
+ 0xd6, 0xf2, 0x0b, 0xbd, 0x18, 0x53, 0x01, 0x3c, 0xac, 0x10, 0xb7, 0x3c, 0x22,
+ 0xab, 0xd0, 0xbc, 0x40, 0x50, 0x3b, 0x3a, 0xf4, 0x70, 0x44, 0xbd, 0xb8, 0xaa,
+ 0x81, 0xbd, 0x09, 0x70, 0x8f, 0x3c, 0x51, 0x00, 0xc5, 0xbc, 0x41, 0x17, 0xb8,
+ 0xbc, 0xd2, 0xe1, 0x07, 0xbd, 0x58, 0xa0, 0x95, 0xbd, 0x7d, 0x24, 0x4b, 0xbd,
+ 0x47, 0x50, 0x5f, 0x3d, 0x4a, 0x41, 0x1e, 0x3d, 0xc1, 0x38, 0x21, 0xbd, 0xbd,
+ 0x82, 0x13, 0x3d, 0xdb, 0xe8, 0x4d, 0xbd, 0x76, 0x8d, 0x1d, 0xbc, 0x96, 0x2f,
+ 0x72, 0x3d, 0xa9, 0x4c, 0x56, 0xbd, 0xe3, 0x39, 0x79, 0x3d, 0xf2, 0xaa, 0x0e,
+ 0x3d, 0xee, 0xfa, 0x27, 0x3d, 0x70, 0x0c, 0x24, 0x3c, 0x3c, 0xf8, 0x7e, 0xbd,
+ 0xc2, 0x3b, 0x55, 0xbb, 0x83, 0x9c, 0xcc, 0x3b, 0x52, 0x0f, 0x5d, 0x3d, 0x86,
+ 0x3f, 0x3a, 0xbc, 0xf0, 0xbb, 0xbc, 0xbb, 0xe0, 0xff, 0xaf, 0x3c, 0x12, 0xca,
+ 0x22, 0x3c, 0xd4, 0x78, 0x41, 0xbc, 0xc9, 0xaa, 0x1f, 0xbd, 0x7c, 0x59, 0x9e,
+ 0x3a, 0x1a, 0x15, 0x4d, 0xbc, 0x25, 0x53, 0xfa, 0xbc, 0x6e, 0xbb, 0x82, 0xbc,
+ 0xc2, 0x7d, 0x8d, 0x3c, 0xa8, 0x73, 0x19, 0xbd, 0x04, 0x34, 0x4c, 0xbc, 0xbb,
+ 0x37, 0x5e, 0x3d, 0xb8, 0xc0, 0x30, 0x3d, 0xac, 0x71, 0x9d, 0xbd, 0xf8, 0x58,
+ 0x2a, 0x3b, 0xd0, 0x94, 0xa4, 0x3b, 0xeb, 0x76, 0x5a, 0xbc, 0xcf, 0x43, 0x94,
+ 0x3c, 0x48, 0x10, 0x66, 0x3d, 0x35, 0xee, 0x78, 0xbc, 0x29, 0x9a, 0x64, 0x3c,
+ 0x39, 0x2a, 0x27, 0x3d, 0xab, 0x94, 0x8a, 0x3d, 0xb2, 0x3c, 0x0f, 0xbd, 0x76,
+ 0x7f, 0x46, 0xbd, 0x68, 0xb2, 0x96, 0xbc, 0x98, 0xa2, 0x61, 0x3d, 0x97, 0x72,
+ 0x92, 0xbd, 0xde, 0xac, 0x51, 0xbd, 0x03, 0xb8, 0x74, 0x3d, 0xb5, 0x3b, 0x8a,
+ 0xbc, 0x70, 0xbf, 0x42, 0xbd, 0xf0, 0x0f, 0xf9, 0x3b, 0xb6, 0x4d, 0xc5, 0x3c,
+ 0x16, 0xeb, 0x72, 0x3d, 0x90, 0x81, 0xcd, 0xbb, 0x00, 0x8b, 0x0b, 0xbc, 0xb1,
+ 0x02, 0xa5, 0x3c, 0xee, 0xa7, 0x7d, 0xbd, 0xf0, 0x26, 0x0e, 0xbd, 0x1c, 0xb0,
+ 0x52, 0xbd, 0x80, 0xdd, 0x2f, 0xbd, 0x43, 0xbb, 0xeb, 0xbc, 0xf9, 0xa6, 0xd1,
+ 0xbc, 0xb1, 0x67, 0x29, 0xbd, 0xaa, 0xee, 0xf4, 0x3b, 0xc4, 0xab, 0x59, 0xbd,
+ 0xb8, 0x83, 0x36, 0x3d, 0x20, 0xfc, 0x60, 0x3b, 0x28, 0xdd, 0x59, 0xbd, 0x5c,
+ 0x16, 0xd1, 0xbc, 0x00, 0xbc, 0xcb, 0xbc, 0x9f, 0x8e, 0x62, 0xbc, 0x8e, 0xde,
+ 0x53, 0xbd, 0xec, 0x4f, 0x26, 0x3d, 0xde, 0x94, 0x46, 0xbd, 0x50, 0x30, 0x0e,
+ 0x3c, 0x20, 0xef, 0x7b, 0xbd, 0x83, 0x86, 0x38, 0x3c, 0x5a, 0xff, 0x1f, 0xbd,
+ 0x61, 0x3e, 0xd5, 0xbc, 0x0b, 0xac, 0x65, 0x3c, 0xfd, 0x06, 0xa5, 0x3c, 0x2c,
+ 0x94, 0x47, 0xbd, 0xe2, 0xc3, 0x7e, 0x3d, 0x40, 0xac, 0x67, 0x3d, 0xa4, 0x7a,
+ 0x77, 0xbc, 0xfc, 0x13, 0xe7, 0x3c, 0x56, 0x69, 0x80, 0x3d, 0x27, 0x58, 0x18,
+ 0x3d, 0x1e, 0x95, 0x0e, 0x3d, 0x3f, 0xa8, 0x41, 0x3d, 0x0f, 0xbb, 0x16, 0xbd,
+ 0x45, 0x72, 0x89, 0xbd, 0xf1, 0xd2, 0xfb, 0x3c, 0x8f, 0x6b, 0x65, 0x3d, 0x50,
+ 0x8a, 0x05, 0x3c, 0x99, 0x24, 0x90, 0xbd, 0xc8, 0x4d, 0x4f, 0x3d, 0x80, 0xb8,
+ 0xd2, 0x3b, 0xe5, 0x51, 0xae, 0x3b, 0x25, 0x33, 0x2a, 0xbd, 0x05, 0x12, 0xd7,
+ 0x3c, 0xc2, 0x1b, 0x33, 0x3c, 0x5f, 0x8d, 0x07, 0xbc, 0x79, 0x60, 0x26, 0x3d,
+ 0xf7, 0x63, 0x83, 0x3d, 0x88, 0xb4, 0xc7, 0xbc, 0x40, 0x5d, 0xb0, 0xba, 0x6e,
+ 0xaf, 0x39, 0xbd, 0x50, 0x93, 0xf3, 0x3c, 0xc4, 0x3b, 0x53, 0x3c, 0xf9, 0x8b,
+ 0x60, 0xbd, 0x74, 0x4e, 0xbd, 0x3c, 0x40, 0xe6, 0xdd, 0x3c, 0x30, 0x78, 0x18,
+ 0x3d, 0xaa, 0xed, 0x76, 0x3d, 0xd7, 0x20, 0x4b, 0x3d, 0x30, 0x08, 0xd1, 0x3c,
+ 0x52, 0xf0, 0x61, 0x3d, 0x75, 0xea, 0x6a, 0x3d, 0x93, 0xef, 0xeb, 0x3c, 0x35,
+ 0xad, 0x96, 0xbd, 0xca, 0x41, 0x21, 0x3d, 0x59, 0x18, 0x1e, 0x3d, 0x2c, 0xa8,
+ 0x81, 0xbd, 0x7e, 0xdb, 0xd7, 0x3c, 0xfc, 0x7e, 0x1b, 0xbd, 0x26, 0x25, 0x86,
+ 0x3d, 0xa9, 0x58, 0x9b, 0xbd, 0x0a, 0xef, 0xfa, 0xbc, 0xfe, 0x74, 0x74, 0x3d,
+ 0xb0, 0x51, 0x80, 0xbd, 0x29, 0x42, 0x88, 0x3a, 0x56, 0xe7, 0x8c, 0xbb, 0x16,
+ 0x5f, 0x43, 0x3d, 0x5b, 0x1d, 0x4c, 0x3c, 0xae, 0x9d, 0xbd, 0xbb, 0xbc, 0xcf,
+ 0x44, 0xbc, 0x78, 0x8d, 0x6c, 0x3d, 0x30, 0x99, 0x2c, 0x3d, 0x52, 0x17, 0x9e,
+ 0xbc, 0x3d, 0x52, 0x18, 0xbd, 0xfa, 0xcc, 0xb4, 0x3c, 0x9d, 0x56, 0x8d, 0x3d,
+ 0x7e, 0xa0, 0x18, 0x3d, 0x88, 0x7b, 0x94, 0xbd, 0xe8, 0x02, 0xc7, 0xbc, 0x08,
+ 0x22, 0x37, 0x3c, 0x18, 0x3b, 0x5d, 0xbd, 0xa4, 0xbb, 0xb4, 0x3c, 0xb0, 0x8d,
+ 0x06, 0x3d, 0xe8, 0xf4, 0xb0, 0xbb, 0xb4, 0x8b, 0x31, 0xbc, 0xf8, 0xdf, 0xf4,
+ 0x3c, 0x29, 0x19, 0x80, 0xbb, 0x29, 0x4c, 0x60, 0x3c, 0x4b, 0x11, 0x93, 0xbd,
+ 0x4b, 0xbd, 0x66, 0xbd, 0x62, 0x8e, 0x88, 0x3c, 0xfe, 0xa2, 0x37, 0x3d, 0x41,
+ 0xe1, 0x36, 0xbd, 0xbe, 0x7b, 0xc1, 0x3b, 0x6c, 0xff, 0xba, 0x3c, 0x8f, 0xae,
+ 0xab, 0xbc, 0x7b, 0x37, 0xd5, 0xbc, 0x0d, 0xac, 0x18, 0xbd, 0xf2, 0xcb, 0x1d,
+ 0x3d, 0xbb, 0xb0, 0x30, 0x3c, 0xbb, 0x1a, 0x41, 0x3b, 0x5b, 0x36, 0x11, 0xbd,
+ 0x96, 0xb3, 0x86, 0x3d, 0x0b, 0xcb, 0xf9, 0x3c, 0x5c, 0x23, 0x60, 0xbc, 0x62,
+ 0xe1, 0x33, 0xbd, 0x10, 0x91, 0x5e, 0x3d, 0xdf, 0xc8, 0x6c, 0xbd, 0xe7, 0x19,
+ 0x60, 0x3d, 0x87, 0xa0, 0x5b, 0x3c, 0x8a, 0xc5, 0x65, 0x3d, 0x6c, 0x2e, 0x31,
+ 0x3d, 0x99, 0xc7, 0x1a, 0x3d, 0xe8, 0xe6, 0x6f, 0x3c, 0x10, 0x95, 0xd9, 0x3b,
+ 0x1d, 0xdd, 0x19, 0xbd, 0xdc, 0xfe, 0x32, 0x3d, 0x83, 0x85, 0x05, 0x3d, 0xd8,
+ 0x24, 0x16, 0x3d, 0xf7, 0x73, 0x20, 0xbd, 0x77, 0x07, 0xc4, 0x3c, 0xdf, 0xd0,
+ 0x92, 0x3c, 0x1a, 0x7d, 0x2c, 0xba, 0xb0, 0x19, 0xe8, 0xbc, 0x9e, 0x97, 0xec,
+ 0xbb, 0x33, 0xb2, 0xb1, 0x3c, 0x89, 0xde, 0x81, 0xbd, 0x9d, 0xae, 0x57, 0xbc,
+ 0x31, 0xd9, 0xbb, 0x3c, 0xa0, 0x2d, 0x27, 0x3d, 0x00, 0x99, 0x43, 0x3c, 0x2e,
+ 0x32, 0x9d, 0xbc, 0xa2, 0x6d, 0x81, 0x3d, 0x38, 0xce, 0xc3, 0xbc, 0x8e, 0xd7,
+ 0x7a, 0x3d, 0x2a, 0x89, 0x00, 0xbc, 0x2e, 0x52, 0x9f, 0xbc, 0x20, 0x47, 0x4d,
+ 0xbd, 0xd9, 0x79, 0x5f, 0x3d, 0x09, 0x2c, 0x97, 0x3c, 0x9c, 0x28, 0x5f, 0x3b,
+ 0x9d, 0xd3, 0x65, 0x3d, 0x44, 0x63, 0xbb, 0xbc, 0x0c, 0xfe, 0xc0, 0x3c, 0x71,
+ 0xfa, 0x08, 0xbd, 0x40, 0x4a, 0xac, 0x3b, 0xca, 0x9d, 0x7a, 0x3d, 0xbd, 0x1c,
+ 0x52, 0xbd, 0xc8, 0x90, 0x0e, 0x3d, 0x6b, 0x89, 0xbd, 0xbc, 0xa0, 0x74, 0x77,
+ 0x3c, 0x8a, 0xe4, 0x44, 0xbd, 0x5f, 0x81, 0x56, 0x3c, 0x39, 0x9a, 0xc9, 0xbc,
+ 0x33, 0xf4, 0x07, 0xbd, 0x48, 0xe0, 0x94, 0xbd, 0x3f, 0xfc, 0xdf, 0xbc, 0x41,
+ 0x3e, 0xa9, 0x3c, 0x18, 0x06, 0x0e, 0x3c, 0xfb, 0xb9, 0xe2, 0x3c, 0x12, 0x14,
+ 0x26, 0xbc, 0x8b, 0x15, 0x97, 0xbd, 0x43, 0xc8, 0x23, 0xbd, 0x8e, 0x30, 0xf7,
+ 0x3a, 0x4c, 0xdc, 0x4f, 0xbd, 0x52, 0x50, 0x3c, 0xbc, 0xda, 0x70, 0x1b, 0x3d,
+ 0xfc, 0xbc, 0x3a, 0x3d, 0x76, 0x5a, 0x39, 0xbd, 0x48, 0xc3, 0x50, 0x3d, 0xf9,
+ 0xd3, 0x81, 0xbd, 0x1e, 0xdf, 0x09, 0xbd, 0xd3, 0xa3, 0x7a, 0x3d, 0x71, 0x42,
+ 0x6b, 0xbd, 0x7e, 0x3a, 0x4e, 0x3d, 0xd0, 0x26, 0xc5, 0xbb, 0xde, 0x7d, 0x2d,
+ 0x3d, 0xc0, 0xda, 0xd8, 0xba, 0x18, 0x43, 0x63, 0x3c, 0xb5, 0x93, 0xb6, 0x3c,
+ 0xc7, 0xee, 0x49, 0xbd, 0xb2, 0x73, 0x47, 0xbd, 0xa6, 0x66, 0x3b, 0x3d, 0xea,
+ 0xa2, 0x04, 0xbd, 0xde, 0x2b, 0x44, 0x3d, 0x41, 0x80, 0xee, 0x3c, 0x11, 0xbe,
+ 0x72, 0x3c, 0x46, 0xdf, 0x63, 0xbc, 0x4d, 0xc3, 0xfb, 0xbc, 0x3d, 0xbc, 0x86,
+ 0x3d, 0xf7, 0xad, 0x02, 0xbd, 0x7d, 0xb7, 0x0f, 0xbd, 0x99, 0x8c, 0x51, 0x3c,
+ 0x85, 0xce, 0x50, 0xbd, 0x0d, 0xe0, 0x41, 0x3d, 0x3a, 0xb3, 0x21, 0xbb, 0xd0,
+ 0x0b, 0xdd, 0xbb, 0x94, 0x62, 0x25, 0xbd, 0xc0, 0xab, 0xd1, 0xbc, 0xf0, 0xf6,
+ 0x89, 0xbb, 0xbe, 0x10, 0xb9, 0xbc, 0x68, 0x2e, 0x3a, 0x3c, 0x22, 0x34, 0x20,
+ 0xbd, 0x4d, 0xd9, 0x75, 0xbc, 0x74, 0x5d, 0x00, 0x3d, 0xf3, 0xd5, 0x5e, 0x3d,
+ 0x7c, 0x61, 0xcc, 0xbc, 0x56, 0x76, 0x13, 0x3d, 0xda, 0x68, 0xe3, 0x3b, 0xa3,
+ 0xa1, 0x89, 0x3d, 0xd0, 0xfa, 0x16, 0x3d, 0xf1, 0x86, 0x48, 0x3c, 0x71, 0x81,
+ 0x83, 0x3b, 0x31, 0x30, 0x2a, 0xbd, 0x4e, 0xc0, 0xd6, 0x3c, 0xe6, 0xf3, 0xfd,
+ 0xba, 0x6d, 0x46, 0x96, 0x3c, 0x60, 0xcc, 0x67, 0xbd, 0x11, 0x9c, 0xc6, 0x3c,
+ 0xa8, 0x63, 0x21, 0xbd, 0xdb, 0xb3, 0x70, 0xbc, 0x42, 0x46, 0x38, 0xbd, 0x88,
+ 0x73, 0x00, 0xbc, 0x48, 0x5e, 0x4e, 0x3d, 0x2d, 0x95, 0x26, 0xbd, 0xa0, 0x22,
+ 0xb3, 0x3c, 0x56, 0xfb, 0x91, 0xbd, 0x51, 0x13, 0x06, 0x3c, 0x85, 0x69, 0x8a,
+ 0x3d, 0x23, 0xf8, 0x89, 0xbd, 0x61, 0x24, 0xd3, 0xbc, 0x28, 0xd0, 0x0a, 0x3c,
+ 0xe9, 0x4e, 0x85, 0x3d, 0xde, 0x12, 0x93, 0xbb, 0x18, 0x55, 0xdd, 0x3b, 0x57,
+ 0xc2, 0x22, 0xbd, 0x85, 0x3f, 0x0a, 0xbd, 0x9d, 0x49, 0x86, 0x3d, 0x50, 0x01,
+ 0x8f, 0x3b, 0x2c, 0xbf, 0xf5, 0xbc, 0x6b, 0xec, 0x04, 0x3c, 0x92, 0x0e, 0x9b,
+ 0xbc, 0xfc, 0xe0, 0x28, 0xbd, 0x16, 0xeb, 0x9d, 0xbb, 0x20, 0xde, 0xf9, 0x3c,
+ 0x58, 0x77, 0x06, 0xbd, 0x5c, 0x2a, 0x92, 0xbc, 0x62, 0x8d, 0xf6, 0xbc, 0x88,
+ 0xcc, 0xa3, 0xbb, 0x60, 0xbf, 0xdb, 0x3c, 0x2c, 0xcb, 0x69, 0xbd, 0xe3, 0xcf,
+ 0x89, 0xbb, 0x35, 0xad, 0x81, 0xbd, 0xf1, 0x3d, 0x3d, 0xbd, 0x05, 0x62, 0x81,
+ 0x3d, 0x4e, 0xbe, 0x4d, 0x3c, 0x7e, 0xbf, 0x85, 0x3d, 0xfb, 0xc4, 0x23, 0xbb,
+ 0xd8, 0x1b, 0x78, 0x3d, 0x1d, 0xd7, 0x9d, 0xbd, 0x5d, 0x69, 0x15, 0x3d, 0xb6,
+ 0x7a, 0x93, 0xbc, 0x8c, 0xf1, 0xdf, 0xbc, 0xec, 0xfa, 0x2b, 0x3d, 0x40, 0xda,
+ 0x86, 0x3a, 0x1c, 0x0e, 0x2f, 0xbd, 0x38, 0x71, 0x4c, 0x3d, 0x68, 0x87, 0x9a,
+ 0xbd, 0x12, 0x86, 0x91, 0xbd, 0x60, 0x8f, 0x95, 0xbd, 0xd0, 0xe1, 0xf4, 0xbc,
+ 0xa2, 0x77, 0x3f, 0x3d, 0xc0, 0xcd, 0xa1, 0x3c, 0xa2, 0x69, 0x6e, 0xbd, 0xba,
+ 0xc9, 0x79, 0x3d, 0x6d, 0x05, 0xec, 0xbc, 0xb0, 0x63, 0x57, 0x3d, 0xfa, 0x05,
+ 0xd4, 0xbc, 0xb2, 0xd2, 0x93, 0x3b, 0x7e, 0x40, 0x09, 0xbd, 0xf0, 0x2e, 0xd6,
+ 0x3c, 0x00, 0x7b, 0x69, 0xbd, 0x6e, 0x10, 0x29, 0xbd, 0x69, 0x91, 0x92, 0xbb,
+ 0x90, 0x9e, 0x38, 0x3d, 0x99, 0x1b, 0x69, 0xbd, 0x32, 0xd2, 0x49, 0x3d, 0x9d,
+ 0xa4, 0x5d, 0xbd, 0x8b, 0x8e, 0x20, 0xbd, 0xcf, 0x0b, 0x92, 0xbd, 0x3c, 0xb7,
+ 0xfb, 0x3c, 0xdf, 0xf9, 0x58, 0x3d, 0xa7, 0xf0, 0x3e, 0xbb, 0x6c, 0x7e, 0xbd,
+ 0x3c, 0x83, 0xdf, 0x12, 0x3d, 0x37, 0x97, 0x84, 0x3d, 0xe0, 0x4e, 0x36, 0x3d,
+ 0xf6, 0x06, 0x90, 0xbd, 0x07, 0xc0, 0xce, 0x3c, 0xb1, 0xc0, 0x49, 0x3d, 0x7b,
+ 0x76, 0x02, 0x3c, 0x29, 0x97, 0x93, 0x3b, 0x16, 0x46, 0x45, 0xbd, 0x10, 0xb1,
+ 0x92, 0x3b, 0x26, 0x69, 0x45, 0x3d, 0x1e, 0x1a, 0x6d, 0x3d, 0x60, 0x9f, 0xe3,
+ 0x3b, 0x07, 0xab, 0x5f, 0x3d, 0x65, 0xce, 0x35, 0xbd, 0x61, 0x0d, 0x43, 0xbd,
+ 0x56, 0xa7, 0x79, 0x3d, 0x61, 0x67, 0x37, 0x3d, 0x26, 0xf4, 0x90, 0xbd, 0x73,
+ 0x2e, 0x1b, 0x3d, 0x39, 0x48, 0xe2, 0xb9, 0x57, 0x1e, 0x32, 0x3d, 0xaa, 0x2d,
+ 0x16, 0x3c, 0xae, 0x6a, 0x94, 0xbc, 0xc1, 0x8b, 0x1e, 0xbd, 0xf1, 0x42, 0x4f,
+ 0xbd, 0x6d, 0x34, 0x66, 0x3d, 0xc2, 0x39, 0x6a, 0xbd, 0x6e, 0x02, 0xab, 0x3c,
+ 0xa8, 0x60, 0x3d, 0xbd, 0x69, 0x24, 0x93, 0xbd, 0xd2, 0x91, 0x8a, 0xbd, 0xfe,
+ 0xa0, 0x30, 0xbd, 0xbd, 0x15, 0x28, 0xbd, 0x00, 0x1c, 0x02, 0x3a, 0x2e, 0xe2,
+ 0x5b, 0xbb, 0xda, 0x90, 0x4d, 0x3d, 0x56, 0xc4, 0xd3, 0xbc, 0x25, 0xb8, 0x6d,
+ 0x3d, 0x89, 0xe0, 0x47, 0x3d, 0x60, 0x4b, 0x04, 0xbb, 0x00, 0xd5, 0xdc, 0x39,
+ 0x33, 0xc0, 0x7e, 0x3d, 0xce, 0x0c, 0x51, 0xbd, 0xb2, 0x49, 0xf0, 0xbc, 0xc8,
+ 0x62, 0xa2, 0xbc, 0xdc, 0x45, 0x2a, 0x3d, 0x5e, 0xe2, 0x1b, 0xbd, 0xa6, 0x02,
+ 0x9a, 0xbd, 0xe2, 0xf0, 0x89, 0xbd, 0xff, 0x15, 0xa8, 0xbc, 0xc2, 0x94, 0xb9,
+ 0x3c, 0x8a, 0x28, 0x8b, 0xbc, 0x27, 0x32, 0x7d, 0x3d, 0x2b, 0x24, 0x75, 0xbd,
+ 0xc1, 0x7f, 0x05, 0xbd, 0x8b, 0x7f, 0x28, 0xbd, 0xa4, 0xd9, 0x9a, 0xbc, 0x03,
+ 0xc7, 0x23, 0xbc, 0xac, 0xd5, 0x6d, 0xbc, 0xfb, 0xf5, 0x70, 0xbc, 0x5c, 0x28,
+ 0x5c, 0xbd, 0xf5, 0xa5, 0x54, 0x3d, 0xc4, 0x5f, 0x87, 0xbd, 0x28, 0x92, 0x51,
+ 0x3c, 0x10, 0xc1, 0x87, 0x3d, 0x00, 0xeb, 0x1c, 0x3c, 0x9a, 0x6a, 0x52, 0x3d,
+ 0x95, 0xc5, 0x1a, 0x3d, 0x9d, 0x84, 0x9b, 0x3c, 0x56, 0x33, 0xda, 0xbc, 0x28,
+ 0x01, 0x64, 0x3d, 0xb1, 0x80, 0x4f, 0xbd, 0x50, 0x61, 0x89, 0xbd, 0xe0, 0x1f,
+ 0x30, 0xbb, 0x63, 0x5a, 0x86, 0x3d, 0x06, 0x30, 0x56, 0x3d, 0xc6, 0x8e, 0x4e,
+ 0xbd, 0xd1, 0xb8, 0xc6, 0xbc, 0xc6, 0x6c, 0xf4, 0xbc, 0x6c, 0x6f, 0x21, 0x3d,
+ 0xea, 0x45, 0x86, 0x3c, 0xe7, 0x7b, 0x1c, 0xbd, 0xba, 0x38, 0x54, 0xbd, 0xa4,
+ 0x78, 0x82, 0x3d, 0xdc, 0x98, 0x18, 0xbc, 0xa0, 0x85, 0x0d, 0x3d, 0x9e, 0xe7,
+ 0x55, 0xbd, 0x8e, 0x64, 0x30, 0x3d, 0xda, 0xf4, 0x48, 0x3d, 0x69, 0xdc, 0xe8,
+ 0x3c, 0x68, 0xc7, 0x0d, 0xbd, 0xdf, 0x7e, 0xb4, 0x3c, 0x3a, 0x30, 0x57, 0x3d,
+ 0xc5, 0x7a, 0x1a, 0xbc, 0x42, 0xa7, 0x8c, 0x3d, 0xb1, 0x9c, 0x4f, 0x3d, 0xa0,
+ 0x74, 0x36, 0xbc, 0x7e, 0x74, 0x25, 0x3d, 0xc8, 0x7c, 0x48, 0x3d, 0x7f, 0x68,
+ 0x55, 0x3c, 0xa6, 0x62, 0xf8, 0xbc, 0x16, 0x5b, 0x2d, 0x3d, 0x79, 0x57, 0x6a,
+ 0xbd, 0x86, 0xf0, 0x8b, 0xbc, 0x20, 0x1c, 0x3f, 0x3c, 0x92, 0x3d, 0x20, 0x3d,
+ 0x40, 0x29, 0x7b, 0xbd, 0x32, 0x88, 0x5b, 0x3d, 0x28, 0x79, 0x2c, 0x3c, 0xeb,
+ 0x80, 0xe3, 0x3c, 0xe5, 0x28, 0xa1, 0x3c, 0x95, 0xbb, 0x88, 0x3d, 0x1b, 0xa9,
+ 0x95, 0xbc, 0xb0, 0x35, 0x5b, 0x3d, 0x02, 0xbd, 0x8e, 0xbc, 0x62, 0xe7, 0x1d,
+ 0xbd, 0xad, 0xe5, 0xca, 0x3c, 0x6f, 0x93, 0x3f, 0xb9, 0x51, 0x7d, 0x48, 0xbd,
+ 0x06, 0x75, 0x68, 0x3d, 0xa7, 0x08, 0x7b, 0xbd, 0x5e, 0xeb, 0x73, 0xba, 0xa1,
+ 0x83, 0x31, 0x3d, 0xcd, 0x92, 0x55, 0x3c, 0x88, 0xdb, 0x3f, 0xbd, 0x67, 0x9c,
+ 0x35, 0x3d, 0xa9, 0x4b, 0x14, 0x3d, 0x94, 0x6b, 0x6c, 0xbc, 0x6c, 0xa8, 0xe7,
+ 0x3c, 0xc0, 0x02, 0xf7, 0xbb, 0xcb, 0xbc, 0x85, 0x3a, 0xf1, 0x91, 0xf0, 0xbc,
+ 0x72, 0x77, 0x83, 0x3d, 0x68, 0xab, 0x30, 0x3d, 0xa0, 0x17, 0x96, 0xbc, 0x7d,
+ 0xe6, 0x19, 0xbd, 0x18, 0x2c, 0x22, 0x3d, 0x88, 0x14, 0xaa, 0x3c, 0x40, 0x4d,
+ 0xb3, 0xbc, 0x4c, 0xc2, 0x7a, 0xbc, 0xf8, 0x68, 0x53, 0x3c, 0x16, 0x1d, 0xc6,
+ 0xbb, 0x2f, 0x2c, 0x71, 0xbd, 0xa3, 0x55, 0x80, 0x3d, 0x96, 0x18, 0x07, 0x3d,
+ 0x34, 0xa8, 0xa1, 0xbc, 0x2b, 0x39, 0x58, 0x3d, 0x23, 0xc6, 0x68, 0x3d, 0x46,
+ 0x84, 0x55, 0x3d, 0x0d, 0xd6, 0x3e, 0x3c, 0x2e, 0xc2, 0x0d, 0x3d, 0x88, 0x20,
+ 0x26, 0x3c, 0x44, 0x1b, 0x23, 0x3d, 0x7f, 0x54, 0x8b, 0xbd, 0xda, 0xa3, 0x54,
+ 0xbd, 0x9e, 0xad, 0x32, 0x3d, 0x17, 0x7c, 0x78, 0x3d, 0xcd, 0x11, 0x9f, 0xbc,
+ 0x2c, 0x53, 0x57, 0x3b, 0x1a, 0x5a, 0x0a, 0xbd, 0x6d, 0x40, 0x67, 0x3d, 0x52,
+ 0xb6, 0x56, 0x3d, 0x1c, 0x07, 0x96, 0xbd, 0xb0, 0x1c, 0x14, 0xbd, 0xc3, 0xda,
+ 0x2b, 0x3c, 0x7a, 0x02, 0x61, 0x3d, 0xbd, 0x9f, 0x2a, 0xbd, 0x72, 0xf9, 0xbf,
+ 0xbc, 0x79, 0xfe, 0xa3, 0x3c, 0xfc, 0x45, 0x43, 0xbd, 0x9e, 0xd3, 0x7b, 0x3d,
+ 0x70, 0x3a, 0x6e, 0xbd, 0x78, 0xdc, 0x30, 0x3c, 0x93, 0x36, 0x67, 0x3d, 0x63,
+ 0x08, 0x84, 0x3d, 0x5e, 0x4f, 0x40, 0x3a, 0xc5, 0xd9, 0xc1, 0x3c, 0xea, 0x6b,
+ 0x31, 0x3d, 0x1e, 0xf8, 0xdc, 0xbb, 0x0b, 0x30, 0xfd, 0xbc, 0xc6, 0xf2, 0x87,
+ 0x3d, 0xc5, 0xc9, 0xc7, 0x3c, 0x98, 0x0c, 0xba, 0x3b, 0xcf, 0x1a, 0x8d, 0xbd,
+ 0x90, 0xa5, 0xe1, 0xbb, 0x16, 0xc3, 0x64, 0x3d, 0x03, 0x3a, 0x95, 0x3c, 0xaa,
+ 0x98, 0x32, 0xbd, 0x95, 0xa5, 0x95, 0xbd, 0xde, 0x9e, 0x88, 0x3a, 0xbb, 0x39,
+ 0x8e, 0xbd, 0x3d, 0xf1, 0x30, 0x3d, 0x6e, 0x57, 0x8c, 0x3d, 0xf3, 0x90, 0x25,
+ 0xbd, 0xf8, 0x97, 0x2e, 0xbd, 0x21, 0xf3, 0x1b, 0x3d, 0x34, 0xd9, 0x5d, 0xbc,
+ 0x24, 0x60, 0x23, 0xbc, 0x32, 0x24, 0xa6, 0x3b, 0x01, 0xf1, 0x61, 0xbd, 0x69,
+ 0x3b, 0xaa, 0x3c, 0x54, 0xf0, 0x53, 0xbd, 0x40, 0x67, 0x64, 0x3b, 0x00, 0x84,
+ 0xa1, 0xbb, 0xda, 0xb5, 0x6e, 0x3d, 0x0f, 0xfb, 0x3d, 0xbc, 0xf9, 0xf3, 0x0c,
+ 0xbd, 0x5b, 0x52, 0xd1, 0xbb, 0x43, 0xf7, 0x04, 0xbd, 0xf9, 0x67, 0x7c, 0x3d,
+ 0x36, 0xed, 0x30, 0xbd, 0xcf, 0x53, 0x62, 0x3c, 0x03, 0xbb, 0x79, 0xbd, 0x6d,
+ 0xc8, 0x40, 0x3d, 0xc5, 0x5c, 0x19, 0x3d, 0x0e, 0xd5, 0x2d, 0xbd, 0x2d, 0x89,
+ 0x92, 0x3d, 0xf3, 0xcc, 0x15, 0x3d, 0xe2, 0x92, 0x9e, 0xbc, 0x44, 0x74, 0x8e,
+ 0xbd, 0x6b, 0x27, 0x96, 0xbd, 0x86, 0xcb, 0xe8, 0x3c, 0xab, 0xda, 0x99, 0xbb,
+ 0xf6, 0x99, 0x19, 0xbb, 0xe8, 0xb3, 0x49, 0x3d, 0xa4, 0x79, 0x85, 0x3c, 0x4f,
+ 0xb4, 0xf5, 0xbc, 0x5c, 0x1a, 0xa9, 0xbc, 0xa7, 0x63, 0x1f, 0xbd, 0x33, 0xff,
+ 0x46, 0xbd, 0x39, 0x7f, 0x97, 0xbd, 0xd8, 0x75, 0x85, 0xbd, 0x55, 0x97, 0x94,
+ 0xbc, 0x3e, 0x73, 0xb0, 0x3c, 0xf8, 0xb8, 0xee, 0x3c, 0xa0, 0xe4, 0x6e, 0x3b,
+ 0x00, 0xde, 0x54, 0x3b, 0x3b, 0x2d, 0x90, 0xbc, 0xae, 0xd9, 0x89, 0xbd, 0x65,
+ 0x3d, 0xf9, 0x3c, 0x5f, 0x64, 0x8a, 0xbd, 0x88, 0x25, 0x7c, 0xbb, 0x8c, 0x64,
+ 0x35, 0xbc, 0x63, 0x28, 0x0c, 0x3d, 0x2d, 0x9c, 0xde, 0xbb, 0x62, 0x5c, 0x96,
+ 0xbc, 0x12, 0x3c, 0x35, 0x3d, 0x50, 0x11, 0xcc, 0x3b, 0x56, 0x1a, 0x80, 0xbd,
+ 0xd0, 0x1a, 0x98, 0xba, 0x88, 0xe4, 0x58, 0x3d, 0x09, 0xc2, 0x9e, 0x3b, 0xce,
+ 0xc4, 0x3c, 0xbc, 0x88, 0x46, 0x09, 0xbd, 0xea, 0xde, 0x04, 0x3c, 0xd4, 0x45,
+ 0x5d, 0xbd, 0x18, 0x90, 0x7e, 0x3d, 0x99, 0x67, 0x91, 0x3d, 0x8d, 0x01, 0xd7,
+ 0xbc, 0x61, 0xdc, 0x6b, 0x3d, 0x36, 0x17, 0x96, 0x3c, 0x7e, 0x27, 0x6f, 0x3d,
+ 0x52, 0xcb, 0xf7, 0x3c, 0xfc, 0x54, 0x75, 0xbc, 0x36, 0xbd, 0x25, 0x3d, 0x86,
+ 0xd1, 0x7b, 0xbd, 0x5c, 0x19, 0x12, 0x3d, 0xda, 0xfb, 0x03, 0x3d, 0xee, 0x5f,
+ 0x37, 0xbd, 0xd4, 0x39, 0x34, 0xbd, 0xb4, 0x2f, 0x8b, 0xbd, 0x29, 0xd4, 0x99,
+ 0xbd, 0x4e, 0x31, 0x4a, 0x3c, 0x3a, 0x73, 0x7b, 0x3d, 0x97, 0x99, 0xac, 0xbb,
+ 0x77, 0xe4, 0xac, 0xbc, 0x0c, 0x31, 0xc3, 0xbb, 0xd7, 0xdb, 0x85, 0x3d, 0x31,
+ 0x4d, 0xd5, 0xbb, 0xb8, 0x71, 0xda, 0x3c, 0x7c, 0x01, 0x5a, 0x3d, 0x32, 0xe9,
+ 0x57, 0x3d, 0x6f, 0xd9, 0x7a, 0x3d, 0x38, 0x6a, 0x77, 0xbc, 0x7b, 0x63, 0x5c,
+ 0xbd, 0x8c, 0xe0, 0x02, 0xbd, 0xf2, 0x35, 0x47, 0x3d, 0x93, 0x0e, 0x59, 0xbd,
+ 0xf8, 0xfa, 0x63, 0x3d, 0x1c, 0x59, 0x49, 0xbd, 0x48, 0x00, 0x3c, 0xbc, 0x52,
+ 0xd8, 0x14, 0x3d, 0xc3, 0x56, 0x42, 0x3c, 0x7d, 0x74, 0xa9, 0x3c, 0x15, 0x40,
+ 0x83, 0x3d, 0x9c, 0x8d, 0xe2, 0xbc, 0x47, 0xdb, 0x86, 0x3d, 0xcc, 0x7f, 0x2d,
+ 0xbd, 0x39, 0xdd, 0x8f, 0x3d, 0xe8, 0xe7, 0x0c, 0x3c, 0xc0, 0xc6, 0xfa, 0x3a,
+ 0x5e, 0x6c, 0x85, 0xbd, 0xae, 0x8d, 0x79, 0x3d, 0x29, 0x90, 0xd8, 0x3c, 0x09,
+ 0x17, 0x85, 0xbc, 0x4d, 0xf9, 0x71, 0xbd, 0x74, 0xa6, 0xf3, 0xbb, 0xf0, 0x65,
+ 0xee, 0xbc, 0x42, 0x45, 0x7b, 0x3d, 0xdc, 0x2b, 0x5e, 0xbd, 0x35, 0x5f, 0x3f,
+ 0x3d, 0x10, 0x00, 0xdd, 0x3b, 0xb8, 0xd0, 0x94, 0xbc, 0xe8, 0xb4, 0xcc, 0xbc,
+ 0xb3, 0x71, 0x2d, 0x3c, 0x00, 0x36, 0xc0, 0x3c, 0x3e, 0x20, 0x1e, 0xbd, 0x0e,
+ 0xdf, 0x62, 0x3c, 0x55, 0xdc, 0x44, 0x3d, 0x27, 0x0e, 0x3a, 0xbc, 0x6b, 0xd4,
+ 0x8c, 0x3c, 0xcc, 0xcc, 0x7f, 0xbd, 0xd4, 0x43, 0x3d, 0xbd, 0x5b, 0xac, 0x58,
+ 0x3c, 0xf0, 0x58, 0xd2, 0xbc, 0x49, 0x1d, 0x38, 0x3d, 0x09, 0x7c, 0x1d, 0xbd,
+ 0x7a, 0x5b, 0x00, 0xbd, 0xe4, 0x6e, 0xf0, 0x3c, 0x4a, 0xd3, 0x56, 0x3d, 0x28,
+ 0x12, 0x8d, 0xbc, 0xbe, 0x44, 0x65, 0x3d, 0x0a, 0xd4, 0x16, 0xbc, 0xb0, 0x96,
+ 0x16, 0xbd, 0xfa, 0xf1, 0x8d, 0x3d, 0x41, 0xd6, 0x74, 0x3d, 0xb5, 0x79, 0x85,
+ 0xbd, 0x5d, 0xfb, 0x8e, 0xbc, 0xd8, 0x46, 0x86, 0xba, 0x2f, 0xa2, 0x8b, 0xbd,
+ 0xd8, 0x91, 0x90, 0xbc, 0xf7, 0x73, 0xe6, 0xbc, 0x6c, 0x45, 0xac, 0x3c, 0xe4,
+ 0xbe, 0x60, 0xbc, 0x4b, 0x18, 0x7f, 0x3d, 0x1f, 0xb0, 0x39, 0x3c, 0xc0, 0x64,
+ 0x71, 0x3d, 0x2f, 0x99, 0x3e, 0xbd, 0xa8, 0x87, 0x2f, 0x3d, 0xdc, 0xb3, 0x94,
+ 0xbd, 0xfa, 0xe2, 0x8c, 0xbd, 0x28, 0xb5, 0x2a, 0x3c, 0xa3, 0x13, 0x31, 0xbd,
+ 0xe6, 0xae, 0xfc, 0xbc, 0x98, 0xb6, 0x68, 0xbd, 0x41, 0xdf, 0x66, 0x3b, 0xde,
+ 0xc5, 0x2e, 0xbd, 0x24, 0x8c, 0x4c, 0xbd, 0xdb, 0x77, 0xe8, 0x3b, 0xc0, 0x23,
+ 0xc1, 0xbc, 0x50, 0xcb, 0x98, 0xbc, 0x44, 0x4b, 0x32, 0x3d, 0xd0, 0xd5, 0xf9,
+ 0xbc, 0x40, 0x77, 0xea, 0x3b, 0xaf, 0x97, 0xbc, 0x3c, 0x9f, 0x07, 0x8d, 0x3d,
+ 0x26, 0xc4, 0x87, 0xbc, 0x48, 0xff, 0x1b, 0x3d, 0x90, 0x07, 0xc0, 0x3b, 0xa0,
+ 0xeb, 0x61, 0xbb, 0x61, 0x90, 0x8c, 0x3d, 0x46, 0x0b, 0x89, 0xbd, 0x61, 0x99,
+ 0x09, 0xbd, 0x27, 0xb3, 0x3a, 0xbc, 0xad, 0x56, 0xff, 0xbc, 0xa6, 0xaf, 0x7f,
+ 0x3d, 0x50, 0x1d, 0x09, 0xbd, 0x82, 0xfd, 0xcd, 0xbc, 0x31, 0x6c, 0x4d, 0x3d,
+ 0x6d, 0xe8, 0x8c, 0x3c, 0x59, 0x5e, 0xb7, 0xbb, 0xa8, 0x14, 0x49, 0x3d, 0x86,
+ 0xe4, 0x89, 0xbc, 0x41, 0xc7, 0x0c, 0xbd, 0xf5, 0x84, 0x80, 0x3d, 0x31, 0x71,
+ 0x88, 0x3d, 0x3b, 0xcf, 0x84, 0xbd, 0x4f, 0xc3, 0x89, 0x3d, 0x24, 0x62, 0x21,
+ 0xbd, 0xb0, 0xc2, 0xdb, 0x3b, 0xf8, 0xc8, 0x46, 0xbd, 0xa5, 0xe0, 0x89, 0x3d,
+ 0x89, 0x41, 0x29, 0x3c, 0x90, 0xbd, 0xe7, 0x3c, 0x78, 0xc9, 0x42, 0xbc, 0x1f,
+ 0xd6, 0x82, 0x3d, 0xfb, 0xcd, 0x87, 0xbd, 0x2a, 0xd2, 0x24, 0xbd, 0x86, 0x49,
+ 0x6d, 0xbd, 0x62, 0x20, 0xc8, 0xba, 0xb0, 0xc4, 0xec, 0xbc, 0xdf, 0x68, 0xb4,
+ 0x3a, 0xe3, 0x0f, 0xe7, 0x3c, 0x41, 0xd5, 0x2e, 0xbd, 0xd4, 0xd6, 0x7c, 0xbd,
+ 0xb6, 0xd8, 0x2f, 0x3d, 0x2e, 0x95, 0xf2, 0xbc, 0x7c, 0xa4, 0xd0, 0xbc, 0x84,
+ 0x63, 0x61, 0x3d, 0xfe, 0x1c, 0x26, 0x3d, 0x29, 0x38, 0x6e, 0x3c, 0xff, 0xb9,
+ 0x12, 0xbd, 0xbc, 0xc6, 0x8d, 0x3d, 0xe1, 0xf5, 0x94, 0xbd, 0xd6, 0x91, 0x86,
+ 0xbd, 0x88, 0xb9, 0x58, 0xbc, 0x50, 0x18, 0xb0, 0xbb, 0x95, 0x6f, 0x84, 0x3d,
+ 0xd1, 0x02, 0x2c, 0xbd, 0xdd, 0xec, 0x00, 0x3d, 0x2c, 0x87, 0x33, 0x3c, 0x83,
+ 0xae, 0x83, 0xbd, 0xf9, 0xfc, 0xc7, 0x3b, 0x54, 0x47, 0x34, 0xbc, 0xdc, 0xeb,
+ 0x44, 0xbc, 0xc1, 0x33, 0x1f, 0xbd, 0x2e, 0xa0, 0xe7, 0xbc, 0x18, 0x92, 0x5b,
+ 0xbc, 0x75, 0xee, 0x48, 0x3d, 0xcf, 0xe5, 0x29, 0x3c, 0xdd, 0xfb, 0xcd, 0xbc,
+ 0x1e, 0xfe, 0x15, 0xbd, 0xfa, 0x83, 0x24, 0xbd, 0x74, 0xa7, 0x1b, 0x3d, 0x79,
+ 0x43, 0xf6, 0x3c, 0xc1, 0x09, 0xcc, 0xbb, 0x23, 0xce, 0x51, 0x3d, 0x90, 0xbd,
+ 0x6d, 0xbd, 0xd3, 0x87, 0xa9, 0x3c, 0xa6, 0x5c, 0x6b, 0x3d, 0x30, 0xbc, 0xd0,
+ 0xbb, 0x43, 0x24, 0x71, 0xbd, 0xf1, 0xc3, 0x69, 0xbc, 0xcc, 0x77, 0x5d, 0xbd,
+ 0xf5, 0x11, 0x95, 0xbd, 0x90, 0x17, 0xc7, 0xbc, 0x44, 0x6c, 0x85, 0xbd, 0xeb,
+ 0x43, 0xd6, 0x3c, 0xe3, 0x8d, 0x8b, 0x3d, 0xbf, 0x68, 0x3d, 0xbd, 0x6d, 0x69,
+ 0x86, 0xbd, 0xb5, 0x14, 0x8f, 0xbd, 0xe9, 0x70, 0x0c, 0xbc, 0x97, 0x30, 0x78,
+ 0x3d, 0xd2, 0x1f, 0x57, 0xbd, 0x08, 0xe4, 0x28, 0x3d, 0x34, 0x1f, 0xf3, 0xbc,
+ 0x18, 0xb7, 0x66, 0xbc, 0x00, 0x60, 0x30, 0x3c, 0xc1, 0x3d, 0x1f, 0xbd, 0x26,
+ 0x9a, 0x85, 0x3d, 0xc6, 0x32, 0x88, 0xbd, 0x36, 0x33, 0x5c, 0xbd, 0x81, 0xb7,
+ 0x89, 0xbd, 0x9f, 0x29, 0xeb, 0xbb, 0xe3, 0x50, 0x3d, 0x3d, 0x24, 0x66, 0x88,
+ 0xbd, 0xcc, 0xc0, 0x0d, 0x3d, 0xd2, 0xa9, 0x92, 0x3c, 0x54, 0x72, 0x02, 0x3d,
+ 0xd5, 0x3b, 0x90, 0xbb, 0x3d, 0x9f, 0x63, 0xbd, 0xed, 0xbe, 0x18, 0xbd, 0x59,
+ 0xec, 0x6e, 0x3b, 0x28, 0xf2, 0x29, 0xbc, 0xc7, 0xce, 0xab, 0x3c, 0xf4, 0xc8,
+ 0x79, 0xbd, 0x7c, 0x71, 0x30, 0x3d, 0x75, 0xbb, 0x80, 0xbc, 0x5c, 0xc6, 0x6b,
+ 0xbd, 0x61, 0x73, 0x3c, 0x3d, 0x74, 0x82, 0x33, 0xbd, 0xd2, 0x32, 0x79, 0x3c,
+ 0x9c, 0x80, 0xb6, 0xbb, 0xef, 0xee, 0x5f, 0x3d, 0xf8, 0x07, 0x30, 0xbd, 0xb1,
+ 0x7f, 0x2f, 0xbd, 0xc2, 0x76, 0x36, 0xbd, 0x9e, 0x38, 0xa3, 0x3c, 0x7c, 0x4e,
+ 0x47, 0xbc, 0x48, 0xce, 0x1a, 0x3d, 0xfc, 0xcd, 0xc2, 0x3c, 0x65, 0xb0, 0x07,
+ 0x3d, 0x51, 0x39, 0x1c, 0x3d, 0x27, 0x56, 0x87, 0x3d, 0x63, 0x07, 0xdd, 0x3c,
+ 0x2b, 0xd5, 0x82, 0x3d, 0xb0, 0x9d, 0x85, 0xbd, 0xc5, 0x43, 0xf0, 0x3c, 0x19,
+ 0x0c, 0x95, 0x3b, 0x28, 0x64, 0x6b, 0xbd, 0x8e, 0x23, 0x09, 0xbd, 0xfa, 0x58,
+ 0xfc, 0x3b, 0x40, 0xca, 0x5d, 0x3c, 0xa0, 0xbe, 0x58, 0xbd, 0xb1, 0x3b, 0x91,
+ 0xbd, 0xd1, 0x73, 0xf0, 0x3a, 0x1d, 0x07, 0x31, 0x3d, 0x7d, 0x80, 0x07, 0x3d,
+ 0xda, 0x52, 0x44, 0x3c, 0x78, 0x62, 0x58, 0x3c, 0x8d, 0x84, 0x01, 0x3d, 0x66,
+ 0x36, 0x76, 0xbd, 0x68, 0xd0, 0x03, 0xbc, 0x43, 0x54, 0x56, 0x3c, 0xae, 0xac,
+ 0x59, 0x3d, 0x36, 0xce, 0x48, 0xbd, 0xd4, 0xc1, 0x65, 0xbc, 0xd9, 0xee, 0x34,
+ 0x3c, 0x80, 0x4c, 0x66, 0xba, 0x88, 0xe1, 0x3c, 0x3c, 0xc8, 0xb7, 0x04, 0x3d,
+ 0x90, 0xdf, 0xdf, 0x3c, 0x20, 0x76, 0x1c, 0x3b, 0xfb, 0x80, 0x1e, 0x3d, 0x7e,
+ 0xbd, 0x19, 0x3d, 0x1f, 0x28, 0x96, 0xbb, 0x19, 0xa6, 0x3c, 0x3c, 0x3f, 0xc7,
+ 0xf9, 0xbc, 0x4a, 0xc2, 0x1a, 0xbd, 0xd5, 0xa0, 0x86, 0xbd, 0x3a, 0xc8, 0xd6,
+ 0x3c, 0xc3, 0x1a, 0x5a, 0x3d, 0x1a, 0x8c, 0x91, 0xbd, 0xd0, 0x10, 0x67, 0x3d,
+ 0x42, 0x5b, 0x16, 0x3d, 0xa3, 0xd2, 0x5b, 0xbc, 0x6c, 0xa0, 0xb6, 0x3c, 0x65,
+ 0xe2, 0x1d, 0xbd, 0x9a, 0xdf, 0x0e, 0xbd, 0xc0, 0x74, 0xcf, 0x3b, 0x84, 0xe1,
+ 0xc1, 0x3c, 0x2a, 0xed, 0x60, 0x3d, 0xe3, 0x10, 0xe4, 0xbc, 0x3f, 0xcc, 0x8b,
+ 0xbd, 0x95, 0xa5, 0x8b, 0x3d, 0xd8, 0xc3, 0x00, 0xbd, 0x85, 0x56, 0x75, 0x3d,
+ 0xac, 0x3a, 0x5b, 0x3d, 0x6a, 0x5d, 0xed, 0xbb, 0xbb, 0xd3, 0xd5, 0x3c, 0xac,
+ 0xb0, 0x3f, 0x3d, 0x70, 0x1a, 0x6b, 0x3c, 0x70, 0xca, 0x28, 0x3c, 0xa2, 0x71,
+ 0xde, 0xbc, 0x00, 0x22, 0x77, 0x3a, 0x43, 0x45, 0x21, 0xbd, 0x17, 0xa9, 0x34,
+ 0x3d, 0x4d, 0x49, 0x2d, 0xbd, 0xb5, 0xd6, 0x8b, 0x3d, 0x84, 0xa5, 0xbd, 0xbc,
+ 0x9d, 0x7f, 0x02, 0xbd, 0x85, 0x08, 0x80, 0xbd, 0xff, 0x2d, 0x8f, 0xbc, 0x04,
+ 0x5f, 0x3b, 0xbd, 0xba, 0xce, 0x17, 0xbd, 0xf3, 0xfc, 0x80, 0x3d, 0xe1, 0x9c,
+ 0x8c, 0xbd, 0xaf, 0x1c, 0xc6, 0x3c, 0x77, 0x31, 0x12, 0x3d, 0xde, 0x28, 0x49,
+ 0xbd, 0x0d, 0xe3, 0x1f, 0xbd, 0x2a, 0x71, 0x30, 0xbc, 0x1e, 0x04, 0x35, 0x3d,
+ 0x08, 0x0a, 0xad, 0x3b, 0xe9, 0x97, 0x98, 0xbc, 0x26, 0xe3, 0x00, 0x3c, 0xbe,
+ 0xf9, 0xbb, 0xbc, 0x77, 0x23, 0x34, 0xbd, 0x55, 0x69, 0x61, 0x3d, 0xc4, 0xb9,
+ 0x8d, 0xbd, 0x5f, 0x82, 0x81, 0x3d, 0x68, 0xff, 0x16, 0xbc, 0x2c, 0xa2, 0x91,
+ 0xbc, 0x67, 0x62, 0x78, 0xbd, 0x76, 0x32, 0x13, 0x3d, 0x68, 0x26, 0x2b, 0x3d,
+ 0x1a, 0xbb, 0xdc, 0xbc, 0xae, 0x91, 0x84, 0x3d, 0xc0, 0xfe, 0x8d, 0xbd, 0xfe,
+ 0x28, 0x88, 0xbc, 0x02, 0x43, 0x0e, 0xbc, 0x0b, 0x35, 0x69, 0xbb, 0xb4, 0xf8,
+ 0x8b, 0xbd, 0xad, 0x86, 0x6e, 0xbd, 0x5c, 0x92, 0x19, 0xbd, 0x03, 0x18, 0x59,
+ 0xbd, 0x58, 0x48, 0x55, 0xbc, 0x2e, 0xaf, 0x4d, 0x3d, 0x70, 0x1a, 0x59, 0xbc,
+ 0x63, 0xf3, 0x3d, 0xbd, 0x97, 0xcd, 0x8f, 0xbd, 0x4b, 0x2b, 0x75, 0x3d, 0x78,
+ 0xf6, 0x78, 0xbd, 0x40, 0x84, 0x01, 0xbd, 0x04, 0xb6, 0x05, 0xbd, 0x21, 0xa7,
+ 0xf7, 0x3c, 0x9e, 0x08, 0xc5, 0x3c, 0x3b, 0xde, 0xa8, 0xbc, 0x04, 0x81, 0x85,
+ 0x3c, 0x7d, 0x36, 0xd2, 0x3c, 0x02, 0xf0, 0xd0, 0xbc, 0xcb, 0xe0, 0x68, 0x3d,
+ 0xb3, 0x19, 0x89, 0xbd, 0x39, 0xf7, 0x5f, 0x3d, 0x6a, 0x8f, 0x05, 0xbc, 0x7c,
+ 0xc8, 0x91, 0xbc, 0xec, 0xc4, 0x93, 0x3c, 0xa0, 0x62, 0x3a, 0xbb, 0x59, 0xfc,
+ 0x1a, 0xbd, 0xc9, 0xcd, 0x95, 0xbd, 0x57, 0xc3, 0x5b, 0xbb, 0x67, 0x2f, 0xe4,
+ 0x3c, 0x13, 0xcc, 0xa5, 0x3c, 0x1d, 0x6c, 0x39, 0xbc, 0x50, 0x64, 0x83, 0x3c,
+ 0x50, 0x6d, 0x5b, 0xbc, 0xda, 0x2a, 0xcd, 0x3c, 0x09, 0xb3, 0x96, 0xbd, 0x91,
+ 0x4f, 0x34, 0x3d, 0x33, 0xd0, 0x17, 0xbd, 0x1d, 0x22, 0x86, 0xbd, 0x9c, 0x1e,
+ 0x0d, 0xbd, 0xd4, 0x2b, 0x9c, 0xba, 0x67, 0xb5, 0xa7, 0xbc, 0x0f, 0xe2, 0x76,
+ 0xbd, 0x4b, 0xb9, 0x71, 0x3d, 0x69, 0xa9, 0x9c, 0xbc, 0x30, 0x44, 0x47, 0x3d,
+ 0xf0, 0xdc, 0x95, 0x3c, 0xe2, 0x1d, 0x22, 0xbd, 0xaa, 0xb5, 0x58, 0xbd, 0x9d,
+ 0x59, 0x7d, 0xbd, 0xa4, 0x92, 0x95, 0x3c, 0x40, 0xaa, 0x8d, 0xbd, 0xf0, 0x3e,
+ 0xb4, 0x3c, 0xc2, 0x03, 0x2a, 0xbd, 0xb0, 0xc5, 0x29, 0xbd, 0xc0, 0x7c, 0x42,
+ 0xbd, 0xea, 0x99, 0x7e, 0x3d, 0xd6, 0xbc, 0x15, 0x3d, 0xb9, 0xda, 0x37, 0xbd,
+ 0xd0, 0x21, 0x9e, 0x3c, 0x79, 0x2e, 0xab, 0xbb, 0x73, 0x17, 0xcd, 0xbc, 0x7c,
+ 0x01, 0xe3, 0x3c, 0xb7, 0xb8, 0xf2, 0x3c, 0x11, 0x4b, 0x45, 0x3d, 0x87, 0x86,
+ 0x9a, 0x3c, 0x2c, 0x70, 0x57, 0xbd, 0x55, 0xdf, 0x1d, 0xbd, 0xf5, 0x86, 0xa6,
+ 0xbc, 0x21, 0x96, 0x49, 0xbd, 0x36, 0x4c, 0x75, 0xbd, 0xc9, 0x1c, 0xa0, 0x3c,
+ 0x5d, 0xba, 0x26, 0x3d, 0xd6, 0x56, 0x02, 0x3d, 0x69, 0x90, 0x12, 0xbc, 0x08,
+ 0x5b, 0x0f, 0xbd, 0x81, 0xce, 0x92, 0xbc, 0x3a, 0xb8, 0x5f, 0x3d, 0x7a, 0xaf,
+ 0xe7, 0x3c, 0x4d, 0x4b, 0x60, 0xbc, 0x78, 0xc0, 0x6c, 0xbd, 0x85, 0x6f, 0xe7,
+ 0x3c, 0xaa, 0xc1, 0xb3, 0x3c, 0x8b, 0xe4, 0xb7, 0x3c, 0xdd, 0xd0, 0x39, 0x3d,
+ 0x48, 0x49, 0x1b, 0x3d, 0xe2, 0x74, 0x28, 0xbd, 0x86, 0x4a, 0x47, 0x3d, 0x30,
+ 0x77, 0xad, 0x3b, 0xe0, 0xa8, 0x0e, 0xbc, 0xec, 0x36, 0xd1, 0x3c, 0xe3, 0x01,
+ 0x8f, 0xbd, 0x56, 0x6c, 0x34, 0xbd, 0x8a, 0x99, 0x20, 0xbb, 0xb1, 0x89, 0x12,
+ 0x3d, 0xea, 0x43, 0x39, 0xbd, 0x26, 0x16, 0xd2, 0x3c, 0xe2, 0x88, 0xc8, 0x3c,
+ 0x63, 0x15, 0xa0, 0x3c, 0x8d, 0x95, 0x3a, 0x3d, 0x86, 0x69, 0x26, 0xbd, 0x4c,
+ 0x38, 0xdb, 0x3b, 0xe0, 0xfa, 0x49, 0x3d, 0x62, 0xdf, 0xb4, 0xbc, 0x6a, 0xe4,
+ 0x89, 0xbc, 0x63, 0x50, 0x6d, 0x3d, 0xfa, 0x35, 0x46, 0xbd, 0xcb, 0xcb, 0x8c,
+ 0xbc, 0x46, 0x94, 0x66, 0x3d, 0xdd, 0xf8, 0xa2, 0xbc, 0x00, 0x34, 0x8c, 0x3d,
+ 0x0a, 0xa1, 0x05, 0x3d, 0x73, 0x92, 0x91, 0xbd, 0x64, 0x3e, 0xf4, 0xbc, 0xcd,
+ 0x5a, 0xa4, 0xbc, 0xe6, 0xce, 0x4b, 0x3d, 0x68, 0xb0, 0xcf, 0xbc, 0x38, 0xd3,
+ 0xe2, 0x3b, 0xfd, 0x03, 0x38, 0xbd, 0x11, 0xc0, 0x92, 0xbd, 0xa8, 0x82, 0x50,
+ 0x3d, 0x2a, 0x9a, 0xaf, 0xbc, 0x0e, 0xea, 0x7b, 0x3d, 0x11, 0xf4, 0x95, 0xbc,
+ 0x34, 0xed, 0xb6, 0x3c, 0x2b, 0x26, 0x6f, 0xbd, 0x15, 0xad, 0x7c, 0x3d, 0x19,
+ 0xc6, 0xed, 0x3c, 0x00, 0xf8, 0x81, 0xbd, 0x74, 0x82, 0x63, 0xbd, 0x62, 0x76,
+ 0x53, 0xbd, 0x48, 0x4f, 0x78, 0x3d, 0x76, 0x0e, 0x5c, 0xbb, 0x24, 0x30, 0x30,
+ 0xbd, 0x86, 0x0a, 0x14, 0x3d, 0x08, 0x29, 0xb3, 0xbc, 0xef, 0x7c, 0x2a, 0xbd,
+ 0x90, 0xb8, 0x09, 0x3d, 0x47, 0x45, 0x66, 0xbc, 0x30, 0x23, 0xb7, 0xbc, 0x8f,
+ 0xd2, 0x5e, 0x3d, 0x31, 0x72, 0x33, 0x3d, 0x26, 0xdc, 0x88, 0xbd, 0xeb, 0x0b,
+ 0x24, 0xbc, 0x14, 0x3c, 0xe9, 0xbc, 0x38, 0xc6, 0xd3, 0x3c, 0x55, 0xd6, 0x09,
+ 0xbd, 0xe5, 0xf7, 0x21, 0xbb, 0x7d, 0x03, 0x0d, 0x3d, 0xe9, 0x91, 0xd6, 0xbb,
+ 0x00, 0x90, 0xe4, 0x3a, 0x21, 0x2c, 0x1a, 0x3d, 0x0c, 0xe1, 0x82, 0x3c, 0x0a,
+ 0xb6, 0x38, 0x3d, 0x6c, 0x03, 0xe9, 0x3c, 0x83, 0x86, 0x05, 0x3d, 0x01, 0x6e,
+ 0x86, 0x3d, 0x99, 0xc2, 0x47, 0xbd, 0x27, 0x07, 0x57, 0x3d, 0xed, 0xd2, 0x59,
+ 0x3d, 0x0f, 0xa1, 0x0a, 0xbc, 0x12, 0x62, 0x6c, 0x3d, 0x16, 0x50, 0xf8, 0x3b,
+ 0x00, 0xf3, 0xdc, 0x3c, 0x5c, 0x4e, 0xa6, 0xbc, 0xfa, 0x73, 0x42, 0x3c, 0xd2,
+ 0x38, 0x8a, 0xbd, 0x35, 0x94, 0x8d, 0xbc, 0x69, 0x22, 0x3e, 0xbd, 0x83, 0xec,
+ 0x6f, 0xbc, 0xb6, 0x37, 0xb4, 0x3c, 0xf1, 0xa7, 0x83, 0x3d, 0x62, 0xbc, 0x82,
+ 0x3d, 0x88, 0x5d, 0xb8, 0xbc, 0xdd, 0x4d, 0x96, 0xbc, 0xaa, 0x38, 0x23, 0xbd,
+ 0x88, 0x3f, 0x4d, 0xbc, 0xc5, 0x2d, 0xfc, 0x3c, 0x78, 0x63, 0x20, 0x3d, 0xe5,
+ 0x87, 0x88, 0x3d, 0x08, 0xed, 0x77, 0xbc, 0x38, 0xef, 0x85, 0xbc, 0x19, 0xc5,
+ 0x90, 0x3d, 0xba, 0xc7, 0x4e, 0x3d, 0xe4, 0xc2, 0xd6, 0x3c, 0xac, 0x97, 0x22,
+ 0xbc, 0xa4, 0x4d, 0x55, 0xbd, 0x02, 0x71, 0x8b, 0xbd, 0xce, 0x55, 0x86, 0x3d,
+ 0xf9, 0x00, 0x9c, 0xbc, 0xbc, 0x84, 0x51, 0x3d, 0x3c, 0xaa, 0x21, 0xbd, 0xb3,
+ 0x0f, 0x43, 0xbd, 0x15, 0x2e, 0x90, 0xbd, 0xa9, 0x5c, 0x7a, 0x3d, 0x11, 0x1e,
+ 0x4b, 0x3d, 0xc7, 0x35, 0xc9, 0xbc, 0x86, 0x61, 0x77, 0xbd, 0x5c, 0xbb, 0x21,
+ 0xbc, 0x39, 0x3c, 0x6d, 0x3d, 0xaa, 0xde, 0xdd, 0x3a, 0xe5, 0xad, 0x0b, 0xbd,
+ 0xd5, 0x2c, 0x8f, 0xbd, 0x9b, 0xd2, 0x40, 0xbc, 0xae, 0xd1, 0x27, 0x3d, 0xa4,
+ 0x43, 0x61, 0x3c, 0x96, 0x2f, 0x26, 0xbd, 0x4c, 0xdb, 0x50, 0xbd, 0xd0, 0xee,
+ 0x55, 0xbc, 0xa9, 0xdf, 0x62, 0x3d, 0xa9, 0xc7, 0x14, 0xbd, 0x02, 0x65, 0x41,
+ 0x3b, 0xdc, 0x7c, 0x20, 0x3c, 0xb5, 0xb9, 0x89, 0x3d, 0x43, 0xc8, 0x8f, 0xbd,
+ 0xe5, 0x6b, 0x3e, 0x3c, 0xcb, 0x96, 0x8d, 0xbd, 0xe8, 0x9b, 0x7d, 0xbd, 0xad,
+ 0x41, 0x91, 0x3d, 0x84, 0x7b, 0xc2, 0x3c, 0xe9, 0xf8, 0x8c, 0x3c, 0x6d, 0x06,
+ 0xf1, 0xbb, 0xac, 0xcc, 0x43, 0x3d, 0x11, 0xd2, 0xe3, 0x3c, 0x69, 0xb6, 0x76,
+ 0xbc, 0x19, 0x3b, 0x71, 0xbd, 0x82, 0x8a, 0xb9, 0xbc, 0x28, 0x56, 0x3a, 0x3d,
+ 0xf6, 0x2b, 0x3c, 0x3d, 0x0f, 0x6e, 0xe1, 0xbb, 0x96, 0x11, 0x84, 0xbc, 0xae,
+ 0xf7, 0x81, 0x3d, 0xd2, 0xd1, 0x80, 0x3d, 0x97, 0xc3, 0xe6, 0xbc, 0x89, 0xe2,
+ 0x57, 0x3c, 0x3d, 0x6e, 0x8e, 0xbc, 0xca, 0x02, 0x4d, 0xbd, 0x62, 0x3c, 0xc1,
+ 0xbc, 0x16, 0x10, 0xed, 0xba, 0x3f, 0xe1, 0xef, 0x3c, 0x0a, 0x5c, 0xab, 0xbc,
+ 0x21, 0xad, 0xd1, 0xbb, 0xbc, 0xfe, 0x32, 0x3c, 0xac, 0x6c, 0x71, 0xbd, 0x15,
+ 0x98, 0x14, 0x3d, 0xb6, 0xee, 0x3a, 0x3c, 0x35, 0x4c, 0x87, 0x3d, 0xb6, 0xcd,
+ 0x4c, 0x3d, 0x10, 0xf7, 0xcc, 0x3b, 0xdb, 0x8a, 0x19, 0xbd, 0x00, 0x38, 0xdb,
+ 0xb8, 0xb3, 0x1b, 0x8e, 0xbd, 0x50, 0xa8, 0x41, 0xbd, 0x64, 0x53, 0x85, 0xbd,
+ 0x46, 0xcf, 0xcd, 0xbb, 0x65, 0xaf, 0xa4, 0x3c, 0x78, 0x82, 0x22, 0xbd, 0xb1,
+ 0xb2, 0x19, 0xbd, 0xaa, 0x2b, 0xe5, 0xbc, 0xb8, 0x9c, 0x3d, 0x3d, 0x30, 0x82,
+ 0x8c, 0x3c, 0xd9, 0x2c, 0x89, 0xbd, 0x27, 0x33, 0x8f, 0x3d, 0x20, 0x09, 0x87,
+ 0x3d, 0x50, 0x15, 0x05, 0xbd, 0x4b, 0xc1, 0x96, 0xbd, 0x82, 0x2a, 0x33, 0x3d,
+ 0xc1, 0x9b, 0x6c, 0xbd, 0xac, 0x51, 0x0c, 0xbd, 0xd7, 0xbc, 0x59, 0xbd, 0x69,
+ 0x2b, 0x37, 0x3c, 0xc0, 0xef, 0x26, 0xbd, 0xc8, 0xba, 0x59, 0x3c, 0xda, 0x1b,
+ 0x18, 0xbd, 0x11, 0xfb, 0x8b, 0x3d, 0xbf, 0xc8, 0x3d, 0xbd, 0x52, 0x1b, 0x00,
+ 0x3d, 0xe8, 0x9d, 0x4d, 0xba, 0xe4, 0x9d, 0x44, 0x3d, 0x87, 0x63, 0x06, 0xbd,
+ 0x76, 0xc3, 0x83, 0x3d, 0x32, 0xe3, 0x84, 0xbd, 0x5a, 0x34, 0x11, 0x3d, 0xe0,
+ 0xb2, 0x0e, 0xbd, 0xa8, 0x02, 0x8a, 0xbd, 0x9c, 0x92, 0x10, 0x3d, 0x47, 0xfd,
+ 0x90, 0xbd, 0x24, 0x45, 0x3c, 0x3d, 0x67, 0x62, 0x96, 0xbd, 0xbb, 0x91, 0x79,
+ 0xbd, 0x80, 0x99, 0x5b, 0xbd, 0x93, 0x7f, 0x83, 0xbd, 0x75, 0x82, 0x10, 0xbd,
+ 0x07, 0xb0, 0xa7, 0xbb, 0x5b, 0x41, 0x66, 0xbd, 0x82, 0xeb, 0x7a, 0xbc, 0x52,
+ 0xca, 0x57, 0xbd, 0x7e, 0xe3, 0x66, 0x3c, 0xab, 0x22, 0x68, 0xbd, 0x51, 0x4b,
+ 0xa9, 0xbc, 0x5e, 0x13, 0xa7, 0xbc, 0xe3, 0x6b, 0x88, 0xbb, 0x80, 0x4c, 0x02,
+ 0x3d, 0xf3, 0x3c, 0x59, 0xbd, 0xb2, 0x10, 0x7e, 0x3d, 0x1a, 0x9d, 0x13, 0xbd,
+ 0x8d, 0xd0, 0x5b, 0x3d, 0xca, 0x7a, 0x74, 0x3d, 0x16, 0x53, 0x4b, 0x3d, 0xc9,
+ 0x0a, 0x89, 0xbd, 0x44, 0x7e, 0x1b, 0xbc, 0x11, 0xca, 0xb2, 0xbc, 0x09, 0xe0,
+ 0x27, 0xbd, 0xe4, 0xed, 0xfb, 0x3c, 0xe4, 0x1a, 0xf9, 0xbc, 0x50, 0x47, 0x2e,
+ 0x3d, 0x1b, 0xed, 0x4e, 0x3d, 0x6d, 0x7c, 0x81, 0xbd, 0x72, 0x2a, 0xdc, 0xbc,
+ 0x6f, 0xa7, 0x59, 0x3d, 0xc0, 0xbd, 0x1e, 0xbc, 0xb2, 0xaf, 0xb9, 0xbc, 0x07,
+ 0x39, 0xba, 0xbc, 0xf4, 0x63, 0x46, 0xbd, 0x45, 0x7b, 0x1a, 0x3d, 0x79, 0xe9,
+ 0xf7, 0x3c, 0x9e, 0xba, 0xf0, 0xbc, 0xc1, 0x09, 0xbb, 0x3c, 0x0e, 0x21, 0x52,
+ 0xbc, 0xed, 0x78, 0x43, 0x3b, 0x73, 0x07, 0x62, 0x3d, 0x71, 0x92, 0x84, 0x3d,
+ 0x7b, 0x59, 0xb2, 0xbc, 0xe0, 0xba, 0x34, 0xbc, 0x0c, 0x23, 0x14, 0xbd, 0x93,
+ 0x93, 0x1f, 0xbd, 0xb7, 0x20, 0x6b, 0xbd, 0x8e, 0x60, 0x8c, 0xbd, 0x00, 0xe9,
+ 0x8c, 0x3d, 0xdf, 0xb4, 0xe1, 0xbb, 0xa0, 0x1a, 0xbf, 0xbc, 0xf6, 0x4c, 0x80,
+ 0x3c, 0x74, 0xeb, 0x18, 0x3d, 0x28, 0x64, 0x8c, 0x3c, 0xba, 0xbd, 0xd3, 0xbc,
+ 0x56, 0xc0, 0x6f, 0x3d, 0x09, 0x02, 0x88, 0xbd, 0x02, 0xd5, 0x58, 0x3d, 0xc1,
+ 0x57, 0x31, 0x3d, 0xfc, 0x52, 0x48, 0x3d, 0x61, 0xdc, 0x64, 0xbd, 0xa7, 0xc3,
+ 0x2b, 0x3d, 0x3b, 0xea, 0x13, 0xbc, 0x0e, 0xac, 0x3c, 0xbd, 0x7e, 0x92, 0x86,
+ 0x3c, 0xbf, 0x14, 0x29, 0xbc, 0xf3, 0x91, 0x7f, 0x3d, 0xf1, 0x9a, 0xac, 0x3c,
+ 0xf8, 0xf5, 0x76, 0x3c, 0xa2, 0x0f, 0x86, 0xbd, 0xc3, 0xeb, 0xb7, 0x3a, 0xff,
+ 0x56, 0x6c, 0x3d, 0x1c, 0xcc, 0x5a, 0xbd, 0x97, 0x3f, 0x78, 0x3d, 0x92, 0xea,
+ 0x9d, 0xbc, 0xbc, 0x51, 0x6a, 0x3d, 0xc5, 0x44, 0x65, 0x3c, 0xbc, 0x66, 0x30,
+ 0x3d, 0x70, 0xe2, 0x26, 0xbd, 0x2e, 0xbe, 0x19, 0x3d, 0x5e, 0xf3, 0x82, 0x3d,
+ 0x32, 0x2f, 0x86, 0xbd, 0x53, 0x73, 0x81, 0x3d, 0x86, 0xef, 0xa2, 0xbc, 0xdb,
+ 0xda, 0x62, 0xbd, 0x82, 0x4e, 0xd3, 0xbc, 0x80, 0xed, 0x93, 0xba, 0x50, 0xc2,
+ 0xd6, 0x3b, 0x82, 0x22, 0xf1, 0xbc, 0x49, 0xd7, 0x7a, 0xbc, 0xe9, 0x00, 0x85,
+ 0x3d, 0xb7, 0x12, 0x4c, 0xbd, 0x90, 0x25, 0x08, 0xb9, 0x2e, 0x76, 0xcb, 0xbc,
+ 0x47, 0x11, 0x97, 0xbd, 0x06, 0x96, 0x2f, 0x3d, 0x44, 0x62, 0x65, 0x3d, 0xe7,
+ 0xa5, 0x1f, 0x3d, 0x2e, 0x9e, 0xbf, 0xbc, 0x00, 0xd8, 0x6c, 0xbc, 0x20, 0xd1,
+ 0x44, 0xbb, 0x19, 0x61, 0x32, 0x3c, 0xf4, 0x7a, 0x30, 0x3d, 0x11, 0x7b, 0xe4,
+ 0xbc, 0x6e, 0x1c, 0x50, 0x3b, 0x9b, 0x64, 0x64, 0xbd, 0x89, 0x52, 0x1f, 0x3d,
+ 0x65, 0x20, 0x2c, 0x3d, 0xb9, 0x45, 0xd7, 0x3c, 0xe8, 0x37, 0x8e, 0x3d, 0x40,
+ 0x5e, 0x50, 0x3c, 0x7a, 0x66, 0x68, 0xbd, 0x45, 0x1b, 0x31, 0xbd, 0xcb, 0x31,
+ 0x47, 0x3d, 0x2f, 0x4a, 0xb3, 0x3c, 0x97, 0x3d, 0xbc, 0xbc, 0x55, 0x24, 0x80,
+ 0xbd, 0x85, 0x56, 0x69, 0xbc, 0x0e, 0x0a, 0x34, 0x3d, 0xec, 0xe8, 0x54, 0xbd,
+ 0xeb, 0x92, 0x6d, 0xbd, 0xe2, 0x61, 0x41, 0x3c, 0xf3, 0x3c, 0x93, 0xbd, 0x10,
+ 0xea, 0xbd, 0xb7, 0x42, 0xec, 0x3b, 0xbd, 0x66, 0xe6, 0x80, 0xbd, 0x84, 0xd9,
+ 0x85, 0x3d, 0x2c, 0xd8, 0xac, 0x3c, 0x72, 0x8e, 0x48, 0x3c, 0x11, 0xa8, 0x9c,
+ 0xbc, 0x08, 0x31, 0x39, 0x3d, 0x0f, 0x3c, 0x7c, 0x3d, 0x58, 0xba, 0x25, 0x3d,
+ 0xce, 0x5f, 0x27, 0x3c, 0x7c, 0x7b, 0x65, 0x3d, 0x96, 0xd6, 0x1e, 0x3d, 0x48,
+ 0x03, 0x73, 0xbd, 0x84, 0x7a, 0x26, 0xbd, 0x92, 0x82, 0x72, 0xbd, 0xeb, 0x8a,
+ 0x0c, 0xbd, 0x84, 0xe7, 0x5f, 0xbd, 0x0b, 0x83, 0xfc, 0x3c, 0xfb, 0xed, 0x8e,
+ 0xbd, 0x52, 0xe2, 0x65, 0x3d, 0xd1, 0xa1, 0x4e, 0xbb, 0x5f, 0x41, 0xce, 0xbc,
+ 0x4b, 0x3d, 0x15, 0xbb, 0x20, 0xc8, 0x90, 0xbd, 0x29, 0xfb, 0x28, 0xbd, 0x04,
+ 0x06, 0x8a, 0xbd, 0x8a, 0x65, 0x30, 0x3d, 0x00, 0x49, 0x93, 0x3a, 0x6e, 0xb0,
+ 0x61, 0x3d, 0x94, 0xcc, 0x87, 0xbc, 0x10, 0x13, 0x3a, 0x3d, 0x5a, 0x7e, 0x7f,
+ 0xbd, 0x4c, 0x1f, 0xd7, 0xbc, 0x82, 0xb3, 0x1e, 0x3d, 0x7e, 0xca, 0x00, 0xbc,
+ 0xe7, 0x69, 0xe4, 0xbb, 0xd5, 0xad, 0x1f, 0x3d, 0xb6, 0x02, 0x72, 0x3d, 0x4b,
+ 0x4f, 0x91, 0xbc, 0x69, 0xd1, 0xd2, 0xbc, 0xf4, 0x42, 0xce, 0x3c, 0xf9, 0x95,
+ 0x8f, 0x3d, 0x5f, 0xd1, 0x52, 0x3c, 0xec, 0xd5, 0x67, 0x3d, 0x79, 0x25, 0x84,
+ 0xba, 0xf3, 0x43, 0x5f, 0x3d, 0x39, 0xdc, 0x2b, 0x3d, 0xc6, 0x40, 0x67, 0xbd,
+ 0xbb, 0xfa, 0x02, 0xbd, 0xf6, 0x13, 0x31, 0xbc, 0x1a, 0x8a, 0x5b, 0x3d, 0x28,
+ 0x8c, 0x3d, 0xba, 0xbd, 0x41, 0x46, 0x3d, 0xc8, 0xb7, 0x80, 0xbb, 0xd7, 0xc5,
+ 0x71, 0x3b, 0x2a, 0x9d, 0x51, 0xbd, 0xfb, 0xe8, 0x66, 0xbd, 0x49, 0x55, 0xad,
+ 0xbc, 0x80, 0x74, 0x36, 0xbd, 0x00, 0x48, 0xc7, 0xbc, 0xec, 0x9e, 0xf8, 0x3c,
+ 0x2d, 0x31, 0x7e, 0x3d, 0x5d, 0xdd, 0x94, 0xbd, 0xfd, 0xce, 0x57, 0x3d, 0xe2,
+ 0x28, 0x0b, 0xbc, 0x00, 0xec, 0x38, 0x3d, 0x88, 0x2f, 0xc9, 0xbc, 0xe8, 0x5d,
+ 0x69, 0x3d, 0xd8, 0x1a, 0x04, 0xbc, 0xa5, 0x91, 0x78, 0x3d, 0x4f, 0x30, 0x06,
+ 0xbc, 0xdf, 0x59, 0x51, 0x3d, 0x00, 0xb6, 0x8f, 0x3a, 0x9f, 0x7e, 0x76, 0xbd,
+ 0x66, 0xc5, 0x1d, 0x3d, 0x99, 0x26, 0x91, 0xbd, 0x82, 0x51, 0x8e, 0xbd, 0xf6,
+ 0xf9, 0x81, 0xbc, 0x60, 0x4a, 0x9d, 0x3c, 0x40, 0xfa, 0xf8, 0xbb, 0x96, 0x7a,
+ 0xf4, 0xbb, 0x8d, 0xfb, 0x02, 0xbd, 0xf0, 0xf1, 0xa8, 0x3c, 0xc9, 0xa7, 0x38,
+ 0xbd, 0x85, 0xc8, 0x4b, 0xbc, 0xc8, 0x56, 0x13, 0x3d, 0x61, 0x4d, 0x88, 0xbd,
+ 0x4e, 0xe1, 0x42, 0x3d, 0xec, 0x20, 0x7c, 0xbc, 0x49, 0x1c, 0x91, 0x3d, 0x40,
+ 0xea, 0x8d, 0xbd, 0x90, 0xa9, 0x5b, 0xbd, 0xe1, 0x98, 0x8e, 0xbd, 0x2f, 0x06,
+ 0xed, 0xbc, 0xa9, 0xa1, 0xe0, 0x3c, 0x54, 0xa1, 0x76, 0xbd, 0x21, 0x88, 0x70,
+ 0xbd, 0x16, 0x25, 0x23, 0xbd, 0xb6, 0xdf, 0x4f, 0x3d, 0xaf, 0x39, 0x57, 0x3d,
+ 0x3f, 0xfa, 0x2a, 0xbd, 0xda, 0x39, 0xcf, 0x3c, 0xf6, 0x8b, 0x5e, 0x3d, 0x49,
+ 0x9e, 0xec, 0xbc, 0x5c, 0x6b, 0x7f, 0x3d, 0x38, 0xf8, 0x8a, 0xbc, 0x15, 0xc8,
+ 0x8a, 0xbd, 0xc9, 0xb5, 0x3f, 0x3d, 0x1c, 0xcd, 0x97, 0xbd, 0x3c, 0xa4, 0xb0,
+ 0xba, 0x85, 0x05, 0x18, 0xbc, 0x0b, 0xf9, 0x81, 0xbd, 0xa7, 0x64, 0x84, 0xbc,
+ 0x17, 0xa4, 0x86, 0x3d, 0x74, 0xbc, 0x6d, 0xbd, 0xbe, 0xaa, 0xe0, 0x3c, 0x70,
+ 0x71, 0x01, 0x3d, 0x34, 0x7c, 0x3b, 0x3d, 0xf7, 0xe5, 0x4a, 0x3d, 0x0b, 0x8a,
+ 0xe2, 0x3c, 0x3a, 0xce, 0x8c, 0xbd, 0xc3, 0x45, 0x17, 0xbc, 0x06, 0x14, 0x40,
+ 0xbd, 0xc8, 0x4e, 0x2a, 0x3d, 0x1e, 0x87, 0x38, 0x3d, 0x12, 0xe6, 0x8e, 0x3d,
+ 0x5d, 0x26, 0x24, 0xbc, 0x96, 0x16, 0x0e, 0xbb, 0xbd, 0x7b, 0xe7, 0xbb, 0xee,
+ 0xf1, 0x86, 0xbc, 0x21, 0x44, 0xe1, 0xba, 0x34, 0xc7, 0x76, 0xbd, 0x84, 0x41,
+ 0x0f, 0xba, 0x79, 0x2a, 0x77, 0x3d, 0xe0, 0x52, 0xce, 0x3c, 0xd3, 0xbd, 0x0c,
+ 0x3d, 0xff, 0x57, 0x8b, 0x3d, 0xc6, 0x60, 0xed, 0x3b, 0xfc, 0x72, 0x7f, 0xbd,
+ 0x18, 0xaa, 0x20, 0x3c, 0xcd, 0x28, 0x0d, 0x3d, 0x18, 0xf7, 0xdb, 0x3a, 0xd6,
+ 0x93, 0x6a, 0x3d, 0x46, 0x48, 0x55, 0xbd, 0x01, 0x2f, 0x7c, 0x3d, 0x75, 0x2d,
+ 0x80, 0x3c, 0x4c, 0x22, 0xd0, 0x3c, 0x17, 0x6d, 0x8b, 0xbb, 0x34, 0x25, 0xec,
+ 0xbc, 0x04, 0x8e, 0x56, 0x3d, 0xd8, 0xab, 0x88, 0x3d, 0x20, 0x51, 0x88, 0xbc,
+ 0x71, 0xdb, 0xd4, 0x3c, 0x41, 0xe5, 0x03, 0xbd, 0x28, 0x8d, 0x0c, 0x3c, 0xa1,
+ 0xe2, 0x7d, 0xbd, 0x10, 0xb2, 0xcd, 0x3c, 0x3b, 0xa9, 0xdf, 0xbc, 0x2d, 0x71,
+ 0x73, 0x3d, 0xfa, 0xcb, 0xd3, 0x3c, 0xb4, 0x04, 0x10, 0xbb, 0xca, 0xec, 0x8c,
+ 0xbd, 0xd1, 0x28, 0x9a, 0x3c, 0x0f, 0x12, 0x2f, 0x3d, 0x93, 0x67, 0x2a, 0x3d,
+ 0x94, 0x98, 0xb7, 0x3c, 0x8e, 0x0f, 0xae, 0xbc, 0xc6, 0x7c, 0xd9, 0x3c, 0xa0,
+ 0x4d, 0x3b, 0xbb, 0x20, 0xf7, 0xd5, 0x3c, 0x7b, 0xa2, 0x72, 0xbd, 0xc5, 0xb9,
+ 0xbd, 0x3c, 0x59, 0x61, 0x1e, 0x3d, 0x8b, 0x95, 0x8c, 0xbd, 0xbe, 0xbf, 0x9b,
+ 0xbc, 0x0f, 0x63, 0x7b, 0x3d, 0x92, 0x1a, 0x66, 0x3c, 0x4f, 0xef, 0xa0, 0x38,
+ 0x8c, 0x24, 0xd9, 0xbc, 0x7d, 0xfa, 0xf8, 0xbc, 0xde, 0xe7, 0x85, 0x3d, 0xa2,
+ 0xd6, 0x13, 0xbd, 0x5e, 0x38, 0x3d, 0xbd, 0xe7, 0x7e, 0xb0, 0x3d, 0xc5, 0x86,
+ 0xba, 0xbc, 0x49, 0x12, 0x93, 0xbd, 0x8e, 0x9e, 0xea, 0x3d, 0x48, 0x93, 0x84,
+ 0xbd, 0x33, 0x48, 0xc7, 0xbc, 0x23, 0x1f, 0x5f, 0x3d, 0x51, 0x20, 0xb5, 0xbb,
+ 0x93, 0xfa, 0x90, 0x3d, 0x99, 0xe1, 0x31, 0xbd, 0x82, 0x3e, 0x89, 0xbd, 0x99,
+ 0x5e, 0xe0, 0xbc, 0x0c, 0xc2, 0x03, 0x3d, 0xe2, 0x69, 0xb2, 0x3c, 0x3d, 0xdb,
+ 0x6e, 0xbd, 0x37, 0xd2, 0x36, 0x3c, 0x89, 0x66, 0x1e, 0xbd, 0xeb, 0x8a, 0x88,
+ 0x3d, 0x1a, 0x34, 0x3d, 0x3d, 0x84, 0x3a, 0x24, 0x3d, 0x2f, 0xd2, 0x78, 0xbd,
+ 0x45, 0x13, 0x82, 0x3d, 0x70, 0x07, 0x94, 0x3d, 0xf9, 0xc5, 0x7f, 0xbd, 0x40,
+ 0x1b, 0x04, 0xbd, 0x74, 0x6f, 0x3a, 0x3d, 0xa0, 0x7d, 0xf8, 0xbc, 0x7e, 0x95,
+ 0x61, 0x3d, 0xc0, 0x56, 0x5d, 0x3b, 0x16, 0xa4, 0x06, 0x3d, 0x4b, 0x46, 0xbf,
+ 0xbd, 0x64, 0x97, 0xe8, 0xbc, 0x79, 0xbd, 0x75, 0x3a, 0x50, 0xb6, 0x6a, 0x3c,
+ 0x7b, 0xcc, 0x29, 0x3c, 0xa8, 0x8f, 0x17, 0x3d, 0xf0, 0xf6, 0xbc, 0x3b, 0x48,
+ 0x26, 0x78, 0xbd, 0x96, 0x9b, 0xe4, 0x3b, 0x87, 0xe5, 0x70, 0x3c, 0x88, 0xf2,
+ 0xac, 0xbb, 0x79, 0x75, 0x05, 0x3c, 0x06, 0x38, 0xa5, 0x3d, 0x8b, 0x4e, 0x0a,
+ 0x3d, 0xf9, 0x2d, 0x95, 0x3d, 0x08, 0xca, 0x7f, 0x3d, 0xc7, 0x5e, 0x1c, 0x3d,
+ 0xf2, 0xbc, 0x57, 0xbc, 0xc6, 0xaf, 0x5a, 0xbd, 0x7f, 0xc5, 0xc7, 0x3c, 0x69,
+ 0x5c, 0x00, 0x3c, 0x69, 0xaf, 0x8a, 0x3d, 0x60, 0x07, 0x01, 0x3d, 0xc3, 0x8f,
+ 0xff, 0x3a, 0xd5, 0x44, 0x1d, 0x3d, 0x66, 0x63, 0x2a, 0xbd, 0xe9, 0xd3, 0x9a,
+ 0xbd, 0x50, 0xc0, 0x0a, 0xbd, 0x32, 0x2d, 0xc6, 0xbc, 0xf0, 0xb1, 0xd4, 0xbb,
+ 0x48, 0xcc, 0xdc, 0x3a, 0xcd, 0x33, 0x6f, 0x3d, 0xea, 0x34, 0x95, 0xbd, 0xb8,
+ 0x4b, 0x2f, 0xbc, 0xe0, 0xa1, 0x0f, 0xbc, 0x0f, 0xee, 0x01, 0x3c, 0x5e, 0x3d,
+ 0x35, 0x3d, 0x6e, 0x51, 0x81, 0xbd, 0xfa, 0x8d, 0x8b, 0x3c, 0x51, 0xc5, 0x0a,
+ 0x3d, 0x8a, 0xa8, 0xc4, 0xbc, 0x66, 0x86, 0x19, 0xbd, 0x50, 0x08, 0x8e, 0x3d,
+ 0x22, 0x74, 0xdd, 0x3b, 0xdb, 0xf4, 0xea, 0x3a, 0xa1, 0x2d, 0x68, 0x3d, 0x7e,
+ 0x82, 0xc6, 0x3d, 0xe6, 0x89, 0x16, 0xbd, 0xe2, 0x72, 0x78, 0xbd, 0x25, 0xe0,
+ 0x82, 0xbd, 0xc2, 0x61, 0x66, 0x3c, 0xb2, 0x57, 0x66, 0x3d, 0x47, 0xa3, 0x40,
+ 0xbc, 0xf7, 0x00, 0x3e, 0xbd, 0x78, 0x7e, 0x42, 0x3d, 0xc3, 0x09, 0x83, 0x3d,
+ 0x1d, 0xac, 0x09, 0x3d, 0x37, 0xc0, 0xd7, 0x3b, 0xae, 0xbb, 0x34, 0xbd, 0x12,
+ 0x34, 0x95, 0x3d, 0xf8, 0x3f, 0x20, 0x3d, 0xa8, 0x30, 0x0b, 0xbd, 0x09, 0x71,
+ 0x02, 0xbd, 0xb7, 0xbc, 0x80, 0x3d, 0x9e, 0x24, 0x48, 0x3d, 0xbb, 0xe7, 0xa6,
+ 0x3d, 0x59, 0xd4, 0x28, 0xbd, 0x98, 0x85, 0x14, 0xbc, 0x25, 0xbe, 0xae, 0x3c,
+ 0x1b, 0x82, 0x85, 0x3c, 0x6c, 0x23, 0xc3, 0x3c, 0x7a, 0xe2, 0x03, 0xbd, 0x75,
+ 0x65, 0x3a, 0x3d, 0x9e, 0x34, 0x76, 0x3b, 0xe1, 0x36, 0x05, 0x3d, 0xd6, 0x9a,
+ 0x37, 0xbd, 0x66, 0x1c, 0x99, 0x3c, 0x9d, 0x65, 0x2a, 0xbd, 0xc3, 0xdd, 0x60,
+ 0xbc, 0x6c, 0xa8, 0x06, 0xbd, 0xb8, 0xb4, 0x85, 0xbd, 0xca, 0x5d, 0x65, 0x3c,
+ 0xe2, 0xce, 0xfa, 0x3c, 0x18, 0xe2, 0x29, 0x3d, 0x4a, 0xd0, 0x31, 0xbc, 0x78,
+ 0xd4, 0x52, 0x3d, 0x7a, 0x03, 0x47, 0x3d, 0x0e, 0x3a, 0xde, 0xbc, 0xd1, 0x1c,
+ 0x72, 0xbd, 0x39, 0xb2, 0x8c, 0xbd, 0x1a, 0x1c, 0xba, 0xbd, 0x20, 0x30, 0x5e,
+ 0x3b, 0x4b, 0x1f, 0x40, 0xbc, 0x70, 0x8b, 0xbd, 0x3c, 0x02, 0x15, 0x12, 0xbd,
+ 0x92, 0x7d, 0x52, 0xbd, 0x98, 0x66, 0x78, 0xbc, 0x73, 0x75, 0x74, 0x3d, 0x91,
+ 0x42, 0x88, 0x3d, 0x8a, 0x00, 0x26, 0xbd, 0xca, 0xd7, 0x86, 0x3d, 0xea, 0xcb,
+ 0x66, 0xbd, 0xb8, 0x28, 0x26, 0x3c, 0xd5, 0x36, 0x90, 0xbd, 0xfa, 0x19, 0x5a,
+ 0x3d, 0xb2, 0x02, 0x81, 0xbd, 0xe3, 0x63, 0x8d, 0x3d, 0xad, 0x2e, 0x0e, 0x3d,
+ 0x01, 0x74, 0x4b, 0xbd, 0xa3, 0x91, 0x08, 0x3d, 0x6d, 0xa0, 0x23, 0xbd, 0x84,
+ 0xbd, 0x0a, 0xbd, 0x28, 0x54, 0x95, 0xba, 0x1c, 0x4a, 0x2f, 0x3d, 0xf0, 0x67,
+ 0xaf, 0xbc, 0xcc, 0x1e, 0x18, 0x3d, 0xd5, 0xf0, 0x29, 0x3d, 0xd9, 0x19, 0x0a,
+ 0xbc, 0x91, 0xf8, 0x1c, 0xbc, 0xf0, 0x4b, 0x1a, 0x3d, 0xc8, 0xdc, 0x52, 0xbc,
+ 0x65, 0x2b, 0x6c, 0xbd, 0x9f, 0x08, 0x9a, 0xbd, 0x11, 0xd4, 0x9e, 0xbc, 0xb0,
+ 0xa3, 0x0d, 0x3c, 0x20, 0x50, 0xd7, 0x3c, 0x65, 0xfc, 0xb7, 0xbc, 0x43, 0xf5,
+ 0x0d, 0xbd, 0xb9, 0x3c, 0x2a, 0x3d, 0x66, 0xb3, 0x5b, 0x3d, 0x6d, 0x26, 0xa0,
+ 0x3d, 0x3a, 0xc0, 0x15, 0xbb, 0x67, 0x1b, 0x0b, 0x3c, 0x20, 0x72, 0xa6, 0xbd,
+ 0xe2, 0x14, 0xa5, 0xbc, 0x37, 0x10, 0x92, 0x3d, 0x24, 0x2d, 0x1c, 0x3d, 0x47,
+ 0xbd, 0x2b, 0xbd, 0x68, 0x0f, 0xa5, 0x3d, 0x96, 0x58, 0x98, 0x3d, 0x25, 0x20,
+ 0xd3, 0x3b, 0xc2, 0x1b, 0xbd, 0x3d, 0x17, 0x2a, 0xa5, 0xbb, 0x34, 0x7e, 0x47,
+ 0x3d, 0x36, 0xb6, 0xd0, 0x3b, 0x6a, 0xba, 0xf3, 0x3c, 0x54, 0x95, 0x25, 0xbd,
+ 0x99, 0x51, 0x81, 0x3d, 0xe6, 0x1b, 0x20, 0xbc, 0x2e, 0xc2, 0x3b, 0xbd, 0xb8,
+ 0xa6, 0x17, 0xbd, 0x86, 0x1f, 0xd7, 0x3c, 0x60, 0x69, 0x8d, 0x3d, 0x00, 0x02,
+ 0x76, 0xbd, 0x86, 0xdb, 0x85, 0x3b, 0x52, 0xb1, 0xd7, 0x3d, 0x7c, 0xd1, 0x4f,
+ 0xbd, 0xb0, 0xe7, 0x13, 0xbd, 0xee, 0xe2, 0x0f, 0x3d, 0x2e, 0x0a, 0x11, 0xbd,
+ 0x59, 0x7e, 0x04, 0xbd, 0xf1, 0xdf, 0x10, 0xbc, 0x9f, 0xfd, 0x90, 0xbc, 0x0a,
+ 0xec, 0x47, 0x3c, 0x9b, 0x06, 0x5a, 0x3d, 0x0e, 0xe3, 0xee, 0xbc, 0x3b, 0xbf,
+ 0xc7, 0x3b, 0x1e, 0xc7, 0x17, 0xbd, 0x65, 0x6d, 0x75, 0x3c, 0x81, 0x92, 0xc3,
+ 0x3c, 0xee, 0x48, 0x9e, 0x3c, 0x6d, 0x2e, 0x4f, 0xbd, 0x42, 0x85, 0x64, 0xbd,
+ 0xe9, 0x0a, 0xbb, 0xbc, 0x73, 0x3f, 0x40, 0xbd, 0xbd, 0x8c, 0xae, 0x3b, 0x4a,
+ 0xae, 0x31, 0x3d, 0x9e, 0x39, 0xfd, 0x3c, 0xd7, 0x4e, 0xe0, 0xbd, 0xf6, 0x05,
+ 0x05, 0xbd, 0xbf, 0x61, 0x31, 0x3c, 0xba, 0x2f, 0x51, 0x3d, 0x16, 0xef, 0xdd,
+ 0x3c, 0x23, 0x64, 0x18, 0x3c, 0x44, 0x4b, 0xce, 0xbc, 0x13, 0xbd, 0xd7, 0xbc,
+ 0xc8, 0xc8, 0xb8, 0xbc, 0x76, 0x69, 0x19, 0xbd, 0x76, 0x51, 0x9c, 0xbd, 0xbe,
+ 0xbc, 0x7d, 0x3d, 0xa3, 0xa2, 0x74, 0x3d, 0xfe, 0xad, 0x06, 0x3c, 0x74, 0xb4,
+ 0x0f, 0x3b, 0x9f, 0x83, 0x8d, 0x3d, 0xa5, 0x84, 0x70, 0x3d, 0x99, 0xa1, 0xe6,
+ 0xbc, 0xf2, 0xf1, 0xbd, 0xbc, 0x29, 0xd8, 0x42, 0xbc, 0x48, 0xb0, 0xa7, 0x3c,
+ 0xce, 0x31, 0x0b, 0xbd, 0x8b, 0xef, 0x39, 0x3d, 0xc5, 0x28, 0xa4, 0x3c, 0xcd,
+ 0x1b, 0xb7, 0x3c, 0x3f, 0x50, 0x55, 0xbd, 0xf4, 0xa8, 0x9d, 0x3d, 0xe3, 0xdb,
+ 0xac, 0x3c, 0x5c, 0xae, 0x68, 0xbc, 0x8e, 0xf1, 0x0f, 0xbc, 0x17, 0x29, 0x87,
+ 0x3c, 0x19, 0x45, 0x23, 0xbd, 0xf0, 0x0f, 0x12, 0xbd, 0x06, 0x74, 0x8b, 0xbd,
+ 0x10, 0x65, 0x00, 0x3d, 0xa3, 0x9d, 0x8a, 0x3d, 0x1e, 0xf4, 0x3d, 0x3d, 0x4e,
+ 0x40, 0x7b, 0x3c, 0xa0, 0xc8, 0xf7, 0xbb, 0x2e, 0x19, 0x1a, 0xbc, 0x37, 0x47,
+ 0x36, 0xbd, 0x8b, 0x65, 0x6d, 0x3d, 0xc0, 0xcd, 0x21, 0xbd, 0x60, 0xb6, 0xa3,
+ 0xbb, 0xa9, 0x58, 0x42, 0xbc, 0x94, 0x1c, 0x73, 0xbd, 0x82, 0xa5, 0xad, 0xbc,
+ 0x51, 0xe5, 0xb5, 0x3d, 0xbd, 0xa1, 0x59, 0x3d, 0x13, 0x5b, 0xdb, 0xbc, 0x44,
+ 0xdc, 0xd3, 0xbc, 0xc8, 0x3f, 0xa5, 0x3d, 0x5d, 0x7c, 0x68, 0x3d, 0xcd, 0xb4,
+ 0xa7, 0xbc, 0x58, 0x2b, 0x48, 0x3d, 0xe6, 0x22, 0xf6, 0xbc, 0xde, 0x4b, 0x0b,
+ 0xbd, 0x71, 0x8f, 0x44, 0xbd, 0x8d, 0xa0, 0x17, 0xbd, 0xd3, 0xd3, 0x36, 0x3d,
+ 0x40, 0x04, 0x3c, 0xbd, 0x4a, 0xdf, 0x82, 0x3b, 0x23, 0x72, 0x20, 0x3d, 0xf5,
+ 0x84, 0x80, 0xbd, 0xf9, 0x1c, 0xf3, 0xbc, 0x84, 0xd9, 0x86, 0xbd, 0x28, 0x42,
+ 0x48, 0xbd, 0x90, 0xd7, 0x32, 0x3d, 0x80, 0x98, 0x01, 0xbc, 0x7f, 0x7a, 0x82,
+ 0xbd, 0x59, 0x12, 0xf3, 0x3c, 0x9b, 0x63, 0xaa, 0xbc, 0x5e, 0x84, 0xb5, 0xbd,
+ 0x95, 0x77, 0x90, 0x3d, 0xad, 0x26, 0xb4, 0xbd, 0xda, 0xfb, 0x0a, 0xbd, 0x44,
+ 0x70, 0x73, 0x3d, 0x70, 0x45, 0x41, 0x3d, 0xe6, 0x6b, 0x73, 0x3c, 0x93, 0x01,
+ 0x78, 0xbd, 0xc3, 0xda, 0xa2, 0x3d, 0x46, 0x41, 0x83, 0x3d, 0x16, 0x40, 0x32,
+ 0x3d, 0xa7, 0xfb, 0xa7, 0xbd, 0xc0, 0x57, 0x28, 0x3b, 0xd0, 0x2b, 0x84, 0xbc,
+ 0x85, 0x89, 0x88, 0x3d, 0xc4, 0xa3, 0x8f, 0xbc, 0xbb, 0xc6, 0x96, 0xbd, 0x7c,
+ 0xae, 0x36, 0xbd, 0xf8, 0x8b, 0x85, 0x3d, 0xfa, 0x35, 0xf5, 0x3c, 0xad, 0x86,
+ 0x63, 0xbc, 0x7c, 0xc1, 0x54, 0x3d, 0xad, 0xfc, 0x09, 0xbd, 0x3a, 0x1f, 0xf2,
+ 0x3c, 0xf4, 0x35, 0x65, 0x3c, 0xd0, 0x53, 0x38, 0xbd, 0x99, 0xf8, 0x36, 0x3d,
+ 0x95, 0xaf, 0x67, 0x3d, 0xd2, 0x76, 0x44, 0x3d, 0x03, 0x46, 0x82, 0x3d, 0xdc,
+ 0xe2, 0x53, 0xbd, 0x49, 0x59, 0x7b, 0xbd, 0x1c, 0x8b, 0xaf, 0x3a, 0x80, 0x30,
+ 0x27, 0xbd, 0xdb, 0x9c, 0x87, 0xbd, 0x8e, 0x09, 0x5c, 0x3d, 0x5e, 0x5d, 0x5d,
+ 0x3d, 0xcc, 0x97, 0xaa, 0xbb, 0x81, 0xe0, 0xb9, 0xbc, 0x61, 0x3a, 0x9a, 0x3b,
+ 0xc9, 0x99, 0x9f, 0x3d, 0x2d, 0x52, 0x10, 0xbd, 0x90, 0x0b, 0xa1, 0x3c, 0xaf,
+ 0x88, 0x81, 0xbd, 0xf4, 0x7a, 0x89, 0xbc, 0xb3, 0xe1, 0xc5, 0xbc, 0x8e, 0xe5,
+ 0x8a, 0xbd, 0x6d, 0xd9, 0x70, 0x3b, 0xdd, 0x1b, 0xa1, 0x3c, 0xdd, 0xeb, 0x42,
+ 0xbd, 0x01, 0xcb, 0xf2, 0x3c, 0x8e, 0x4f, 0xff, 0xbc, 0x28, 0x5e, 0x6a, 0xbc,
+ 0x3f, 0xff, 0x26, 0x3d, 0xc4, 0xfa, 0x87, 0xbc, 0xcb, 0x5e, 0x32, 0xbd, 0x1f,
+ 0xb7, 0xd1, 0xbd, 0x40, 0xb6, 0x8b, 0x3c, 0x22, 0xf5, 0xa5, 0xbc, 0x5e, 0xa1,
+ 0xf7, 0xbc, 0x1a, 0x43, 0x11, 0x3d, 0xc9, 0xfe, 0x18, 0xbd, 0x34, 0x8b, 0x2f,
+ 0x3d, 0x2f, 0xe3, 0x8d, 0x3d, 0xaf, 0x7b, 0x69, 0xbd, 0x63, 0x9d, 0xac, 0x3d,
+ 0xce, 0x45, 0x50, 0xbd, 0xe1, 0x8f, 0x6b, 0xbd, 0x6e, 0xc6, 0x07, 0xbd, 0x58,
+ 0x1e, 0x12, 0x3c, 0x79, 0xdd, 0x06, 0x3d, 0xea, 0x26, 0x83, 0xbd, 0xaa, 0x63,
+ 0xce, 0x3d, 0x3a, 0xb3, 0x81, 0x3b, 0x35, 0x9a, 0xc6, 0x3c, 0x27, 0xc4, 0x59,
+ 0xbd, 0x74, 0x21, 0x30, 0x3d, 0xfe, 0x21, 0x8f, 0xbc, 0xb2, 0x86, 0x78, 0xbc,
+ 0xbb, 0x4f, 0xd7, 0xbd, 0xda, 0xfe, 0x2c, 0xbd, 0x7b, 0x99, 0x21, 0x3b, 0x61,
+ 0xe4, 0x68, 0xbd, 0x66, 0xfd, 0xb2, 0xba, 0xbe, 0x3d, 0x53, 0x3d, 0x53, 0x3f,
+ 0x5c, 0xbd, 0x5b, 0xf9, 0xc4, 0x3c, 0x1c, 0xa3, 0x6c, 0x3d, 0x61, 0x44, 0xfa,
+ 0x3c, 0x35, 0xb8, 0xd9, 0x3c, 0x6d, 0x40, 0xc8, 0xbc, 0xbf, 0x20, 0x2a, 0x3d,
+ 0x84, 0xbd, 0x80, 0x3c, 0x19, 0x27, 0x1c, 0x3d, 0xc8, 0xf0, 0x56, 0x3c, 0x74,
+ 0x85, 0x29, 0x3c, 0xce, 0x5a, 0x91, 0xbc, 0x1f, 0xc3, 0x89, 0xbc, 0x8a, 0xec,
+ 0x62, 0x3d, 0xd0, 0xc0, 0xd2, 0xbb, 0x29, 0x30, 0x36, 0x3d, 0x71, 0xd4, 0xaf,
+ 0x3c, 0x29, 0x52, 0xb9, 0xbc, 0x33, 0xc8, 0x2c, 0x3a, 0x97, 0x8e, 0x18, 0xbb,
+ 0xda, 0xa7, 0x28, 0xbd, 0xaf, 0x8c, 0xc1, 0xbc, 0x62, 0xbb, 0xc7, 0x3b, 0xda,
+ 0x12, 0xbb, 0xbc, 0x7a, 0xfb, 0x3a, 0xbd, 0x04, 0xc0, 0xe3, 0x3c, 0x0f, 0x84,
+ 0xdd, 0xbd, 0xa4, 0x83, 0x87, 0x3d, 0x38, 0x8b, 0x5f, 0xbd, 0x60, 0xb4, 0x98,
+ 0x3c, 0x99, 0xef, 0x5d, 0x3b, 0xda, 0x0b, 0x83, 0x3d, 0x49, 0xf9, 0x93, 0x3d,
+ 0xe4, 0x29, 0x51, 0xbd, 0x5e, 0x33, 0x4b, 0xbd, 0x7a, 0xc5, 0xd5, 0x3b, 0xc2,
+ 0xbc, 0x67, 0x3d, 0x89, 0xa1, 0x55, 0xbd, 0x91, 0x0f, 0x55, 0x3d, 0xf8, 0x89,
+ 0x82, 0xbd, 0x4c, 0xdc, 0xc6, 0xbc, 0xc9, 0xb0, 0x3e, 0xbd, 0x7c, 0x95, 0x25,
+ 0x3d, 0xa2, 0x9f, 0xe1, 0x3b, 0x17, 0xcf, 0x90, 0xbb, 0xd6, 0x9c, 0x47, 0x3b,
+ 0xf6, 0x12, 0x74, 0x3d, 0xba, 0x2e, 0xde, 0x3c, 0x3e, 0x06, 0x74, 0x3d, 0x32,
+ 0x23, 0x5e, 0xbc, 0x02, 0xf3, 0x88, 0xbd, 0x16, 0x5d, 0xdd, 0xbc, 0x50, 0x9b,
+ 0x0a, 0xbd, 0x8e, 0x56, 0xb9, 0xbc, 0xc8, 0x8b, 0x18, 0x3d, 0xfd, 0x15, 0x80,
+ 0x3d, 0x4c, 0x97, 0x5a, 0xbc, 0xe2, 0x63, 0xa4, 0xbc, 0xc3, 0x3d, 0x84, 0xbc,
+ 0x7e, 0xa2, 0x83, 0x3b, 0x6e, 0x8b, 0x4e, 0x3c, 0x24, 0xb4, 0xb3, 0xbb, 0x03,
+ 0x9e, 0xfd, 0x3b, 0xa4, 0x8b, 0x53, 0x3d, 0xbc, 0x81, 0x61, 0xbd, 0x59, 0xde,
+ 0x48, 0x3d, 0x21, 0x16, 0x61, 0xbd, 0x31, 0xbc, 0x1c, 0xbd, 0xfc, 0xe8, 0xf4,
+ 0x3c, 0x88, 0x36, 0x59, 0x3d, 0x12, 0x10, 0xf8, 0xbb, 0xe4, 0x7b, 0x5f, 0xbc,
+ 0xf0, 0x9d, 0x9e, 0x3c, 0xfb, 0x94, 0xdb, 0xbc, 0x54, 0x67, 0x65, 0xbc, 0x5e,
+ 0x6e, 0x3b, 0xbd, 0x12, 0x92, 0x59, 0x3c, 0xf3, 0x69, 0x8b, 0x3b, 0x78, 0x99,
+ 0xdd, 0x3c, 0x85, 0x31, 0x21, 0x3d, 0xe4, 0x6c, 0x33, 0x3d, 0x9c, 0x58, 0x87,
+ 0xbd, 0xd9, 0xf5, 0x31, 0xbc, 0xce, 0xac, 0xb9, 0x3d, 0x0e, 0x2c, 0x5c, 0x3d,
+ 0x6a, 0x94, 0xa9, 0x3d, 0x0e, 0xca, 0x4d, 0xbc, 0x68, 0x0f, 0x4d, 0xbd, 0xd5,
+ 0x31, 0xa6, 0xbc, 0xf1, 0xdc, 0x9b, 0x3d, 0x71, 0x4d, 0xfd, 0xbc, 0xcc, 0x43,
+ 0x1a, 0x3d, 0x1f, 0x4f, 0x51, 0x3d, 0xf0, 0x07, 0xa4, 0x3b, 0x1a, 0x75, 0x40,
+ 0x3d, 0xf6, 0xef, 0x13, 0x3d, 0x58, 0x08, 0x04, 0xbd, 0xf3, 0x55, 0x58, 0x3d,
+ 0x55, 0x7e, 0x6d, 0xbd, 0x96, 0x39, 0x78, 0xbd, 0x19, 0x7d, 0x7f, 0xbd, 0xc3,
+ 0x4a, 0x9a, 0xbd, 0x64, 0xad, 0x24, 0x3d, 0xc8, 0xab, 0x10, 0x3b, 0xa2, 0x7f,
+ 0x76, 0xbd, 0xdd, 0xb6, 0x2e, 0x3d, 0xdb, 0xbf, 0x88, 0x3d, 0x49, 0x2e, 0xbd,
+ 0xbb, 0xdb, 0xdc, 0x86, 0x3d, 0x06, 0xf9, 0x85, 0xbd, 0x3c, 0x44, 0x39, 0xbc,
+ 0x8b, 0x1c, 0x32, 0x3d, 0xf6, 0x3c, 0x7a, 0x3d, 0x68, 0x1f, 0x13, 0xbd, 0x1d,
+ 0x1c, 0xed, 0x3c, 0xa8, 0x9b, 0x08, 0xbc, 0xe4, 0x25, 0xf6, 0xbc, 0xf6, 0xd8,
+ 0x19, 0xbd, 0x24, 0x39, 0x2f, 0xbd, 0x59, 0x25, 0x86, 0xbd, 0xbf, 0xf8, 0x78,
+ 0xbd, 0x33, 0xec, 0x93, 0xbd, 0x65, 0xdd, 0x55, 0xbd, 0x9d, 0x16, 0x05, 0xbd,
+ 0x69, 0xe6, 0x79, 0x3d, 0x64, 0xfd, 0xf0, 0xbc, 0xf7, 0xa3, 0x63, 0xbc, 0xb4,
+ 0x5f, 0xdb, 0xbc, 0x72, 0x22, 0x13, 0x3d, 0x0e, 0x28, 0x03, 0xbd, 0x64, 0x4b,
+ 0xad, 0x3c, 0xcb, 0x9c, 0x15, 0xbd, 0x58, 0x24, 0x55, 0x3d, 0x85, 0x90, 0x18,
+ 0xbc, 0x87, 0xb7, 0x95, 0x3d, 0x5e, 0xd9, 0x78, 0xbd, 0xa6, 0x19, 0x80, 0x3d,
+ 0xd3, 0xf6, 0x08, 0x3d, 0x8c, 0x74, 0x43, 0xbd, 0x06, 0x77, 0x8f, 0xbd, 0x68,
+ 0xc4, 0x6f, 0xbd, 0x6f, 0x45, 0x03, 0x3b, 0xb4, 0xf9, 0x9c, 0x3c, 0xe2, 0x85,
+ 0x8f, 0x3c, 0x3a, 0x70, 0x92, 0x3d, 0x06, 0xaa, 0x28, 0xbd, 0x51, 0x46, 0xc2,
+ 0xbd, 0x39, 0xf2, 0x8f, 0x3d, 0xda, 0xbd, 0x4e, 0x3d, 0x68, 0x6d, 0x57, 0xbc,
+ 0xb3, 0x41, 0x8b, 0x3d, 0xa8, 0x83, 0xa3, 0xbc, 0x3a, 0x05, 0xbf, 0xbc, 0x5b,
+ 0x8d, 0x6e, 0x3d, 0xfa, 0x17, 0x8b, 0xbd, 0xff, 0x33, 0x03, 0x3c, 0x4e, 0x35,
+ 0x6d, 0xbb, 0xf5, 0x98, 0x31, 0xbd, 0xfe, 0x46, 0x20, 0x3c, 0xb7, 0x91, 0x5d,
+ 0x3d, 0xa9, 0x64, 0x97, 0x3c, 0xd8, 0x6a, 0x59, 0xbd, 0x0b, 0xfb, 0x7c, 0x3d,
+ 0x05, 0xf1, 0x26, 0xbd, 0xd4, 0xfd, 0x2a, 0x3d, 0x70, 0xca, 0x1d, 0x3d, 0x76,
+ 0x80, 0xc7, 0xbc, 0xfa, 0x43, 0x7e, 0x3d, 0x6e, 0xda, 0xb6, 0x3c, 0x63, 0x63,
+ 0x25, 0xbd, 0x39, 0xad, 0x9c, 0xbc, 0x89, 0xa0, 0xbf, 0xbd, 0xc7, 0xd6, 0x19,
+ 0x3d, 0x36, 0x1d, 0x22, 0x3c, 0x11, 0x87, 0x8b, 0xbd, 0xa8, 0x59, 0x39, 0xbd,
+ 0xe4, 0x1d, 0x02, 0x3c, 0xf1, 0x0d, 0xf7, 0xbd, 0x16, 0x10, 0xb8, 0x3b, 0x03,
+ 0xfc, 0xa4, 0x3c, 0x32, 0x06, 0x8f, 0xbc, 0x47, 0x59, 0xa3, 0xbc, 0xac, 0x7f,
+ 0xda, 0xbc, 0x4b, 0x26, 0x80, 0x3d, 0x73, 0x33, 0x31, 0xbc, 0x83, 0x75, 0x98,
+ 0xbd, 0xb7, 0x95, 0x65, 0xbd, 0x64, 0x01, 0x21, 0xbd, 0xb8, 0x86, 0x8a, 0x3b,
+ 0xe5, 0x85, 0x4a, 0xbd, 0xe5, 0xc1, 0x45, 0xbc, 0x97, 0x00, 0xab, 0x3c, 0xb6,
+ 0x55, 0x1b, 0xbd, 0x41, 0xcb, 0x01, 0x3d, 0x3c, 0x4e, 0x2f, 0xbc, 0x4c, 0x54,
+ 0xad, 0x3c, 0x70, 0xec, 0x58, 0x3c, 0x57, 0x6e, 0xf9, 0x3c, 0xac, 0xa8, 0x28,
+ 0xbd, 0xea, 0x4c, 0xce, 0xbb, 0x5f, 0x87, 0x1d, 0xbd, 0x0d, 0xe2, 0x5c, 0x3d,
+ 0x1d, 0x21, 0x31, 0xbd, 0xf5, 0x47, 0xd7, 0xbd, 0xb5, 0xd5, 0x0c, 0xbd, 0x81,
+ 0x2b, 0xff, 0x3c, 0x40, 0x81, 0xd2, 0x3c, 0xc3, 0x64, 0x77, 0x3c, 0xd6, 0xdd,
+ 0xc9, 0xbc, 0xee, 0x42, 0x9e, 0xbc, 0x4a, 0xdb, 0x3c, 0x3d, 0xc2, 0x58, 0x82,
+ 0x3d, 0xfa, 0x36, 0x24, 0xbd, 0x36, 0x2e, 0x86, 0x3d, 0x68, 0xee, 0x5e, 0xbd,
+ 0x3c, 0x29, 0x1e, 0xbc, 0x80, 0x1f, 0x88, 0xbd, 0x27, 0xab, 0xb7, 0xbc, 0xce,
+ 0x18, 0xa7, 0xbd, 0xf6, 0x96, 0xa7, 0xbc, 0xde, 0x1b, 0x0a, 0xbd, 0x15, 0x9b,
+ 0x1d, 0x3c, 0x2e, 0xb4, 0x9d, 0x3d, 0x61, 0xba, 0xbe, 0xbc, 0xb8, 0xc8, 0x6a,
+ 0x3d, 0xcc, 0x06, 0xa8, 0xbd, 0x83, 0xae, 0x13, 0xbc, 0x3d, 0xb4, 0x4c, 0xbd,
+ 0xcc, 0xb5, 0x65, 0xbc, 0x0d, 0xad, 0x8b, 0x3c, 0x0e, 0x2f, 0x91, 0x3c, 0x1a,
+ 0xfa, 0x1e, 0x3d, 0xbf, 0xe3, 0xf8, 0x3c, 0x21, 0x8d, 0x8c, 0xbc, 0x30, 0x1b,
+ 0xcb, 0xbc, 0x34, 0x68, 0xf2, 0x3a, 0xed, 0x13, 0x0f, 0xbd, 0x66, 0x39, 0x61,
+ 0xbd, 0xee, 0x87, 0x42, 0x3d, 0xc0, 0x58, 0x69, 0xbc, 0x3e, 0xe4, 0xd5, 0x3c,
+ 0x46, 0x68, 0x30, 0xbd, 0x6c, 0x68, 0xad, 0x3c, 0x36, 0x63, 0x13, 0x3d, 0x0c,
+ 0xf5, 0xf7, 0xbc, 0x56, 0x99, 0x71, 0x3d, 0x4a, 0xba, 0x10, 0x3d, 0xfc, 0xba,
+ 0x3e, 0x3d, 0x5a, 0xd8, 0x82, 0x3d, 0x70, 0x17, 0x92, 0xbd, 0x0f, 0x9b, 0x77,
+ 0xbd, 0x06, 0x4d, 0x78, 0x3d, 0xcb, 0x90, 0x96, 0x3d, 0xa5, 0x6d, 0x04, 0xbd,
+ 0x4a, 0x4f, 0x0f, 0xbc, 0x83, 0x77, 0x3a, 0x3d, 0xdf, 0x43, 0x39, 0x3d, 0x17,
+ 0x17, 0xf7, 0x3c, 0x3d, 0x1a, 0x44, 0xbd, 0x42, 0x1b, 0xdb, 0xbc, 0x1f, 0x26,
+ 0x82, 0xbd, 0xfd, 0x51, 0xa5, 0x3d, 0xc5, 0x70, 0x45, 0x3d, 0x00, 0x17, 0xa1,
+ 0x3c, 0xe1, 0x5c, 0x56, 0xbd, 0x57, 0x8c, 0xe6, 0xbc, 0x87, 0x07, 0xef, 0x3b,
+ 0x9b, 0x41, 0xbf, 0xbd, 0xa1, 0x85, 0xd5, 0x3c, 0x07, 0x20, 0x0a, 0xbd, 0xc0,
+ 0x19, 0xf3, 0xbb, 0x1f, 0xb5, 0xba, 0x3b, 0xa0, 0x79, 0x86, 0xbc, 0x62, 0x56,
+ 0x40, 0xbd, 0x51, 0xf1, 0xa8, 0x3c, 0x83, 0x80, 0x86, 0x3c, 0x18, 0x2b, 0x2d,
+ 0x3d, 0x8d, 0x66, 0xb6, 0x3c, 0x1d, 0xac, 0x2e, 0xbd, 0x91, 0xbc, 0x3e, 0xbd,
+ 0xfb, 0x80, 0x75, 0x3d, 0x7d, 0xa1, 0x54, 0xba, 0x0f, 0xd1, 0x2f, 0xbd, 0xcb,
+ 0x3a, 0x14, 0xbd, 0x76, 0xd3, 0x82, 0xbc, 0x15, 0x06, 0xf5, 0x39, 0xa4, 0xdb,
+ 0x6e, 0x3d, 0x42, 0x46, 0xb7, 0x3c, 0xa3, 0x20, 0x00, 0x3d, 0xfc, 0x4f, 0x2b,
+ 0xbd, 0x06, 0xb1, 0x7e, 0x3d, 0xf8, 0x37, 0xc9, 0xbc, 0x0d, 0x90, 0xd7, 0xbc,
+ 0xb7, 0x8e, 0x0e, 0x3d, 0x68, 0xd8, 0x1d, 0xbc, 0x57, 0xb5, 0x11, 0x3d, 0x68,
+ 0x20, 0x0b, 0x3d, 0x85, 0xda, 0x1e, 0xbd, 0xe0, 0xc0, 0x6b, 0xbd, 0x44, 0x69,
+ 0x96, 0xbd, 0xec, 0xbd, 0x38, 0xbc, 0x09, 0x65, 0x85, 0xbd, 0xb4, 0xf4, 0x57,
+ 0xbd, 0x35, 0xe4, 0xb2, 0xbc, 0xf7, 0x90, 0xd0, 0x3c, 0x78, 0xd1, 0x83, 0xbd,
+ 0xe7, 0x8d, 0x1b, 0xbd, 0x49, 0xa3, 0x94, 0x3d, 0x56, 0xf3, 0x44, 0xbd, 0xb2,
+ 0xce, 0x5e, 0x3d, 0x42, 0x8e, 0x37, 0xbd, 0x22, 0x3e, 0x79, 0xbd, 0xa0, 0x71,
+ 0x6c, 0x3d, 0x23, 0x13, 0xb3, 0xbb, 0x0d, 0x32, 0x21, 0x3c, 0x35, 0x5e, 0xfd,
+ 0xba, 0x0d, 0x0c, 0xbd, 0x3b, 0xcb, 0x0c, 0xaa, 0xbb, 0x33, 0xe8, 0x08, 0xbd,
+ 0x43, 0x7a, 0xa5, 0xbc, 0x15, 0x50, 0x89, 0x3d, 0xd1, 0x86, 0x5b, 0x3d, 0x2a,
+ 0xd8, 0x4c, 0x3d, 0xe1, 0x63, 0x19, 0xbc, 0xee, 0xf0, 0x6f, 0x3d, 0xfa, 0xc2,
+ 0x44, 0x3d, 0x88, 0x3c, 0x6b, 0xbd, 0xe3, 0x24, 0xbb, 0xbc, 0x4c, 0xe6, 0x21,
+ 0x3b, 0x47, 0xf2, 0xa1, 0xbc, 0x46, 0x96, 0xfd, 0x3c, 0x4c, 0x21, 0x86, 0xbd,
+ 0x32, 0x28, 0x83, 0xbc, 0x70, 0x39, 0xa0, 0xbd, 0x80, 0xca, 0x4d, 0xbd, 0xc4,
+ 0x91, 0x8d, 0xbc, 0xab, 0xae, 0x08, 0x3c, 0x54, 0xff, 0xb5, 0xbb, 0x76, 0xae,
+ 0xbe, 0x3c, 0xd8, 0xd1, 0xa5, 0x3d, 0x03, 0x0c, 0x44, 0x3d, 0x92, 0x96, 0x40,
+ 0xbd, 0xd5, 0xc5, 0x1f, 0x3d, 0xdf, 0x09, 0xc0, 0x3c, 0xfb, 0x0d, 0x5f, 0x3d,
+ 0xfd, 0x07, 0x04, 0x3d, 0x1c, 0x43, 0x9a, 0xbd, 0xd7, 0x14, 0x72, 0xbd, 0x2d,
+ 0x50, 0x84, 0xbd, 0x6a, 0x16, 0x7d, 0x38, 0xa6, 0xff, 0x90, 0x3d, 0x44, 0xb7,
+ 0xcc, 0x3c, 0x5d, 0x5f, 0x69, 0xbd, 0x92, 0x8d, 0x6d, 0x3d, 0xf9, 0x02, 0x99,
+ 0xbc, 0xe5, 0x7a, 0xc5, 0xbd, 0xde, 0x5c, 0x69, 0x3d, 0xee, 0xbf, 0xf4, 0x3c,
+ 0x92, 0x19, 0x96, 0x3d, 0xf3, 0x5b, 0x35, 0xbd, 0xf3, 0x90, 0x3b, 0x3d, 0x90,
+ 0xe2, 0xc2, 0xbc, 0x98, 0x91, 0xf9, 0xbc, 0x3b, 0x3b, 0x82, 0xbd, 0xb0, 0x85,
+ 0x30, 0x3d, 0x14, 0x12, 0xea, 0xbc, 0x21, 0x84, 0x8c, 0x3d, 0x93, 0xcd, 0x65,
+ 0x3d, 0xc9, 0x26, 0xda, 0xbc, 0xd5, 0xc3, 0x4e, 0x3c, 0xcc, 0x6e, 0x0f, 0x3d,
+ 0x8d, 0xaf, 0x47, 0x3c, 0x9c, 0xfa, 0xe1, 0x3c, 0x3c, 0xe0, 0x4c, 0x3d, 0x79,
+ 0x22, 0xed, 0x3c, 0xf4, 0x05, 0x3a, 0x3d, 0x59, 0xc0, 0x22, 0xbd, 0x5e, 0xaa,
+ 0xf8, 0xbc, 0xc4, 0xda, 0x22, 0x3c, 0x76, 0x88, 0xaf, 0x3c, 0x1c, 0xf4, 0x3b,
+ 0x3d, 0x4e, 0x6a, 0x1b, 0x3d, 0x60, 0xc7, 0x85, 0x3c, 0xb2, 0xc7, 0x75, 0x3d,
+ 0xbd, 0xe4, 0xbe, 0xbc, 0x54, 0x8e, 0x82, 0x3d, 0x36, 0x27, 0x6a, 0xbc, 0x0d,
+ 0x99, 0x00, 0xbd, 0x38, 0x5e, 0x9f, 0xbc, 0x9d, 0x49, 0xd6, 0x3d, 0xbb, 0x1a,
+ 0x85, 0x3d, 0x6f, 0x89, 0x9f, 0x3c, 0xc5, 0x0b, 0xa7, 0xbc, 0x9e, 0x5a, 0xfa,
+ 0xbc, 0xd3, 0x59, 0x50, 0xba, 0x3f, 0xc6, 0xbc, 0xbd, 0xb3, 0x9c, 0x12, 0xbd,
+ 0x05, 0x39, 0xd6, 0x3b, 0x58, 0x14, 0x0d, 0x3d, 0x63, 0x0e, 0x19, 0x3d, 0x69,
+ 0x9b, 0xa2, 0x3d, 0x68, 0x4d, 0x13, 0x3c, 0x06, 0x73, 0x64, 0xbd, 0x28, 0x79,
+ 0x3c, 0xbd, 0x26, 0x23, 0x28, 0xbc, 0xb5, 0xa2, 0xa5, 0xba, 0xf6, 0x5f, 0x89,
+ 0xbc, 0x66, 0x2e, 0x79, 0xbd, 0x90, 0xee, 0x54, 0xbc, 0x99, 0xf4, 0x4e, 0x3c,
+ 0xdb, 0xdc, 0xd0, 0xbc, 0x3f, 0xed, 0x43, 0xbd, 0x03, 0xdf, 0xf4, 0x3c, 0x7d,
+ 0x40, 0x2b, 0x3c, 0xfb, 0x1d, 0x64, 0x3d, 0xcd, 0x1f, 0xb8, 0x3d, 0xb1, 0xb2,
+ 0x0f, 0x3d, 0x30, 0xf6, 0x38, 0xbd, 0x54, 0xef, 0x84, 0xbc, 0x2f, 0x3f, 0xac,
+ 0xbd, 0xe0, 0xe1, 0xc4, 0xbc, 0x49, 0x0a, 0x03, 0xbd, 0xb8, 0x78, 0x43, 0xbc,
+ 0xbf, 0xbc, 0x80, 0x3a, 0x1a, 0x41, 0x39, 0x3d, 0xd0, 0x5d, 0x8c, 0x3d, 0x8d,
+ 0x8f, 0x5e, 0xbc, 0xfd, 0x1b, 0xed, 0xbd, 0x22, 0x7c, 0x99, 0xbc, 0x4c, 0xb3,
+ 0x1d, 0xbc, 0x10, 0xbb, 0x1c, 0x3c, 0x19, 0x89, 0xd3, 0xbc, 0x2a, 0x64, 0x37,
+ 0x3d, 0x11, 0x87, 0x00, 0x3c, 0x39, 0x0d, 0x1c, 0x3d, 0xb8, 0xeb, 0xde, 0xbc,
+ 0x26, 0x9d, 0x05, 0xbd, 0x51, 0xca, 0x0d, 0xbd, 0xa9, 0xe0, 0xbc, 0x3c, 0xd6,
+ 0x01, 0x2d, 0xbd, 0x72, 0x14, 0xd3, 0x3c, 0xf2, 0x07, 0x81, 0x3c, 0xe4, 0xbb,
+ 0x00, 0x3d, 0x0b, 0x42, 0x09, 0x3b, 0x0e, 0x99, 0x71, 0xbd, 0x32, 0x91, 0x10,
+ 0xbd, 0xa0, 0x0b, 0x05, 0xbd, 0x7f, 0xf8, 0xf6, 0x3c, 0xd4, 0x72, 0xbd, 0x3c,
+ 0xdf, 0xcc, 0x8a, 0x3d, 0x0e, 0x3d, 0x24, 0x3d, 0x71, 0x5a, 0x52, 0xbd, 0xb6,
+ 0x11, 0xda, 0xbc, 0x5b, 0xec, 0x9c, 0x3d, 0x4a, 0x73, 0xfd, 0xbc, 0xc1, 0x2b,
+ 0x9f, 0xbd, 0x06, 0xed, 0x2f, 0xbd, 0x38, 0x4c, 0x53, 0x3d, 0x36, 0x8d, 0xc1,
+ 0x3c, 0x14, 0x26, 0xa3, 0xbd, 0x2d, 0x2f, 0x0a, 0xbb, 0xfd, 0x7d, 0xa5, 0xbd,
+ 0x10, 0xbe, 0xe4, 0x3b, 0x77, 0x22, 0x6a, 0x3d, 0xdd, 0x33, 0xc3, 0x3c, 0x3e,
+ 0x8e, 0xbb, 0xbd, 0x60, 0x54, 0x81, 0x3d, 0x02, 0xcf, 0x15, 0x3d, 0x06, 0x28,
+ 0xd5, 0x3d, 0xda, 0xb6, 0x6f, 0xbd, 0xf6, 0x93, 0x86, 0xbc, 0x98, 0x16, 0x45,
+ 0x3d, 0xdc, 0x9e, 0x47, 0x3c, 0x8b, 0x3a, 0x82, 0xbd, 0x11, 0x05, 0xb6, 0xbd,
+ 0x0e, 0x26, 0xc1, 0xbc, 0xe2, 0xdc, 0xab, 0x3d, 0x10, 0x6e, 0x84, 0x3d, 0x49,
+ 0x2f, 0x1c, 0xbb, 0x0e, 0x73, 0x7a, 0x3c, 0x82, 0x17, 0x29, 0x3d, 0x88, 0x40,
+ 0x91, 0x3b, 0x2d, 0xcd, 0xf3, 0xbc, 0xcc, 0x39, 0x37, 0xbd, 0xb0, 0x03, 0x17,
+ 0x3d, 0xb8, 0xd0, 0x22, 0x3d, 0xc6, 0x69, 0x90, 0x3c, 0x09, 0x0f, 0xc2, 0x3b,
+ 0x7a, 0x64, 0xcc, 0xbc, 0x26, 0x93, 0x22, 0x3d, 0xa3, 0xe0, 0x4b, 0xbd, 0x7d,
+ 0xca, 0x2f, 0xbb, 0xda, 0x26, 0x19, 0x3d, 0xe7, 0x88, 0x47, 0xbc, 0x4e, 0x0f,
+ 0x3b, 0x3d, 0xf8, 0x1c, 0x1c, 0x3d, 0xb4, 0x23, 0x8e, 0x3d, 0xaf, 0xa6, 0x10,
+ 0xbd, 0xfc, 0x9a, 0x9c, 0x3c, 0x35, 0x69, 0x9f, 0x3d, 0xe4, 0x5f, 0x8f, 0xbd,
+ 0xc7, 0xe3, 0x98, 0x3d, 0xab, 0xb8, 0xcc, 0x3b, 0x6a, 0xa9, 0x0f, 0xbd, 0x0d,
+ 0x8a, 0x6a, 0xbd, 0x1e, 0xec, 0x10, 0x3d, 0xa0, 0x13, 0xe8, 0x3b, 0xc0, 0x77,
+ 0x93, 0x3c, 0x3f, 0x03, 0x0b, 0x3d, 0xde, 0x40, 0xb4, 0x3c, 0xfc, 0xdb, 0x06,
+ 0xbd, 0xc3, 0x86, 0x90, 0x3d, 0x54, 0x89, 0x37, 0x3d, 0x55, 0xd4, 0x8d, 0xbd,
+ 0x39, 0x31, 0xb7, 0xbc, 0xab, 0x31, 0xc0, 0xbc, 0x60, 0x17, 0xdb, 0xbb, 0x49,
+ 0xa9, 0x2f, 0xbc, 0xbf, 0xcb, 0xd6, 0x3b, 0x83, 0x93, 0x16, 0x3d, 0xba, 0xdd,
+ 0x1b, 0xbd, 0xd1, 0x6a, 0x17, 0x3d, 0x45, 0x0f, 0x1d, 0xbd, 0xa3, 0xc1, 0xb5,
+ 0xbd, 0x88, 0x0e, 0x6e, 0x3d, 0x41, 0x5d, 0x06, 0x3d, 0xd8, 0xeb, 0xb4, 0x3c,
+ 0xe5, 0xc8, 0x88, 0xbb, 0x48, 0x65, 0x47, 0x3d, 0xff, 0xe8, 0xa6, 0xbd, 0x12,
+ 0x2a, 0x10, 0xbd, 0xd0, 0x90, 0x8b, 0x3d, 0x17, 0x08, 0xfc, 0xbc, 0x8e, 0xb4,
+ 0x9a, 0xbc, 0x70, 0x79, 0x3f, 0x3d, 0xd8, 0xad, 0x06, 0x3c, 0xf8, 0x4e, 0x81,
+ 0xbd, 0x82, 0xf1, 0x71, 0xbd, 0x9f, 0x19, 0xcc, 0xbd, 0xaf, 0x6a, 0x45, 0x3d,
+ 0x4e, 0x39, 0x25, 0x3d, 0x17, 0x43, 0x74, 0x3d, 0x52, 0x51, 0x53, 0xbd, 0x53,
+ 0x10, 0x5f, 0xbd, 0x5f, 0x60, 0xf7, 0x3c, 0xf4, 0x07, 0x6d, 0x3d, 0x68, 0x1d,
+ 0x29, 0x3d, 0xd6, 0xf7, 0xad, 0xbc, 0x09, 0x0d, 0x8f, 0xbd, 0x17, 0xae, 0xd7,
+ 0x3c, 0x63, 0xf2, 0xc7, 0xbc, 0x4e, 0xa0, 0x05, 0xbd, 0x53, 0x3b, 0xc5, 0xbc,
+ 0x81, 0xf4, 0x82, 0x3d, 0x5e, 0xc9, 0x56, 0xbd, 0x32, 0xb8, 0xbd, 0xbc, 0xf2,
+ 0x3e, 0xc7, 0xbc, 0x76, 0x7f, 0x76, 0xbd, 0x19, 0x45, 0x13, 0xbd, 0xb9, 0x17,
+ 0x88, 0x3d, 0xef, 0x15, 0x68, 0xbd, 0x7a, 0xb8, 0xf6, 0x3a, 0xa8, 0x56, 0x72,
+ 0xbb, 0x96, 0x68, 0xce, 0x3d, 0x13, 0x43, 0x0a, 0xbd, 0x87, 0x3f, 0x91, 0x3c,
+ 0xd7, 0x12, 0x8b, 0x3b, 0x2f, 0x85, 0xbf, 0xbc, 0x33, 0xfc, 0x62, 0xbc, 0x5f,
+ 0xb3, 0x8f, 0xbc, 0x9f, 0x1a, 0xf5, 0xbc, 0x3b, 0x75, 0x68, 0x3d, 0x58, 0xae,
+ 0x3c, 0x3d, 0xe3, 0x00, 0x5d, 0x3d, 0xcf, 0x69, 0x9c, 0x3d, 0xdb, 0x20, 0xb3,
+ 0x39, 0x31, 0x1a, 0x7a, 0xbc, 0x11, 0x37, 0xd0, 0x3c, 0x1d, 0x5d, 0x84, 0x3d,
+ 0xb2, 0x5d, 0xe9, 0xbc, 0x24, 0x74, 0xe5, 0xbc, 0x86, 0x1d, 0xea, 0xbb, 0x65,
+ 0x94, 0x76, 0x3d, 0x9a, 0xb2, 0xeb, 0x3c, 0x62, 0x9f, 0x44, 0xbb, 0xca, 0x35,
+ 0xa8, 0xbc, 0x25, 0x51, 0x23, 0x3d, 0xa9, 0xac, 0x00, 0xbd, 0xb9, 0x13, 0xa6,
+ 0x3d, 0x3e, 0x3e, 0x10, 0xbc, 0x5f, 0x40, 0x8b, 0x3d, 0x75, 0xef, 0x70, 0x3b,
+ 0xf8, 0x66, 0xa4, 0x3c, 0x69, 0x24, 0x84, 0x3c, 0x2a, 0xd2, 0x76, 0xbc, 0x67,
+ 0xef, 0x9f, 0xbc, 0xe1, 0x67, 0xcb, 0xbc, 0xe1, 0x4c, 0xa9, 0xbd, 0x18, 0xb6,
+ 0x96, 0x3d, 0x29, 0xaa, 0x84, 0xbd, 0x80, 0x0d, 0x5b, 0x3d, 0x35, 0xe7, 0x02,
+ 0x3d, 0xea, 0xf8, 0x46, 0xbd, 0xba, 0x63, 0x42, 0x3d, 0x3e, 0x6d, 0x83, 0x3d,
+ 0x0d, 0x47, 0x3c, 0xbd, 0x79, 0xe3, 0xa1, 0x3c, 0x7b, 0x77, 0x17, 0xbd, 0x4d,
+ 0x55, 0x53, 0x3d, 0xc3, 0x91, 0x7e, 0xbd, 0x9b, 0x6b, 0x49, 0x3d, 0x30, 0xad,
+ 0xc7, 0xbc, 0xc1, 0x27, 0x3e, 0xbd, 0xea, 0xaf, 0x51, 0x3d, 0x12, 0x3a, 0x94,
+ 0xbc, 0xf1, 0x36, 0xf1, 0x3c, 0x6a, 0x5a, 0x93, 0x3b, 0x88, 0x1e, 0xb1, 0xbc,
+ 0x3c, 0x43, 0x37, 0xbd, 0x74, 0xda, 0x9a, 0xbd, 0x53, 0x3d, 0x7b, 0x3d, 0xe7,
+ 0x18, 0xdd, 0xbc, 0xba, 0x1b, 0xd9, 0xbc, 0xe8, 0x9a, 0x64, 0xbd, 0xca, 0x36,
+ 0x2b, 0x3d, 0xc6, 0x99, 0xbc, 0x3c, 0xa6, 0x76, 0x72, 0x3d, 0x59, 0x8a, 0xb5,
+ 0x3c, 0x07, 0xf8, 0xd7, 0x3d, 0xdd, 0xaf, 0x2a, 0xb8, 0x77, 0xac, 0xb7, 0x3c,
+ 0x53, 0xd6, 0x12, 0xbd, 0x19, 0x6c, 0x63, 0x3c, 0xe0, 0xf5, 0x32, 0xbd, 0x72,
+ 0xc2, 0xae, 0xbd, 0x04, 0x6b, 0x12, 0x3c, 0xea, 0x76, 0x99, 0x3d, 0x5e, 0x14,
+ 0x25, 0xbd, 0x16, 0x01, 0x01, 0xbc, 0x6d, 0x0e, 0xb8, 0x3d, 0x78, 0x70, 0x85,
+ 0x3b, 0x7b, 0xb9, 0x55, 0xbb, 0x59, 0xa4, 0x2f, 0x3d, 0xbb, 0xf1, 0x4e, 0xbc,
+ 0x6e, 0x1e, 0x6f, 0x3d, 0x6d, 0xd0, 0x82, 0x3d, 0xa1, 0x2a, 0x38, 0xbd, 0x82,
+ 0x0e, 0x81, 0x3d, 0x51, 0x1a, 0xe8, 0x3c, 0x78, 0x0f, 0xb2, 0xbc, 0xdb, 0x4a,
+ 0x9f, 0x3d, 0xeb, 0xf7, 0x5f, 0x3b, 0xf0, 0x3e, 0xe2, 0xbc, 0x9c, 0x11, 0x91,
+ 0x3c, 0xb0, 0xbd, 0x1a, 0x3c, 0xce, 0x3f, 0x1c, 0xbb, 0x0e, 0xe3, 0x0b, 0x3d,
+ 0x2e, 0x44, 0x15, 0x3d, 0x90, 0x12, 0xe8, 0x3c, 0x84, 0xb7, 0x46, 0x3d, 0x4f,
+ 0x51, 0x90, 0x3c, 0x5f, 0xee, 0xe8, 0x3c, 0x8f, 0xa8, 0xd2, 0xbb, 0x86, 0x20,
+ 0x7c, 0x3d, 0xe8, 0x1f, 0x48, 0xbc, 0xbb, 0x7f, 0x59, 0x3d, 0x62, 0xf1, 0x8a,
+ 0xbc, 0x94, 0x28, 0x0c, 0x3c, 0xdd, 0x8f, 0x1a, 0xbd, 0xad, 0x5a, 0xa8, 0x39,
+ 0x4d, 0x0c, 0x71, 0x3d, 0x96, 0xa2, 0x91, 0x3d, 0xe7, 0x9c, 0x69, 0xbc, 0x1f,
+ 0x9d, 0x0c, 0xbd, 0x6e, 0xbe, 0xe7, 0x3c, 0x97, 0x28, 0x35, 0xbd, 0x11, 0xb7,
+ 0x8c, 0xbd, 0x3b, 0xc0, 0xc1, 0x3c, 0x02, 0x96, 0xd7, 0x3c, 0x79, 0x02, 0x4d,
+ 0xbc, 0x6c, 0xad, 0xb7, 0x3c, 0x9a, 0xef, 0x29, 0x3d, 0xe9, 0x73, 0x9b, 0x3d,
+ 0x58, 0xd3, 0x17, 0x3d, 0xea, 0xcc, 0x2d, 0xbd, 0x64, 0x3a, 0x9e, 0xbd, 0x9a,
+ 0x8b, 0x3c, 0xbd, 0x4f, 0x97, 0x88, 0xbc, 0x1b, 0x18, 0x27, 0xbc, 0x22, 0xdc,
+ 0xde, 0xbd, 0xb4, 0xbe, 0x94, 0xba, 0x5a, 0xc7, 0xe0, 0x3b, 0xe9, 0xd7, 0x07,
+ 0x3c, 0xcb, 0x47, 0xf2, 0x3c, 0x04, 0xca, 0x2f, 0x3d, 0x25, 0x4d, 0xd9, 0x3c,
+ 0xc1, 0xb9, 0x37, 0xbd, 0xa1, 0x9a, 0x0c, 0x3d, 0x78, 0xae, 0x88, 0xbd, 0x02,
+ 0xb5, 0x98, 0x3d, 0x63, 0x8b, 0x79, 0xbd, 0xab, 0xe4, 0xaa, 0x3d, 0x5a, 0x1e,
+ 0x02, 0xbc, 0x16, 0x17, 0x68, 0x3b, 0xf8, 0x36, 0x0d, 0x3b, 0x1f, 0x67, 0x8c,
+ 0xbd, 0xbc, 0x52, 0xe2, 0xbc, 0x2f, 0xee, 0xe2, 0xbb, 0x46, 0x45, 0x08, 0x3d,
+ 0xd2, 0xea, 0xc9, 0x3c, 0x00, 0xcc, 0x5c, 0x3d, 0x1e, 0x1f, 0x54, 0x3c, 0x10,
+ 0x3e, 0x8e, 0x3c, 0x1e, 0x6d, 0x5f, 0xbd, 0xfb, 0xdb, 0x64, 0x3d, 0x62, 0x27,
+ 0xb5, 0xbd, 0x0a, 0x8c, 0x51, 0xbd, 0x5e, 0x4d, 0xae, 0xbd, 0xd4, 0xd2, 0x65,
+ 0x3d, 0x88, 0xc4, 0xc0, 0x3c, 0x25, 0x97, 0xb9, 0xbb, 0x6d, 0x7c, 0x5b, 0x3d,
+ 0x42, 0x2f, 0x0e, 0xbb, 0x42, 0xfc, 0xb3, 0xba, 0x38, 0x1c, 0xae, 0xbc, 0x4d,
+ 0xba, 0x7a, 0xbd, 0x15, 0xf7, 0x9d, 0x3d, 0x51, 0xc4, 0x82, 0x3d, 0x70, 0xa9,
+ 0x47, 0x3d, 0x68, 0x1c, 0xdf, 0x3c, 0xef, 0x44, 0x71, 0x3c, 0xdf, 0x7d, 0x80,
+ 0x3d, 0x6c, 0x6c, 0xcd, 0xbc, 0x9b, 0xf2, 0x68, 0x3d, 0x61, 0x10, 0x64, 0x3d,
+ 0x31, 0x19, 0xda, 0x3c, 0xc3, 0x1c, 0xdc, 0xbb, 0xe1, 0x30, 0x13, 0xbc, 0x4d,
+ 0xd5, 0xaf, 0xbb, 0x39, 0xaa, 0x43, 0xbd, 0x9a, 0x51, 0x75, 0xbd, 0xc3, 0x2b,
+ 0x5e, 0x3c, 0x2f, 0x60, 0xed, 0x3c, 0x2a, 0x8e, 0x87, 0x3d, 0x0e, 0x88, 0x08,
+ 0xbd, 0xcb, 0x1a, 0xc2, 0x3b, 0x86, 0xdb, 0x44, 0xbd, 0x3c, 0xb2, 0xd8, 0xbc,
+ 0xd8, 0x5c, 0x2a, 0x3d, 0xf9, 0xb9, 0x06, 0xbd, 0xf6, 0x2f, 0x52, 0x3d, 0xda,
+ 0x46, 0xe9, 0x3b, 0xeb, 0x10, 0xd5, 0x3c, 0x5a, 0x5a, 0x70, 0x3b, 0x58, 0xd3,
+ 0x30, 0x3c, 0xb3, 0x7e, 0x00, 0xbd, 0x81, 0x37, 0x56, 0xbd, 0x0a, 0x66, 0x12,
+ 0xbd, 0xd7, 0xca, 0x80, 0xbd, 0x89, 0x4c, 0x52, 0x3d, 0x42, 0x49, 0xab, 0x3c,
+ 0x79, 0xe8, 0xa6, 0xbd, 0xa2, 0x35, 0xd5, 0xbd, 0xa3, 0x0c, 0x0e, 0xbd, 0x4f,
+ 0x10, 0x8a, 0x3d, 0xd4, 0xbe, 0x64, 0x3d, 0x38, 0x13, 0xfd, 0x3d, 0x86, 0xc8,
+ 0x82, 0xbd, 0xd2, 0x11, 0x46, 0x3d, 0xcc, 0x13, 0x6a, 0x3d, 0x29, 0x91, 0xe2,
+ 0xbc, 0x9a, 0x59, 0xc8, 0xbc, 0x6d, 0xd3, 0x79, 0xbd, 0x00, 0x17, 0xbd, 0x3d,
+ 0x2f, 0x3d, 0x13, 0xbd, 0xf2, 0x5e, 0x5a, 0x3d, 0x91, 0xd3, 0x22, 0xbc, 0x8d,
+ 0x7d, 0xdd, 0x3c, 0xcb, 0xd3, 0x47, 0x3d, 0x51, 0x39, 0x43, 0x3d, 0x8e, 0xba,
+ 0xb3, 0x3c, 0xcf, 0xdc, 0x5d, 0xbc, 0xe8, 0xf4, 0x69, 0xbd, 0x75, 0xed, 0x4a,
+ 0xbd, 0x3e, 0xa3, 0x52, 0x3d, 0x55, 0xbe, 0x6e, 0xbd, 0x84, 0x86, 0xb3, 0xbc,
+ 0x7d, 0x3b, 0x4f, 0xbd, 0xd0, 0x9c, 0x8f, 0xbb, 0xe4, 0x9f, 0x39, 0x3d, 0x10,
+ 0x5c, 0xf0, 0xbb, 0x64, 0x15, 0x82, 0xbc, 0x12, 0xf8, 0x45, 0x3d, 0xf6, 0xfc,
+ 0x40, 0x3d, 0x64, 0x01, 0x84, 0xbc, 0x4e, 0x97, 0x28, 0x3d, 0xc0, 0xb8, 0x30,
+ 0x3d, 0xf8, 0x94, 0x71, 0xbd, 0x59, 0x5a, 0x61, 0xbd, 0x9e, 0x55, 0x8d, 0xbd,
+ 0x00, 0x77, 0xfa, 0xbc, 0x9c, 0xbf, 0x17, 0x3d, 0x94, 0x7a, 0x4f, 0xbd, 0xb1,
+ 0xa6, 0x8f, 0xbd, 0xad, 0xc3, 0x8a, 0x3d, 0xf0, 0xca, 0x8b, 0x3c, 0x2a, 0xe4,
+ 0x2b, 0xbd, 0x34, 0x81, 0x44, 0xbd, 0x48, 0x55, 0x52, 0xbd, 0x2e, 0x7e, 0x63,
+ 0x3d, 0x3a, 0x07, 0x4e, 0x3d, 0xb0, 0xb9, 0x7a, 0x3c, 0x18, 0x7d, 0x6e, 0xbc,
+ 0x7a, 0x0e, 0x3c, 0xbd, 0xdc, 0x81, 0x8c, 0xbd, 0xc8, 0xa4, 0x71, 0x3c, 0xca,
+ 0x20, 0x28, 0x3d, 0x28, 0x36, 0xf6, 0x3c, 0x28, 0xef, 0x3c, 0x3d, 0x88, 0x83,
+ 0x3e, 0x3c, 0x74, 0x45, 0x34, 0x3d, 0x80, 0x11, 0x06, 0xba, 0x8c, 0xd1, 0x79,
+ 0xbc, 0x84, 0x71, 0x26, 0xbd, 0x98, 0x15, 0x15, 0x3c, 0x4a, 0x0e, 0x92, 0xbc,
+ 0x75, 0x17, 0x83, 0x3d, 0xfc, 0x9c, 0xc1, 0xbc, 0x4c, 0xe3, 0xb5, 0x3c, 0x10,
+ 0xc9, 0x23, 0x3c, 0xd0, 0xde, 0x1a, 0x3c, 0x22, 0x15, 0x92, 0xbd, 0xe6, 0x39,
+ 0x48, 0xbd, 0x16, 0x40, 0x91, 0xbd, 0x5c, 0xf1, 0xb4, 0x3c, 0x4a, 0xf7, 0xbc,
+ 0xbc, 0x80, 0x48, 0x44, 0x3c, 0xc8, 0x47, 0x15, 0xbc, 0xcb, 0x39, 0x4d, 0xbd,
+ 0x04, 0xe1, 0xc0, 0x3c, 0x86, 0x40, 0x43, 0xbd, 0x3f, 0x39, 0x6a, 0xbd, 0x00,
+ 0xfd, 0x30, 0xbb, 0x18, 0x14, 0x60, 0xbc, 0xf0, 0x88, 0x12, 0x3d, 0x21, 0xf7,
+ 0x90, 0x3d, 0xfc, 0xcc, 0xa1, 0x3c, 0xa6, 0x1f, 0x2d, 0x3d, 0x0a, 0x14, 0x46,
+ 0xbd, 0x37, 0x3c, 0x5f, 0xbd, 0x32, 0x53, 0x94, 0xbc, 0x58, 0x51, 0xb1, 0xbc,
+ 0xd7, 0x03, 0x89, 0x3d, 0xfe, 0x03, 0x37, 0xbd, 0x9e, 0x06, 0x89, 0xbd, 0xbc,
+ 0xf6, 0x41, 0x3d, 0xf0, 0x87, 0x32, 0x3d, 0xdc, 0x11, 0xeb, 0xbc, 0x4a, 0x89,
+ 0x3b, 0x3d, 0xd2, 0xf1, 0x2b, 0x3d, 0x78, 0xcb, 0x38, 0xbc, 0x46, 0xda, 0xff,
+ 0xbc, 0xee, 0x9c, 0x8d, 0xbd, 0x14, 0x8e, 0xcd, 0xbc, 0x08, 0x6f, 0x05, 0x3d,
+ 0x00, 0xac, 0x8e, 0xbd, 0x90, 0xa2, 0x84, 0xbb, 0x9b, 0x36, 0x32, 0xbd, 0x2b,
+ 0x3f, 0x89, 0x3d, 0x80, 0x9a, 0x03, 0xbb, 0x06, 0xac, 0x17, 0x3d, 0xf8, 0x22,
+ 0x3f, 0xbd, 0x75, 0xae, 0x90, 0xbd, 0x76, 0xdd, 0x3e, 0xbd, 0x7c, 0x72, 0x92,
+ 0x3c, 0x4c, 0x38, 0x44, 0xbd, 0xba, 0x8f, 0x21, 0x3d, 0x00, 0x88, 0x7e, 0xbb,
+ 0xdc, 0xd2, 0x92, 0x3c, 0x1a, 0x45, 0x77, 0x3d, 0x54, 0xa1, 0x50, 0xbc, 0x44,
+ 0xea, 0x2d, 0x3d, 0x8e, 0xbd, 0x1d, 0x3d, 0x1b, 0xb9, 0x88, 0x3d, 0x20, 0xc4,
+ 0x8b, 0xbd, 0x43, 0x9e, 0x05, 0xbd, 0x80, 0x93, 0x4a, 0x3d, 0x02, 0xb3, 0x8a,
+ 0xbd, 0x40, 0x5c, 0xbb, 0x3b, 0x54, 0x22, 0x37, 0xbd, 0x04, 0xd5, 0xed, 0xbc,
+ 0xae, 0xce, 0x87, 0xbd, 0x0c, 0x0f, 0xe3, 0xbc, 0xc1, 0x1f, 0x48, 0xbd, 0x68,
+ 0x6a, 0x9a, 0x3c, 0xd0, 0x0b, 0x8f, 0x3c, 0xc8, 0x5c, 0x00, 0x3d, 0x60, 0xf9,
+ 0xd5, 0xbb, 0x57, 0x9a, 0x88, 0xbd, 0xf2, 0x1a, 0x8d, 0xbd, 0x52, 0x69, 0x63,
+ 0x3d, 0xb8, 0x69, 0x89, 0x3c, 0x56, 0xfb, 0x0a, 0x3d, 0x00, 0xc3, 0x10, 0xba,
+ 0x0e, 0xcd, 0x56, 0xbd, 0x1a, 0xf7, 0x61, 0x3d, 0xf8, 0x95, 0x8b, 0xbd, 0x3c,
+ 0x34, 0x14, 0xbd, 0xed, 0xc6, 0x8f, 0x3d, 0xee, 0xc2, 0x1c, 0x3d, 0xa0, 0x9d,
+ 0x04, 0xbb, 0xfd, 0x06, 0x56, 0xbd, 0xa0, 0xe7, 0x12, 0x3b, 0xae, 0x01, 0xbd,
+ 0xbc, 0xb0, 0x52, 0x16, 0x3d, 0x00, 0x9e, 0x97, 0xba, 0x40, 0xaf, 0x58, 0x3d,
+ 0xa4, 0x80, 0x97, 0x3c, 0xa0, 0x07, 0x22, 0x3b, 0x59, 0x3b, 0x01, 0xbd, 0x83,
+ 0x64, 0x87, 0x3d, 0x0e, 0xfd, 0x96, 0xbc, 0x3a, 0xf8, 0x7b, 0xbd, 0x7d, 0x61,
+ 0x0a, 0xbd, 0xe2, 0x4c, 0x58, 0xbd, 0xc0, 0x1b, 0x81, 0xbb, 0x70, 0x48, 0x0b,
+ 0x3d, 0x5a, 0x4c, 0x94, 0xbc, 0x6a, 0x49, 0x5b, 0x3d, 0x58, 0x79, 0x7a, 0x3c,
+ 0x54, 0xe4, 0x10, 0xbd, 0x0f, 0x05, 0x8c, 0x3d, 0x00, 0x70, 0xb3, 0xba, 0xfe,
+ 0x52, 0xec, 0xbc, 0x80, 0x87, 0xe5, 0x3b, 0x76, 0x35, 0x7f, 0x3d, 0x20, 0x23,
+ 0x36, 0x3b, 0x48, 0xe0, 0x16, 0x3d, 0x0e, 0xdb, 0x53, 0x3d, 0x76, 0x7d, 0xcb,
+ 0xbc, 0x79, 0xf8, 0x5c, 0xbd, 0x8a, 0x7c, 0x39, 0x3d, 0x8c, 0x87, 0x1d, 0x3d,
+ 0x3a, 0x32, 0x08, 0xbd, 0x54, 0xa9, 0x6a, 0xbc, 0x22, 0xad, 0xad, 0xbc, 0xd2,
+ 0x4b, 0x68, 0x3d, 0x86, 0x89, 0xee, 0xbc, 0x42, 0xee, 0x7d, 0x3d, 0x56, 0x9e,
+ 0x46, 0x3d, 0x58, 0xcd, 0xd0, 0x3c, 0xb4, 0x6d, 0x9f, 0x3c, 0x0c, 0x5b, 0x20,
+ 0xbd, 0x40, 0xe8, 0x2c, 0x3b, 0x23, 0xd1, 0x80, 0x3d, 0xee, 0x0f, 0xc8, 0xbc,
+ 0x1c, 0x52, 0xd5, 0x3c, 0x68, 0x8d, 0x63, 0xbc, 0x9c, 0xb3, 0x37, 0xbd, 0x0c,
+ 0x04, 0xde, 0x3c, 0x50, 0x20, 0x93, 0x3b, 0xac, 0xef, 0xf6, 0x3c, 0xac, 0x6e,
+ 0x93, 0xbc, 0x92, 0x06, 0x64, 0x3d, 0x28, 0xdd, 0x74, 0x3c, 0xf7, 0x67, 0x86,
+ 0x3d, 0x2c, 0x86, 0x43, 0x3d, 0x30, 0x55, 0x89, 0xbd, 0xa0, 0xf0, 0xd7, 0xbb,
+ 0xe4, 0x7f, 0x05, 0x3d, 0x18, 0xf7, 0x3f, 0x3c, 0x46, 0xaf, 0xcb, 0xbc, 0x80,
+ 0xf0, 0xb3, 0x3b, 0xdc, 0xe9, 0x81, 0x3c, 0xef, 0x3f, 0x5c, 0xbd, 0xfe, 0xb8,
+ 0xa1, 0xbc, 0x90, 0x44, 0x41, 0x3c, 0x4e, 0xc8, 0x30, 0xbd, 0x63, 0x6e, 0x72,
+ 0xbd, 0xbc, 0x52, 0xbf, 0xbc, 0x7c, 0x04, 0x47, 0xbd, 0x4c, 0xe3, 0x4e, 0xbd,
+ 0x34, 0x8b, 0x36, 0x3d, 0xd1, 0xf2, 0x33, 0xbd, 0x16, 0x48, 0x09, 0x3d, 0x8c,
+ 0x31, 0x00, 0xbd, 0xd9, 0x91, 0x8e, 0xbd, 0xf2, 0x8d, 0x64, 0xbd, 0x48, 0x20,
+ 0xbf, 0xbc, 0x60, 0x89, 0x53, 0x3b, 0x00, 0x96, 0x71, 0x3a, 0x44, 0x6e, 0x8c,
+ 0xbd, 0x90, 0x6b, 0x7d, 0xbd, 0x64, 0x71, 0xa6, 0x3c, 0x52, 0x23, 0x70, 0x3d,
+ 0xf3, 0x05, 0x80, 0x3d, 0xb4, 0xe2, 0x68, 0xbd, 0x20, 0x6f, 0xf9, 0x3b, 0x60,
+ 0x31, 0x2c, 0x3d, 0x30, 0x78, 0x4b, 0xbd, 0xd8, 0xae, 0x23, 0xbc, 0x40, 0xea,
+ 0xc5, 0x3a, 0xd0, 0xe7, 0x86, 0xbd, 0xa0, 0x57, 0x47, 0x3d, 0x70, 0x78, 0xab,
+ 0x3b, 0x1c, 0xab, 0xb1, 0xbc, 0x2a, 0x75, 0x5d, 0xbd, 0xd0, 0xd1, 0x26, 0xbd,
+ 0x90, 0x93, 0x3a, 0xbd, 0xb4, 0x8a, 0xe9, 0xbc, 0xac, 0xf1, 0xa5, 0xbc, 0x10,
+ 0xa3, 0xa7, 0xbb, 0x02, 0xb2, 0x73, 0xbd, 0x2e, 0x27, 0xb7, 0xbc, 0xd0, 0x0c,
+ 0x92, 0xbd, 0x0e, 0x8e, 0x77, 0x3d, 0x5a, 0x78, 0x0a, 0x3d, 0xf4, 0xa9, 0xc5,
+ 0x3c, 0x82, 0x8a, 0x15, 0x3d, 0x3d, 0x25, 0x13, 0xbd, 0x7e, 0x35, 0x12, 0xbd,
+ 0x2a, 0xd2, 0x6e, 0x3d, 0x78, 0x60, 0xcb, 0xbc, 0x70, 0x92, 0x81, 0xbd, 0xca,
+ 0x3f, 0x2f, 0xbd, 0x3b, 0x71, 0x67, 0xbd, 0x80, 0x79, 0x83, 0xba, 0xc6, 0x2a,
+ 0x47, 0x3d, 0x86, 0x99, 0x72, 0x3d, 0x6c, 0x59, 0x8f, 0x3c, 0x73, 0x59, 0x14,
+ 0xbd, 0x23, 0x83, 0x82, 0x3d, 0x94, 0x4d, 0x8b, 0xbd, 0x9c, 0x05, 0x2f, 0xbd,
+ 0x60, 0xae, 0x57, 0x3d, 0x95, 0x1c, 0x86, 0x3d, 0x26, 0xaf, 0x78, 0x3d, 0x47,
+ 0x4b, 0x4e, 0xbd, 0x96, 0xfd, 0x75, 0x3d, 0xb2, 0x63, 0x35, 0x3d, 0xc0, 0x00,
+ 0xa3, 0x3b, 0x12, 0x16, 0x3d, 0x3d, 0x8e, 0xd2, 0x56, 0xbd, 0x02, 0xff, 0xec,
+ 0xbc, 0x96, 0x20, 0xcc, 0xbc, 0xf4, 0x61, 0x0b, 0x3d, 0x20, 0x12, 0x58, 0x3b,
+ 0x5a, 0xa3, 0x4c, 0x3d, 0x80, 0x86, 0x64, 0x3b, 0x0e, 0x77, 0x70, 0x3d, 0xd0,
+ 0x7b, 0xe8, 0xbb, 0x92, 0x2d, 0x20, 0xbd, 0xc8, 0x33, 0x6f, 0xbc, 0xf8, 0x0f,
+ 0x76, 0x3c, 0x3a, 0xea, 0x36, 0x3d, 0xc0, 0x6c, 0x47, 0x3b, 0x00, 0x3b, 0x98,
+ 0xbc, 0x88, 0x52, 0x3b, 0x3c, 0xa8, 0x58, 0x54, 0x3c, 0x5a, 0xff, 0x4f, 0x3d,
+ 0xfe, 0x26, 0x5e, 0x3d, 0x7c, 0x39, 0x8e, 0xbc, 0x96, 0x37, 0x75, 0x3d, 0xbd,
+ 0x95, 0x86, 0xbd, 0x6b, 0x40, 0x91, 0x3d, 0x40, 0x14, 0x3a, 0xbb, 0xf0, 0xe0,
+ 0x0f, 0xbc, 0xeb, 0x23, 0x82, 0x3d, 0xe0, 0x7c, 0x8e, 0x3b, 0x60, 0x71, 0x11,
+ 0xbc, 0x3e, 0x89, 0x2c, 0xbd, 0x9a, 0x0a, 0x7f, 0xbd, 0xe8, 0x86, 0xcd, 0x3c,
+ 0xd4, 0x1d, 0xfe, 0x3c, 0xc6, 0x1f, 0x63, 0x3d, 0xe8, 0x6a, 0x2d, 0x3c, 0xec,
+ 0xb5, 0x02, 0x3d, 0x78, 0xcb, 0xe0, 0xbc, 0x74, 0x19, 0x64, 0xbc, 0xf0, 0xf7,
+ 0x69, 0xbc, 0x11, 0x97, 0x92, 0xbd, 0xe2, 0x89, 0x8b, 0xbd, 0x36, 0xe1, 0xa2,
+ 0xbc, 0x38, 0x7d, 0xb2, 0xbc, 0xf4, 0x26, 0x16, 0x3d, 0x70, 0x40, 0x90, 0xbd,
+ 0xe0, 0x0a, 0x70, 0x3c, 0x86, 0xb8, 0x35, 0x3d, 0x67, 0xd7, 0x8d, 0x3d, 0xd0,
+ 0xdc, 0x17, 0xbc, 0x10, 0xf7, 0xcd, 0xbb, 0xfe, 0x64, 0x59, 0x3d, 0x34, 0xf3,
+ 0x3c, 0xbd, 0x40, 0xfe, 0xae, 0xba, 0xd1, 0x87, 0x85, 0x3d, 0x10, 0x58, 0x65,
+ 0xbd, 0x66, 0xaf, 0x5d, 0xbd, 0x42, 0x56, 0x5d, 0x3d, 0x7c, 0xce, 0x5f, 0xbd,
+ 0xc0, 0x38, 0x96, 0x3a, 0x33, 0x59, 0x90, 0x3d, 0x06, 0x1a, 0xa6, 0xbc, 0xd4,
+ 0xb0, 0x83, 0x3c, 0xa8, 0xf4, 0x07, 0x3c, 0xa5, 0x8f, 0x90, 0x3d, 0x36, 0xd8,
+ 0xc0, 0xbc, 0xf0, 0xf5, 0x31, 0x3d, 0x30, 0x56, 0x88, 0xbd, 0x3c, 0x96, 0x05,
+ 0xbd, 0x89, 0xc2, 0x89, 0x3d, 0x19, 0x10, 0x06, 0xbd, 0xa2, 0xaa, 0x63, 0x3d,
+ 0x5e, 0x9b, 0x76, 0xbd, 0xa5, 0x57, 0x8c, 0x3d, 0x48, 0xe9, 0x2a, 0x3c, 0xe0,
+ 0xd9, 0x3a, 0x3b, 0xd3, 0x1c, 0x7f, 0xbd, 0x8c, 0x60, 0x21, 0xbc, 0x38, 0xc1,
+ 0x67, 0xbc, 0xf0, 0x83, 0x62, 0x3c, 0x58, 0xcb, 0x3f, 0x3d, 0xc7, 0xd9, 0x83,
+ 0x3d, 0x3e, 0xf5, 0x90, 0xbd, 0xeb, 0xb8, 0x8b, 0xbd, 0x0a, 0x86, 0x05, 0x3d,
+ 0x61, 0xb6, 0x39, 0xbd, 0x56, 0x8f, 0x04, 0x3d, 0x19, 0xbd, 0x33, 0xbd, 0x24,
+ 0xd1, 0x50, 0x3d, 0xd0, 0x14, 0xf8, 0x3c, 0x2c, 0x43, 0x49, 0x3d, 0x98, 0xa1,
+ 0x53, 0xbc, 0xc2, 0x43, 0x26, 0x3d, 0x8e, 0xed, 0xff, 0xbc, 0xb7, 0x58, 0x75,
+ 0xbd, 0x00, 0xb7, 0x85, 0x3a, 0x8c, 0xb1, 0x83, 0xbc, 0x08, 0x40, 0x92, 0xbd,
+ 0x35, 0x28, 0x08, 0xbd, 0x30, 0x4f, 0x84, 0x3c, 0x34, 0x0b, 0x22, 0xbc, 0x30,
+ 0x1a, 0x07, 0x3c, 0xaa, 0xd6, 0x87, 0xbd, 0xa2, 0xfd, 0x7d, 0xbd, 0xfe, 0xa0,
+ 0xb7, 0xbc, 0xa2, 0x0a, 0x33, 0x3d, 0x10, 0x60, 0xe4, 0xbb, 0x64, 0x49, 0x10,
+ 0xbd, 0xf4, 0xd0, 0x48, 0xbc, 0x12, 0x7a, 0x38, 0x3d, 0x28, 0xb9, 0xee, 0xbc,
+ 0x05, 0xbe, 0x50, 0xbd, 0xce, 0x2f, 0xd5, 0xbc, 0x04, 0x8f, 0x39, 0xbd, 0xa8,
+ 0x16, 0x0c, 0xbd, 0x64, 0xe1, 0x79, 0xbc, 0xd4, 0x20, 0x8c, 0x3c, 0x28, 0x73,
+ 0x1c, 0x3d, 0x20, 0x66, 0x97, 0x3c, 0x66, 0x6e, 0xc1, 0xbc, 0x6d, 0xfc, 0x91,
+ 0xbd, 0xc5, 0x79, 0x89, 0xbd, 0xd0, 0x3c, 0x90, 0x3c, 0xfc, 0x19, 0x55, 0xbd,
+ 0x72, 0x96, 0x80, 0xbd, 0x80, 0x81, 0x46, 0x3d, 0xea, 0x10, 0x30, 0x3d, 0x00,
+ 0xdc, 0xe2, 0x3b, 0x44, 0x30, 0x78, 0xbc, 0x3a, 0x5b, 0x39, 0x3d, 0x00, 0x8d,
+ 0x8c, 0xbb, 0x70, 0x9f, 0x3b, 0xbc, 0x1c, 0xa9, 0x5c, 0xbc, 0x04, 0xa9, 0xe4,
+ 0xbc, 0x3a, 0xd9, 0x39, 0x3d, 0xa0, 0x11, 0xfd, 0x3c, 0x76, 0x3b, 0xf9, 0xbc,
+ 0xb9, 0xdd, 0x6f, 0xbd, 0xf5, 0xcb, 0x91, 0xbd, 0xee, 0x45, 0x5d, 0xbd, 0x13,
+ 0x1c, 0x8d, 0xbd, 0x10, 0xb7, 0xb6, 0x3b, 0x60, 0xc8, 0x77, 0x3b, 0x70, 0x4d,
+ 0xbf, 0xbb, 0x38, 0x4f, 0x80, 0xbd, 0xa9, 0x6b, 0x92, 0xbd, 0x78, 0x8e, 0x7e,
+ 0x3c, 0x70, 0xd1, 0x6e, 0x3c, 0x79, 0x4c, 0x85, 0xbd, 0xcc, 0xac, 0x2b, 0x3d,
+ 0x49, 0x46, 0x5f, 0xbd, 0x68, 0x60, 0x6d, 0xbc, 0x50, 0x53, 0xe4, 0x3b, 0x35,
+ 0x39, 0x81, 0x3d, 0xf0, 0x01, 0x12, 0x3c, 0x4c, 0x27, 0x8b, 0xbd, 0xce, 0x8d,
+ 0x71, 0x3d, 0xcc, 0x9a, 0x8e, 0xbd, 0x9e, 0x6f, 0xcd, 0xbc, 0xea, 0x23, 0x19,
+ 0x3d, 0xac, 0xed, 0x95, 0x3c, 0x76, 0x32, 0x68, 0x3d, 0x08, 0xcc, 0x58, 0x3c,
+ 0xc8, 0xe2, 0xcc, 0x3c, 0xf1, 0x85, 0x81, 0x3d, 0x06, 0xdc, 0x6b, 0x3d, 0x16,
+ 0x15, 0xf0, 0xbc, 0xda, 0x56, 0x4e, 0x3d, 0x58, 0x5c, 0x90, 0xbc, 0xe4, 0x79,
+ 0x37, 0xbd, 0x40, 0x1b, 0x6a, 0xbd, 0x00, 0x4e, 0x63, 0x3b, 0xbc, 0xfc, 0x35,
+ 0x3d, 0xe6, 0x87, 0xf9, 0xbc, 0xb0, 0xfc, 0x0c, 0x3d, 0x96, 0x7f, 0x53, 0xbd,
+ 0x1e, 0xe1, 0x04, 0x3d, 0x10, 0x11, 0x87, 0x3c, 0xce, 0xd1, 0x42, 0x3d, 0x1c,
+ 0x27, 0xca, 0xbc, 0xd8, 0x71, 0xfa, 0x3c, 0xea, 0xce, 0x76, 0x3d, 0x2c, 0x0e,
+ 0xbc, 0x3c, 0x9b, 0x96, 0x48, 0xbd, 0x60, 0x7b, 0x93, 0xbb, 0x8a, 0x69, 0xa8,
+ 0xbc, 0xc0, 0xcd, 0x79, 0x3c, 0xd0, 0xe0, 0x87, 0xbd, 0xe6, 0x91, 0x53, 0xbd,
+ 0x96, 0xe0, 0x03, 0x3d, 0x8b, 0x7a, 0x81, 0xbd, 0x16, 0x64, 0x80, 0xbd, 0x84,
+ 0xac, 0x87, 0x3c, 0xf8, 0xb7, 0xfc, 0xbc, 0x63, 0x2a, 0x38, 0xbd, 0x5a, 0x71,
+ 0x35, 0xbd, 0xda, 0xff, 0x49, 0xbd, 0x50, 0xcd, 0xdb, 0xbb, 0xc0, 0x85, 0x37,
+ 0xbb, 0x2a, 0x21, 0x35, 0x3d, 0xb6, 0x59, 0xcc, 0xbc, 0x10, 0x02, 0xe7, 0x3b,
+ 0x78, 0xf5, 0x54, 0xbc, 0xb0, 0x3c, 0x58, 0x3c, 0xf4, 0x96, 0x59, 0x3d, 0x10,
+ 0xd7, 0xd2, 0xbb, 0x1a, 0x0c, 0x79, 0x3d, 0x48, 0x2c, 0x6b, 0x3c, 0xc0, 0x44,
+ 0x89, 0xbb, 0x5c, 0xf0, 0xa3, 0x3c, 0xd0, 0x1c, 0x07, 0x3d, 0x02, 0xcd, 0x94,
+ 0xbc, 0xa8, 0x51, 0x99, 0xbc, 0xc0, 0xb9, 0x40, 0x3c, 0xe0, 0x85, 0x86, 0x3c,
+ 0x74, 0x77, 0x9f, 0x3c, 0x15, 0xe0, 0x71, 0xbd, 0x00, 0xf1, 0xfc, 0xb9, 0x50,
+ 0x39, 0x11, 0x3c, 0xb7, 0x13, 0x81, 0x3d, 0x60, 0x31, 0xe5, 0x3c, 0x8c, 0x42,
+ 0xf6, 0xbc, 0x4c, 0x34, 0x8a, 0xbc, 0xb8, 0x26, 0xe6, 0x3c, 0xf4, 0x56, 0x69,
+ 0xbc, 0xcc, 0xb4, 0xa1, 0x3c, 0xf0, 0x8e, 0x48, 0xbd, 0xcb, 0xab, 0x91, 0xbd,
+ 0x00, 0xc4, 0x5e, 0xbb, 0xdd, 0xf5, 0x8c, 0x3d, 0xc8, 0x1a, 0x8a, 0x3c, 0x1c,
+ 0x9c, 0xda, 0xbc, 0x89, 0x6e, 0x83, 0x3d, 0x00, 0x6e, 0x3c, 0x39, 0x80, 0x82,
+ 0xd0, 0x3a, 0x00, 0x09, 0xc2, 0xb9, 0x04, 0x06, 0x38, 0xbc, 0x0a, 0x7a, 0xf7,
+ 0xbc, 0x50, 0xac, 0x1d, 0x3c, 0x9e, 0xd8, 0xfa, 0xbc, 0xea, 0xed, 0x71, 0xbd,
+ 0x7f, 0xf6, 0x0a, 0xbd, 0x20, 0x2d, 0x30, 0x3b, 0xd0, 0x7c, 0x96, 0x3b, 0x2e,
+ 0x61, 0x3f, 0x3d, 0xb0, 0x0a, 0x2d, 0x3d, 0x80, 0xac, 0x47, 0xbb, 0x7a, 0x9e,
+ 0xe6, 0xbc, 0x50, 0x90, 0x44, 0x3c, 0x0d, 0x23, 0x8e, 0xbd, 0x00, 0x3a, 0x59,
+ 0x3a, 0x12, 0xa5, 0x52, 0xbd, 0xbc, 0x90, 0xac, 0x3c, 0x00, 0x77, 0xe1, 0x3a,
+ 0x83, 0x27, 0x8a, 0xbd, 0x40, 0xcd, 0xb0, 0xbc, 0x6a, 0xf8, 0x22, 0x3d, 0xc0,
+ 0xfe, 0xc8, 0xbb, 0x52, 0x28, 0x63, 0x3d, 0xb2, 0xd2, 0xbe, 0xbc, 0x80, 0x68,
+ 0x42, 0xbc, 0xa4, 0x31, 0x58, 0xbc, 0xae, 0xda, 0x3a, 0xbd, 0xcb, 0xd7, 0x80,
+ 0xbd, 0x32, 0x43, 0x60, 0x3d, 0x52, 0xc1, 0xa9, 0xbc, 0x18, 0x3a, 0x2d, 0x3c,
+ 0x8e, 0x17, 0x5f, 0xbd, 0x9d, 0xcc, 0x85, 0x3d, 0x5c, 0x7c, 0x12, 0x3d, 0xde,
+ 0x24, 0x78, 0x3d, 0xec, 0xba, 0x16, 0x3d, 0xd1, 0xb1, 0x3d, 0xbd, 0xf0, 0x7f,
+ 0xe3, 0x3c, 0xe0, 0xf7, 0xef, 0xbb, 0x28, 0x65, 0x18, 0xbd, 0x7a, 0x38, 0x48,
+ 0x3d, 0xad, 0xff, 0x81, 0xbd, 0x72, 0xe6, 0x69, 0x3d, 0x98, 0x35, 0x08, 0xbd,
+ 0x16, 0xb5, 0x3a, 0xbd, 0x26, 0x18, 0x52, 0xbd, 0xc4, 0xb5, 0xc9, 0x3c, 0xbc,
+ 0xcc, 0x93, 0x3c, 0x6e, 0x74, 0xc9, 0xbc, 0xae, 0x05, 0x14, 0x3d, 0x96, 0x6c,
+ 0x78, 0x3d, 0x48, 0xe7, 0x7a, 0xbc, 0xe2, 0x8b, 0x65, 0xbd, 0xda, 0x9c, 0x97,
+ 0xbc, 0xbc, 0xc8, 0xab, 0x3c, 0xf0, 0xb1, 0x5f, 0xbd, 0xbe, 0x43, 0x3d, 0x3d,
+ 0xf8, 0xc7, 0x81, 0xbd, 0xd0, 0xc7, 0xcd, 0x3c, 0xfe, 0x77, 0x72, 0xbd, 0x32,
+ 0x3c, 0x7c, 0x3d, 0xfa, 0x2e, 0x84, 0xbc, 0x4c, 0xbc, 0x04, 0x3d, 0xc6, 0x29,
+ 0x8f, 0xbd, 0x4c, 0x07, 0xb8, 0x3c, 0x51, 0xb8, 0x45, 0xbd, 0x4c, 0x84, 0x7b,
+ 0xbd, 0x8e, 0x26, 0x3e, 0xbd, 0x48, 0xcc, 0x96, 0xbc, 0xb0, 0x59, 0x32, 0x3d,
+ 0xd6, 0x47, 0xba, 0xbc, 0xf9, 0x32, 0x81, 0x3d, 0xb0, 0xb8, 0x88, 0xbb, 0x80,
+ 0x93, 0xfd, 0x3a, 0x4a, 0x8d, 0x39, 0x3d, 0x88, 0x34, 0xa1, 0x3c, 0x20, 0x3b,
+ 0x53, 0x3b, 0x10, 0x26, 0x35, 0x3d, 0x50, 0xab, 0x77, 0xbc, 0x89, 0x68, 0x69,
+ 0xbd, 0x56, 0xd0, 0x15, 0x3d, 0x56, 0x3f, 0x3e, 0xbd, 0xa0, 0x94, 0xb5, 0x3c,
+ 0xa9, 0x10, 0x90, 0xbd, 0xfa, 0xe9, 0x48, 0xbd, 0x66, 0x62, 0x6a, 0x3d, 0xdc,
+ 0x51, 0xb0, 0x3c, 0x20, 0x13, 0x4d, 0xbd, 0x40, 0xbf, 0xe5, 0xba, 0x50, 0x61,
+ 0x9e, 0x3b, 0xa0, 0xbd, 0xeb, 0xbc, 0xd9, 0x55, 0x48, 0xbd, 0x4c, 0xbf, 0x0e,
+ 0xbd, 0x80, 0x28, 0x20, 0x3b, 0xea, 0x77, 0x72, 0x3d, 0x08, 0xd6, 0x02, 0x3d,
+ 0x7b, 0x14, 0x42, 0xbd, 0x8c, 0x7f, 0x91, 0x3c, 0x82, 0xe4, 0x16, 0xbd, 0x30,
+ 0x61, 0xaf, 0x3c, 0xd2, 0x5c, 0x5a, 0xbd, 0xc0, 0x16, 0x69, 0x3b, 0xe9, 0x5b,
+ 0x84, 0x3d, 0x49, 0xc3, 0x7e, 0xbd, 0x90, 0x7f, 0xf7, 0x3c, 0x3e, 0xd5, 0x85,
+ 0xbd, 0x38, 0xb7, 0x43, 0x3c, 0x4e, 0x4d, 0xc0, 0xbc, 0x00, 0x78, 0xea, 0x3a,
+ 0x32, 0xb2, 0x92, 0xbd, 0xb0, 0xc3, 0x1d, 0x3c, 0x90, 0xc2, 0x23, 0x3c, 0x80,
+ 0x14, 0xc5, 0x3b, 0x00, 0xf1, 0x87, 0xbc, 0x26, 0xf4, 0x8a, 0xbd, 0x10, 0xa6,
+ 0x9a, 0x3b, 0x78, 0x8b, 0x72, 0xbd, 0x85, 0xef, 0x12, 0xbd, 0xd8, 0x93, 0x02,
+ 0x3d, 0x80, 0x8b, 0xca, 0x3a, 0x18, 0x72, 0x17, 0xbc, 0x65, 0x2d, 0x83, 0x3d,
+ 0xfb, 0xe9, 0x81, 0x3d, 0x60, 0xf3, 0x46, 0xbd, 0xb4, 0xab, 0x1a, 0xbc, 0x30,
+ 0x0c, 0xf9, 0x3c, 0xb6, 0xc5, 0x63, 0xbd, 0x8e, 0x20, 0xdd, 0xbc, 0x5c, 0x18,
+ 0x97, 0xbc, 0x10, 0x42, 0x43, 0x3d, 0x11, 0xab, 0x84, 0x3d, 0xec, 0xcf, 0x30,
+ 0x3d, 0x38, 0x0e, 0x6a, 0x3c, 0x3e, 0x40, 0xd9, 0xbc, 0xce, 0x14, 0x14, 0x3d,
+ 0x5c, 0xe6, 0x71, 0xbc, 0xf8, 0xd8, 0xf2, 0x3c, 0x98, 0x96, 0x21, 0xbc, 0xbe,
+ 0xdb, 0x18, 0xbd, 0xe6, 0x7f, 0x28, 0xbd, 0xab, 0x56, 0x23, 0xbd, 0xc2, 0x40,
+ 0x8e, 0xbd, 0x8c, 0x92, 0xc3, 0x3c, 0xd4, 0x0a, 0x13, 0xbd, 0xbe, 0x25, 0x05,
+ 0x3d, 0x12, 0x58, 0x0d, 0x3d, 0xd7, 0x65, 0x79, 0xbd, 0x9c, 0x54, 0x4e, 0x3d,
+ 0x02, 0x2a, 0x40, 0x3d, 0xef, 0xcd, 0x01, 0xbd, 0x11, 0x5c, 0x92, 0x3d, 0xb0,
+ 0x03, 0x95, 0x3c, 0xa0, 0x08, 0x19, 0x3b, 0x79, 0xad, 0x8c, 0x3d, 0x19, 0x93,
+ 0x7a, 0xbd, 0x40, 0xfa, 0xc6, 0xbb, 0x68, 0xb6, 0xa8, 0x3c, 0x45, 0x29, 0x8d,
+ 0xbd, 0x90, 0x3e, 0x13, 0xbc, 0x1a, 0x2d, 0x70, 0x3d, 0xc1, 0xdd, 0x6a, 0xbd,
+ 0x50, 0x75, 0x01, 0xbd, 0xc1, 0x8d, 0x91, 0xbd, 0xdd, 0x3f, 0x84, 0xbd, 0xa3,
+ 0xc6, 0x8d, 0x3d, 0xce, 0x23, 0x5b, 0x3d, 0x7e, 0xfb, 0x7d, 0x3d, 0xd5, 0xf4,
+ 0x23, 0xbd, 0x4c, 0x65, 0x8d, 0xbc, 0xb0, 0x76, 0x89, 0xbd, 0x28, 0xc4, 0x82,
+ 0xbd, 0x40, 0x70, 0x71, 0x3b, 0xfa, 0x55, 0x8e, 0xbc, 0x40, 0x08, 0xf0, 0x3a,
+ 0x02, 0x81, 0x56, 0x3d, 0xfe, 0x51, 0xf8, 0xbc, 0x1a, 0xcd, 0x91, 0xbd, 0xfb,
+ 0x66, 0x7b, 0xbd, 0xb0, 0xbb, 0xf2, 0xbc, 0xbb, 0x24, 0x23, 0xbd, 0x5c, 0x6c,
+ 0x6d, 0xbd, 0x08, 0xa0, 0x8b, 0x3c, 0xb7, 0x93, 0x1d, 0xbd, 0x74, 0x9f, 0x21,
+ 0x3d, 0x1c, 0x43, 0x33, 0xbd, 0x66, 0x2c, 0x1c, 0xbd, 0xfe, 0xf5, 0x11, 0xbd,
+ 0x10, 0x32, 0xef, 0xbc, 0x40, 0x70, 0x6f, 0xbb, 0xa1, 0xca, 0x8f, 0x3d, 0x12,
+ 0x42, 0x13, 0x3d, 0x38, 0x2e, 0xf3, 0x3c, 0x16, 0x69, 0x77, 0x3d, 0x6d, 0xa9,
+ 0x1e, 0xbd, 0xdc, 0xf5, 0xba, 0xbc, 0xc4, 0xe8, 0x1f, 0xbd, 0xfc, 0xc7, 0x08,
+ 0x3d, 0x8c, 0x9a, 0x28, 0x3d, 0x80, 0xbb, 0x14, 0x3b, 0xce, 0x47, 0x68, 0x3d,
+ 0xd3, 0x75, 0x10, 0xbd, 0x30, 0x9e, 0xb1, 0x3b, 0x48, 0x08, 0x80, 0x3c, 0x53,
+ 0xbe, 0x7e, 0xbd, 0x54, 0xdd, 0x5c, 0xbd, 0x89, 0x15, 0x77, 0xbd, 0x20, 0x13,
+ 0x00, 0x3b, 0xab, 0x6a, 0x15, 0xbd, 0x70, 0x62, 0x0b, 0xbc, 0xb6, 0x69, 0x44,
+ 0x3d, 0x9e, 0x71, 0x44, 0x3d, 0xfb, 0x84, 0x1e, 0xbd, 0xc8, 0x25, 0x3e, 0xbc,
+ 0xa8, 0x9e, 0xa6, 0x3c, 0xa0, 0x0c, 0x0b, 0x3d, 0x48, 0xe7, 0xb1, 0xbc, 0x2f,
+ 0xfc, 0x8a, 0x3d, 0xbc, 0x2a, 0x27, 0xbc, 0x80, 0x69, 0x38, 0x3c, 0xa0, 0x89,
+ 0xb4, 0xbb, 0x10, 0xb6, 0x56, 0xbc, 0x80, 0xaa, 0x37, 0x3b, 0xbd, 0x66, 0x1d,
+ 0xbd, 0xb9, 0x3e, 0x6c, 0xbd, 0x14, 0xc1, 0x1e, 0x3d, 0x10, 0xd3, 0xa5, 0x3b,
+ 0x1c, 0x9a, 0x43, 0xbc, 0xa0, 0xb3, 0xdd, 0xbc, 0xf8, 0x82, 0xb8, 0x3c, 0xc8,
+ 0x76, 0x1b, 0x3d, 0x7e, 0x2b, 0x5c, 0x3d, 0x20, 0xd8, 0x7f, 0xbd, 0x88, 0xe0,
+ 0xa0, 0x3c, 0x1c, 0x48, 0x26, 0x3d, 0x50, 0x53, 0x1e, 0x3c, 0xf0, 0x07, 0x54,
+ 0x3c, 0xc9, 0xde, 0x05, 0xbd, 0x2c, 0x34, 0x84, 0x3c, 0xa8, 0x30, 0x1b, 0x3c,
+ 0x6c, 0xa1, 0x3c, 0xbd, 0x00, 0x58, 0xc1, 0xb8, 0xf0, 0xd4, 0xf9, 0x3b, 0xf0,
+ 0xb3, 0x2e, 0x3d, 0x14, 0xe3, 0x4f, 0x3d, 0x70, 0x0b, 0x73, 0x3c, 0x8b, 0xca,
+ 0x89, 0xbd, 0x9c, 0xd8, 0x85, 0x3c, 0x9c, 0x34, 0x4b, 0xbc, 0xf5, 0x38, 0x71,
+ 0xbd, 0x01, 0xe5, 0x84, 0x3d, 0xd4, 0xde, 0x25, 0xbc, 0x80, 0xc0, 0xb1, 0xbb,
+ 0x80, 0xca, 0xfc, 0x3b, 0x78, 0xe0, 0x2d, 0xbd, 0xda, 0x90, 0x29, 0xbd, 0x3a,
+ 0xdb, 0x37, 0xbd, 0x00, 0x81, 0xa1, 0xbb, 0x3a, 0xcb, 0x71, 0xbd, 0x1c, 0x8e,
+ 0x29, 0xbc, 0x68, 0x0a, 0x5f, 0xbc, 0x0f, 0x86, 0x91, 0xbd, 0x98, 0x61, 0x62,
+ 0x3c, 0x82, 0x06, 0x4e, 0xbd, 0xa0, 0x7a, 0x35, 0x3b, 0xfa, 0xbc, 0x31, 0x3d,
+ 0xee, 0x18, 0x3a, 0x3d, 0xe0, 0xf0, 0x9d, 0xbb, 0x87, 0xba, 0x8f, 0x3d, 0x0e,
+ 0x75, 0x24, 0x3d, 0x92, 0xf6, 0x77, 0x3d, 0x78, 0xda, 0x72, 0xbc, 0xe4, 0x5c,
+ 0x55, 0xbc, 0xe3, 0xbf, 0x87, 0x3d, 0x74, 0x55, 0x5c, 0xbd, 0x88, 0x2b, 0x0b,
+ 0xbc, 0x68, 0xd5, 0x21, 0x3d, 0x0a, 0x05, 0x94, 0xbc, 0x5f, 0xb7, 0x8a, 0x3d,
+ 0x48, 0x83, 0x5c, 0x3c, 0x08, 0x83, 0x77, 0xbc, 0xc4, 0x31, 0xd6, 0x3c, 0xb8,
+ 0x48, 0x52, 0x3c, 0x00, 0xcb, 0xda, 0x3b, 0x32, 0x6a, 0x5f, 0xbd, 0x76, 0x7f,
+ 0x8f, 0xbd, 0xc0, 0xb7, 0xb2, 0x3c, 0x91, 0x5e, 0x1d, 0xbd, 0x92, 0x5d, 0x62,
+ 0x3d, 0x9c, 0x2b, 0x65, 0xbd, 0x3e, 0xe5, 0x2a, 0x3d, 0x29, 0xb7, 0x81, 0xbd,
+ 0x74, 0xa2, 0xda, 0x3c, 0x1a, 0xcb, 0x15, 0x3d, 0x56, 0x35, 0x60, 0x3d, 0x50,
+ 0x4a, 0x4f, 0xbc, 0xb2, 0x3c, 0x73, 0x3d, 0x88, 0x39, 0x71, 0xbd, 0xa0, 0x73,
+ 0x7d, 0xbd, 0x18, 0x14, 0xac, 0x3c, 0xa8, 0x1a, 0x57, 0x3d, 0x00, 0x3a, 0x77,
+ 0xbc, 0x2a, 0xd5, 0x93, 0xbc, 0x7e, 0x27, 0x41, 0x3d, 0xa0, 0x96, 0x19, 0x3d,
+ 0x18, 0x3e, 0xe5, 0x3c, 0x56, 0xda, 0x0d, 0x3d, 0xb2, 0x5f, 0x1d, 0x3d, 0x0c,
+ 0x27, 0xd6, 0x3c, 0xc6, 0x34, 0x89, 0xbd, 0x84, 0xe7, 0x65, 0xbd, 0xfc, 0x87,
+ 0xba, 0x3c, 0xd6, 0x7b, 0x3b, 0xbd, 0xe8, 0xf4, 0x49, 0xbd, 0x70, 0x19, 0x0d,
+ 0x3c, 0x5a, 0x0c, 0x18, 0x3d, 0xe6, 0x0e, 0x26, 0x3d, 0x12, 0xa0, 0x61, 0xbd,
+ 0xec, 0xa3, 0x26, 0x3d, 0xf4, 0xef, 0xe0, 0x3c, 0xdd, 0xc0, 0x88, 0xbd, 0x08,
+ 0x87, 0x0e, 0x3d, 0x2b, 0xb7, 0x18, 0xbd, 0xe6, 0xd5, 0x1f, 0xbd, 0x38, 0xc1,
+ 0x37, 0x3c, 0x88, 0x9a, 0x74, 0xbd, 0x04, 0xce, 0x04, 0x3d, 0x00, 0x5c, 0xab,
+ 0xbc, 0xbd, 0x47, 0x4b, 0xbd, 0xf0, 0xc1, 0x33, 0xbc, 0x2c, 0x4d, 0xca, 0x3c,
+ 0x84, 0xfd, 0xed, 0xbc, 0x6c, 0xf2, 0x2c, 0x3d, 0x1b, 0x24, 0x87, 0x3d, 0x7a,
+ 0x67, 0x8f, 0xbc, 0x84, 0xab, 0x50, 0xbc, 0x84, 0xd2, 0x0b, 0x3d, 0x18, 0x03,
+ 0x03, 0x3d, 0x80, 0x54, 0x01, 0x3d, 0xbc, 0x41, 0xd8, 0x3c, 0x60, 0xe4, 0x34,
+ 0x3d, 0x3d, 0xfb, 0x26, 0xbd, 0xcc, 0x6f, 0x1f, 0x3d, 0xc0, 0xb0, 0x30, 0xbb,
+ 0x7f, 0xb2, 0x83, 0xbd, 0x8f, 0xed, 0x91, 0x3d, 0xa0, 0xe6, 0xe2, 0xbb, 0xfa,
+ 0x94, 0x67, 0x3d, 0x70, 0xd4, 0x69, 0xbd, 0x80, 0xba, 0xed, 0x3c, 0xce, 0x26,
+ 0xb8, 0xbc, 0xfe, 0xd9, 0x1c, 0x3d, 0xae, 0x09, 0x0e, 0x3d, 0x4f, 0x3d, 0x52,
+ 0xbd, 0x87, 0xde, 0x62, 0xbd, 0x02, 0x63, 0xff, 0xbc, 0x70, 0x60, 0xbd, 0x3b,
+ 0x3c, 0x3f, 0xe7, 0x3c, 0x9c, 0x9c, 0x34, 0xbd, 0x82, 0xcf, 0x82, 0xbd, 0xa2,
+ 0xdb, 0x39, 0x3d, 0x70, 0x89, 0xe8, 0x3c, 0xad, 0x61, 0x80, 0xbd, 0xd8, 0x58,
+ 0x34, 0xbd, 0xf6, 0x79, 0x5f, 0xbd, 0xd0, 0x9b, 0xc6, 0x3c, 0x02, 0x91, 0x0f,
+ 0x3d, 0x90, 0xe4, 0xc1, 0x3b, 0xff, 0xa7, 0x8e, 0x3d, 0x99, 0x07, 0x92, 0xbd,
+ 0x30, 0x36, 0xe4, 0x3b, 0xf0, 0xd6, 0x38, 0xbd, 0xea, 0x6d, 0x2d, 0xbd, 0x0e,
+ 0x11, 0xf6, 0xbc, 0x80, 0x5b, 0x53, 0x3b, 0x1c, 0x44, 0x41, 0x3d, 0xab, 0x98,
+ 0x7b, 0xbd, 0x20, 0x36, 0x71, 0x3b, 0x87, 0x93, 0x20, 0xbd, 0xb0, 0x35, 0x27,
+ 0xbd, 0xd2, 0x2b, 0x75, 0x3d, 0x90, 0x12, 0xdc, 0xbc, 0x06, 0x6c, 0x2b, 0x3d,
+ 0xe0, 0x86, 0x20, 0xbb, 0x9d, 0xdd, 0x88, 0x3d, 0xec, 0xe2, 0x19, 0x3d, 0x70,
+ 0x76, 0xb4, 0x3c, 0x0e, 0x49, 0x42, 0xbd, 0x34, 0x9c, 0xe3, 0x3c, 0xe0, 0x1d,
+ 0xf8, 0xbb, 0xfc, 0x83, 0xc2, 0xbc, 0xdc, 0xe1, 0x8d, 0xbc, 0x04, 0x9b, 0xa7,
+ 0x3c, 0x54, 0x5a, 0xfc, 0x3c, 0x80, 0x63, 0x14, 0xba, 0xcc, 0x46, 0x08, 0x3d,
+ 0x46, 0xf5, 0x2b, 0x3d, 0xe0, 0x8b, 0x48, 0x3d, 0xa0, 0x99, 0xfd, 0x3b, 0x41,
+ 0x57, 0x87, 0x3d, 0xe4, 0xcb, 0x56, 0xbd, 0x1f, 0xa4, 0x3f, 0xbd, 0xac, 0x66,
+ 0x85, 0x3c, 0xaa, 0x3a, 0x55, 0x3d, 0x32, 0x06, 0x29, 0x3d, 0x9a, 0xb8, 0x5a,
+ 0xbd, 0x00, 0xfc, 0xbb, 0xba, 0xd7, 0x80, 0x86, 0x3d, 0xb4, 0x7c, 0xf5, 0x3c,
+ 0xac, 0xf4, 0x36, 0x3d, 0x82, 0xef, 0x65, 0x3d, 0x49, 0x63, 0x5c, 0xbd, 0x66,
+ 0xe0, 0x8f, 0xbd, 0x42, 0x66, 0x28, 0x3d, 0xfc, 0xec, 0x08, 0x3d, 0x0a, 0x9c,
+ 0x1e, 0x3d, 0x65, 0x3c, 0x45, 0xbd, 0x73, 0x4f, 0x88, 0x3d, 0xec, 0x1e, 0xbf,
+ 0xbc, 0xee, 0xa7, 0x55, 0x3d, 0x10, 0x84, 0x57, 0x3c, 0xd4, 0x12, 0xdf, 0x3c,
+ 0xa8, 0x8f, 0x8f, 0xbd, 0x56, 0x80, 0x89, 0xbd, 0x08, 0xc5, 0x09, 0xbc, 0xfd,
+ 0x84, 0x22, 0xbd, 0xb2, 0x0a, 0x66, 0x3d, 0x0a, 0x86, 0x61, 0x3d, 0x79, 0xf8,
+ 0x81, 0xbd, 0x7a, 0x81, 0x49, 0xbd, 0x88, 0x62, 0x7f, 0x3c, 0x8c, 0x81, 0x71,
+ 0xbd, 0x42, 0x9e, 0x86, 0xbd, 0x30, 0x5d, 0xf6, 0x3b, 0x6c, 0xc0, 0x29, 0xbc,
+ 0x88, 0x30, 0xdf, 0xbc, 0xda, 0xed, 0xf4, 0xbc, 0x98, 0x29, 0x34, 0xbd, 0xc0,
+ 0x10, 0xbe, 0x3a, 0x9b, 0x69, 0x8c, 0x3d, 0x40, 0x02, 0x98, 0xba, 0x2b, 0x85,
+ 0x76, 0xbd, 0x0c, 0xfd, 0xd3, 0x3c, 0x62, 0x37, 0x08, 0x3d, 0x0a, 0xe3, 0xe9,
+ 0xbc, 0x80, 0x1c, 0xc9, 0x3a, 0x54, 0x4b, 0x39, 0xbc, 0x28, 0xae, 0x7a, 0x3c,
+ 0x60, 0xd7, 0xe9, 0x3b, 0x08, 0xbe, 0x52, 0xbd, 0x04, 0x99, 0x3d, 0xbd, 0xd0,
+ 0xd2, 0x13, 0xbd, 0x1a, 0x86, 0x8e, 0xbc, 0xeb, 0xaa, 0x6a, 0xbd, 0x00, 0x23,
+ 0xa3, 0xb9, 0xc8, 0x76, 0x77, 0xbc, 0x36, 0x45, 0x72, 0xbd, 0xe4, 0xd7, 0x8a,
+ 0xbc, 0xfd, 0xfa, 0x8c, 0x3d, 0x2b, 0xc3, 0x07, 0xbd, 0x6d, 0xd0, 0x87, 0x3d,
+ 0xec, 0xa4, 0xde, 0x3c, 0x92, 0x4b, 0x65, 0x3d, 0x20, 0x6c, 0x2c, 0xbd, 0x00,
+ 0xb7, 0x0c, 0x3b, 0x96, 0x7f, 0x4b, 0x3d, 0xec, 0xe9, 0xdb, 0xbc, 0xaa, 0x06,
+ 0x3b, 0x3d, 0x20, 0x8c, 0x33, 0x3d, 0xe1, 0x03, 0x18, 0xbd, 0xe0, 0xa5, 0x0a,
+ 0xbc, 0x30, 0x1d, 0x5f, 0x3c, 0xfc, 0x28, 0x6d, 0xbd, 0x43, 0x41, 0x90, 0x3d,
+ 0x58, 0x87, 0x30, 0x3c, 0xdd, 0x8c, 0x60, 0xbd, 0xec, 0x2a, 0xba, 0xbc, 0xf2,
+ 0x9d, 0xa9, 0xbc, 0x30, 0xb0, 0x06, 0x3c, 0x68, 0x3e, 0x53, 0x3c, 0x78, 0xab,
+ 0xff, 0xbc, 0xa8, 0x34, 0x0d, 0xbc, 0x4e, 0x3f, 0x01, 0x3d, 0x00, 0x96, 0x44,
+ 0x3b, 0x2c, 0xa3, 0xda, 0x3c, 0xba, 0xc4, 0x2e, 0xbd, 0x72, 0xbd, 0x2f, 0x3d,
+ 0xfc, 0x1b, 0x7d, 0xbc, 0x9e, 0xbf, 0x7e, 0x3d, 0x02, 0x94, 0x19, 0x3d, 0x94,
+ 0x36, 0x4f, 0x3d, 0xf1, 0xee, 0x68, 0xbd, 0x54, 0x9c, 0x87, 0x3c, 0xfa, 0x3e,
+ 0x7e, 0x3d, 0x02, 0xec, 0x84, 0xbc, 0x12, 0xe7, 0x89, 0xbd, 0xa4, 0x90, 0xa6,
+ 0x3c, 0x3c, 0x7a, 0x89, 0xbc, 0x86, 0x5d, 0x54, 0x3d, 0xa4, 0xad, 0x53, 0xbc,
+ 0x32, 0xc5, 0x00, 0x3d, 0x1e, 0x53, 0x0b, 0x3d, 0xef, 0xae, 0x02, 0xbd, 0x7c,
+ 0xd8, 0x03, 0x3d, 0x38, 0x0e, 0xa5, 0xbc, 0x51, 0xc4, 0x83, 0x3d, 0x66, 0xcb,
+ 0x8f, 0xbd, 0xa6, 0xfe, 0xb6, 0xbc, 0xa4, 0xb1, 0x97, 0x3c, 0x00, 0xad, 0xb2,
+ 0x3a, 0x0f, 0xb7, 0x33, 0xbd, 0x37, 0x1f, 0x6f, 0xbd, 0x57, 0x39, 0x8c, 0x3d,
+ 0x54, 0xe4, 0xb7, 0xbc, 0x1e, 0x63, 0x52, 0xbd, 0x00, 0x3b, 0x43, 0xbd, 0x50,
+ 0x48, 0xf1, 0xbb, 0x18, 0x01, 0x81, 0xbd, 0x90, 0x1c, 0xaf, 0xbc, 0x06, 0xf8,
+ 0x7d, 0xbd, 0xf0, 0xe0, 0xa5, 0xbc, 0x08, 0x06, 0xc3, 0x3c, 0x22, 0xff, 0x83,
+ 0xbc, 0x4c, 0xef, 0x88, 0xbd, 0x36, 0xf2, 0x77, 0x3d, 0x54, 0x3b, 0xd4, 0xbc,
+ 0xa7, 0xa2, 0x8e, 0x3d, 0xac, 0xb2, 0x99, 0x3c, 0x10, 0x08, 0x88, 0xbb, 0x81,
+ 0x58, 0x8d, 0xbd, 0xf8, 0x25, 0x29, 0xbd, 0x1c, 0x0f, 0x26, 0xbd, 0x8e, 0x7a,
+ 0x81, 0xbd, 0x5c, 0x14, 0x8d, 0xbd, 0x81, 0xdd, 0x8f, 0xbd, 0xc8, 0xa2, 0x5f,
+ 0xbc, 0xc0, 0x48, 0xda, 0xba, 0xfe, 0x26, 0x14, 0x3d, 0xe2, 0x9a, 0x89, 0xbd,
+ 0x66, 0x8d, 0x59, 0x3d, 0xd8, 0xf8, 0x45, 0x3d, 0x0b, 0xb1, 0x04, 0xbd, 0x7a,
+ 0x32, 0xdd, 0xbc, 0x00, 0x01, 0x24, 0xbb, 0xc5, 0x97, 0x87, 0xbd, 0x7c, 0xea,
+ 0x46, 0x3d, 0x85, 0xc1, 0x81, 0x3d, 0xe8, 0x63, 0x24, 0x3d, 0x5d, 0xb3, 0x84,
+ 0xbd, 0xca, 0xa4, 0x04, 0x3d, 0xea, 0xe8, 0xf0, 0xbc, 0xdc, 0x41, 0x05, 0xbd,
+ 0xe8, 0x40, 0x4c, 0xbd, 0xb0, 0xb7, 0x2d, 0x3d, 0xa9, 0x0c, 0x1f, 0xbd, 0xd0,
+ 0x50, 0x97, 0x3b, 0x3f, 0x9c, 0x0f, 0xbd, 0xac, 0xa8, 0x59, 0xbd, 0xdb, 0x76,
+ 0x87, 0x3d, 0x08, 0xd7, 0x52, 0x3c, 0xc8, 0xf0, 0x1c, 0x3d, 0xec, 0xc1, 0x4a,
+ 0x3d, 0x44, 0x87, 0x81, 0x3c, 0xbe, 0x6f, 0x13, 0x3d, 0x80, 0x36, 0x49, 0x3c,
+ 0xae, 0xea, 0x73, 0x3d, 0x70, 0xd3, 0x2d, 0x3d, 0xde, 0xbb, 0x9d, 0xbc, 0xaa,
+ 0xba, 0x32, 0x3d, 0x7b, 0xc1, 0x3c, 0xbd, 0x42, 0x4e, 0x5f, 0xbd, 0x9a, 0xd4,
+ 0x75, 0xbd, 0x52, 0x8d, 0x4a, 0x3d, 0xb4, 0x42, 0x8f, 0x3c, 0x20, 0x32, 0x92,
+ 0xbc, 0x39, 0x52, 0x0a, 0xbd, 0xd8, 0xf6, 0x21, 0xbd, 0x8b, 0x5e, 0x26, 0xbd,
+ 0x42, 0x45, 0x5b, 0xbd, 0x06, 0x86, 0x7f, 0xbd, 0x65, 0x5a, 0x57, 0xbd, 0x78,
+ 0x0a, 0x41, 0xbd, 0x5d, 0x12, 0x89, 0xbd, 0x40, 0x70, 0x34, 0xbc, 0xa0, 0x15,
+ 0x43, 0xbb, 0x76, 0xc5, 0x48, 0x3d, 0x40, 0x0b, 0x36, 0x3d, 0x40, 0x3a, 0x3f,
+ 0x3b, 0x58, 0xc4, 0xa3, 0x3c, 0x70, 0xdc, 0xdf, 0x3c, 0x50, 0x13, 0x1c, 0x3d,
+ 0xc0, 0x6d, 0xcc, 0xbb, 0x62, 0xc7, 0x32, 0xbd, 0x15, 0x3f, 0x8b, 0x3d, 0xb5,
+ 0x5b, 0x14, 0xbd, 0xf1, 0x00, 0x3f, 0xbd, 0x90, 0xe9, 0x53, 0x3c, 0xae, 0xa0,
+ 0x1f, 0xbd, 0x54, 0x4f, 0xc8, 0xbc, 0x7c, 0x0b, 0x3a, 0xbc, 0x96, 0x74, 0x38,
+ 0x3d, 0xa6, 0x9b, 0x3f, 0xbd, 0xf4, 0xfd, 0x88, 0xbc, 0x18, 0x1c, 0x97, 0xbc,
+ 0xc8, 0xcf, 0xea, 0x3c, 0xd9, 0x76, 0x8c, 0x3d, 0x3e, 0x07, 0x87, 0xbc, 0xa8,
+ 0xb5, 0x3f, 0x3c, 0x74, 0x96, 0x79, 0xbd, 0x30, 0xfc, 0x4e, 0x3c, 0x60, 0x75,
+ 0x25, 0x3d, 0x28, 0xd6, 0x7a, 0x3c, 0x38, 0xf6, 0x3e, 0x3c, 0x90, 0xd8, 0xf6,
+ 0xbc, 0x0a, 0x8b, 0x78, 0x3d, 0x94, 0x29, 0xc7, 0xbc, 0xa0, 0x3e, 0xe9, 0xbc,
+ 0x20, 0xfc, 0xa9, 0x3c, 0xde, 0xab, 0xd2, 0xbc, 0x97, 0x63, 0x8b, 0xbd, 0xa0,
+ 0xe7, 0x52, 0xbb, 0xa4, 0xf2, 0x36, 0xbc, 0x50, 0x49, 0xb9, 0xbb, 0x1f, 0x9e,
+ 0x88, 0x3d, 0x86, 0xea, 0x9d, 0xbc, 0x38, 0x1b, 0xf5, 0x3c, 0x46, 0xea, 0x1e,
+ 0xbd, 0x00, 0xad, 0x18, 0xba, 0x1e, 0x19, 0x6b, 0xbd, 0xa4, 0x1f, 0x90, 0x3c,
+ 0xf5, 0xb4, 0x42, 0xbd, 0x48, 0xf2, 0x1f, 0xbd, 0x26, 0x05, 0x12, 0x3d, 0x80,
+ 0x01, 0x58, 0xbd, 0xee, 0x98, 0x51, 0xbd, 0xb8, 0xcd, 0x96, 0xbc, 0x65, 0xbc,
+ 0x81, 0x3d, 0x90, 0x57, 0xcd, 0x3b, 0xa0, 0x9a, 0x30, 0x3c, 0xa6, 0xa4, 0x82,
+ 0xbd, 0x20, 0xa1, 0xc6, 0xbb, 0x95, 0x3a, 0x8c, 0xbd, 0x00, 0xa2, 0x72, 0x3c,
+ 0x00, 0xd6, 0x58, 0x3b, 0xc8, 0x1f, 0x7d, 0x3c, 0xf0, 0x98, 0xe1, 0xbb, 0x02,
+ 0x83, 0xe7, 0xbc, 0x9a, 0xc9, 0x67, 0x3d, 0xf5, 0x03, 0x90, 0xbd, 0x00, 0x9e,
+ 0x55, 0xba, 0x80, 0xa0, 0x05, 0x3b, 0x00, 0x53, 0x6d, 0x3c, 0x16, 0xc9, 0x6a,
+ 0x3d, 0x96, 0x11, 0x04, 0x3d, 0x10, 0x45, 0xff, 0xbb, 0xd2, 0x78, 0x2a, 0xbd,
+ 0xbb, 0xe1, 0x8d, 0xbd, 0x8c, 0x4a, 0xc7, 0xbc, 0x20, 0x1c, 0x23, 0x3d, 0x10,
+ 0xb3, 0xff, 0x3b, 0xd8, 0xec, 0x36, 0x3c, 0x64, 0xf1, 0xa7, 0x3d, 0x22, 0xd3,
+ 0xb0, 0xbd, 0xba, 0xd3, 0xc4, 0x3c, 0x7f, 0x35, 0x0a, 0x3d, 0xb1, 0xba, 0xc0,
+ 0x3d, 0x70, 0x6e, 0x10, 0x3c, 0x0b, 0x3f, 0x43, 0x3d, 0x75, 0x57, 0x4f, 0xbd,
+ 0xf7, 0xae, 0x5e, 0xbd, 0xd6, 0xc7, 0x9f, 0x3d, 0x15, 0x89, 0x08, 0x3d, 0x02,
+ 0x77, 0x49, 0x3c, 0x19, 0x3b, 0xc5, 0xbc, 0xa2, 0x8d, 0x43, 0xbd, 0x7b, 0x63,
+ 0x22, 0xbc, 0xb8, 0x4c, 0xbe, 0x3d, 0x98, 0x23, 0x2a, 0xbd, 0xd2, 0x49, 0x69,
+ 0xbd, 0x58, 0xae, 0x14, 0x3d, 0xdc, 0x52, 0x85, 0xbd, 0xd0, 0x91, 0xea, 0x3c,
+ 0x93, 0x04, 0x5c, 0x3d, 0xdf, 0xf9, 0x20, 0x3d, 0xd3, 0x87, 0x3f, 0xbd, 0xae,
+ 0xe4, 0x6a, 0x3c, 0xed, 0x34, 0x27, 0x3c, 0x79, 0x2d, 0x67, 0x3d, 0x63, 0xb8,
+ 0x57, 0xbc, 0x9f, 0x7f, 0x79, 0xbd, 0x44, 0x92, 0x9b, 0x3d, 0x60, 0x08, 0x40,
+ 0xbd, 0xde, 0x4c, 0x9c, 0x3c, 0xdd, 0x61, 0x21, 0x3c, 0x86, 0xd4, 0x15, 0xbd,
+ 0xf9, 0xd9, 0xe1, 0xbd, 0x40, 0xc7, 0x2f, 0x3d, 0xa7, 0x36, 0x89, 0x3d, 0x8a,
+ 0xdc, 0xa0, 0xbd, 0x5a, 0x12, 0x99, 0x3c, 0x8a, 0x63, 0xfa, 0xba, 0x77, 0x80,
+ 0xa2, 0xbd, 0x68, 0x8f, 0x19, 0xbc, 0x91, 0x17, 0xfc, 0x3c, 0xc7, 0x5f, 0xa0,
+ 0x3c, 0x21, 0x34, 0xf2, 0xbc, 0x09, 0x55, 0x1d, 0xbc, 0xcf, 0x87, 0x01, 0xbc,
+ 0xba, 0xe9, 0x8c, 0x3d, 0x07, 0xf7, 0x93, 0x3c, 0xe2, 0x86, 0x80, 0x3c, 0xd7,
+ 0xf7, 0x45, 0xbd, 0x8d, 0x5c, 0x55, 0x3d, 0x40, 0x89, 0x73, 0x3c, 0x7a, 0xe1,
+ 0x5c, 0x3c, 0x6a, 0x34, 0xe7, 0xbc, 0x25, 0x79, 0xaa, 0x3a, 0x13, 0x23, 0xa1,
+ 0x3d, 0x4b, 0x1e, 0xe1, 0x3c, 0x49, 0xbb, 0xb5, 0xbc, 0xa6, 0x19, 0xa9, 0x3c,
+ 0x4e, 0xf1, 0x2a, 0x3d, 0x69, 0x81, 0xac, 0x3c, 0x00, 0x31, 0x46, 0x3c, 0x84,
+ 0x9b, 0x17, 0xbd, 0xa3, 0x50, 0x70, 0x3d, 0xf9, 0x6d, 0x91, 0xbd, 0x41, 0x1f,
+ 0xad, 0x3b, 0x9c, 0x7c, 0xa5, 0xbc, 0xd7, 0xa0, 0x8f, 0xbb, 0xfe, 0xeb, 0x05,
+ 0x3d, 0xc5, 0x31, 0xc5, 0x3a, 0x9a, 0x3c, 0x08, 0x3d, 0xc2, 0x6d, 0x27, 0xbd,
+ 0xa5, 0xc1, 0x7a, 0x3c, 0x4c, 0x25, 0x41, 0xbd, 0x3e, 0x6e, 0xd0, 0x3c, 0x6b,
+ 0x0e, 0x6d, 0x3d, 0xb4, 0x47, 0x86, 0x3c, 0x60, 0xc8, 0x03, 0x3d, 0x78, 0xb8,
+ 0xb3, 0x3d, 0xfb, 0x4b, 0x0d, 0x3d, 0x44, 0x4c, 0xc0, 0x3b, 0xd1, 0xa8, 0x33,
+ 0xbc, 0xf8, 0x4d, 0x8d, 0xbd, 0x3b, 0xeb, 0x15, 0xbd, 0x16, 0xef, 0x19, 0xbb,
+ 0x66, 0x45, 0x2c, 0xbd, 0x50, 0x0b, 0xab, 0xbb, 0x95, 0x0b, 0x06, 0xbd, 0x2c,
+ 0x1f, 0x33, 0xbd, 0xe4, 0xa5, 0xb7, 0x3a, 0xa0, 0xa0, 0xe4, 0xbc, 0x6c, 0x3b,
+ 0x65, 0x3d, 0x1e, 0xa8, 0x8b, 0x3b, 0xe0, 0xb7, 0x82, 0x3c, 0x3f, 0x77, 0x5b,
+ 0x3d, 0xd1, 0xd3, 0x0a, 0x3c, 0xdd, 0xbc, 0xaa, 0xbd, 0xb2, 0x81, 0x91, 0xbc,
+ 0x0f, 0xcb, 0x5d, 0x3d, 0x08, 0xa9, 0xf0, 0xbc, 0x9b, 0xc4, 0x0c, 0x3c, 0xf7,
+ 0x0d, 0x64, 0xbc, 0x1c, 0xa0, 0xa5, 0xbc, 0x5b, 0x1d, 0x2d, 0xbd, 0x03, 0x78,
+ 0x59, 0x3d, 0x1b, 0x8a, 0x13, 0x3d, 0xaa, 0x9c, 0x14, 0xbd, 0x57, 0xe2, 0xf1,
+ 0x3c, 0x5f, 0xaa, 0x58, 0x3d, 0x6c, 0x19, 0xb5, 0xbc, 0x20, 0xeb, 0x3c, 0x3d,
+ 0xe0, 0xda, 0xd5, 0x3c, 0x54, 0x6f, 0x6f, 0xbd, 0x91, 0x64, 0x82, 0x3d, 0xed,
+ 0xcd, 0x10, 0x3b, 0xec, 0x91, 0x1c, 0x3d, 0xad, 0xee, 0xc0, 0x3c, 0xb9, 0x84,
+ 0xb8, 0x3d, 0x67, 0xe4, 0x19, 0xba, 0xc5, 0xca, 0x00, 0x3b, 0xbc, 0x29, 0xcb,
+ 0xbc, 0xca, 0x3c, 0x20, 0xbd, 0x6e, 0xed, 0x2e, 0xbd, 0xd8, 0x47, 0x83, 0xbd,
+ 0x1f, 0x0b, 0x52, 0xbd, 0x10, 0x29, 0x29, 0x3c, 0xfa, 0x35, 0xd2, 0xbc, 0xbe,
+ 0x31, 0x1b, 0x3d, 0x9c, 0x28, 0xdc, 0xbc, 0xb7, 0x93, 0x70, 0xbb, 0x7b, 0xa8,
+ 0x83, 0xbc, 0xcb, 0xf0, 0x9a, 0x3c, 0x53, 0x7d, 0x31, 0xbd, 0x8a, 0x47, 0x4a,
+ 0x3c, 0xf2, 0xe7, 0x79, 0xbd, 0xe7, 0x10, 0x64, 0xbc, 0x69, 0xf1, 0xa9, 0xbc,
+ 0x5c, 0xfc, 0x9b, 0x3d, 0x5a, 0xcf, 0x14, 0x3d, 0xec, 0x08, 0x63, 0x3d, 0x69,
+ 0x0f, 0x99, 0xbd, 0x6a, 0x76, 0xeb, 0x3c, 0xbd, 0x2f, 0x8f, 0x3d, 0xa0, 0x54,
+ 0x8f, 0x3d, 0x7e, 0x08, 0x84, 0x3d, 0xba, 0x94, 0x42, 0x3d, 0x7c, 0xae, 0xf9,
+ 0xbd, 0x70, 0x32, 0x7f, 0x3c, 0x2f, 0xd3, 0x88, 0xbc, 0x9a, 0x1a, 0x49, 0x3d,
+ 0xf6, 0xed, 0x54, 0xbd, 0x7e, 0x15, 0x66, 0x3d, 0x81, 0x94, 0x7f, 0x3d, 0x4a,
+ 0xfb, 0x5f, 0x3c, 0xd7, 0x10, 0x3a, 0x3c, 0xf8, 0x02, 0x89, 0xbd, 0x9f, 0x9c,
+ 0xb9, 0xbc, 0x02, 0x4c, 0x5b, 0x3d, 0x80, 0xe7, 0x33, 0x3c, 0x55, 0x86, 0x99,
+ 0x3d, 0x9d, 0xa9, 0xad, 0xbd, 0x9e, 0x1b, 0x76, 0xbb, 0xb8, 0x62, 0x49, 0x3d,
+ 0x22, 0x21, 0x65, 0x3d, 0x22, 0x6d, 0x0f, 0x3d, 0x60, 0x23, 0x87, 0xbc, 0xc8,
+ 0xfc, 0x26, 0xbd, 0xc5, 0x47, 0x8c, 0xbd, 0x22, 0x6e, 0xe2, 0xbc, 0xf0, 0x78,
+ 0x2e, 0x3d, 0xa4, 0x7f, 0xa5, 0xbc, 0xf1, 0x41, 0xae, 0x3d, 0xa4, 0x08, 0x0b,
+ 0x3d, 0xe8, 0xbb, 0x1c, 0xbc, 0xf8, 0xdd, 0x85, 0xbc, 0x72, 0x87, 0xea, 0x3c,
+ 0x4a, 0xaa, 0x9a, 0x3d, 0x86, 0xdb, 0xb6, 0x3d, 0x0f, 0xb5, 0xd1, 0xba, 0xfc,
+ 0x88, 0x62, 0xbd, 0x08, 0x54, 0xfd, 0x3d, 0x35, 0xf8, 0x2e, 0xbd, 0x3b, 0xbb,
+ 0xc9, 0x3d, 0x9c, 0xb6, 0x57, 0x3d, 0x03, 0x65, 0x58, 0x3d, 0x13, 0xd0, 0x1d,
+ 0xbd, 0xbb, 0xb1, 0xbf, 0xbc, 0x78, 0x00, 0xde, 0xbc, 0x5c, 0xcb, 0x48, 0xbd,
+ 0xd3, 0xa1, 0x85, 0x3d, 0x08, 0x35, 0xf6, 0xbc, 0x4c, 0x66, 0x89, 0x3d, 0x09,
+ 0x92, 0xa6, 0xbc, 0x64, 0x99, 0x9e, 0xbd, 0xae, 0x80, 0x85, 0xbd, 0x99, 0xe0,
+ 0xe2, 0x3c, 0x8e, 0x75, 0x66, 0xbc, 0x1e, 0x8c, 0xb9, 0xbd, 0x57, 0x43, 0xa8,
+ 0x3c, 0x31, 0x71, 0xac, 0xbc, 0xb5, 0x75, 0x01, 0x3d, 0x10, 0x39, 0x5c, 0xbd,
+ 0xa6, 0xf9, 0x7b, 0xbd, 0xf6, 0xea, 0x5d, 0x3d, 0xd3, 0x34, 0xc7, 0xbc, 0x4e,
+ 0xdc, 0x76, 0xbc, 0x7c, 0x98, 0x26, 0x3c, 0xfb, 0x7a, 0x27, 0xbd, 0x44, 0xe6,
+ 0x44, 0xbd, 0x26, 0xc5, 0xb2, 0x3d, 0xb1, 0x6e, 0xfa, 0xbd, 0x79, 0xcc, 0x29,
+ 0xbd, 0x08, 0xae, 0x46, 0xbc, 0x9d, 0x74, 0x67, 0x3d, 0xa3, 0xb6, 0x98, 0x3d,
+ 0x92, 0xae, 0x3f, 0xbc, 0xef, 0x8c, 0x90, 0x3d, 0xeb, 0x4c, 0x02, 0xbc, 0x21,
+ 0x7d, 0xe5, 0x3c, 0xd4, 0x6f, 0x47, 0xbd, 0x1a, 0xe8, 0x84, 0x3c, 0x0c, 0x96,
+ 0x85, 0xbd, 0xa9, 0x69, 0xa7, 0xbb, 0x8c, 0x1e, 0x82, 0xba, 0xff, 0x78, 0x04,
+ 0xbc, 0x25, 0xb9, 0xaa, 0xbd, 0x0b, 0x03, 0x48, 0xbc, 0xb3, 0xbb, 0x88, 0xbd,
+ 0x00, 0x26, 0xba, 0xbd, 0x82, 0x41, 0x81, 0x3d, 0xfa, 0x3d, 0xc7, 0x3c, 0x38,
+ 0x5c, 0x49, 0xbd, 0x0d, 0x4d, 0x3a, 0x3d, 0x67, 0x58, 0x0a, 0xbd, 0x7e, 0xf6,
+ 0x82, 0x3b, 0x1a, 0x7a, 0x7b, 0x3d, 0xba, 0xff, 0x84, 0x3c, 0x46, 0x87, 0x84,
+ 0x3c, 0xe8, 0x6c, 0x29, 0x3d, 0x8c, 0x6a, 0xac, 0xbc, 0x89, 0x34, 0x91, 0xbd,
+ 0xb9, 0xaf, 0xa6, 0x3c, 0xe0, 0x9e, 0xaf, 0xbc, 0xd2, 0x7a, 0x38, 0x3d, 0xac,
+ 0xbf, 0xc9, 0x3d, 0x73, 0xa1, 0x13, 0x3d, 0x7d, 0xe1, 0xf2, 0x3c, 0x73, 0xec,
+ 0xcf, 0x3b, 0xfd, 0x7b, 0x8e, 0x3d, 0x1e, 0xb2, 0xf3, 0xbc, 0xdc, 0x32, 0x03,
+ 0xbe, 0x5e, 0xfa, 0x1b, 0x3d, 0xdc, 0x1a, 0x25, 0x3d, 0x00, 0xcd, 0x48, 0xba,
+ 0x13, 0x9d, 0xbe, 0x3d, 0x2e, 0x05, 0x77, 0xbd, 0x17, 0x74, 0x9e, 0xbd, 0xae,
+ 0xc5, 0x62, 0x3c, 0x95, 0xf4, 0x59, 0x3d, 0x36, 0xd2, 0xa4, 0x3d, 0xab, 0x2b,
+ 0x84, 0xbc, 0x87, 0x89, 0x55, 0x3d, 0xd0, 0xde, 0x5d, 0xbc, 0xcd, 0xb0, 0xce,
+ 0xbc, 0x29, 0xa0, 0xc8, 0xbc, 0x8a, 0x0b, 0xf1, 0x3c, 0xb8, 0xce, 0x9c, 0x3c,
+ 0x14, 0xd1, 0x36, 0x3d, 0x50, 0x4b, 0x08, 0xbd, 0x85, 0x95, 0x4b, 0xbd, 0x31,
+ 0x9e, 0xcf, 0xbc, 0xff, 0x96, 0x83, 0x3d, 0x6c, 0x32, 0x15, 0x3c, 0x6d, 0xfd,
+ 0xb0, 0x3d, 0x05, 0xd8, 0x33, 0xbd, 0x1b, 0x74, 0x8d, 0xbd, 0xfb, 0x92, 0x21,
+ 0xbd, 0xde, 0x6c, 0x8f, 0xbc, 0xcc, 0x1e, 0x0f, 0xbd, 0xfa, 0xc4, 0xb8, 0xbb,
+ 0xc6, 0xe2, 0x1e, 0x3d, 0x9b, 0xd2, 0x99, 0xbb, 0x0f, 0x21, 0x5a, 0xbd, 0x32,
+ 0xb3, 0x8b, 0x3c, 0x08, 0x0c, 0x2e, 0x3b, 0x81, 0xda, 0x5f, 0xbd, 0x44, 0x42,
+ 0x81, 0x3c, 0x11, 0xf4, 0xb3, 0xbb, 0xf5, 0x91, 0xdd, 0xbd, 0x20, 0xdd, 0xb0,
+ 0x3b, 0x94, 0xc1, 0xe4, 0x3c, 0x7c, 0x2f, 0x5d, 0xbd, 0x8b, 0x1f, 0xf3, 0x3c,
+ 0xf7, 0xc1, 0xd1, 0xbd, 0x2e, 0x5f, 0x5d, 0xbd, 0x35, 0x2c, 0x92, 0x3b, 0x47,
+ 0x24, 0x34, 0x3d, 0x7f, 0x44, 0x71, 0x3d, 0x39, 0xd7, 0xfc, 0x3c, 0x60, 0x34,
+ 0x49, 0xbd, 0x70, 0xdc, 0x80, 0x3c, 0x3b, 0xe4, 0x5d, 0xbc, 0x7d, 0x7f, 0xe3,
+ 0x3c, 0x6d, 0x96, 0x2e, 0x3d, 0x7b, 0x5c, 0x15, 0x3d, 0xc3, 0x8f, 0x78, 0x3c,
+ 0x5b, 0x2f, 0x2d, 0xbc, 0x30, 0xfd, 0x3a, 0x3d, 0x79, 0x6a, 0xbb, 0x3d, 0x1a,
+ 0xb0, 0x4d, 0x3c, 0xe2, 0x91, 0x9a, 0x3b, 0x3c, 0x03, 0xa4, 0x3d, 0xa9, 0x2a,
+ 0x3a, 0xbd, 0xfc, 0xbb, 0x88, 0x3d, 0x16, 0x7f, 0x2a, 0x3c, 0xdd, 0xfc, 0x43,
+ 0x3d, 0x41, 0x34, 0x3f, 0x3d, 0x80, 0x68, 0x76, 0xbd, 0xbb, 0xab, 0xa9, 0x3d,
+ 0x4f, 0x4c, 0x17, 0x3d, 0xa3, 0x6e, 0x48, 0x3c, 0x24, 0xdf, 0xed, 0xbc, 0xa9,
+ 0xca, 0x8e, 0xbd, 0x28, 0x64, 0x51, 0x3d, 0x65, 0xea, 0x94, 0x3d, 0x80, 0xc3,
+ 0x08, 0x3b, 0xba, 0xc6, 0x38, 0x3d, 0xa3, 0x2f, 0x64, 0xba, 0x16, 0xc1, 0x28,
+ 0x3d, 0xfb, 0x5a, 0x4c, 0x3c, 0xd9, 0x21, 0x26, 0xbd, 0xb9, 0x19, 0xbd, 0x3d,
+ 0xba, 0x00, 0x59, 0x3c, 0xeb, 0x40, 0x14, 0xbc, 0x24, 0x37, 0xe9, 0xbc, 0x5e,
+ 0x99, 0xd0, 0xbc, 0x7c, 0xbc, 0x18, 0xbd, 0x71, 0x23, 0x56, 0x3d, 0xca, 0xa7,
+ 0x30, 0xbe, 0x37, 0x29, 0x5b, 0xbd, 0x73, 0xfa, 0x30, 0x3d, 0xb7, 0x67, 0xcd,
+ 0xbc, 0x92, 0xa3, 0x54, 0x3c, 0xf8, 0x54, 0xaa, 0x3d, 0xba, 0x13, 0x8c, 0x3d,
+ 0x35, 0xa3, 0xa6, 0x3c, 0x11, 0x44, 0x1d, 0xbc, 0x56, 0xe4, 0x18, 0xbd, 0xd6,
+ 0x33, 0xab, 0x3c, 0x2c, 0x70, 0xa8, 0xbc, 0xa0, 0xd7, 0xc8, 0xb8, 0x56, 0xd9,
+ 0x69, 0x3d, 0xab, 0xaf, 0x5e, 0xbd, 0x09, 0xbf, 0xb1, 0xbd, 0xad, 0xf1, 0x50,
+ 0x3c, 0xe0, 0x69, 0x47, 0xbd, 0x21, 0x32, 0x2b, 0xbb, 0x66, 0x24, 0x90, 0xbd,
+ 0xf8, 0xca, 0xbf, 0xbc, 0x1f, 0x85, 0x02, 0xbd, 0xc9, 0x47, 0xa6, 0x3d, 0xaa,
+ 0xeb, 0x9b, 0xbc, 0xcf, 0x49, 0x88, 0xbd, 0x40, 0xf0, 0x4e, 0xbc, 0xe3, 0x45,
+ 0x16, 0x3d, 0xd4, 0x2e, 0xa4, 0xbc, 0xaf, 0xe6, 0x81, 0x3d, 0x62, 0xef, 0x2c,
+ 0xbc, 0x95, 0xea, 0x63, 0xbd, 0x33, 0x76, 0x9e, 0x3d, 0x16, 0xdf, 0xd6, 0xbd,
+ 0xa4, 0xb0, 0xde, 0x39, 0xee, 0xfc, 0x89, 0x3d, 0xbd, 0x48, 0xbe, 0x3b, 0xd1,
+ 0xbb, 0x31, 0xbc, 0x69, 0x1b, 0x26, 0xbd, 0xc1, 0x34, 0xec, 0x3c, 0x33, 0x47,
+ 0xd5, 0x3c, 0xd0, 0xfb, 0x5c, 0x3b, 0xec, 0x71, 0x27, 0xbc, 0x48, 0x88, 0x62,
+ 0x3c, 0x60, 0x89, 0x76, 0x3b, 0x4c, 0x07, 0xe8, 0x3c, 0xd5, 0xb4, 0x16, 0x3d,
+ 0x9d, 0x21, 0x9f, 0x3c, 0x9d, 0x78, 0xb3, 0xbd, 0xeb, 0x74, 0x21, 0xbd, 0xdb,
+ 0x5e, 0x75, 0xbd, 0x02, 0xf1, 0x9b, 0x3d, 0x50, 0x67, 0x30, 0xbc, 0xc4, 0xa7,
+ 0xe6, 0x3c, 0x77, 0x75, 0x6e, 0x3c, 0xfd, 0x7e, 0x9e, 0xbb, 0x79, 0xed, 0x77,
+ 0xbc, 0x18, 0x82, 0x40, 0x3d, 0x18, 0xd1, 0x93, 0x3d, 0x4a, 0xa2, 0x32, 0xbb,
+ 0x83, 0xd5, 0x51, 0x3c, 0xa1, 0x52, 0xd9, 0x38, 0x6a, 0x5e, 0xb4, 0x3d, 0x73,
+ 0xb2, 0x1f, 0xbd, 0x02, 0xe7, 0x06, 0xbd, 0x25, 0x20, 0x5c, 0xbd, 0x6a, 0x66,
+ 0x16, 0x3d, 0xef, 0x75, 0x7c, 0x3d, 0x4b, 0xa8, 0x89, 0x3d, 0x17, 0x5e, 0x82,
+ 0xbc, 0xd7, 0x41, 0x80, 0x3d, 0x67, 0x41, 0xaf, 0xbc, 0x93, 0x11, 0x9b, 0x3d,
+ 0x4a, 0x03, 0xb3, 0xbd, 0x0d, 0x82, 0x32, 0xbd, 0x39, 0x35, 0xee, 0xbc, 0x07,
+ 0x60, 0x87, 0xbd, 0x51, 0xb7, 0x4d, 0x3b, 0xe4, 0x6e, 0xbf, 0xbb, 0x24, 0x01,
+ 0x36, 0xbd, 0x24, 0x02, 0x10, 0xbd, 0xfe, 0x24, 0x4f, 0xbd, 0xaf, 0xc2, 0x34,
+ 0xbc, 0x21, 0x39, 0xd9, 0x3c, 0x80, 0x73, 0x88, 0x3c, 0x8e, 0xaf, 0x84, 0xbd,
+ 0x1e, 0x05, 0x8b, 0xbd, 0xd2, 0xa7, 0x0e, 0x3d, 0x53, 0xe6, 0x89, 0x3b, 0xf3,
+ 0xd7, 0xa7, 0x3d, 0x58, 0xf7, 0x29, 0x3d, 0xb1, 0x45, 0x9f, 0x3c, 0x3d, 0xf4,
+ 0x73, 0x3d, 0x73, 0xd2, 0x4d, 0xbd, 0x6f, 0x4a, 0x0f, 0x3d, 0xc1, 0x60, 0x95,
+ 0xbd, 0xf4, 0x0f, 0x8e, 0x3d, 0x83, 0x58, 0xed, 0xbd, 0x58, 0x39, 0x12, 0x3c,
+ 0x20, 0x58, 0x39, 0x3d, 0xf4, 0xc9, 0x14, 0x3d, 0x5f, 0xa1, 0x0a, 0x3d, 0xd0,
+ 0x80, 0x42, 0xbd, 0x2b, 0xc9, 0x35, 0xbd, 0xa5, 0xe0, 0xf9, 0xbc, 0x11, 0xe4,
+ 0x8b, 0x3c, 0x0f, 0x18, 0x33, 0xbd, 0xb7, 0x53, 0x8f, 0xbc, 0xa8, 0xfe, 0x4f,
+ 0xbd, 0x1f, 0x8d, 0xf9, 0x3b, 0x33, 0x31, 0xa6, 0x3d, 0xb7, 0x6d, 0x03, 0x3c,
+ 0x80, 0xaa, 0xda, 0xbd, 0x82, 0x6e, 0xc5, 0x3c, 0x22, 0xaa, 0xba, 0x3c, 0xfd,
+ 0xd9, 0xcd, 0x3c, 0x16, 0x60, 0x5a, 0x3c, 0x48, 0xdb, 0x36, 0x3d, 0x10, 0xf4,
+ 0x84, 0xbc, 0x78, 0xf4, 0x8c, 0x3d, 0x24, 0xd3, 0xf2, 0xbc, 0x8e, 0xac, 0x16,
+ 0xbd, 0x41, 0x7a, 0xf1, 0x3c, 0xd3, 0x25, 0x77, 0x3d, 0x26, 0xf2, 0x63, 0x3d,
+ 0x7a, 0xb2, 0xa0, 0x3d, 0x00, 0xbb, 0xa4, 0x3c, 0x11, 0xd2, 0xf7, 0xbc, 0x92,
+ 0x58, 0xa7, 0x3d, 0xa1, 0x9e, 0xaf, 0xbd, 0x38, 0xb3, 0x0b, 0x3c, 0xf3, 0xbb,
+ 0x62, 0x3c, 0x98, 0x07, 0x9c, 0x3d, 0xa3, 0x56, 0xba, 0xba, 0x1a, 0x8d, 0x95,
+ 0x3d, 0x13, 0x14, 0x7b, 0x3d, 0xfe, 0x05, 0xb3, 0x3d, 0xd2, 0x56, 0x01, 0x3c,
+ 0x9e, 0xad, 0x44, 0x3d, 0xc7, 0xd7, 0x98, 0x3c, 0x1e, 0xfb, 0x18, 0x3d, 0x58,
+ 0x4c, 0x53, 0xbc, 0xf2, 0x16, 0xf1, 0xbb, 0xae, 0x3a, 0xad, 0xbd, 0x3d, 0xdd,
+ 0x40, 0xbd, 0x9f, 0xa1, 0x9c, 0xbd, 0xb6, 0xb7, 0x09, 0xbc, 0x74, 0xc3, 0xbc,
+ 0xbd, 0x22, 0xf9, 0x61, 0xbc, 0x71, 0x46, 0x80, 0xbc, 0x26, 0x48, 0x53, 0xbd,
+ 0x6a, 0xb7, 0x5d, 0x3d, 0xb9, 0xc9, 0x66, 0x3d, 0xaf, 0x27, 0x00, 0xbd, 0x24,
+ 0x28, 0xd3, 0x3a, 0x53, 0xfb, 0x5d, 0xbd, 0xf4, 0x8b, 0x8a, 0x3d, 0x80, 0x14,
+ 0x8e, 0xbd, 0x72, 0xcc, 0xa7, 0x3d, 0xd4, 0x5b, 0xff, 0xbc, 0xdf, 0x54, 0x43,
+ 0xbd, 0x6a, 0x25, 0xe1, 0x3b, 0xe2, 0xe9, 0x09, 0xbd, 0x55, 0xad, 0x63, 0xbd,
+ 0x14, 0xb6, 0xa9, 0x3b, 0x0c, 0xba, 0xd8, 0xbc, 0xc3, 0x6d, 0x53, 0xbd, 0x42,
+ 0xa5, 0x5f, 0xbd, 0x7b, 0x04, 0x22, 0xbd, 0x15, 0x56, 0x77, 0x3c, 0x53, 0x67,
+ 0xe6, 0xbc, 0x69, 0xe6, 0x89, 0x3c, 0x80, 0xcc, 0xbb, 0xbb, 0xea, 0x11, 0xb5,
+ 0x3d, 0x02, 0x35, 0xb6, 0x3b, 0x98, 0x78, 0x19, 0x3d, 0xae, 0x02, 0xdd, 0xbd,
+ 0x88, 0x78, 0x35, 0x3c, 0x30, 0x8b, 0x9d, 0xbd, 0xce, 0x4f, 0xad, 0xbd, 0x27,
+ 0xf3, 0xcf, 0x3c, 0xda, 0x15, 0x82, 0xbd, 0x50, 0x43, 0x86, 0x3c, 0xff, 0x0b,
+ 0xca, 0x3b, 0xec, 0x3f, 0xd1, 0xbc, 0x53, 0xc4, 0x15, 0x3d, 0x72, 0x9f, 0x12,
+ 0x3d, 0xcb, 0x3b, 0xcc, 0x3c, 0x90, 0xd2, 0x3a, 0x3d, 0x42, 0x53, 0x0d, 0xbc,
+ 0x46, 0x82, 0x93, 0x3d, 0xe9, 0x9a, 0xb1, 0xbd, 0x05, 0x99, 0x98, 0xbb, 0x52,
+ 0x17, 0x71, 0xbd, 0x6e, 0xb6, 0x8d, 0xbd, 0x0f, 0xe1, 0x66, 0xbd, 0x2b, 0x2f,
+ 0x1b, 0x3d, 0x97, 0x2f, 0xf4, 0xbc, 0xc0, 0xc0, 0x0f, 0x3d, 0xf3, 0x36, 0x6f,
+ 0x3d, 0x38, 0x99, 0x97, 0x3c, 0xca, 0x4a, 0xca, 0xbd, 0xe2, 0x66, 0x11, 0x3b,
+ 0xa8, 0xe8, 0x03, 0xbd, 0x60, 0xbf, 0x7e, 0xbb, 0x6d, 0x53, 0xb9, 0x3d, 0x50,
+ 0x02, 0x0c, 0x3c, 0xe3, 0x5f, 0xbb, 0xbd, 0xd1, 0xc0, 0xbd, 0xbc, 0x42, 0x35,
+ 0x89, 0x3d, 0x36, 0x8e, 0x9c, 0xbd, 0xac, 0x4a, 0x92, 0xbd, 0x7c, 0xb8, 0x65,
+ 0xbd, 0x77, 0xdd, 0x5e, 0xbd, 0x58, 0x55, 0x38, 0xbd, 0x2e, 0xa6, 0x67, 0x3c,
+ 0x7d, 0x81, 0x0b, 0xbd, 0x7b, 0xda, 0x92, 0x3d, 0x07, 0xec, 0x98, 0xbc, 0x6c,
+ 0x89, 0x35, 0xbd, 0x1b, 0x09, 0x0a, 0x3d, 0xca, 0x57, 0x27, 0x3c, 0xab, 0xff,
+ 0x2e, 0x3d, 0x97, 0xd7, 0x8d, 0xbd, 0xfa, 0x59, 0xb3, 0x3d, 0xb2, 0x38, 0x31,
+ 0x3d, 0xd2, 0x30, 0x2b, 0x3d, 0xa5, 0x8d, 0xa4, 0x3b, 0xc9, 0xca, 0xe4, 0x3c,
+ 0x0a, 0x75, 0x99, 0x3d, 0x3f, 0x85, 0x08, 0x3d, 0xff, 0x4e, 0x4e, 0x3d, 0x00,
+ 0xfb, 0x74, 0x3d, 0x90, 0x22, 0xb2, 0xbb, 0xed, 0xe6, 0x8c, 0xbb, 0x23, 0x48,
+ 0xe6, 0x3b, 0xfc, 0x6e, 0x62, 0xbd, 0xd5, 0x72, 0x58, 0x3d, 0xc8, 0x23, 0xce,
+ 0x3c, 0xf2, 0x1f, 0x3b, 0x3c, 0xd0, 0x69, 0xc6, 0x3b, 0x18, 0x15, 0x62, 0x3c,
+ 0xa8, 0x0a, 0x2b, 0x3d, 0x94, 0xed, 0x79, 0xbd, 0xf1, 0xff, 0x81, 0xbc, 0xb8,
+ 0x90, 0x3e, 0xbd, 0x4d, 0x8e, 0x25, 0x3d, 0x04, 0x91, 0xef, 0x3d, 0xb9, 0x57,
+ 0x17, 0x3d, 0x3a, 0xef, 0x01, 0xbd, 0xc4, 0x52, 0x59, 0xbc, 0x8a, 0x5e, 0x8e,
+ 0xbd, 0xe7, 0x23, 0xf5, 0xbc, 0x4f, 0xe7, 0x1f, 0xbd, 0x1f, 0x86, 0x82, 0xbc,
+ 0x1e, 0xf9, 0x53, 0x3d, 0xdf, 0x9c, 0x0a, 0x3c, 0xbf, 0xc9, 0xcc, 0x3c, 0xec,
+ 0xa1, 0x3e, 0xbc, 0x9c, 0x8e, 0x5e, 0x3a, 0xfd, 0xd8, 0x90, 0xbc, 0xe8, 0x4c,
+ 0xc7, 0xbc, 0xf2, 0x0f, 0x4b, 0x3a, 0x08, 0x9d, 0xbc, 0xbc, 0xab, 0x39, 0x4d,
+ 0x3d, 0xea, 0x3d, 0x6b, 0x3d, 0x5c, 0x84, 0x80, 0x3d, 0x7d, 0x95, 0xf8, 0xbc,
+ 0x70, 0xb2, 0x18, 0xbd, 0x2a, 0x02, 0x79, 0x3d, 0xe8, 0xd9, 0x3c, 0x3d, 0x67,
+ 0xaf, 0x29, 0x3d, 0x39, 0x45, 0x27, 0xbd, 0x0a, 0x7b, 0x12, 0xbd, 0xbb, 0xdc,
+ 0xe9, 0xbc, 0x73, 0x04, 0x83, 0xbd, 0x5d, 0xe4, 0x1c, 0xbd, 0xf0, 0x70, 0x29,
+ 0x3d, 0x87, 0x1e, 0x0d, 0xbd, 0x39, 0x86, 0xf0, 0x3c, 0xf5, 0x57, 0x3e, 0xbd,
+ 0xc8, 0x3c, 0x18, 0xbc, 0xf4, 0xa8, 0xa0, 0x3d, 0x5c, 0xa0, 0x6c, 0x3d, 0x02,
+ 0x7a, 0x7e, 0xbc, 0x0b, 0xb6, 0x6d, 0xbd, 0xb0, 0x9a, 0xa8, 0x3c, 0xee, 0x24,
+ 0x11, 0x3d, 0x54, 0x87, 0xf7, 0xbc, 0x57, 0x52, 0x70, 0xbd, 0x1e, 0x35, 0x46,
+ 0xbd, 0x38, 0x2d, 0x82, 0x3d, 0x9d, 0x1a, 0x3c, 0xbd, 0x53, 0x7b, 0xa6, 0x3d,
+ 0x29, 0x4b, 0xab, 0x3d, 0x0c, 0x43, 0x2d, 0x3d, 0x1a, 0x12, 0x95, 0x3d, 0x3b,
+ 0xf1, 0x3e, 0x3d, 0x80, 0xf6, 0x8d, 0xbd, 0x1b, 0xb6, 0xb4, 0xbc, 0x98, 0x23,
+ 0x79, 0xbd, 0xb7, 0xf6, 0xc5, 0x3d, 0x10, 0xd5, 0x48, 0x3d, 0x58, 0x7c, 0x9f,
+ 0xbd, 0xa0, 0x5a, 0x16, 0xbd, 0x82, 0xfb, 0x8e, 0xbd, 0x0b, 0xec, 0xed, 0xbc,
+ 0x92, 0xb7, 0xa3, 0xbd, 0xd5, 0xfd, 0x85, 0xbd, 0x54, 0xc9, 0x20, 0x3d, 0xad,
+ 0xa1, 0x90, 0xbd, 0x83, 0xd6, 0xfb, 0xbc, 0xe2, 0x46, 0x43, 0x3b, 0xfe, 0xa6,
+ 0xbd, 0xb7, 0x8f, 0xd3, 0xaf, 0x3d, 0x75, 0xb9, 0x9d, 0x3d, 0xd5, 0xfc, 0x2a,
+ 0x3c, 0xc6, 0x7e, 0xd6, 0xbc, 0x08, 0xcd, 0x4c, 0xbd, 0xcf, 0x4f, 0x73, 0x3d,
+ 0x3e, 0x7f, 0xb7, 0xbc, 0xbc, 0xa9, 0xfd, 0xbc, 0xf4, 0x8b, 0xa6, 0xbc, 0x11,
+ 0x90, 0xd0, 0xbc, 0x47, 0xf7, 0x4d, 0x3c, 0xed, 0x09, 0x64, 0xbd, 0x61, 0x49,
+ 0x8d, 0xbc, 0xc8, 0xd3, 0x3c, 0x3d, 0x72, 0x23, 0x88, 0x3d, 0xc3, 0xa7, 0x2e,
+ 0x3d, 0x67, 0x01, 0x2d, 0xbd, 0xcc, 0x34, 0xa0, 0xbd, 0x7e, 0xc7, 0xf8, 0xbc,
+ 0x0c, 0xf5, 0xaf, 0xbb, 0x6e, 0xa6, 0x4f, 0x3d, 0xe2, 0xb9, 0x88, 0xbd, 0x87,
+ 0x6f, 0xf9, 0xbc, 0x82, 0x23, 0x16, 0x3c, 0x10, 0x0c, 0x69, 0x3b, 0xab, 0x02,
+ 0xe2, 0x3c, 0x57, 0x6a, 0x08, 0xba, 0x4e, 0xc7, 0x6a, 0x3d, 0x30, 0x86, 0x6d,
+ 0x3c, 0xee, 0xb3, 0x84, 0x3d, 0xf9, 0xc4, 0x3a, 0x3d, 0x6f, 0x21, 0x8d, 0xbb,
+ 0xef, 0x7e, 0xc1, 0x3b, 0x05, 0xca, 0x12, 0xbc, 0x8a, 0x77, 0x2b, 0xbd, 0x1e,
+ 0x23, 0x32, 0x3d, 0x32, 0x8b, 0x03, 0x3d, 0xd3, 0x33, 0x0a, 0xbd, 0x3f, 0xdd,
+ 0x59, 0xbd, 0x18, 0xfa, 0x00, 0x3d, 0x46, 0x0b, 0xdd, 0x3b, 0x96, 0x2b, 0x4c,
+ 0xbd, 0xc8, 0xcc, 0xa7, 0x3d, 0xe2, 0xad, 0x2e, 0x3d, 0xbc, 0x68, 0x54, 0x3d,
+ 0xcb, 0x88, 0xae, 0x3c, 0x00, 0xd8, 0x15, 0xbc, 0x18, 0x4b, 0xb5, 0xbd, 0x89,
+ 0x31, 0x93, 0xbd, 0x84, 0xd3, 0x57, 0x3d, 0x86, 0x2c, 0x6c, 0x3d, 0x18, 0x08,
+ 0xb1, 0x3d, 0x14, 0x61, 0xbc, 0xbc, 0x25, 0xa4, 0x27, 0xbd, 0xfa, 0xdd, 0xb7,
+ 0xbd, 0x81, 0xaf, 0x1d, 0xbc, 0x06, 0x91, 0x5d, 0x3d, 0x54, 0xfb, 0xc9, 0xbc,
+ 0x0b, 0x35, 0x9a, 0x3b, 0x48, 0x7f, 0x1c, 0xbd, 0xaa, 0x85, 0x54, 0x3d, 0x3e,
+ 0x43, 0xfe, 0xbb, 0xcb, 0xf9, 0xbf, 0x3b, 0x4b, 0x03, 0xed, 0x3c, 0xe0, 0x7f,
+ 0x85, 0x3d, 0xe2, 0x52, 0x82, 0x3d, 0x98, 0x11, 0x94, 0x3d, 0x39, 0x2d, 0x26,
+ 0x3c, 0xce, 0x96, 0x5e, 0xbd, 0x6c, 0x42, 0x31, 0xbd, 0xca, 0x90, 0xd4, 0x3b,
+ 0x66, 0xa9, 0xc0, 0xbd, 0x23, 0x2e, 0x8d, 0x3d, 0x26, 0xc8, 0x4a, 0xbc, 0x2a,
+ 0xbd, 0x09, 0xbd, 0x26, 0xa5, 0xe6, 0x3c, 0x1e, 0x7c, 0xaa, 0x3d, 0x1b, 0x52,
+ 0x15, 0x3d, 0xb2, 0xa4, 0x81, 0x3d, 0x73, 0x78, 0x8a, 0x3c, 0x60, 0x6d, 0x4a,
+ 0xbd, 0x60, 0xc1, 0x3b, 0xbc, 0x14, 0xc6, 0xfb, 0x3c, 0x48, 0x70, 0x05, 0xbd,
+ 0xc1, 0xa4, 0x98, 0x3d, 0x71, 0x0a, 0xc4, 0xbd, 0x25, 0xdd, 0x31, 0xbd, 0x99,
+ 0x3a, 0x94, 0xbd, 0xa1, 0x45, 0xbf, 0x3c, 0x54, 0x14, 0xbf, 0xbc, 0xfd, 0x98,
+ 0xd2, 0xbd, 0xca, 0x27, 0x87, 0xbd, 0x1a, 0x52, 0x3a, 0x3d, 0xc3, 0xcf, 0x42,
+ 0xbc, 0x4c, 0x2f, 0xe0, 0x3a, 0x96, 0x3f, 0x5e, 0x3b, 0xba, 0xc2, 0x1d, 0xbd,
+ 0xed, 0x26, 0x42, 0xbd, 0xf6, 0xe0, 0xb4, 0x3d, 0xbe, 0x39, 0x23, 0xbc, 0x05,
+ 0x9d, 0xba, 0x3c, 0xe9, 0x38, 0x2f, 0xbb, 0x15, 0x9c, 0xbb, 0x3d, 0x22, 0xca,
+ 0x66, 0x3c, 0x10, 0x16, 0xdb, 0xbc, 0x11, 0x3d, 0xda, 0x3d, 0xac, 0x48, 0x37,
+ 0xbd, 0xac, 0x3e, 0x08, 0xbd, 0x8b, 0xb1, 0x7f, 0x3d, 0xe7, 0x31, 0xa3, 0x3c,
+ 0xd5, 0xe9, 0xb6, 0x3d, 0x53, 0xc1, 0x19, 0xbd, 0x2f, 0xc2, 0x35, 0xbd, 0xf9,
+ 0xa6, 0xa2, 0xbd, 0x46, 0x22, 0x2b, 0x3d, 0x2a, 0x2c, 0x3b, 0xbd, 0xf3, 0x8e,
+ 0x07, 0x3c, 0xff, 0xb1, 0x09, 0xbd, 0xbd, 0x01, 0x0f, 0xbb, 0x04, 0x7f, 0x4a,
+ 0xbd, 0xb9, 0xca, 0x87, 0x3d, 0x4e, 0x96, 0x12, 0xbc, 0x7b, 0x9a, 0x7d, 0x3d,
+ 0x1b, 0x48, 0x08, 0xbc, 0x1b, 0x36, 0x8a, 0x3d, 0xd1, 0x48, 0xe1, 0x3c, 0xb9,
+ 0xb0, 0x6f, 0x3d, 0x51, 0x6a, 0x83, 0xbb, 0xaa, 0xf0, 0xac, 0x3d, 0x61, 0xdb,
+ 0x43, 0xbd, 0x2e, 0xcf, 0xa2, 0x3d, 0xa6, 0x41, 0x89, 0x3d, 0x53, 0x86, 0xe1,
+ 0xbc, 0xda, 0x91, 0x9a, 0xbd, 0xba, 0xf7, 0x86, 0x3d, 0x8b, 0x8c, 0xab, 0xbd,
+ 0xa2, 0x2c, 0x6b, 0x3d, 0x31, 0x66, 0x83, 0x3c, 0xce, 0xd5, 0x0e, 0xbd, 0x35,
+ 0x29, 0x73, 0x3d, 0x9b, 0xf7, 0xb0, 0x3d, 0x51, 0x33, 0x21, 0x3d, 0x4c, 0xa1,
+ 0x4b, 0x3d, 0x58, 0xe3, 0xd5, 0xbc, 0x9f, 0xe4, 0x68, 0x3b, 0xed, 0x0b, 0x1e,
+ 0x3b, 0xc8, 0x06, 0x8c, 0x3c, 0x67, 0x47, 0x17, 0xbd, 0x63, 0xb4, 0xd1, 0xbc,
+ 0xf3, 0x34, 0x55, 0xbc, 0xde, 0x7b, 0x31, 0xbd, 0x17, 0x4e, 0x74, 0xba, 0x8b,
+ 0x65, 0x43, 0xbc, 0x01, 0xcc, 0xa0, 0x3d, 0xc7, 0x20, 0xa2, 0xbd, 0x63, 0x70,
+ 0x67, 0x3c, 0x65, 0xa0, 0x8d, 0x3d, 0xdf, 0xc9, 0x3d, 0xbc, 0x2f, 0xfa, 0x44,
+ 0x3b, 0xd2, 0xcf, 0x42, 0x3d, 0x9a, 0x40, 0x06, 0x3d, 0x67, 0x53, 0x4b, 0xbc,
+ 0x43, 0x50, 0x4a, 0x3c, 0x23, 0xb9, 0xa1, 0xbc, 0xad, 0x34, 0xe3, 0xbc, 0xac,
+ 0xc4, 0x4f, 0xbd, 0x4b, 0x40, 0xe5, 0xbb, 0xc3, 0xf1, 0x50, 0xbd, 0x98, 0x34,
+ 0x28, 0xbd, 0x28, 0xf8, 0xae, 0x3d, 0xd1, 0x27, 0x8f, 0x3c, 0xb4, 0x8c, 0x8b,
+ 0x3d, 0x73, 0xf2, 0x07, 0xbb, 0x65, 0x39, 0x61, 0xbd, 0x9a, 0x90, 0xcb, 0xbb,
+ 0x18, 0x2f, 0x8e, 0xbd, 0x65, 0xab, 0x4b, 0x3d, 0xd1, 0x40, 0x64, 0xbd, 0x10,
+ 0xdb, 0x83, 0xbd, 0x3b, 0x12, 0xa5, 0x3d, 0x31, 0x45, 0x78, 0x3d, 0xa4, 0xb1,
+ 0x26, 0x3d, 0xac, 0x10, 0x42, 0xbc, 0xbe, 0x62, 0xb3, 0xbd, 0x4e, 0x3d, 0x76,
+ 0x3c, 0x66, 0x0e, 0xde, 0xbc, 0x4f, 0x82, 0xd0, 0xbd, 0xf1, 0x86, 0x8e, 0xbd,
+ 0xf1, 0xe8, 0x37, 0x3c, 0xb7, 0xbb, 0x0e, 0x3d, 0x1c, 0xc4, 0x05, 0x3d, 0x15,
+ 0x50, 0x86, 0x3d, 0x81, 0x10, 0x92, 0x3b, 0x0a, 0xff, 0xed, 0x3c, 0x91, 0x9b,
+ 0xb3, 0xbb, 0xb5, 0xba, 0x26, 0xbc, 0x89, 0xef, 0x0f, 0x3d, 0x52, 0xde, 0x47,
+ 0x3d, 0x9d, 0x0f, 0x0c, 0x3d, 0x80, 0xee, 0xcb, 0xbd, 0xe2, 0xc7, 0x82, 0xbd,
+ 0x1a, 0xf6, 0x64, 0x3c, 0xaf, 0xa7, 0xbf, 0xbc, 0xfc, 0x41, 0x37, 0x3c, 0xf9,
+ 0x88, 0xfe, 0xbc, 0xdf, 0x47, 0x8d, 0xbc, 0x55, 0x09, 0x0b, 0xbd, 0x32, 0x50,
+ 0x00, 0xbd, 0x83, 0x62, 0xaf, 0xbc, 0xdc, 0xac, 0x5e, 0xbd, 0xb6, 0x22, 0x54,
+ 0xbd, 0x74, 0xd7, 0x00, 0x3c, 0xe3, 0x5a, 0xcb, 0xbc, 0xaa, 0x37, 0x25, 0xbd,
+ 0x64, 0x98, 0x5f, 0x3d, 0x81, 0xdf, 0x8b, 0x3c, 0x23, 0xef, 0x66, 0x3b, 0x84,
+ 0x67, 0x55, 0xbb, 0xd2, 0x11, 0x98, 0xbd, 0x2b, 0x15, 0x82, 0x3d, 0xeb, 0x1e,
+ 0xc6, 0x3c, 0x56, 0x83, 0xcb, 0xba, 0xd0, 0xc7, 0x2d, 0x3d, 0xd1, 0xcd, 0x0c,
+ 0x3d, 0xe4, 0x5c, 0x5a, 0xbc, 0x4a, 0xf3, 0x73, 0xbd, 0x43, 0xdc, 0xfe, 0x3c,
+ 0x00, 0xd6, 0x2f, 0x3d, 0x06, 0x22, 0x49, 0xbb, 0x4e, 0x45, 0x71, 0xbc, 0xb3,
+ 0x3c, 0x00, 0x3d, 0x1a, 0xae, 0x58, 0xbd, 0x15, 0x61, 0x92, 0x3d, 0x14, 0xb9,
+ 0xf8, 0xbc, 0x15, 0x2c, 0x1b, 0x3d, 0x31, 0x97, 0x3b, 0xbc, 0xe2, 0xe7, 0x18,
+ 0x3d, 0xcf, 0xf0, 0x1f, 0xbd, 0x7c, 0x1e, 0x0f, 0x3d, 0xb1, 0x27, 0x7f, 0xbd,
+ 0xb8, 0xdd, 0xb2, 0xbd, 0xcc, 0xc2, 0x44, 0x3d, 0x44, 0x5c, 0x06, 0xbd, 0x4f,
+ 0x6a, 0x4a, 0xbd, 0x43, 0x2c, 0x87, 0x3d, 0xb7, 0xe9, 0x48, 0xbd, 0x60, 0x01,
+ 0x07, 0xbd, 0x0b, 0xe4, 0x78, 0x3a, 0x92, 0x5d, 0x64, 0xbd, 0x7c, 0xcf, 0x81,
+ 0xbc, 0xe2, 0x59, 0xab, 0x3c, 0xf0, 0xbc, 0x68, 0xbc, 0xc3, 0x2d, 0x3d, 0x3d,
+ 0x27, 0xb2, 0xce, 0x3d, 0x44, 0x61, 0x0e, 0x3c, 0x94, 0x6d, 0x02, 0xbd, 0xe5,
+ 0x6f, 0xc2, 0x3c, 0x70, 0xab, 0x8a, 0x3a, 0x14, 0xab, 0x04, 0x3c, 0x9d, 0xd4,
+ 0xab, 0x3d, 0x0a, 0x7d, 0x64, 0x3c, 0x17, 0xb5, 0xce, 0x3b, 0x66, 0xbd, 0x24,
+ 0x3d, 0xed, 0xce, 0x77, 0xbd, 0xed, 0x6e, 0x7f, 0xbd, 0x70, 0xe8, 0x10, 0xbc,
+ 0x6a, 0x80, 0x37, 0x3d, 0x2d, 0x0b, 0x83, 0x3d, 0x8e, 0x4b, 0x5e, 0xbd, 0xd6,
+ 0x38, 0x34, 0xbd, 0xce, 0xaf, 0x88, 0x3d, 0xef, 0x64, 0x10, 0xbc, 0xa0, 0x8b,
+ 0xac, 0xbd, 0x70, 0xa5, 0x50, 0x3c, 0x87, 0x3d, 0x83, 0x3d, 0x70, 0x63, 0x57,
+ 0xbd, 0xf3, 0x6a, 0x44, 0x3d, 0x3a, 0x49, 0xda, 0xbd, 0x1b, 0x74, 0xde, 0xbd,
+ 0x0d, 0xb2, 0x34, 0x3d, 0x04, 0x0f, 0x87, 0x3d, 0x04, 0xb1, 0x25, 0xbd, 0x5f,
+ 0x2c, 0x01, 0xbc, 0x9a, 0x55, 0x6b, 0x3b, 0xad, 0xdf, 0x5e, 0x3d, 0x7f, 0x85,
+ 0x2a, 0x3c, 0xfa, 0x88, 0xfa, 0xbc, 0x0d, 0x79, 0x8b, 0xbd, 0x01, 0x45, 0x73,
+ 0x3d, 0x11, 0xde, 0xb6, 0x3c, 0xcc, 0xb5, 0xa4, 0x3c, 0xe8, 0xc5, 0x67, 0xbc,
+ 0x66, 0x99, 0x92, 0x3d, 0x36, 0xb0, 0x79, 0xbd, 0x14, 0x41, 0xa7, 0x3d, 0xfe,
+ 0x98, 0xcf, 0x3c, 0x32, 0xf7, 0x0a, 0x3d, 0xa6, 0x4a, 0x45, 0x3d, 0x83, 0xa0,
+ 0x9e, 0x3d, 0x86, 0x2e, 0x71, 0x3d, 0x92, 0x9c, 0x4d, 0x3d, 0xed, 0x24, 0xeb,
+ 0xbc, 0x3e, 0xfe, 0xc0, 0xbc, 0xcd, 0x6e, 0x4f, 0x3c, 0x83, 0x86, 0xa5, 0xbd,
+ 0xa4, 0xd7, 0xa5, 0xbc, 0xe0, 0x9a, 0x38, 0x3d, 0xe2, 0x79, 0xcd, 0x3c, 0x4a,
+ 0xe2, 0xa1, 0x3c, 0x94, 0x66, 0xd1, 0xbc, 0xe6, 0xed, 0x9b, 0x3c, 0x68, 0xb1,
+ 0x41, 0x3b, 0x1b, 0x65, 0x0b, 0x3d, 0xdd, 0x50, 0xae, 0xbd, 0x29, 0xf9, 0xfc,
+ 0xbc, 0x33, 0xe6, 0x37, 0xbd, 0xb6, 0x53, 0xbb, 0x3c, 0x0c, 0x5e, 0xf6, 0x3d,
+ 0x75, 0xbb, 0xf6, 0xbc, 0xf8, 0xc6, 0x9a, 0x3d, 0x8f, 0xe5, 0xc4, 0x3c, 0x88,
+ 0xee, 0x33, 0xbc, 0x73, 0xb2, 0x87, 0x3c, 0xd4, 0xd8, 0x58, 0x3c, 0x15, 0x37,
+ 0x82, 0x3d, 0xc1, 0x4f, 0x38, 0xbc, 0xba, 0x8e, 0xf9, 0xbb, 0x7c, 0x56, 0xe0,
+ 0xbd, 0xca, 0x23, 0x94, 0xbc, 0x24, 0x41, 0xae, 0x3d, 0x89, 0x4e, 0x9a, 0x3c,
+ 0xcb, 0x28, 0xe3, 0x3c, 0xf1, 0xfa, 0x05, 0x3d, 0xe3, 0xa4, 0x80, 0xbd, 0x6f,
+ 0xda, 0x16, 0x3d, 0xc7, 0xee, 0x77, 0xbd, 0xa8, 0xe3, 0xb1, 0xbc, 0x6f, 0x70,
+ 0x90, 0xbc, 0x78, 0x35, 0x48, 0x3d, 0xac, 0xdb, 0x23, 0xbd, 0x4e, 0xbd, 0xe4,
+ 0xbb, 0x79, 0x88, 0xd0, 0xbb, 0xf2, 0xa9, 0xb6, 0xbd, 0x54, 0x46, 0x5d, 0xbd,
+ 0xc6, 0xb2, 0x95, 0x3d, 0xe6, 0x67, 0x52, 0x3d, 0xa6, 0x5d, 0x7f, 0xbd, 0x0b,
+ 0xe5, 0xad, 0x3b, 0x91, 0xf6, 0x0c, 0x3c, 0x33, 0x45, 0xab, 0xbc, 0xa7, 0x84,
+ 0xb3, 0xbc, 0xf5, 0xb0, 0x6c, 0x3c, 0x08, 0xc9, 0xb4, 0x3c, 0x61, 0x9d, 0x8b,
+ 0x3c, 0x0d, 0x19, 0x87, 0x3d, 0xaa, 0xbc, 0xd3, 0xbc, 0x85, 0x92, 0x8e, 0x3b,
+ 0xfc, 0x26, 0x49, 0xbd, 0x56, 0x7e, 0x7f, 0x3d, 0xf3, 0x85, 0x61, 0xbd, 0x8c,
+ 0x5b, 0xf0, 0x3c, 0x14, 0x09, 0x65, 0xbd, 0x66, 0x78, 0x38, 0xbb, 0x2c, 0x69,
+ 0x4d, 0xbd, 0x33, 0x31, 0x46, 0x3d, 0x6d, 0xb8, 0xa6, 0xbc, 0x69, 0x4e, 0xc3,
+ 0x3d, 0xc9, 0x54, 0x93, 0xbd, 0x1a, 0x80, 0x83, 0x3d, 0x06, 0x1b, 0xa8, 0x3c,
+ 0xf0, 0x64, 0x65, 0x3c, 0xae, 0xd7, 0xb2, 0x3d, 0x03, 0xc0, 0xf0, 0x3c, 0x9d,
+ 0xbf, 0x84, 0xbd, 0xa6, 0x60, 0xfd, 0xbd, 0x58, 0x27, 0x41, 0x3d, 0x3f, 0x70,
+ 0x9f, 0x3c, 0x13, 0x59, 0x37, 0xbd, 0x6b, 0x61, 0x4e, 0xbd, 0xb5, 0xf3, 0x26,
+ 0x39, 0x10, 0x99, 0xc5, 0x3c, 0x7c, 0xda, 0x28, 0x3d, 0x23, 0x7b, 0x78, 0x3b,
+ 0xa5, 0x5f, 0x1c, 0xbd, 0x8e, 0x82, 0xd0, 0x3c, 0x42, 0x5a, 0x29, 0x3d, 0x5c,
+ 0x7a, 0x1d, 0xb8, 0xf8, 0x4e, 0x3c, 0xbc, 0x24, 0xee, 0x52, 0x3b, 0x56, 0xfa,
+ 0x0b, 0x3d, 0xe2, 0xa4, 0xc4, 0x3b, 0xd1, 0x51, 0xe1, 0xbd, 0x22, 0xbb, 0x7f,
+ 0xbd, 0xd3, 0x54, 0x6d, 0x3d, 0x75, 0x61, 0xaa, 0x3d, 0x4a, 0xd4, 0x33, 0x3d,
+ 0x2d, 0x5f, 0x91, 0x3c, 0x38, 0xc6, 0xe3, 0xb9, 0x91, 0x94, 0x38, 0x3d, 0x87,
+ 0x92, 0xd5, 0x3c, 0xb3, 0x59, 0x34, 0xbd, 0x74, 0x48, 0x64, 0xbd, 0x90, 0xb1,
+ 0xba, 0x3c, 0xd1, 0x21, 0x97, 0x3c, 0xb9, 0x24, 0xa7, 0x3c, 0xa0, 0xe7, 0xe8,
+ 0xbd, 0xf1, 0xc5, 0x45, 0x3c, 0x93, 0x0e, 0x2e, 0x3d, 0x31, 0x84, 0xd5, 0xbc,
+ 0xd7, 0x86, 0xbf, 0x3c, 0x5b, 0xae, 0xb8, 0x3c, 0xc3, 0x7e, 0xf3, 0xbc, 0xb1,
+ 0xd7, 0x0c, 0x3d, 0x2a, 0x33, 0xcc, 0x3d, 0x86, 0x09, 0x6b, 0x3d, 0xb6, 0xa4,
+ 0x97, 0x3d, 0x15, 0x03, 0x89, 0x3d, 0x5c, 0x5c, 0x85, 0x3d, 0x47, 0x39, 0x65,
+ 0x3d, 0xd2, 0x8b, 0x06, 0xbd, 0x6c, 0xed, 0x55, 0x3b, 0x30, 0xd5, 0x99, 0xbc,
+ 0x7d, 0x00, 0xb5, 0xbb, 0x54, 0xe8, 0x12, 0xbd, 0x8c, 0x6f, 0x3e, 0x3c, 0x07,
+ 0x15, 0x9a, 0x3d, 0xf2, 0x93, 0xa1, 0x3d, 0x0a, 0xf7, 0x7c, 0x3d, 0x89, 0xe9,
+ 0xc0, 0x3c, 0xc4, 0x63, 0x6d, 0x3d, 0x02, 0x6a, 0xa9, 0x3d, 0x85, 0x9b, 0x4b,
+ 0x3d, 0x20, 0x90, 0x99, 0x3c, 0xcd, 0xb5, 0x1f, 0x3d, 0x7f, 0x5e, 0x72, 0xbd,
+ 0x19, 0x42, 0x08, 0xbc, 0x4c, 0xd0, 0x60, 0xbd, 0x28, 0x45, 0x5d, 0xbd, 0x9f,
+ 0x9e, 0x95, 0xbd, 0xf8, 0x82, 0x82, 0xbd, 0x14, 0xd6, 0x3c, 0x3d, 0x55, 0x69,
+ 0x6e, 0x3d, 0x6e, 0xd1, 0x37, 0xbc, 0x6a, 0x72, 0x34, 0xbd, 0x67, 0x77, 0xa4,
+ 0xbc, 0xd0, 0xb2, 0xaa, 0x3d, 0xfa, 0xbb, 0x32, 0x3d, 0x5b, 0xfd, 0x1e, 0x3d,
+ 0x6b, 0x18, 0x8a, 0x3b, 0xd1, 0xe0, 0x3b, 0x3c, 0x0e, 0xaa, 0xb8, 0xbc, 0xd8,
+ 0x60, 0x73, 0x3d, 0x18, 0xea, 0xac, 0x3d, 0x0a, 0x98, 0x8c, 0xbd, 0xa8, 0xae,
+ 0x90, 0x3d, 0xa4, 0x92, 0x81, 0x3b, 0xfa, 0x7d, 0x67, 0x3d, 0xd1, 0x86, 0xad,
+ 0x3d, 0xa0, 0x03, 0x2e, 0xbc, 0xa7, 0x6d, 0xf7, 0x3c, 0x93, 0xfe, 0x81, 0x3d,
+ 0x55, 0x43, 0xdd, 0x3b, 0x9e, 0xc7, 0x19, 0x3d, 0xc1, 0x4e, 0x1e, 0x3d, 0x4a,
+ 0xb6, 0x3c, 0xbd, 0xae, 0x17, 0x16, 0xbd, 0xa1, 0xf5, 0x4d, 0xbd, 0x89, 0x2c,
+ 0x04, 0xbd, 0xd3, 0xeb, 0x93, 0x3d, 0x35, 0xae, 0x19, 0x3c, 0xf8, 0x48, 0xa5,
+ 0x3c, 0x94, 0x41, 0xf4, 0xbc, 0x67, 0x32, 0x41, 0xbd, 0x19, 0x2d, 0x38, 0x3d,
+ 0x57, 0x90, 0x6f, 0xbc, 0xea, 0xb3, 0x89, 0xbc, 0x73, 0x19, 0x5b, 0x3d, 0x9d,
+ 0x72, 0xae, 0x3d, 0xb9, 0x8b, 0x23, 0xbd, 0xa4, 0x13, 0x43, 0xbc, 0xd0, 0x4d,
+ 0x12, 0x3d, 0xd7, 0xa3, 0x38, 0xbd, 0xc9, 0xb4, 0xd5, 0x3d, 0x4b, 0x93, 0x24,
+ 0x3c, 0xd2, 0xfa, 0xe8, 0xbc, 0xdb, 0xa3, 0x0b, 0xbd, 0xc2, 0xdd, 0x5e, 0x3d,
+ 0x4c, 0x2c, 0xa5, 0xbd, 0xd2, 0x24, 0x77, 0xbd, 0x50, 0xd3, 0xa1, 0x3d, 0xca,
+ 0xe7, 0x00, 0x3a, 0xbf, 0x15, 0xed, 0xbc, 0x83, 0xc3, 0x60, 0x3d, 0xba, 0x44,
+ 0x82, 0x3d, 0xa4, 0x8d, 0x93, 0x3d, 0x7a, 0xdf, 0x92, 0xbd, 0x2e, 0x60, 0xcd,
+ 0x3b, 0x8a, 0xc9, 0x67, 0x3d, 0xbc, 0x59, 0x2e, 0xbd, 0xd6, 0x96, 0xb0, 0x3d,
+ 0x89, 0x2f, 0xd1, 0xbc, 0x18, 0xd2, 0x0c, 0xbc, 0xc4, 0xf8, 0x84, 0x3d, 0x50,
+ 0xc8, 0x52, 0xbd, 0xa8, 0xc1, 0x58, 0xbd, 0xa3, 0xe1, 0x26, 0x3d, 0x61, 0x05,
+ 0x00, 0x3d, 0x5d, 0xe9, 0x84, 0x3d, 0xc2, 0x44, 0x37, 0x3d, 0xfb, 0xf3, 0xb0,
+ 0xbc, 0x69, 0x4b, 0x6c, 0xbd, 0xa9, 0x6b, 0xa4, 0xbc, 0x77, 0x53, 0x84, 0x3c,
+ 0x12, 0x21, 0x0c, 0xbd, 0x0d, 0x59, 0x08, 0xbc, 0x44, 0xb6, 0x11, 0xbd, 0xaa,
+ 0xef, 0x8e, 0x3d, 0x4e, 0x39, 0x32, 0x3d, 0x40, 0x7f, 0x7a, 0xbd, 0xa8, 0x2d,
+ 0xbf, 0xbc, 0x3a, 0xff, 0x30, 0x3d, 0xff, 0x61, 0xbb, 0x3b, 0xc3, 0xdf, 0x96,
+ 0xbc, 0x22, 0x74, 0x53, 0xbd, 0x69, 0x07, 0x8a, 0xbd, 0x46, 0x58, 0xe0, 0x3c,
+ 0x91, 0x62, 0x31, 0xbd, 0x38, 0x57, 0x01, 0xbc, 0x09, 0x74, 0x93, 0xbc, 0x3e,
+ 0xb2, 0x8a, 0x3c, 0xd8, 0x12, 0x1d, 0xbd, 0xd7, 0xf6, 0xc2, 0xbc, 0x86, 0x55,
+ 0x11, 0x3c, 0x28, 0x0d, 0x70, 0x3d, 0x98, 0xa3, 0x8a, 0x3d, 0x7b, 0xf0, 0x93,
+ 0xbd, 0xc2, 0x7c, 0x0b, 0xbd, 0xfa, 0x05, 0xcc, 0x3c, 0x5f, 0x77, 0x19, 0x3d,
+ 0xe0, 0x09, 0xb3, 0x3c, 0x13, 0x77, 0x8a, 0xbc, 0x1f, 0x76, 0x36, 0x3c, 0xfb,
+ 0x4f, 0x97, 0x3d, 0x1f, 0xec, 0x31, 0x3d, 0xf9, 0x14, 0x79, 0x3d, 0x50, 0xab,
+ 0x92, 0xbd, 0xda, 0x3c, 0xf3, 0xba, 0x2f, 0x4d, 0x72, 0xbc, 0x0f, 0x3a, 0xc6,
+ 0x3c, 0x7e, 0xf5, 0x40, 0xbd, 0x0f, 0xf2, 0x87, 0xbd, 0xc9, 0x6e, 0xef, 0xbc,
+ 0x06, 0xec, 0xce, 0xbc, 0x3d, 0x26, 0x2b, 0xbd, 0x4a, 0x6a, 0x53, 0x3d, 0x1b,
+ 0x90, 0x1a, 0xbb, 0x39, 0xb6, 0x23, 0x3d, 0xa2, 0xbd, 0x88, 0xbd, 0xd7, 0x0d,
+ 0x2a, 0xbc, 0xf5, 0xf6, 0x94, 0xbd, 0xf0, 0xd7, 0x52, 0xbc, 0x85, 0x99, 0x83,
+ 0xbd, 0xdd, 0xc4, 0x8c, 0xbd, 0xaa, 0x19, 0x4a, 0x3d, 0x26, 0x21, 0xec, 0x3c,
+ 0x0f, 0xe7, 0x1b, 0xbc, 0x39, 0x8e, 0xea, 0xbc, 0x03, 0xdc, 0x2f, 0xbd, 0x03,
+ 0x8c, 0x8c, 0x3d, 0xe4, 0xcb, 0x7f, 0xbc, 0xc6, 0xb9, 0xfd, 0x3b, 0x78, 0x5b,
+ 0x44, 0xbd, 0xd0, 0x3d, 0x89, 0xbc, 0xe0, 0xdb, 0xc2, 0xbc, 0x84, 0x8d, 0x39,
+ 0xbd, 0x9a, 0x7b, 0x9a, 0x3b, 0x5d, 0xb4, 0x88, 0xbc, 0xf3, 0xf0, 0x8e, 0xbd,
+ 0x27, 0x0c, 0x41, 0x3d, 0xe7, 0x60, 0xa0, 0x3c, 0x86, 0xb6, 0xa9, 0xbc, 0x15,
+ 0x55, 0x4f, 0xbd, 0xf4, 0x53, 0xfb, 0xbc, 0xdf, 0x4d, 0x0d, 0x3d, 0x06, 0x46,
+ 0x7d, 0xbd, 0x37, 0x4d, 0xb0, 0xbc, 0x7d, 0x65, 0x1e, 0xbd, 0x30, 0x1a, 0x00,
+ 0xbb, 0x16, 0x56, 0x28, 0xbd, 0xb4, 0xef, 0xdd, 0xbc, 0xcc, 0xbc, 0x40, 0xbd,
+ 0x95, 0xce, 0x84, 0xbd, 0x97, 0x26, 0x98, 0xbd, 0x86, 0x1f, 0x80, 0xbd, 0x64,
+ 0x16, 0x97, 0x3c, 0x9b, 0xd0, 0x22, 0x3c, 0x05, 0x08, 0x52, 0xbb, 0xd2, 0x11,
+ 0x8e, 0xbd, 0x3c, 0xa3, 0x8c, 0x3d, 0x4c, 0xdb, 0xa0, 0xbd, 0x24, 0xe2, 0x0a,
+ 0xbd, 0x24, 0x87, 0x69, 0x3c, 0x7c, 0x72, 0xb2, 0x3c, 0xda, 0xcd, 0x0c, 0x3d,
+ 0xd1, 0x51, 0x4c, 0x3d, 0xb6, 0xaf, 0x30, 0xbd, 0x07, 0xa0, 0x64, 0x3d, 0x09,
+ 0x30, 0x59, 0x3d, 0x68, 0xb3, 0x06, 0xbd, 0x01, 0x85, 0xe4, 0xbc, 0x10, 0x9f,
+ 0x2a, 0xbd, 0xe0, 0x85, 0x93, 0x3d, 0x71, 0xe0, 0x13, 0xbd, 0x28, 0x8b, 0x8e,
+ 0x3c, 0x53, 0x74, 0x71, 0xbc, 0x6a, 0x6d, 0xad, 0x3d, 0x88, 0xf7, 0x32, 0x3c,
+ 0xfb, 0xde, 0x41, 0x3c, 0x90, 0x33, 0x4c, 0xba, 0x89, 0xe4, 0x1d, 0x3c, 0x47,
+ 0x26, 0xb5, 0xbc, 0x5c, 0x9c, 0x9d, 0xbd, 0xd4, 0xe8, 0xdb, 0x3b, 0x7f, 0x88,
+ 0x99, 0x3d, 0x79, 0xd9, 0xb8, 0xbc, 0x76, 0x00, 0xb9, 0x3d, 0x74, 0x04, 0xb9,
+ 0xbc, 0xde, 0x84, 0x38, 0x3d, 0x5c, 0x38, 0x91, 0x3d, 0x80, 0x37, 0x04, 0xbd,
+ 0xfa, 0x1a, 0x34, 0x3d, 0x36, 0x16, 0x11, 0x3d, 0xf3, 0x66, 0x86, 0x3d, 0x84,
+ 0x83, 0x16, 0xbd, 0xec, 0x1a, 0x43, 0xbd, 0x06, 0xf8, 0x64, 0x3d, 0x96, 0x19,
+ 0x31, 0x3b, 0x75, 0x30, 0x9e, 0x3d, 0xf5, 0xfa, 0xd1, 0xbb, 0x96, 0xf3, 0xc8,
+ 0xbc, 0x84, 0x0f, 0x6d, 0xbd, 0xd1, 0x3e, 0x77, 0x3c, 0xbb, 0xb8, 0xf1, 0xbc,
+ 0x49, 0xf5, 0x70, 0x3d, 0x33, 0x33, 0x44, 0xbd, 0xc9, 0xca, 0xf5, 0x3c, 0x5d,
+ 0xe3, 0x2c, 0xbc, 0x06, 0x48, 0xb8, 0x3d, 0xfe, 0xac, 0x12, 0x3d, 0x1d, 0xd6,
+ 0x86, 0x3d, 0x54, 0xa5, 0x39, 0x3d, 0x4d, 0x88, 0xeb, 0x3c, 0x14, 0xe2, 0x3e,
+ 0x3c, 0xb5, 0xe9, 0xd3, 0xbc, 0x97, 0xe0, 0x7e, 0x3c, 0x9b, 0xa2, 0x5a, 0xbc,
+ 0x14, 0xab, 0x89, 0x3d, 0x4a, 0xdc, 0x93, 0x3d, 0xe8, 0xee, 0xb5, 0xbc, 0x5f,
+ 0x9a, 0x9b, 0x3b, 0x26, 0x69, 0x55, 0x3c, 0x7d, 0x50, 0x89, 0xbc, 0xe0, 0x93,
+ 0x8c, 0x3b, 0x44, 0xbc, 0x23, 0xbd, 0x47, 0x76, 0x85, 0x3d, 0xfd, 0x6a, 0x25,
+ 0x39, 0x3e, 0x57, 0x9c, 0x3d, 0x70, 0xdd, 0xd0, 0x3b, 0x40, 0xdf, 0x3b, 0x3d,
+ 0x47, 0x5c, 0xbd, 0xbc, 0x90, 0x3d, 0x33, 0xbd, 0xd8, 0xc6, 0x76, 0xbd, 0xf2,
+ 0xd8, 0x51, 0x3d, 0x17, 0x60, 0x9c, 0xbd, 0x32, 0x78, 0x1b, 0xbd, 0xb4, 0xef,
+ 0x70, 0x3d, 0xfa, 0x9d, 0xb6, 0x3b, 0x88, 0x5c, 0xe0, 0x3a, 0x47, 0x1b, 0xf8,
+ 0xbc, 0x3b, 0x66, 0xcb, 0xba, 0x30, 0xe1, 0x04, 0xbd, 0x58, 0xbe, 0x87, 0xbd,
+ 0xc2, 0xa5, 0x10, 0xbc, 0x48, 0x34, 0xa3, 0x3d, 0x44, 0xa4, 0x77, 0x3d, 0x7d,
+ 0xe5, 0x94, 0xba, 0x23, 0xd9, 0xa3, 0xbc, 0xf6, 0xf6, 0xc6, 0xbc, 0xea, 0xd8,
+ 0x31, 0xbd, 0x9f, 0x50, 0x24, 0x3d, 0xc8, 0x2a, 0x37, 0x3d, 0xaf, 0xe4, 0x82,
+ 0x3d, 0x28, 0x20, 0x70, 0x3d, 0xa3, 0x27, 0x52, 0x3d, 0xbd, 0x34, 0x8a, 0x3c,
+ 0x8c, 0x2c, 0xde, 0x3c, 0x35, 0xf4, 0x70, 0xbd, 0x35, 0x89, 0x19, 0x3d, 0x54,
+ 0x59, 0x46, 0xb9, 0xa6, 0xfb, 0xc0, 0xbc, 0x56, 0x95, 0x8d, 0x3d, 0xd1, 0x4f,
+ 0x71, 0x3d, 0xe1, 0xe3, 0x9f, 0x3d, 0x05, 0xe2, 0x82, 0xbd, 0xb7, 0xcf, 0x06,
+ 0x3d, 0x02, 0x28, 0xa3, 0xbc, 0xd0, 0xcf, 0x48, 0x3d, 0x8e, 0x69, 0x3b, 0xbc,
+ 0x1e, 0x83, 0x14, 0xbb, 0x72, 0x67, 0x82, 0x3b, 0x64, 0x7d, 0xeb, 0xbc, 0x2a,
+ 0x76, 0xe5, 0xba, 0x6a, 0xd8, 0x3c, 0xbd, 0x10, 0xc0, 0x4c, 0x3d, 0x64, 0x44,
+ 0x64, 0x3d, 0xbe, 0xb4, 0x31, 0xbd, 0x0c, 0x43, 0x09, 0xbd, 0xa4, 0x6d, 0x8d,
+ 0xbd, 0xd0, 0xbf, 0x4a, 0x3d, 0x09, 0x76, 0x90, 0xbd, 0x29, 0x9c, 0x0b, 0x3d,
+ 0x7c, 0x61, 0x74, 0xbd, 0xb9, 0x1c, 0x1c, 0xbd, 0x09, 0x6d, 0xad, 0x3b, 0x3e,
+ 0xb4, 0x93, 0xbc, 0x1f, 0x5a, 0xa4, 0x3c, 0xe2, 0x7a, 0x89, 0xbd, 0x1c, 0x1d,
+ 0x49, 0x3c, 0x0c, 0xc3, 0x06, 0xbd, 0xf9, 0xe2, 0xd6, 0x3c, 0x1a, 0x44, 0x57,
+ 0xbd, 0x7a, 0xac, 0x50, 0x3d, 0x39, 0xe4, 0xc4, 0x3c, 0xfb, 0x1e, 0x04, 0x3d,
+ 0x8a, 0xf6, 0x53, 0xbd, 0xfc, 0xac, 0x62, 0xbc, 0x44, 0xcc, 0x20, 0x3d, 0xf6,
+ 0x5e, 0xa0, 0x3c, 0x88, 0x20, 0xcd, 0xba, 0x6b, 0xc7, 0x1c, 0xbd, 0x66, 0xd2,
+ 0x16, 0xbb, 0x8b, 0x02, 0x58, 0xbd, 0x17, 0x15, 0x83, 0x3d, 0xef, 0x6a, 0x84,
+ 0x3d, 0x00, 0x91, 0xd1, 0xba, 0x9a, 0xa6, 0x83, 0x3d, 0x6e, 0x12, 0x9c, 0xbd,
+ 0x4c, 0x00, 0x46, 0x3d, 0x08, 0x8e, 0xcf, 0x3b, 0x53, 0x98, 0xb9, 0xbc, 0x5c,
+ 0x33, 0x43, 0x3d, 0x05, 0x7b, 0x03, 0xbd, 0x82, 0x26, 0x35, 0xbd, 0xbf, 0x76,
+ 0x75, 0xbd, 0x08, 0x78, 0x49, 0xbd, 0xe1, 0x7e, 0x53, 0xbc, 0xf0, 0x64, 0xf2,
+ 0x3c, 0x56, 0xaf, 0x1a, 0x3d, 0x1c, 0x8f, 0x08, 0x3d, 0x11, 0xac, 0x91, 0xbd,
+ 0xe8, 0x21, 0x06, 0x3d, 0xf5, 0xbb, 0xdb, 0xbc, 0x0c, 0xc9, 0x81, 0xbd, 0x74,
+ 0x76, 0x83, 0xbd, 0x5e, 0xf3, 0x40, 0xbd, 0xd6, 0xbb, 0x98, 0x3d, 0x4b, 0x9a,
+ 0x93, 0x3c, 0x25, 0x64, 0x9d, 0xbd, 0xf4, 0xf4, 0x9e, 0xbc, 0x66, 0xbe, 0x2b,
+ 0xbb, 0xad, 0xa4, 0x82, 0x3c, 0x76, 0x08, 0x5d, 0xbd, 0x2c, 0xf4, 0x2f, 0xbd,
+ 0xb3, 0x5e, 0x84, 0x3d, 0x62, 0xad, 0x06, 0x3d, 0x6a, 0xe5, 0xea, 0xbc, 0xd8,
+ 0x06, 0x23, 0x3d, 0x85, 0x25, 0xeb, 0xbc, 0xa9, 0x01, 0xab, 0xbb, 0x28, 0xe4,
+ 0xf3, 0x3c, 0x9f, 0x9e, 0x8e, 0xbd, 0x3f, 0xe2, 0x2c, 0xbc, 0xe0, 0xfd, 0xc1,
+ 0x3c, 0x84, 0x67, 0xa7, 0xbb, 0xc5, 0x1d, 0xfc, 0xbc, 0xee, 0x05, 0x6b, 0xbd,
+ 0x9a, 0x29, 0xc9, 0xbc, 0x35, 0x9c, 0x0f, 0x3d, 0xff, 0xd3, 0x1c, 0xbd, 0x60,
+ 0x5c, 0x3d, 0xbd, 0x85, 0xf0, 0x81, 0x3d, 0xe6, 0x58, 0x0f, 0xbc, 0xda, 0x46,
+ 0x01, 0xbd, 0xe4, 0xae, 0x88, 0xbd, 0xe2, 0x4a, 0x47, 0xbd, 0x51, 0xf0, 0x7e,
+ 0xbd, 0x18, 0xc7, 0x82, 0x3d, 0x85, 0xf7, 0x26, 0x3d, 0x7f, 0xe0, 0xc0, 0xbc,
+ 0x28, 0xa7, 0x56, 0x3b, 0x86, 0xe9, 0x17, 0xbb, 0x75, 0xc7, 0x81, 0x3d, 0x0c,
+ 0x95, 0x19, 0xbc, 0x27, 0x0d, 0x62, 0xbd, 0xae, 0x2f, 0x14, 0x3b, 0xcf, 0x26,
+ 0x47, 0xbd, 0x75, 0xe8, 0x26, 0x3d, 0x99, 0x94, 0x48, 0x3d, 0xac, 0xe6, 0x3f,
+ 0x3d, 0x50, 0xa8, 0xee, 0x3c, 0x25, 0x3e, 0xef, 0xbc, 0x98, 0xfe, 0x37, 0xbc,
+ 0x05, 0x4b, 0x28, 0x3d, 0xa5, 0x42, 0xfc, 0x3c, 0x40, 0xda, 0x68, 0x3d, 0xf7,
+ 0x91, 0x35, 0x3d, 0xae, 0xa1, 0x1a, 0x3d, 0xeb, 0xc7, 0x1b, 0xbd, 0x98, 0x7d,
+ 0xb1, 0x3c, 0xf7, 0xe7, 0x0b, 0xbd, 0x72, 0x31, 0x47, 0x3d, 0x47, 0xeb, 0x85,
+ 0xbd, 0x4f, 0x71, 0x1f, 0xbc, 0xae, 0x19, 0x1b, 0xbd, 0x30, 0xc5, 0xd7, 0xbb,
+ 0x94, 0xbe, 0x05, 0x3d, 0x39, 0x66, 0x94, 0x3c, 0x68, 0xab, 0x65, 0xbc, 0x4a,
+ 0x43, 0xd3, 0xbc, 0x66, 0x6e, 0x22, 0x3d, 0x2c, 0xb6, 0x45, 0x3d, 0xec, 0xf0,
+ 0x09, 0xbd, 0x15, 0x84, 0xd6, 0x3c, 0x67, 0xb6, 0x5e, 0xbd, 0x48, 0xb9, 0x1b,
+ 0x3d, 0xef, 0x6b, 0x36, 0x3d, 0xfa, 0x9f, 0x60, 0x3c, 0xfb, 0x49, 0x8c, 0x3d,
+ 0x50, 0x0b, 0xfd, 0x3c, 0x43, 0x24, 0xf5, 0x3c, 0x48, 0xf5, 0x1c, 0x3d, 0x24,
+ 0xed, 0x55, 0xbd, 0x12, 0x2a, 0x33, 0xbd, 0x6f, 0x59, 0x3b, 0xbb, 0xeb, 0x66,
+ 0xe0, 0xbc, 0x7b, 0x67, 0x60, 0xbb, 0x19, 0x8c, 0x85, 0x3c, 0x72, 0x71, 0x22,
+ 0x3b, 0x7f, 0xa1, 0x22, 0xbd, 0x9e, 0xcd, 0x04, 0x3d, 0x00, 0xf6, 0xff, 0xb9,
+ 0xdf, 0x8b, 0x16, 0xbd, 0xc1, 0x0c, 0xfd, 0x3c, 0x9b, 0xf9, 0x5b, 0xbd, 0x71,
+ 0x73, 0x8c, 0x3d, 0x0f, 0x55, 0x63, 0x3d, 0x20, 0xbf, 0xb9, 0x3c, 0xa3, 0xc5,
+ 0x85, 0x3d, 0xfd, 0x98, 0x2e, 0xbd, 0xb4, 0x02, 0x2e, 0xbc, 0xe2, 0x12, 0x46,
+ 0xbc, 0x90, 0x41, 0x6f, 0xbd, 0x0d, 0xc7, 0x68, 0x3d, 0x4e, 0x58, 0x4f, 0x3c,
+ 0xc0, 0xeb, 0x1d, 0xbb, 0x3d, 0xcb, 0x9f, 0xbd, 0x29, 0x0c, 0x7f, 0x3d, 0x8a,
+ 0x62, 0x4d, 0xbc, 0x01, 0x3c, 0x7b, 0x3d, 0x3c, 0x41, 0xb8, 0x3c, 0xa9, 0x70,
+ 0x53, 0x3d, 0x32, 0x94, 0xab, 0x3d, 0xdc, 0x75, 0x4c, 0x3d, 0xab, 0x5d, 0xd6,
+ 0xbc, 0xae, 0x74, 0x0a, 0xbd, 0x7f, 0xf5, 0xec, 0x3c, 0xff, 0x6e, 0x4c, 0xbd,
+ 0x0c, 0x65, 0x16, 0xbc, 0x4f, 0x2a, 0x58, 0x3c, 0xe2, 0x17, 0xa0, 0x3d, 0x6a,
+ 0x10, 0x83, 0xbc, 0xfc, 0x40, 0xc0, 0x3d, 0xbc, 0xa0, 0xad, 0xbc, 0xde, 0xdc,
+ 0x98, 0x3d, 0xaf, 0x54, 0x84, 0xbb, 0x64, 0xcd, 0xdf, 0x3c, 0xab, 0x93, 0x2c,
+ 0xbc, 0x44, 0x5c, 0x29, 0x3c, 0xac, 0x7f, 0x27, 0x3d, 0xb2, 0x34, 0xee, 0x3c,
+ 0x66, 0xf2, 0xd9, 0x3c, 0x4d, 0xaf, 0x86, 0x3d, 0xee, 0x79, 0x10, 0xbd, 0xa2,
+ 0x84, 0x31, 0xbd, 0xe2, 0xf9, 0x43, 0x3d, 0x26, 0x87, 0xf1, 0x3b, 0xf0, 0x3a,
+ 0x8f, 0xbd, 0x3e, 0x23, 0x5d, 0xbd, 0x75, 0x0a, 0x7c, 0x3d, 0x15, 0xe4, 0x5a,
+ 0xbd, 0x45, 0xb3, 0xb2, 0x3c, 0xe3, 0xc4, 0x36, 0x3d, 0x7d, 0x89, 0x9f, 0x3c,
+ 0x9e, 0x54, 0xaa, 0xbb, 0x89, 0x2e, 0x88, 0xbd, 0xad, 0xe0, 0x89, 0xbc, 0x69,
+ 0xe9, 0x66, 0xbd, 0x94, 0xa9, 0xf4, 0xbc, 0xb3, 0xde, 0x21, 0xbd, 0x0b, 0x5a,
+ 0x82, 0xbd, 0x55, 0x78, 0x00, 0x3d, 0x1f, 0x1d, 0xa2, 0xbd, 0x5c, 0xe4, 0x4b,
+ 0xbd, 0x63, 0x9e, 0xa6, 0xbd, 0x44, 0xdb, 0x75, 0xbd, 0x6a, 0xe7, 0xf3, 0xbc,
+ 0xdc, 0xa5, 0x2c, 0xbd, 0xc7, 0xcd, 0x8d, 0x3c, 0xd4, 0x97, 0x85, 0x3c, 0xc5,
+ 0x19, 0x4a, 0xbc, 0x48, 0x7d, 0x09, 0xbc, 0xd6, 0x74, 0x2c, 0xbd, 0x94, 0xb6,
+ 0xf9, 0x3c, 0xfd, 0x54, 0x8d, 0x3d, 0xdf, 0x85, 0x57, 0x3d, 0x82, 0x58, 0x67,
+ 0x3d, 0x67, 0x4a, 0xe8, 0xba, 0xec, 0xb0, 0xe9, 0x3c, 0x9a, 0xf0, 0x1f, 0x3d,
+ 0x80, 0xbc, 0x7e, 0xbd, 0x15, 0xe3, 0x16, 0x3d, 0x49, 0xb7, 0x33, 0xbc, 0x03,
+ 0xbe, 0x65, 0xbd, 0x6c, 0x41, 0x8b, 0x3d, 0x93, 0x68, 0x85, 0xbc, 0x50, 0x1a,
+ 0x50, 0xbd, 0x10, 0xbe, 0x7f, 0xbc, 0x15, 0x0c, 0x58, 0xbc, 0x48, 0xe9, 0x92,
+ 0xbd, 0x48, 0x67, 0x3e, 0xbc, 0x38, 0x60, 0x66, 0xbd, 0x76, 0xac, 0x9e, 0xbd,
+ 0x4d, 0xc9, 0x61, 0x3d, 0x0b, 0xa6, 0x9f, 0xbd, 0x8f, 0x08, 0xcb, 0x3c, 0x60,
+ 0x17, 0x35, 0x3d, 0x60, 0x75, 0x7a, 0x3c, 0x24, 0x97, 0x48, 0x3a, 0x64, 0x78,
+ 0x90, 0xbc, 0xf3, 0x93, 0xb8, 0xbb, 0x46, 0x84, 0x69, 0xbd, 0xd6, 0x71, 0x43,
+ 0x3d, 0xb4, 0x2b, 0x62, 0xbc, 0x47, 0x6b, 0x08, 0x3c, 0x0e, 0x23, 0xeb, 0xbc,
+ 0xf4, 0xc8, 0xb0, 0xbc, 0x3f, 0x17, 0xbe, 0xbc, 0x11, 0xc5, 0x99, 0x3d, 0x50,
+ 0x81, 0x15, 0x3d, 0x8e, 0xd8, 0x7d, 0x3d, 0xfd, 0x07, 0x8d, 0xbb, 0x7a, 0x46,
+ 0xea, 0x3c, 0x7d, 0xc9, 0x2c, 0x3d, 0x1e, 0x27, 0x2f, 0x3d, 0x67, 0x04, 0x05,
+ 0xbc, 0x8f, 0x0a, 0x71, 0xbc, 0x44, 0xcb, 0x78, 0xbc, 0x3b, 0x8e, 0x17, 0x3d,
+ 0x8c, 0x61, 0xf6, 0x3c, 0xdf, 0x7a, 0x54, 0x3d, 0x93, 0xe6, 0xaa, 0xbc, 0xef,
+ 0x19, 0xd2, 0xbc, 0xb8, 0xec, 0x13, 0x3d, 0xed, 0x16, 0x39, 0x3d, 0x7c, 0xb2,
+ 0xdc, 0x3c, 0x03, 0xf9, 0x84, 0xb9, 0xe7, 0xbd, 0x70, 0xbc, 0xea, 0x33, 0x77,
+ 0x3d, 0xa8, 0xd3, 0x55, 0x3c, 0x3b, 0x55, 0x04, 0x3c, 0x72, 0x75, 0x67, 0xbc,
+ 0xde, 0x63, 0x4b, 0xbc, 0x73, 0xc5, 0x01, 0xbd, 0x2e, 0x1b, 0x01, 0x3c, 0xb2,
+ 0xeb, 0x57, 0x3d, 0x81, 0xaa, 0x2d, 0xbd, 0x68, 0x5f, 0x1c, 0xbd, 0x0e, 0x36,
+ 0x77, 0x3d, 0xd9, 0xb5, 0x27, 0x3c, 0x99, 0x74, 0x27, 0x3d, 0xae, 0x86, 0x74,
+ 0xbd, 0x57, 0x12, 0x0e, 0xbd, 0x37, 0x30, 0x2a, 0x3d, 0x5e, 0xf5, 0x3b, 0x3d,
+ 0x37, 0x81, 0x6f, 0x3d, 0xd3, 0xe7, 0x4b, 0xbd, 0x4a, 0x7f, 0x85, 0x3d, 0xce,
+ 0x31, 0x21, 0x3d, 0xda, 0xf8, 0x86, 0xbc, 0x5e, 0x6d, 0x1f, 0x3c, 0x80, 0x1b,
+ 0x06, 0x3b, 0xd7, 0x82, 0x5f, 0x3d, 0x74, 0xc0, 0x26, 0xbd, 0x1d, 0x0e, 0x8d,
+ 0xbc, 0x00, 0xfe, 0x06, 0x3d, 0x5f, 0x91, 0x79, 0xbd, 0x53, 0x7a, 0xee, 0xbc,
+ 0x64, 0x03, 0x41, 0x3d, 0x66, 0xa9, 0xfa, 0xba, 0x67, 0x37, 0x40, 0xbd, 0xd8,
+ 0x7f, 0x23, 0xbd, 0x1a, 0x9f, 0x03, 0xbc, 0x93, 0x26, 0x03, 0xbd, 0xeb, 0xf7,
+ 0x58, 0xbc, 0x04, 0xe4, 0xdc, 0xb9, 0xb6, 0xbb, 0x9b, 0x3b, 0x9e, 0x4b, 0x14,
+ 0x3d, 0x5a, 0x9a, 0xd4, 0xba, 0x59, 0xcd, 0x21, 0xbd, 0x00, 0xc3, 0x85, 0x3c,
+ 0xec, 0xbf, 0xf2, 0xbc, 0x0e, 0x59, 0x3a, 0xbd, 0xa7, 0x8f, 0x81, 0x3d, 0x11,
+ 0x2d, 0x63, 0xbd, 0x55, 0x42, 0xe8, 0xbc, 0x6b, 0x6e, 0x8c, 0x3c, 0xa3, 0x84,
+ 0x1d, 0xbd, 0x8c, 0xda, 0x4f, 0x3c, 0xb2, 0x36, 0xd1, 0x3c, 0x4f, 0x27, 0x71,
+ 0x3d, 0xf8, 0x32, 0x8c, 0x3c, 0x5c, 0xe8, 0x69, 0xbc, 0x42, 0xcb, 0x24, 0x3d,
+ 0x8f, 0xd8, 0x6b, 0xbd, 0x87, 0xd2, 0x9c, 0xbd, 0xc5, 0x3f, 0xb5, 0x3c, 0x08,
+ 0xfc, 0xf9, 0x3c, 0x5b, 0x21, 0x7e, 0x3d, 0xef, 0x06, 0x65, 0xbc, 0xda, 0x92,
+ 0x02, 0x3c, 0xb1, 0xf0, 0x99, 0xbc, 0x2e, 0x72, 0xe7, 0xbc, 0x32, 0x44, 0x6a,
+ 0xbd, 0xdd, 0xbb, 0x20, 0x3b, 0xa1, 0xbf, 0xa3, 0x3c, 0xd2, 0x4f, 0x9b, 0x3c,
+ 0xf8, 0x55, 0xbe, 0x3c, 0x35, 0xe3, 0x0a, 0x3d, 0xf0, 0x8a, 0x89, 0xbc, 0xd7,
+ 0xd7, 0x6f, 0x3d, 0x96, 0xd9, 0x70, 0xbd, 0x00, 0x50, 0x20, 0x39, 0x1f, 0xa7,
+ 0x17, 0x3d, 0x4f, 0x4f, 0xc3, 0xbb, 0xf6, 0x99, 0x40, 0xbd, 0x87, 0xd4, 0x2a,
+ 0xbd, 0x09, 0x54, 0x06, 0x3d, 0x87, 0x46, 0xf4, 0xbb, 0x9c, 0x12, 0x12, 0x3c,
+ 0x2f, 0xc9, 0xd1, 0x3c, 0x4c, 0x47, 0x4e, 0x3d, 0xf9, 0x77, 0x64, 0xbd, 0xd1,
+ 0xa5, 0x17, 0xbd, 0xf3, 0x5b, 0xdb, 0x3c, 0x98, 0x30, 0x55, 0x3d, 0x3f, 0x3d,
+ 0x37, 0xbd, 0x54, 0x12, 0xed, 0xbc, 0x30, 0x26, 0x1d, 0x3d, 0x72, 0x80, 0x8a,
+ 0x3d, 0xf1, 0xd7, 0x4c, 0xbd, 0xa9, 0xc7, 0x83, 0x3d, 0x86, 0xba, 0x93, 0xbd,
+ 0x6b, 0x0a, 0x90, 0xbd, 0x96, 0x8c, 0x64, 0xbd, 0x40, 0x70, 0xf1, 0x3a, 0xc0,
+ 0x39, 0x79, 0x3d, 0x27, 0xda, 0x24, 0xbc, 0x36, 0x2e, 0x3c, 0x3d, 0xb0, 0xbe,
+ 0x90, 0xbd, 0x20, 0x68, 0x14, 0xbc, 0x00, 0xa4, 0x3e, 0xbc, 0x85, 0xb9, 0x44,
+ 0xbd, 0xa2, 0x06, 0x52, 0xbd, 0x6e, 0xae, 0x4a, 0xbd, 0xbe, 0x73, 0x6c, 0xbd,
+ 0x49, 0xee, 0x3e, 0xbd, 0x36, 0x8a, 0xe0, 0x3c, 0x7f, 0x94, 0x8a, 0xbd, 0x19,
+ 0x1d, 0x11, 0xbd, 0x15, 0x3e, 0x55, 0xbd, 0x4b, 0xcd, 0x7b, 0x3d, 0x63, 0xd7,
+ 0x9f, 0xba, 0x83, 0xcb, 0x37, 0xbd, 0xa4, 0x4f, 0x21, 0xbd, 0xa5, 0xaf, 0xec,
+ 0xbc, 0xcd, 0x46, 0xae, 0xbd, 0xe8, 0x66, 0x9d, 0x3c, 0x7c, 0x84, 0xa6, 0xbc,
+ 0x85, 0xcc, 0x7f, 0x3d, 0xa5, 0x28, 0xa6, 0xbd, 0x2f, 0x3a, 0x55, 0xbc, 0xb4,
+ 0x8b, 0xc8, 0xbc, 0xd3, 0x90, 0x5e, 0x3d, 0x49, 0x79, 0x81, 0xbd, 0x50, 0xc3,
+ 0x79, 0xbc, 0x90, 0x04, 0x9b, 0xbd, 0x1e, 0xdb, 0x73, 0x3d, 0x97, 0x15, 0x7e,
+ 0x3c, 0x5f, 0xf6, 0x83, 0x3d, 0x1d, 0x20, 0x32, 0x3c, 0xda, 0x32, 0x7a, 0xbd,
+ 0x8f, 0xa0, 0x69, 0x3c, 0x20, 0xe0, 0x87, 0xbd, 0x08, 0xb7, 0x2f, 0x3d, 0x5e,
+ 0x6c, 0x26, 0xbd, 0xba, 0xa8, 0xbe, 0xbc, 0xb3, 0x9b, 0xb7, 0xbc, 0xc1, 0x3e,
+ 0x8e, 0x3d, 0x45, 0x90, 0x3f, 0xbd, 0x82, 0xee, 0x0c, 0x3d, 0x62, 0xe1, 0x38,
+ 0xbc, 0x30, 0x95, 0x8b, 0x3c, 0xc6, 0x6b, 0x58, 0x3d, 0x7c, 0xca, 0x06, 0xbd,
+ 0x03, 0xa3, 0x7b, 0x3d, 0x77, 0xef, 0x83, 0x3c, 0x24, 0xc7, 0x69, 0x3d, 0xf6,
+ 0xed, 0x35, 0xbd, 0xaa, 0x2d, 0x33, 0x3d, 0x71, 0x69, 0x72, 0x3c, 0xed, 0x0d,
+ 0x80, 0x3c, 0x02, 0x0d, 0x47, 0x3d, 0x30, 0x51, 0x86, 0xbc, 0x0a, 0xad, 0x8d,
+ 0xbc, 0x80, 0xab, 0x1c, 0x3d, 0x68, 0x17, 0x3d, 0x3d, 0x47, 0x3c, 0x36, 0xbd,
+ 0x32, 0x58, 0xfb, 0x3c, 0x27, 0x47, 0x82, 0x3d, 0xb8, 0x9c, 0x92, 0xbc, 0xab,
+ 0xa8, 0xaf, 0xbb, 0x97, 0xb4, 0x7b, 0x3d, 0xdb, 0x16, 0xad, 0xbc, 0xa8, 0x50,
+ 0x8b, 0xbd, 0x50, 0x91, 0x4d, 0x3c, 0xe1, 0x69, 0x73, 0x3c, 0x62, 0x4f, 0x30,
+ 0xbd, 0x00, 0x70, 0x6a, 0x3c, 0x57, 0xbb, 0x8f, 0x3d, 0xe6, 0x60, 0x44, 0xbd,
+ 0x33, 0x5a, 0xc2, 0xbc, 0xe6, 0xae, 0x82, 0xbd, 0x1e, 0xad, 0x6e, 0xbd, 0xc9,
+ 0x43, 0x30, 0x3d, 0x30, 0x4a, 0x65, 0x3c, 0x79, 0x1d, 0xc7, 0x3c, 0x97, 0xab,
+ 0x1e, 0x3b, 0x95, 0x60, 0xd7, 0xbc, 0xcc, 0xed, 0xa1, 0xbc, 0xa3, 0x6d, 0x6b,
+ 0xbd, 0xd8, 0xc4, 0x30, 0x3c, 0xcf, 0x3e, 0x8b, 0xbc, 0x82, 0xd9, 0x0d, 0xbc,
+ 0x6b, 0x1f, 0xdb, 0xbc, 0xb7, 0x65, 0x76, 0xbd, 0x19, 0x3a, 0xfb, 0x3c, 0xe8,
+ 0x08, 0x08, 0xbd, 0x0b, 0xdb, 0x00, 0xbd, 0x4c, 0x51, 0x19, 0xbd, 0x2e, 0x6c,
+ 0x37, 0x3d, 0xc0, 0xdf, 0x1e, 0x3b, 0x64, 0x10, 0x49, 0x3d, 0x77, 0x9b, 0xca,
+ 0xbc, 0xca, 0x17, 0xfb, 0xbc, 0xe6, 0xa4, 0x92, 0x3d, 0xfd, 0x90, 0x77, 0x3d,
+ 0x82, 0x5e, 0x6b, 0x3d, 0xe5, 0x15, 0x3c, 0x3d, 0xc3, 0x45, 0xf9, 0xbb, 0x0c,
+ 0x61, 0x88, 0xbd, 0x26, 0xa1, 0x68, 0xbd, 0x67, 0x2c, 0x1e, 0xbd, 0x2b, 0xfe,
+ 0x3e, 0xbd, 0xb9, 0x45, 0x0b, 0xbd, 0x8e, 0x79, 0x09, 0xbd, 0x16, 0xdf, 0x45,
+ 0xbd, 0x52, 0xbb, 0x24, 0xbc, 0x84, 0x55, 0x78, 0xbd, 0xb7, 0x6d, 0x55, 0x3d,
+ 0xb8, 0xe4, 0x8a, 0x3d, 0xcc, 0x8e, 0x2d, 0xbd, 0xf8, 0x0a, 0x13, 0x3c, 0xda,
+ 0x22, 0x23, 0x3d, 0xee, 0x07, 0x1e, 0x3d, 0xee, 0x5c, 0x38, 0xbd, 0x1b, 0xfa,
+ 0xc1, 0xbc, 0x62, 0x88, 0x82, 0xbc, 0x9e, 0x6c, 0x39, 0xbd, 0xe8, 0xc8, 0x90,
+ 0xbd, 0xb2, 0xaf, 0x0e, 0xbd, 0x87, 0xc1, 0x61, 0xbc, 0x91, 0xcf, 0x21, 0x3b,
+ 0xaa, 0x52, 0x88, 0xbd, 0x2b, 0xcb, 0x8e, 0xbd, 0x42, 0x58, 0xb0, 0x3c, 0x72,
+ 0x3e, 0x9a, 0x3c, 0x1e, 0x92, 0x09, 0x3d, 0xc6, 0x67, 0x9a, 0xbd, 0xa0, 0xb0,
+ 0x29, 0x3b, 0x51, 0x6e, 0x0c, 0xbd, 0x88, 0x0d, 0x4d, 0xbd, 0x1c, 0xc3, 0xee,
+ 0x3c, 0x43, 0xfc, 0x61, 0x3d, 0x74, 0x13, 0x84, 0x3c, 0x10, 0xbc, 0xd4, 0x3c,
+ 0x8a, 0x20, 0x9d, 0x39, 0x0a, 0x33, 0xdd, 0x3b, 0xee, 0x75, 0x96, 0xbd, 0x77,
+ 0x4f, 0xa2, 0x3c, 0x1a, 0x55, 0xe4, 0xbc, 0x17, 0x4b, 0x5c, 0xbc, 0xe8, 0x22,
+ 0x5a, 0xbd, 0xcf, 0xa8, 0x46, 0x3c, 0x2e, 0x1d, 0x2c, 0xbd, 0x7c, 0x53, 0x62,
+ 0xbc, 0x4e, 0xdc, 0x25, 0x3d, 0x3c, 0x94, 0x4e, 0xbd, 0xba, 0x9a, 0x3b, 0xbd,
+ 0x32, 0x01, 0x02, 0x3d, 0x57, 0xd2, 0x80, 0x3d, 0x88, 0x7d, 0xb4, 0xbc, 0x81,
+ 0xbf, 0x7f, 0xbd, 0xf7, 0xbb, 0x89, 0x3d, 0xa0, 0xba, 0x30, 0x3d, 0x13, 0xd5,
+ 0x91, 0x3d, 0xc7, 0x59, 0x37, 0x3d, 0x3c, 0xc1, 0x95, 0xbd, 0x41, 0x62, 0x94,
+ 0xbc, 0x09, 0x66, 0x25, 0xbc, 0x4a, 0x10, 0x84, 0xbd, 0xf0, 0x61, 0x09, 0x3d,
+ 0x7c, 0xba, 0x6d, 0x3d, 0x43, 0x44, 0x60, 0x3d, 0xbc, 0x42, 0x2d, 0x3d, 0x09,
+ 0x6d, 0x2d, 0x3d, 0x3b, 0x61, 0xb1, 0x3c, 0xd7, 0xb2, 0x36, 0xbc, 0x10, 0xe9,
+ 0x06, 0xbd, 0xd4, 0x30, 0x64, 0x3d, 0x4e, 0xb2, 0x8d, 0xbc, 0x54, 0x0d, 0x24,
+ 0xbd, 0xb6, 0x13, 0xe8, 0x3c, 0xe1, 0xd2, 0xd3, 0x3c, 0xd2, 0xc8, 0x99, 0xbc,
+ 0x5c, 0x05, 0x75, 0x3d, 0x58, 0x19, 0x91, 0x3d, 0x66, 0x5b, 0x03, 0xbd, 0xf4,
+ 0x88, 0xbd, 0xbc, 0xff, 0x51, 0x93, 0xbc, 0xaa, 0xc8, 0x3e, 0x3d, 0x57, 0x16,
+ 0xbc, 0xba, 0xf4, 0xe1, 0xa0, 0xbd, 0x3a, 0x82, 0x94, 0xbd, 0x77, 0xfa, 0x86,
+ 0xbd, 0xa6, 0xfd, 0x84, 0xbb, 0x91, 0x28, 0xeb, 0xbb, 0x86, 0xfd, 0xca, 0xbc,
+ 0x7f, 0xd4, 0x10, 0xbc, 0xea, 0x09, 0x08, 0xbd, 0xbe, 0x9e, 0x23, 0xbc, 0x5a,
+ 0x6a, 0x4f, 0xbd, 0x00, 0xf1, 0x54, 0x3d, 0xf4, 0x72, 0xb8, 0xbc, 0x0a, 0xde,
+ 0x0f, 0x3d, 0x27, 0x61, 0x1b, 0x3d, 0xed, 0xb6, 0x49, 0xbd, 0x11, 0x6d, 0xfb,
+ 0x3c, 0x51, 0x41, 0x75, 0x3d, 0x0b, 0x3b, 0x68, 0x3d, 0x1e, 0xb2, 0x6c, 0xbd,
+ 0xd0, 0x5a, 0xfe, 0x3c, 0x3d, 0xa0, 0x30, 0xbd, 0xc8, 0xf9, 0x89, 0x3c, 0x10,
+ 0x06, 0x72, 0x3d, 0xed, 0x61, 0xe1, 0x3a, 0x35, 0x65, 0x7e, 0x3d, 0x16, 0x6c,
+ 0x4d, 0x3d, 0x8a, 0xf6, 0x5a, 0x3d, 0x3e, 0x18, 0x64, 0x3d, 0x36, 0x9a, 0xbe,
+ 0x3c, 0x14, 0xa7, 0xba, 0xbc, 0x93, 0x98, 0xe3, 0x3c, 0x14, 0x13, 0x30, 0x3d,
+ 0xa8, 0x9a, 0x71, 0xbc, 0xd0, 0x9e, 0xfd, 0xbc, 0x10, 0x8b, 0xa7, 0xbd, 0xb9,
+ 0x47, 0x2f, 0x3d, 0x44, 0xff, 0x9c, 0xbd, 0x5b, 0x84, 0x3e, 0xbd, 0xc6, 0xa4,
+ 0xaa, 0x3c, 0x5b, 0xa9, 0x0e, 0xbd, 0x6b, 0xa6, 0x33, 0x3d, 0x65, 0x26, 0x46,
+ 0x3d, 0x8e, 0x5d, 0xdc, 0xbc, 0x62, 0xcf, 0x43, 0xbd, 0xfd, 0x0e, 0x86, 0x3d,
+ 0x52, 0xd5, 0xf3, 0x3c, 0x10, 0x00, 0x50, 0xbc, 0x55, 0xec, 0x6c, 0xbd, 0x9b,
+ 0x21, 0x46, 0x3d, 0xb3, 0xe4, 0x80, 0xbc, 0xa1, 0xf7, 0x84, 0xbd, 0x64, 0x01,
+ 0x4e, 0xbd, 0x01, 0xfb, 0x3e, 0xbc, 0x28, 0xfc, 0xac, 0xbc, 0x84, 0xf6, 0x17,
+ 0x3c, 0x69, 0x7c, 0xd9, 0xbc, 0x30, 0xb8, 0xfe, 0xbc, 0x0e, 0x3a, 0x87, 0xbd,
+ 0x88, 0xad, 0x93, 0xbd, 0xe1, 0x85, 0x8d, 0xbd, 0x42, 0x8c, 0x12, 0x3d, 0x41,
+ 0x59, 0x84, 0xbd, 0x1c, 0x0e, 0x70, 0xbb, 0xb0, 0x9e, 0xd3, 0xbc, 0x3c, 0x03,
+ 0xdb, 0xbb, 0xf4, 0x19, 0x01, 0x3d, 0x6f, 0x20, 0xc6, 0x3c, 0x77, 0xc0, 0xb4,
+ 0x3c, 0x4a, 0xa0, 0xa7, 0x3c, 0x1c, 0xaa, 0x2a, 0xbd, 0x49, 0x9b, 0x60, 0xbd,
+ 0x30, 0xff, 0xf9, 0xbc, 0x2f, 0x70, 0xc9, 0xbb, 0x72, 0x4b, 0x8f, 0xbd, 0x47,
+ 0xc6, 0x34, 0x3d, 0x18, 0x49, 0x21, 0x3c, 0x04, 0x19, 0x30, 0x3d, 0x74, 0xbe,
+ 0x7b, 0xbb, 0xbc, 0x92, 0x43, 0xbc, 0x6f, 0xb6, 0xdf, 0xbc, 0x20, 0xdb, 0x90,
+ 0x3c, 0x45, 0x29, 0x95, 0xbc, 0x4c, 0x9c, 0xa6, 0x3c, 0x2b, 0xbf, 0xe4, 0xbc,
+ 0xa9, 0x41, 0xff, 0xbc, 0x62, 0x15, 0xd4, 0x3c, 0x29, 0x60, 0x8e, 0xbd, 0x8d,
+ 0xce, 0x56, 0xbc, 0x84, 0x09, 0x41, 0x3d, 0x16, 0xb8, 0x35, 0x3d, 0x03, 0x5c,
+ 0x09, 0xbd, 0x82, 0xfe, 0x64, 0x3d, 0x16, 0x2e, 0x6d, 0xbd, 0xbf, 0x4b, 0x05,
+ 0xbd, 0x15, 0x9a, 0x28, 0xbd, 0x1d, 0x3d, 0x4f, 0xbd, 0x7c, 0x8a, 0x99, 0x3b,
+ 0xf9, 0x8c, 0x35, 0xbd, 0xef, 0xc2, 0x2a, 0xbd, 0xe6, 0xea, 0x85, 0xbc, 0xfd,
+ 0xf1, 0xde, 0x3b, 0xce, 0xb3, 0x5f, 0x3d, 0x2f, 0x4a, 0x30, 0xbc, 0xc5, 0xa1,
+ 0x09, 0xbd, 0x63, 0x5f, 0x5e, 0xbd, 0x44, 0xc9, 0xc2, 0xbc, 0xb6, 0x2a, 0xf8,
+ 0xbc, 0x58, 0x39, 0x34, 0x3d, 0x49, 0xbe, 0x5c, 0xbd, 0x45, 0xad, 0x1d, 0x3c,
+ 0x3f, 0x9f, 0x19, 0xbd, 0xfb, 0xef, 0x2e, 0x3c, 0xd5, 0xe8, 0x88, 0x3c, 0x13,
+ 0x36, 0x5c, 0xbd, 0x04, 0xeb, 0x78, 0x3c, 0x6e, 0x39, 0x64, 0x3d, 0xdc, 0x1e,
+ 0x70, 0x3d, 0x79, 0x43, 0x4d, 0x3d, 0xfd, 0x0f, 0x30, 0xbd, 0xd2, 0x88, 0x18,
+ 0x3d, 0x87, 0x62, 0xcc, 0x3c, 0x00, 0x39, 0x30, 0x3d, 0xba, 0xa0, 0xfa, 0xbc,
+ 0x00, 0x3d, 0x41, 0x3d, 0xed, 0xfa, 0x73, 0xbd, 0x0c, 0x09, 0x54, 0xbd, 0x77,
+ 0x2f, 0x5f, 0xbd, 0x01, 0x38, 0x7f, 0xbd, 0x98, 0x08, 0xee, 0xbc, 0x53, 0x34,
+ 0x48, 0xbc, 0x8a, 0x25, 0x72, 0xbc, 0xf3, 0x71, 0x70, 0xbd, 0x44, 0xdf, 0x1b,
+ 0x3d, 0xd8, 0x6e, 0x6f, 0xbd, 0xdf, 0x4d, 0x23, 0x3c, 0x9c, 0xfb, 0x21, 0x3d,
+ 0x72, 0xe1, 0xa4, 0xbc, 0x74, 0xc3, 0x2e, 0xbd, 0x63, 0x0c, 0x8a, 0xbc, 0x24,
+ 0x09, 0x6e, 0xbd, 0xbb, 0x68, 0x68, 0xbd, 0x7d, 0xd7, 0x6c, 0x3d, 0xd8, 0x63,
+ 0x63, 0x3c, 0x1a, 0x16, 0xdb, 0xbb, 0x86, 0x5e, 0x40, 0xbd, 0x50, 0x6d, 0x31,
+ 0xbb, 0xdd, 0xb6, 0x96, 0xbd, 0x19, 0x27, 0x56, 0xbd, 0xf3, 0xd5, 0x11, 0x3d,
+ 0x91, 0x8e, 0x68, 0x3d, 0xea, 0xed, 0x86, 0xbd, 0xd6, 0x51, 0x87, 0xbc, 0xfb,
+ 0x6c, 0x76, 0xbd, 0x50, 0x6f, 0x38, 0x3d, 0x9b, 0xa5, 0x71, 0xbd, 0x9b, 0x1f,
+ 0x16, 0xbd, 0x25, 0xee, 0x93, 0x3d, 0xa9, 0x05, 0xca, 0xbc, 0x9f, 0xee, 0x36,
+ 0xbd, 0x5c, 0x03, 0x28, 0x3d, 0x52, 0x3b, 0xb1, 0x3c, 0xe3, 0x45, 0x13, 0x3d,
+ 0x38, 0xec, 0x82, 0xbd, 0xba, 0xc6, 0x5f, 0x3d, 0x18, 0xf7, 0x59, 0x3d, 0xc4,
+ 0x2f, 0x89, 0x3c, 0x3c, 0x23, 0xd1, 0xbc, 0x39, 0xa7, 0x28, 0x3d, 0x07, 0x78,
+ 0x17, 0xbc, 0x72, 0xe3, 0xaf, 0xbc, 0x15, 0x2e, 0x2d, 0x3d, 0x2c, 0x3d, 0xa3,
+ 0x3c, 0x33, 0x96, 0x18, 0xbd, 0xee, 0x47, 0x30, 0xbd, 0x56, 0xc0, 0x0e, 0xbd,
+ 0xae, 0x3b, 0x74, 0x3c, 0x79, 0x3e, 0x94, 0x3d, 0xee, 0x19, 0x3d, 0xbd, 0x8d,
+ 0x14, 0x7a, 0xbd, 0x49, 0xfa, 0x2e, 0x3d, 0x9a, 0x0e, 0x8e, 0xbd, 0x41, 0x87,
+ 0x45, 0x3c, 0x3b, 0x28, 0x66, 0xbd, 0x3d, 0xbd, 0x20, 0x3d, 0x60, 0x4e, 0x80,
+ 0xbd, 0x7a, 0x3c, 0x50, 0xbd, 0xaa, 0x0f, 0x9e, 0xbd, 0xa2, 0x81, 0x57, 0xbd,
+ 0x69, 0xf7, 0x27, 0x3d, 0x62, 0x88, 0x17, 0xbc, 0x47, 0x5d, 0xac, 0x3c, 0xe7,
+ 0x41, 0x31, 0xbd, 0xde, 0xec, 0x85, 0xbd, 0x74, 0xa1, 0x48, 0xbd, 0x80, 0x0d,
+ 0x2a, 0xbd, 0x5e, 0x67, 0x7e, 0x3c, 0x35, 0xa5, 0xc6, 0x3c, 0xc4, 0xeb, 0x89,
+ 0xbc, 0xcb, 0xa7, 0x97, 0x3c, 0x0f, 0xca, 0x68, 0x3c, 0xeb, 0x57, 0xea, 0xbc,
+ 0x88, 0xf8, 0xb3, 0x3c, 0x44, 0x92, 0xee, 0x3c, 0x89, 0xa1, 0x92, 0x3d, 0x61,
+ 0xa5, 0x23, 0x3a, 0x1e, 0x6c, 0x28, 0xbd, 0x18, 0x89, 0xa4, 0x3c, 0xd1, 0x26,
+ 0x47, 0x3b, 0x4a, 0x06, 0x80, 0x3c, 0x3a, 0x5f, 0x58, 0xbd, 0x6e, 0x1d, 0x77,
+ 0xbd, 0xe1, 0x43, 0x89, 0x3a, 0x41, 0xd0, 0x71, 0xbc, 0x90, 0x43, 0x40, 0xbd,
+ 0xa5, 0xc3, 0x3a, 0x3c, 0xc2, 0x45, 0xb1, 0xbb, 0xf1, 0x81, 0x32, 0x3d, 0x80,
+ 0x8e, 0x20, 0x3d, 0x0a, 0xbd, 0x14, 0x3d, 0xbb, 0x93, 0x3e, 0xbd, 0x50, 0x1f,
+ 0x5b, 0x3d, 0xb7, 0xd1, 0x99, 0xbd, 0xbe, 0x77, 0x4b, 0x3d, 0x5f, 0xd4, 0x58,
+ 0x3d, 0xdc, 0xab, 0xa4, 0x3c, 0x41, 0x6c, 0x78, 0xbd, 0xbd, 0x11, 0x71, 0x3c,
+ 0xc9, 0x97, 0x50, 0xbd, 0x93, 0xca, 0xe9, 0x3b, 0xec, 0x1b, 0xb4, 0xbc, 0xcf,
+ 0xb1, 0x48, 0x3c, 0x26, 0xd1, 0x99, 0x3c, 0x9b, 0xca, 0x26, 0xbd, 0xe0, 0xaf,
+ 0x2f, 0xbc, 0xef, 0x23, 0x84, 0xbd, 0x10, 0x75, 0xe1, 0x3b, 0xe6, 0x8c, 0x3c,
+ 0x3d, 0xad, 0x1a, 0x48, 0x3d, 0xfe, 0x04, 0x3f, 0x3d, 0xf2, 0x2f, 0xe0, 0xbc,
+ 0x98, 0x58, 0xe3, 0xbb, 0xe2, 0x78, 0x84, 0x3d, 0xde, 0x9e, 0x97, 0x3b, 0xe3,
+ 0x90, 0x35, 0xbd, 0xb9, 0xf5, 0x57, 0x3c, 0x29, 0x97, 0x18, 0x3c, 0xa7, 0xe6,
+ 0x02, 0x3d, 0x6e, 0xd3, 0x0b, 0x3d, 0x09, 0x9f, 0x51, 0xbd, 0xca, 0x5b, 0xac,
+ 0x3a, 0x38, 0xd9, 0x55, 0xbd, 0xc0, 0x50, 0x0b, 0x3d, 0x63, 0xe8, 0x69, 0xbd,
+ 0x96, 0xeb, 0x86, 0xbd, 0x43, 0x18, 0x26, 0x3d, 0x76, 0xab, 0xd8, 0x3a, 0xe3,
+ 0x0e, 0xb9, 0xbc, 0xed, 0xb2, 0x33, 0x3c, 0x67, 0x1d, 0x7c, 0xbd, 0x13, 0x39,
+ 0xa8, 0x3b, 0x4b, 0xa3, 0x39, 0xbd, 0x17, 0xb9, 0x44, 0xbd, 0x88, 0x76, 0x43,
+ 0xbd, 0xdd, 0x31, 0x61, 0xbd, 0x2d, 0x7d, 0xae, 0xbc, 0xe9, 0xb8, 0x05, 0x3d,
+ 0xdd, 0x80, 0x2a, 0xbd, 0x55, 0x66, 0x08, 0xbd, 0xea, 0x09, 0x8a, 0xbd, 0x13,
+ 0xd8, 0x0d, 0xbd, 0x7e, 0x9d, 0x5a, 0x3d, 0x08, 0x68, 0x8d, 0x3c, 0x02, 0x87,
+ 0xdc, 0x3c, 0xfb, 0x55, 0xda, 0xb9, 0xc4, 0x69, 0x71, 0xbd, 0xd1, 0x02, 0xf6,
+ 0xbc, 0x92, 0x01, 0x0c, 0x3d, 0xbb, 0x2c, 0x40, 0xbd, 0x82, 0x69, 0x97, 0x3d,
+ 0x2b, 0xda, 0x57, 0xbd, 0x7b, 0x9b, 0xe0, 0x3b, 0xff, 0xfd, 0x4b, 0xbd, 0x5c,
+ 0xa6, 0x2e, 0x3d, 0x40, 0xec, 0x85, 0xbd, 0x3b, 0x5d, 0x17, 0xbd, 0x52, 0x04,
+ 0x2c, 0xbd, 0x61, 0x00, 0x20, 0x3c, 0x65, 0x33, 0x28, 0xbc, 0x77, 0x76, 0x07,
+ 0x3d, 0x7a, 0xff, 0x32, 0x3b, 0xb9, 0x96, 0x59, 0xbd, 0xe0, 0xe1, 0x43, 0xbd,
+ 0x17, 0xa7, 0x6b, 0xbd, 0xf8, 0xa6, 0x4d, 0xbd, 0x4f, 0xc3, 0x9d, 0xbb, 0xfa,
+ 0x3a, 0x39, 0xbd, 0xe3, 0x59, 0x9a, 0xbd, 0xbd, 0xb9, 0x43, 0xbc, 0x21, 0xc4,
+ 0x0c, 0x3c, 0x3e, 0x70, 0x47, 0xbd, 0x42, 0xcf, 0x93, 0x3b, 0x9b, 0xe0, 0x34,
+ 0x3d, 0x00, 0x5d, 0xeb, 0x39, 0x5f, 0x65, 0x80, 0xbd, 0x37, 0x8a, 0x65, 0x3d,
+ 0x0e, 0x1b, 0x67, 0xbc, 0xa0, 0x0a, 0x68, 0x3c, 0xc5, 0x6d, 0xf7, 0x3c, 0xe1,
+ 0x9d, 0x85, 0x3d, 0xa8, 0xe7, 0x69, 0xbd, 0x30, 0x9c, 0x36, 0xbd, 0xcf, 0x55,
+ 0xdf, 0x3c, 0x85, 0xe9, 0x4c, 0x3d, 0x3e, 0x03, 0x8a, 0xbd, 0x19, 0xe1, 0x86,
+ 0xbb, 0xa0, 0x51, 0xec, 0x3c, 0x11, 0xc9, 0x84, 0x3d, 0x48, 0xa9, 0x1d, 0x3d,
+ 0x1c, 0xd6, 0xee, 0x3b, 0x82, 0x07, 0x96, 0xbc, 0x33, 0x6b, 0xd0, 0x3c, 0x62,
+ 0x62, 0xb6, 0x3c, 0x4a, 0x35, 0x62, 0x3d, 0x10, 0x85, 0x66, 0xbd, 0xc9, 0xf5,
+ 0x53, 0xbc, 0x70, 0x4a, 0xfa, 0x3b, 0xa5, 0x21, 0x33, 0xbd, 0xe7, 0x07, 0x40,
+ 0x3b, 0x6d, 0xe3, 0x16, 0x3d, 0x11, 0xa2, 0xa7, 0x3a, 0x01, 0x73, 0x95, 0xbc,
+ 0x5c, 0xd1, 0x2e, 0xbd, 0x5c, 0x41, 0x00, 0xbd, 0x02, 0x40, 0x8a, 0x3d, 0x66,
+ 0xcf, 0x2b, 0x3d, 0x3d, 0x54, 0x8b, 0xbc, 0x1b, 0x25, 0x44, 0x3d, 0x56, 0xda,
+ 0x15, 0xbd, 0xfc, 0x0c, 0xc1, 0xbc, 0x4d, 0xcd, 0x5e, 0xbd, 0x40, 0x55, 0x2c,
+ 0x3d, 0xb9, 0xe6, 0xc5, 0xbc, 0x6b, 0x0d, 0xd2, 0xba, 0xd0, 0x10, 0x28, 0x3c,
+ 0x6b, 0xd8, 0x63, 0xbd, 0xf7, 0xed, 0xca, 0x3c, 0xa3, 0x63, 0x5a, 0x3b, 0x45,
+ 0x41, 0x8e, 0x3d, 0x48, 0x23, 0xd7, 0x3c, 0x71, 0xbb, 0xa8, 0x3c, 0xe2, 0x55,
+ 0x98, 0x3c, 0x27, 0xae, 0x5e, 0xbc, 0x06, 0x79, 0xb4, 0xbb, 0x8c, 0xdb, 0x13,
+ 0xbd, 0x7b, 0x59, 0x18, 0x3d, 0xbb, 0x91, 0xfc, 0xbc, 0x4b, 0x7d, 0x80, 0xbd,
+ 0x58, 0x76, 0x8a, 0x3c, 0x5f, 0x71, 0xa8, 0x3c, 0xb3, 0x8f, 0x89, 0xbd, 0xb4,
+ 0x4c, 0x64, 0xbd, 0xf9, 0x1a, 0x81, 0x3d, 0x8f, 0xa5, 0x90, 0xbd, 0x24, 0x93,
+ 0xbf, 0x3c, 0x1c, 0x73, 0x68, 0x3d, 0xa5, 0x53, 0x4a, 0xbd, 0xec, 0x40, 0x34,
+ 0xbd, 0xb2, 0x5f, 0x90, 0x3d, 0x0d, 0xe3, 0x11, 0x3d, 0x5b, 0x77, 0x91, 0x3d,
+ 0xe4, 0x5b, 0x8b, 0x3d, 0x99, 0x6e, 0x6a, 0xbd, 0x05, 0xcb, 0x99, 0xbd, 0xb5,
+ 0x26, 0x1f, 0xbd, 0xfd, 0xc3, 0x2f, 0xbd, 0xd2, 0x82, 0x96, 0x3d, 0x06, 0xf6,
+ 0x78, 0xbd, 0x8e, 0x08, 0x30, 0x3d, 0x16, 0x22, 0x6d, 0xbd, 0xda, 0x25, 0x4b,
+ 0x3d, 0xf7, 0x44, 0x43, 0xbc, 0xba, 0x20, 0xbc, 0xbc, 0x41, 0xd7, 0x04, 0xbc,
+ 0xe1, 0x62, 0x0d, 0xbd, 0x93, 0x78, 0x2f, 0xbd, 0x2a, 0xad, 0xd5, 0xbc, 0x13,
+ 0xd3, 0x6f, 0xbd, 0x88, 0xc4, 0x12, 0xbd, 0x49, 0x73, 0x84, 0xbd, 0xd6, 0x50,
+ 0x2c, 0x3d, 0xa9, 0xb7, 0x7d, 0xbd, 0x9a, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00,
+ 0x00, 0x08, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x80, 0x04, 0x00, 0x00,
+ 0xae, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xc0,
+ 0x02, 0x74, 0xbb, 0xc6, 0x58, 0x47, 0x39, 0x07, 0x36, 0x4d, 0x3c, 0xf5, 0x20,
+ 0xc5, 0x3c, 0xce, 0x88, 0x6c, 0x3a, 0xd2, 0x40, 0x7d, 0xbc, 0x2f, 0x7e, 0xf5,
+ 0x3a, 0x3d, 0xe1, 0x3e, 0xbc, 0xda, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+ 0x40, 0x00, 0x00, 0x00, 0x1d, 0xe1, 0xa3, 0xbc, 0xe7, 0x98, 0x88, 0x3c, 0xe4,
+ 0xc0, 0x49, 0x3b, 0xa6, 0x49, 0x38, 0x3c, 0x0e, 0x65, 0xbc, 0xbc, 0xd8, 0x59,
+ 0x73, 0xbc, 0x15, 0x66, 0x0a, 0xbd, 0x7c, 0x75, 0x24, 0xba, 0x37, 0xc4, 0x65,
+ 0x3c, 0x94, 0x0d, 0x84, 0x3c, 0x26, 0xcc, 0x87, 0x3c, 0x59, 0xea, 0x03, 0xbd,
+ 0x33, 0x39, 0x48, 0xbc, 0xac, 0x3e, 0x6d, 0x3c, 0xc7, 0x46, 0xb1, 0xbb, 0xcf,
+ 0xee, 0x07, 0x3d, 0x26, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x80, 0x00,
+ 0x00, 0x00, 0x7c, 0xe9, 0x43, 0x3c, 0xd3, 0x16, 0xd7, 0xbc, 0x15, 0x37, 0x4a,
+ 0xba, 0xa4, 0xad, 0x1c, 0x3c, 0x20, 0x66, 0x3b, 0xbb, 0x22, 0x84, 0x97, 0x3a,
+ 0xa5, 0x65, 0x86, 0x3c, 0x68, 0x0b, 0xf7, 0xbb, 0x52, 0xaf, 0x8c, 0x3b, 0xe1,
+ 0x81, 0x00, 0x3d, 0x3c, 0xf9, 0xd9, 0x3c, 0x96, 0xa8, 0x80, 0x3c, 0x94, 0xdf,
+ 0x21, 0x3c, 0xc7, 0x26, 0xd7, 0x3a, 0x96, 0xb2, 0x8c, 0x3c, 0x17, 0x29, 0x20,
+ 0x3c, 0xfa, 0xe0, 0x59, 0x3c, 0xf7, 0x08, 0x14, 0x3c, 0xad, 0x71, 0x61, 0x3c,
+ 0x2e, 0x73, 0x1a, 0xbc, 0x0f, 0xd0, 0x55, 0xbb, 0xa8, 0xde, 0x68, 0x3c, 0xd9,
+ 0x86, 0x44, 0x3c, 0x54, 0x22, 0x05, 0xbc, 0x3c, 0x7a, 0x92, 0x3c, 0x70, 0x16,
+ 0x01, 0x3c, 0x69, 0x1e, 0xaf, 0xbb, 0xe8, 0x4b, 0xc5, 0xbc, 0x8b, 0xfd, 0x23,
+ 0x3c, 0xb8, 0x1e, 0xfd, 0xbc, 0x49, 0x11, 0x50, 0xbb, 0x2a, 0x7b, 0x9c, 0x3c,
+ 0xb2, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x6e,
+ 0x5f, 0x06, 0xba, 0xca, 0x9c, 0x99, 0xbb, 0x00, 0x00, 0x00, 0x00, 0xa4, 0x8a,
+ 0xfe, 0xba, 0x12, 0xed, 0xa7, 0x3c, 0xc0, 0x7d, 0x37, 0xbb, 0xa3, 0x8a, 0x30,
+ 0xbb, 0xd0, 0x95, 0x99, 0xbc, 0x00, 0x00, 0x00, 0x00, 0x81, 0x9c, 0x1c, 0x3d,
+ 0x5c, 0x2a, 0x8e, 0xbb, 0x8c, 0xc0, 0x1a, 0xbb, 0x5b, 0xa1, 0xe5, 0x3b, 0x00,
+ 0x00, 0x00, 0x00, 0x6a, 0x50, 0xef, 0x3c, 0xdc, 0xbc, 0x9a, 0x3a, 0x00, 0x00,
+ 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0x08, 0x00, 0x00, 0x00, 0x6e, 0x6b, 0xdf, 0xbb, 0x54, 0xe6, 0xe6, 0x3c,
+ 0xd0, 0xf4, 0xff, 0xff, 0xd4, 0xf4, 0xff, 0xff, 0x0f, 0x00, 0x00, 0x00, 0x4d,
+ 0x4c, 0x49, 0x52, 0x20, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x64,
+ 0x2e, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e,
+ 0x00, 0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00,
+ 0x0e, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xa0,
+ 0x02, 0x00, 0x00, 0xa4, 0x02, 0x00, 0x00, 0xa8, 0x02, 0x00, 0x00, 0x04, 0x00,
+ 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00,
+ 0x00, 0x38, 0x02, 0x00, 0x00, 0xd4, 0x01, 0x00, 0x00, 0x80, 0x01, 0x00, 0x00,
+ 0x3c, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x8c,
+ 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x5a, 0xfe,
+ 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x10, 0x00, 0x00,
+ 0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x68, 0xf5, 0xff, 0xff,
+ 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x13,
+ 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x8e, 0xfe,
+ 0xff, 0xff, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x1c, 0x00, 0x00,
+ 0x00, 0x20, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00,
+ 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x12, 0x00,
+ 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a,
+ 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00,
+ 0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x11, 0x00,
+ 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0xfe, 0xfe, 0xff, 0xff, 0x14, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x05, 0x24, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,
+ 0x01, 0x00, 0x00, 0x00, 0xee, 0xfe, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x02,
+ 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+ 0x00, 0x10, 0x00, 0x00, 0x00, 0xde, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xd0,
+ 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
+ 0x03, 0x00, 0x00, 0x00, 0x7e, 0xff, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x05, 0x24, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x00,
+ 0x00, 0x00, 0x6e, 0xff, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00,
+ 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0e,
+ 0x00, 0x00, 0x00, 0x5e, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x50, 0xff, 0xff,
+ 0xff, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x03,
+ 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x04, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x1a, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c,
+ 0x00, 0x0b, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x05, 0x34, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x17, 0x00, 0x10, 0x00,
+ 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00,
+ 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00,
+ 0x00, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x0b, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00,
+ 0x00, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x28, 0x00, 0x00, 0x00,
+ 0x2c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x14, 0x00, 0x13, 0x00, 0x0c, 0x00, 0x08,
+ 0x00, 0x07, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
+ 0x00, 0x0c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x09, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14,
+ 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00,
+ 0x00, 0x00, 0x10, 0x08, 0x00, 0x00, 0xc4, 0x07, 0x00, 0x00, 0x7c, 0x07, 0x00,
+ 0x00, 0x44, 0x07, 0x00, 0x00, 0x0c, 0x07, 0x00, 0x00, 0xd4, 0x06, 0x00, 0x00,
+ 0x88, 0x06, 0x00, 0x00, 0x2c, 0x06, 0x00, 0x00, 0xe0, 0x05, 0x00, 0x00, 0x8c,
+ 0x05, 0x00, 0x00, 0x38, 0x05, 0x00, 0x00, 0xe4, 0x04, 0x00, 0x00, 0x28, 0x04,
+ 0x00, 0x00, 0xb4, 0x03, 0x00, 0x00, 0xf8, 0x02, 0x00, 0x00, 0x84, 0x02, 0x00,
+ 0x00, 0xc8, 0x01, 0x00, 0x00, 0x54, 0x01, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00,
+ 0x5c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x58, 0xf8, 0xff, 0xff, 0x14,
+ 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x15, 0x00,
+ 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff,
+ 0xff, 0x02, 0x00, 0x00, 0x00, 0x3c, 0xf8, 0xff, 0xff, 0x19, 0x00, 0x00, 0x00,
+ 0x53, 0x74, 0x61, 0x74, 0x65, 0x66, 0x75, 0x6c, 0x50, 0x61, 0x72, 0x74, 0x69,
+ 0x74, 0x69, 0x6f, 0x6e, 0x65, 0x64, 0x43, 0x61, 0x6c, 0x6c, 0x3a, 0x30, 0x00,
+ 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00,
+ 0x00, 0xac, 0xf8, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
+ 0x1c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x02,
+ 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x90, 0xf8,
+ 0xff, 0xff, 0x5b, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74,
+ 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f,
+ 0x31, 0x36, 0x33, 0x2f, 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x3b, 0x73, 0x65,
+ 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64,
+ 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x52, 0x65, 0x6c, 0x75,
+ 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36,
+ 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x42,
+ 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00,
+ 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x40, 0xf9, 0xff, 0xff, 0x14, 0x00, 0x00,
+ 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00,
+ 0x3c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x80,
+ 0x04, 0x00, 0x00, 0x24, 0xf9, 0xff, 0xff, 0x20, 0x00, 0x00, 0x00, 0x73, 0x65,
+ 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x66,
+ 0x6c, 0x61, 0x74, 0x74, 0x65, 0x6e, 0x5f, 0x37, 0x32, 0x2f, 0x52, 0x65, 0x73,
+ 0x68, 0x61, 0x70, 0x65, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x80, 0x04, 0x00, 0x00, 0x9c, 0xf9, 0xff, 0xff, 0x14, 0x00,
+ 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00,
+ 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+ 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x88,
+ 0xf9, 0xff, 0xff, 0x27, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e,
+ 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70,
+ 0x6f, 0x6f, 0x6c, 0x69, 0x6e, 0x67, 0x32, 0x64, 0x5f, 0x31, 0x39, 0x38, 0x2f,
+ 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f, 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00,
+ 0x00, 0x00, 0x0c, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00,
+ 0x00, 0x24, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00,
+ 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x0c,
+ 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xf8, 0xf9, 0xff, 0xff, 0x6e, 0x00,
+ 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f,
+ 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33,
+ 0x2f, 0x52, 0x65, 0x6c, 0x75, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74,
+ 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64,
+ 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x42, 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b,
+ 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33,
+ 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x43,
+ 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x3b, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f,
+ 0x32, 0x34, 0x33, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,
+ 0x20, 0x00, 0x00, 0x00, 0xc4, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24,
+ 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x48, 0x00,
+ 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0e, 0x00, 0x00,
+ 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0xb0, 0xfa, 0xff, 0xff,
+ 0x27, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61,
+ 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, 0x6f, 0x6f, 0x6c,
+ 0x69, 0x6e, 0x67, 0x32, 0x64, 0x5f, 0x31, 0x39, 0x37, 0x2f, 0x4d, 0x61, 0x78,
+ 0x50, 0x6f, 0x6f, 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+ 0x0e, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x34,
+ 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00,
+ 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0xff, 0xff, 0xff, 0xff, 0x1d, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00,
+ 0x10, 0x00, 0x00, 0x00, 0x20, 0xfb, 0xff, 0xff, 0x6e, 0x00, 0x00, 0x00, 0x73,
+ 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f,
+ 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x52, 0x65,
+ 0x6c, 0x75, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c,
+ 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+ 0x32, 0x2f, 0x42, 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, 0x73, 0x65, 0x71,
+ 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f,
+ 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32, 0x2f, 0x43, 0x6f, 0x6e, 0x76,
+ 0x32, 0x44, 0x3b, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32,
+ 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00,
+ 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0xec, 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+ 0x24, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04,
+ 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00, 0x1f, 0x00,
+ 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xd8, 0xfb, 0xff, 0xff, 0x27, 0x00, 0x00,
+ 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36,
+ 0x33, 0x2f, 0x6d, 0x61, 0x78, 0x5f, 0x70, 0x6f, 0x6f, 0x6c, 0x69, 0x6e, 0x67,
+ 0x32, 0x64, 0x5f, 0x31, 0x39, 0x36, 0x2f, 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f,
+ 0x6c, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00,
+ 0x00, 0x1f, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x5c, 0xfc, 0xff, 0xff,
+ 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x0d,
+ 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff,
+ 0xff, 0xff, 0x3e, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00,
+ 0x00, 0x48, 0xfc, 0xff, 0xff, 0x6e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75,
+ 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e,
+ 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x52, 0x65, 0x6c, 0x75, 0x3b,
+ 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33,
+ 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x42,
+ 0x69, 0x61, 0x73, 0x41, 0x64, 0x64, 0x3b, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e,
+ 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32,
+ 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x3b,
+ 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x31, 0x2f, 0x62, 0x69,
+ 0x61, 0x73, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3e,
+ 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x56, 0xfd,
+ 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00,
+ 0x00, 0x2c, 0x00, 0x00, 0x00, 0xe8, 0xfc, 0xff, 0xff, 0x1f, 0x00, 0x00, 0x00,
+ 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33,
+ 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x33, 0x2f, 0x43,
+ 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+ 0xa6, 0xfd, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0b,
+ 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x38, 0xfd, 0xff, 0xff, 0x1f, 0x00,
+ 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f,
+ 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34, 0x32,
+ 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00, 0x00, 0x10,
+ 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00,
+ 0x00, 0x00, 0xf6, 0xfd, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x0a, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x88, 0xfd, 0xff, 0xff,
+ 0x1f, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61,
+ 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32,
+ 0x34, 0x31, 0x2f, 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0x08, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+ 0x01, 0x00, 0x00, 0x00, 0x46, 0xfe, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10,
+ 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0xd8, 0xfd,
+ 0xff, 0xff, 0x1e, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74,
+ 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33, 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f,
+ 0x31, 0x36, 0x34, 0x2f, 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x00, 0x00, 0x02,
+ 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x8e, 0xfe,
+ 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00,
+ 0x00, 0x2c, 0x00, 0x00, 0x00, 0x20, 0xfe, 0xff, 0xff, 0x1e, 0x00, 0x00, 0x00,
+ 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36, 0x33,
+ 0x2f, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33, 0x2f, 0x4d, 0x61,
+ 0x74, 0x4d, 0x75, 0x6c, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x80, 0x04, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x14, 0x00,
+ 0x13, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x14,
+ 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x02, 0x2c, 0x00, 0x00, 0x00, 0x7c, 0xfe, 0xff, 0xff, 0x1e, 0x00, 0x00,
+ 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x36,
+ 0x33, 0x2f, 0x66, 0x6c, 0x61, 0x74, 0x74, 0x65, 0x6e, 0x5f, 0x37, 0x32, 0x2f,
+ 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00,
+ 0x00, 0x00, 0x2e, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xc0, 0xfe, 0xff, 0xff,
+ 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+ 0x31, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00,
+ 0x00, 0x00, 0x62, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x05, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xf4, 0xfe, 0xff, 0xff,
+ 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+ 0x32, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00,
+ 0x00, 0x00, 0x96, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x04, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x28, 0xff, 0xff, 0xff,
+ 0x0f, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f, 0x32, 0x34,
+ 0x33, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00,
+ 0x00, 0x00, 0xca, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x5c, 0xff, 0xff, 0xff,
+ 0x0e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f, 0x31, 0x36, 0x33,
+ 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c,
+ 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+ 0x10, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xa0,
+ 0xff, 0xff, 0xff, 0x0e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x6e, 0x73, 0x65, 0x5f,
+ 0x31, 0x36, 0x34, 0x2f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x00, 0x01, 0x00, 0x00,
+ 0x00, 0x02, 0x00, 0x00, 0x00, 0x14, 0x00, 0x18, 0x00, 0x14, 0x00, 0x00, 0x00,
+ 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x14,
+ 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0xff, 0xff, 0xff, 0xff, 0x40, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,
+ 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x22,
+ 0x00, 0x00, 0x00, 0x73, 0x65, 0x72, 0x76, 0x69, 0x6e, 0x67, 0x5f, 0x64, 0x65,
+ 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x63, 0x6f, 0x6e, 0x76, 0x32, 0x64, 0x5f,
+ 0x32, 0x34, 0x31, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3a, 0x30, 0x00, 0x00,
+ 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x40,
+ 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x40, 0x00,
+ 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+ 0x00, 0xdc, 0xff, 0xff, 0xff, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09,
+ 0xe8, 0xff, 0xff, 0xff, 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16, 0xf4,
+ 0xff, 0xff, 0xff, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x0c, 0x00,
+ 0x0c, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03
+};
diff --git a/third_party/aom/av1/encoder/dwt.c b/third_party/aom/av1/encoder/dwt.c
new file mode 100644
index 0000000000..2fab99dd8b
--- /dev/null
+++ b/third_party/aom/av1/encoder/dwt.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/dwt.h"
+
+// Note: block length must be even for this implementation
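+//
+// Informal sketch of the lifting structure (derived from the code below):
+// this is a LeGall 5/3 analysis. In the row pass, the first loop is the
+// predict step, producing
+//   highpass[i] = x[2i+1] - ((x[2i] + x[2i+2] + 1) >> 1)
+// while staging the even samples, scaled by 2, in lowpass (the column pass
+// uses different scaling). The second loop is the update step, adding
+// (highpass[i-1] + highpass[i] + 1) >> 1 into lowpass[i], with highpass[0]
+// mirrored at the left boundary.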
+static void analysis_53_row(int length, tran_low_t *x, tran_low_t *lowpass,
+ tran_low_t *highpass) {
+ int n;
+ tran_low_t r, *a, *b;
+
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ while (--n) {
+ *a++ = (r = *x++) * 2;
+ *b++ = *x - ((r + x[1] + 1) >> 1);
+ x++;
+ }
+ *a = (r = *x++) * 2;
+ *b = *x - r;
+
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ r = *highpass;
+ while (n--) {
+ *a++ += (r + (*b) + 1) >> 1;
+ r = *b++;
+ }
+}
+
+static void analysis_53_col(int length, tran_low_t *x, tran_low_t *lowpass,
+ tran_low_t *highpass) {
+ int n;
+ tran_low_t r, *a, *b;
+
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ while (--n) {
+ *a++ = (r = *x++);
+ *b++ = (((*x) * 2) - (r + x[1]) + 2) >> 2;
+ x++;
+ }
+ *a = (r = *x++);
+ *b = (*x - r + 1) >> 1;
+
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ r = *highpass;
+ while (n--) {
+ *a++ += (r + (*b) + 1) >> 1;
+ r = *b++;
+ }
+}
+
+static void dyadic_analyze_53_uint8_input(int levels, int width, int height,
+ const uint8_t *x, int pitch_x,
+ tran_low_t *c, int pitch_c,
+ int dwt_scale_bits, int hbd) {
+ int lv, i, j, nh, nw, hh = height, hw = width;
+ tran_low_t buffer[2 * DWT_MAX_LENGTH];
+
+ if (hbd) {
+ const uint16_t *x16 = CONVERT_TO_SHORTPTR(x);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ c[i * pitch_c + j] = x16[i * pitch_x + j] << dwt_scale_bits;
+ }
+ }
+ } else {
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ c[i * pitch_c + j] = x[i * pitch_x + j] << dwt_scale_bits;
+ }
+ }
+ }
+
+ for (lv = 0; lv < levels; lv++) {
+ nh = hh;
+ hh = (hh + 1) >> 1;
+ nw = hw;
+ hw = (hw + 1) >> 1;
+ if ((nh < 2) || (nw < 2)) return;
+ for (i = 0; i < nh; i++) {
+ memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t));
+ analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
+ }
+ for (j = 0; j < nw; j++) {
+ for (i = 0; i < nh; i++) buffer[i + nh] = c[i * pitch_c + j];
+ analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
+ for (i = 0; i < nh; i++) c[i * pitch_c + j] = buffer[i];
+ }
+ }
+}
+
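+// Note: with an 8x8 input, the level loop in dyadic_analyze_53_uint8_input()
+// exits once the low band reaches 1x1, so only three of the four requested
+// decomposition levels are actually performed (8x8 -> 4x4 -> 2x2 -> 1x1).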
+void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output,
+ int stride, int hbd) {
+ dyadic_analyze_53_uint8_input(4, 8, 8, input, stride, output, 8, 2, hbd);
+}
+
+static int haar_ac_sad(const tran_low_t *output, int bw, int bh, int stride) {
+ int acsad = 0;
+
+ for (int r = 0; r < bh; ++r)
+ for (int c = 0; c < bw; ++c) {
+ if (r >= bh / 2 || c >= bw / 2) acsad += abs(output[r * stride + c]);
+ }
+ return acsad;
+}
+
+static int haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride,
+ int hbd) {
+ tran_low_t output[64];
+
+ av1_fdwt8x8_uint8_input_c(input, output, stride, hbd);
+ return haar_ac_sad(output, 8, 8, 8);
+}
+
+int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride,
+ int hbd, int num_8x8_rows,
+ int num_8x8_cols) {
+ int64_t wavelet_energy = 0;
+ for (int r8 = 0; r8 < num_8x8_rows; ++r8) {
+ for (int c8 = 0; c8 < num_8x8_cols; ++c8) {
+ wavelet_energy += haar_ac_sad_8x8_uint8_input(
+ input + c8 * 8 + r8 * 8 * stride, stride, hbd);
+ }
+ }
+ return wavelet_energy;
+}
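+
+// Usage sketch (illustrative only; `src` and `stride` are hypothetical
+// caller-side values): measuring the wavelet AC energy of a 32x32 8-bit
+// luma block, which is covered by a 4x4 grid of 8x8 tiles:
+//
+//   const uint8_t *src = ...;
+//   const int stride = ...;
+//   const int64_t energy = av1_haar_ac_sad_mxn_uint8_input(
+//       src, stride, /*hbd=*/0, /*num_8x8_rows=*/4, /*num_8x8_cols=*/4);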
diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h
new file mode 100644
index 0000000000..443b6bc12c
--- /dev/null
+++ b/third_party/aom/av1/encoder/dwt.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_DWT_H_
+#define AOM_AV1_ENCODER_DWT_H_
+
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+
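+// Upper bound on the row/column length handled by the dyadic analysis; the
+// scratch buffer in dwt.c is sized as 2 * DWT_MAX_LENGTH entries.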
+#define DWT_MAX_LENGTH 64
+
+void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output,
+ int stride, int hbd);
+
+int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride,
+ int hbd, int num_8x8_rows,
+ int num_8x8_cols);
+
+#endif // AOM_AV1_ENCODER_DWT_H_
diff --git a/third_party/aom/av1/encoder/enc_enums.h b/third_party/aom/av1/encoder/enc_enums.h
new file mode 100644
index 0000000000..20cefa16a5
--- /dev/null
+++ b/third_party/aom/av1/encoder/enc_enums.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENC_ENUMS_H_
+#define AOM_AV1_ENCODER_ENC_ENUMS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// This enumerator type needs to be kept aligned with the mode order in
+// const MODE_DEFINITION av1_mode_defs[MAX_MODES] used in the rd code.
+enum {
+ THR_NEARESTMV,
+ THR_NEARESTL2,
+ THR_NEARESTL3,
+ THR_NEARESTB,
+ THR_NEARESTA2,
+ THR_NEARESTA,
+ THR_NEARESTG,
+
+ THR_NEWMV,
+ THR_NEWL2,
+ THR_NEWL3,
+ THR_NEWB,
+ THR_NEWA2,
+ THR_NEWA,
+ THR_NEWG,
+
+ THR_NEARMV,
+ THR_NEARL2,
+ THR_NEARL3,
+ THR_NEARB,
+ THR_NEARA2,
+ THR_NEARA,
+ THR_NEARG,
+
+ THR_GLOBALMV,
+ THR_GLOBALL2,
+ THR_GLOBALL3,
+ THR_GLOBALB,
+ THR_GLOBALA2,
+ THR_GLOBALA,
+ THR_GLOBALG,
+
+ THR_COMP_NEAREST_NEARESTLA,
+ THR_COMP_NEAREST_NEARESTL2A,
+ THR_COMP_NEAREST_NEARESTL3A,
+ THR_COMP_NEAREST_NEARESTGA,
+ THR_COMP_NEAREST_NEARESTLB,
+ THR_COMP_NEAREST_NEARESTL2B,
+ THR_COMP_NEAREST_NEARESTL3B,
+ THR_COMP_NEAREST_NEARESTGB,
+ THR_COMP_NEAREST_NEARESTLA2,
+ THR_COMP_NEAREST_NEARESTL2A2,
+ THR_COMP_NEAREST_NEARESTL3A2,
+ THR_COMP_NEAREST_NEARESTGA2,
+ THR_COMP_NEAREST_NEARESTLL2,
+ THR_COMP_NEAREST_NEARESTLL3,
+ THR_COMP_NEAREST_NEARESTLG,
+ THR_COMP_NEAREST_NEARESTBA,
+
+ THR_COMP_NEAR_NEARLB,
+ THR_COMP_NEW_NEWLB,
+ THR_COMP_NEW_NEARESTLB,
+ THR_COMP_NEAREST_NEWLB,
+ THR_COMP_NEW_NEARLB,
+ THR_COMP_NEAR_NEWLB,
+ THR_COMP_GLOBAL_GLOBALLB,
+
+ THR_COMP_NEAR_NEARLA,
+ THR_COMP_NEW_NEWLA,
+ THR_COMP_NEW_NEARESTLA,
+ THR_COMP_NEAREST_NEWLA,
+ THR_COMP_NEW_NEARLA,
+ THR_COMP_NEAR_NEWLA,
+ THR_COMP_GLOBAL_GLOBALLA,
+
+ THR_COMP_NEAR_NEARL2A,
+ THR_COMP_NEW_NEWL2A,
+ THR_COMP_NEW_NEARESTL2A,
+ THR_COMP_NEAREST_NEWL2A,
+ THR_COMP_NEW_NEARL2A,
+ THR_COMP_NEAR_NEWL2A,
+ THR_COMP_GLOBAL_GLOBALL2A,
+
+ THR_COMP_NEAR_NEARL3A,
+ THR_COMP_NEW_NEWL3A,
+ THR_COMP_NEW_NEARESTL3A,
+ THR_COMP_NEAREST_NEWL3A,
+ THR_COMP_NEW_NEARL3A,
+ THR_COMP_NEAR_NEWL3A,
+ THR_COMP_GLOBAL_GLOBALL3A,
+
+ THR_COMP_NEAR_NEARGA,
+ THR_COMP_NEW_NEWGA,
+ THR_COMP_NEW_NEARESTGA,
+ THR_COMP_NEAREST_NEWGA,
+ THR_COMP_NEW_NEARGA,
+ THR_COMP_NEAR_NEWGA,
+ THR_COMP_GLOBAL_GLOBALGA,
+
+ THR_COMP_NEAR_NEARL2B,
+ THR_COMP_NEW_NEWL2B,
+ THR_COMP_NEW_NEARESTL2B,
+ THR_COMP_NEAREST_NEWL2B,
+ THR_COMP_NEW_NEARL2B,
+ THR_COMP_NEAR_NEWL2B,
+ THR_COMP_GLOBAL_GLOBALL2B,
+
+ THR_COMP_NEAR_NEARL3B,
+ THR_COMP_NEW_NEWL3B,
+ THR_COMP_NEW_NEARESTL3B,
+ THR_COMP_NEAREST_NEWL3B,
+ THR_COMP_NEW_NEARL3B,
+ THR_COMP_NEAR_NEWL3B,
+ THR_COMP_GLOBAL_GLOBALL3B,
+
+ THR_COMP_NEAR_NEARGB,
+ THR_COMP_NEW_NEWGB,
+ THR_COMP_NEW_NEARESTGB,
+ THR_COMP_NEAREST_NEWGB,
+ THR_COMP_NEW_NEARGB,
+ THR_COMP_NEAR_NEWGB,
+ THR_COMP_GLOBAL_GLOBALGB,
+
+ THR_COMP_NEAR_NEARLA2,
+ THR_COMP_NEW_NEWLA2,
+ THR_COMP_NEW_NEARESTLA2,
+ THR_COMP_NEAREST_NEWLA2,
+ THR_COMP_NEW_NEARLA2,
+ THR_COMP_NEAR_NEWLA2,
+ THR_COMP_GLOBAL_GLOBALLA2,
+
+ THR_COMP_NEAR_NEARL2A2,
+ THR_COMP_NEW_NEWL2A2,
+ THR_COMP_NEW_NEARESTL2A2,
+ THR_COMP_NEAREST_NEWL2A2,
+ THR_COMP_NEW_NEARL2A2,
+ THR_COMP_NEAR_NEWL2A2,
+ THR_COMP_GLOBAL_GLOBALL2A2,
+
+ THR_COMP_NEAR_NEARL3A2,
+ THR_COMP_NEW_NEWL3A2,
+ THR_COMP_NEW_NEARESTL3A2,
+ THR_COMP_NEAREST_NEWL3A2,
+ THR_COMP_NEW_NEARL3A2,
+ THR_COMP_NEAR_NEWL3A2,
+ THR_COMP_GLOBAL_GLOBALL3A2,
+
+ THR_COMP_NEAR_NEARGA2,
+ THR_COMP_NEW_NEWGA2,
+ THR_COMP_NEW_NEARESTGA2,
+ THR_COMP_NEAREST_NEWGA2,
+ THR_COMP_NEW_NEARGA2,
+ THR_COMP_NEAR_NEWGA2,
+ THR_COMP_GLOBAL_GLOBALGA2,
+
+ THR_COMP_NEAR_NEARLL2,
+ THR_COMP_NEW_NEWLL2,
+ THR_COMP_NEW_NEARESTLL2,
+ THR_COMP_NEAREST_NEWLL2,
+ THR_COMP_NEW_NEARLL2,
+ THR_COMP_NEAR_NEWLL2,
+ THR_COMP_GLOBAL_GLOBALLL2,
+
+ THR_COMP_NEAR_NEARLL3,
+ THR_COMP_NEW_NEWLL3,
+ THR_COMP_NEW_NEARESTLL3,
+ THR_COMP_NEAREST_NEWLL3,
+ THR_COMP_NEW_NEARLL3,
+ THR_COMP_NEAR_NEWLL3,
+ THR_COMP_GLOBAL_GLOBALLL3,
+
+ THR_COMP_NEAR_NEARLG,
+ THR_COMP_NEW_NEWLG,
+ THR_COMP_NEW_NEARESTLG,
+ THR_COMP_NEAREST_NEWLG,
+ THR_COMP_NEW_NEARLG,
+ THR_COMP_NEAR_NEWLG,
+ THR_COMP_GLOBAL_GLOBALLG,
+
+ THR_COMP_NEAR_NEARBA,
+ THR_COMP_NEW_NEWBA,
+ THR_COMP_NEW_NEARESTBA,
+ THR_COMP_NEAREST_NEWBA,
+ THR_COMP_NEW_NEARBA,
+ THR_COMP_NEAR_NEWBA,
+ THR_COMP_GLOBAL_GLOBALBA,
+
+ THR_DC,
+ THR_PAETH,
+ THR_SMOOTH,
+ THR_SMOOTH_V,
+ THR_SMOOTH_H,
+ THR_H_PRED,
+ THR_V_PRED,
+ THR_D135_PRED,
+ THR_D203_PRED,
+ THR_D157_PRED,
+ THR_D67_PRED,
+ THR_D113_PRED,
+ THR_D45_PRED,
+
+ MAX_MODES,
+ SINGLE_REF_MODE_START = THR_NEARESTMV,
+ SINGLE_REF_MODE_END = THR_COMP_NEAREST_NEARESTLA,
+ NUM_SINGLE_REF_MODES = SINGLE_REF_MODE_END - SINGLE_REF_MODE_START,
+ THR_MODE_START = THR_NEARESTMV,
+ THR_MODE_END = MAX_MODES,
+ THR_INTER_MODE_START = THR_MODE_START,
+ THR_INTER_MODE_END = THR_DC,
+ THR_INVALID = 255
+} UENUM1BYTE(THR_MODES);
+
+enum {
+ THR_LAST,
+ THR_LAST2,
+ THR_LAST3,
+ THR_BWDR,
+ THR_ALTR2,
+ THR_GOLD,
+ THR_ALTR,
+
+ THR_COMP_LA,
+ THR_COMP_L2A,
+ THR_COMP_L3A,
+ THR_COMP_GA,
+
+ THR_COMP_LB,
+ THR_COMP_L2B,
+ THR_COMP_L3B,
+ THR_COMP_GB,
+
+ THR_COMP_LA2,
+ THR_COMP_L2A2,
+ THR_COMP_L3A2,
+ THR_COMP_GA2,
+
+ THR_INTRA,
+
+ MAX_REFS
+} UENUM1BYTE(THR_MODES_SUB8X8);
+
+enum {
+ FULL_TXFM_RD,
+ LOW_TXFM_RD,
+} UENUM1BYTE(TXFM_RD_MODEL);
+
+enum {
+ USE_FULL_RD = 0,
+ USE_FAST_RD,
+ USE_LARGESTALL,
+} UENUM1BYTE(TX_SIZE_SEARCH_METHOD);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENC_ENUMS_H_
diff --git a/third_party/aom/av1/encoder/encode_strategy.c b/third_party/aom/av1/encoder/encode_strategy.c
new file mode 100644
index 0000000000..35ca83c3f4
--- /dev/null
+++ b/third_party/aom/av1/encoder/encode_strategy.c
@@ -0,0 +1,1767 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "av1/common/blockd.h"
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/temporal_filter.h"
+#if CONFIG_THREE_PASS
+#include "av1/encoder/thirdpass.h"
+#endif // CONFIG_THREE_PASS
+#include "av1/encoder/tpl_model.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+#define TEMPORAL_FILTER_KEY_FRAME (CONFIG_REALTIME_ONLY ? 0 : 1)
+
+static INLINE void set_refresh_frame_flags(
+ RefreshFrameInfo *const refresh_frame, bool refresh_gf, bool refresh_bwdref,
+ bool refresh_arf) {
+ refresh_frame->golden_frame = refresh_gf;
+ refresh_frame->bwd_ref_frame = refresh_bwdref;
+ refresh_frame->alt_ref_frame = refresh_arf;
+}
+
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+ RefreshFrameInfo *const refresh_frame,
+ const FRAME_UPDATE_TYPE type,
+ const REFBUF_STATE refbuf_state,
+ int force_refresh_all) {
+ // NOTE(weitinglin): Should we define another function to take care of
+ // cpi->rc.is_$Source_Type so that this function behaves as described in
+ // its comment?
+ const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+ &cpi->ext_flags.refresh_frame;
+ cpi->rc.is_src_frame_alt_ref = 0;
+
+ switch (type) {
+ case KF_UPDATE:
+ set_refresh_frame_flags(refresh_frame, true, true, true);
+ break;
+
+ case LF_UPDATE:
+ set_refresh_frame_flags(refresh_frame, false, false, false);
+ break;
+
+ case GF_UPDATE:
+ set_refresh_frame_flags(refresh_frame, true, false, false);
+ break;
+
+ case OVERLAY_UPDATE:
+ if (refbuf_state == REFBUF_RESET)
+ set_refresh_frame_flags(refresh_frame, true, true, true);
+ else
+ set_refresh_frame_flags(refresh_frame, true, false, false);
+
+ cpi->rc.is_src_frame_alt_ref = 1;
+ break;
+
+ case ARF_UPDATE:
+ // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
+ if (refbuf_state == REFBUF_RESET)
+ set_refresh_frame_flags(refresh_frame, true, true, true);
+ else
+ set_refresh_frame_flags(refresh_frame, false, false, true);
+
+ break;
+
+ case INTNL_OVERLAY_UPDATE:
+ set_refresh_frame_flags(refresh_frame, false, false, false);
+ cpi->rc.is_src_frame_alt_ref = 1;
+ break;
+
+ case INTNL_ARF_UPDATE:
+ set_refresh_frame_flags(refresh_frame, false, true, false);
+ break;
+
+ default: assert(0); break;
+ }
+
+ if (ext_refresh_frame_flags->update_pending &&
+ (!is_stat_generation_stage(cpi))) {
+ set_refresh_frame_flags(refresh_frame,
+ ext_refresh_frame_flags->golden_frame,
+ ext_refresh_frame_flags->bwd_ref_frame,
+ ext_refresh_frame_flags->alt_ref_frame);
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if (ext_refresh_frame_flags->golden_frame)
+ gf_group->update_type[cpi->gf_frame_index] = GF_UPDATE;
+ if (ext_refresh_frame_flags->alt_ref_frame)
+ gf_group->update_type[cpi->gf_frame_index] = ARF_UPDATE;
+ if (ext_refresh_frame_flags->bwd_ref_frame)
+ gf_group->update_type[cpi->gf_frame_index] = INTNL_ARF_UPDATE;
+ }
+
+ if (force_refresh_all)
+ set_refresh_frame_flags(refresh_frame, true, true, true);
+}
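+
+// For quick reference, the switch above sets the (golden, bwdref, altref)
+// refresh triple per update type as follows:
+//   KF_UPDATE (1,1,1); LF_UPDATE (0,0,0); GF_UPDATE (1,0,0);
+//   OVERLAY_UPDATE (1,0,0), or (1,1,1) on REFBUF_RESET;
+//   ARF_UPDATE (0,0,1), or (1,1,1) on REFBUF_RESET;
+//   INTNL_OVERLAY_UPDATE (0,0,0); INTNL_ARF_UPDATE (0,1,0).
+// External refresh overrides and force_refresh_all may then rewrite these.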
+
+static void set_additional_frame_flags(const AV1_COMMON *const cm,
+ unsigned int *const frame_flags) {
+ if (frame_is_intra_only(cm)) {
+ *frame_flags |= FRAMEFLAGS_INTRAONLY;
+ }
+ if (frame_is_sframe(cm)) {
+ *frame_flags |= FRAMEFLAGS_SWITCH;
+ }
+ if (cm->features.error_resilient_mode) {
+ *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT;
+ }
+}
+
+static void set_ext_overrides(AV1_COMMON *const cm,
+ EncodeFrameParams *const frame_params,
+ ExternalFlags *const ext_flags) {
+ // Overrides the defaults with the values supplied externally via the
+ // av1_update_reference() and av1_update_entropy() calls.
+ // Note: The overrides are valid only for the next frame passed
+ // to av1_encode_lowlevel()
+
+ if (ext_flags->use_s_frame) {
+ frame_params->frame_type = S_FRAME;
+ }
+
+ if (ext_flags->refresh_frame_context_pending) {
+ cm->features.refresh_frame_context = ext_flags->refresh_frame_context;
+ ext_flags->refresh_frame_context_pending = 0;
+ }
+ cm->features.allow_ref_frame_mvs = ext_flags->use_ref_frame_mvs;
+
+ frame_params->error_resilient_mode = ext_flags->use_error_resilient;
+ // A keyframe is already error resilient, and keyframes with
+ // error_resilient_mode interfere with the use of show_existing_frame
+ // when forward reference keyframes are enabled.
+ frame_params->error_resilient_mode &= frame_params->frame_type != KEY_FRAME;
+ // For bitstream conformance, s-frames must be error-resilient
+ frame_params->error_resilient_mode |= frame_params->frame_type == S_FRAME;
+}
+
+static int choose_primary_ref_frame(
+ AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ const int intra_only = frame_params->frame_type == KEY_FRAME ||
+ frame_params->frame_type == INTRA_ONLY_FRAME;
+ if (intra_only || frame_params->error_resilient_mode ||
+ cpi->ext_flags.use_primary_ref_none) {
+ return PRIMARY_REF_NONE;
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->use_ducky_encode) {
+ int wanted_fb = cpi->ppi->gf_group.primary_ref_idx[cpi->gf_frame_index];
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb)
+ return ref_frame - LAST_FRAME;
+ }
+
+ return PRIMARY_REF_NONE;
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ // In large scale case, always use Last frame's frame contexts.
+ // Note(yunqing): In other cases, primary_ref_frame is chosen based on
+ // cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index], which also controls
+ // frame bit allocation.
+ if (cm->tiles.large_scale) return (LAST_FRAME - LAST_FRAME);
+
+ if (cpi->ppi->use_svc || cpi->ppi->rtc_ref.set_ref_frame_config)
+ return av1_svc_primary_ref_frame(cpi);
+
+ // Find the most recent reference frame with the same reference type as the
+ // current frame
+ const int current_ref_type = get_current_frame_ref_type(cpi);
+ int wanted_fb = cpi->ppi->fb_of_context_type[current_ref_type];
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ if (gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+ int frame_level = gf_group->frame_parallel_level[cpi->gf_frame_index];
+ // Bookkeep the wanted_fb of the frame_parallel_level 1 frame in an FP2
+ // set.
+ if (frame_level == 1) {
+ cpi->wanted_fb = wanted_fb;
+ }
+ // Use the wanted_fb of the level 1 frame in an FP2 set for a level 2
+ // frame in the same set.
+ if (frame_level == 2 &&
+ gf_group->update_type[cpi->gf_frame_index - 1] == INTNL_ARF_UPDATE) {
+ assert(gf_group->frame_parallel_level[cpi->gf_frame_index - 1] == 1);
+ wanted_fb = cpi->wanted_fb;
+ }
+ }
+ }
+#endif // CONFIG_FPMT_TEST
+ int primary_ref_frame = PRIMARY_REF_NONE;
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) {
+ primary_ref_frame = ref_frame - LAST_FRAME;
+ }
+ }
+
+ return primary_ref_frame;
+}
+
+static void adjust_frame_rate(AV1_COMP *cpi, int64_t ts_start, int64_t ts_end) {
+ TimeStamps *time_stamps = &cpi->time_stamps;
+ int64_t this_duration;
+ int step = 0;
+
+ if (cpi->ppi->use_svc && cpi->ppi->rtc_ref.set_ref_frame_config &&
+ cpi->svc.number_spatial_layers > 1) {
+ // ts_start is the timestamp for the current frame and ts_end is the
+ // expected next timestamp given the duration passed into codec_encode().
+ // See the setting in encoder_encode() in av1_cx_iface.c:
+ // ts_start = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol),
+ // ts_end = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol +
+ // duration). So the difference ts_end - ts_start is the duration passed
+ // in by the user. For spatial layers SVC set the framerate based directly
+ // on the duration, and bypass the adjustments below.
+ this_duration = ts_end - ts_start;
+ if (this_duration > 0) {
+ cpi->new_framerate = 10000000.0 / this_duration;
+ av1_new_framerate(cpi, cpi->new_framerate);
+ time_stamps->prev_ts_start = ts_start;
+ time_stamps->prev_ts_end = ts_end;
+ return;
+ }
+ }
+
+ if (ts_start == time_stamps->first_ts_start) {
+ this_duration = ts_end - ts_start;
+ step = 1;
+ } else {
+ int64_t last_duration =
+ time_stamps->prev_ts_end - time_stamps->prev_ts_start;
+
+ this_duration = ts_end - time_stamps->prev_ts_end;
+
+ // do a step update if the duration changes by 10%
+ if (last_duration)
+ step = (int)((this_duration - last_duration) * 10 / last_duration);
+ }
+
+ if (this_duration) {
+ if (step) {
+ cpi->new_framerate = 10000000.0 / this_duration;
+ av1_new_framerate(cpi, cpi->new_framerate);
+ } else {
+ // Average this frame's rate into the last second's average
+ // frame rate. If we haven't seen 1 second yet, then average
+ // over the whole interval seen.
+ const double interval =
+ AOMMIN((double)(ts_end - time_stamps->first_ts_start), 10000000.0);
+ double avg_duration = 10000000.0 / cpi->framerate;
+ avg_duration *= (interval - avg_duration + this_duration);
+ avg_duration /= interval;
+ cpi->new_framerate = (10000000.0 / avg_duration);
+ // For parallel frames update cpi->framerate with new_framerate
+ // during av1_post_encode_updates()
+ double framerate =
+ (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+ ? cpi->framerate
+ : cpi->new_framerate;
+ av1_new_framerate(cpi, framerate);
+ }
+ }
+
+ time_stamps->prev_ts_start = ts_start;
+ time_stamps->prev_ts_end = ts_end;
+}
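+
+// Worked example for the averaging branch above (illustrative numbers, in
+// 10 MHz timestamp ticks): with cpi->framerate == 50.0, avg_duration starts
+// at 200000. If interval == 10000000 and this_duration == 250000 (one
+// 40 fps frame), then
+//   avg_duration = 200000 * (10000000 - 200000 + 250000) / 10000000 = 201000
+// and new_framerate = 10000000 / 201000 ~= 49.75, a gentle nudge toward the
+// observed rate rather than a step change.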
+
+// Determine whether there is a forced keyframe pending in the lookahead buffer
+int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+ const int up_to_index,
+ const COMPRESSOR_STAGE compressor_stage) {
+ for (int i = 0; i <= up_to_index; i++) {
+ const struct lookahead_entry *e =
+ av1_lookahead_peek(lookahead, i, compressor_stage);
+ if (e == NULL) {
+ // We have reached the end of the lookahead buffer and not early-returned
+ // so there isn't a forced key-frame pending.
+ return -1;
+ } else if (e->flags == AOM_EFLAG_FORCE_KF) {
+ return i;
+ }
+ }
+ return -1; // No forced key frame found up to up_to_index.
+}
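+
+// Example (hypothetical lookahead state): if entries 0 and 1 carry no flags
+// and entry 2 has flags equal to AOM_EFLAG_FORCE_KF, then
+// is_forced_keyframe_pending(lookahead, 4, stage) returns 2. It returns -1
+// when no entry up to up_to_index is a forced key frame.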
+
+// Check if we should encode an ARF or internal ARF. If not, try a LAST
+// Do some setup associated with the chosen source
+// temporal_filtered, flush, and frame_update_type are outputs.
+// Return the frame source, or NULL if we couldn't find one
+static struct lookahead_entry *choose_frame_source(
+ AV1_COMP *const cpi, int *const flush, int *pop_lookahead,
+ struct lookahead_entry **last_source, int *const show_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ struct lookahead_entry *source = NULL;
+
+ // Source index in lookahead buffer.
+ int src_index = gf_group->arf_src_offset[cpi->gf_frame_index];
+
+ // TODO(Aasaipriya): Forced key frames need to be fixed when rc_mode != AOM_Q
+ if (src_index &&
+ (is_forced_keyframe_pending(cpi->ppi->lookahead, src_index,
+ cpi->compressor_stage) != -1) &&
+ cpi->oxcf.rc_cfg.mode != AOM_Q && !is_stat_generation_stage(cpi)) {
+ src_index = 0;
+ *flush = 1;
+ }
+
+ // If the current frame is an arf, then we should not pop it from the
+ // lookahead buffer; if it is not an arf, pop it. This assumes the first
+ // frame in the GF group is not an arf, and may need to change if that
+ // assumption stops holding.
+ *pop_lookahead = (src_index == 0);
+ // If this is a key frame and keyframe filtering is enabled with overlay,
+ // then do not pop.
+ if (*pop_lookahead && cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1 &&
+ gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE &&
+ !is_stat_generation_stage(cpi) && cpi->ppi->lookahead) {
+ if (cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].sz &&
+ (*flush ||
+ cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].sz ==
+ cpi->ppi->lookahead->read_ctxs[cpi->compressor_stage].pop_sz)) {
+ *pop_lookahead = 0;
+ }
+ }
+
+ // The LAP stage does not have ARFs or forward key-frames; hence, always
+ // set pop_lookahead here.
+ if (is_stat_generation_stage(cpi)) {
+ *pop_lookahead = 1;
+ src_index = 0;
+ }
+
+ *show_frame = *pop_lookahead;
+
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_ENCODE) {
+#else
+ {
+#endif // CONFIG_FPMT_TEST
+ // Future frame in parallel encode set
+ if (gf_group->src_offset[cpi->gf_frame_index] != 0 &&
+ !is_stat_generation_stage(cpi))
+ src_index = gf_group->src_offset[cpi->gf_frame_index];
+ }
+ if (*show_frame) {
+ // show frame, pop from buffer
+ // Get last frame source.
+ if (cm->current_frame.frame_number > 0) {
+ *last_source = av1_lookahead_peek(cpi->ppi->lookahead, src_index - 1,
+ cpi->compressor_stage);
+ }
+ // Read in the source frame.
+ source = av1_lookahead_peek(cpi->ppi->lookahead, src_index,
+ cpi->compressor_stage);
+ } else {
+ // No-show frames are arf frames.
+ source = av1_lookahead_peek(cpi->ppi->lookahead, src_index,
+ cpi->compressor_stage);
+ if (source != NULL) {
+ cm->showable_frame = 1;
+ }
+ }
+ return source;
+}
+
+// Don't allow a show_existing_frame to coincide with an error resilient or
+// S-Frame. An exception can be made in the case of a keyframe, since it does
+// not depend on any previous frames.
+static int allow_show_existing(const AV1_COMP *const cpi,
+ unsigned int frame_flags) {
+ if (cpi->common.current_frame.frame_number == 0) return 0;
+
+ const struct lookahead_entry *lookahead_src =
+ av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage);
+ if (lookahead_src == NULL) return 1;
+
+ const int is_error_resilient =
+ cpi->oxcf.tool_cfg.error_resilient_mode ||
+ (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT);
+ const int is_s_frame = cpi->oxcf.kf_cfg.enable_sframe ||
+ (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME);
+ const int is_key_frame =
+ (cpi->rc.frames_to_key == 0) || (frame_flags & FRAMEFLAGS_KEY);
+ return !(is_error_resilient || is_s_frame) || is_key_frame;
+}
+
+// Update frame_flags to tell the encoder's caller what sort of frame was
+// encoded.
+static void update_frame_flags(const AV1_COMMON *const cm,
+ const RefreshFrameInfo *const refresh_frame,
+ unsigned int *frame_flags) {
+ if (encode_show_existing_frame(cm)) {
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_GOLDEN;
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_BWDREF;
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_ALTREF;
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_KEY;
+ return;
+ }
+
+ if (refresh_frame->golden_frame) {
+ *frame_flags |= FRAMEFLAGS_GOLDEN;
+ } else {
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_GOLDEN;
+ }
+
+ if (refresh_frame->alt_ref_frame) {
+ *frame_flags |= FRAMEFLAGS_ALTREF;
+ } else {
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_ALTREF;
+ }
+
+ if (refresh_frame->bwd_ref_frame) {
+ *frame_flags |= FRAMEFLAGS_BWDREF;
+ } else {
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_BWDREF;
+ }
+
+ if (cm->current_frame.frame_type == KEY_FRAME) {
+ *frame_flags |= FRAMEFLAGS_KEY;
+ } else {
+ *frame_flags &= ~(uint32_t)FRAMEFLAGS_KEY;
+ }
+}
+
+#define DUMP_REF_FRAME_IMAGES 0
+
+#if DUMP_REF_FRAME_IMAGES == 1
+static int dump_one_image(AV1_COMMON *cm,
+ const YV12_BUFFER_CONFIG *const ref_buf,
+ char *file_name) {
+ int h;
+ FILE *f_ref = NULL;
+
+ if (ref_buf == NULL) {
+ printf("Frame data buffer is NULL.\n");
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ if ((f_ref = fopen(file_name, "wb")) == NULL) {
+ printf("Unable to open file %s to write.\n", file_name);
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ // --- Y ---
+ for (h = 0; h < cm->height; ++h) {
+ fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref);
+ }
+ // --- U ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
+ f_ref);
+ }
+ // --- V ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
+ f_ref);
+ }
+
+ fclose(f_ref);
+
+ return AOM_CODEC_OK;
+}
+
+static void dump_ref_frame_images(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MV_REFERENCE_FRAME ref_frame;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ char file_name[256] = "";
+ snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv",
+ cm->current_frame.frame_number, ref_frame);
+ dump_one_image(cm, get_ref_frame_yv12_buf(cpi, ref_frame), file_name);
+ }
+}
+#endif // DUMP_REF_FRAME_IMAGES == 1
+
+int av1_get_refresh_ref_frame_map(int refresh_frame_flags) {
+ int ref_map_index;
+
+ for (ref_map_index = 0; ref_map_index < REF_FRAMES; ++ref_map_index)
+ if ((refresh_frame_flags >> ref_map_index) & 1) break;
+
+ if (ref_map_index == REF_FRAMES) ref_map_index = INVALID_IDX;
+ return ref_map_index;
+}
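+
+// Example: refresh_frame_flags == 0x28 (slots 3 and 5 set) returns 3, the
+// lowest set bit; a zero mask returns INVALID_IDX.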
+
+static int get_free_ref_map_index(RefFrameMapPair ref_map_pairs[REF_FRAMES]) {
+ for (int idx = 0; idx < REF_FRAMES; ++idx)
+ if (ref_map_pairs[idx].disp_order == -1) return idx;
+ return INVALID_IDX;
+}
+
+static int get_refresh_idx(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+ int update_arf, GF_GROUP *gf_group, int gf_index,
+ int enable_refresh_skip, int cur_frame_disp) {
+ int arf_count = 0;
+ int oldest_arf_order = INT32_MAX;
+ int oldest_arf_idx = -1;
+
+ int oldest_frame_order = INT32_MAX;
+ int oldest_idx = -1;
+
+ for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) {
+ RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx];
+ if (ref_pair.disp_order == -1) continue;
+ const int frame_order = ref_pair.disp_order;
+ const int reference_frame_level = ref_pair.pyr_level;
+ // Keep future frames and three closest previous frames in output order.
+ if (frame_order > cur_frame_disp - 3) continue;
+
+ if (enable_refresh_skip) {
+ int skip_frame = 0;
+ // Prevent refreshing a frame in gf_group->skip_frame_refresh.
+ for (int i = 0; i < REF_FRAMES; i++) {
+ int frame_to_skip = gf_group->skip_frame_refresh[gf_index][i];
+ if (frame_to_skip == INVALID_IDX) break;
+ if (frame_order == frame_to_skip) {
+ skip_frame = 1;
+ break;
+ }
+ }
+ if (skip_frame) continue;
+ }
+
+ // Keep track of the oldest level 1 frame if the current frame is also level
+ // 1.
+ if (reference_frame_level == 1) {
+ // If there are more than 2 level 1 frames in the reference list,
+ // discard the oldest.
+ if (frame_order < oldest_arf_order) {
+ oldest_arf_order = frame_order;
+ oldest_arf_idx = map_idx;
+ }
+ arf_count++;
+ continue;
+ }
+
+ // Update the overall oldest reference frame.
+ if (frame_order < oldest_frame_order) {
+ oldest_frame_order = frame_order;
+ oldest_idx = map_idx;
+ }
+ }
+ if (update_arf && arf_count > 2) return oldest_arf_idx;
+ if (oldest_idx >= 0) return oldest_idx;
+ if (oldest_arf_idx >= 0) return oldest_arf_idx;
+ if (oldest_idx == -1) {
+ assert(arf_count > 2 && enable_refresh_skip);
+ return oldest_arf_idx;
+ }
+ assert(0 && "No valid refresh index found");
+ return -1;
+}
+
+// Computes the reference refresh index for INTNL_ARF_UPDATE frame.
+int av1_calc_refresh_idx_for_intnl_arf(
+ AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+ int gf_index) {
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+
+ // Search for an open slot to store the current frame.
+ int free_fb_index = get_free_ref_map_index(ref_frame_map_pairs);
+
+ // Use a free slot if available.
+ if (free_fb_index != INVALID_IDX) {
+ return free_fb_index;
+ } else {
+ int enable_refresh_skip = !is_one_pass_rt_params(cpi);
+ int refresh_idx =
+ get_refresh_idx(ref_frame_map_pairs, 0, gf_group, gf_index,
+ enable_refresh_skip, gf_group->display_idx[gf_index]);
+ return refresh_idx;
+ }
+}
+
+int av1_get_refresh_frame_flags(
+ const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params,
+ FRAME_UPDATE_TYPE frame_update_type, int gf_index, int cur_disp_order,
+ RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+ &cpi->ext_flags.refresh_frame;
+
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if (gf_group->refbuf_state[gf_index] == REFBUF_RESET)
+ return SELECT_ALL_BUF_SLOTS;
+
+ // TODO(jingning): Deprecate the following operations.
+ // Switch frames and shown key-frames overwrite all reference slots
+ if (frame_params->frame_type == S_FRAME) return SELECT_ALL_BUF_SLOTS;
+
+ // show_existing_frames don't actually send refresh_frame_flags so set the
+ // flags to 0 to keep things consistent.
+ if (frame_params->show_existing_frame) return 0;
+
+ const RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ if (is_frame_droppable(rtc_ref, ext_refresh_frame_flags)) return 0;
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->use_ducky_encode &&
+ cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) {
+ int new_fb_map_idx = cpi->ppi->gf_group.update_ref_idx[gf_index];
+ if (new_fb_map_idx == INVALID_IDX) return 0;
+ return 1 << new_fb_map_idx;
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ int refresh_mask = 0;
+ if (ext_refresh_frame_flags->update_pending) {
+ if (rtc_ref->set_ref_frame_config ||
+ use_rtc_reference_structure_one_layer(cpi)) {
+ for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ int ref_frame_map_idx = rtc_ref->ref_idx[i];
+ refresh_mask |= rtc_ref->refresh[ref_frame_map_idx]
+ << ref_frame_map_idx;
+ }
+ return refresh_mask;
+ }
+ // Unfortunately the encoder interface reflects the old refresh_*_frame
+ // flags so we have to replicate the old refresh_frame_flags logic here in
+ // order to preserve the behaviour of the flag overrides.
+ int ref_frame_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
+ if (ref_frame_map_idx != INVALID_IDX)
+ refresh_mask |= ext_refresh_frame_flags->last_frame << ref_frame_map_idx;
+
+ ref_frame_map_idx = get_ref_frame_map_idx(cm, EXTREF_FRAME);
+ if (ref_frame_map_idx != INVALID_IDX)
+ refresh_mask |= ext_refresh_frame_flags->bwd_ref_frame
+ << ref_frame_map_idx;
+
+ ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF2_FRAME);
+ if (ref_frame_map_idx != INVALID_IDX)
+ refresh_mask |= ext_refresh_frame_flags->alt2_ref_frame
+ << ref_frame_map_idx;
+
+ if (frame_update_type == OVERLAY_UPDATE) {
+ ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME);
+ if (ref_frame_map_idx != INVALID_IDX)
+ refresh_mask |= ext_refresh_frame_flags->golden_frame
+ << ref_frame_map_idx;
+ } else {
+ ref_frame_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+ if (ref_frame_map_idx != INVALID_IDX)
+ refresh_mask |= ext_refresh_frame_flags->golden_frame
+ << ref_frame_map_idx;
+
+ ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME);
+ if (ref_frame_map_idx != INVALID_IDX)
+ refresh_mask |= ext_refresh_frame_flags->alt_ref_frame
+ << ref_frame_map_idx;
+ }
+ return refresh_mask;
+ }
+
+ // Search for an open slot to store the current frame.
+ int free_fb_index = get_free_ref_map_index(ref_frame_map_pairs);
+
+ // No refresh necessary for these frame types.
+ if (frame_update_type == OVERLAY_UPDATE ||
+ frame_update_type == INTNL_OVERLAY_UPDATE)
+ return refresh_mask;
+
+ // If there is an open slot, refresh that one instead of replacing a
+ // reference.
+ if (free_fb_index != INVALID_IDX) {
+ refresh_mask = 1 << free_fb_index;
+ return refresh_mask;
+ }
+ const int enable_refresh_skip = !is_one_pass_rt_params(cpi);
+ const int update_arf = frame_update_type == ARF_UPDATE;
+ const int refresh_idx =
+ get_refresh_idx(ref_frame_map_pairs, update_arf, &cpi->ppi->gf_group,
+ gf_index, enable_refresh_skip, cur_disp_order);
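+ // The returned value is a one-hot mask over the eight reference slots.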
+ return 1 << refresh_idx;
+}
+
+#if !CONFIG_REALTIME_ONLY
+void setup_mi(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *src) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params->sb_size);
+
+ av1_setup_block_planes(xd, cm->seq_params->subsampling_x,
+ cm->seq_params->subsampling_y, num_planes);
+
+ set_mi_offsets(&cm->mi_params, xd, 0, 0);
+}
+
+// Apply temporal filtering to source frames and encode the filtered frame.
+// If the current frame does not require filtering, this function is identical
+// to av1_encode() except that tpl is not performed.
+static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest,
+ EncodeFrameInput *const frame_input,
+ const EncodeFrameParams *const frame_params,
+ EncodeFrameResults *const frame_results) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (cpi->oxcf.pass == 2) start_timing(cpi, denoise_and_encode_time);
+#endif
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ AV1_COMMON *const cm = &cpi->common;
+
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ const int is_second_arf =
+ av1_gop_is_second_arf(gf_group, cpi->gf_frame_index);
+
+ // Decide whether to apply temporal filtering to the source frame.
+ int apply_filtering =
+ av1_is_temporal_filter_on(oxcf) && !is_stat_generation_stage(cpi);
+ if (update_type != KF_UPDATE && update_type != ARF_UPDATE && !is_second_arf) {
+ apply_filtering = 0;
+ }
+ if (apply_filtering) {
+ if (frame_params->frame_type == KEY_FRAME) {
+ // TODO(angiebird): Move the noise level check to av1_tf_info_filtering.
+ // Decide whether it is allowed to perform key frame filtering
+ int allow_kf_filtering = oxcf->kf_cfg.enable_keyframe_filtering &&
+ !frame_params->show_existing_frame &&
+ !is_lossless_requested(&oxcf->rc_cfg);
+ if (allow_kf_filtering) {
+ double y_noise_level = 0.0;
+ av1_estimate_noise_level(
+ frame_input->source, &y_noise_level, AOM_PLANE_Y, AOM_PLANE_Y,
+ cm->seq_params->bit_depth, NOISE_ESTIMATION_EDGE_THRESHOLD);
+ apply_filtering = y_noise_level > 0;
+ } else {
+ apply_filtering = 0;
+ }
+ // If we are doing kf filtering, set up a few things.
+ if (apply_filtering) {
+ av1_setup_past_independence(cm);
+ }
+ } else if (is_second_arf) {
+ apply_filtering = cpi->sf.hl_sf.second_alt_ref_filtering;
+ }
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (cpi->oxcf.pass == 2) start_timing(cpi, apply_filtering_time);
+#endif
+ // Save the pointer to the original source image.
+ YV12_BUFFER_CONFIG *source_buffer = frame_input->source;
+ // Apply filtering to the frame.
+ if (apply_filtering) {
+ int show_existing_alt_ref = 0;
+ FRAME_DIFF frame_diff;
+ int top_index = 0;
+ int bottom_index = 0;
+ const int q_index = av1_rc_pick_q_and_bounds(
+ cpi, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height,
+ cpi->gf_frame_index, &bottom_index, &top_index);
+
+ // TODO(bohanli): figure out why we need frame_type in cm here.
+ cm->current_frame.frame_type = frame_params->frame_type;
+ if (update_type == KF_UPDATE || update_type == ARF_UPDATE) {
+ YV12_BUFFER_CONFIG *tf_buf = av1_tf_info_get_filtered_buf(
+ &cpi->ppi->tf_info, cpi->gf_frame_index, &frame_diff);
+ if (tf_buf != NULL) {
+ frame_input->source = tf_buf;
+ show_existing_alt_ref = av1_check_show_filtered_frame(
+ tf_buf, &frame_diff, q_index, cm->seq_params->bit_depth);
+ if (show_existing_alt_ref) {
+ cpi->common.showable_frame |= 1;
+ } else {
+ cpi->common.showable_frame = 0;
+ }
+ }
+ if (gf_group->frame_type[cpi->gf_frame_index] != KEY_FRAME) {
+ cpi->ppi->show_existing_alt_ref = show_existing_alt_ref;
+ }
+ }
+
+ if (is_second_arf) {
+ // Allocate the memory for the tf_buf_second_arf buffer only when it is
+ // required.
+ int ret = aom_realloc_frame_buffer(
+ &cpi->ppi->tf_info.tf_buf_second_arf, oxcf->frm_dim_cfg.width,
+ oxcf->frm_dim_cfg.height, cm->seq_params->subsampling_x,
+ cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+ NULL, cpi->image_pyramid_levels, 0);
+ if (ret)
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate tf_buf_second_arf");
+
+ YV12_BUFFER_CONFIG *tf_buf_second_arf =
+ &cpi->ppi->tf_info.tf_buf_second_arf;
+ // Temporal filtering was not applied to the second arf earlier in
+ // av1_tf_info_filtering(), so apply it here.
+ const int arf_src_index = gf_group->arf_src_offset[cpi->gf_frame_index];
+ // Right now, we are still using tf_buf_second_arf due to
+ // implementation complexity.
+ // TODO(angiebird): Reuse tf_info->tf_buf here.
+ av1_temporal_filter(cpi, arf_src_index, cpi->gf_frame_index, &frame_diff,
+ tf_buf_second_arf);
+ show_existing_alt_ref = av1_check_show_filtered_frame(
+ tf_buf_second_arf, &frame_diff, q_index, cm->seq_params->bit_depth);
+ if (show_existing_alt_ref) {
+ aom_extend_frame_borders(tf_buf_second_arf, av1_num_planes(cm));
+ frame_input->source = tf_buf_second_arf;
+ }
+ // Currently INTNL_ARF_UPDATE frames only do show_existing.
+ cpi->common.showable_frame |= 1;
+ }
+
+ // Copy source metadata to the temporal filtered frame
+ if (source_buffer->metadata &&
+ aom_copy_metadata_to_frame_buffer(frame_input->source,
+ source_buffer->metadata)) {
+ aom_internal_error(
+ cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to copy source metadata to the temporal filtered frame");
+ }
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (cpi->oxcf.pass == 2) end_timing(cpi, apply_filtering_time);
+#endif
+
+ int set_mv_params = frame_params->frame_type == KEY_FRAME ||
+ update_type == ARF_UPDATE || update_type == GF_UPDATE;
+ cm->show_frame = frame_params->show_frame;
+ cm->current_frame.frame_type = frame_params->frame_type;
+ // TODO(bohanli): Why is this? What part of it is necessary?
+ av1_set_frame_size(cpi, cm->width, cm->height);
+ if (set_mv_params) av1_set_mv_search_params(cpi);
+
+#if CONFIG_RD_COMMAND
+ if (frame_params->frame_type == KEY_FRAME) {
+ char filepath[] = "rd_command.txt";
+ av1_read_rd_command(filepath, &cpi->rd_command);
+ }
+#endif // CONFIG_RD_COMMAND
+ if (cpi->gf_frame_index == 0 && !is_stat_generation_stage(cpi)) {
+ // perform tpl after filtering
+ int allow_tpl =
+ oxcf->gf_cfg.lag_in_frames > 1 && oxcf->algo_cfg.enable_tpl_model;
+ if (gf_group->size > MAX_LENGTH_TPL_FRAME_STATS) {
+ allow_tpl = 0;
+ }
+ if (frame_params->frame_type != KEY_FRAME) {
+ // In rare cases, it's possible to have a non-ARF/GF update_type here.
+ // Set allow_tpl to zero in that situation.
+ allow_tpl =
+ allow_tpl && (update_type == ARF_UPDATE || update_type == GF_UPDATE ||
+ (cpi->use_ducky_encode &&
+ cpi->ducky_encode_info.frame_info.gop_mode ==
+ DUCKY_ENCODE_GOP_MODE_RCL));
+ }
+
+ if (allow_tpl) {
+ if (!cpi->skip_tpl_setup_stats) {
+ av1_tpl_preload_rc_estimate(cpi, frame_params);
+ av1_tpl_setup_stats(cpi, 0, frame_params);
+#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS
+ assert(cpi->gf_frame_index == 0);
+ av1_vbr_rc_update_q_index_list(&cpi->vbr_rc_info, &cpi->ppi->tpl_data,
+ gf_group, cm->seq_params->bit_depth);
+#endif
+ }
+ } else {
+ av1_init_tpl_stats(&cpi->ppi->tpl_data);
+ }
+#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+ if (cpi->oxcf.pass == AOM_RC_SECOND_PASS &&
+ cpi->second_pass_log_stream != NULL) {
+ TPL_INFO *tpl_info;
+ AOM_CHECK_MEM_ERROR(cm->error, tpl_info, aom_malloc(sizeof(*tpl_info)));
+ av1_pack_tpl_info(tpl_info, gf_group, &cpi->ppi->tpl_data);
+ av1_write_tpl_info(tpl_info, cpi->second_pass_log_stream,
+ cpi->common.error);
+ aom_free(tpl_info);
+ }
+#endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+ }
+
+ if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+ // Set frame_input source to true source for psnr calculation.
+ if (apply_filtering && is_psnr_calc_enabled(cpi)) {
+ cpi->source = av1_realloc_and_scale_if_required(
+ cm, source_buffer, &cpi->scaled_source, cm->features.interp_filter, 0,
+ false, true, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+ cpi->unscaled_source = source_buffer;
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (cpi->oxcf.pass == 2) end_timing(cpi, denoise_and_encode_time);
+#endif
+ return AOM_CODEC_OK;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+/*!\cond */
+// Struct to keep track of relevant reference frame data.
+typedef struct {
+ int map_idx;
+ int disp_order;
+ int pyr_level;
+ int used;
+} RefBufMapData;
+/*!\endcond */
+
+// Comparison function to sort reference frames in ascending display order.
+static int compare_map_idx_pair_asc(const void *a, const void *b) {
+ if (((const RefBufMapData *)a)->disp_order ==
+ ((const RefBufMapData *)b)->disp_order) {
+ return 0;
+ } else if (((const RefBufMapData *)a)->disp_order >
+ ((const RefBufMapData *)b)->disp_order) {
+ return 1;
+ } else {
+ return -1;
+ }
+}
+
+// Checks to see if a particular reference frame is already in the reference
+// frame map.
+static int is_in_ref_map(RefBufMapData *map, int disp_order, int n_frames) {
+ for (int i = 0; i < n_frames; i++) {
+ if (disp_order == map[i].disp_order) return 1;
+ }
+ return 0;
+}
+
+// Add a reference buffer index to a named reference slot.
+static void add_ref_to_slot(RefBufMapData *ref, int *const remapped_ref_idx,
+ int frame) {
+ remapped_ref_idx[frame - LAST_FRAME] = ref->map_idx;
+ ref->used = 1;
+}
+
+// Threshold dictating when we are allowed to start considering
+// leaving lowest level frames unmapped.
+#define LOW_LEVEL_FRAMES_TR 5
+
+// Find which reference buffer should be left out of the named mapping.
+// This is because there are 8 reference buffers and only 7 named slots.
+static void set_unmapped_ref(RefBufMapData *buffer_map, int n_bufs,
+ int n_min_level_refs, int min_level,
+ int cur_frame_disp) {
+ int max_dist = 0;
+ int unmapped_idx = -1;
+ if (n_bufs <= ALTREF_FRAME) return;
+ for (int i = 0; i < n_bufs; i++) {
+ if (buffer_map[i].used) continue;
+ if (buffer_map[i].pyr_level != min_level ||
+ n_min_level_refs >= LOW_LEVEL_FRAMES_TR) {
+ int dist = abs(cur_frame_disp - buffer_map[i].disp_order);
+ if (dist > max_dist) {
+ max_dist = dist;
+ unmapped_idx = i;
+ }
+ }
+ }
+ assert(unmapped_idx >= 0 && "Unmapped reference not found");
+ buffer_map[unmapped_idx].used = 1;
+}
+
+void av1_get_ref_frames(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+ int cur_frame_disp, const AV1_COMP *cpi, int gf_index,
+ int is_parallel_encode,
+ int remapped_ref_idx[REF_FRAMES]) {
+ int buf_map_idx = 0;
+
+ // Initialize reference frame mappings.
+ for (int i = 0; i < REF_FRAMES; ++i) remapped_ref_idx[i] = INVALID_IDX;
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->use_ducky_encode &&
+ cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) {
+ for (int rf = LAST_FRAME; rf < REF_FRAMES; ++rf) {
+ if (cpi->ppi->gf_group.ref_frame_list[gf_index][rf] != INVALID_IDX) {
+ remapped_ref_idx[rf - LAST_FRAME] =
+ cpi->ppi->gf_group.ref_frame_list[gf_index][rf];
+ }
+ }
+
+ int valid_rf_idx = 0;
+ static const int ref_frame_type_order[REF_FRAMES - LAST_FRAME] = {
+ GOLDEN_FRAME, ALTREF_FRAME, LAST_FRAME, BWDREF_FRAME,
+ ALTREF2_FRAME, LAST2_FRAME, LAST3_FRAME
+ };
+ for (int i = 0; i < REF_FRAMES - LAST_FRAME; i++) {
+ int rf = ref_frame_type_order[i];
+ if (remapped_ref_idx[rf - LAST_FRAME] != INVALID_IDX) {
+ valid_rf_idx = remapped_ref_idx[rf - LAST_FRAME];
+ break;
+ }
+ }
+
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ if (remapped_ref_idx[i] == INVALID_IDX) {
+ remapped_ref_idx[i] = valid_rf_idx;
+ }
+ }
+
+ return;
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ RefBufMapData buffer_map[REF_FRAMES];
+ int n_bufs = 0;
+ memset(buffer_map, 0, REF_FRAMES * sizeof(buffer_map[0]));
+ int min_level = MAX_ARF_LAYERS;
+ int max_level = 0;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ int skip_ref_unmapping = 0;
+ int is_one_pass_rt = is_one_pass_rt_params(cpi);
+
+ // Go through current reference buffers and store display order, pyr level,
+ // and map index.
+ for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) {
+ // Get reference frame buffer.
+ RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx];
+ if (ref_pair.disp_order == -1) continue;
+ const int frame_order = ref_pair.disp_order;
+ // Avoid duplicates.
+ if (is_in_ref_map(buffer_map, frame_order, n_bufs)) continue;
+ const int reference_frame_level = ref_pair.pyr_level;
+
+ // Keep track of the lowest and highest levels that currently exist.
+ if (reference_frame_level < min_level) min_level = reference_frame_level;
+ if (reference_frame_level > max_level) max_level = reference_frame_level;
+
+ buffer_map[n_bufs].map_idx = map_idx;
+ buffer_map[n_bufs].disp_order = frame_order;
+ buffer_map[n_bufs].pyr_level = reference_frame_level;
+ buffer_map[n_bufs].used = 0;
+ n_bufs++;
+ }
+
+ // Sort frames in ascending display order.
+ qsort(buffer_map, n_bufs, sizeof(buffer_map[0]), compare_map_idx_pair_asc);
+
+ int n_min_level_refs = 0;
+ int closest_past_ref = -1;
+ int golden_idx = -1;
+ int altref_idx = -1;
+
+ // Find the GOLDEN_FRAME and BWDREF_FRAME.
+ // Also collect various stats about the reference frames for the remaining
+ // mappings.
+ for (int i = n_bufs - 1; i >= 0; i--) {
+ if (buffer_map[i].pyr_level == min_level) {
+ // Keep track of the number of lowest level frames.
+ n_min_level_refs++;
+ if (buffer_map[i].disp_order < cur_frame_disp && golden_idx == -1 &&
+ remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] == INVALID_IDX) {
+ // Save index for GOLDEN.
+ golden_idx = i;
+ } else if (buffer_map[i].disp_order > cur_frame_disp &&
+ altref_idx == -1 &&
+ remapped_ref_idx[ALTREF_FRAME - LAST_FRAME] == INVALID_IDX) {
+ // Save index for ALTREF.
+ altref_idx = i;
+ }
+ } else if (buffer_map[i].disp_order == cur_frame_disp) {
+ // Map the BWDREF_FRAME if this is the show_existing_frame.
+ add_ref_to_slot(&buffer_map[i], remapped_ref_idx, BWDREF_FRAME);
+ }
+
+ // During parallel encodes of lower layer frames, exclude the first frame
+ // (frame_parallel_level 1) from being used for the reference assignment of
+ // the second frame (frame_parallel_level 2).
+ if (!is_one_pass_rt && gf_group->frame_parallel_level[gf_index] == 2 &&
+ gf_group->frame_parallel_level[gf_index - 1] == 1 &&
+ gf_group->update_type[gf_index - 1] == INTNL_ARF_UPDATE) {
+ assert(gf_group->update_type[gf_index] == INTNL_ARF_UPDATE);
+#if CONFIG_FPMT_TEST
+ is_parallel_encode = (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_ENCODE)
+ ? is_parallel_encode
+ : 0;
+#endif // CONFIG_FPMT_TEST
+ // If parallel cpis are active, use ref_idx_to_skip, else, use display
+ // index.
+ assert(IMPLIES(is_parallel_encode, cpi->ref_idx_to_skip != INVALID_IDX));
+ assert(IMPLIES(!is_parallel_encode,
+ gf_group->skip_frame_as_ref[gf_index] != INVALID_IDX));
+ buffer_map[i].used = is_parallel_encode
+ ? (buffer_map[i].map_idx == cpi->ref_idx_to_skip)
+ : (buffer_map[i].disp_order ==
+ gf_group->skip_frame_as_ref[gf_index]);
+ // In case a ref frame is excluded from being used during assignment,
+ // skip the call to set_unmapped_ref(). Applicable in steady state.
+ if (buffer_map[i].used) skip_ref_unmapping = 1;
+ }
+
+ // Keep track of where the frames change from being past frames to future
+ // frames.
+ if (buffer_map[i].disp_order < cur_frame_disp && closest_past_ref < 0)
+ closest_past_ref = i;
+ }
+
+ // Do not map GOLDEN and ALTREF based on their pyramid level if all reference
+ // frames have the same level.
+ if (n_min_level_refs <= n_bufs) {
+ // Map the GOLDEN_FRAME.
+ if (golden_idx > -1)
+ add_ref_to_slot(&buffer_map[golden_idx], remapped_ref_idx, GOLDEN_FRAME);
+ // Map the ALTREF_FRAME.
+ if (altref_idx > -1)
+ add_ref_to_slot(&buffer_map[altref_idx], remapped_ref_idx, ALTREF_FRAME);
+ }
+
+ // Find the buffer to be excluded from the mapping.
+ if (!skip_ref_unmapping)
+ set_unmapped_ref(buffer_map, n_bufs, n_min_level_refs, min_level,
+ cur_frame_disp);
+
+ // Place past frames in LAST_FRAME, LAST2_FRAME, and LAST3_FRAME.
+ for (int frame = LAST_FRAME; frame < GOLDEN_FRAME; frame++) {
+ // Continue if the current ref slot is already full.
+ if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+ // Find the next unmapped reference buffer
+ // in decreasing output order relative to the current picture.
+ int next_buf_max = 0;
+ int next_disp_order = INT_MIN;
+ for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) {
+ if (!buffer_map[buf_map_idx].used &&
+ buffer_map[buf_map_idx].disp_order < cur_frame_disp &&
+ buffer_map[buf_map_idx].disp_order > next_disp_order) {
+ next_disp_order = buffer_map[buf_map_idx].disp_order;
+ next_buf_max = buf_map_idx;
+ }
+ }
+ buf_map_idx = next_buf_max;
+ if (buf_map_idx < 0) break;
+ if (buffer_map[buf_map_idx].used) break;
+ add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+ }
+
+ // Place future frames (if there are any) in BWDREF_FRAME and ALTREF2_FRAME.
+ for (int frame = BWDREF_FRAME; frame < REF_FRAMES; frame++) {
+ // Continue if the current ref slot is already full.
+ if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+ // Find the next unmapped reference buffer
+ // in increasing output order relative to the current picture.
+ int next_buf_max = 0;
+ int next_disp_order = INT_MAX;
+ for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) {
+ if (!buffer_map[buf_map_idx].used &&
+ buffer_map[buf_map_idx].disp_order > cur_frame_disp &&
+ buffer_map[buf_map_idx].disp_order < next_disp_order) {
+ next_disp_order = buffer_map[buf_map_idx].disp_order;
+ next_buf_max = buf_map_idx;
+ }
+ }
+ buf_map_idx = next_buf_max;
+ if (buf_map_idx < 0) break;
+ if (buffer_map[buf_map_idx].used) break;
+ add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+ }
+
+ // Place remaining past frames.
+ buf_map_idx = closest_past_ref;
+ for (int frame = LAST_FRAME; frame < REF_FRAMES; frame++) {
+ // Continue if the current ref slot is already full.
+ if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+ // Find the next unmapped reference buffer.
+ for (; buf_map_idx >= 0; buf_map_idx--) {
+ if (!buffer_map[buf_map_idx].used) break;
+ }
+ if (buf_map_idx < 0) break;
+ if (buffer_map[buf_map_idx].used) break;
+ add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+ }
+
+ // Place remaining future frames.
+ buf_map_idx = n_bufs - 1;
+ for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; frame--) {
+ // Continue if the current ref slot is already full.
+ if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+ // Find the next unmapped reference buffer.
+ for (; buf_map_idx > closest_past_ref; buf_map_idx--) {
+ if (!buffer_map[buf_map_idx].used) break;
+ }
+ if (buf_map_idx < 0) break;
+ if (buffer_map[buf_map_idx].used) break;
+ add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+ }
+
+ // Fill any slots that are empty (should only happen for the first 7 frames).
+ for (int i = 0; i < REF_FRAMES; ++i)
+ if (remapped_ref_idx[i] == INVALID_IDX) remapped_ref_idx[i] = 0;
+}
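+
+// Worked example of the mapping above (hypothetical display orders, not taken
+// from a real encode): with cur_frame_disp = 16, unmapped past buffers with
+// display orders {12, 14, 15} and future buffers {18, 24}, the past frames
+// fill LAST_FRAME, LAST2_FRAME and LAST3_FRAME in decreasing display order
+// (15, 14, 12), and the future frames fill BWDREF_FRAME and ALTREF2_FRAME in
+// increasing display order (18, 24). Any slot still INVALID_IDX after the
+// remaining past/future passes falls back to buffer 0.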
+
+int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
+ uint8_t *const dest, unsigned int *frame_flags,
+ int64_t *const time_stamp, int64_t *const time_end,
+ const aom_rational64_t *const timestamp_ratio,
+ int *const pop_lookahead, int flush) {
+ AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ AV1_COMMON *const cm = &cpi->common;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ ExternalFlags *const ext_flags = &cpi->ext_flags;
+ GFConfig *const gf_cfg = &oxcf->gf_cfg;
+
+ EncodeFrameInput frame_input;
+ EncodeFrameParams frame_params;
+ EncodeFrameResults frame_results;
+ memset(&frame_input, 0, sizeof(frame_input));
+ memset(&frame_params, 0, sizeof(frame_params));
+ memset(&frame_results, 0, sizeof(frame_results));
+
+#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+ VBR_RATECTRL_INFO *vbr_rc_info = &cpi->vbr_rc_info;
+ if (oxcf->pass == AOM_RC_THIRD_PASS && vbr_rc_info->ready == 0) {
+ THIRD_PASS_FRAME_INFO frame_info[MAX_THIRD_PASS_BUF];
+ av1_open_second_pass_log(cpi, 1);
+ FILE *second_pass_log_stream = cpi->second_pass_log_stream;
+ fseek(second_pass_log_stream, 0, SEEK_END);
+ size_t file_size = ftell(second_pass_log_stream);
+ rewind(second_pass_log_stream);
+ size_t read_size = 0;
+ while (read_size < file_size) {
+ THIRD_PASS_GOP_INFO gop_info;
+ struct aom_internal_error_info *error = cpi->common.error;
+ // Read in GOP information from the second pass file.
+ av1_read_second_pass_gop_info(second_pass_log_stream, &gop_info, error);
+ TPL_INFO *tpl_info;
+ AOM_CHECK_MEM_ERROR(cm->error, tpl_info, aom_malloc(sizeof(*tpl_info)));
+ av1_read_tpl_info(tpl_info, second_pass_log_stream, error);
+ // Read in per-frame info from second-pass encoding
+ av1_read_second_pass_per_frame_info(second_pass_log_stream, frame_info,
+ gop_info.num_frames, error);
+ av1_vbr_rc_append_tpl_info(vbr_rc_info, tpl_info);
+ read_size = ftell(second_pass_log_stream);
+ aom_free(tpl_info);
+ }
+ av1_close_second_pass_log(cpi);
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q) {
+ vbr_rc_info->base_q_index = cpi->oxcf.rc_cfg.cq_level;
+ av1_vbr_rc_compute_q_indices(
+ vbr_rc_info->base_q_index, vbr_rc_info->total_frame_count,
+ vbr_rc_info->qstep_ratio_list, cm->seq_params->bit_depth,
+ vbr_rc_info->q_index_list);
+ } else {
+ vbr_rc_info->base_q_index = av1_vbr_rc_info_estimate_base_q(
+ vbr_rc_info->total_bit_budget, cm->seq_params->bit_depth,
+ vbr_rc_info->scale_factors, vbr_rc_info->total_frame_count,
+ vbr_rc_info->update_type_list, vbr_rc_info->qstep_ratio_list,
+ vbr_rc_info->txfm_stats_list, vbr_rc_info->q_index_list, NULL);
+ }
+ vbr_rc_info->ready = 1;
+#if CONFIG_RATECTRL_LOG
+ rc_log_record_chunk_info(&cpi->rc_log, vbr_rc_info->base_q_index,
+ vbr_rc_info->total_frame_count);
+#endif // CONFIG_RATECTRL_LOG
+ }
+#endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+
+  // Check if we need to buffer more source frames
+ if (flush == 0) {
+ int srcbuf_size =
+ av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
+ int pop_size =
+ av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage);
+
+    // Keep buffering frames in the lookahead buffer.
+ if (srcbuf_size < pop_size) return -1;
+ }
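+  // In effect: while not flushing, encoding is deferred (returning -1) until
+  // the lookahead holds at least as many source frames as this stage needs to
+  // pop, so the caller keeps feeding frames until then.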
+
+ if (!av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage)) {
+#if !CONFIG_REALTIME_ONLY
+ if (flush && oxcf->pass == AOM_RC_FIRST_PASS &&
+ !cpi->ppi->twopass.first_pass_done) {
+ av1_end_first_pass(cpi); /* get last stats packet */
+ cpi->ppi->twopass.first_pass_done = 1;
+ }
+#endif
+ return -1;
+ }
+
+ // TODO(sarahparker) finish bit allocation for one pass pyramid
+ if (has_no_stats_stage(cpi)) {
+ gf_cfg->gf_max_pyr_height =
+ AOMMIN(gf_cfg->gf_max_pyr_height, USE_ALTREF_FOR_ONE_PASS);
+ gf_cfg->gf_min_pyr_height =
+ AOMMIN(gf_cfg->gf_min_pyr_height, gf_cfg->gf_max_pyr_height);
+ }
+
+ // Allocation of mi buffers.
+ alloc_mb_mode_info_buffers(cpi);
+
+ cpi->skip_tpl_setup_stats = 0;
+#if !CONFIG_REALTIME_ONLY
+ if (oxcf->pass != AOM_RC_FIRST_PASS) {
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ if (tpl_data->tpl_stats_pool[0] == NULL) {
+ av1_setup_tpl_buffers(cpi->ppi, &cm->mi_params, oxcf->frm_dim_cfg.width,
+ oxcf->frm_dim_cfg.height, 0,
+ oxcf->gf_cfg.lag_in_frames);
+ }
+ }
+ cpi->twopass_frame.this_frame = NULL;
+ const int use_one_pass_rt_params = is_one_pass_rt_params(cpi);
+ if (!use_one_pass_rt_params && !is_stat_generation_stage(cpi)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_get_second_pass_params_time);
+#endif
+
+    // Initialise frame_level_rate_correction_factors with the values in use
+    // prior to the parallel frames.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ for (int i = 0; i < RATE_FACTOR_LEVELS; i++) {
+ cpi->rc.frame_level_rate_correction_factors[i] =
+#if CONFIG_FPMT_TEST
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE)
+ ? cpi->ppi->p_rc.temp_rate_correction_factors[i]
+ :
+#endif // CONFIG_FPMT_TEST
+ cpi->ppi->p_rc.rate_correction_factors[i];
+ }
+ }
+
+ // copy mv_stats from ppi to frame_level cpi.
+ cpi->mv_stats = cpi->ppi->mv_stats;
+ av1_get_second_pass_params(cpi, &frame_params, *frame_flags);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_get_second_pass_params_time);
+#endif
+ }
+#endif
+
+ if (!is_stat_generation_stage(cpi)) {
+ // TODO(jingning): fwd key frame always uses show existing frame?
+ if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE &&
+ gf_group->refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+ frame_params.show_existing_frame = 1;
+ } else {
+ frame_params.show_existing_frame =
+ (cpi->ppi->show_existing_alt_ref &&
+ gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) ||
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE;
+ }
+ frame_params.show_existing_frame &= allow_show_existing(cpi, *frame_flags);
+
+ // Special handling to reset 'show_existing_frame' in case of dropped
+ // frames.
+ if (oxcf->rc_cfg.drop_frames_water_mark &&
+ (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE)) {
+ // During the encode of an OVERLAY_UPDATE/INTNL_OVERLAY_UPDATE frame, loop
+ // over the gf group to check if the corresponding
+ // ARF_UPDATE/INTNL_ARF_UPDATE frame was dropped.
+ int cur_disp_idx = gf_group->display_idx[cpi->gf_frame_index];
+ for (int idx = 0; idx < cpi->gf_frame_index; idx++) {
+ if (cur_disp_idx == gf_group->display_idx[idx]) {
+ assert(IMPLIES(
+ gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE,
+ gf_group->update_type[idx] == ARF_UPDATE));
+ assert(IMPLIES(gf_group->update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE,
+ gf_group->update_type[idx] == INTNL_ARF_UPDATE));
+ // Reset show_existing_frame and set cpi->is_dropped_frame to true if
+ // the frame was dropped during its first encode.
+ if (gf_group->is_frame_dropped[idx]) {
+ frame_params.show_existing_frame = 0;
+ assert(!cpi->is_dropped_frame);
+ cpi->is_dropped_frame = true;
+ }
+ break;
+ }
+ }
+ }
+
+ // Reset show_existing_alt_ref decision to 0 after it is used.
+ if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
+ cpi->ppi->show_existing_alt_ref = 0;
+ }
+ } else {
+ frame_params.show_existing_frame = 0;
+ }
+
+ struct lookahead_entry *source = NULL;
+ struct lookahead_entry *last_source = NULL;
+ if (frame_params.show_existing_frame) {
+ source = av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage);
+ *pop_lookahead = 1;
+ frame_params.show_frame = 1;
+ } else {
+ source = choose_frame_source(cpi, &flush, pop_lookahead, &last_source,
+ &frame_params.show_frame);
+ }
+
+ if (source == NULL) { // If no source was found, we can't encode a frame.
+#if !CONFIG_REALTIME_ONLY
+ if (flush && oxcf->pass == AOM_RC_FIRST_PASS &&
+ !cpi->ppi->twopass.first_pass_done) {
+ av1_end_first_pass(cpi); /* get last stats packet */
+ cpi->ppi->twopass.first_pass_done = 1;
+ }
+#endif
+ return -1;
+ }
+
+  // Reset src_offset so that the actual encode call for this frame gets its
+  // source.
+ gf_group->src_offset[cpi->gf_frame_index] = 0;
+
+  // Source may be changed if temporally filtered later.
+ frame_input.source = &source->img;
+ if ((cpi->ppi->use_svc || cpi->rc.prev_frame_is_dropped) &&
+ last_source != NULL)
+ av1_svc_set_last_source(cpi, &frame_input, &last_source->img);
+ else
+ frame_input.last_source = last_source != NULL ? &last_source->img : NULL;
+ frame_input.ts_duration = source->ts_end - source->ts_start;
+ // Save unfiltered source. It is used in av1_get_second_pass_params().
+ cpi->unfiltered_source = frame_input.source;
+
+ *time_stamp = source->ts_start;
+ *time_end = source->ts_end;
+ if (source->ts_start < cpi->time_stamps.first_ts_start) {
+ cpi->time_stamps.first_ts_start = source->ts_start;
+ cpi->time_stamps.prev_ts_end = source->ts_start;
+ }
+
+ av1_apply_encoding_flags(cpi, source->flags);
+ *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ cpi->framerate = cpi->temp_framerate;
+ }
+ }
+#endif // CONFIG_FPMT_TEST
+
+  // Shown frames and arf-overlay frames need frame-rate consideration
+ if (frame_params.show_frame)
+ adjust_frame_rate(cpi, source->ts_start, source->ts_end);
+
+ if (!frame_params.show_existing_frame) {
+ if (cpi->film_grain_table) {
+ cm->cur_frame->film_grain_params_present = aom_film_grain_table_lookup(
+ cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */,
+ &cm->film_grain_params);
+ } else {
+ cm->cur_frame->film_grain_params_present =
+ cm->seq_params->film_grain_params_present;
+ }
+    // Only one operating point is supported now.
+ const int64_t pts64 = ticks_to_timebase_units(timestamp_ratio, *time_stamp);
+ if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR;
+
+ cm->frame_presentation_time = (uint32_t)pts64;
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_get_one_pass_rt_params_time);
+#endif
+#if CONFIG_REALTIME_ONLY
+ av1_get_one_pass_rt_params(cpi, &frame_params.frame_type, &frame_input,
+ *frame_flags);
+ if (use_rtc_reference_structure_one_layer(cpi))
+ av1_set_rtc_reference_structure_one_layer(cpi, cpi->gf_frame_index == 0);
+#else
+ if (use_one_pass_rt_params) {
+ av1_get_one_pass_rt_params(cpi, &frame_params.frame_type, &frame_input,
+ *frame_flags);
+ if (use_rtc_reference_structure_one_layer(cpi))
+ av1_set_rtc_reference_structure_one_layer(cpi, cpi->gf_frame_index == 0);
+ }
+#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_get_one_pass_rt_params_time);
+#endif
+
+ FRAME_UPDATE_TYPE frame_update_type =
+ get_frame_update_type(gf_group, cpi->gf_frame_index);
+
+ if (frame_params.show_existing_frame &&
+ frame_params.frame_type != KEY_FRAME) {
+ // Force show-existing frames to be INTER, except forward keyframes
+ frame_params.frame_type = INTER_FRAME;
+ }
+
+ // Per-frame encode speed. In theory this can vary, but things may have
+ // been written assuming speed-level will not change within a sequence, so
+ // this parameter should be used with caution.
+ frame_params.speed = oxcf->speed;
+
+#if !CONFIG_REALTIME_ONLY
+ // Set forced key frames when necessary. For two-pass encoding / lap mode,
+ // this is already handled by av1_get_second_pass_params. However when no
+ // stats are available, we still need to check if the new frame is a keyframe.
+ // For one pass rt, this is already checked in av1_get_one_pass_rt_params.
+ if (!use_one_pass_rt_params &&
+ (is_stat_generation_stage(cpi) || has_no_stats_stage(cpi))) {
+    // The current frame is coded as a key frame in any of the following cases:
+    // 1) It is the first frame of the video
+    // 2) All-intra encoding is used
+    // 3) A key frame is forced
+ const int kf_requested =
+ (cm->current_frame.frame_number == 0 ||
+ oxcf->kf_cfg.key_freq_max == 0 || (*frame_flags & FRAMEFLAGS_KEY));
+ if (kf_requested && frame_update_type != OVERLAY_UPDATE &&
+ frame_update_type != INTNL_OVERLAY_UPDATE) {
+ frame_params.frame_type = KEY_FRAME;
+ } else if (is_stat_generation_stage(cpi)) {
+ // For stats generation, set the frame type to inter here.
+ frame_params.frame_type = INTER_FRAME;
+ }
+ }
+#endif
+
+ // Work out some encoding parameters specific to the pass:
+ if (has_no_stats_stage(cpi) && oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
+ av1_cyclic_refresh_update_parameters(cpi);
+ } else if (is_stat_generation_stage(cpi)) {
+ cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&oxcf->rc_cfg);
+ } else if (is_stat_consumption_stage(cpi)) {
+#if CONFIG_MISMATCH_DEBUG
+ mismatch_move_frame_idx_w();
+#endif
+#if TXCOEFF_COST_TIMER
+ cm->txcoeff_cost_timer = 0;
+ cm->txcoeff_cost_count = 0;
+#endif
+ }
+
+ if (!is_stat_generation_stage(cpi))
+ set_ext_overrides(cm, &frame_params, ext_flags);
+
+ // Shown keyframes and S frames refresh all reference buffers
+ const int force_refresh_all =
+ ((frame_params.frame_type == KEY_FRAME && frame_params.show_frame) ||
+ frame_params.frame_type == S_FRAME) &&
+ !frame_params.show_existing_frame;
+
+ av1_configure_buffer_updates(
+ cpi, &frame_params.refresh_frame, frame_update_type,
+ gf_group->refbuf_state[cpi->gf_frame_index], force_refresh_all);
+
+ if (!is_stat_generation_stage(cpi)) {
+ const YV12_BUFFER_CONFIG *ref_frame_buf[INTER_REFS_PER_FRAME];
+
+ RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
+ init_ref_map_pair(cpi, ref_frame_map_pairs);
+ const int order_offset = gf_group->arf_src_offset[cpi->gf_frame_index];
+ const int cur_frame_disp =
+ cpi->common.current_frame.frame_number + order_offset;
+
+ int get_ref_frames = 0;
+#if CONFIG_FPMT_TEST
+ get_ref_frames =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 1 : 0;
+#endif // CONFIG_FPMT_TEST
+ if (get_ref_frames ||
+ gf_group->frame_parallel_level[cpi->gf_frame_index] == 0) {
+ if (!ext_flags->refresh_frame.update_pending) {
+ av1_get_ref_frames(ref_frame_map_pairs, cur_frame_disp, cpi,
+ cpi->gf_frame_index, 1, cm->remapped_ref_idx);
+ } else if (cpi->ppi->rtc_ref.set_ref_frame_config ||
+ use_rtc_reference_structure_one_layer(cpi)) {
+ for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++)
+ cm->remapped_ref_idx[i] = cpi->ppi->rtc_ref.ref_idx[i];
+ }
+ }
+
+ // Get the reference frames
+ bool has_ref_frames = false;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ const RefCntBuffer *ref_frame =
+ get_ref_frame_buf(cm, ref_frame_priority_order[i]);
+ ref_frame_buf[i] = ref_frame != NULL ? &ref_frame->buf : NULL;
+ if (ref_frame != NULL) has_ref_frames = true;
+ }
+ if (!has_ref_frames && (frame_params.frame_type == INTER_FRAME ||
+ frame_params.frame_type == S_FRAME)) {
+ return AOM_CODEC_ERROR;
+ }
+
+ // Work out which reference frame slots may be used.
+ frame_params.ref_frame_flags =
+ get_ref_frame_flags(&cpi->sf, is_one_pass_rt_params(cpi), ref_frame_buf,
+ ext_flags->ref_frame_flags);
+
+    // Set primary_ref_frame of non-reference frames to PRIMARY_REF_NONE.
+ if (cpi->ppi->gf_group.is_frame_non_ref[cpi->gf_frame_index]) {
+ frame_params.primary_ref_frame = PRIMARY_REF_NONE;
+ } else {
+ frame_params.primary_ref_frame =
+ choose_primary_ref_frame(cpi, &frame_params);
+ }
+
+ frame_params.order_offset = gf_group->arf_src_offset[cpi->gf_frame_index];
+
+    // Call av1_get_refresh_frame_flags() if the refresh index is not
+    // available.
+ if (!cpi->refresh_idx_available) {
+ frame_params.refresh_frame_flags = av1_get_refresh_frame_flags(
+ cpi, &frame_params, frame_update_type, cpi->gf_frame_index,
+ cur_frame_disp, ref_frame_map_pairs);
+ } else {
+ assert(cpi->ref_refresh_index != INVALID_IDX);
+ frame_params.refresh_frame_flags = (1 << cpi->ref_refresh_index);
+ }
+
+    // Make frames marked as is_frame_non_ref into non-reference frames.
+ if (gf_group->is_frame_non_ref[cpi->gf_frame_index])
+ frame_params.refresh_frame_flags = 0;
+
+ frame_params.existing_fb_idx_to_show = INVALID_IDX;
+ // Find the frame buffer to show based on display order.
+ if (frame_params.show_existing_frame) {
+ for (int frame = 0; frame < REF_FRAMES; frame++) {
+ const RefCntBuffer *const buf = cm->ref_frame_map[frame];
+ if (buf == NULL) continue;
+ const int frame_order = (int)buf->display_order_hint;
+ if (frame_order == cur_frame_disp)
+ frame_params.existing_fb_idx_to_show = frame;
+ }
+ }
+ }
+
+  // The way frame_params->remapped_ref_idx is set up is a placeholder.
+  // Currently, reference buffer assignment is done by update_ref_frame_map()
+  // which is called by the high-level strategy AFTER encoding a frame. It
+  // modifies cm->remapped_ref_idx. If you want to use an alternative method
+  // to determine reference buffer assignment, just put your assignments into
+  // frame_params->remapped_ref_idx here and they will be used when encoding
+  // this frame. If frame_params->remapped_ref_idx is set up independently of
+  // cm->remapped_ref_idx then update_ref_frame_map() will have no effect.
+ memcpy(frame_params.remapped_ref_idx, cm->remapped_ref_idx,
+ REF_FRAMES * sizeof(*cm->remapped_ref_idx));
+
+ cpi->td.mb.rdmult_delta_qindex = cpi->td.mb.delta_qindex = 0;
+
+ if (!frame_params.show_existing_frame) {
+ cm->quant_params.using_qmatrix = oxcf->q_cfg.using_qm;
+ }
+
+ const int is_intra_frame = frame_params.frame_type == KEY_FRAME ||
+ frame_params.frame_type == INTRA_ONLY_FRAME;
+ FeatureFlags *const features = &cm->features;
+ if (!is_stat_generation_stage(cpi) &&
+ (oxcf->pass == AOM_RC_ONE_PASS || oxcf->pass >= AOM_RC_SECOND_PASS) &&
+ is_intra_frame) {
+ av1_set_screen_content_options(cpi, features);
+ }
+
+#if CONFIG_REALTIME_ONLY
+ if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+#else
+ if (has_no_stats_stage(cpi) && oxcf->mode == REALTIME &&
+ gf_cfg->lag_in_frames == 0) {
+ if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+ } else if (denoise_and_encode(cpi, dest, &frame_input, &frame_params,
+ &frame_results) != AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+#endif // CONFIG_REALTIME_ONLY
+
+  // This is used in the rtc temporal filter case. Use the true source in the
+  // PSNR calculation.
+ if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf &&
+ cpi->common.current_frame.frame_type != KEY_FRAME) {
+ assert(cpi->orig_source.buffer_alloc_sz > 0);
+ cpi->source = &cpi->orig_source;
+ }
+
+ if (!is_stat_generation_stage(cpi)) {
+    // The first pass doesn't modify reference buffer assignment or produce
+    // frame flags.
+ update_frame_flags(&cpi->common, &cpi->refresh_frame, frame_flags);
+ set_additional_frame_flags(cm, frame_flags);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+#if TXCOEFF_COST_TIMER
+ if (!is_stat_generation_stage(cpi)) {
+ cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer;
+ fprintf(stderr,
+ "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld "
+ "in us\n",
+ cm->txcoeff_cost_count, cm->txcoeff_cost_timer,
+ cm->cum_txcoeff_cost_timer);
+ }
+#endif
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_TUNE_VMAF
+ if (!is_stat_generation_stage(cpi) &&
+ (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+ oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) {
+ av1_update_vmaf_curve(cpi);
+ }
+#endif
+
+ // Unpack frame_results:
+ *size = frame_results.size;
+
+  // Leave a signal for a higher-level caller about whether this frame is
+  // droppable.
+ if (*size > 0) {
+ cpi->droppable =
+ is_frame_droppable(&cpi->ppi->rtc_ref, &ext_flags->refresh_frame);
+ }
+
+  // For SVC, or when the frame-dropper is enabled: keep track of the
+  // (unscaled) source corresponding to the refresh of the LAST reference
+  // (base temporal layer - TL0). Copy only for the top spatial enhancement
+  // layer so that all spatial layers of the next superframe have last_source
+  // aligned with the previous TL0 superframe. Avoid cases where the resolution
+  // changes for the unscaled source (top spatial layer). This only needs to be
+  // done for frames that are encoded (size > 0).
+ if (*size > 0 &&
+ (cpi->ppi->use_svc || cpi->oxcf.rc_cfg.drop_frames_water_mark > 0) &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
+ cpi->svc.temporal_layer_id == 0 &&
+ cpi->unscaled_source->y_width == cpi->svc.source_last_TL0.y_width &&
+ cpi->unscaled_source->y_height == cpi->svc.source_last_TL0.y_height) {
+ aom_yv12_copy_y(cpi->unscaled_source, &cpi->svc.source_last_TL0);
+ aom_yv12_copy_u(cpi->unscaled_source, &cpi->svc.source_last_TL0);
+ aom_yv12_copy_v(cpi->unscaled_source, &cpi->svc.source_last_TL0);
+ }
+
+ return AOM_CODEC_OK;
+}
diff --git a/third_party/aom/av1/encoder/encode_strategy.h b/third_party/aom/av1/encoder/encode_strategy.h
new file mode 100644
index 0000000000..c1d14d134c
--- /dev/null
+++ b/third_party/aom/av1/encoder/encode_strategy.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Declares frame encoding functions.
+ */
+#ifndef AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+#define AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#include "aom/aom_encoder.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+
+/*!\brief Implement high-level encode strategy
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ * This function implements the high-level encode strategy: it chooses the
+ * frame type, frame placement, etc., populates an EncodeFrameParams struct
+ * with the results of these decisions, and then encodes the frame. The caller
+ * should use the output parameters *time_stamp and *time_end only when this
+ * function returns AOM_CODEC_OK.
+ *
+ * \param[in]    cpi             Top-level encoder structure
+ * \param[out]   size            Bitstream size
+ * \param[out]   dest            Bitstream output buffer
+ * \param[in]    frame_flags     Flags to decide how to encode the frame
+ * \param[out]   time_stamp      Time stamp of the frame
+ * \param[out]   time_end        End time of the frame
+ * \param[in]    timestamp_ratio Time base
+ * \param[out]   pop_lookahead   Whether to pop the source frame from the queue
+ * \param[in]    flush           Whether to encode one frame or the remaining
+ *                               frames
+ *
+ * \return Returns a value to indicate whether the encoding was successful.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * \retval #AOM_CODEC_ERROR
+ */
+int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
+ uint8_t *const dest, unsigned int *frame_flags,
+ int64_t *const time_stamp, int64_t *const time_end,
+ const aom_rational64_t *const timestamp_ratio,
+ int *const pop_lookahead, int flush);
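+
+// A minimal calling sketch (hypothetical caller; the variable names here are
+// illustrative only, not part of this API):
+//
+//   size_t frame_size = 0;
+//   int64_t ts_start = 0, ts_end = 0;
+//   int pop_lookahead = 0;
+//   const int ret =
+//       av1_encode_strategy(cpi, &frame_size, dest, &frame_flags, &ts_start,
+//                           &ts_end, timestamp_ratio, &pop_lookahead, flush);
+//   if (ret == -1) {
+//     // Not an error: the encoder needs more source frames in the lookahead.
+//   } else if (ret == AOM_CODEC_OK && frame_size > 0) {
+//     // A packet of frame_size bytes was written to dest.
+//   }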
+
+/*!\cond */
+// Set individual buffer update flags based on frame reference type.
+// force_refresh_all is used when we have a KEY_FRAME or S_FRAME. It forces all
+// refresh_*_frame flags to be set, because we refresh all buffers in this case.
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+ RefreshFrameInfo *const refresh_frame,
+ const FRAME_UPDATE_TYPE type,
+ const REFBUF_STATE refbuf_state,
+ int force_refresh_all);
+
+int av1_get_refresh_frame_flags(
+ const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params,
+ FRAME_UPDATE_TYPE frame_update_type, int gf_index, int cur_disp_order,
+ RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]);
+
+int av1_get_refresh_ref_frame_map(int refresh_frame_flags);
+
+/*!\brief Obtain indices of reference frames in ref_frame_map
+ *
+ * \callgraph
+ * \callergraph
+ *
+ * \param[out] remapped_ref_idx An array for storing indices of reference
+ * frames. The index is used to retrieve a
+ * reference frame buffer from ref_frame_map
+ * in AV1Common.
+ */
+void av1_get_ref_frames(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+ int cur_frame_disp, const AV1_COMP *cpi, int gf_index,
+ int is_parallel_encode,
+ int remapped_ref_idx[REF_FRAMES]);
+
+int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+ const int up_to_index,
+ const COMPRESSOR_STAGE compressor_stage);
+
+static AOM_INLINE int is_frame_droppable(
+ const RTC_REF *const rtc_ref,
+ const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags) {
+ // Droppable frame is only used by external refresh flags. VoD setting won't
+  // A droppable frame is only used with external refresh flags; a VoD setting
+  // won't trigger its use case.
+ return rtc_ref->non_reference_frame;
+ else if (ext_refresh_frame_flags->update_pending)
+ return !(ext_refresh_frame_flags->alt_ref_frame ||
+ ext_refresh_frame_flags->alt2_ref_frame ||
+ ext_refresh_frame_flags->bwd_ref_frame ||
+ ext_refresh_frame_flags->golden_frame ||
+ ext_refresh_frame_flags->last_frame);
+ else
+ return 0;
+}
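+
+// For instance (hypothetical flag settings): with set_ref_frame_config = 0,
+// update_pending = 1 and every refresh flag (last/golden/bwd/alt/alt2)
+// cleared, the frame is droppable (returns 1); setting any one of those
+// refresh flags makes it non-droppable again.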
+
+static AOM_INLINE int get_current_frame_ref_type(const AV1_COMP *const cpi) {
+ // We choose the reference "type" of this frame from the flags which indicate
+ // which reference frames will be refreshed by it. More than one of these
+ // flags may be set, so the order here implies an order of precedence. This is
+ // just used to choose the primary_ref_frame (as the most recent reference
+ // buffer of the same reference-type as the current frame).
+
+ switch (cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]) {
+ case 0: return 0;
+ case 1: return 1;
+ case MAX_ARF_LAYERS:
+ case MAX_ARF_LAYERS + 1: return 4;
+ default: return 7;
+ }
+}
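+
+// A rough reading of the mapping above: layer depths 0 and 1 keep their own
+// types (0 and 1), the two deepest ARF layers share type 4, and all remaining
+// depths share type 7, so frames at similar pyramid depths tend to pick
+// primary references of the same type.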
+
+int av1_calc_refresh_idx_for_intnl_arf(
+ AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+ int gf_index);
+/*!\endcond */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c
new file mode 100644
index 0000000000..e2213a8355
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe.c
@@ -0,0 +1,2408 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <float.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/cfl.h"
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/allintra_vis.h"
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/global_motion_facade.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/partition_strategy.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/partition_model_weights.h"
+#endif
+#include "av1/encoder/partition_search.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/var_based_part.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+/*!\cond */
+// This is used as a reference when computing the source variance for the
+// purposes of activity masking.
+// Eventually this should be replaced by custom no-reference routines,
+// which will be faster.
+static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = {
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4
+};
+
+static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = {
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16
+};
+#endif // CONFIG_AV1_HIGHBITDEPTH
+/*!\endcond */
+
+// For the given bit depth, returns a constant array used to assist the
+// calculation of source block variance, which is then used to drive
+// adaptive quantization.
+static const uint8_t *get_var_offs(int use_hbd, int bd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_hbd) {
+ assert(bd == 8 || bd == 10 || bd == 12);
+ const int off_index = (bd - 8) >> 1;
+ static const uint16_t *high_var_offs[3] = { AV1_HIGH_VAR_OFFS_8,
+ AV1_HIGH_VAR_OFFS_10,
+ AV1_HIGH_VAR_OFFS_12 };
+ return CONVERT_TO_BYTEPTR(high_var_offs[off_index]);
+ }
+#else
+ (void)use_hbd;
+ (void)bd;
+ assert(!use_hbd);
+#endif
+ assert(bd == 8);
+ return AV1_VAR_OFFS;
+}
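+
+// The (bd - 8) >> 1 index above maps bit depth 8, 10 and 12 to
+// AV1_HIGH_VAR_OFFS_8, AV1_HIGH_VAR_OFFS_10 and AV1_HIGH_VAR_OFFS_12
+// respectively; each table is a flat block at the mid-grey level for that
+// depth (128, scaled by 4 or 16 for the higher depths).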
+
+void av1_init_rtc_counters(MACROBLOCK *const x) {
+ av1_init_cyclic_refresh_counters(x);
+ x->cnt_zeromv = 0;
+}
+
+void av1_accumulate_rtc_counters(AV1_COMP *cpi, const MACROBLOCK *const x) {
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
+ av1_accumulate_cyclic_refresh_counters(cpi->cyclic_refresh, x);
+ cpi->rc.cnt_zeromv += x->cnt_zeromv;
+}
+
+unsigned int av1_get_perpixel_variance(const AV1_COMP *cpi,
+ const MACROBLOCKD *xd,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bsize, int plane,
+ int use_hbd) {
+ const int subsampling_x = xd->plane[plane].subsampling_x;
+ const int subsampling_y = xd->plane[plane].subsampling_y;
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, subsampling_x, subsampling_y);
+ unsigned int sse;
+ const unsigned int var = cpi->ppi->fn_ptr[plane_bsize].vf(
+ ref->buf, ref->stride, get_var_offs(use_hbd, xd->bd), 0, &sse);
+ return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[plane_bsize]);
+}
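+
+// Example of the normalization above: BLOCK_16X16 holds 2^8 = 256 pixels
+// (num_pels_log2_lookup[BLOCK_16X16] == 8), so the per-pixel variance is
+// ROUND_POWER_OF_TWO(var, 8), i.e. (var + 128) >> 8.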
+
+unsigned int av1_get_perpixel_variance_facade(const AV1_COMP *cpi,
+ const MACROBLOCKD *xd,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bsize, int plane) {
+ const int use_hbd = is_cur_buf_hbd(xd);
+ return av1_get_perpixel_variance(cpi, xd, ref, bsize, plane, use_hbd);
+}
+
+void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
+ int mi_row, int mi_col, const int num_planes,
+ BLOCK_SIZE bsize) {
+ // Set current frame pointer.
+ x->e_mbd.cur_buf = src;
+
+ // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+ // the static analysis warnings.
+ for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); i++) {
+ const int is_uv = i > 0;
+ setup_pred_plane(
+ &x->plane[i].src, bsize, src->buffers[i], src->crop_widths[is_uv],
+ src->crop_heights[is_uv], src->strides[is_uv], mi_row, mi_col, NULL,
+ x->e_mbd.plane[i].subsampling_x, x->e_mbd.plane[i].subsampling_y);
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+/*!\brief Assigns different quantization parameters to each super
+ * block based on its TPL weight.
+ *
+ * \ingroup tpl_modelling
+ *
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in,out] td Thread data structure
+ * \param[in,out] x Macro block level data for this block.
+ * \param[in]     tile_info      Tile information / identification
+ * \param[in] mi_row Block row (in "MI_SIZE" units) index
+ * \param[in] mi_col Block column (in "MI_SIZE" units) index
+ * \param[in]     num_planes     Number of image planes (e.g. Y,U,V)
+ *
+ * \remark No return value but updates macroblock and thread data
+ * related to the q / q delta to be used.
+ */
+static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td,
+ MACROBLOCK *const x,
+ const TileInfo *const tile_info,
+ int mi_row, int mi_col, int num_planes) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ assert(delta_q_info->delta_q_present_flag);
+
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ // Delta-q modulation based on variance
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size);
+
+ const int delta_q_res = delta_q_info->delta_q_res;
+ int current_qindex = cm->quant_params.base_qindex;
+ if (cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.qp_mode ==
+ DUCKY_ENCODE_FRAME_MODE_QINDEX) {
+ const int sb_row = mi_row >> cm->seq_params->mib_size_log2;
+ const int sb_col = mi_col >> cm->seq_params->mib_size_log2;
+ const int sb_cols =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2);
+ const int sb_index = sb_row * sb_cols + sb_col;
+ current_qindex =
+ cpi->ducky_encode_info.frame_info.superblock_encode_qindex[sb_index];
+ } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL) {
+ if (DELTA_Q_PERCEPTUAL_MODULATION == 1) {
+ const int block_wavelet_energy_level =
+ av1_block_wavelet_energy_level(cpi, x, sb_size);
+ x->sb_energy_level = block_wavelet_energy_level;
+ current_qindex = av1_compute_q_from_energy_level_deltaq_mode(
+ cpi, block_wavelet_energy_level);
+ } else {
+ const int block_var_level = av1_log_block_var(cpi, x, sb_size);
+ x->sb_energy_level = block_var_level;
+ current_qindex =
+ av1_compute_q_from_energy_level_deltaq_mode(cpi, block_var_level);
+ }
+ } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_OBJECTIVE &&
+ cpi->oxcf.algo_cfg.enable_tpl_model) {
+ // Setup deltaq based on tpl stats
+ current_qindex =
+ av1_get_q_for_deltaq_objective(cpi, td, NULL, sb_size, mi_row, mi_col);
+ } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI) {
+ current_qindex = av1_get_sbq_perceptual_ai(cpi, sb_size, mi_row, mi_col);
+ } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) {
+ current_qindex = av1_get_sbq_user_rating_based(cpi, mi_row, mi_col);
+ } else if (cpi->oxcf.q_cfg.enable_hdr_deltaq) {
+ current_qindex = av1_get_q_for_hdr(cpi, x, sb_size, mi_row, mi_col);
+ }
+
+ x->rdmult_cur_qindex = current_qindex;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int adjusted_qindex = av1_adjust_q_from_delta_q_res(
+ delta_q_res, xd->current_base_qindex, current_qindex);
+ if (cpi->use_ducky_encode) {
+ assert(adjusted_qindex == current_qindex);
+ }
+ current_qindex = adjusted_qindex;
+
+ x->delta_qindex = current_qindex - cm->quant_params.base_qindex;
+ x->rdmult_delta_qindex = x->delta_qindex;
+
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ xd->mi[0]->current_qindex = current_qindex;
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 0);
+
+ // keep track of any non-zero delta-q used
+ td->deltaq_used |= (x->delta_qindex != 0);
+
+ if (cpi->oxcf.tool_cfg.enable_deltalf_mode) {
+ const int delta_lf_res = delta_q_info->delta_lf_res;
+ const int lfmask = ~(delta_lf_res - 1);
+ const int delta_lf_from_base =
+ ((x->delta_qindex / 4 + delta_lf_res / 2) & lfmask);
+ const int8_t delta_lf =
+ (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ const int mib_size = cm->seq_params->mib_size;
+
+    // Pre-set the delta lf for the loop filter. Note that this value is set
+    // before mi is assigned for each block in the current superblock.
+ for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) {
+ for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) {
+ const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k);
+ mi_params->mi_alloc[grid_idx].delta_lf_from_base = delta_lf;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ mi_params->mi_alloc[grid_idx].delta_lf[lf_id] = delta_lf;
+ }
+ }
+ }
+ }
+}
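+
+// Numeric sketch of the delta-lf derivation above (illustrative values): with
+// x->delta_qindex = 20 and delta_lf_res = 4, lfmask = ~3 and
+// delta_lf_from_base = ((20 / 4) + 2) & ~3 = 7 & ~3 = 4; that is, delta_q / 4
+// rounded to a multiple of delta_lf_res and then clamped to
+// [-MAX_LOOP_FILTER, MAX_LOOP_FILTER].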
+
+static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row,
+ int mi_col) {
+ const AV1_COMMON *cm = &cpi->common;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ MACROBLOCK *x = &td->mb;
+ const int frame_idx = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+
+ av1_zero(x->tpl_keep_ref_frame);
+
+ if (!av1_tpl_stats_ready(tpl_data, frame_idx)) return;
+ if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return;
+ if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return;
+
+ const int is_overlay =
+ cpi->ppi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE;
+ if (is_overlay) {
+ memset(x->tpl_keep_ref_frame, 1, sizeof(x->tpl_keep_ref_frame));
+ return;
+ }
+
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ const int tpl_stride = tpl_frame->stride;
+ int64_t inter_cost[INTER_REFS_PER_FRAME] = { 0 };
+ const int step = 1 << block_mis_log2;
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+
+ const int mi_row_end =
+ AOMMIN(mi_size_high[sb_size] + mi_row, mi_params->mi_rows);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_col_end_sr =
+ AOMMIN(coded_to_superres_mi(mi_col + mi_size_wide[sb_size],
+ cm->superres_scale_denominator),
+ mi_cols_sr);
+ const int row_step = step;
+ const int col_step_sr =
+ coded_to_superres_mi(step, cm->superres_scale_denominator);
+ for (int row = mi_row; row < mi_row_end; row += row_step) {
+ for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+ const TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+ int64_t tpl_pred_error[INTER_REFS_PER_FRAME] = { 0 };
+      // Find the winning ref frame idx for the current block
+ int64_t best_inter_cost = this_stats->pred_error[0];
+ int best_rf_idx = 0;
+ for (int idx = 1; idx < INTER_REFS_PER_FRAME; ++idx) {
+ if ((this_stats->pred_error[idx] < best_inter_cost) &&
+ (this_stats->pred_error[idx] != 0)) {
+ best_inter_cost = this_stats->pred_error[idx];
+ best_rf_idx = idx;
+ }
+ }
+ // tpl_pred_error is the pred_error reduction of best_ref w.r.t.
+ // LAST_FRAME.
+ tpl_pred_error[best_rf_idx] = this_stats->pred_error[best_rf_idx] -
+ this_stats->pred_error[LAST_FRAME - 1];
+
+ for (int rf_idx = 1; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx)
+ inter_cost[rf_idx] += tpl_pred_error[rf_idx];
+ }
+ }
+
+ int rank_index[INTER_REFS_PER_FRAME - 1];
+ for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) {
+ rank_index[idx] = idx + 1;
+ for (int i = idx; i > 0; --i) {
+ if (inter_cost[rank_index[i - 1]] > inter_cost[rank_index[i]]) {
+ const int tmp = rank_index[i - 1];
+ rank_index[i - 1] = rank_index[i];
+ rank_index[i] = tmp;
+ }
+ }
+ }
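+  // rank_index now orders the non-LAST references by ascending accumulated
+  // inter_cost, i.e. roughly from the largest to the smallest TPL coding gain
+  // over LAST_FRAME for this superblock.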
+
+ x->tpl_keep_ref_frame[INTRA_FRAME] = 1;
+ x->tpl_keep_ref_frame[LAST_FRAME] = 1;
+
+ int cutoff_ref = 0;
+ for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) {
+ x->tpl_keep_ref_frame[rank_index[idx] + LAST_FRAME] = 1;
+ if (idx > 2) {
+ if (!cutoff_ref) {
+        // If the predictive coding gain is smaller than that of the previous,
+        // more relevant frame by a certain amount, discard this frame and all
+        // the frames after it.
+ if (llabs(inter_cost[rank_index[idx]]) <
+ llabs(inter_cost[rank_index[idx - 1]]) / 8 ||
+ inter_cost[rank_index[idx]] == 0)
+ cutoff_ref = 1;
+ }
+
+ if (cutoff_ref) x->tpl_keep_ref_frame[rank_index[idx] + LAST_FRAME] = 0;
+ }
+ }
+}
+
+static AOM_INLINE void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x,
+ int mi_row, int mi_col) {
+ const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size;
+ const int orig_rdmult = cpi->rd.RDMULT;
+
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int gf_group_index = cpi->gf_frame_index;
+ if (cpi->oxcf.algo_cfg.enable_tpl_model && cpi->oxcf.q_cfg.aq_mode == NO_AQ &&
+ cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q && gf_group_index > 0 &&
+ cpi->ppi->gf_group.update_type[gf_group_index] == ARF_UPDATE) {
+ const int dr =
+ av1_get_rdmult_delta(cpi, sb_size, mi_row, mi_col, orig_rdmult);
+ x->rdmult = dr;
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_RT_ML_PARTITIONING
+// Get a prediction(stored in x->est_pred) for the whole superblock.
+static void get_estimated_pred(AV1_COMP *cpi, const TileInfo *const tile,
+ MACROBLOCK *x, int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int is_key_frame = frame_is_intra_only(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ // TODO(kyslov) Extend to 128x128
+ assert(cm->seq_params->sb_size == BLOCK_64X64);
+
+ av1_set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+
+ if (!is_key_frame) {
+ MB_MODE_INFO *mi = xd->mi[0];
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+
+ assert(yv12 != NULL);
+
+ av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ get_ref_scale_factors(cm, LAST_FRAME), 1);
+ mi->ref_frame[0] = LAST_FRAME;
+ mi->ref_frame[1] = NONE;
+ mi->bsize = BLOCK_64X64;
+ mi->mv[0].as_int = 0;
+ mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+
+ xd->plane[0].dst.buf = x->est_pred;
+ xd->plane[0].dst.stride = 64;
+ av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+ } else {
+#if CONFIG_AV1_HIGHBITDEPTH
+ switch (xd->bd) {
+ case 8: memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); break;
+ case 10:
+ memset(x->est_pred, 128 * 4, 64 * 64 * sizeof(x->est_pred[0]));
+ break;
+ case 12:
+ memset(x->est_pred, 128 * 16, 64 * 64 * sizeof(x->est_pred[0]));
+ break;
+ }
+#else
+ memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0]));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+ }
+}
+#endif // CONFIG_RT_ML_PARTITIONING
+
+#define AVG_CDF_WEIGHT_LEFT 3
+#define AVG_CDF_WEIGHT_TOP_RIGHT 1
+
+/*!\brief Encode a superblock (minimal RD search involved)
+ *
+ * \ingroup partition_search
+ * Encodes the superblock with a pre-determined partition pattern; only minor
+ * rd-based searches are allowed to adjust the initial pattern. It is only
+ * used by realtime encoding.
+ */
+static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ const int mi_row, const int mi_col,
+ const int seg_skip) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
+ get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ PC_TREE *const pc_root = td->pc_root;
+
+#if CONFIG_RT_ML_PARTITIONING
+ if (sf->part_sf.partition_search_type == ML_BASED_PARTITION) {
+ RD_STATS dummy_rdc;
+ get_estimated_pred(cpi, tile_info, x, mi_row, mi_col);
+ av1_nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+ BLOCK_64X64, &dummy_rdc, 1, INT64_MAX, pc_root);
+ return;
+ }
+#endif
+ // Set the partition
+ if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip ||
+ (sf->rt_sf.use_fast_fixed_part &&
+ x->content_state_sb.source_sad_nonrd < kMedSad)) {
+ // set a fixed-size partition
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ BLOCK_SIZE bsize_select = sf->part_sf.fixed_partition_size;
+ if (sf->rt_sf.use_fast_fixed_part &&
+ x->content_state_sb.source_sad_nonrd < kLowSad) {
+ bsize_select = BLOCK_64X64;
+ }
+ const BLOCK_SIZE bsize = seg_skip ? sb_size : bsize_select;
+ av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ } else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
+ // set a variance-based partition
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
+ }
+ assert(sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip ||
+ sf->part_sf.partition_search_type == VAR_BASED_PARTITION);
+ set_cb_offsets(td->mb.cb_offset, 0, 0);
+
+ // Initialize the flag to skip cdef to 1.
+ if (sf->rt_sf.skip_cdef_sb) {
+ const int block64_in_sb = (sb_size == BLOCK_128X128) ? 2 : 1;
+    // If a 128x128 block is used, we need to set the flag for all four 64x64
+    // sub-blocks.
+ for (int r = 0; r < block64_in_sb; ++r) {
+ for (int c = 0; c < block64_in_sb; ++c) {
+ const int idx_in_sb =
+ r * MI_SIZE_64X64 * cm->mi_params.mi_stride + c * MI_SIZE_64X64;
+ if (mi[idx_in_sb]) mi[idx_in_sb]->cdef_strength = 1;
+ }
+ }
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, nonrd_use_partition_time);
+#endif
+ av1_nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+ pc_root);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, nonrd_use_partition_time);
+#endif
+}
+
+// This function initializes the stats for encode_rd_sb.
+static INLINE void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
+ const TileDataEnc *tile_data,
+ SIMPLE_MOTION_DATA_TREE *sms_root,
+ RD_STATS *rd_cost, int mi_row, int mi_col,
+ int gather_tpl_data) {
+ const AV1_COMMON *cm = &cpi->common;
+ const TileInfo *tile_info = &tile_data->tile_info;
+ MACROBLOCK *x = &td->mb;
+
+ const SPEED_FEATURES *sf = &cpi->sf;
+ const int use_simple_motion_search =
+ (sf->part_sf.simple_motion_search_split ||
+ sf->part_sf.simple_motion_search_prune_rect ||
+ sf->part_sf.simple_motion_search_early_term_none ||
+ sf->part_sf.ml_early_term_after_part_split_level) &&
+ !frame_is_intra_only(cm);
+ if (use_simple_motion_search) {
+ av1_init_simple_motion_search_mvs_for_sb(cpi, tile_info, x, sms_root,
+ mi_row, mi_col);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (!(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+ cpi->oxcf.gf_cfg.lag_in_frames == 0)) {
+ init_ref_frame_space(cpi, td, mi_row, mi_col);
+ x->sb_energy_level = 0;
+ x->part_search_info.cnn_output_valid = 0;
+ if (gather_tpl_data) {
+ if (cm->delta_q_info.delta_q_present_flag) {
+ const int num_planes = av1_num_planes(cm);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ setup_delta_q(cpi, td, x, tile_info, mi_row, mi_col, num_planes);
+ av1_tpl_rdmult_setup_sb(cpi, x, sb_size, mi_row, mi_col);
+ }
+
+ // TODO(jingning): revisit this function.
+ if (cpi->oxcf.algo_cfg.enable_tpl_model && (0)) {
+ adjust_rdmult_tpl_model(cpi, x, mi_row, mi_col);
+ }
+ }
+ }
+#else
+ (void)tile_info;
+ (void)mi_row;
+ (void)mi_col;
+ (void)gather_tpl_data;
+#endif
+
+ x->reuse_inter_pred = false;
+ x->txfm_search_params.mode_eval_type = DEFAULT_EVAL;
+ reset_mb_rd_record(x->txfm_search_info.mb_rd_record);
+ av1_zero(x->picked_ref_frames_mask);
+ av1_invalid_rd_stats(rd_cost);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void sb_qp_sweep_init_quantizers(AV1_COMP *cpi, ThreadData *td,
+ const TileDataEnc *tile_data,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ RD_STATS *rd_cost, int mi_row,
+ int mi_col, int delta_qp_ofs) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const TileInfo *tile_info = &tile_data->tile_info;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ assert(delta_q_info->delta_q_present_flag);
+ const int delta_q_res = delta_q_info->delta_q_res;
+
+ const SPEED_FEATURES *sf = &cpi->sf;
+ const int use_simple_motion_search =
+ (sf->part_sf.simple_motion_search_split ||
+ sf->part_sf.simple_motion_search_prune_rect ||
+ sf->part_sf.simple_motion_search_early_term_none ||
+ sf->part_sf.ml_early_term_after_part_split_level) &&
+ !frame_is_intra_only(cm);
+ if (use_simple_motion_search) {
+ av1_init_simple_motion_search_mvs_for_sb(cpi, tile_info, x, sms_tree,
+ mi_row, mi_col);
+ }
+
+ int current_qindex = x->rdmult_cur_qindex + delta_qp_ofs;
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ current_qindex = av1_adjust_q_from_delta_q_res(
+ delta_q_res, xd->current_base_qindex, current_qindex);
+
+ x->delta_qindex = current_qindex - cm->quant_params.base_qindex;
+
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ xd->mi[0]->current_qindex = current_qindex;
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id, 0);
+
+ // keep track of any non-zero delta-q used
+ td->deltaq_used |= (x->delta_qindex != 0);
+
+ if (cpi->oxcf.tool_cfg.enable_deltalf_mode) {
+ const int delta_lf_res = delta_q_info->delta_lf_res;
+ const int lfmask = ~(delta_lf_res - 1);
+ const int delta_lf_from_base =
+ ((x->delta_qindex / 4 + delta_lf_res / 2) & lfmask);
+ const int8_t delta_lf =
+ (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ const int mib_size = cm->seq_params->mib_size;
+
+    // Pre-set the delta lf for the loop filter. Note that this value is set
+    // before mi is assigned for each block in the current superblock.
+ for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) {
+ for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) {
+ const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k);
+ mi_params->mi_alloc[grid_idx].delta_lf_from_base = delta_lf;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ mi_params->mi_alloc[grid_idx].delta_lf[lf_id] = delta_lf;
+ }
+ }
+ }
+ }
+
+ x->reuse_inter_pred = false;
+ x->txfm_search_params.mode_eval_type = DEFAULT_EVAL;
+ reset_mb_rd_record(x->txfm_search_info.mb_rd_record);
+ av1_zero(x->picked_ref_frames_mask);
+ av1_invalid_rd_stats(rd_cost);
+}
+
+static int sb_qp_sweep(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ SB_FIRST_PASS_STATS *sb_org_stats) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ RD_STATS rdc_winner, cur_rdc;
+ av1_invalid_rd_stats(&rdc_winner);
+
+ int best_qindex = td->mb.rdmult_delta_qindex;
+ const int start = cm->current_frame.frame_type == KEY_FRAME ? -20 : -12;
+ const int end = cm->current_frame.frame_type == KEY_FRAME ? 20 : 12;
+ const int step = cm->delta_q_info.delta_q_res;
+
+ for (int sweep_qp_delta = start; sweep_qp_delta <= end;
+ sweep_qp_delta += step) {
+ sb_qp_sweep_init_quantizers(cpi, td, tile_data, sms_tree, &cur_rdc, mi_row,
+ mi_col, sweep_qp_delta);
+
+ const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+ const int backup_current_qindex =
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
+
+ av1_reset_mbmi(&cm->mi_params, bsize, mi_row, mi_col);
+ av1_restore_sb_state(sb_org_stats, cpi, td, tile_data, mi_row, mi_col);
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex = backup_current_qindex;
+
+ td->pc_root = av1_alloc_pc_tree_node(bsize);
+ if (!td->pc_root)
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
+ &cur_rdc, cur_rdc, td->pc_root, sms_tree, NULL,
+ SB_DRY_PASS, NULL);
+
+ if ((rdc_winner.rdcost > cur_rdc.rdcost) ||
+ (abs(sweep_qp_delta) < abs(best_qindex - x->rdmult_delta_qindex) &&
+ rdc_winner.rdcost == cur_rdc.rdcost)) {
+ rdc_winner = cur_rdc;
+ best_qindex = x->rdmult_delta_qindex + sweep_qp_delta;
+ }
+ }
+
+ return best_qindex;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+/*!\brief Encode a superblock (RD-search-based)
+ *
+ * \ingroup partition_search
+ * Conducts partition search for a superblock, based on rate-distortion costs,
+ * from scratch or adjusting from a pre-calculated partition pattern.
+ */
+static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ const int mi_row, const int mi_col,
+ const int seg_skip) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
+ get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const int num_planes = av1_num_planes(cm);
+ int dummy_rate;
+ int64_t dummy_dist;
+ RD_STATS dummy_rdc;
+ SIMPLE_MOTION_DATA_TREE *const sms_root = td->sms_root;
+
+#if CONFIG_REALTIME_ONLY
+ (void)seg_skip;
+#endif // CONFIG_REALTIME_ONLY
+
+ init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row, mi_col,
+ 1);
+
+ // Encode the superblock
+ if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
+ // partition search starting from a variance-based partition
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_use_partition_time);
+#endif
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+ &dummy_rate, &dummy_dist, 1, td->pc_root);
+ av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0,
+ sf->part_sf.partition_search_type);
+ td->pc_root = NULL;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_use_partition_time);
+#endif
+ }
+#if !CONFIG_REALTIME_ONLY
+ else if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) {
+ // partition search by adjusting a fixed-size partition
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ const BLOCK_SIZE bsize =
+ seg_skip ? sb_size : sf->part_sf.fixed_partition_size;
+ av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+ &dummy_rate, &dummy_dist, 1, td->pc_root);
+ av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0,
+ sf->part_sf.partition_search_type);
+ td->pc_root = NULL;
+ } else {
+ // The most exhaustive recursive partition search
+ SuperBlockEnc *sb_enc = &x->sb_enc;
+ // No stats for overlay frames. Exclude key frame.
+ av1_get_tpl_stats_sb(cpi, sb_size, mi_row, mi_col, sb_enc);
+
+ // Reset the tree for simple motion search data
+ av1_reset_simple_motion_tree_partition(sms_root, sb_size);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_pick_partition_time);
+#endif
+
+ // Estimate the maximum square partition block size, which will be used
+ // as the starting block size for partitioning the sb
+ set_max_min_partition_size(sb_enc, cpi, x, sf, sb_size, mi_row, mi_col);
+
+ // The superblock can be searched only once, or twice consecutively for
+ // better quality. Note that the meaning of passes here is different from
+ // the general concept of 1-pass/2-pass encoders.
+ const int num_passes =
+ cpi->oxcf.unit_test_cfg.sb_multipass_unit_test ? 2 : 1;
+
+ if (cpi->oxcf.sb_qp_sweep &&
+ !(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+ cpi->oxcf.gf_cfg.lag_in_frames == 0) &&
+ cm->delta_q_info.delta_q_present_flag) {
+ AOM_CHECK_MEM_ERROR(
+ x->e_mbd.error_info, td->mb.sb_stats_cache,
+ (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_stats_cache)));
+ av1_backup_sb_state(td->mb.sb_stats_cache, cpi, td, tile_data, mi_row,
+ mi_col);
+ assert(x->rdmult_delta_qindex == x->delta_qindex);
+
+ const int best_qp_diff =
+ sb_qp_sweep(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, sms_root,
+ td->mb.sb_stats_cache) -
+ x->rdmult_delta_qindex;
+
+ sb_qp_sweep_init_quantizers(cpi, td, tile_data, sms_root, &dummy_rdc,
+ mi_row, mi_col, best_qp_diff);
+
+ const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+ const int backup_current_qindex =
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
+
+ av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col);
+ av1_restore_sb_state(td->mb.sb_stats_cache, cpi, td, tile_data, mi_row,
+ mi_col);
+
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex =
+ backup_current_qindex;
+ aom_free(td->mb.sb_stats_cache);
+ td->mb.sb_stats_cache = NULL;
+ }
+ if (num_passes == 1) {
+#if CONFIG_PARTITION_SEARCH_ORDER
+ if (cpi->ext_part_controller.ready && !frame_is_intra_only(cm)) {
+ av1_reset_part_sf(&cpi->sf.part_sf);
+ av1_reset_sf_for_ext_part(cpi);
+ RD_STATS this_rdc;
+ av1_rd_partition_search(cpi, td, tile_data, tp, sms_root, mi_row,
+ mi_col, sb_size, &this_rdc);
+ } else {
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+ &dummy_rdc, dummy_rdc, td->pc_root, sms_root,
+ NULL, SB_SINGLE_PASS, NULL);
+ }
+#else
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+ &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL,
+ SB_SINGLE_PASS, NULL);
+#endif // CONFIG_PARTITION_SEARCH_ORDER
+ } else {
+ // First pass
+ AOM_CHECK_MEM_ERROR(
+ x->e_mbd.error_info, td->mb.sb_fp_stats,
+ (SB_FIRST_PASS_STATS *)aom_malloc(sizeof(*td->mb.sb_fp_stats)));
+ av1_backup_sb_state(td->mb.sb_fp_stats, cpi, td, tile_data, mi_row,
+ mi_col);
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+ &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL,
+ SB_DRY_PASS, NULL);
+
+ // Second pass
+ init_encode_rd_sb(cpi, td, tile_data, sms_root, &dummy_rdc, mi_row,
+ mi_col, 0);
+ av1_reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col);
+ av1_reset_simple_motion_tree_partition(sms_root, sb_size);
+
+ av1_restore_sb_state(td->mb.sb_fp_stats, cpi, td, tile_data, mi_row,
+ mi_col);
+
+ td->pc_root = av1_alloc_pc_tree_node(sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+ &dummy_rdc, dummy_rdc, td->pc_root, sms_root, NULL,
+ SB_WET_PASS, NULL);
+ aom_free(td->mb.sb_fp_stats);
+ td->mb.sb_fp_stats = NULL;
+ }
+
+    // Reset to 0 so that it is not mistakenly used elsewhere.
+ sb_enc->tpl_data_count = 0;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_pick_partition_time);
+#endif
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ // Update the inter rd model
+ // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 &&
+ cm->tiles.cols == 1 && cm->tiles.rows == 1) {
+ av1_inter_mode_data_fit(tile_data, x->rdmult);
+ }
+}
+
+// Check if the cost update level of the mode, coeff, and dv symbols is
+// tile-level or off.
+static AOM_INLINE int is_mode_coeff_dv_upd_freq_tile_or_off(
+ const AV1_COMP *const cpi) {
+ const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf;
+
+ return (inter_sf->coeff_cost_upd_level <= INTERNAL_COST_UPD_TILE &&
+ inter_sf->mode_cost_upd_level <= INTERNAL_COST_UPD_TILE &&
+ cpi->sf.intra_sf.dv_cost_upd_level <= INTERNAL_COST_UPD_TILE);
+}
+
+// When row-mt is enabled and the cost update frequencies are set to off/tile,
+// processing of the current SB can start even before processing of the
+// top-right SB is finished. This function checks whether it is sufficient to
+// wait only for the top SB to finish processing before the current SB starts.
+static AOM_INLINE int delay_wait_for_top_right_sb(const AV1_COMP *const cpi) {
+ const MODE mode = cpi->oxcf.mode;
+ if (mode == GOOD) return 0;
+
+ if (mode == ALLINTRA)
+ return is_mode_coeff_dv_upd_freq_tile_or_off(cpi);
+ else if (mode == REALTIME)
+ return (is_mode_coeff_dv_upd_freq_tile_or_off(cpi) &&
+ cpi->sf.inter_sf.mv_cost_upd_level <= INTERNAL_COST_UPD_TILE);
+ else
+ return 0;
+}
+
+/*!\brief Calculate source SAD at superblock level using 64x64 block source SAD
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ */
+static AOM_INLINE uint64_t get_sb_source_sad(const AV1_COMP *cpi, int mi_row,
+ int mi_col) {
+ if (cpi->src_sad_blk_64x64 == NULL) return UINT64_MAX;
+
+ const AV1_COMMON *const cm = &cpi->common;
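+  // A 128x128 superblock spans a 2x2 grid of 64x64 SAD blocks; a 64x64
+  // superblock maps 1:1 onto a single entry.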
+ const int blk_64x64_in_mis = (cm->seq_params->sb_size == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int num_blk_64x64_cols =
+ (cm->mi_params.mi_cols + blk_64x64_in_mis - 1) / blk_64x64_in_mis;
+ const int num_blk_64x64_rows =
+ (cm->mi_params.mi_rows + blk_64x64_in_mis - 1) / blk_64x64_in_mis;
+ const int blk_64x64_col_index = mi_col / blk_64x64_in_mis;
+ const int blk_64x64_row_index = mi_row / blk_64x64_in_mis;
+ uint64_t curr_sb_sad = UINT64_MAX;
+ const uint64_t *const src_sad_blk_64x64_data =
+ &cpi->src_sad_blk_64x64[blk_64x64_col_index +
+ blk_64x64_row_index * num_blk_64x64_cols];
+ if (cm->seq_params->sb_size == BLOCK_128X128 &&
+ blk_64x64_col_index + 1 < num_blk_64x64_cols &&
+ blk_64x64_row_index + 1 < num_blk_64x64_rows) {
+ // Calculate SB source SAD by accumulating source SAD of 64x64 blocks in the
+ // superblock
+ curr_sb_sad = src_sad_blk_64x64_data[0] + src_sad_blk_64x64_data[1] +
+ src_sad_blk_64x64_data[num_blk_64x64_cols] +
+ src_sad_blk_64x64_data[num_blk_64x64_cols + 1];
+ } else if (cm->seq_params->sb_size == BLOCK_64X64) {
+ curr_sb_sad = src_sad_blk_64x64_data[0];
+ }
+ return curr_sb_sad;
+}
+
+/*!\brief Determine whether grading content can be skipped based on SAD stats
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ */
+static AOM_INLINE bool is_calc_src_content_needed(AV1_COMP *cpi,
+ MACROBLOCK *const x,
+ int mi_row, int mi_col) {
+ if (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
+ return true;
+ const uint64_t curr_sb_sad = get_sb_source_sad(cpi, mi_row, mi_col);
+ if (curr_sb_sad == UINT64_MAX) return true;
+ if (curr_sb_sad == 0) {
+ x->content_state_sb.source_sad_nonrd = kZeroSad;
+ return false;
+ }
+ AV1_COMMON *const cm = &cpi->common;
+ bool do_calc_src_content = true;
+
+ if (cpi->oxcf.speed < 9) return do_calc_src_content;
+
+ // TODO(yunqing): Tune/validate the thresholds for 128x128 SB size.
+ if (AOMMIN(cm->width, cm->height) < 360) {
+    // Derive the average 64x64 block source SAD from the SB source SAD.
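+    // For 128x128 SBs, curr_sb_sad is the sum of four 64x64 SADs, so
+    // (curr_sb_sad + 2) >> 2 is the rounded average.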
+ const uint64_t avg_64x64_blk_sad =
+ (cm->seq_params->sb_size == BLOCK_128X128) ? ((curr_sb_sad + 2) >> 2)
+ : curr_sb_sad;
+
+    // The thresholds are determined based on the kLowSad and kHighSad
+    // thresholds and test results.
+ const uint64_t thresh_low = 15000;
+ const uint64_t thresh_high = 40000;
+
+ if (avg_64x64_blk_sad > thresh_low && avg_64x64_blk_sad < thresh_high) {
+ do_calc_src_content = false;
+ // Note: set x->content_state_sb.source_sad_rd as well if this is extended
+ // to RTC rd path.
+ x->content_state_sb.source_sad_nonrd = kMedSad;
+ }
+ }
+
+ return do_calc_src_content;
+}
+
+/*!\brief Determine whether grading content is needed based on speed features
+ * and frame stats
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ */
+// TODO(any): consolidate sfs to make interface cleaner
+static AOM_INLINE void grade_source_content_sb(AV1_COMP *cpi,
+ MACROBLOCK *const x,
+ TileDataEnc *tile_data,
+ int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (cm->current_frame.frame_type == KEY_FRAME ||
+ (cpi->ppi->use_svc &&
+ cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) {
+ assert(x->content_state_sb.source_sad_nonrd == kMedSad);
+ assert(x->content_state_sb.source_sad_rd == kMedSad);
+ return;
+ }
+ bool calc_src_content = false;
+
+ if (cpi->sf.rt_sf.source_metrics_sb_nonrd) {
+ if (!cpi->sf.rt_sf.check_scene_detection || cpi->rc.frame_source_sad > 0) {
+ calc_src_content = is_calc_src_content_needed(cpi, x, mi_row, mi_col);
+ } else {
+ x->content_state_sb.source_sad_nonrd = kZeroSad;
+ }
+ } else if ((cpi->sf.rt_sf.var_part_based_on_qidx >= 1) &&
+ (cm->width * cm->height <= 352 * 288)) {
+ if (cpi->rc.frame_source_sad > 0)
+ calc_src_content = true;
+ else
+ x->content_state_sb.source_sad_rd = kZeroSad;
+ }
+ if (calc_src_content)
+ av1_source_content_sb(cpi, x, tile_data, mi_row, mi_col);
+}
+
+/*!\brief Encode a superblock row by breaking it into superblocks
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Do partition and mode search for an sb row: one row of superblocks filling up
+ * the width of the current tile.
+ */
+static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, int mi_row,
+ TokenExtra **tp) {
+ AV1_COMMON *const cm = &cpi->common;
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+ bool row_mt_enabled = mt_info->row_mt_enabled;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const int mib_size = cm->seq_params->mib_size;
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2;
+ const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_sb_row_time);
+#endif
+
+ // Initialize the left context for the new SB row
+ av1_zero_left_context(xd);
+
+  // Reset delta for quantizer and loop filter at the beginning of every tile
+ if (mi_row == tile_info->mi_row_start || row_mt_enabled) {
+ if (cm->delta_q_info.delta_q_present_flag)
+ xd->current_base_qindex = cm->quant_params.base_qindex;
+ if (cm->delta_q_info.delta_lf_present_flag) {
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ }
+ }
+
+ reset_thresh_freq_fact(x);
+
+ // Code each SB in the row
+ for (int mi_col = tile_info->mi_col_start, sb_col_in_tile = 0;
+ mi_col < tile_info->mi_col_end; mi_col += mib_size, sb_col_in_tile++) {
+    // In realtime/allintra mode, when the frequency of cost updates is
+    // off/tile, wait for the top superblock to finish encoding. Otherwise,
+    // wait for the top-right superblock to finish encoding.
+ enc_row_mt->sync_read_ptr(
+ row_mt_sync, sb_row, sb_col_in_tile - delay_wait_for_top_right_sb(cpi));
+
+#if CONFIG_MULTITHREAD
+ if (row_mt_enabled) {
+ pthread_mutex_lock(enc_row_mt->mutex_);
+ const bool row_mt_exit = enc_row_mt->row_mt_exit;
+ pthread_mutex_unlock(enc_row_mt->mutex_);
+ // Exit in case any worker has encountered an error.
+ if (row_mt_exit) return;
+ }
+#endif
+
+ const int update_cdf = tile_data->allow_update_cdf && row_mt_enabled;
+ if (update_cdf && (tile_info->mi_row_start != mi_row)) {
+      if (tile_info->mi_col_start == mi_col) {
+ // restore frame context at the 1st column sb
+ memcpy(xd->tile_ctx, x->row_ctx, sizeof(*xd->tile_ctx));
+ } else {
+ // update context
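+        // Blend the running left context with the CDF context saved by the
+        // SB row above: prefer the top-right SB's context when one exists in
+        // this tile, otherwise fall back to the top SB's.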
+ int wt_left = AVG_CDF_WEIGHT_LEFT;
+ int wt_tr = AVG_CDF_WEIGHT_TOP_RIGHT;
+ if (tile_info->mi_col_end > (mi_col + mib_size))
+ av1_avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile,
+ wt_left, wt_tr);
+ else
+ av1_avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile - 1,
+ wt_left, wt_tr);
+ }
+ }
+
+ // Update the rate cost tables for some symbols
+ av1_set_cost_upd_freq(cpi, td, tile_info, mi_row, mi_col);
+
+ // Reset color coding related parameters
+ av1_zero(x->color_sensitivity_sb);
+ av1_zero(x->color_sensitivity_sb_g);
+ av1_zero(x->color_sensitivity_sb_alt);
+ av1_zero(x->color_sensitivity);
+ x->content_state_sb.source_sad_nonrd = kMedSad;
+ x->content_state_sb.source_sad_rd = kMedSad;
+ x->content_state_sb.lighting_change = 0;
+ x->content_state_sb.low_sumdiff = 0;
+ x->force_zeromv_skip_for_sb = 0;
+ x->sb_me_block = 0;
+ x->sb_me_partition = 0;
+ x->sb_me_mv.as_int = 0;
+
+ if (cpi->oxcf.mode == ALLINTRA) {
+ x->intra_sb_rdmult_modifier = 128;
+ }
+
+ xd->cur_frame_force_integer_mv = cm->features.cur_frame_force_integer_mv;
+ x->source_variance = UINT_MAX;
+ td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
+
+ // Get segment id and skip flag
+ const struct segmentation *const seg = &cm->seg;
+ int seg_skip = 0;
+ if (seg->enabled) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
+ const uint8_t segment_id =
+ map ? get_segment_id(&cm->mi_params, map, sb_size, mi_row, mi_col)
+ : 0;
+ seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+ }
+
+ produce_gradients_for_sb(cpi, x, sb_size, mi_row, mi_col);
+
+ init_src_var_info_of_4x4_sub_blocks(cpi, x->src_var_info_of_4x4_sub_blocks,
+ sb_size);
+
+    // Grade the temporal variation of the SB; the grade will be used to
+    // decide the fast mode search strategy for coding blocks.
+ grade_source_content_sb(cpi, x, tile_data, mi_row, mi_col);
+
+ // encode the superblock
+ if (use_nonrd_mode) {
+ encode_nonrd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip);
+ } else {
+ encode_rd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip);
+ }
+
+ // Update the top-right context in row_mt coding
+ if (update_cdf && (tile_info->mi_row_end > (mi_row + mib_size))) {
+ if (sb_cols_in_tile == 1)
+ memcpy(x->row_ctx, xd->tile_ctx, sizeof(*xd->tile_ctx));
+ else if (sb_col_in_tile >= 1)
+ memcpy(x->row_ctx + sb_col_in_tile - 1, xd->tile_ctx,
+ sizeof(*xd->tile_ctx));
+ }
+ enc_row_mt->sync_write_ptr(row_mt_sync, sb_row, sb_col_in_tile,
+ sb_cols_in_tile);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_sb_row_time);
+#endif
+}
+
+static AOM_INLINE void init_encode_frame_mb_context(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+  // Copy data over into macroblock data structures.
+ av1_setup_src_planes(x, cpi->source, 0, 0, num_planes,
+ cm->seq_params->sb_size);
+
+ av1_setup_block_planes(xd, cm->seq_params->subsampling_x,
+ cm->seq_params->subsampling_y, num_planes);
+}
+
+void av1_alloc_tile_data(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+
+ av1_row_mt_mem_dealloc(cpi);
+
+ aom_free(cpi->tile_data);
+ cpi->allocated_tiles = 0;
+ enc_row_mt->allocated_tile_cols = 0;
+ enc_row_mt->allocated_tile_rows = 0;
+
+ CHECK_MEM_ERROR(
+ cm, cpi->tile_data,
+ aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+
+ cpi->allocated_tiles = tile_cols * tile_rows;
+ enc_row_mt->allocated_tile_cols = tile_cols;
+ enc_row_mt->allocated_tile_rows = tile_rows;
+ for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ const int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+ av1_zero(this_tile->row_mt_sync);
+ this_tile->row_ctx = NULL;
+ }
+ }
+}
+
+void av1_init_tile_data(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int tile_col, tile_row;
+ TokenInfo *const token_info = &cpi->token_info;
+ TokenExtra *pre_tok = token_info->tile_tok[0][0];
+ TokenList *tplist = token_info->tplist[0][0];
+ unsigned int tile_tok = 0;
+ int tplist_count = 0;
+
+ if (!is_stat_generation_stage(cpi) &&
+ cm->features.allow_screen_content_tools) {
+ // Number of tokens for which token info needs to be allocated.
+ unsigned int tokens_required =
+ get_token_alloc(cm->mi_params.mb_rows, cm->mi_params.mb_cols,
+ MAX_SB_SIZE_LOG2, num_planes);
+    // Allocate/reallocate memory for token-related info if the number of
+    // tokens required is more than the number of tokens already allocated.
+    // This could occur in the following cases:
+    // 1) The memory has not yet been allocated.
+    // 2) The frame dimensions have changed.
+ const bool realloc_tokens = tokens_required > token_info->tokens_allocated;
+ if (realloc_tokens) {
+ free_token_info(token_info);
+ alloc_token_info(cm, token_info, tokens_required);
+ pre_tok = token_info->tile_tok[0][0];
+ tplist = token_info->tplist[0][0];
+ }
+ }
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *const tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ TileInfo *const tile_info = &tile_data->tile_info;
+ av1_tile_init(tile_info, cm, tile_row, tile_col);
+ tile_data->firstpass_top_mv = kZeroMv;
+ tile_data->abs_sum_level = 0;
+
+ if (is_token_info_allocated(token_info)) {
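+        // Carve this tile's slice out of the shared token buffer: the tile
+        // starts where the previous tile's worst-case allocation ended.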
+ token_info->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
+ pre_tok = token_info->tile_tok[tile_row][tile_col];
+ tile_tok = allocated_tokens(
+ tile_info, cm->seq_params->mib_size_log2 + MI_SIZE_LOG2,
+ num_planes);
+ token_info->tplist[tile_row][tile_col] = tplist + tplist_count;
+ tplist = token_info->tplist[tile_row][tile_col];
+ tplist_count = av1_get_sb_rows_in_tile(cm, tile_info);
+ }
+ tile_data->allow_update_cdf = !cm->tiles.large_scale;
+ tile_data->allow_update_cdf = tile_data->allow_update_cdf &&
+ !cm->features.disable_cdf_update &&
+ !delay_wait_for_top_right_sb(cpi);
+ tile_data->tctx = *cm->fc;
+ }
+ }
+}
+
+// Populate the start palette token info prior to encoding an SB row.
+static AOM_INLINE void get_token_start(AV1_COMP *cpi, const TileInfo *tile_info,
+ int tile_row, int tile_col, int mi_row,
+ TokenExtra **tp) {
+ const TokenInfo *token_info = &cpi->token_info;
+ if (!is_token_info_allocated(token_info)) return;
+
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ TokenList *const tplist = cpi->token_info.tplist[tile_row][tile_col];
+ const int sb_row_in_tile =
+ (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2;
+
+ get_start_tok(cpi, tile_row, tile_col, mi_row, tp,
+ cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, num_planes);
+ assert(tplist != NULL);
+ tplist[sb_row_in_tile].start = *tp;
+}
+
+// Populate the token count after encoding an SB row.
+static AOM_INLINE void populate_token_count(AV1_COMP *cpi,
+ const TileInfo *tile_info,
+ int tile_row, int tile_col,
+ int mi_row, TokenExtra *tok) {
+ const TokenInfo *token_info = &cpi->token_info;
+ if (!is_token_info_allocated(token_info)) return;
+
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ TokenList *const tplist = token_info->tplist[tile_row][tile_col];
+ const int sb_row_in_tile =
+ (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2;
+ const int tile_mb_cols =
+ (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+ const int num_mb_rows_in_sb =
+ ((1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4;
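+  // The count is the number of tokens actually written for this SB row; the
+  // assert below checks it stays within the worst-case allocation.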
+ tplist[sb_row_in_tile].count =
+ (unsigned int)(tok - tplist[sb_row_in_tile].start);
+
+ assert((unsigned int)(tok - tplist[sb_row_in_tile].start) <=
+ get_token_alloc(num_mb_rows_in_sb, tile_mb_cols,
+ cm->seq_params->mib_size_log2 + MI_SIZE_LOG2,
+ num_planes));
+
+ (void)num_planes;
+ (void)tile_mb_cols;
+ (void)num_mb_rows_in_sb;
+}
+
+/*!\brief Encode a superblock row
+ *
+ * \ingroup partition_search
+ */
+void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row,
+ int tile_col, int mi_row) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+ TokenExtra *tok = NULL;
+
+ get_token_start(cpi, tile_info, tile_row, tile_col, mi_row, &tok);
+
+ encode_sb_row(cpi, td, this_tile, mi_row, &tok);
+
+ populate_token_count(cpi, tile_info, tile_row, tile_col, mi_row, tok);
+}
+
+/*!\brief Encode a tile
+ *
+ * \ingroup partition_search
+ */
+void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
+ int tile_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ TileDataEnc *const this_tile =
+ &cpi->tile_data[tile_row * cm->tiles.cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+
+ if (!cpi->sf.rt_sf.use_nonrd_pick_mode) av1_inter_mode_data_init(this_tile);
+
+ av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start,
+ tile_info->mi_col_end, tile_row);
+ av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
+ &td->mb.e_mbd);
+
+ if (cpi->oxcf.intra_mode_cfg.enable_cfl_intra)
+ cfl_init(&td->mb.e_mbd.cfl, cm->seq_params);
+
+ if (td->mb.txfm_search_info.mb_rd_record != NULL) {
+ av1_crc32c_calculator_init(
+ &td->mb.txfm_search_info.mb_rd_record->crc_calculator);
+ }
+
+ for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
+ mi_row += cm->seq_params->mib_size) {
+ av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
+ }
+ this_tile->abs_sum_level = td->abs_sum_level;
+}
+
+/*!\brief Break one frame into tiles and encode the tiles
+ *
+ * \ingroup partition_search
+ *
+ * \param[in] cpi Top-level encoder structure
+ */
+static AOM_INLINE void encode_tiles(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int tile_col, tile_row;
+
+ MACROBLOCK *const mb = &cpi->td.mb;
+ assert(IMPLIES(cpi->tile_data == NULL,
+ cpi->allocated_tiles < tile_cols * tile_rows));
+ if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi);
+
+ av1_init_tile_data(cpi);
+ av1_alloc_mb_data(cpi, mb);
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *const this_tile =
+ &cpi->tile_data[tile_row * cm->tiles.cols + tile_col];
+ cpi->td.intrabc_used = 0;
+ cpi->td.deltaq_used = 0;
+ cpi->td.abs_sum_level = 0;
+ cpi->td.rd_counts.seg_tmp_pred_cost[0] = 0;
+ cpi->td.rd_counts.seg_tmp_pred_cost[1] = 0;
+ cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+ cpi->td.mb.tile_pb_ctx = &this_tile->tctx;
+ av1_init_rtc_counters(&cpi->td.mb);
+ cpi->td.mb.palette_pixels = 0;
+ av1_encode_tile(cpi, &cpi->td, tile_row, tile_col);
+ if (!frame_is_intra_only(&cpi->common))
+ av1_accumulate_rtc_counters(cpi, &cpi->td.mb);
+ cpi->palette_pixel_num += cpi->td.mb.palette_pixels;
+ cpi->intrabc_used |= cpi->td.intrabc_used;
+ cpi->deltaq_used |= cpi->td.deltaq_used;
+ }
+ }
+
+ av1_dealloc_mb_data(mb, av1_num_planes(cm));
+}
+
+// Set the relative distance of a reference frame w.r.t. the current frame.
+static AOM_INLINE void set_rel_frame_dist(
+ const AV1_COMMON *const cm, RefFrameDistanceInfo *const ref_frame_dist_info,
+ const int ref_frame_flags) {
+ MV_REFERENCE_FRAME ref_frame;
+ int min_past_dist = INT32_MAX, min_future_dist = INT32_MAX;
+ ref_frame_dist_info->nearest_past_ref = NONE_FRAME;
+ ref_frame_dist_info->nearest_future_ref = NONE_FRAME;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] = 0;
+ if (ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+ int dist = av1_encoder_get_relative_dist(
+ cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME],
+ cm->current_frame.display_order_hint);
+ ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] = dist;
+ // Get the nearest ref_frame in the past
+ if (abs(dist) < min_past_dist && dist < 0) {
+ ref_frame_dist_info->nearest_past_ref = ref_frame;
+ min_past_dist = abs(dist);
+ }
+ // Get the nearest ref_frame in the future
+ if (dist < min_future_dist && dist > 0) {
+ ref_frame_dist_info->nearest_future_ref = ref_frame;
+ min_future_dist = dist;
+ }
+ }
+ }
+}
+
+static INLINE int refs_are_one_sided(const AV1_COMMON *cm) {
+ assert(!frame_is_intra_only(cm));
+
+ int one_sided_refs = 1;
+ const int cur_display_order_hint = cm->current_frame.display_order_hint;
+ for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
+ if (buf == NULL) continue;
+ if (av1_encoder_get_relative_dist(buf->display_order_hint,
+ cur_display_order_hint) > 0) {
+ one_sided_refs = 0; // bwd reference
+ break;
+ }
+ }
+ return one_sided_refs;
+}
+
+static INLINE void get_skip_mode_ref_offsets(const AV1_COMMON *cm,
+ int ref_order_hint[2]) {
+ const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info;
+ ref_order_hint[0] = ref_order_hint[1] = 0;
+ if (!skip_mode_info->skip_mode_allowed) return;
+
+ const RefCntBuffer *const buf_0 =
+ get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_0);
+ const RefCntBuffer *const buf_1 =
+ get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_1);
+ assert(buf_0 != NULL && buf_1 != NULL);
+
+ ref_order_hint[0] = buf_0->order_hint;
+ ref_order_hint[1] = buf_1->order_hint;
+}
+
+static int check_skip_mode_enabled(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ av1_setup_skip_mode_allowed(cm);
+ if (!cm->current_frame.skip_mode_info.skip_mode_allowed) return 0;
+
+  // Turn off skip mode if the temporal distances of the reference pair to the
+  // current frame differ by more than 1 frame.
+ const int cur_offset = (int)cm->current_frame.order_hint;
+ int ref_offset[2];
+ get_skip_mode_ref_offsets(cm, ref_offset);
+ const int cur_to_ref0 = get_relative_dist(&cm->seq_params->order_hint_info,
+ cur_offset, ref_offset[0]);
+ const int cur_to_ref1 = abs(get_relative_dist(
+ &cm->seq_params->order_hint_info, cur_offset, ref_offset[1]));
+ if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0;
+
+ // High Latency: Turn off skip mode if all refs are fwd.
+ if (cpi->all_one_sided_refs && cpi->oxcf.gf_cfg.lag_in_frames > 0) return 0;
+
+ const int ref_frame[2] = {
+ cm->current_frame.skip_mode_info.ref_frame_idx_0 + LAST_FRAME,
+ cm->current_frame.skip_mode_info.ref_frame_idx_1 + LAST_FRAME
+ };
+ if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[0]]) ||
+ !(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[1]]))
+ return 0;
+
+ return 1;
+}
+
+static AOM_INLINE void set_default_interp_skip_flags(
+ const AV1_COMMON *cm, InterpSearchFlags *interp_search_flags) {
+ const int num_planes = av1_num_planes(cm);
+ interp_search_flags->default_interp_skip_flags =
+ (num_planes == 1) ? INTERP_SKIP_LUMA_EVAL_CHROMA
+ : INTERP_SKIP_LUMA_SKIP_CHROMA;
+}
+
+static AOM_INLINE void setup_prune_ref_frame_mask(AV1_COMP *cpi) {
+ if ((!cpi->oxcf.ref_frm_cfg.enable_onesided_comp ||
+ cpi->sf.inter_sf.disable_onesided_comp) &&
+ cpi->all_one_sided_refs) {
+ // Disable all compound references
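+    // The mask sets bits [REF_FRAMES, MODE_CTX_REF_FRAMES), i.e. every
+    // compound reference index.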
+ cpi->prune_ref_frame_mask = (1 << MODE_CTX_REF_FRAMES) - (1 << REF_FRAMES);
+ } else if (!cpi->sf.rt_sf.use_nonrd_pick_mode &&
+ cpi->sf.inter_sf.selective_ref_frame >= 2) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int cur_frame_display_order_hint =
+ cm->current_frame.display_order_hint;
+ unsigned int *ref_display_order_hint =
+ cm->cur_frame->ref_display_order_hint;
+ const int arf2_dist = av1_encoder_get_relative_dist(
+ ref_display_order_hint[ALTREF2_FRAME - LAST_FRAME],
+ cur_frame_display_order_hint);
+ const int bwd_dist = av1_encoder_get_relative_dist(
+ ref_display_order_hint[BWDREF_FRAME - LAST_FRAME],
+ cur_frame_display_order_hint);
+
+ for (int ref_idx = REF_FRAMES; ref_idx < MODE_CTX_REF_FRAMES; ++ref_idx) {
+ MV_REFERENCE_FRAME rf[2];
+ av1_set_ref_frame(rf, ref_idx);
+ if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) ||
+ !(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]])) {
+ continue;
+ }
+
+ if (!cpi->all_one_sided_refs) {
+ int ref_dist[2];
+ for (int i = 0; i < 2; ++i) {
+ ref_dist[i] = av1_encoder_get_relative_dist(
+ ref_display_order_hint[rf[i] - LAST_FRAME],
+ cur_frame_display_order_hint);
+ }
+
+ // One-sided compound is used only when all reference frames are
+ // one-sided.
+ if ((ref_dist[0] > 0) == (ref_dist[1] > 0)) {
+ cpi->prune_ref_frame_mask |= 1 << ref_idx;
+ }
+ }
+
+ if (cpi->sf.inter_sf.selective_ref_frame >= 4 &&
+ (rf[0] == ALTREF2_FRAME || rf[1] == ALTREF2_FRAME) &&
+ (cpi->ref_frame_flags & av1_ref_frame_flag_list[BWDREF_FRAME])) {
+ // Check if both ALTREF2_FRAME and BWDREF_FRAME are future references.
+ if (arf2_dist > 0 && bwd_dist > 0 && bwd_dist <= arf2_dist) {
+ // Drop ALTREF2_FRAME as a reference if BWDREF_FRAME is a closer
+ // reference to the current frame than ALTREF2_FRAME
+ cpi->prune_ref_frame_mask |= 1 << ref_idx;
+ }
+ }
+ }
+ }
+}
+
+static int allow_deltaq_mode(AV1_COMP *cpi) {
+#if !CONFIG_REALTIME_ONLY
+ AV1_COMMON *const cm = &cpi->common;
+ BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ int sbs_wide = mi_size_wide[sb_size];
+ int sbs_high = mi_size_high[sb_size];
+
+ int64_t delta_rdcost = 0;
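+  // Accumulate the estimated rdcost change from objective delta-q over every
+  // superblock; enable delta-q only if the net change is a saving.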
+ for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += sbs_high) {
+ for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += sbs_wide) {
+ int64_t this_delta_rdcost = 0;
+ av1_get_q_for_deltaq_objective(cpi, &cpi->td, &this_delta_rdcost, sb_size,
+ mi_row, mi_col);
+ delta_rdcost += this_delta_rdcost;
+ }
+ }
+ return delta_rdcost < 0;
+#else
+ (void)cpi;
+ return 1;
+#endif // !CONFIG_REALTIME_ONLY
+}
+
+#define FORCE_ZMV_SKIP_128X128_BLK_DIFF 10000
+#define FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF 4
+
+// Populates block level thresholds for force zeromv-skip decision
+static void populate_thresh_to_force_zeromv_skip(AV1_COMP *cpi) {
+ if (cpi->sf.rt_sf.part_early_exit_zeromv == 0) return;
+
+  // The thresholds for forcing the zeromv-skip decision are as below:
+  // For 128x128 blocks, the threshold is 10000 and the per pixel threshold is
+  // 0.6103. For 64x64 blocks, the threshold is 5000 and the per pixel
+  // threshold is 1.221, allowing slightly higher error for smaller blocks.
+  //   per_pixel_thresh(64x64) / per_pixel_thresh(128x128)
+  //     = sqrt(area(64x64) / area(128x128)) = sqrt(1/4) = 1/2
+  // Thus, the per pixel thresholds for blocks of size 32x32, 16x16, ... can
+  // be chosen as 2.442, 4.884, .... As the per pixel error tends to be higher
+  // for small blocks, these thresholds are clipped to 4.
+ const unsigned int thresh_exit_128x128_part = FORCE_ZMV_SKIP_128X128_BLK_DIFF;
+ const int num_128x128_pix =
+ block_size_wide[BLOCK_128X128] * block_size_high[BLOCK_128X128];
+
+ for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; bsize++) {
+ const int num_block_pix = block_size_wide[bsize] * block_size_high[bsize];
+
+ // Calculate the threshold for zeromv-skip decision based on area of the
+ // partition
+ unsigned int thresh_exit_part_blk =
+ (unsigned int)(thresh_exit_128x128_part *
+ sqrt((double)num_block_pix / num_128x128_pix) +
+ 0.5);
+ thresh_exit_part_blk = AOMMIN(
+ thresh_exit_part_blk,
+ (unsigned int)(FORCE_ZMV_SKIP_MAX_PER_PIXEL_DIFF * num_block_pix));
+ cpi->zeromv_skip_thresh_exit_part[bsize] = thresh_exit_part_blk;
+ }
+}
+
+static void free_block_hash_buffers(uint32_t *block_hash_values[2][2],
+ int8_t *is_block_same[2][3]) {
+ for (int k = 0; k < 2; ++k) {
+ for (int j = 0; j < 2; ++j) {
+ aom_free(block_hash_values[k][j]);
+ }
+
+ for (int j = 0; j < 3; ++j) {
+ aom_free(is_block_same[k][j]);
+ }
+ }
+}
+
+/*!\brief Encoder setup (only for the current frame), encoding, and
+ * reconstruction for a single frame
+ *
+ * \ingroup high_level_algo
+ */
+static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) {
+ ThreadData *const td = &cpi->td;
+ MACROBLOCK *const x = &td->mb;
+ AV1_COMMON *const cm = &cpi->common;
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+ FeatureFlags *const features = &cm->features;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+#if CONFIG_FPMT_TEST
+ FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs;
+ FrameProbInfo *const temp_frame_probs_simulation =
+ &cpi->ppi->temp_frame_probs_simulation;
+#endif
+ FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs;
+ IntraBCHashInfo *const intrabc_hash_info = &x->intrabc_hash_info;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const DELTAQ_MODE deltaq_mode = oxcf->q_cfg.deltaq_mode;
+ int i;
+
+ if (!cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ mi_params->setup_mi(mi_params);
+ }
+
+ set_mi_offsets(mi_params, xd, 0, 0);
+
+ av1_zero(*td->counts);
+ av1_zero(rdc->tx_type_used);
+ av1_zero(rdc->obmc_used);
+ av1_zero(rdc->warped_used);
+ av1_zero(rdc->seg_tmp_pred_cost);
+
+ // Reset the flag.
+ cpi->intrabc_used = 0;
+ // Need to disable intrabc when superres is selected
+ if (av1_superres_scaled(cm)) {
+ features->allow_intrabc = 0;
+ }
+
+ features->allow_intrabc &= (oxcf->kf_cfg.enable_intrabc);
+
+ if (features->allow_warped_motion &&
+ cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ int warped_probability =
+#if CONFIG_FPMT_TEST
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE
+ ? temp_frame_probs->warped_probs[update_type]
+ :
+#endif // CONFIG_FPMT_TEST
+ frame_probs->warped_probs[update_type];
+ if (warped_probability < cpi->sf.inter_sf.prune_warped_prob_thresh)
+ features->allow_warped_motion = 0;
+ }
+
+ int hash_table_created = 0;
+ if (!is_stat_generation_stage(cpi) && av1_use_hash_me(cpi) &&
+ !cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ // TODO(any): move this outside of the recoding loop to avoid recalculating
+ // the hash table.
+ // add to hash table
+ const int pic_width = cpi->source->y_crop_width;
+ const int pic_height = cpi->source->y_crop_height;
+ uint32_t *block_hash_values[2][2] = { { NULL } };
+ int8_t *is_block_same[2][3] = { { NULL } };
+ int k, j;
+ bool error = false;
+
+ for (k = 0; k < 2 && !error; ++k) {
+ for (j = 0; j < 2; ++j) {
+ block_hash_values[k][j] = (uint32_t *)aom_malloc(
+ sizeof(*block_hash_values[0][0]) * pic_width * pic_height);
+ if (!block_hash_values[k][j]) {
+ error = true;
+ break;
+ }
+ }
+
+ for (j = 0; j < 3 && !error; ++j) {
+ is_block_same[k][j] = (int8_t *)aom_malloc(
+ sizeof(*is_block_same[0][0]) * pic_width * pic_height);
+ if (!is_block_same[k][j]) error = true;
+ }
+ }
+
+ av1_hash_table_init(intrabc_hash_info);
+ if (error ||
+ !av1_hash_table_create(&intrabc_hash_info->intrabc_hash_table)) {
+ free_block_hash_buffers(block_hash_values, is_block_same);
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating intrabc_hash_table and buffers");
+ }
+ hash_table_created = 1;
+ av1_generate_block_2x2_hash_value(intrabc_hash_info, cpi->source,
+ block_hash_values[0], is_block_same[0]);
+    // Hash data generated for screen content is used for intraBC ME.
+ const int min_alloc_size = block_size_wide[mi_params->mi_alloc_bsize];
+ const int max_sb_size =
+ (1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2));
+ int src_idx = 0;
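+    // Build hash values bottom-up: hashes for size 2*N blocks are derived
+    // from the size-N hashes, ping-ponging between the two buffer sets.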
+ for (int size = 4; size <= max_sb_size; size *= 2, src_idx = !src_idx) {
+ const int dst_idx = !src_idx;
+ av1_generate_block_hash_value(
+ intrabc_hash_info, cpi->source, size, block_hash_values[src_idx],
+ block_hash_values[dst_idx], is_block_same[src_idx],
+ is_block_same[dst_idx]);
+ if (size >= min_alloc_size) {
+ if (!av1_add_to_hash_map_by_row_with_precal_data(
+ &intrabc_hash_info->intrabc_hash_table,
+ block_hash_values[dst_idx], is_block_same[dst_idx][2],
+ pic_width, pic_height, size)) {
+ error = true;
+ break;
+ }
+ }
+ }
+
+ free_block_hash_buffers(block_hash_values, is_block_same);
+
+ if (error) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error adding data to intrabc_hash_table");
+ }
+ }
+
+ const CommonQuantParams *quant_params = &cm->quant_params;
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
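+    // A segment is lossless only when its effective qindex and every DC/AC
+    // delta-q offset are zero.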
+ const int qindex =
+ cm->seg.enabled ? av1_get_qindex(&cm->seg, i, quant_params->base_qindex)
+ : quant_params->base_qindex;
+ xd->lossless[i] =
+ qindex == 0 && quant_params->y_dc_delta_q == 0 &&
+ quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 &&
+ quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0;
+ if (xd->lossless[i]) cpi->enc_seg.has_lossless_segment = 1;
+ xd->qindex[i] = qindex;
+ if (xd->lossless[i]) {
+ cpi->optimize_seg_arr[i] = NO_TRELLIS_OPT;
+ } else {
+ cpi->optimize_seg_arr[i] = cpi->sf.rd_sf.optimize_coefficients;
+ }
+ }
+ features->coded_lossless = is_coded_lossless(cm, xd);
+ features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm);
+
+  // Fix the delta q resolution for the moment.
+  cm->delta_q_info.delta_q_res = 0;
+ if (cpi->use_ducky_encode) {
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_DUCKY_ENCODE;
+ } else if (cpi->oxcf.q_cfg.aq_mode != CYCLIC_REFRESH_AQ) {
+ if (deltaq_mode == DELTA_Q_OBJECTIVE)
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_OBJECTIVE;
+ else if (deltaq_mode == DELTA_Q_PERCEPTUAL)
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+ else if (deltaq_mode == DELTA_Q_PERCEPTUAL_AI)
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+ else if (deltaq_mode == DELTA_Q_USER_RATING_BASED)
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+ else if (deltaq_mode == DELTA_Q_HDR)
+ cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+ // Set delta_q_present_flag before it is used for the first time
+ cm->delta_q_info.delta_lf_res = DEFAULT_DELTA_LF_RES;
+ cm->delta_q_info.delta_q_present_flag = deltaq_mode != NO_DELTA_Q;
+
+    // Turn off cm->delta_q_info.delta_q_present_flag if objective delta_q
+    // is used for ineligible frames. That effectively turns off row_mt
+    // usage. Note that currently only altref frames are eligible for
+    // objective delta_q and tpl.
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if (cm->delta_q_info.delta_q_present_flag) {
+ if (deltaq_mode == DELTA_Q_OBJECTIVE &&
+ gf_group->update_type[cpi->gf_frame_index] == LF_UPDATE)
+ cm->delta_q_info.delta_q_present_flag = 0;
+
+ if (deltaq_mode == DELTA_Q_OBJECTIVE &&
+ cm->delta_q_info.delta_q_present_flag) {
+ cm->delta_q_info.delta_q_present_flag &= allow_deltaq_mode(cpi);
+ }
+ }
+
+ // Reset delta_q_used flag
+ cpi->deltaq_used = 0;
+
+ cm->delta_q_info.delta_lf_present_flag =
+ cm->delta_q_info.delta_q_present_flag &&
+ oxcf->tool_cfg.enable_deltalf_mode;
+ cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
+
+ // update delta_q_present_flag and delta_lf_present_flag based on
+ // base_qindex
+ cm->delta_q_info.delta_q_present_flag &= quant_params->base_qindex > 0;
+ cm->delta_q_info.delta_lf_present_flag &= quant_params->base_qindex > 0;
+ } else if (cpi->cyclic_refresh->apply_cyclic_refresh ||
+ cpi->svc.number_temporal_layers == 1) {
+ cpi->cyclic_refresh->actual_num_seg1_blocks = 0;
+ cpi->cyclic_refresh->actual_num_seg2_blocks = 0;
+ }
+ cpi->rc.cnt_zeromv = 0;
+
+ av1_frame_init_quantizer(cpi);
+ init_encode_frame_mb_context(cpi);
+ set_default_interp_skip_flags(cm, &cpi->interp_search_flags);
+
+ if (cm->prev_frame && cm->prev_frame->seg.enabled)
+ cm->last_frame_seg_map = cm->prev_frame->seg_map;
+ else
+ cm->last_frame_seg_map = NULL;
+ if (features->allow_intrabc || features->coded_lossless) {
+ av1_set_default_ref_deltas(cm->lf.ref_deltas);
+ av1_set_default_mode_deltas(cm->lf.mode_deltas);
+ } else if (cm->prev_frame) {
+ memcpy(cm->lf.ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES);
+ memcpy(cm->lf.mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS);
+ }
+ memcpy(cm->cur_frame->ref_deltas, cm->lf.ref_deltas, REF_FRAMES);
+ memcpy(cm->cur_frame->mode_deltas, cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
+
+ cpi->all_one_sided_refs =
+ frame_is_intra_only(cm) ? 0 : refs_are_one_sided(cm);
+
+ cpi->prune_ref_frame_mask = 0;
+ // Figure out which ref frames can be skipped at frame level.
+ setup_prune_ref_frame_mask(cpi);
+
+ x->txfm_search_info.txb_split_count = 0;
+#if CONFIG_SPEED_STATS
+ x->txfm_search_info.tx_search_count = 0;
+#endif // CONFIG_SPEED_STATS
+
+#if !CONFIG_REALTIME_ONLY
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_compute_global_motion_time);
+#endif
+ av1_compute_global_motion_facade(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_compute_global_motion_time);
+#endif
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_setup_motion_field_time);
+#endif
+ av1_calculate_ref_frame_side(cm);
+ if (features->allow_ref_frame_mvs) av1_setup_motion_field(cm);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_setup_motion_field_time);
+#endif
+
+ cm->current_frame.skip_mode_info.skip_mode_flag =
+ check_skip_mode_enabled(cpi);
+
+ // Initialization of skip mode cost depends on the value of
+ // 'skip_mode_flag'. This initialization happens in the function
+ // av1_fill_mode_rates(), which is in turn called in
+ // av1_initialize_rd_consts(). Thus, av1_initialize_rd_consts()
+ // has to be called after 'skip_mode_flag' is initialized.
+ av1_initialize_rd_consts(cpi);
+ av1_set_sad_per_bit(cpi, &x->sadperbit, quant_params->base_qindex);
+ populate_thresh_to_force_zeromv_skip(cpi);
+
+ enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy;
+ enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
+ mt_info->row_mt_enabled = 0;
+ mt_info->pack_bs_mt_enabled = AOMMIN(mt_info->num_mod_workers[MOD_PACK_BS],
+ cm->tiles.cols * cm->tiles.rows) > 1;
+
+ if (oxcf->row_mt && (mt_info->num_workers > 1)) {
+ mt_info->row_mt_enabled = 1;
+ enc_row_mt->sync_read_ptr = av1_row_mt_sync_read;
+ enc_row_mt->sync_write_ptr = av1_row_mt_sync_write;
+ av1_encode_tiles_row_mt(cpi);
+ } else {
+ if (AOMMIN(mt_info->num_workers, cm->tiles.cols * cm->tiles.rows) > 1) {
+ av1_encode_tiles_mt(cpi);
+ } else {
+ // Preallocate the pc_tree for realtime coding to reduce the cost of
+ // memory allocation.
+ const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
+ if (use_nonrd_mode) {
+ td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ } else {
+ td->pc_root = NULL;
+ }
+
+ encode_tiles(cpi);
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
+ }
+ }
+
+ // If intrabc is allowed but never selected, reset the allow_intrabc flag.
+ if (features->allow_intrabc && !cpi->intrabc_used) {
+ features->allow_intrabc = 0;
+ }
+ if (features->allow_intrabc) {
+ cm->delta_q_info.delta_lf_present_flag = 0;
+ }
+
+ if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) {
+ cm->delta_q_info.delta_q_present_flag = 0;
+ }
+
+ // Set the transform size appropriately before bitstream creation
+ const MODE_EVAL_TYPE eval_type =
+ cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch
+ ? WINNER_MODE_EVAL
+ : DEFAULT_EVAL;
+ const TX_SIZE_SEARCH_METHOD tx_search_type =
+ cpi->winner_mode_params.tx_size_search_methods[eval_type];
+ assert(oxcf->txfm_cfg.enable_tx64 || tx_search_type != USE_LARGESTALL);
+ features->tx_mode = select_tx_mode(cm, tx_search_type);
+
+  // Retain the frame level probability update conditions for parallel frames.
+  // These conditions will be consumed during the postencode stage to update
+  // the probabilities.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ cpi->do_update_frame_probs_txtype[cpi->num_frame_recode] =
+ cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats;
+ cpi->do_update_frame_probs_obmc[cpi->num_frame_recode] =
+ (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+ cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX);
+ cpi->do_update_frame_probs_warp[cpi->num_frame_recode] =
+ (features->allow_warped_motion &&
+ cpi->sf.inter_sf.prune_warped_prob_thresh > 0);
+ cpi->do_update_frame_probs_interpfilter[cpi->num_frame_recode] =
+ (cm->current_frame.frame_type != KEY_FRAME &&
+ cpi->sf.interp_sf.adaptive_interp_filter_search == 2 &&
+ features->interp_filter == SWITCHABLE);
+ }
+
+ if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats ||
+ ((cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh !=
+ INT_MAX) &&
+ (cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != 0))) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ for (i = 0; i < TX_SIZES_ALL; i++) {
+ int sum = 0;
+ int j;
+ int left = MAX_TX_TYPE_PROB;
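+      // The averaged probabilities must sum to MAX_TX_TYPE_PROB: `left`
+      // tracks the remaining mass, and any rounding residue is folded into
+      // tx type 0 (the last one processed).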
+
+ for (j = 0; j < TX_TYPES; j++)
+ sum += cpi->td.rd_counts.tx_type_used[i][j];
+
+ for (j = TX_TYPES - 1; j >= 0; j--) {
+ int update_txtype_frameprobs = 1;
+ const int new_prob =
+ sum ? MAX_TX_TYPE_PROB * cpi->td.rd_counts.tx_type_used[i][j] / sum
+ : (j ? 0 : MAX_TX_TYPE_PROB);
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] ==
+ 0) {
+ int prob =
+ (temp_frame_probs_simulation->tx_type_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ temp_frame_probs_simulation->tx_type_probs[update_type][i][j] =
+ prob;
+ // Copy temp_frame_probs_simulation to temp_frame_probs
+ for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+ update_type_idx++) {
+ temp_frame_probs->tx_type_probs[update_type_idx][i][j] =
+ temp_frame_probs_simulation
+ ->tx_type_probs[update_type_idx][i][j];
+ }
+ }
+ update_txtype_frameprobs = 0;
+ }
+#endif // CONFIG_FPMT_TEST
+ // Track the frame probabilities of parallel encode frames to update
+ // during postencode stage.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ update_txtype_frameprobs = 0;
+ cpi->frame_new_probs[cpi->num_frame_recode]
+ .tx_type_probs[update_type][i][j] = new_prob;
+ }
+ if (update_txtype_frameprobs) {
+ int prob =
+ (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ frame_probs->tx_type_probs[update_type][i][j] = prob;
+ }
+ }
+ }
+ }
+
+ if (cm->seg.enabled) {
+ cm->seg.temporal_update = 1;
+ if (rdc->seg_tmp_pred_cost[0] < rdc->seg_tmp_pred_cost[1])
+ cm->seg.temporal_update = 0;
+ }
+
+ if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+ cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+
+ for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+ int sum = 0;
+ int update_obmc_frameprobs = 1;
+ for (int j = 0; j < 2; j++) sum += cpi->td.rd_counts.obmc_used[i][j];
+
+ const int new_prob =
+ sum ? 128 * cpi->td.rd_counts.obmc_used[i][1] / sum : 0;
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+ temp_frame_probs_simulation->obmc_probs[update_type][i] =
+ (temp_frame_probs_simulation->obmc_probs[update_type][i] +
+ new_prob) >>
+ 1;
+ // Copy temp_frame_probs_simulation to temp_frame_probs
+ for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+ update_type_idx++) {
+ temp_frame_probs->obmc_probs[update_type_idx][i] =
+ temp_frame_probs_simulation->obmc_probs[update_type_idx][i];
+ }
+ }
+ update_obmc_frameprobs = 0;
+ }
+#endif // CONFIG_FPMT_TEST
+ // Track the frame probabilities of parallel encode frames to update
+ // during postencode stage.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ update_obmc_frameprobs = 0;
+ cpi->frame_new_probs[cpi->num_frame_recode].obmc_probs[update_type][i] =
+ new_prob;
+ }
+ if (update_obmc_frameprobs) {
+ frame_probs->obmc_probs[update_type][i] =
+ (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1;
+ }
+ }
+ }
+
+ if (features->allow_warped_motion &&
+ cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ int update_warp_frameprobs = 1;
+ int sum = 0;
+ for (i = 0; i < 2; i++) sum += cpi->td.rd_counts.warped_used[i];
+ const int new_prob = sum ? 128 * cpi->td.rd_counts.warped_used[1] / sum : 0;
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+ temp_frame_probs_simulation->warped_probs[update_type] =
+ (temp_frame_probs_simulation->warped_probs[update_type] +
+ new_prob) >>
+ 1;
+ // Copy temp_frame_probs_simulation to temp_frame_probs
+ for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+ update_type_idx++) {
+ temp_frame_probs->warped_probs[update_type_idx] =
+ temp_frame_probs_simulation->warped_probs[update_type_idx];
+ }
+ }
+ update_warp_frameprobs = 0;
+ }
+#endif // CONFIG_FPMT_TEST
+ // Track the frame probabilities of parallel encode frames to update
+ // during postencode stage.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ update_warp_frameprobs = 0;
+ cpi->frame_new_probs[cpi->num_frame_recode].warped_probs[update_type] =
+ new_prob;
+ }
+ if (update_warp_frameprobs) {
+ frame_probs->warped_probs[update_type] =
+ (frame_probs->warped_probs[update_type] + new_prob) >> 1;
+ }
+ }
+
+ if (cm->current_frame.frame_type != KEY_FRAME &&
+ cpi->sf.interp_sf.adaptive_interp_filter_search == 2 &&
+ features->interp_filter == SWITCHABLE) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ int sum = 0;
+ int j;
+ int left = 1536;
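+      // 1536 is the total probability mass for switchable filters; as with
+      // the tx type probs above, `left` folds rounding residue into index 0.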
+
+ for (j = 0; j < SWITCHABLE_FILTERS; j++) {
+ sum += cpi->td.counts->switchable_interp[i][j];
+ }
+
+ for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) {
+ int update_interpfilter_frameprobs = 1;
+ const int new_prob =
+ sum ? 1536 * cpi->td.counts->switchable_interp[i][j] / sum
+ : (j ? 0 : 1536);
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] ==
+ 0) {
+ int prob = (temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type][i][j] = prob;
+ // Copy temp_frame_probs_simulation to temp_frame_probs
+ for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+ update_type_idx++) {
+ temp_frame_probs->switchable_interp_probs[update_type_idx][i][j] =
+ temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type_idx][i][j];
+ }
+ }
+ update_interpfilter_frameprobs = 0;
+ }
+#endif // CONFIG_FPMT_TEST
+ // Track the frame probabilities of parallel encode frames to update
+ // during postencode stage.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ update_interpfilter_frameprobs = 0;
+ cpi->frame_new_probs[cpi->num_frame_recode]
+ .switchable_interp_probs[update_type][i][j] = new_prob;
+ }
+ if (update_interpfilter_frameprobs) {
+ int prob = (frame_probs->switchable_interp_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ frame_probs->switchable_interp_probs[update_type][i][j] = prob;
+ }
+ }
+ }
+ }
+ if (hash_table_created) {
+ av1_hash_table_destroy(&intrabc_hash_info->intrabc_hash_table);
+ }
+}
+
+/*!\brief Setup reference frame buffers and encode a frame
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top-level encoder structure
+ */
+void av1_encode_frame(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ FeatureFlags *const features = &cm->features;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ // Indicates whether or not to use a default reduced set for ext-tx
+ // rather than the potential full set of 16 transforms
+ features->reduced_tx_set_used = oxcf->txfm_cfg.reduced_tx_type_set;
+
+ // Make sure segment_id is no larger than last_active_segid.
+ if (cm->seg.enabled && cm->seg.update_map) {
+ const int mi_rows = cm->mi_params.mi_rows;
+ const int mi_cols = cm->mi_params.mi_cols;
+ const int last_active_segid = cm->seg.last_active_segid;
+ uint8_t *map = cpi->enc_seg.map;
+ for (int mi_row = 0; mi_row < mi_rows; ++mi_row) {
+ for (int mi_col = 0; mi_col < mi_cols; ++mi_col) {
+ map[mi_col] = AOMMIN(map[mi_col], last_active_segid);
+ }
+ map += mi_cols;
+ }
+ }
+
+ av1_setup_frame_buf_refs(cm);
+ enforce_max_ref_frames(cpi, &cpi->ref_frame_flags,
+ cm->cur_frame->ref_display_order_hint,
+ cm->current_frame.display_order_hint);
+ set_rel_frame_dist(&cpi->common, &cpi->ref_frame_dist_info,
+ cpi->ref_frame_flags);
+ av1_setup_frame_sign_bias(cm);
+
+  // If global motion is enabled, then every buffer that is used as either
+  // a source or a ref frame should have an image pyramid allocated.
+  // Check here so that issues can be caught early in debug mode.
+#if !defined(NDEBUG) && !CONFIG_REALTIME_ONLY
+ if (cpi->image_pyramid_levels > 0) {
+ assert(cpi->source->y_pyramid);
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ if (buf != NULL) {
+ assert(buf->buf.y_pyramid);
+ }
+ }
+ }
+#endif // !defined(NDEBUG) && !CONFIG_REALTIME_ONLY
+
+#if CONFIG_MISMATCH_DEBUG
+ mismatch_reset_frame(av1_num_planes(cm));
+#endif
+
+ rdc->newmv_or_intra_blocks = 0;
+ cpi->palette_pixel_num = 0;
+
+ if (cpi->sf.hl_sf.frame_parameter_update ||
+ cpi->sf.rt_sf.use_comp_ref_nonrd) {
+ if (frame_is_intra_only(cm))
+ current_frame->reference_mode = SINGLE_REFERENCE;
+ else
+ current_frame->reference_mode = REFERENCE_MODE_SELECT;
+
+ features->interp_filter = SWITCHABLE;
+ if (cm->tiles.large_scale) features->interp_filter = EIGHTTAP_REGULAR;
+
+ features->switchable_motion_mode = is_switchable_motion_mode_allowed(
+ features->allow_warped_motion, oxcf->motion_mode_cfg.enable_obmc);
+
+ rdc->compound_ref_used_flag = 0;
+ rdc->skip_mode_used_flag = 0;
+
+ encode_frame_internal(cpi);
+
+ if (current_frame->reference_mode == REFERENCE_MODE_SELECT) {
+ // Use a flag that includes 4x4 blocks
+ if (rdc->compound_ref_used_flag == 0) {
+ current_frame->reference_mode = SINGLE_REFERENCE;
+#if CONFIG_ENTROPY_STATS
+ av1_zero(cpi->td.counts->comp_inter);
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+    // Re-check the skip mode status, as the reference mode may have
+    // changed.
+ SkipModeInfo *const skip_mode_info = &current_frame->skip_mode_info;
+ if (frame_is_intra_only(cm) ||
+ current_frame->reference_mode == SINGLE_REFERENCE) {
+ skip_mode_info->skip_mode_allowed = 0;
+ skip_mode_info->skip_mode_flag = 0;
+ }
+ if (skip_mode_info->skip_mode_flag && rdc->skip_mode_used_flag == 0)
+ skip_mode_info->skip_mode_flag = 0;
+
+ if (!cm->tiles.large_scale) {
+ if (features->tx_mode == TX_MODE_SELECT &&
+ cpi->td.mb.txfm_search_info.txb_split_count == 0)
+ features->tx_mode = TX_MODE_LARGEST;
+ }
+ } else {
+    // This is needed if the real-time speed setting is changed on the fly
+    // from one using compound prediction to one using single reference.
+ if (current_frame->reference_mode == REFERENCE_MODE_SELECT)
+ current_frame->reference_mode = SINGLE_REFERENCE;
+ encode_frame_internal(cpi);
+ }
+}
diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h
new file mode 100644
index 0000000000..ce32fb47e6
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEFRAME_H_
+#define AOM_AV1_ENCODER_ENCODEFRAME_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+#include "av1/encoder/global_motion.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DELTA_Q_PERCEPTUAL_MODULATION \
+ 1 // 0: variance based
+ // 1: wavelet AC energy based
+
+struct macroblock;
+struct yv12_buffer_config;
+struct AV1_COMP;
+struct ThreadData;
+
+void av1_init_rtc_counters(struct macroblock *const x);
+
+void av1_accumulate_rtc_counters(struct AV1_COMP *cpi,
+ const struct macroblock *const x);
+
+void av1_setup_src_planes(struct macroblock *x,
+ const struct yv12_buffer_config *src, int mi_row,
+ int mi_col, const int num_planes, BLOCK_SIZE bsize);
+
+void av1_encode_frame(struct AV1_COMP *cpi);
+
+void av1_alloc_tile_data(struct AV1_COMP *cpi);
+void av1_init_tile_data(struct AV1_COMP *cpi);
+void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row,
+ int tile_col);
+void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td,
+ int tile_row, int tile_col, int mi_row);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEFRAME_H_
diff --git a/third_party/aom/av1/encoder/encodeframe_utils.c b/third_party/aom/av1/encoder/encodeframe_utils.c
new file mode 100644
index 0000000000..949837184a
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe_utils.c
@@ -0,0 +1,1775 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common_data.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/rdopt.h"
+
+void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col, int *const rdmult) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ const BLOCK_SIZE bsize_base = BLOCK_16X16;
+ const int num_mi_w = mi_size_wide[bsize_base];
+ const int num_mi_h = mi_size_high[bsize_base];
+ const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+ const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
+ const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+ int row, col;
+ double num_of_mi = 0.0;
+ double geom_mean_of_scale = 1.0;
+
+ // To avoid overflow of 'geom_mean_of_scale', bsize_base must be at least
+ // BLOCK_8X8.
+ //
+ // For bsize=BLOCK_128X128 and bsize_base=BLOCK_8X8, the loop below would
+ // iterate 256 times. Considering the maximum value of
+ // cpi->ssim_rdmult_scaling_factors (see av1_set_mb_ssim_rdmult_scaling()),
+ // geom_mean_of_scale can go up to 4.8323^256, which is within DBL_MAX
+ // (maximum value a double data type can hold). If bsize_base is modified to
+ // BLOCK_4X4 (minimum possible block size), geom_mean_of_scale can go up
+ // to 4.8323^1024 and exceed DBL_MAX, resulting in data overflow.
+ assert(bsize_base >= BLOCK_8X8);
+ assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM);
+
+ for (row = mi_row / num_mi_w;
+ row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
+ for (col = mi_col / num_mi_h;
+ col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
+ const int index = row * num_cols + col;
+ assert(cpi->ssim_rdmult_scaling_factors[index] != 0.0);
+ geom_mean_of_scale *= cpi->ssim_rdmult_scaling_factors[index];
+ num_of_mi += 1.0;
+ }
+ }
+ geom_mean_of_scale = pow(geom_mean_of_scale, (1.0 / num_of_mi));
+
+ *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
+ *rdmult = AOMMAX(*rdmult, 0);
+ av1_set_error_per_bit(errorperbit, *rdmult);
+}
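+
+// Editor's illustrative sketch (not part of upstream libaom): the SSIM
+// rdmult scaling above is a plain geometric mean of the per-16x16 scaling
+// factors covering the block. The unused helper below shows the same
+// computation on a flat array of n > 0 factors; the name is hypothetical,
+// and math.h (pow) is assumed available, as in the function above.
+static AOM_INLINE int illustrative_geom_mean_rdmult(int rdmult,
+                                                    const double *factors,
+                                                    int n) {
+  double product = 1.0;
+  for (int i = 0; i < n; ++i) product *= factors[i];  // factors must be > 0
+  const double geom_mean = pow(product, 1.0 / n);
+  // Round to nearest, matching the (+ 0.5) convention used above.
+  return (int)((double)rdmult * geom_mean + 0.5);
+}
+// E.g. factors {1.2, 0.8} give sqrt(0.96) ~= 0.98, shrinking rdmult
+// slightly; av1_get_hier_tpl_rdmult() below computes the same kind of mean
+// in the log domain (sum of logs, then exp) to avoid overflow.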
+
+#if CONFIG_SALIENCY_MAP
+void av1_set_saliency_map_vmaf_rdmult(const AV1_COMP *const cpi,
+ int *errorperbit, const BLOCK_SIZE bsize,
+ const int mi_row, const int mi_col,
+ int *const rdmult) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_mi_w = mi_size_wide[bsize];
+ const int num_mi_h = mi_size_high[bsize];
+ const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+
+ *rdmult =
+ (int)(*rdmult * cpi->sm_scaling_factor[(mi_row / num_mi_h) * num_cols +
+ (mi_col / num_mi_w)]);
+
+ *rdmult = AOMMAX(*rdmult, 0);
+ av1_set_error_per_bit(errorperbit, *rdmult);
+}
+#endif
+
+// TODO(angiebird): Move these functions to tpl_model.c
+#if !CONFIG_REALTIME_ONLY
+// Return the end column for the current superblock, in units of TPL blocks.
+static int get_superblock_tpl_column_end(const AV1_COMMON *const cm, int mi_col,
+ int num_mi_w) {
+ // Find the start column of this superblock.
+ const int sb_mi_col_start = (mi_col >> cm->seq_params->mib_size_log2)
+ << cm->seq_params->mib_size_log2;
+ // Same but in superres upscaled dimension.
+ const int sb_mi_col_start_sr =
+ coded_to_superres_mi(sb_mi_col_start, cm->superres_scale_denominator);
+ // Width of this superblock in mi units.
+ const int sb_mi_width = mi_size_wide[cm->seq_params->sb_size];
+ // Same but in superres upscaled dimension.
+ const int sb_mi_width_sr =
+ coded_to_superres_mi(sb_mi_width, cm->superres_scale_denominator);
+ // Superblock end in mi units.
+ const int sb_mi_end = sb_mi_col_start_sr + sb_mi_width_sr;
+ // Superblock end in TPL units.
+ return (sb_mi_end + num_mi_w - 1) / num_mi_w;
+}
+
+int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int tpl_idx = cpi->gf_frame_index;
+ int deltaq_rdmult = set_rdmult(cpi, x, -1);
+ if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult;
+ if (cm->superres_scale_denominator != SCALE_NUMERATOR) return deltaq_rdmult;
+ if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult;
+ if (x->rb == 0) return deltaq_rdmult;
+
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+
+ int tpl_stride = tpl_frame->stride;
+ double intra_cost_base = 0;
+ double mc_dep_cost_base = 0;
+ double cbcmp_base = 0;
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+
+ for (int row = mi_row; row < mi_row + mi_high; row += step) {
+ for (int col = mi_col; col < mi_col + mi_wide; col += step) {
+ if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+ continue;
+
+ TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+ row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+
+ double cbcmp = (double)this_stats->srcrf_dist;
+ int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS);
+ intra_cost_base += log(dist_scaled) * cbcmp;
+ mc_dep_cost_base += log(3 * dist_scaled + mc_dep_delta) * cbcmp;
+ cbcmp_base += cbcmp;
+ }
+ }
+
+ if (cbcmp_base == 0) return deltaq_rdmult;
+
+ double rk = exp((intra_cost_base - mc_dep_cost_base) / cbcmp_base);
+ deltaq_rdmult = (int)(deltaq_rdmult * (rk / x->rb));
+
+ return AOMMAX(deltaq_rdmult, 1);
+}
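+
+// Editor's note (illustrative): rk above is a srcrf_dist-weighted geometric
+// mean ratio computed in the log domain. With weights w_i = srcrf_dist_i,
+// a_i = recrf_dist_i << RDDIV_BITS and b_i = 3 * a_i + mc_dep_delta_i:
+//
+//   rk = exp((sum_i w_i * log(a_i) - sum_i w_i * log(b_i)) / sum_i w_i)
+//      = prod_i (a_i / b_i) ^ (w_i / sum_i w_i)
+//
+// so blocks with larger source distortion dominate the estimate.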
+
+int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col, int orig_rdmult) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int tpl_idx = cpi->gf_frame_index;
+ const int deltaq_rdmult = set_rdmult(cpi, x, -1);
+ if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult;
+ if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index))
+ return deltaq_rdmult;
+ if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult;
+
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+ const int block_mi_width_sr =
+ coded_to_superres_mi(mi_size_wide[bsize], cm->superres_scale_denominator);
+
+ const BLOCK_SIZE bsize_base = BLOCK_16X16;
+ const int num_mi_w = mi_size_wide[bsize_base];
+ const int num_mi_h = mi_size_high[bsize_base];
+ const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+ const int num_bcols = (block_mi_width_sr + num_mi_w - 1) / num_mi_w;
+ const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+  // This is required because the end column of the superblock may be off by 1
+  // in the superres case.
+ const int sb_bcol_end = get_superblock_tpl_column_end(cm, mi_col, num_mi_w);
+ int row, col;
+ double base_block_count = 0.0;
+ double geom_mean_of_scale = 0.0;
+ for (row = mi_row / num_mi_w;
+ row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
+ for (col = mi_col_sr / num_mi_h;
+ col < num_cols && col < mi_col_sr / num_mi_h + num_bcols &&
+ col < sb_bcol_end;
+ ++col) {
+ const int index = row * num_cols + col;
+ geom_mean_of_scale += log(cpi->ppi->tpl_sb_rdmult_scaling_factors[index]);
+ base_block_count += 1.0;
+ }
+ }
+ geom_mean_of_scale = exp(geom_mean_of_scale / base_block_count);
+ int rdmult = (int)((double)orig_rdmult * geom_mean_of_scale + 0.5);
+ rdmult = AOMMAX(rdmult, 0);
+ av1_set_error_per_bit(&x->errorperbit, rdmult);
+#if !CONFIG_RD_COMMAND
+ if (bsize == cm->seq_params->sb_size) {
+ const int rdmult_sb = set_rdmult(cpi, x, -1);
+ assert(rdmult_sb == rdmult);
+ (void)rdmult_sb;
+ }
+#endif // !CONFIG_RD_COMMAND
+ return rdmult;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static AOM_INLINE void update_filter_type_count(FRAME_COUNTS *counts,
+ const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi) {
+ int dir;
+ for (dir = 0; dir < 2; ++dir) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
+
+ // Only allow the 3 valid SWITCHABLE_FILTERS.
+ assert(filter < SWITCHABLE_FILTERS);
+ ++counts->switchable_interp[ctx][filter];
+ }
+}
+
+// This function will copy the best reference mode information from
+// MB_MODE_INFO_EXT_FRAME to MB_MODE_INFO_EXT.
+static INLINE void copy_mbmi_ext_frame_to_mbmi_ext(
+ MB_MODE_INFO_EXT *mbmi_ext,
+ const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_best, uint8_t ref_frame_type) {
+ memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack,
+ sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE]));
+ memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight,
+ sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE]));
+ mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context;
+ mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count;
+ memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs,
+ sizeof(mbmi_ext->global_mvs));
+}
+
+void av1_update_state(const AV1_COMP *const cpi, ThreadData *td,
+ const PICK_MODE_CONTEXT *const ctx, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) {
+ int i, x_idx, y;
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const MB_MODE_INFO *const mi = &ctx->mic;
+ MB_MODE_INFO *const mi_addr = xd->mi[0];
+ const struct segmentation *const seg = &cm->seg;
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int bw = mi_size_wide[mi->bsize];
+ const int bh = mi_size_high[mi->bsize];
+ const int mis = mi_params->mi_stride;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+ assert(mi->bsize == bsize);
+
+ *mi_addr = *mi;
+ copy_mbmi_ext_frame_to_mbmi_ext(&x->mbmi_ext, &ctx->mbmi_ext_best,
+ av1_ref_frame_type(ctx->mic.ref_frame));
+
+ memcpy(txfm_info->blk_skip, ctx->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+
+ txfm_info->skip_txfm = ctx->rd_stats.skip_txfm;
+
+ xd->tx_type_map = ctx->tx_type_map;
+ xd->tx_type_map_stride = mi_size_wide[bsize];
+  // If not dry_run, copy the transform type data into the frame-level buffer.
+  // The encoder will fetch the tx types when writing the bitstream.
+ if (!dry_run) {
+ const int grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
+ uint8_t *const tx_type_map = mi_params->tx_type_map + grid_idx;
+ const int mi_stride = mi_params->mi_stride;
+ for (int blk_row = 0; blk_row < bh; ++blk_row) {
+ av1_copy_array(tx_type_map + blk_row * mi_stride,
+ xd->tx_type_map + blk_row * xd->tx_type_map_stride, bw);
+ }
+ xd->tx_type_map = tx_type_map;
+ xd->tx_type_map_stride = mi_stride;
+ }
+
+  // If segmentation is in use
+ if (seg->enabled) {
+    // For in-frame complexity AQ, copy the segment id from the segment map.
+ if (cpi->oxcf.q_cfg.aq_mode == COMPLEXITY_AQ) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
+ mi_addr->segment_id =
+ map ? get_segment_id(mi_params, map, bsize, mi_row, mi_col) : 0;
+ }
+    // Else, for cyclic refresh mode, update the segment map, set the segment
+    // id, and then update the quantizer.
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ !cpi->rc.rtc_external_ratectrl) {
+ av1_cyclic_refresh_update_segment(cpi, x, mi_row, mi_col, bsize,
+ ctx->rd_stats.rate, ctx->rd_stats.dist,
+ txfm_info->skip_txfm, dry_run);
+ }
+ if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd))
+ mi_addr->uv_mode = UV_DC_PRED;
+
+ if (!dry_run && !mi_addr->skip_txfm) {
+ int cdf_num;
+ const uint8_t spatial_pred = av1_get_spatial_seg_pred(
+ cm, xd, &cdf_num, cpi->cyclic_refresh->skip_over4x4);
+ const uint8_t coded_id = av1_neg_interleave(
+ mi_addr->segment_id, spatial_pred, seg->last_active_segid + 1);
+ int64_t spatial_cost = x->mode_costs.spatial_pred_cost[cdf_num][coded_id];
+ td->rd_counts.seg_tmp_pred_cost[0] += spatial_cost;
+
+ const int pred_segment_id =
+ cm->last_frame_seg_map
+ ? get_segment_id(mi_params, cm->last_frame_seg_map, bsize, mi_row,
+ mi_col)
+ : 0;
+ const int use_tmp_pred = pred_segment_id == mi_addr->segment_id;
+ const uint8_t tmp_pred_ctx = av1_get_pred_context_seg_id(xd);
+ td->rd_counts.seg_tmp_pred_cost[1] +=
+ x->mode_costs.tmp_pred_cost[tmp_pred_ctx][use_tmp_pred];
+ if (!use_tmp_pred) {
+ td->rd_counts.seg_tmp_pred_cost[1] += spatial_cost;
+ }
+ }
+ }
+
+ // Count zero motion vector.
+ if (!dry_run && !frame_is_intra_only(cm)) {
+ const MV mv = mi->mv[0].as_mv;
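+    // Editor's note: mv components are in 1/8-pel units, so the
+    // |row| < 8 && |col| < 8 check below treats any motion under one full
+    // pixel as zero motion for the cnt_zeromv statistic.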
+ if (is_inter_block(mi) && mi->ref_frame[0] == LAST_FRAME &&
+ abs(mv.row) < 8 && abs(mv.col) < 8) {
+ const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);
+ // Accumulate low_content_frame.
+ for (int mi_y = 0; mi_y < ymis; mi_y += 2) x->cnt_zeromv += bw << 1;
+ }
+ }
+
+ for (i = 0; i < num_planes; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ p[i].dqcoeff = ctx->dqcoeff[i];
+ p[i].eobs = ctx->eobs[i];
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ }
+ for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+  // Restore the coding context of the MB to what was in place when the
+  // mode was picked for it.
+
+ const int cols =
+ AOMMIN((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width, mi_width);
+ const int rows = AOMMIN(
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height, mi_height);
+ for (y = 0; y < rows; y++) {
+ for (x_idx = 0; x_idx < cols; x_idx++) xd->mi[x_idx + y * mis] = mi_addr;
+ }
+
+ if (cpi->oxcf.q_cfg.aq_mode)
+ av1_init_plane_quantizers(cpi, x, mi_addr->segment_id, 0);
+
+ if (dry_run) return;
+
+#if CONFIG_INTERNAL_STATS
+ {
+ unsigned int *const mode_chosen_counts =
+ (unsigned int *)cpi->mode_chosen_counts; // Cast const away.
+ if (frame_is_intra_only(cm)) {
+ static const int kf_mode_index[] = {
+ THR_DC /*DC_PRED*/,
+ THR_V_PRED /*V_PRED*/,
+ THR_H_PRED /*H_PRED*/,
+ THR_D45_PRED /*D45_PRED*/,
+ THR_D135_PRED /*D135_PRED*/,
+ THR_D113_PRED /*D113_PRED*/,
+ THR_D157_PRED /*D157_PRED*/,
+ THR_D203_PRED /*D203_PRED*/,
+ THR_D67_PRED /*D67_PRED*/,
+ THR_SMOOTH, /*SMOOTH_PRED*/
+ THR_SMOOTH_V, /*SMOOTH_V_PRED*/
+ THR_SMOOTH_H, /*SMOOTH_H_PRED*/
+ THR_PAETH /*PAETH_PRED*/,
+ };
+ ++mode_chosen_counts[kf_mode_index[mi_addr->mode]];
+ } else {
+      // Note how often each mode is chosen as best.
+ ++mode_chosen_counts[ctx->best_mode_index];
+ }
+ }
+#endif
+ if (!frame_is_intra_only(cm)) {
+ if (is_inter_block(mi) && cm->features.interp_filter == SWITCHABLE) {
+ // When the frame interp filter is SWITCHABLE, several cases that always
+ // use the default type (EIGHTTAP_REGULAR) are described in
+ // av1_is_interp_needed(). Here, we should keep the counts for all
+ // applicable blocks, so the frame filter resetting decision in
+ // fix_interp_filter() is made correctly.
+ update_filter_type_count(td->counts, xd, mi_addr);
+ }
+ }
+
+ const int x_mis = AOMMIN(bw, mi_params->mi_cols - mi_col);
+ const int y_mis = AOMMIN(bh, mi_params->mi_rows - mi_row);
+ if (cm->seq_params->order_hint_info.enable_ref_frame_mvs)
+ av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
+}
+
+void av1_update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts,
+ PREDICTION_MODE mode, int16_t mode_context) {
+ (void)counts;
+
+ int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+ if (mode == NEWMV) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->newmv_mode[mode_ctx][0];
+#endif
+ update_cdf(fc->newmv_cdf[mode_ctx], 0, 2);
+ return;
+ }
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->newmv_mode[mode_ctx][1];
+#endif
+ update_cdf(fc->newmv_cdf[mode_ctx], 1, 2);
+
+ mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+ if (mode == GLOBALMV) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->zeromv_mode[mode_ctx][0];
+#endif
+ update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2);
+ return;
+ }
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->zeromv_mode[mode_ctx][1];
+#endif
+ update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2);
+
+ mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+#if CONFIG_ENTROPY_STATS
+ ++counts->refmv_mode[mode_ctx][mode != NEARESTMV];
+#endif
+ update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2);
+}
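+
+// Editor's note (illustrative): the early returns above mirror the
+// tree-structured way these symbols are coded: first a binary "is NEWMV?"
+// symbol, then (if not) "is GLOBALMV?", then NEARESTMV vs NEARMV, each with
+// its own context extracted from a separate bit-field of mode_context.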
+
+static void update_palette_cdf(MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+ FRAME_COUNTS *counts) {
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int palette_bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+
+ (void)counts;
+
+ if (mbmi->mode == DC_PRED) {
+ const int n = pmi->palette_size[0];
+ const int palette_mode_ctx = av1_get_palette_mode_ctx(xd);
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->palette_y_mode[palette_bsize_ctx][palette_mode_ctx][n > 0];
+#endif
+ update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx],
+ n > 0, 2);
+ if (n > 0) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->palette_y_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
+#endif
+ update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx],
+ n - PALETTE_MIN_SIZE, PALETTE_SIZES);
+ }
+ }
+
+ if (mbmi->uv_mode == UV_DC_PRED) {
+ const int n = pmi->palette_size[1];
+ const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->palette_uv_mode[palette_uv_mode_ctx][n > 0];
+#endif
+ update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2);
+
+ if (n > 0) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->palette_uv_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
+#endif
+ update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx],
+ n - PALETTE_MIN_SIZE, PALETTE_SIZES);
+ }
+ }
+}
+
+void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts,
+ MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+ const MB_MODE_INFO *above_mi,
+ const MB_MODE_INFO *left_mi, const int intraonly) {
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ const PREDICTION_MODE y_mode = mbmi->mode;
+ (void)counts;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+
+ if (intraonly) {
+#if CONFIG_ENTROPY_STATS
+ const PREDICTION_MODE above = av1_above_block_mode(above_mi);
+ const PREDICTION_MODE left = av1_left_block_mode(left_mi);
+ const int above_ctx = intra_mode_context[above];
+ const int left_ctx = intra_mode_context[left];
+ ++counts->kf_y_mode[above_ctx][left_ctx][y_mode];
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES);
+ } else {
+#if CONFIG_ENTROPY_STATS
+ ++counts->y_mode[size_group_lookup[bsize]][y_mode];
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES);
+ }
+
+ if (av1_filter_intra_allowed(cm, mbmi)) {
+ const int use_filter_intra_mode =
+ mbmi->filter_intra_mode_info.use_filter_intra;
+#if CONFIG_ENTROPY_STATS
+ ++counts->filter_intra[mbmi->bsize][use_filter_intra_mode];
+ if (use_filter_intra_mode) {
+ ++counts
+ ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode];
+ }
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->filter_intra_cdfs[mbmi->bsize], use_filter_intra_mode, 2);
+ if (use_filter_intra_mode) {
+ update_cdf(fc->filter_intra_mode_cdf,
+ mbmi->filter_intra_mode_info.filter_intra_mode,
+ FILTER_INTRA_MODES);
+ }
+ }
+ if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->angle_delta[mbmi->mode - V_PRED]
+ [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA];
+#endif
+ update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED],
+ mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA,
+ 2 * MAX_ANGLE_DELTA + 1);
+ }
+
+ if (!xd->is_chroma_ref) return;
+
+ const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+ const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd);
+#if CONFIG_ENTROPY_STATS
+ ++counts->uv_mode[cfl_allowed][y_mode][uv_mode];
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode,
+ UV_INTRA_MODES - !cfl_allowed);
+ if (uv_mode == UV_CFL_PRED) {
+ const int8_t joint_sign = mbmi->cfl_alpha_signs;
+ const uint8_t idx = mbmi->cfl_alpha_idx;
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->cfl_sign[joint_sign];
+#endif
+ update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS);
+ if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)];
+#endif
+ update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE);
+ }
+ if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)];
+#endif
+ update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE);
+ }
+ }
+ const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+ if (av1_is_directional_mode(intra_mode) && av1_use_angle_delta(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->angle_delta[intra_mode - V_PRED]
+ [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA];
+#endif
+ update_cdf(fc->angle_delta_cdf[intra_mode - V_PRED],
+ mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA,
+ 2 * MAX_ANGLE_DELTA + 1);
+ }
+ if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
+ update_palette_cdf(xd, mbmi, counts);
+ }
+}
+
+void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ const int num_planes) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ int p;
+ const int num_4x4_blocks_wide = mi_size_wide[bsize];
+ const int num_4x4_blocks_high = mi_size_high[bsize];
+ int mi_width = mi_size_wide[bsize];
+ int mi_height = mi_size_high[bsize];
+ for (p = 0; p < num_planes; p++) {
+ int tx_col = mi_col;
+ int tx_row = mi_row & MAX_MIB_MASK;
+ memcpy(
+ xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x),
+ ctx->a + num_4x4_blocks_wide * p,
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+ xd->plane[p].subsampling_x);
+ memcpy(xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y),
+ ctx->l + num_4x4_blocks_high * p,
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+ xd->plane[p].subsampling_y);
+ }
+ memcpy(xd->above_partition_context + mi_col, ctx->sa,
+ sizeof(*xd->above_partition_context) * mi_width);
+ memcpy(xd->left_partition_context + (mi_row & MAX_MIB_MASK), ctx->sl,
+ sizeof(xd->left_partition_context[0]) * mi_height);
+ xd->above_txfm_context = ctx->p_ta;
+ xd->left_txfm_context = ctx->p_tl;
+ memcpy(xd->above_txfm_context, ctx->ta,
+ sizeof(*xd->above_txfm_context) * mi_width);
+ memcpy(xd->left_txfm_context, ctx->tl,
+ sizeof(*xd->left_txfm_context) * mi_height);
+}
+
+void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ const int num_planes) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ int p;
+ int mi_width = mi_size_wide[bsize];
+ int mi_height = mi_size_high[bsize];
+
+  // Buffer the above/left context information of the block being searched.
+ for (p = 0; p < num_planes; ++p) {
+ int tx_col = mi_col;
+ int tx_row = mi_row & MAX_MIB_MASK;
+ memcpy(
+ ctx->a + mi_width * p,
+ xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x),
+ (sizeof(ENTROPY_CONTEXT) * mi_width) >> xd->plane[p].subsampling_x);
+ memcpy(ctx->l + mi_height * p,
+ xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y),
+ (sizeof(ENTROPY_CONTEXT) * mi_height) >> xd->plane[p].subsampling_y);
+ }
+ memcpy(ctx->sa, xd->above_partition_context + mi_col,
+ sizeof(*xd->above_partition_context) * mi_width);
+ memcpy(ctx->sl, xd->left_partition_context + (mi_row & MAX_MIB_MASK),
+ sizeof(xd->left_partition_context[0]) * mi_height);
+ memcpy(ctx->ta, xd->above_txfm_context,
+ sizeof(*xd->above_txfm_context) * mi_width);
+ memcpy(ctx->tl, xd->left_txfm_context,
+ sizeof(*xd->left_txfm_context) * mi_height);
+ ctx->p_ta = xd->above_txfm_context;
+ ctx->p_tl = xd->left_txfm_context;
+}
+
+static void set_partial_sb_partition(const AV1_COMMON *const cm,
+ MB_MODE_INFO *mi, int bh_in, int bw_in,
+ int mi_rows_remaining,
+ int mi_cols_remaining, BLOCK_SIZE bsize,
+ MB_MODE_INFO **mib) {
+ int bh = bh_in;
+ int r, c;
+ for (r = 0; r < cm->seq_params->mib_size; r += bh) {
+ int bw = bw_in;
+ for (c = 0; c < cm->seq_params->mib_size; c += bw) {
+ const int grid_index = get_mi_grid_idx(&cm->mi_params, r, c);
+ const int mi_index = get_alloc_mi_idx(&cm->mi_params, r, c);
+ mib[grid_index] = mi + mi_index;
+ mib[grid_index]->bsize = find_partition_size(
+ bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw);
+ }
+ }
+}
+
+// This function attempts to set all mode info entries in a given superblock
+// to the same block partition size.
+// However, at the bottom and right borders of the image the requested size
+// may not be allowed in which case this code attempts to choose the largest
+// allowable partition.
+void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+ MB_MODE_INFO **mib, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int mi_rows_remaining = tile->mi_row_end - mi_row;
+ const int mi_cols_remaining = tile->mi_col_end - mi_col;
+ MB_MODE_INFO *const mi_upper_left =
+ mi_params->mi_alloc + get_alloc_mi_idx(mi_params, mi_row, mi_col);
+ int bh = mi_size_high[bsize];
+ int bw = mi_size_wide[bsize];
+
+ assert(bsize >= mi_params->mi_alloc_bsize &&
+ "Attempted to use bsize < mi_params->mi_alloc_bsize");
+ assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0));
+
+ // Apply the requested partition size to the SB if it is all "in image"
+ if ((mi_cols_remaining >= cm->seq_params->mib_size) &&
+ (mi_rows_remaining >= cm->seq_params->mib_size)) {
+ for (int block_row = 0; block_row < cm->seq_params->mib_size;
+ block_row += bh) {
+ for (int block_col = 0; block_col < cm->seq_params->mib_size;
+ block_col += bw) {
+ const int grid_index = get_mi_grid_idx(mi_params, block_row, block_col);
+ const int mi_index = get_alloc_mi_idx(mi_params, block_row, block_col);
+ mib[grid_index] = mi_upper_left + mi_index;
+ mib[grid_index]->bsize = bsize;
+ }
+ }
+ } else {
+ // Else this is a partial SB.
+ set_partial_sb_partition(cm, mi_upper_left, bh, bw, mi_rows_remaining,
+ mi_cols_remaining, bsize, mib);
+ }
+}
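+
+// Editor's note (worked example): if BLOCK_64X64 (16x16 mi) is requested but
+// only, say, 8 mi rows remain below mi_row, the partial-SB path above calls
+// find_partition_size() to pick the largest size that still fits the
+// remaining rows and columns, so every mi entry gets a valid bsize.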
+
+int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ assert(bsize >= BLOCK_8X8);
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+ for (int i = 0; i < 4; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ if ((mi_row + y_idx >= cm->mi_params.mi_rows) ||
+ (mi_col + x_idx >= cm->mi_params.mi_cols))
+ return 0;
+ if (get_partition(cm, mi_row + y_idx, mi_col + x_idx, subsize) !=
+ PARTITION_NONE &&
+ subsize != BLOCK_8X8)
+ return 0;
+ }
+ return 1;
+}
+
+#if !CONFIG_REALTIME_ONLY
+int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int orig_rdmult) {
+ AV1_COMMON *const cm = &cpi->common;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int tpl_idx = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+ int64_t intra_cost = 0;
+ int64_t mc_dep_cost = 0;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ int tpl_stride = tpl_frame->stride;
+
+ if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, cpi->gf_frame_index)) {
+ return orig_rdmult;
+ }
+ if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
+ return orig_rdmult;
+ }
+
+#ifndef NDEBUG
+ int mi_count = 0;
+#endif
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_col_end_sr =
+ coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+ const int step = 1 << block_mis_log2;
+ const int row_step = step;
+ const int col_step_sr =
+ coded_to_superres_mi(step, cm->superres_scale_denominator);
+ for (int row = mi_row; row < mi_row + mi_high; row += row_step) {
+ for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+ if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+ int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ intra_cost += this_stats->recrf_dist << RDDIV_BITS;
+ mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+#ifndef NDEBUG
+ mi_count++;
+#endif
+ }
+ }
+ assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+
+ double beta = 1.0;
+ if (mc_dep_cost > 0 && intra_cost > 0) {
+ const double r0 = cpi->rd.r0;
+ const double rk = (double)intra_cost / mc_dep_cost;
+ beta = (r0 / rk);
+ }
+
+ int rdmult = av1_get_adaptive_rdmult(cpi, beta);
+
+ rdmult = AOMMIN(rdmult, orig_rdmult * 3 / 2);
+ rdmult = AOMMAX(rdmult, orig_rdmult * 1 / 2);
+
+ rdmult = AOMMAX(1, rdmult);
+
+ return rdmult;
+}
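+
+// Editor's note (worked example, values hypothetical): with r0 = 0.04 and
+// rk = 0.08 the superblock propagates less than the frame average, so
+// beta = r0 / rk = 0.5 and av1_get_adaptive_rdmult() returns a larger
+// lambda; the result is clamped to [orig_rdmult / 2, orig_rdmult * 3 / 2]
+// above so one superblock cannot drift too far from the frame-level rdmult.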
+
+// Checks to see if a super block is on a horizontal image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
+ int top_edge = 0;
+ int bottom_edge = cpi->common.mi_params.mi_rows;
+ int is_active_h_edge = 0;
+
+ // For two pass account for any formatting bars detected.
+ if (is_stat_consumption_stage_twopass(cpi)) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
+ &cpi->ppi->twopass, cm->current_frame.display_order_hint);
+ if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
+
+    // The inactive region is specified in MBs, not mi units.
+ // The image edge is in the following MB row.
+ top_edge += (int)(this_frame_stats->inactive_zone_rows * 4);
+
+ bottom_edge -= (int)(this_frame_stats->inactive_zone_rows * 4);
+ bottom_edge = AOMMAX(top_edge, bottom_edge);
+ }
+
+ if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
+ ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
+ is_active_h_edge = 1;
+ }
+ return is_active_h_edge;
+}
+
+// Checks to see if a super block is on a vertical image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
+ int left_edge = 0;
+ int right_edge = cpi->common.mi_params.mi_cols;
+ int is_active_v_edge = 0;
+
+ // For two pass account for any formatting bars detected.
+ if (is_stat_consumption_stage_twopass(cpi)) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
+ &cpi->ppi->twopass, cm->current_frame.display_order_hint);
+ if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
+
+    // The inactive region is specified in MBs, not mi units.
+    // The image edge is in the following MB column.
+ left_edge += (int)(this_frame_stats->inactive_zone_cols * 4);
+
+ right_edge -= (int)(this_frame_stats->inactive_zone_cols * 4);
+ right_edge = AOMMAX(left_edge, right_edge);
+ }
+
+ if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
+ ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
+ is_active_v_edge = 1;
+ }
+ return is_active_v_edge;
+}
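+
+// Editor's note (worked example): inactive_zone_* is measured in 16x16
+// macroblocks and one MB spans four 4x4 mi units, hence the "* 4" above.
+// E.g. two rows of detected letterbox bars give top_edge = 8 mi, and a
+// superblock is flagged as an active edge when that boundary falls inside
+// [mi_row, mi_row + mi_step).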
+
+void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, SuperBlockEnc *sb_enc) {
+ sb_enc->tpl_data_count = 0;
+
+ if (!cpi->oxcf.algo_cfg.enable_tpl_model) return;
+ if (cpi->common.current_frame.frame_type == KEY_FRAME) return;
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ if (update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE)
+ return;
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+
+ AV1_COMMON *const cm = &cpi->common;
+ const int gf_group_index = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ if (!av1_tpl_stats_ready(tpl_data, gf_group_index)) return;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ int tpl_stride = tpl_frame->stride;
+
+ int mi_count = 0;
+ int count = 0;
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_col_end_sr =
+ coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+  // mi_cols_sr is mi_cols in the superres case.
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+  // The TPL storage unit size is not the same as the motion estimation unit
+  // size. Always use the motion estimation size here to avoid fetching the
+  // same inter/intra cost repeatedly.
+ const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d);
+ assert(mi_size_wide[tpl_bsize] == mi_size_high[tpl_bsize]);
+ const int row_step = mi_size_high[tpl_bsize];
+ const int col_step_sr = coded_to_superres_mi(mi_size_wide[tpl_bsize],
+ cm->superres_scale_denominator);
+
+ // Stride is only based on SB size, and we fill in values for every 16x16
+ // block in a SB.
+ sb_enc->tpl_stride = (mi_col_end_sr - mi_col_sr) / col_step_sr;
+
+ for (int row = mi_row; row < mi_row + mi_high; row += row_step) {
+ for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+ assert(count < MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+ // Handle partial SB, so that no invalid values are used later.
+ if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) {
+ sb_enc->tpl_inter_cost[count] = INT64_MAX;
+ sb_enc->tpl_intra_cost[count] = INT64_MAX;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ sb_enc->tpl_mv[count][i].as_int = INVALID_MV;
+ }
+ count++;
+ continue;
+ }
+
+ TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+ row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+ sb_enc->tpl_inter_cost[count] = this_stats->inter_cost
+ << TPL_DEP_COST_SCALE_LOG2;
+ sb_enc->tpl_intra_cost[count] = this_stats->intra_cost
+ << TPL_DEP_COST_SCALE_LOG2;
+ memcpy(sb_enc->tpl_mv[count], this_stats->mv, sizeof(this_stats->mv));
+ mi_count++;
+ count++;
+ }
+ }
+
+ assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+ sb_enc->tpl_data_count = mi_count;
+}
+
+// analysis_type 0: Use mc_dep_cost and intra_cost
+// analysis_type 1: Use count of best inter predictor chosen
+// analysis_type 2: Use cost reduction from intra to inter for best inter
+// predictor chosen
+int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, ThreadData *td,
+ int64_t *delta_dist, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int tpl_idx = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+ double intra_cost = 0;
+ double mc_dep_reg = 0;
+ double mc_dep_cost = 0;
+ double cbcmp_base = 1;
+ double srcrf_dist = 0;
+ double srcrf_sse = 0;
+ double srcrf_rate = 0;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+ const int base_qindex = cm->quant_params.base_qindex;
+
+ if (tpl_idx >= MAX_TPL_FRAME_IDX) return base_qindex;
+
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ int tpl_stride = tpl_frame->stride;
+ if (!tpl_frame->is_valid) return base_qindex;
+
+#ifndef NDEBUG
+ int mi_count = 0;
+#endif
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_col_end_sr =
+ coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+ const int step = 1 << block_mis_log2;
+ const int row_step = step;
+ const int col_step_sr =
+ coded_to_superres_mi(step, cm->superres_scale_denominator);
+ for (int row = mi_row; row < mi_row + mi_high; row += row_step) {
+ for (int col = mi_col_sr; col < mi_col_end_sr; col += col_step_sr) {
+ if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue;
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)];
+ double cbcmp = (double)this_stats->srcrf_dist;
+ int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS);
+ intra_cost += log(dist_scaled) * cbcmp;
+ mc_dep_cost += log(dist_scaled + mc_dep_delta) * cbcmp;
+ mc_dep_reg += log(3 * dist_scaled + mc_dep_delta) * cbcmp;
+ srcrf_dist += (double)(this_stats->srcrf_dist << RDDIV_BITS);
+ srcrf_sse += (double)(this_stats->srcrf_sse << RDDIV_BITS);
+ srcrf_rate += (double)(this_stats->srcrf_rate << TPL_DEP_COST_SCALE_LOG2);
+#ifndef NDEBUG
+ mi_count++;
+#endif
+ cbcmp_base += cbcmp;
+ }
+ }
+ assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
+
+ int offset = 0;
+ double beta = 1.0;
+ double rk;
+ if (mc_dep_cost > 0 && intra_cost > 0) {
+ const double r0 = cpi->rd.r0;
+ rk = exp((intra_cost - mc_dep_cost) / cbcmp_base);
+ td->mb.rb = exp((intra_cost - mc_dep_reg) / cbcmp_base);
+ beta = (r0 / rk);
+ assert(beta > 0.0);
+ } else {
+ return base_qindex;
+ }
+ offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta);
+
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1);
+ offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1);
+ int qindex = cm->quant_params.base_qindex + offset;
+ qindex = AOMMIN(qindex, MAXQ);
+ qindex = AOMMAX(qindex, MINQ);
+
+ int frm_qstep = av1_dc_quant_QTX(base_qindex, 0, cm->seq_params->bit_depth);
+ int sbs_qstep =
+ av1_dc_quant_QTX(base_qindex, offset, cm->seq_params->bit_depth);
+
+ if (delta_dist) {
+ double sbs_dist = srcrf_dist * pow((double)sbs_qstep / frm_qstep, 2.0);
+ double sbs_rate = srcrf_rate * ((double)frm_qstep / sbs_qstep);
+ sbs_dist = AOMMIN(sbs_dist, srcrf_sse);
+ *delta_dist = (int64_t)((sbs_dist - srcrf_dist) / rk);
+ *delta_dist += RDCOST(tpl_frame->base_rdmult, 4 * 256, 0);
+ *delta_dist += RDCOST(tpl_frame->base_rdmult, sbs_rate - srcrf_rate, 0);
+ }
+ return qindex;
+}
+
+#if !DISABLE_HDR_LUMA_DELTAQ
+// Offset table defined in Table 3 of the T-REC-H.Sup15 document.
+static const int hdr_thres[HDR_QP_LEVELS + 1] = { 0, 301, 367, 434, 501, 567,
+ 634, 701, 767, 834, 1024 };
+
+static const int hdr10_qp_offset[HDR_QP_LEVELS] = { 3, 2, 1, 0, -1,
+ -2, -3, -4, -5, -6 };
+#endif
+
+int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ assert(cm->seq_params->bit_depth == AOM_BITS_10);
+
+#if DISABLE_HDR_LUMA_DELTAQ
+ (void)x;
+ (void)bsize;
+ (void)mi_row;
+ (void)mi_col;
+ return cm->quant_params.base_qindex;
+#else
+  // Calculate the pixel average.
+ const int block_luma_avg = av1_log_block_avg(cpi, x, bsize, mi_row, mi_col);
+  // Adjust the offset based on the average of the pixel block.
+ int offset = 0;
+ for (int i = 0; i < HDR_QP_LEVELS; i++) {
+ if (block_luma_avg >= hdr_thres[i] && block_luma_avg < hdr_thres[i + 1]) {
+ offset = (int)(hdr10_qp_offset[i] * QP_SCALE_FACTOR);
+ break;
+ }
+ }
+
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1);
+ offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1);
+ int qindex = cm->quant_params.base_qindex + offset;
+ qindex = AOMMIN(qindex, MAXQ);
+ qindex = AOMMAX(qindex, MINQ);
+
+ return qindex;
+#endif
+}
+#endif // !CONFIG_REALTIME_ONLY
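+
+// Editor's note (worked example): for a 10-bit block_luma_avg of 400, the
+// bucket scan in av1_get_q_for_hdr() lands in [367, 434), so the offset is
+// hdr10_qp_offset[2] * QP_SCALE_FACTOR = 1 * QP_SCALE_FACTOR before
+// clamping; brighter blocks (towards 1024) get increasingly negative
+// offsets, i.e. finer quantization.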
+
+void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree,
+ BLOCK_SIZE bsize) {
+ if (sms_tree == NULL) return;
+ sms_tree->partitioning = PARTITION_NONE;
+
+ if (bsize >= BLOCK_8X8) {
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ for (int idx = 0; idx < 4; ++idx)
+ av1_reset_simple_motion_tree_partition(sms_tree->split[idx], subsize);
+ }
+}
+
+// Record the ref frames that have been selected by square partition blocks.
+void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type,
+ BLOCK_SIZE bsize, int mib_size,
+ int mi_row, int mi_col) {
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+ const int sb_size_mask = mib_size - 1;
+ const int mi_row_in_sb = mi_row & sb_size_mask;
+ const int mi_col_in_sb = mi_col & sb_size_mask;
+ const int mi_size = mi_size_wide[bsize];
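+  // Editor's note: the mask is a fixed 32x32 grid of mi units; 32 mi spans
+  // the largest superblock (128x128 pixels at 4x4 pixels per mi), hence the
+  // constant i * 32 + j stride below.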
+ for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_size; ++i) {
+ for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_size; ++j) {
+ x->picked_ref_frames_mask[i * 32 + j] |= 1 << ref_type;
+ }
+ }
+}
+
+static void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left, aom_cdf_prob *cdf_ptr_tr,
+ int num_cdfs, int cdf_stride, int nsymbs,
+ int wt_left, int wt_tr) {
+ for (int i = 0; i < num_cdfs; i++) {
+ for (int j = 0; j <= nsymbs; j++) {
+ cdf_ptr_left[i * cdf_stride + j] =
+ (aom_cdf_prob)(((int)cdf_ptr_left[i * cdf_stride + j] * wt_left +
+ (int)cdf_ptr_tr[i * cdf_stride + j] * wt_tr +
+ ((wt_left + wt_tr) / 2)) /
+ (wt_left + wt_tr));
+ assert(cdf_ptr_left[i * cdf_stride + j] >= 0 &&
+ cdf_ptr_left[i * cdf_stride + j] < CDF_PROB_TOP);
+ }
+ }
+}
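+
+// Editor's note (worked example): the averaging above is a per-entry
+// weighted mean with round-to-nearest. E.g. wt_left = 3, wt_tr = 1 and CDF
+// entries 12000 (left) and 20000 (top-right) give
+//   (12000 * 3 + 20000 * 1 + 2) / 4 = 14000,
+// biasing the merged CDF towards the left neighbor.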
+
+#define AVERAGE_CDF(cname_left, cname_tr, nsymbs) \
+ AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, CDF_SIZE(nsymbs))
+
+#define AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, cdf_stride) \
+ do { \
+ aom_cdf_prob *cdf_ptr_left = (aom_cdf_prob *)cname_left; \
+ aom_cdf_prob *cdf_ptr_tr = (aom_cdf_prob *)cname_tr; \
+ int array_size = (int)sizeof(cname_left) / sizeof(aom_cdf_prob); \
+ int num_cdfs = array_size / cdf_stride; \
+ avg_cdf_symbol(cdf_ptr_left, cdf_ptr_tr, num_cdfs, cdf_stride, nsymbs, \
+ wt_left, wt_tr); \
+ } while (0)
+
+static void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr, int wt_left,
+ int wt_tr) {
+ AVERAGE_CDF(nmv_left->joints_cdf, nmv_tr->joints_cdf, 4);
+ for (int i = 0; i < 2; i++) {
+ AVERAGE_CDF(nmv_left->comps[i].classes_cdf, nmv_tr->comps[i].classes_cdf,
+ MV_CLASSES);
+ AVERAGE_CDF(nmv_left->comps[i].class0_fp_cdf,
+ nmv_tr->comps[i].class0_fp_cdf, MV_FP_SIZE);
+ AVERAGE_CDF(nmv_left->comps[i].fp_cdf, nmv_tr->comps[i].fp_cdf, MV_FP_SIZE);
+ AVERAGE_CDF(nmv_left->comps[i].sign_cdf, nmv_tr->comps[i].sign_cdf, 2);
+ AVERAGE_CDF(nmv_left->comps[i].class0_hp_cdf,
+ nmv_tr->comps[i].class0_hp_cdf, 2);
+ AVERAGE_CDF(nmv_left->comps[i].hp_cdf, nmv_tr->comps[i].hp_cdf, 2);
+ AVERAGE_CDF(nmv_left->comps[i].class0_cdf, nmv_tr->comps[i].class0_cdf,
+ CLASS0_SIZE);
+ AVERAGE_CDF(nmv_left->comps[i].bits_cdf, nmv_tr->comps[i].bits_cdf, 2);
+ }
+}
+
+// When row-based multi-threading is enabled in the encoder, we always keep a
+// top-right sync, so we can average the top-right SB's CDFs with the left
+// SB's CDFs and use the result for the current SB's encoding to improve
+// performance. This function performs that CDF averaging and is used only
+// when row-mt is enabled in the encoder.
+void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr,
+ int wt_left, int wt_tr) {
+ AVERAGE_CDF(ctx_left->txb_skip_cdf, ctx_tr->txb_skip_cdf, 2);
+ AVERAGE_CDF(ctx_left->eob_extra_cdf, ctx_tr->eob_extra_cdf, 2);
+ AVERAGE_CDF(ctx_left->dc_sign_cdf, ctx_tr->dc_sign_cdf, 2);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf16, ctx_tr->eob_flag_cdf16, 5);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf32, ctx_tr->eob_flag_cdf32, 6);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf64, ctx_tr->eob_flag_cdf64, 7);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf128, ctx_tr->eob_flag_cdf128, 8);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf256, ctx_tr->eob_flag_cdf256, 9);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf512, ctx_tr->eob_flag_cdf512, 10);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf1024, ctx_tr->eob_flag_cdf1024, 11);
+ AVERAGE_CDF(ctx_left->coeff_base_eob_cdf, ctx_tr->coeff_base_eob_cdf, 3);
+ AVERAGE_CDF(ctx_left->coeff_base_cdf, ctx_tr->coeff_base_cdf, 4);
+ AVERAGE_CDF(ctx_left->coeff_br_cdf, ctx_tr->coeff_br_cdf, BR_CDF_SIZE);
+ AVERAGE_CDF(ctx_left->newmv_cdf, ctx_tr->newmv_cdf, 2);
+ AVERAGE_CDF(ctx_left->zeromv_cdf, ctx_tr->zeromv_cdf, 2);
+ AVERAGE_CDF(ctx_left->refmv_cdf, ctx_tr->refmv_cdf, 2);
+ AVERAGE_CDF(ctx_left->drl_cdf, ctx_tr->drl_cdf, 2);
+ AVERAGE_CDF(ctx_left->inter_compound_mode_cdf,
+ ctx_tr->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
+ AVERAGE_CDF(ctx_left->compound_type_cdf, ctx_tr->compound_type_cdf,
+ MASKED_COMPOUND_TYPES);
+ AVERAGE_CDF(ctx_left->wedge_idx_cdf, ctx_tr->wedge_idx_cdf, 16);
+ AVERAGE_CDF(ctx_left->interintra_cdf, ctx_tr->interintra_cdf, 2);
+ AVERAGE_CDF(ctx_left->wedge_interintra_cdf, ctx_tr->wedge_interintra_cdf, 2);
+ AVERAGE_CDF(ctx_left->interintra_mode_cdf, ctx_tr->interintra_mode_cdf,
+ INTERINTRA_MODES);
+ AVERAGE_CDF(ctx_left->motion_mode_cdf, ctx_tr->motion_mode_cdf, MOTION_MODES);
+ AVERAGE_CDF(ctx_left->obmc_cdf, ctx_tr->obmc_cdf, 2);
+ AVERAGE_CDF(ctx_left->palette_y_size_cdf, ctx_tr->palette_y_size_cdf,
+ PALETTE_SIZES);
+ AVERAGE_CDF(ctx_left->palette_uv_size_cdf, ctx_tr->palette_uv_size_cdf,
+ PALETTE_SIZES);
+ for (int j = 0; j < PALETTE_SIZES; j++) {
+ int nsymbs = j + PALETTE_MIN_SIZE;
+ AVG_CDF_STRIDE(ctx_left->palette_y_color_index_cdf[j],
+ ctx_tr->palette_y_color_index_cdf[j], nsymbs,
+ CDF_SIZE(PALETTE_COLORS));
+ AVG_CDF_STRIDE(ctx_left->palette_uv_color_index_cdf[j],
+ ctx_tr->palette_uv_color_index_cdf[j], nsymbs,
+ CDF_SIZE(PALETTE_COLORS));
+ }
+ AVERAGE_CDF(ctx_left->palette_y_mode_cdf, ctx_tr->palette_y_mode_cdf, 2);
+ AVERAGE_CDF(ctx_left->palette_uv_mode_cdf, ctx_tr->palette_uv_mode_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_inter_cdf, ctx_tr->comp_inter_cdf, 2);
+ AVERAGE_CDF(ctx_left->single_ref_cdf, ctx_tr->single_ref_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_ref_type_cdf, ctx_tr->comp_ref_type_cdf, 2);
+ AVERAGE_CDF(ctx_left->uni_comp_ref_cdf, ctx_tr->uni_comp_ref_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_ref_cdf, ctx_tr->comp_ref_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_bwdref_cdf, ctx_tr->comp_bwdref_cdf, 2);
+ AVERAGE_CDF(ctx_left->txfm_partition_cdf, ctx_tr->txfm_partition_cdf, 2);
+ AVERAGE_CDF(ctx_left->compound_index_cdf, ctx_tr->compound_index_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_group_idx_cdf, ctx_tr->comp_group_idx_cdf, 2);
+ AVERAGE_CDF(ctx_left->skip_mode_cdfs, ctx_tr->skip_mode_cdfs, 2);
+ AVERAGE_CDF(ctx_left->skip_txfm_cdfs, ctx_tr->skip_txfm_cdfs, 2);
+ AVERAGE_CDF(ctx_left->intra_inter_cdf, ctx_tr->intra_inter_cdf, 2);
+ avg_nmv(&ctx_left->nmvc, &ctx_tr->nmvc, wt_left, wt_tr);
+ avg_nmv(&ctx_left->ndvc, &ctx_tr->ndvc, wt_left, wt_tr);
+ AVERAGE_CDF(ctx_left->intrabc_cdf, ctx_tr->intrabc_cdf, 2);
+ AVERAGE_CDF(ctx_left->seg.pred_cdf, ctx_tr->seg.pred_cdf, 2);
+ AVERAGE_CDF(ctx_left->seg.spatial_pred_seg_cdf,
+ ctx_tr->seg.spatial_pred_seg_cdf, MAX_SEGMENTS);
+ AVERAGE_CDF(ctx_left->filter_intra_cdfs, ctx_tr->filter_intra_cdfs, 2);
+ AVERAGE_CDF(ctx_left->filter_intra_mode_cdf, ctx_tr->filter_intra_mode_cdf,
+ FILTER_INTRA_MODES);
+ AVERAGE_CDF(ctx_left->switchable_restore_cdf, ctx_tr->switchable_restore_cdf,
+ RESTORE_SWITCHABLE_TYPES);
+ AVERAGE_CDF(ctx_left->wiener_restore_cdf, ctx_tr->wiener_restore_cdf, 2);
+ AVERAGE_CDF(ctx_left->sgrproj_restore_cdf, ctx_tr->sgrproj_restore_cdf, 2);
+ AVERAGE_CDF(ctx_left->y_mode_cdf, ctx_tr->y_mode_cdf, INTRA_MODES);
+ AVG_CDF_STRIDE(ctx_left->uv_mode_cdf[0], ctx_tr->uv_mode_cdf[0],
+ UV_INTRA_MODES - 1, CDF_SIZE(UV_INTRA_MODES));
+ AVERAGE_CDF(ctx_left->uv_mode_cdf[1], ctx_tr->uv_mode_cdf[1], UV_INTRA_MODES);
+ for (int i = 0; i < PARTITION_CONTEXTS; i++) {
+ if (i < 4) {
+ AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 4,
+ CDF_SIZE(10));
+ } else if (i < 16) {
+ AVERAGE_CDF(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 10);
+ } else {
+ AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 8,
+ CDF_SIZE(10));
+ }
+ }
+ AVERAGE_CDF(ctx_left->switchable_interp_cdf, ctx_tr->switchable_interp_cdf,
+ SWITCHABLE_FILTERS);
+ AVERAGE_CDF(ctx_left->kf_y_cdf, ctx_tr->kf_y_cdf, INTRA_MODES);
+ AVERAGE_CDF(ctx_left->angle_delta_cdf, ctx_tr->angle_delta_cdf,
+ 2 * MAX_ANGLE_DELTA + 1);
+ AVG_CDF_STRIDE(ctx_left->tx_size_cdf[0], ctx_tr->tx_size_cdf[0], MAX_TX_DEPTH,
+ CDF_SIZE(MAX_TX_DEPTH + 1));
+ AVERAGE_CDF(ctx_left->tx_size_cdf[1], ctx_tr->tx_size_cdf[1],
+ MAX_TX_DEPTH + 1);
+ AVERAGE_CDF(ctx_left->tx_size_cdf[2], ctx_tr->tx_size_cdf[2],
+ MAX_TX_DEPTH + 1);
+ AVERAGE_CDF(ctx_left->tx_size_cdf[3], ctx_tr->tx_size_cdf[3],
+ MAX_TX_DEPTH + 1);
+ AVERAGE_CDF(ctx_left->delta_q_cdf, ctx_tr->delta_q_cdf, DELTA_Q_PROBS + 1);
+ AVERAGE_CDF(ctx_left->delta_lf_cdf, ctx_tr->delta_lf_cdf, DELTA_LF_PROBS + 1);
+ for (int i = 0; i < FRAME_LF_COUNT; i++) {
+ AVERAGE_CDF(ctx_left->delta_lf_multi_cdf[i], ctx_tr->delta_lf_multi_cdf[i],
+ DELTA_LF_PROBS + 1);
+ }
+ AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[1], ctx_tr->intra_ext_tx_cdf[1], 7,
+ CDF_SIZE(TX_TYPES));
+ AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[2], ctx_tr->intra_ext_tx_cdf[2], 5,
+ CDF_SIZE(TX_TYPES));
+ AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[1], ctx_tr->inter_ext_tx_cdf[1], 16,
+ CDF_SIZE(TX_TYPES));
+ AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[2], ctx_tr->inter_ext_tx_cdf[2], 12,
+ CDF_SIZE(TX_TYPES));
+ AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[3], ctx_tr->inter_ext_tx_cdf[3], 2,
+ CDF_SIZE(TX_TYPES));
+ AVERAGE_CDF(ctx_left->cfl_sign_cdf, ctx_tr->cfl_sign_cdf, CFL_JOINT_SIGNS);
+ AVERAGE_CDF(ctx_left->cfl_alpha_cdf, ctx_tr->cfl_alpha_cdf,
+ CFL_ALPHABET_SIZE);
+}
+
+// Check neighbor blocks' motion information.
+static int check_neighbor_blocks(MB_MODE_INFO **mi, int mi_stride,
+ const TileInfo *const tile_info, int mi_row,
+ int mi_col) {
+ int is_above_low_motion = 1;
+ int is_left_low_motion = 1;
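+  // MVs are stored in 1/8-pel units, so thr = 24 corresponds to a motion of
+  // 3 pixels.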
+ const int thr = 24;
+
+ // Check above block.
+ if (mi_row > tile_info->mi_row_start) {
+ const MB_MODE_INFO *above_mbmi = mi[-mi_stride];
+ const int_mv above_mv = above_mbmi->mv[0];
+ if (above_mbmi->mode >= INTRA_MODE_END &&
+ (abs(above_mv.as_mv.row) > thr || abs(above_mv.as_mv.col) > thr))
+ is_above_low_motion = 0;
+ }
+
+ // Check left block.
+ if (mi_col > tile_info->mi_col_start) {
+ const MB_MODE_INFO *left_mbmi = mi[-1];
+ const int_mv left_mv = left_mbmi->mv[0];
+ if (left_mbmi->mode >= INTRA_MODE_END &&
+ (abs(left_mv.as_mv.row) > thr || abs(left_mv.as_mv.col) > thr))
+ is_left_low_motion = 0;
+ }
+
+ return (is_above_low_motion && is_left_low_motion);
+}
+
+// Fast check of this block's motion: returns 1 if the zero-mv collocated
+// block is clearly the best match, i.e. the block is low-motion.
+static int fast_detect_non_zero_motion(AV1_COMP *cpi, const uint8_t *src_y,
+ int src_ystride,
+ const uint8_t *last_src_y,
+ int last_src_ystride, int mi_row,
+ int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const BLOCK_SIZE bsize = cm->seq_params->sb_size;
+ unsigned int blk_sad = INT_MAX;
+ if (cpi->src_sad_blk_64x64 != NULL) {
+ const int sb_size_by_mb = (bsize == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int sb_cols =
+ (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+ const int sbi_col = mi_col / sb_size_by_mb;
+ const int sbi_row = mi_row / sb_size_by_mb;
+ blk_sad = (unsigned int)cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols];
+ } else {
+ blk_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
+ last_src_ystride);
+ }
+
+ // Search 4 1-away points.
+ const uint8_t *const search_pos[4] = {
+ last_src_y - last_src_ystride,
+ last_src_y - 1,
+ last_src_y + 1,
+ last_src_y + last_src_ystride,
+ };
+ unsigned int sad_arr[4];
+ cpi->ppi->fn_ptr[bsize].sdx4df(src_y, src_ystride, search_pos,
+ last_src_ystride, sad_arr);
+
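+  // Scale the zero-mv SAD by 5/8: accept zero motion only if it is clearly
+  // better than all four 1-pixel-away candidates, i.e. the collocated block
+  // is a strong local minimum of the SAD surface.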
+ blk_sad = (blk_sad * 5) >> 3;
+ return (blk_sad < sad_arr[0] && blk_sad < sad_arr[1] &&
+ blk_sad < sad_arr[2] && blk_sad < sad_arr[3]);
+}
+
+// Grade the temporal variation of the source by comparing the current sb and
+// its collocated block in the last frame.
+void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
+ int mi_row, int mi_col) {
+ if (cpi->last_source->y_width != cpi->source->y_width ||
+ cpi->last_source->y_height != cpi->source->y_height)
+ return;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) return;
+#endif
+
+ unsigned int tmp_sse;
+ unsigned int tmp_variance;
+ const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size;
+ uint8_t *src_y = cpi->source->y_buffer;
+ const int src_ystride = cpi->source->y_stride;
+ const int src_offset = src_ystride * (mi_row << 2) + (mi_col << 2);
+ uint8_t *last_src_y = cpi->last_source->y_buffer;
+ const int last_src_ystride = cpi->last_source->y_stride;
+ const int last_src_offset = last_src_ystride * (mi_row << 2) + (mi_col << 2);
+ uint64_t avg_source_sse_threshold_verylow = 10000; // ~1.5*1.5*(64*64)
+ uint64_t avg_source_sse_threshold_low[2] = { 100000, // ~5*5*(64*64)
+ 36000 }; // ~3*3*(64*64)
+
+ uint64_t avg_source_sse_threshold_high = 1000000; // ~15*15*(64*64)
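+  // The SSE thresholds above correspond to average per-pixel differences of
+  // roughly 1.5, 3, 5 and 15 for a 64x64 (4096-pixel) block.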
+ if (cpi->sf.rt_sf.increase_source_sad_thresh) {
+ avg_source_sse_threshold_high = avg_source_sse_threshold_high << 1;
+ avg_source_sse_threshold_low[0] = avg_source_sse_threshold_low[0] << 1;
+ avg_source_sse_threshold_verylow = avg_source_sse_threshold_verylow << 1;
+ }
+  uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / (64 * 64)) ~1.5
+ src_y += src_offset;
+ last_src_y += last_src_offset;
+ tmp_variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
+ last_src_ystride, &tmp_sse);
+ // rd thresholds
+ if (tmp_sse < avg_source_sse_threshold_low[1])
+ x->content_state_sb.source_sad_rd = kLowSad;
+
+ // nonrd thresholds
+ if (tmp_sse == 0) {
+ x->content_state_sb.source_sad_nonrd = kZeroSad;
+ return;
+ }
+ if (tmp_sse < avg_source_sse_threshold_verylow)
+ x->content_state_sb.source_sad_nonrd = kVeryLowSad;
+ else if (tmp_sse < avg_source_sse_threshold_low[0])
+ x->content_state_sb.source_sad_nonrd = kLowSad;
+ else if (tmp_sse > avg_source_sse_threshold_high)
+ x->content_state_sb.source_sad_nonrd = kHighSad;
+
+ // Detect large lighting change.
+ // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12)
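+  // (variance = sse - sum * sum / 4096 for a 64x64 block), which estimates
+  // n * mean^2. A mean shift that is large relative to the variance suggests
+  // a global brightness change rather than object motion.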
+ if (tmp_variance < (tmp_sse >> 1) && (tmp_sse - tmp_variance) > sum_sq_thresh)
+ x->content_state_sb.lighting_change = 1;
+ if ((tmp_sse - tmp_variance) < (sum_sq_thresh >> 1))
+ x->content_state_sb.low_sumdiff = 1;
+
+ if (!cpi->sf.rt_sf.use_rtc_tf || cpi->rc.high_source_sad ||
+ cpi->rc.frame_source_sad > 20000 || cpi->svc.number_spatial_layers > 1)
+ return;
+
+ // In-place temporal filter. If psnr calculation is enabled, we store the
+ // source for that.
+ AV1_COMMON *const cm = &cpi->common;
+ // Calculate n*mean^2
+ const unsigned int nmean2 = tmp_sse - tmp_variance;
+ const int ac_q_step = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0,
+ cm->seq_params->bit_depth);
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const int avg_q_step = av1_ac_quant_QTX(p_rc->avg_frame_qindex[INTER_FRAME],
+ 0, cm->seq_params->bit_depth);
+
+ const unsigned int threshold =
+ (cpi->sf.rt_sf.use_rtc_tf == 1)
+ ? (clamp(avg_q_step, 250, 1000)) * ac_q_step
+ : 250 * ac_q_step;
+
+ // TODO(yunqing): use a weighted sum instead of averaging in filtering.
+ if (tmp_variance <= threshold && nmean2 <= 15) {
+ // Check neighbor blocks. If neighbor blocks aren't low-motion blocks,
+ // skip temporal filtering for this block.
+ MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
+ get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ const int is_neighbor_blocks_low_motion = check_neighbor_blocks(
+ mi, cm->mi_params.mi_stride, tile_info, mi_row, mi_col);
+ if (!is_neighbor_blocks_low_motion) return;
+
+ // Only consider 64x64 SB for now. Need to extend to 128x128 for large SB
+ // size.
+ // Test several nearby points. If non-zero mv exists, don't do temporal
+ // filtering.
+ const int is_this_blk_low_motion = fast_detect_non_zero_motion(
+ cpi, src_y, src_ystride, last_src_y, last_src_ystride, mi_row, mi_col);
+
+ if (!is_this_blk_low_motion) return;
+
+ const int shift_x[2] = { 0, cpi->source->subsampling_x };
+ const int shift_y[2] = { 0, cpi->source->subsampling_y };
+ const uint8_t h = block_size_high[bsize];
+ const uint8_t w = block_size_wide[bsize];
+
+ for (int plane = 0; plane < av1_num_planes(cm); ++plane) {
+ uint8_t *src = cpi->source->buffers[plane];
+ const int src_stride = cpi->source->strides[plane != 0];
+ uint8_t *last_src = cpi->last_source->buffers[plane];
+ const int last_src_stride = cpi->last_source->strides[plane != 0];
+ src += src_stride * (mi_row << (2 - shift_y[plane != 0])) +
+ (mi_col << (2 - shift_x[plane != 0]));
+ last_src += last_src_stride * (mi_row << (2 - shift_y[plane != 0])) +
+ (mi_col << (2 - shift_x[plane != 0]));
+
+ for (int i = 0; i < (h >> shift_y[plane != 0]); ++i) {
+ for (int j = 0; j < (w >> shift_x[plane != 0]); ++j) {
+ src[j] = (last_src[j] + src[j]) >> 1;
+ }
+ src += src_stride;
+ last_src += last_src_stride;
+ }
+ }
+ }
+}
+
+// Memset the mbmi structures covering the current superblock to 0.
+void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size,
+ int mi_row, int mi_col) {
+ // size of sb in unit of mi (BLOCK_4X4)
+ const int sb_size_mi = mi_size_wide[sb_size];
+ const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+ // size of sb in unit of allocated mi size
+ const int sb_size_alloc_mi = mi_size_wide[sb_size] / mi_alloc_size_1d;
+ assert(mi_params->mi_alloc_stride % sb_size_alloc_mi == 0 &&
+ "mi is not allocated as a multiple of sb!");
+ assert(mi_params->mi_stride % sb_size_mi == 0 &&
+ "mi_grid_base is not allocated as a multiple of sb!");
+
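+  // mi_grid_base and tx_type_map are addressed per 4x4 unit, while mi_alloc
+  // is addressed at mi_alloc_bsize granularity; hence the separate indices
+  // and the modulo check inside the loop below.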
+ const int mi_rows = mi_size_high[sb_size];
+ for (int cur_mi_row = 0; cur_mi_row < mi_rows; cur_mi_row++) {
+ assert(get_mi_grid_idx(mi_params, 0, mi_col + mi_alloc_size_1d) <
+ mi_params->mi_stride);
+ const int mi_grid_idx =
+ get_mi_grid_idx(mi_params, mi_row + cur_mi_row, mi_col);
+ const int alloc_mi_idx =
+ get_alloc_mi_idx(mi_params, mi_row + cur_mi_row, mi_col);
+ memset(&mi_params->mi_grid_base[mi_grid_idx], 0,
+ sb_size_mi * sizeof(*mi_params->mi_grid_base));
+ memset(&mi_params->tx_type_map[mi_grid_idx], 0,
+ sb_size_mi * sizeof(*mi_params->tx_type_map));
+ if (cur_mi_row % mi_alloc_size_1d == 0) {
+ memset(&mi_params->mi_alloc[alloc_mi_idx], 0,
+ sb_size_alloc_mi * sizeof(*mi_params->mi_alloc));
+ }
+ }
+}
+
+void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi,
+ ThreadData *td, const TileDataEnc *tile_data,
+ int mi_row, int mi_col) {
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const TileInfo *tile_info = &tile_data->tile_info;
+
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ av1_save_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes);
+
+ sb_fp_stats->rd_count = td->rd_counts;
+ sb_fp_stats->split_count = x->txfm_search_info.txb_split_count;
+
+ sb_fp_stats->fc = *td->counts;
+
+  // Don't copy in the row_mt case, otherwise we run into a data race. There is
+  // no behavior change in the row_mt case.
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ memcpy(sb_fp_stats->inter_mode_rd_models, tile_data->inter_mode_rd_models,
+ sizeof(sb_fp_stats->inter_mode_rd_models));
+ }
+
+ memcpy(sb_fp_stats->thresh_freq_fact, x->thresh_freq_fact,
+ sizeof(sb_fp_stats->thresh_freq_fact));
+
+ const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+ sb_fp_stats->current_qindex =
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+ memcpy(sb_fp_stats->mode_chosen_counts, cpi->mode_chosen_counts,
+ sizeof(sb_fp_stats->mode_chosen_counts));
+#endif // CONFIG_INTERNAL_STATS
+}
+
+void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi,
+ ThreadData *td, TileDataEnc *tile_data, int mi_row,
+ int mi_col) {
+ MACROBLOCK *x = &td->mb;
+
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+
+ av1_restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size,
+ num_planes);
+
+ td->rd_counts = sb_fp_stats->rd_count;
+ x->txfm_search_info.txb_split_count = sb_fp_stats->split_count;
+
+ *td->counts = sb_fp_stats->fc;
+
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ memcpy(tile_data->inter_mode_rd_models, sb_fp_stats->inter_mode_rd_models,
+ sizeof(sb_fp_stats->inter_mode_rd_models));
+ }
+
+ memcpy(x->thresh_freq_fact, sb_fp_stats->thresh_freq_fact,
+ sizeof(sb_fp_stats->thresh_freq_fact));
+
+ const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col);
+ cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex =
+ sb_fp_stats->current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+ memcpy(cpi->mode_chosen_counts, sb_fp_stats->mode_chosen_counts,
+ sizeof(sb_fp_stats->mode_chosen_counts));
+#endif // CONFIG_INTERNAL_STATS
+}
+
+/*! Checks whether to skip updating the entropy cost based on tile info.
+ *
+ * This function contains the common code used to skip the cost update of coeff,
+ * mode, mv and dv symbols.
+ */
+static int skip_cost_update(const SequenceHeader *seq_params,
+ const TileInfo *const tile_info, const int mi_row,
+ const int mi_col,
+ INTERNAL_COST_UPDATE_TYPE upd_level) {
+ if (upd_level == INTERNAL_COST_UPD_SB) return 0;
+ if (upd_level == INTERNAL_COST_UPD_OFF) return 1;
+
+  // upd_level triggers a cost update at most once per SB row in a tile.
+ if (mi_col != tile_info->mi_col_start) return 1;
+
+ if (upd_level == INTERNAL_COST_UPD_SBROW_SET) {
+ const int mib_size_log2 = seq_params->mib_size_log2;
+ const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2;
+ const int sb_size = seq_params->mib_size * MI_SIZE;
+ const int tile_height =
+ (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE;
+    // When upd_level = INTERNAL_COST_UPD_SBROW_SET, the cost update happens
+    // once every 2 SB rows for SB size 128 and once every 4 SB rows for SB
+    // size 64. However, at smaller resolutions the updates would not be
+    // equally spaced, so they are made equally spaced by computing the number
+    // of SB rows after which each cost update should happen.
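+    // For example (illustrative numbers): with sb_size 64 and a 360-pixel
+    // tall tile, update_freq_num_rows = 64 * 4 = 256, so
+    // num_updates_per_tile = ceil(360 / 256) = 2, num_rows_update_per_tile =
+    // 2 * 64 = 128, and num_sb_rows_per_update = ceil(360 / 128) = 3: the two
+    // updates land at sb_row 0 and 3, evenly spaced across the 6 SB rows.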
+ const int sb_size_update_freq_map[2] = { 2, 4 };
+ const int update_freq_sb_rows =
+ sb_size_update_freq_map[sb_size != MAX_SB_SIZE];
+ const int update_freq_num_rows = sb_size * update_freq_sb_rows;
+ // Round-up the division result to next integer.
+ const int num_updates_per_tile =
+ (tile_height + update_freq_num_rows - 1) / update_freq_num_rows;
+ const int num_rows_update_per_tile = num_updates_per_tile * sb_size;
+ // Round-up the division result to next integer.
+ const int num_sb_rows_per_update =
+ (tile_height + num_rows_update_per_tile - 1) / num_rows_update_per_tile;
+ if ((sb_row % num_sb_rows_per_update) != 0) return 1;
+ }
+ return 0;
+}
+
+// Checks for skip status of mv cost update.
+static int skip_mv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info,
+ const int mi_row, const int mi_col) {
+ const AV1_COMMON *cm = &cpi->common;
+ // For intra frames, mv cdfs are not updated during the encode. Hence, the mv
+ // cost calculation is skipped in this case.
+ if (frame_is_intra_only(cm)) return 1;
+
+ return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col,
+ cpi->sf.inter_sf.mv_cost_upd_level);
+}
+
+// Checks for skip status of dv cost update.
+static int skip_dv_cost_update(AV1_COMP *cpi, const TileInfo *const tile_info,
+ const int mi_row, const int mi_col) {
+ const AV1_COMMON *cm = &cpi->common;
+  // IntraBC is only applicable to intra frames, so skip the update if IntraBC
+  // is not allowed.
+ if (!av1_allow_intrabc(cm) || is_stat_generation_stage(cpi)) {
+ return 1;
+ }
+
+ return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col,
+ cpi->sf.intra_sf.dv_cost_upd_level);
+}
+
+// Update the rate costs of some symbols at the frequency directed by the
+// speed features.
+void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td,
+ const TileInfo *const tile_info, const int mi_row,
+ const int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ if (cm->features.disable_cdf_update) {
+ return;
+ }
+
+ switch (cpi->sf.inter_sf.coeff_cost_upd_level) {
+ case INTERNAL_COST_UPD_OFF:
+ case INTERNAL_COST_UPD_TILE: // Tile level
+ break;
+ case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile
+ case INTERNAL_COST_UPD_SBROW: // SB row level in tile
+ case INTERNAL_COST_UPD_SB: // SB level
+ if (skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col,
+ cpi->sf.inter_sf.coeff_cost_upd_level))
+ break;
+ av1_fill_coeff_costs(&x->coeff_costs, xd->tile_ctx, num_planes);
+ break;
+ default: assert(0);
+ }
+
+ switch (cpi->sf.inter_sf.mode_cost_upd_level) {
+ case INTERNAL_COST_UPD_OFF:
+ case INTERNAL_COST_UPD_TILE: // Tile level
+ break;
+ case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile
+ case INTERNAL_COST_UPD_SBROW: // SB row level in tile
+ case INTERNAL_COST_UPD_SB: // SB level
+ if (skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col,
+ cpi->sf.inter_sf.mode_cost_upd_level))
+ break;
+ av1_fill_mode_rates(cm, &x->mode_costs, xd->tile_ctx);
+ break;
+ default: assert(0);
+ }
+
+ switch (cpi->sf.inter_sf.mv_cost_upd_level) {
+ case INTERNAL_COST_UPD_OFF:
+ case INTERNAL_COST_UPD_TILE: // Tile level
+ break;
+ case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile
+ case INTERNAL_COST_UPD_SBROW: // SB row level in tile
+ case INTERNAL_COST_UPD_SB: // SB level
+ // Checks for skip status of mv cost update.
+ if (skip_mv_cost_update(cpi, tile_info, mi_row, mi_col)) break;
+ av1_fill_mv_costs(&xd->tile_ctx->nmvc,
+ cm->features.cur_frame_force_integer_mv,
+ cm->features.allow_high_precision_mv, x->mv_costs);
+ break;
+ default: assert(0);
+ }
+
+ switch (cpi->sf.intra_sf.dv_cost_upd_level) {
+ case INTERNAL_COST_UPD_OFF:
+ case INTERNAL_COST_UPD_TILE: // Tile level
+ break;
+ case INTERNAL_COST_UPD_SBROW_SET: // SB row set level in tile
+ case INTERNAL_COST_UPD_SBROW: // SB row level in tile
+ case INTERNAL_COST_UPD_SB: // SB level
+ // Checks for skip status of dv cost update.
+ if (skip_dv_cost_update(cpi, tile_info, mi_row, mi_col)) break;
+ av1_fill_dv_costs(&xd->tile_ctx->ndvc, x->dv_costs);
+ break;
+ default: assert(0);
+ }
+}
+
+void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ aom_free(mb->plane[plane].src_diff);
+ mb->plane[plane].src_diff = NULL;
+ }
+}
+
+void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb) {
+ const int num_planes = av1_num_planes(cm);
+#ifndef NDEBUG
+ for (int plane = 0; plane < num_planes; ++plane) {
+ assert(!mb->plane[plane].src_diff);
+ }
+#endif
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int subsampling_xy =
+ plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y
+ : 0;
+ const int sb_size = MAX_SB_SQUARE >> subsampling_xy;
+ CHECK_MEM_ERROR(cm, mb->plane[plane].src_diff,
+ (int16_t *)aom_memalign(
+ 32, sizeof(*mb->plane[plane].src_diff) * sb_size));
+ }
+}
diff --git a/third_party/aom/av1/encoder/encodeframe_utils.h b/third_party/aom/av1/encoder/encodeframe_utils.h
new file mode 100644
index 0000000000..14c71b8802
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe_utils.h
@@ -0,0 +1,595 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
+#define AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
+
+#include "aom_ports/aom_timer.h"
+
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rdopt.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define WRITE_FEATURE_TO_FILE 0
+
+#define FEATURE_SIZE_SMS_SPLIT_FAST 6
+#define FEATURE_SIZE_SMS_SPLIT 17
+#define FEATURE_SIZE_SMS_PRUNE_PART 25
+#define FEATURE_SIZE_SMS_TERM_NONE 28
+#define FEATURE_SIZE_FP_SMS_TERM_NONE 20
+#define FEATURE_SIZE_MAX_MIN_PART_PRED 13
+#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4
+
+#define FEATURE_SMS_NONE_FLAG 1
+#define FEATURE_SMS_SPLIT_FLAG (1 << 1)
+#define FEATURE_SMS_RECT_FLAG (1 << 2)
+
+#define FEATURE_SMS_PRUNE_PART_FLAG \
+ (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG)
+#define FEATURE_SMS_SPLIT_MODEL_FLAG \
+ (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG)
+
+// Number of sub-partitions in rectangular partition types.
+#define SUB_PARTITIONS_RECT 2
+
+// Number of sub-partitions in split partition type.
+#define SUB_PARTITIONS_SPLIT 4
+
+// Number of sub-partitions in AB partition types.
+#define SUB_PARTITIONS_AB 3
+
+// Number of sub-partitions in 4-way partition types.
+#define SUB_PARTITIONS_PART4 4
+
+// 4part partition types.
+enum { HORZ4 = 0, VERT4, NUM_PART4_TYPES } UENUM1BYTE(PART4_TYPES);
+
+// AB partition types.
+enum {
+ HORZ_A = 0,
+ HORZ_B,
+ VERT_A,
+ VERT_B,
+ NUM_AB_PARTS
+} UENUM1BYTE(AB_PART_TYPE);
+
+// Rectangular partition types.
+enum { HORZ = 0, VERT, NUM_RECT_PARTS } UENUM1BYTE(RECT_PART_TYPE);
+
+// Structure to keep win flags for HORZ and VERT partition evaluations.
+typedef struct {
+ int rect_part_win[NUM_RECT_PARTS];
+} RD_RECT_PART_WIN_INFO;
+
+enum { PICK_MODE_RD = 0, PICK_MODE_NONRD };
+
+enum {
+ SB_SINGLE_PASS, // Single pass encoding: all ctxs get updated normally
+ SB_DRY_PASS, // First pass of multi-pass: does not update the ctxs
+ SB_WET_PASS // Second pass of multi-pass: finalize and update the ctx
+} UENUM1BYTE(SB_MULTI_PASS_MODE);
+
+typedef struct {
+ ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE];
+ ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE];
+ PARTITION_CONTEXT sa[MAX_MIB_SIZE];
+ PARTITION_CONTEXT sl[MAX_MIB_SIZE];
+ TXFM_CONTEXT *p_ta;
+ TXFM_CONTEXT *p_tl;
+ TXFM_CONTEXT ta[MAX_MIB_SIZE];
+ TXFM_CONTEXT tl[MAX_MIB_SIZE];
+} RD_SEARCH_MACROBLOCK_CONTEXT;
+
+// This struct is used to store the statistics used by sb-level multi-pass
+// encoding. Currently, this is only used to make a copy of the state before we
+// perform the first pass.
+typedef struct SB_FIRST_PASS_STATS {
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_COUNTS rd_count;
+
+ int split_count;
+ FRAME_COUNTS fc;
+ InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
+ int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+ int current_qindex;
+
+#if CONFIG_INTERNAL_STATS
+ unsigned int mode_chosen_counts[MAX_MODES];
+#endif // CONFIG_INTERNAL_STATS
+} SB_FIRST_PASS_STATS;
+
+// This structure contains block size related
+// variables for use in rd_pick_partition().
+typedef struct {
+  // Half of the block width, used to determine the block edge.
+ int mi_step;
+
+ // Block row and column indices.
+ int mi_row;
+ int mi_col;
+
+ // Block edge row and column indices.
+ int mi_row_edge;
+ int mi_col_edge;
+
+ // Block width of current partition block.
+ int width;
+
+ // Block width of minimum partition size allowed.
+ int min_partition_size_1d;
+
+ // Flag to indicate if partition is 8x8 or higher size.
+ int bsize_at_least_8x8;
+
+ // Indicates edge blocks in frame.
+ int has_rows;
+ int has_cols;
+
+ // Block size of current partition.
+ BLOCK_SIZE bsize;
+
+ // Size of current sub-partition.
+ BLOCK_SIZE subsize;
+
+ // Size of split partition.
+ BLOCK_SIZE split_bsize2;
+} PartitionBlkParams;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+typedef struct PartitionTimingStats {
+  // Tracks the number of partition decisions made in the current call to \ref
+ // av1_rd_pick_partition
+ int partition_decisions[EXT_PARTITION_TYPES];
+  // Tracks the number of partition blocks searched in the current call to \ref
+ // av1_rd_pick_partition
+ int partition_attempts[EXT_PARTITION_TYPES];
+ // Tracks the time spent on each partition search in the current call to \ref
+ // av1_rd_pick_partition
+ int64_t partition_times[EXT_PARTITION_TYPES];
+ // Tracks the rdcost spent on each partition search in the current call to
+ // \ref av1_rd_pick_partition
+ int64_t partition_rdcost[EXT_PARTITION_TYPES];
+ // Timer used to time the partitions.
+ struct aom_usec_timer timer;
+ // Whether the timer is on
+ int timer_is_on;
+} PartitionTimingStats;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+// Structure holding state variables for partition search.
+typedef struct {
+ // Intra partitioning related info.
+ PartitionSearchInfo *intra_part_info;
+
+ // Parameters related to partition block size.
+ PartitionBlkParams part_blk_params;
+
+ // Win flags for HORZ and VERT partition evaluations.
+ RD_RECT_PART_WIN_INFO split_part_rect_win[SUB_PARTITIONS_SPLIT];
+
+ // RD cost for the current block of given partition type.
+ RD_STATS this_rdc;
+
+ // RD cost summed across all blocks of partition type.
+ RD_STATS sum_rdc;
+
+ // Array holding partition type cost.
+ int tmp_partition_cost[PARTITION_TYPES];
+
+ // Pointer to partition cost buffer
+ int *partition_cost;
+
+ // RD costs for different partition types.
+ int64_t none_rd;
+ int64_t split_rd[SUB_PARTITIONS_SPLIT];
+ // RD costs for rectangular partitions.
+ // rect_part_rd[0][i] is the RD cost of ith partition index of PARTITION_HORZ.
+ // rect_part_rd[1][i] is the RD cost of ith partition index of PARTITION_VERT.
+ int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT];
+
+ // Flags indicating if the corresponding partition was winner or not.
+ // Used to bypass similar blocks during AB partition evaluation.
+ int is_split_ctx_is_ready[2];
+ int is_rect_ctx_is_ready[NUM_RECT_PARTS];
+
+ // If true, skips the rest of partition evaluation at the current bsize level.
+ int terminate_partition_search;
+
+ // If false, skips rdopt on PARTITION_NONE.
+ int partition_none_allowed;
+
+ // If partition_rect_allowed[HORZ] is false, skips searching PARTITION_HORZ,
+  // PARTITION_HORZ_A, PARTITION_HORZ_B, PARTITION_HORZ_4. Same holds for VERT.
+ int partition_rect_allowed[NUM_RECT_PARTS];
+
+ // If false, skips searching rectangular partition unless some logic related
+ // to edge detection holds.
+ int do_rectangular_split;
+
+ // If false, skips searching PARTITION_SPLIT.
+ int do_square_split;
+
+ // If true, prunes the corresponding PARTITION_HORZ/PARTITION_VERT. Note that
+ // this does not directly affect the extended partitions, so this can be used
+ // to prune out PARTITION_HORZ/PARTITION_VERT while still allowing rdopt of
+  // PARTITION_HORZ_A, PARTITION_HORZ_B, PARTITION_HORZ_4, etc.
+ int prune_rect_part[NUM_RECT_PARTS];
+
+ // Chroma subsampling in x and y directions.
+ int ss_x;
+ int ss_y;
+
+ // Partition plane context index.
+ int pl_ctx_idx;
+
+ // This flag will be set if best partition is found from the search.
+ bool found_best_partition;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ PartitionTimingStats part_timing_stats;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+} PartitionSearchState;
+
+static AOM_INLINE void av1_disable_square_split_partition(
+ PartitionSearchState *part_state) {
+ part_state->do_square_split = 0;
+}
+
+// Disables all possible rectangular splits. This includes the AB and 4-way
+// partitions, as they depend on the corresponding partition_rect_allowed.
+static AOM_INLINE void av1_disable_rect_partitions(
+ PartitionSearchState *part_state) {
+ part_state->do_rectangular_split = 0;
+ part_state->partition_rect_allowed[HORZ] = 0;
+ part_state->partition_rect_allowed[VERT] = 0;
+}
+
+// Disables all possible splits so that only PARTITION_NONE *might* be allowed.
+static AOM_INLINE void av1_disable_all_splits(
+ PartitionSearchState *part_state) {
+ av1_disable_square_split_partition(part_state);
+ av1_disable_rect_partitions(part_state);
+}
+
+static AOM_INLINE void av1_set_square_split_only(
+ PartitionSearchState *part_state) {
+ part_state->partition_none_allowed = 0;
+ part_state->do_square_split = 1;
+ av1_disable_rect_partitions(part_state);
+}
+
+static AOM_INLINE bool av1_blk_has_rows_and_cols(
+ const PartitionBlkParams *blk_params) {
+ return blk_params->has_rows && blk_params->has_cols;
+}
+
+static AOM_INLINE bool av1_is_whole_blk_in_frame(
+ const PartitionBlkParams *blk_params,
+ const CommonModeInfoParams *mi_params) {
+ const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+ return mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
+ mi_col + mi_size_wide[bsize] <= mi_params->mi_cols;
+}
+
+static AOM_INLINE void update_filter_type_cdf(const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi,
+ int dual_filter) {
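+  // When dual_filter is disabled, one filter covers both directions, so only
+  // the cdf for direction 0 is updated.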
+ for (int dir = 0; dir < 2; ++dir) {
+ if (dir && !dual_filter) break;
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
+ update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter,
+ SWITCHABLE_FILTERS);
+ }
+}
+
+static AOM_INLINE int set_rdmult(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, int segment_id) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const CommonQuantParams *quant_params = &cm->quant_params;
+ const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth;
+ const FRAME_UPDATE_TYPE update_type =
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+
+ int qindex;
+ if (segment_id >= 0) {
+ qindex = av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
+ } else {
+ qindex = quant_params->base_qindex + x->rdmult_delta_qindex +
+ quant_params->y_dc_delta_q;
+ }
+
+ return av1_compute_rd_mult(
+ qindex, bit_depth, update_type, layer_depth, boost_index, frame_type,
+ cpi->oxcf.q_cfg.use_fixed_qp_offsets, is_stat_consumption_stage(cpi));
+}
+
+static AOM_INLINE int do_split_check(BLOCK_SIZE bsize) {
+ return (bsize == BLOCK_16X16 || bsize == BLOCK_32X32);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p,
+ int frm) {
+ assert(frm >= 0);
+ if (frm < 0 ||
+ p->stats_buf_ctx->stats_in_start + frm > p->stats_buf_ctx->stats_in_end) {
+ return NULL;
+ }
+
+ return &p->stats_buf_ctx->stats_in_start[frm];
+}
+
+int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int orig_rdmult);
+
+int av1_active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step);
+
+int av1_active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step);
+
+void av1_get_tpl_stats_sb(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, SuperBlockEnc *sb_enc);
+
+int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, ThreadData *td,
+ int64_t *delta_dist, BLOCK_SIZE bsize,
+ int mi_row, int mi_col);
+
+int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col);
+
+int av1_get_cb_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col);
+
+int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col, int orig_rdmult);
+#endif // !CONFIG_REALTIME_ONLY
+
+void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col, int *const rdmult);
+
+#if CONFIG_SALIENCY_MAP
+void av1_set_saliency_map_vmaf_rdmult(const AV1_COMP *const cpi,
+ int *errorperbit, const BLOCK_SIZE bsize,
+ const int mi_row, const int mi_col,
+ int *const rdmult);
+#endif
+
+void av1_update_state(const AV1_COMP *const cpi, ThreadData *td,
+ const PICK_MODE_CONTEXT *const ctx, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run);
+
+void av1_update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts,
+ PREDICTION_MODE mode, int16_t mode_context);
+
+void av1_sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts,
+ MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+ const MB_MODE_INFO *above_mi,
+ const MB_MODE_INFO *left_mi, const int intraonly);
+
+void av1_restore_context(MACROBLOCK *x, const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ const int num_planes);
+
+void av1_save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ const int num_planes);
+
+void av1_set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+ MB_MODE_INFO **mib, int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+
+int av1_is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+
+void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree,
+ BLOCK_SIZE bsize);
+
+void av1_update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type,
+ BLOCK_SIZE bsize, int mib_size,
+ int mi_row, int mi_col);
+
+void av1_avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr,
+ int wt_left, int wt_tr);
+
+void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
+ int mi_row, int mi_col);
+
+void av1_reset_mbmi(CommonModeInfoParams *const mi_params, BLOCK_SIZE sb_size,
+ int mi_row, int mi_col);
+
+void av1_backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, const AV1_COMP *cpi,
+ ThreadData *td, const TileDataEnc *tile_data,
+ int mi_row, int mi_col);
+
+void av1_restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, AV1_COMP *cpi,
+ ThreadData *td, TileDataEnc *tile_data, int mi_row,
+ int mi_col);
+
+void av1_set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td,
+ const TileInfo *const tile_info, const int mi_row,
+ const int mi_col);
+
+void av1_dealloc_src_diff_buf(struct macroblock *mb, int num_planes);
+
+static AOM_INLINE void av1_dealloc_mb_data(struct macroblock *mb,
+ int num_planes) {
+ aom_free(mb->txfm_search_info.mb_rd_record);
+ mb->txfm_search_info.mb_rd_record = NULL;
+
+ aom_free(mb->inter_modes_info);
+ mb->inter_modes_info = NULL;
+
+ av1_dealloc_src_diff_buf(mb, num_planes);
+
+ aom_free(mb->e_mbd.seg_mask);
+ mb->e_mbd.seg_mask = NULL;
+
+ aom_free(mb->winner_mode_stats);
+ mb->winner_mode_stats = NULL;
+
+ aom_free(mb->dqcoeff_buf);
+ mb->dqcoeff_buf = NULL;
+}
+
+static AOM_INLINE void allocate_winner_mode_stats(const AV1_COMP *cpi,
+ struct macroblock *mb) {
+ const SPEED_FEATURES *sf = &cpi->sf;
+ // The winner_mode_stats buffer is not required in these cases.
+ if (is_stat_generation_stage(cpi) ||
+ (sf->rt_sf.use_nonrd_pick_mode && !sf->rt_sf.hybrid_intra_pickmode) ||
+ (sf->winner_mode_sf.multi_winner_mode_type == MULTI_WINNER_MODE_OFF))
+ return;
+
+ const AV1_COMMON *cm = &cpi->common;
+ const int winner_mode_count =
+ winner_mode_count_allowed[sf->winner_mode_sf.multi_winner_mode_type];
+ CHECK_MEM_ERROR(cm, mb->winner_mode_stats,
+ (WinnerModeStats *)aom_malloc(
+ winner_mode_count * sizeof(mb->winner_mode_stats[0])));
+}
+
+void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb);
+
+static AOM_INLINE void av1_alloc_mb_data(const AV1_COMP *cpi,
+ struct macroblock *mb) {
+ const AV1_COMMON *cm = &cpi->common;
+ const SPEED_FEATURES *sf = &cpi->sf;
+ if (!sf->rt_sf.use_nonrd_pick_mode) {
+ // Memory for mb_rd_record is allocated only when use_mb_rd_hash sf is
+ // enabled.
+ if (sf->rd_sf.use_mb_rd_hash)
+ CHECK_MEM_ERROR(cm, mb->txfm_search_info.mb_rd_record,
+ (MB_RD_RECORD *)aom_malloc(sizeof(MB_RD_RECORD)));
+ if (!frame_is_intra_only(cm))
+ CHECK_MEM_ERROR(
+ cm, mb->inter_modes_info,
+ (InterModesInfo *)aom_malloc(sizeof(*mb->inter_modes_info)));
+ }
+
+ av1_alloc_src_diff_buf(cm, mb);
+
+ CHECK_MEM_ERROR(cm, mb->e_mbd.seg_mask,
+ (uint8_t *)aom_memalign(
+ 16, 2 * MAX_SB_SQUARE * sizeof(mb->e_mbd.seg_mask[0])));
+
+ allocate_winner_mode_stats(cpi, mb);
+
+ const int max_sb_square_y = 1
+ << num_pels_log2_lookup[cm->seq_params->sb_size];
+ CHECK_MEM_ERROR(
+ cm, mb->dqcoeff_buf,
+ (tran_low_t *)aom_memalign(32, max_sb_square_y * sizeof(tran_low_t)));
+}
+
+// This function computes the number of reference frames to be disabled based
+// on the selective_ref_frame speed feature.
+static AOM_INLINE unsigned int get_num_refs_to_disable(
+ const AV1_COMP *cpi, const int *ref_frame_flags,
+ const unsigned int *ref_display_order_hint,
+ unsigned int cur_frame_display_index) {
+ unsigned int num_refs_to_disable = 0;
+ if (cpi->sf.inter_sf.selective_ref_frame >= 3) {
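+    // One reference is disabled; per disable_order, LAST3_FRAME is the first
+    // to be dropped by enforce_max_ref_frames() below.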
+ num_refs_to_disable++;
+ if (cpi->sf.inter_sf.selective_ref_frame >= 6) {
+ // Disable LAST2_FRAME and ALTREF2_FRAME
+ num_refs_to_disable += 2;
+ } else if (cpi->sf.inter_sf.selective_ref_frame == 5 &&
+ *ref_frame_flags & av1_ref_frame_flag_list[LAST2_FRAME]) {
+ const int last2_frame_dist = av1_encoder_get_relative_dist(
+ ref_display_order_hint[LAST2_FRAME - LAST_FRAME],
+ cur_frame_display_index);
+ // Disable LAST2_FRAME if it is a temporally distant frame
+ if (abs(last2_frame_dist) > 2) {
+ num_refs_to_disable++;
+ }
+#if !CONFIG_REALTIME_ONLY
+ else if (is_stat_consumption_stage_twopass(cpi)) {
+ const FIRSTPASS_STATS *const this_frame_stats =
+ read_one_frame_stats(&cpi->ppi->twopass, cur_frame_display_index);
+ const double coded_error_per_mb = this_frame_stats->coded_error;
+ // Disable LAST2_FRAME if the coded error of the current frame based on
+ // first pass stats is very low.
+ if (coded_error_per_mb < 100.0) num_refs_to_disable++;
+ }
+#endif  // !CONFIG_REALTIME_ONLY
+ }
+ }
+ return num_refs_to_disable;
+}
+
+static INLINE int get_max_allowed_ref_frames(
+ const AV1_COMP *cpi, const int *ref_frame_flags,
+ const unsigned int *ref_display_order_hint,
+ unsigned int cur_frame_display_index) {
+ const unsigned int max_reference_frames =
+ cpi->oxcf.ref_frm_cfg.max_reference_frames;
+ const unsigned int num_refs_to_disable = get_num_refs_to_disable(
+ cpi, ref_frame_flags, ref_display_order_hint, cur_frame_display_index);
+ const unsigned int max_allowed_refs_for_given_speed =
+ INTER_REFS_PER_FRAME - num_refs_to_disable;
+ return AOMMIN(max_allowed_refs_for_given_speed, max_reference_frames);
+}
+
+// Enforce the number of references for each arbitrary frame based on user
+// options and speed.
+static AOM_INLINE void enforce_max_ref_frames(
+ AV1_COMP *cpi, int *ref_frame_flags,
+ const unsigned int *ref_display_order_hint,
+ unsigned int cur_frame_display_index) {
+ MV_REFERENCE_FRAME ref_frame;
+ int total_valid_refs = 0;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ if (*ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+ total_valid_refs++;
+ }
+ }
+
+ const int max_allowed_refs = get_max_allowed_ref_frames(
+ cpi, ref_frame_flags, ref_display_order_hint, cur_frame_display_index);
+
+ for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) {
+ const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i];
+
+ if (!(*ref_frame_flags & av1_ref_frame_flag_list[ref_frame_to_disable])) {
+ continue;
+ }
+
+ switch (ref_frame_to_disable) {
+ case LAST3_FRAME: *ref_frame_flags &= ~AOM_LAST3_FLAG; break;
+ case LAST2_FRAME: *ref_frame_flags &= ~AOM_LAST2_FLAG; break;
+ case ALTREF2_FRAME: *ref_frame_flags &= ~AOM_ALT2_FLAG; break;
+      case GOLDEN_FRAME: *ref_frame_flags &= ~AOM_GOLD_FLAG; break;
+ default: assert(0);
+ }
+ --total_valid_refs;
+ }
+ assert(total_valid_refs <= max_allowed_refs);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c
new file mode 100644
index 0000000000..c78761dd98
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.c
@@ -0,0 +1,866 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/cfl.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/scan.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/txb_rdopt.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+
+void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride) {
+ assert(rows >= 4 && cols >= 4);
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (bd_info.use_highbitdepth_buf) {
+ aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
+ pred8, pred_stride);
+ return;
+ }
+#endif
+ (void)bd_info;
+ aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
+ pred_stride);
+}
+
+void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
+ int blk_col, int blk_row, TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ const int tx1d_width = tx_size_wide[tx_size];
+ const int tx1d_height = tx_size_high[tx_size];
+ uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+ uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2];
+ int16_t *src_diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2];
+ av1_subtract_block(bd_info, tx1d_height, tx1d_width, src_diff, diff_stride,
+ src, src_stride, dst, dst_stride);
+}
+
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+
+ av1_subtract_block(bd_info, bh, bw, p->src_diff, bw, p->src.buf,
+ p->src.stride, pd->dst.buf, pd->dst.stride);
+}
+
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int *rate_cost) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
+ const int eob = p->eobs[block];
+ const int segment_id = xd->mi[0]->segment_id;
+
+ if (eob == 0 || !cpi->optimize_seg_arr[segment_id] ||
+ xd->lossless[segment_id]) {
+ *rate_cost = av1_cost_skip_txb(&x->coeff_costs, txb_ctx, plane, tx_size);
+ return eob;
+ }
+
+ return av1_optimize_txb(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
+ rate_cost, cpi->oxcf.algo_cfg.sharpness);
+}
+
+// Hyper-parameters for dropout optimization, based on the following logic.
+// TODO(yjshen): These settings are tuned by experiments. They may still be
+// optimized for better performance.
+// (1) Coefficients which are large enough will ALWAYS be kept.
+const tran_low_t DROPOUT_COEFF_MAX = 2; // Max dropout-able coefficient.
+// (2) Continuous coefficients will ALWAYS be kept. Here rigorous continuity is
+// NOT required. For example, `5 0 0 0 7` is treated as two continuous
+// coefficients if three zeros do not fulfill the dropout condition.
+const int DROPOUT_CONTINUITY_MAX = 2; // Max dropout-able continuous coeff.
+// (3) Dropout operation is NOT applicable to blocks with large or small
+// quantization index.
+const int DROPOUT_Q_MAX = 128;
+const int DROPOUT_Q_MIN = 16;
+// (4) Recall that dropout optimization will forcibly set some quantized
+// coefficients to zero. The key logic on determining whether a coefficient
+// should be dropped is to check the number of continuous zeros before AND
+// after this coefficient. The exact number of zeros for judgement depends
+// on block size and quantization index. More concretely, block size
+// determines the base number of zeros, while quantization index determines
+// the multiplier. Intuitively, larger block requires more zeros and larger
+// quantization index also requires more zeros (more information is lost
+// when using larger quantization index).
+const int DROPOUT_BEFORE_BASE_MAX = 32; // Max base number for leading zeros.
+const int DROPOUT_BEFORE_BASE_MIN = 16; // Min base number for leading zeros.
+const int DROPOUT_AFTER_BASE_MAX = 32; // Max base number for trailing zeros.
+const int DROPOUT_AFTER_BASE_MIN = 16; // Min base number for trailing zeros.
+const int DROPOUT_MULTIPLIER_MAX = 8; // Max multiplier on number of zeros.
+const int DROPOUT_MULTIPLIER_MIN = 2; // Min multiplier on number of zeros.
+const int DROPOUT_MULTIPLIER_Q_BASE = 32; // Base Q to compute multiplier.
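+// As an illustrative example: for a 16x16 transform block at qindex 64,
+// base_size = 16 and multiplier = CLIP(64 / 32, 2, 8) = 2, so a small
+// coefficient (magnitude <= DROPOUT_COEFF_MAX) is dropped only when at least
+// 2 * 16 = 32 zeros precede and follow it in scan order.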
+
+void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
+ TX_TYPE tx_type, int qindex) {
+ const int tx_width = tx_size_wide[tx_size];
+ const int tx_height = tx_size_high[tx_size];
+
+ // Early return if `qindex` is out of range.
+ if (qindex > DROPOUT_Q_MAX || qindex < DROPOUT_Q_MIN) {
+ return;
+ }
+
+ // Compute number of zeros used for dropout judgement.
+ const int base_size = AOMMAX(tx_width, tx_height);
+ const int multiplier = CLIP(qindex / DROPOUT_MULTIPLIER_Q_BASE,
+ DROPOUT_MULTIPLIER_MIN, DROPOUT_MULTIPLIER_MAX);
+ const int dropout_num_before =
+ multiplier *
+ CLIP(base_size, DROPOUT_BEFORE_BASE_MIN, DROPOUT_BEFORE_BASE_MAX);
+ const int dropout_num_after =
+ multiplier *
+ CLIP(base_size, DROPOUT_AFTER_BASE_MIN, DROPOUT_AFTER_BASE_MAX);
+
+ av1_dropout_qcoeff_num(mb, plane, block, tx_size, tx_type, dropout_num_before,
+ dropout_num_after);
+}
+
+void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block,
+ TX_SIZE tx_size, TX_TYPE tx_type,
+ int dropout_num_before, int dropout_num_after) {
+ const struct macroblock_plane *const p = &mb->plane[plane];
+ tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+ tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+ const int max_eob = av1_get_max_eob(tx_size);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+
+ // Early return if there are not enough non-zero coefficients.
+ if (p->eobs[block] == 0 || p->eobs[block] <= dropout_num_before ||
+ max_eob <= dropout_num_before + dropout_num_after) {
+ return;
+ }
+
+ int count_zeros_before = 0;
+ int count_zeros_after = 0;
+ int count_nonzeros = 0;
+ // Index of the first non-zero coefficient after sufficient number of
+  // continuous zeros. If it equals `-1`, the number of leading zeros hasn't
+  // reached `dropout_num_before`.
+ int idx = -1;
+ int eob = 0; // New end of block.
+
+ for (int i = 0; i < p->eobs[block]; ++i) {
+ const int scan_idx = scan_order->scan[i];
+ if (abs(qcoeff[scan_idx]) > DROPOUT_COEFF_MAX) {
+ // Keep large coefficients.
+ count_zeros_before = 0;
+ count_zeros_after = 0;
+ idx = -1;
+ eob = i + 1;
+ } else if (qcoeff[scan_idx] == 0) { // Count zeros.
+ if (idx == -1) {
+ ++count_zeros_before;
+ } else {
+ ++count_zeros_after;
+ }
+ } else { // Count non-zeros.
+ if (count_zeros_before >= dropout_num_before) {
+ idx = (idx == -1) ? i : idx;
+ ++count_nonzeros;
+ } else {
+ count_zeros_before = 0;
+ eob = i + 1;
+ }
+ }
+
+ // Handle continuity.
+ if (count_nonzeros > DROPOUT_CONTINUITY_MAX) {
+ count_zeros_before = 0;
+ count_zeros_after = 0;
+ count_nonzeros = 0;
+ idx = -1;
+ eob = i + 1;
+ }
+
+ // Handle the trailing zeros after original end of block.
+ if (idx != -1 && i == p->eobs[block] - 1) {
+ count_zeros_after += (max_eob - p->eobs[block]);
+ }
+
+ // Set redundant coefficients to zeros if needed.
+ if (count_zeros_after >= dropout_num_after) {
+ for (int j = idx; j <= i; ++j) {
+ qcoeff[scan_order->scan[j]] = 0;
+ dqcoeff[scan_order->scan[j]] = 0;
+ }
+ count_zeros_before += (i - idx + 1);
+ count_zeros_after = 0;
+ count_nonzeros = 0;
+ } else if (i == p->eobs[block] - 1) {
+ eob = i + 1;
+ }
+ }
+
+ if (eob != p->eobs[block]) {
+ p->eobs[block] = eob;
+ p->txb_entropy_ctx[block] =
+ av1_get_txb_entropy_context(qcoeff, scan_order, eob);
+ }
+}
+
+// Settings for optimization type. NOTE: To set optimization type for all intra
+// frames, both `KEY_BLOCK_OPT_TYPE` and `INTRA_BLOCK_OPT_TYPE` should be set.
+// TODO(yjshen): These settings are hard-coded and look okay for now. They
+// should be made configurable later.
+// Blocks of key frames ONLY.
+const OPT_TYPE KEY_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT;
+// Blocks of intra frames (key frames EXCLUSIVE).
+const OPT_TYPE INTRA_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT;
+// Blocks of inter frames. (NOTE: Dropout optimization is DISABLED by default
+// if trellis optimization is on for inter frames.)
+const OPT_TYPE INTER_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT;
+
+enum {
+ QUANT_FUNC_LOWBD = 0,
+ QUANT_FUNC_HIGHBD = 1,
+ QUANT_FUNC_TYPES = 2
+} UENUM1BYTE(QUANT_FUNC);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static AV1_QUANT_FACADE
+ quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = {
+ { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade },
+ { av1_quantize_b_facade, av1_highbd_quantize_b_facade },
+ { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade },
+ { NULL, NULL }
+ };
+#else
+static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES] = {
+ av1_quantize_fp_facade, av1_quantize_b_facade, av1_quantize_dc_facade, NULL
+};
+#endif
+
+// Computes the transform for DC-only blocks.
+void av1_xform_dc_only(MACROBLOCK *x, int plane, int block,
+ TxfmParam *txfm_param, int64_t per_px_mean) {
+ assert(per_px_mean != INT64_MAX);
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *const coeff = p->coeff + block_offset;
+ const int n_coeffs = av1_get_max_eob(txfm_param->tx_size);
+ memset(coeff, 0, sizeof(*coeff) * n_coeffs);
+ coeff[0] =
+ (tran_low_t)((per_px_mean * dc_coeff_scale[txfm_param->tx_size]) >> 12);
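+  // Note: dc_coeff_scale[] is presumably a Q12 fixed-point factor (hence the
+  // >> 12) that maps the per-pixel mean to the transform's DC coefficient.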
+}
+
+void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
+ const QUANT_PARAM *qparam) {
+ av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, txfm_param);
+ av1_quant(x, plane, block, txfm_param, qparam);
+}
+
+void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TxfmParam *txfm_param) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *const coeff = p->coeff + block_offset;
+ const int diff_stride = block_size_wide[plane_bsize];
+
+ const int src_offset = (blk_row * diff_stride + blk_col);
+ const int16_t *src_diff = &p->src_diff[src_offset << MI_SIZE_LOG2];
+
+ av1_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+}
+
+void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param,
+ const QUANT_PARAM *qparam) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param->tx_size, txfm_param->tx_type);
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *const coeff = p->coeff + block_offset;
+ tran_low_t *const qcoeff = p->qcoeff + block_offset;
+ tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
+ uint16_t *const eob = &p->eobs[block];
+
+ if (qparam->xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+ const int n_coeffs = av1_get_max_eob(txfm_param->tx_size);
+ if (LIKELY(!x->seg_skip_block)) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ quant_func_list[qparam->xform_quant_idx][txfm_param->is_hbd](
+ coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam);
+#else
+ quant_func_list[qparam->xform_quant_idx](
+ coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam);
+#endif
+ } else {
+ av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob);
+ }
+ }
+  // If use_optimize_b is true, av1_optimize_b() will be called, so the
+  // entropy ctx cannot be updated now (it is updated in optimize_b).
+ if (qparam->use_optimize_b) {
+ p->txb_entropy_ctx[block] = 0;
+ } else {
+ p->txb_entropy_ctx[block] =
+ av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
+ }
+}
+
+void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
+ TX_TYPE tx_type, TxfmParam *txfm_param) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ txfm_param->tx_type = tx_type;
+ txfm_param->tx_size = tx_size;
+ txfm_param->lossless = xd->lossless[mbmi->segment_id];
+ txfm_param->tx_set_type = av1_get_ext_tx_set_type(
+ tx_size, is_inter_block(mbmi), cm->features.reduced_tx_set_used);
+
+ txfm_param->bd = xd->bd;
+ txfm_param->is_hbd = is_cur_buf_hbd(xd);
+}
+void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx,
+ int use_quant_b_adapt, QUANT_PARAM *qparam) {
+ qparam->log_scale = av1_get_tx_scale(tx_size);
+ qparam->tx_size = tx_size;
+
+ qparam->use_quant_b_adapt = use_quant_b_adapt;
+
+  // TODO(bohanli): optimize_b and the quantization idx are related, but the
+  // relationship is buried and complicated across different encoding stages.
+  // There should be a unified function to derive quant_idx, rather than
+  // determining it and passing it in.
+ qparam->use_optimize_b = use_optimize_b;
+ qparam->xform_quant_idx = xform_quant_idx;
+
+ qparam->qmatrix = NULL;
+ qparam->iqmatrix = NULL;
+}
+void av1_setup_qmatrix(const CommonQuantParams *quant_params,
+ const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+ TX_TYPE tx_type, QUANT_PARAM *qparam) {
+ qparam->qmatrix = av1_get_qmatrix(quant_params, xd, plane, tx_size, tx_type);
+ qparam->iqmatrix =
+ av1_get_iqmatrix(quant_params, xd, plane, tx_size, tx_type);
+}
+
+static void encode_block(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg,
+ RUN_TYPE dry_run) {
+ (void)dry_run;
+ struct encode_b_args *const args = arg;
+ const AV1_COMP *const cpi = args->cpi;
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+ uint8_t *dst;
+ ENTROPY_CONTEXT *a, *l;
+ int dummy_rate_cost = 0;
+
+ const int bw = mi_size_wide[plane_bsize];
+ dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2];
+
+ a = &args->ta[blk_col];
+ l = &args->tl[blk_row];
+
+ TX_TYPE tx_type = DCT_DCT;
+ const int blk_skip_idx = blk_row * bw + blk_col;
+ if (!is_blk_skip(x->txfm_search_info.blk_skip, plane, blk_skip_idx) &&
+ !mbmi->skip_mode) {
+ tx_type = av1_get_tx_type(xd, pd->plane_type, blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ TxfmParam txfm_param;
+ QUANT_PARAM quant_param;
+ const int use_trellis = is_trellis_used(args->enable_optimize_b, dry_run);
+ int quant_idx;
+ if (use_trellis)
+ quant_idx = AV1_XFORM_QUANT_FP;
+ else
+ quant_idx =
+ USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
+ av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param);
+ av1_setup_quant(tx_size, use_trellis, quant_idx,
+ cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+ &quant_param);
+ av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+ &quant_param);
+
+ // Whether trellis or dropout optimization is required for inter frames.
+ const bool do_trellis = INTER_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+ INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT;
+ const bool do_dropout = INTER_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+ INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT;
+
+ if (quant_param.use_optimize_b && do_trellis) {
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+ av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
+ &dummy_rate_cost);
+ }
+ if (!quant_param.use_optimize_b && do_dropout) {
+ av1_dropout_qcoeff(x, plane, block, tx_size, tx_type,
+ cm->quant_params.base_qindex);
+ }
+ } else {
+ p->eobs[block] = 0;
+ p->txb_entropy_ctx[block] = 0;
+ }
+
+ av1_set_txb_context(x, plane, block, tx_size, a, l);
+
+ if (p->eobs[block]) {
+ // As long as any YUV plane has non-zero quantized transform coefficients,
+ // mbmi->skip_txfm flag is set to 0.
+ mbmi->skip_txfm = 0;
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+ pd->dst.stride, p->eobs[block],
+ cm->features.reduced_tx_set_used);
+ } else {
+ // Only when YUV planes all have zero quantized transform coefficients,
+ // mbmi->skip_txfm flag is set to 1.
+ mbmi->skip_txfm &= 1;
+ }
+
+  // TODO(debargha, jingning): Temporarily disable the txk_type check for the
+  // eob=0 case. It is possible that a collision in the hash index could cause
+  // the assertion to fail. To further optimize the rate-distortion
+  // performance, we need to revisit this part and enable this assert again.
+ if (p->eobs[block] == 0 && plane == 0) {
+#if 0
+ if (args->cpi->oxcf.q_cfg.aq_mode == NO_AQ &&
+ args->cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q) {
+ // TODO(jingning,angiebird,huisu@google.com): enable txk_check when
+ // enable_optimize_b is true to detect potential RD bug.
+ const uint8_t disable_txk_check = args->enable_optimize_b;
+ if (!disable_txk_check) {
+        assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] ==
+               DCT_DCT);
+ }
+ }
+#endif
+ update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+ }
+
+#if CONFIG_MISMATCH_DEBUG
+ if (dry_run == OUTPUT_ENABLED) {
+ int pixel_c, pixel_r;
+ BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ int blk_w = block_size_wide[bsize];
+ int blk_h = block_size_high[bsize];
+ mi_to_pixel_loc(&pixel_c, &pixel_r, xd->mi_col, xd->mi_row, blk_col,
+ blk_row, pd->subsampling_x, pd->subsampling_y);
+ mismatch_record_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint,
+ plane, pixel_c, pixel_r, blk_w, blk_h,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+ }
+#endif
+}
+
+static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg, RUN_TYPE dry_run) {
+ struct encode_b_args *const args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ const TX_SIZE plane_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
+ pd->subsampling_y)
+ : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+ blk_col)];
+ if (!plane) {
+ assert(tx_size_wide[tx_size] >= tx_size_wide[plane_tx_size] &&
+ tx_size_high[tx_size] >= tx_size_high[plane_tx_size]);
+ }
+
+ if (tx_size == plane_tx_size || plane) {
+ encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg,
+ dry_run);
+ } else {
+ assert(tx_size < TX_SIZES_ALL);
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size));
+ assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size));
+ // This is the square transform block partition entry point.
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int step = bsh * bsw;
+ const int row_end =
+ AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+ const int col_end =
+ AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+ assert(bsw > 0 && bsh > 0);
+
+ for (int row = 0; row < row_end; row += bsh) {
+ const int offsetr = blk_row + row;
+ for (int col = 0; col < col_end; col += bsw) {
+ const int offsetc = blk_col + col;
+
+ encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs,
+ arg, dry_run);
+ block += step;
+ }
+ }
+ }
+}
+
+void av1_foreach_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane,
+ foreach_transformed_block_visitor visit, void *arg) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+  // Block and transform sizes, in log2 of the number of 4x4 blocks ("*_b"):
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8.
+  // The transform size varies per plane; look it up in a common way.
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ // Call visit() directly with zero offsets if the current block size is the
+ // same as the transform block size.
+ if (plane_bsize == tx_bsize) {
+ visit(plane, 0, 0, 0, plane_bsize, tx_size, arg);
+ return;
+ }
+ const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+ const uint8_t txh_unit = tx_size_high_unit[tx_size];
+ const int step = txw_unit * txh_unit;
+
+ // If mb_to_right_edge is < 0 we are in a situation in which
+ // the current block size extends into the UMV and we won't
+ // visit the sub blocks that are wholly within the UMV.
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
+ const int mu_blocks_wide =
+ AOMMIN(mi_size_wide[max_unit_bsize], max_blocks_wide);
+ const int mu_blocks_high =
+ AOMMIN(mi_size_high[max_unit_bsize], max_blocks_high);
+
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ int i = 0;
+ for (int r = 0; r < max_blocks_high; r += mu_blocks_high) {
+ const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high);
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ for (int c = 0; c < max_blocks_wide; c += mu_blocks_wide) {
+ const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide);
+ for (int blk_row = r; blk_row < unit_height; blk_row += txh_unit) {
+ for (int blk_col = c; blk_col < unit_width; blk_col += txw_unit) {
+ visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg);
+ i += step;
+ }
+ }
+ }
+ }
+ // Check if visit() is invoked at least once.
+ assert(i >= 1);
+}
+
+typedef struct encode_block_pass1_args {
+ AV1_COMP *cpi;
+ MACROBLOCK *x;
+} encode_block_pass1_args;
+
+static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ encode_block_pass1_args *args = (encode_block_pass1_args *)arg;
+ AV1_COMP *cpi = args->cpi;
+ AV1_COMMON *cm = &cpi->common;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+
+ uint8_t *dst;
+ dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2];
+
+ TxfmParam txfm_param;
+ QUANT_PARAM quant_param;
+
+ av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+ av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt,
+ &quant_param);
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, DCT_DCT,
+ &quant_param);
+
+ av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+ &quant_param);
+
+ if (p->eobs[block] > 0) {
+ txfm_param.eob = p->eobs[block];
+ if (txfm_param.is_hbd) {
+ av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
+ return;
+ }
+ av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
+ }
+}
+
+void av1_encode_sby_pass1(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize) {
+ encode_block_pass1_args args = { cpi, x };
+ av1_subtract_plane(x, bsize, 0);
+ av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
+ encode_block_pass1, &args);
+}
+
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ // In the current encoder implementation, for inter blocks,
+ // only when YUV planes all have zero quantized transform coefficients,
+ // mbmi->skip_txfm flag is set to 1.
+ // For intra blocks, this flag is set to 0 since skipped blocks are so rare
+ // that transmitting skip_txfm = 1 is very expensive.
+  // mbmi->skip_txfm is initialized to 1, and will be modified in
+  // encode_block() based on transform, quantization, and (if enabled) trellis
+  // optimization.
+ mbmi->skip_txfm = 1;
+ if (x->txfm_search_info.skip_txfm) return;
+
+ struct optimize_ctx ctx;
+ struct encode_b_args arg = {
+ cpi, x, &ctx, NULL, NULL, dry_run, cpi->optimize_seg_arr[mbmi->segment_id]
+ };
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int subsampling_x = pd->subsampling_x;
+ const int subsampling_y = pd->subsampling_y;
+ if (plane && !xd->is_chroma_ref) break;
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, subsampling_x, subsampling_y);
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ const int mi_width = mi_size_wide[plane_bsize];
+ const int mi_height = mi_size_high[plane_bsize];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+ const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+ const int bw = mi_size_wide[txb_size];
+ const int bh = mi_size_high[txb_size];
+ int block = 0;
+ const int step =
+ tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ av1_get_entropy_contexts(plane_bsize, pd, ctx.ta[plane], ctx.tl[plane]);
+ av1_subtract_plane(x, plane_bsize, plane);
+ arg.ta = ctx.ta[plane];
+ arg.tl = ctx.tl[plane];
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, subsampling_x, subsampling_y);
+ int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+ int mu_blocks_high = mi_size_high[max_unit_bsize];
+ mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(mi_height, mu_blocks_high);
+
+ for (int idy = 0; idy < mi_height; idy += mu_blocks_high) {
+ for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) {
+ int blk_row, blk_col;
+ const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height);
+ const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width);
+ for (blk_row = idy; blk_row < unit_height; blk_row += bh) {
+ for (blk_col = idx; blk_col < unit_width; blk_col += bw) {
+ encode_block_inter(plane, block, blk_row, blk_col, plane_bsize,
+ max_tx_size, &arg, dry_run);
+ block += step;
+ }
+ }
+ }
+ }
+ }
+}
+
+static void encode_block_intra_and_set_context(int plane, int block,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ arg);
+
+ struct encode_b_args *const args = arg;
+ MACROBLOCK *x = args->x;
+ ENTROPY_CONTEXT *a = &args->ta[blk_col];
+ ENTROPY_CONTEXT *l = &args->tl[blk_row];
+ av1_set_txb_context(x, plane, block, tx_size, a, l);
+}
+
+void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct encode_b_args *const args = arg;
+ const AV1_COMP *const cpi = args->cpi;
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ uint16_t *eob = &p->eobs[block];
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+ int dummy_rate_cost = 0;
+
+ av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
+
+ TX_TYPE tx_type = DCT_DCT;
+ const int bw = mi_size_wide[plane_bsize];
+ if (plane == 0 && is_blk_skip(x->txfm_search_info.blk_skip, plane,
+ blk_row * bw + blk_col)) {
+ *eob = 0;
+ p->txb_entropy_ctx[block] = 0;
+ } else {
+ av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+
+ const ENTROPY_CONTEXT *a = &args->ta[blk_col];
+ const ENTROPY_CONTEXT *l = &args->tl[blk_row];
+ tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ TxfmParam txfm_param;
+ QUANT_PARAM quant_param;
+ const int use_trellis =
+ is_trellis_used(args->enable_optimize_b, args->dry_run);
+ int quant_idx;
+ if (use_trellis)
+ quant_idx = AV1_XFORM_QUANT_FP;
+ else
+ quant_idx =
+ USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
+
+ av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param);
+ av1_setup_quant(tx_size, use_trellis, quant_idx,
+ cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+ &quant_param);
+
+ av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+ &quant_param);
+
+ // Whether trellis or dropout optimization is required for key frames and
+ // intra frames.
+ const bool do_trellis = (frame_is_intra_only(cm) &&
+ (KEY_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+ KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) ||
+ (!frame_is_intra_only(cm) &&
+ (INTRA_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+ INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT));
+ const bool do_dropout = (frame_is_intra_only(cm) &&
+ (KEY_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+ KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) ||
+ (!frame_is_intra_only(cm) &&
+ (INTRA_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+ INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT));
+
+ if (quant_param.use_optimize_b && do_trellis) {
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+ av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
+ &dummy_rate_cost);
+ }
+ if (do_dropout) {
+ av1_dropout_qcoeff(x, plane, block, tx_size, tx_type,
+ cm->quant_params.base_qindex);
+ }
+ }
+
+ if (*eob) {
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+ dst_stride, *eob,
+ cm->features.reduced_tx_set_used);
+ }
+
+  // TODO(jingning): Temporarily disable the txk_type check for the eob=0
+  // case. It is possible that a collision in the hash index could cause
+  // the assertion to fail. To further optimize the rate-distortion
+  // performance, we need to revisit this part and enable this assert again.
+ if (*eob == 0 && plane == 0) {
+#if 0
+ if (args->cpi->oxcf.q_cfg.aq_mode == NO_AQ
+ && args->cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q) {
+    assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] ==
+           DCT_DCT);
+ }
+#endif
+ update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+ }
+
+ // For intra mode, skipped blocks are so rare that transmitting
+ // skip_txfm = 1 is very expensive.
+ mbmi->skip_txfm = 0;
+
+ if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
+ cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
+ }
+}
+
+void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run,
+ TRELLIS_OPT_TYPE enable_optimize_b) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ if (plane && !xd->is_chroma_ref) return;
+
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 };
+ ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 };
+ struct encode_b_args arg = {
+ cpi, x, NULL, ta, tl, dry_run, enable_optimize_b
+ };
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+ if (enable_optimize_b) {
+ av1_get_entropy_contexts(plane_bsize, pd, ta, tl);
+ }
+ av1_foreach_transformed_block_in_plane(
+ xd, plane_bsize, plane, encode_block_intra_and_set_context, &arg);
+}
diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h
new file mode 100644
index 0000000000..f97bf8f517
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEMB_H_
+#define AOM_AV1_ENCODER_ENCODEMB_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/tokenize.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+ AV1_XFORM_QUANT_FP = 0,
+ AV1_XFORM_QUANT_B = 1,
+ AV1_XFORM_QUANT_DC = 2,
+ AV1_XFORM_QUANT_SKIP_QUANT,
+ AV1_XFORM_QUANT_TYPES,
+} UENUM1BYTE(AV1_XFORM_QUANT);
+
+// TODO(any): Merge OPT_TYPE and TRELLIS_OPT_TYPE
+// Available optimization types to optimize the quantized coefficients.
+enum {
+ NONE_OPT = 0, // No optimization.
+ TRELLIS_OPT = 1, // Trellis optimization. See `av1_optimize_b()`.
+ DROPOUT_OPT = 2, // Dropout optimization. See `av1_dropout_qcoeff()`.
+ TRELLIS_DROPOUT_OPT = 3 // Perform dropout after trellis optimization.
+} UENUM1BYTE(OPT_TYPE);
+
+enum {
+ NO_TRELLIS_OPT, // No trellis optimization
+ FULL_TRELLIS_OPT, // Trellis optimization in all stages
+ FINAL_PASS_TRELLIS_OPT, // Trellis optimization in only the final encode pass
+ NO_ESTIMATE_YRD_TRELLIS_OPT // Disable trellis in estimate_yrd_for_sb
+} UENUM1BYTE(TRELLIS_OPT_TYPE);
+
+struct optimize_ctx {
+ ENTROPY_CONTEXT ta[MAX_MB_PLANE][MAX_MIB_SIZE];
+ ENTROPY_CONTEXT tl[MAX_MB_PLANE][MAX_MIB_SIZE];
+};
+
+struct encode_b_args {
+ const struct AV1_COMP *cpi;
+ MACROBLOCK *x;
+ struct optimize_ctx *ctx;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+ RUN_TYPE dry_run;
+ TRELLIS_OPT_TYPE enable_optimize_b;
+};
+
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run);
+
+void av1_foreach_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane,
+ foreach_transformed_block_visitor visit, void *arg);
+
+void av1_encode_sby_pass1(struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize);
+
+void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
+ TX_TYPE tx_type, TxfmParam *txfm_param);
+void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx,
+ int use_quant_b_adapt, QUANT_PARAM *qparam);
+void av1_setup_qmatrix(const CommonQuantParams *quant_params,
+ const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+ TX_TYPE tx_type, QUANT_PARAM *qparam);
+
+void av1_xform_dc_only(MACROBLOCK *x, int plane, int block,
+ TxfmParam *txfm_param, int64_t per_px_mean);
+
+void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
+ const QUANT_PARAM *qparam);
+
+void av1_xform(MACROBLOCK *x, int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TxfmParam *txfm_param);
+
+void av1_quant(MACROBLOCK *x, int plane, int block, TxfmParam *txfm_param,
+ const QUANT_PARAM *qparam);
+
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int *rate_cost);
+
+// This function can be used as (i) a further optimization to reduce the
+// redundancy of quantized coefficients (a.k.a., `qcoeff`) after trellis
+// optimization, or (ii) an alternative to trellis optimization in high-speed
+// compression mode (e.g., real-time mode under speed-6) due to its low time
+// complexity. The rationale behind it is to drop out possibly redundant
+// quantized coefficients that sit among a run of zeros. NOTE: This algorithm
+// is not as accurate as trellis optimization since the hyper-parameters are
+// hard-coded instead of dynamically searched. More adaptive logic may improve
+// the performance. This function should be applied to all or part of the
+// block cells.
+// Inputs:
+// mb: Pointer to the MACROBLOCK to perform dropout on.
+// plane: Index of the plane to which the target block belongs.
+// block: Index of the target block.
+// tx_size: Transform size of the target block.
+// tx_type: Transform type of the target block. This field is particularly
+// used to find out the scan order of the block.
+// qindex: Quantization index used for target block. In general, all blocks
+// in a same plane share the same quantization index. This field is
+// particularly used to determine how many zeros should be used to
+// drop out a coefficient.
+// Returns:
+// Nothing will be returned, but `qcoeff`, `dqcoeff`, `eob`, as well as
+// `txb_entropy_ctx`, which `mb` points to, may be modified by this function.
+void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
+ TX_TYPE tx_type, int qindex);
+// Same as above, with the number of zeroes needed before/after a coeff to drop
+// it explicitly passed in, instead of being derived from qindex.
+void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block,
+ TX_SIZE tx_size, TX_TYPE tx_type,
+ int dropout_num_before, int dropout_num_after);
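+// A minimal usage sketch (illustrative only; `mb`, `plane`, `block`,
+// `tx_size` and `tx_type` are assumed to come from an encoding context such
+// as encode_block() in encodemb.c, and `qindex` from
+// cm->quant_params.base_qindex):
+//
+//   av1_dropout_qcoeff(mb, plane, block, tx_size, tx_type, qindex);
+//
+// After the call, isolated small coefficients may have been zeroed out and
+// `qcoeff`, `dqcoeff`, `eob` and `txb_entropy_ctx` updated accordingly.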
+
+void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride);
+
+void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
+ int blk_col, int blk_row, TX_SIZE tx_size);
+
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane);
+
+static INLINE void av1_set_txb_context(MACROBLOCK *x, int plane, int block,
+ TX_SIZE tx_size, ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l) {
+ const uint8_t ctx = x->plane[plane].txb_entropy_ctx[block];
+ memset(a, ctx, tx_size_wide_unit[tx_size] * sizeof(*a));
+ memset(l, ctx, tx_size_high_unit[tx_size] * sizeof(*l));
+}
+
+void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
+
+void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run,
+ TRELLIS_OPT_TYPE enable_optimize_b);
+
+static INLINE int is_trellis_used(TRELLIS_OPT_TYPE optimize_b,
+ RUN_TYPE dry_run) {
+ if (optimize_b == NO_TRELLIS_OPT) return false;
+ if (optimize_b == FINAL_PASS_TRELLIS_OPT && dry_run != OUTPUT_ENABLED)
+ return false;
+ return true;
+}
+
+// Scaling terms (precision of 12 bits) to perform tx-size specific
+// normalization that is used in DCT_DCT forward transform.
+// For transform blocks of 1:2 and 2:1 - sqrt(2) normalization is used
+// For transform blocks of 1:4 and 4:1 - factor of 2 is used
+// For transform blocks TX_8x8 and below - an additional factor of 2 is used
+// For transform blocks max(width,height)=64 - currently not supported
+
+static const uint16_t dc_coeff_scale[TX_SIZES_ALL] = {
+ 1024, 2048, 4096, 4096, 0, 1448, 1448, 2896, 2896, 2896,
+ 2896, 0, 0, 2048, 2048, 4096, 4096, 0, 0
+};
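+// For reference: at the stated 12-bit precision, 2896 ~= 4096 / sqrt(2) and
+// 1448 ~= 2048 / sqrt(2), i.e. the sqrt(2) normalization applied on top of
+// the corresponding square-block scale, while the zero entries correspond to
+// the unsupported max(width, height) == 64 transform sizes.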
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEMB_H_
diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c
new file mode 100644
index 0000000000..7cae72c159
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemv.c
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "av1/common/common.h"
+#include "av1/common/entropymode.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/bitops.h"
+
+static void update_mv_component_stats(int comp, nmv_component *mvcomp,
+ MvSubpelPrecision precision) {
+ assert(comp != 0);
+ int offset;
+ const int sign = comp < 0;
+ const int mag = sign ? -comp : comp;
+ const int mv_class = av1_get_mv_class(mag - 1, &offset);
+ const int d = offset >> 3; // int mv data
+ const int fr = (offset >> 1) & 3; // fractional mv data
+ const int hp = offset & 1; // high precision mv data
+
+ // Sign
+ update_cdf(mvcomp->sign_cdf, sign, 2);
+
+ // Class
+ update_cdf(mvcomp->classes_cdf, mv_class, MV_CLASSES);
+
+ // Integer bits
+ if (mv_class == MV_CLASS_0) {
+ update_cdf(mvcomp->class0_cdf, d, CLASS0_SIZE);
+ } else {
+ const int n = mv_class + CLASS0_BITS - 1; // number of bits
+ for (int i = 0; i < n; ++i)
+ update_cdf(mvcomp->bits_cdf[i], (d >> i) & 1, 2);
+ }
+ // Fractional bits
+ if (precision > MV_SUBPEL_NONE) {
+ aom_cdf_prob *fp_cdf =
+ mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf;
+ update_cdf(fp_cdf, fr, MV_FP_SIZE);
+ }
+
+ // High precision bit
+ if (precision > MV_SUBPEL_LOW_PRECISION) {
+ aom_cdf_prob *hp_cdf =
+ mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf;
+ update_cdf(hp_cdf, hp, 2);
+ }
+}
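+/* Worked example of the decomposition above, assuming the usual AV1
+ * constants CLASS0_SIZE == 2 and CLASS0_BITS == 1: for comp == 19 (i.e.
+ * 19/8 pel), mag - 1 == 18, so mv_class == MV_CLASS_1 (class base 16) and
+ * offset == 2, giving d == 0 (integer bits), fr == 1 (fractional bits) and
+ * hp == 0 (high precision bit). */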
+
+void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx,
+ MvSubpelPrecision precision) {
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
+
+ update_cdf(mvctx->joints_cdf, j, MV_JOINTS);
+
+ if (mv_joint_vertical(j))
+ update_mv_component_stats(diff.row, &mvctx->comps[0], precision);
+
+ if (mv_joint_horizontal(j))
+ update_mv_component_stats(diff.col, &mvctx->comps[1], precision);
+}
+
+static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
+ MvSubpelPrecision precision) {
+ assert(comp != 0);
+ int offset;
+ const int sign = comp < 0;
+ const int mag = sign ? -comp : comp;
+ const int mv_class = av1_get_mv_class(mag - 1, &offset);
+ const int d = offset >> 3; // int mv data
+ const int fr = (offset >> 1) & 3; // fractional mv data
+ const int hp = offset & 1; // high precision mv data
+
+ // Sign
+ aom_write_symbol(w, sign, mvcomp->sign_cdf, 2);
+
+ // Class
+ aom_write_symbol(w, mv_class, mvcomp->classes_cdf, MV_CLASSES);
+
+ // Integer bits
+ if (mv_class == MV_CLASS_0) {
+ aom_write_symbol(w, d, mvcomp->class0_cdf, CLASS0_SIZE);
+ } else {
+ int i;
+ const int n = mv_class + CLASS0_BITS - 1; // number of bits
+ for (i = 0; i < n; ++i)
+ aom_write_symbol(w, (d >> i) & 1, mvcomp->bits_cdf[i], 2);
+ }
+ // Fractional bits
+ if (precision > MV_SUBPEL_NONE) {
+ aom_write_symbol(
+ w, fr,
+ mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
+ MV_FP_SIZE);
+ }
+
+ // High precision bit
+ if (precision > MV_SUBPEL_LOW_PRECISION)
+ aom_write_symbol(
+ w, hp, mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf,
+ 2);
+}
+
+/* TODO(siekyleb@amazon.com): This function writes MV_VALS ints or 128 KiB. This
+ * is more than most L1D caches and is a significant chunk of L2. Write
+ * SIMD that uses streaming writes to avoid loading all of that into L1, or
+ * just don't update the larger component costs every time this is called
+ * (or both).
+ */
+void av1_build_nmv_component_cost_table(int *mvcost,
+ const nmv_component *const mvcomp,
+ MvSubpelPrecision precision) {
+ int i, j, v, o, mantissa;
+ int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
+ int bits_cost[MV_OFFSET_BITS][2];
+ int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE] = { 0 },
+ fp_cost[MV_FP_SIZE] = { 0 };
+ int class0_hp_cost[2] = { 0 }, hp_cost[2] = { 0 };
+
+ av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL);
+ av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL);
+ av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, NULL);
+ for (i = 0; i < MV_OFFSET_BITS; ++i) {
+ av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], NULL);
+ }
+
+ if (precision > MV_SUBPEL_NONE) {
+ for (i = 0; i < CLASS0_SIZE; ++i)
+ av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i],
+ NULL);
+ av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL);
+ }
+
+ if (precision > MV_SUBPEL_LOW_PRECISION) {
+ av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL);
+ av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL);
+ }
+
+  // Instead of accumulating the cost of each vector component's bits
+  // individually, compute the costs based on smaller vectors: costs for
+  // [2^exp, 2 * 2^exp - 1] are calculated from those for [0, 2^exp - 1].
+  // Offsets are maintained to 1) swap the class cost of a value treated as a
+  // complete vector component for the cost of its highest set bit when it is
+  // treated as a mantissa (significand), and 2) account for the leading
+  // zeros implied by the current exponent.
+
+ // Cost offsets
+ int cost_swap[MV_OFFSET_BITS] = { 0 };
+ // Delta to convert positive vector to negative vector costs
+ int negate_sign = sign_cost[1] - sign_cost[0];
+
+ // Initialize with offsets to swap the class costs with the costs of the
+ // highest set bit.
+ for (i = 1; i < MV_OFFSET_BITS; ++i) {
+ cost_swap[i] = bits_cost[i - 1][1];
+ if (i > CLASS0_BITS) cost_swap[i] -= class_cost[i - CLASS0_BITS];
+ }
+
+  // Seed the fractional costs onto the output (overwritten later).
+ for (o = 0; o < MV_FP_SIZE; ++o) {
+ int hp;
+ for (hp = 0; hp < 2; ++hp) {
+ v = 2 * o + hp + 1;
+ mvcost[v] = fp_cost[o] + hp_cost[hp] + sign_cost[0];
+ }
+ }
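+  // With the usual MV_FP_SIZE == 4 this seeds mvcost[1..8], i.e. the costs
+  // of the eighth-pel magnitudes up to one full pel.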
+
+ mvcost[0] = 0;
+ // Fill the costs for each exponent's vectors, using the costs set in the
+ // previous exponents.
+ for (i = 0; i < MV_OFFSET_BITS; ++i) {
+ const int exponent = (2 * MV_FP_SIZE) << i;
+
+ int class = 0;
+ if (i >= CLASS0_BITS) {
+ class = class_cost[i - CLASS0_BITS + 1];
+ }
+
+ // Iterate through mantissas, keeping track of the location
+ // of the highest set bit for the mantissa.
+ // To be clear: in the outer loop, the position of the highest set bit
+ // (exponent) is tracked and, in this loop, the highest set bit of the
+ // mantissa is tracked.
+ mantissa = 0;
+ for (j = 0; j <= i; ++j) {
+ for (; mantissa < (2 * MV_FP_SIZE) << j; ++mantissa) {
+ int cost = mvcost[mantissa + 1] + class + cost_swap[j];
+ v = exponent + mantissa + 1;
+ mvcost[v] = cost;
+ mvcost[-v] = cost + negate_sign;
+ }
+ cost_swap[j] += bits_cost[i][0];
+ }
+ }
+
+ // Special case to avoid buffer overrun
+ {
+ int exponent = (2 * MV_FP_SIZE) << MV_OFFSET_BITS;
+ int class = class_cost[MV_CLASSES - 1];
+ mantissa = 0;
+ for (j = 0; j < MV_OFFSET_BITS; ++j) {
+ for (; mantissa < (2 * MV_FP_SIZE) << j; ++mantissa) {
+ int cost = mvcost[mantissa + 1] + class + cost_swap[j];
+ v = exponent + mantissa + 1;
+ mvcost[v] = cost;
+ mvcost[-v] = cost + negate_sign;
+ }
+ }
+ // At this point: mantissa = exponent >> 1
+
+ // Manually calculate the final cost offset
+ int cost_swap_hi =
+ bits_cost[MV_OFFSET_BITS - 1][1] - class_cost[MV_CLASSES - 2];
+ for (; mantissa < exponent - 1; ++mantissa) {
+ int cost = mvcost[mantissa + 1] + class + cost_swap_hi;
+ v = exponent + mantissa + 1;
+ mvcost[v] = cost;
+ mvcost[-v] = cost + negate_sign;
+ }
+ }
+
+ // Fill costs for class0 vectors, overwriting previous placeholder values
+ // used for calculating the costs of the larger vectors.
+ for (i = 0; i < CLASS0_SIZE; ++i) {
+ const int top = i * 2 * MV_FP_SIZE;
+ for (o = 0; o < MV_FP_SIZE; ++o) {
+ int hp;
+ int cost = class0_fp_cost[i][o] + class_cost[0] + class0_cost[i];
+ for (hp = 0; hp < 2; ++hp) {
+ v = top + 2 * o + hp + 1;
+ mvcost[v] = cost + class0_hp_cost[hp] + sign_cost[0];
+ mvcost[-v] = cost + class0_hp_cost[hp] + sign_cost[1];
+ }
+ }
+ }
+}
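+/* Note: the table is written with negative indices (mvcost[-v]) for negative
+ * components, so callers are expected to pass a pointer into the middle of
+ * the allocation. An illustrative sketch only:
+ *
+ *   int buf[MV_VALS];            // MV_VALS == 2 * MV_MAX + 1
+ *   int *mvcost = buf + MV_MAX;  // valid indices: -MV_MAX .. MV_MAX
+ *   av1_build_nmv_component_cost_table(mvcost, &ctx->comps[0], precision);
+ */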
+
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv,
+ const MV *ref, nmv_context *mvctx, int usehp) {
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
+ // If the mv_diff is zero, then we should have used near or nearest instead.
+ assert(j != MV_JOINT_ZERO);
+ if (cpi->common.features.cur_frame_force_integer_mv) {
+ usehp = MV_SUBPEL_NONE;
+ }
+ aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS);
+ if (mv_joint_vertical(j))
+ encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
+
+ if (mv_joint_horizontal(j))
+ encode_mv_component(w, diff.col, &mvctx->comps[1], usehp);
+
+ // If auto_mv_step_size is enabled then keep track of the largest
+ // motion vector component used.
+ if (cpi->sf.mv_sf.auto_mv_step_size) {
+ int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3;
+ td->max_mv_magnitude = AOMMAX(maxv, td->max_mv_magnitude);
+ }
+}
+
+void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx) {
+ // DV and ref DV should not have sub-pel.
+ assert((mv->col & 7) == 0);
+ assert((mv->row & 7) == 0);
+ assert((ref->col & 7) == 0);
+ assert((ref->row & 7) == 0);
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
+
+ aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS);
+ if (mv_joint_vertical(j))
+ encode_mv_component(w, diff.row, &mvctx->comps[0], MV_SUBPEL_NONE);
+
+ if (mv_joint_horizontal(j))
+ encode_mv_component(w, diff.col, &mvctx->comps[1], MV_SUBPEL_NONE);
+}
+
+void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+ const nmv_context *ctx,
+ MvSubpelPrecision precision) {
+ av1_cost_tokens_from_cdf(mvjoint, ctx->joints_cdf, NULL);
+ av1_build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], precision);
+ av1_build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], precision);
+}
+
+int_mv av1_get_ref_mv_from_stack(int ref_idx,
+ const MV_REFERENCE_FRAME *ref_frame,
+ int ref_mv_idx,
+ const MB_MODE_INFO_EXT *mbmi_ext) {
+ const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+ const CANDIDATE_MV *curr_ref_mv_stack =
+ mbmi_ext->ref_mv_stack[ref_frame_type];
+
+ if (ref_frame[1] > INTRA_FRAME) {
+ assert(ref_idx == 0 || ref_idx == 1);
+ return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv
+ : curr_ref_mv_stack[ref_mv_idx].this_mv;
+ }
+
+ assert(ref_idx == 0);
+ return ref_mv_idx < mbmi_ext->ref_mv_count[ref_frame_type]
+ ? curr_ref_mv_stack[ref_mv_idx].this_mv
+ : mbmi_ext->global_mvs[ref_frame_type];
+}
+
+int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ int ref_mv_idx = mbmi->ref_mv_idx;
+ if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) {
+ assert(has_second_ref(mbmi));
+ ref_mv_idx += 1;
+ }
+ return av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx,
+ &x->mbmi_ext);
+}
+
+void av1_find_best_ref_mvs_from_stack(int allow_hp,
+ const MB_MODE_INFO_EXT *mbmi_ext,
+ MV_REFERENCE_FRAME ref_frame,
+ int_mv *nearest_mv, int_mv *near_mv,
+ int is_integer) {
+ const int ref_idx = 0;
+ MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME };
+ *nearest_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 0, mbmi_ext);
+ lower_mv_precision(&nearest_mv->as_mv, allow_hp, is_integer);
+ *near_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 1, mbmi_ext);
+ lower_mv_precision(&near_mv->as_mv, allow_hp, is_integer);
+}
diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h
new file mode 100644
index 0000000000..c39001a5a2
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemv.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEMV_H_
+#define AOM_AV1_ENCODER_ENCODEMV_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv,
+ const MV *ref, nmv_context *mvctx, int usehp);
+
+void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx,
+ MvSubpelPrecision precision);
+
+void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+ const nmv_context *mvctx,
+ MvSubpelPrecision precision);
+void av1_build_nmv_component_cost_table(int *mvcost,
+ const nmv_component *const mvcomp,
+ MvSubpelPrecision precision);
+
+void av1_update_mv_count(ThreadData *td);
+
+void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx);
+int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx);
+int_mv av1_get_ref_mv_from_stack(int ref_idx,
+ const MV_REFERENCE_FRAME *ref_frame,
+ int ref_mv_idx,
+ const MB_MODE_INFO_EXT *mbmi_ext);
+void av1_find_best_ref_mvs_from_stack(int allow_hp,
+ const MB_MODE_INFO_EXT *mbmi_ext,
+ MV_REFERENCE_FRAME ref_frame,
+ int_mv *nearest_mv, int_mv *near_mv,
+ int is_integer);
+
+static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) {
+ // row: Z col: Z | MV_JOINT_ZERO (0)
+ // row: Z col: NZ | MV_JOINT_HNZVZ (1)
+ // row: NZ col: Z | MV_JOINT_HZVNZ (2)
+ // row: NZ col: NZ | MV_JOINT_HNZVNZ (3)
+ return (!!mv->col) | ((!!mv->row) << 1);
+}
+
+static INLINE int av1_mv_class_base(MV_CLASS_TYPE c) {
+ return c ? CLASS0_SIZE << (c + 2) : 0;
+}
+
+// If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0.
+static INLINE uint8_t av1_log_in_base_2(unsigned int n) {
+ // get_msb() is only valid when n != 0.
+ return n == 0 ? 0 : get_msb(n);
+}
+
+static INLINE MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) {
+ assert(z >= 0);
+ const MV_CLASS_TYPE c = (MV_CLASS_TYPE)av1_log_in_base_2(z >> 3);
+ assert(c <= MV_CLASS_10);
+ if (offset) *offset = z - av1_mv_class_base(c);
+ return c;
+}
+
+static INLINE int av1_check_newmv_joint_nonzero(const AV1_COMMON *cm,
+ MACROBLOCK *const x) {
+ (void)cm;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ if (this_mode == NEW_NEWMV) {
+ const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
+ const int_mv ref_mv_1 = av1_get_ref_mv(x, 1);
+ if (mbmi->mv[0].as_int == ref_mv_0.as_int ||
+ mbmi->mv[1].as_int == ref_mv_1.as_int) {
+ return 0;
+ }
+ } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+ const int_mv ref_mv_1 = av1_get_ref_mv(x, 1);
+ if (mbmi->mv[1].as_int == ref_mv_1.as_int) {
+ return 0;
+ }
+ } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) {
+ const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
+ if (mbmi->mv[0].as_int == ref_mv_0.as_int) {
+ return 0;
+ }
+ } else if (this_mode == NEWMV) {
+ const int_mv ref_mv_0 = av1_get_ref_mv(x, 0);
+ if (mbmi->mv[0].as_int == ref_mv_0.as_int) {
+ return 0;
+ }
+ }
+ return 1;
+}
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEMV_H_
diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c
new file mode 100644
index 0000000000..4732ad435b
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder.c
@@ -0,0 +1,5409 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <time.h>
+#include <stdlib.h>
+
+#include "av1/common/scale.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aomcx.h"
+
+#if CONFIG_DENOISE
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/noise_model.h"
+#endif
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/psnr.h"
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/filter.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/resize.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/allintra_vis.h"
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/dwt.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/hash_motion.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/mv_prec.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/pickcdef.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/pickrst.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#if CONFIG_SALIENCY_MAP
+#include "av1/encoder/saliency_map.h"
+#endif
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/superres_scale.h"
+#include "av1/encoder/thirdpass.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/var_based_part.h"
+
+#define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
+
+// #define OUTPUT_YUV_REC
+#ifdef OUTPUT_YUV_REC
+FILE *yuv_rec_file;
+#define FILE_NAME_LEN 100
+#endif
+
+#ifdef OUTPUT_YUV_DENOISED
+FILE *yuv_denoised_file = NULL;
+#endif
+
+static INLINE void Scale2Ratio(AOM_SCALING_MODE mode, int *hr, int *hs) {
+ switch (mode) {
+ case AOME_NORMAL:
+ *hr = 1;
+ *hs = 1;
+ break;
+ case AOME_FOURFIVE:
+ *hr = 4;
+ *hs = 5;
+ break;
+ case AOME_THREEFIVE:
+ *hr = 3;
+ *hs = 5;
+ break;
+ case AOME_THREEFOUR:
+ *hr = 3;
+ *hs = 4;
+ break;
+ case AOME_ONEFOUR:
+ *hr = 1;
+ *hs = 4;
+ break;
+ case AOME_ONEEIGHT:
+ *hr = 1;
+ *hs = 8;
+ break;
+ case AOME_ONETWO:
+ *hr = 1;
+ *hs = 2;
+ break;
+ case AOME_TWOTHREE:
+ *hr = 2;
+ *hs = 3;
+ break;
+ case AOME_ONETHREE:
+ *hr = 1;
+ *hs = 3;
+ break;
+ default:
+ *hr = 1;
+ *hs = 1;
+ assert(0);
+ break;
+ }
+}
+
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+ int cols) {
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+ if (rows == mi_params->mb_rows && cols == mi_params->mb_cols) {
+ unsigned char *const active_map_4x4 = cpi->active_map.map;
+ const int mi_rows = mi_params->mi_rows;
+ const int mi_cols = mi_params->mi_cols;
+ const int row_scale = mi_size_high_log2[BLOCK_16X16];
+ const int col_scale = mi_size_wide_log2[BLOCK_16X16];
+ cpi->active_map.update = 0;
+ assert(mi_rows % 2 == 0);
+ assert(mi_cols % 2 == 0);
+ if (new_map_16x16) {
+ for (int r = 0; r < (mi_rows >> row_scale); ++r) {
+ for (int c = 0; c < (mi_cols >> col_scale); ++c) {
+ const uint8_t val = new_map_16x16[r * cols + c]
+ ? AM_SEGMENT_ID_ACTIVE
+ : AM_SEGMENT_ID_INACTIVE;
+ active_map_4x4[(2 * r + 0) * mi_cols + (c + 0)] = val;
+ active_map_4x4[(2 * r + 0) * mi_cols + (c + 1)] = val;
+ active_map_4x4[(2 * r + 1) * mi_cols + (c + 0)] = val;
+ active_map_4x4[(2 * r + 1) * mi_cols + (c + 1)] = val;
+ }
+ }
+ cpi->active_map.enabled = 1;
+ }
+ return 0;
+ }
+
+ return -1;
+}
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+ int cols) {
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+ if (rows == mi_params->mb_rows && cols == mi_params->mb_cols &&
+ new_map_16x16) {
+ unsigned char *const seg_map_8x8 = cpi->enc_seg.map;
+ const int mi_rows = mi_params->mi_rows;
+ const int mi_cols = mi_params->mi_cols;
+ const int row_scale = mi_size_high_log2[BLOCK_16X16];
+ const int col_scale = mi_size_wide_log2[BLOCK_16X16];
+ assert(mi_rows % 2 == 0);
+ assert(mi_cols % 2 == 0);
+
+ memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);
+ if (cpi->active_map.enabled) {
+ for (int r = 0; r < (mi_rows >> row_scale); ++r) {
+ for (int c = 0; c < (mi_cols >> col_scale); ++c) {
+ // Cyclic refresh segments are considered active despite not having
+ // AM_SEGMENT_ID_ACTIVE
+ uint8_t temp = 0;
+ temp |= seg_map_8x8[(2 * r + 0) * mi_cols + (2 * c + 0)] !=
+ AM_SEGMENT_ID_INACTIVE;
+ temp |= seg_map_8x8[(2 * r + 0) * mi_cols + (2 * c + 1)] !=
+ AM_SEGMENT_ID_INACTIVE;
+ temp |= seg_map_8x8[(2 * r + 1) * mi_cols + (2 * c + 0)] !=
+ AM_SEGMENT_ID_INACTIVE;
+ temp |= seg_map_8x8[(2 * r + 1) * mi_cols + (2 * c + 1)] !=
+ AM_SEGMENT_ID_INACTIVE;
+ new_map_16x16[r * cols + c] |= temp;
+ }
+ }
+ }
+ return 0;
+ }
+
+ return -1;
+}
+
+void av1_initialize_enc(unsigned int usage, enum aom_rc_mode end_usage) {
+ bool is_allintra = usage == ALLINTRA;
+
+ av1_rtcd();
+ aom_dsp_rtcd();
+ aom_scale_rtcd();
+ av1_init_intra_predictors();
+ av1_init_me_luts();
+ if (!is_allintra) av1_init_wedge_masks();
+ if (!is_allintra || end_usage != AOM_Q) av1_rc_init_minq_luts();
+}
+
+void av1_new_framerate(AV1_COMP *cpi, double framerate) {
+ cpi->framerate = framerate < 0.1 ? 30 : framerate;
+ av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height);
+}
+
+double av1_get_compression_ratio(const AV1_COMMON *const cm,
+ size_t encoded_frame_size) {
+ const int upscaled_width = cm->superres_upscaled_width;
+ const int height = cm->height;
+ const int64_t luma_pic_size = (int64_t)upscaled_width * height;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const BITSTREAM_PROFILE profile = seq_params->profile;
+ const int pic_size_profile_factor =
+ profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36);
+ encoded_frame_size =
+ (encoded_frame_size > 129 ? encoded_frame_size - 128 : 1);
+ const int64_t uncompressed_frame_size =
+ (luma_pic_size * pic_size_profile_factor) >> 3;
+ return (double)uncompressed_frame_size / encoded_frame_size;
+}
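+// Worked example: for a 1920x1080 PROFILE_0 stream, luma_pic_size is
+// 2,073,600, so uncompressed_frame_size = (2,073,600 * 15) >> 3 = 3,888,000
+// bytes; a 100,000-byte coded frame (counted as 99,872 after the fixed
+// 128-byte deduction above) gives a ratio of roughly 38.9.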
+
+static void auto_tile_size_balancing(AV1_COMMON *const cm, int num_sbs,
+ int num_tiles_lg, int tile_col_row) {
+ CommonTileParams *const tiles = &cm->tiles;
+ int i, start_sb;
+ int size_sb = num_sbs >> num_tiles_lg;
+ int res_sbs = num_sbs - (size_sb << num_tiles_lg);
+ int num_tiles = 1 << num_tiles_lg;
+ int inc_index = num_tiles - res_sbs;
+
+ tiles->uniform_spacing = 0;
+
+ for (i = 0, start_sb = 0; start_sb < num_sbs && i < MAX_TILE_COLS; ++i) {
+ if (i == inc_index) ++size_sb;
+ if (tile_col_row)
+ tiles->col_start_sb[i] = start_sb;
+ else
+ tiles->row_start_sb[i] = start_sb;
+
+ start_sb += AOMMIN(size_sb, tiles->max_width_sb);
+ }
+
+ if (tile_col_row) {
+ tiles->cols = i;
+ tiles->col_start_sb[i] = num_sbs;
+ } else {
+ tiles->rows = i;
+ tiles->row_start_sb[i] = num_sbs;
+ }
+}
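+// Worked example: with num_sbs == 19 and num_tiles_lg == 2 (4 tiles),
+// size_sb == 4, res_sbs == 3 and inc_index == 1, so the tile starts become
+// 0, 4, 9, 14 (widths 4, 5, 5, 5): the leftover SBs are spread over the last
+// res_sbs tiles (assuming tiles->max_width_sb does not bind).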
+
+static void set_tile_info(AV1_COMMON *const cm,
+ const TileConfig *const tile_cfg) {
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ CommonTileParams *const tiles = &cm->tiles;
+ int i, start_sb;
+
+ av1_get_tile_limits(cm);
+
+ int sb_cols =
+ CEIL_POWER_OF_TWO(mi_params->mi_cols, seq_params->mib_size_log2);
+ // configure tile columns
+ if (tile_cfg->tile_width_count == 0 || tile_cfg->tile_height_count == 0) {
+ tiles->uniform_spacing = 1;
+ tiles->log2_cols = AOMMAX(tile_cfg->tile_columns, tiles->min_log2_cols);
+ // Add a special case to handle super resolution
+ sb_cols = coded_to_superres_mi(sb_cols, cm->superres_scale_denominator);
+ int min_log2_cols = 0;
+ for (; (tiles->max_width_sb << min_log2_cols) <= sb_cols; ++min_log2_cols) {
+ }
+ tiles->log2_cols = AOMMAX(tiles->log2_cols, min_log2_cols);
+
+ tiles->log2_cols = AOMMIN(tiles->log2_cols, tiles->max_log2_cols);
+ } else if (tile_cfg->tile_widths[0] < 0) {
+ auto_tile_size_balancing(cm, sb_cols, tile_cfg->tile_columns, 1);
+ } else {
+ int size_sb, j = 0;
+ tiles->uniform_spacing = 0;
+ for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) {
+ tiles->col_start_sb[i] = start_sb;
+ size_sb = tile_cfg->tile_widths[j++];
+ if (j >= tile_cfg->tile_width_count) j = 0;
+ start_sb += AOMMIN(size_sb, tiles->max_width_sb);
+ }
+ tiles->cols = i;
+ tiles->col_start_sb[i] = sb_cols;
+ }
+ av1_calculate_tile_cols(seq_params, mi_params->mi_rows, mi_params->mi_cols,
+ tiles);
+
+ // configure tile rows
+ int sb_rows =
+ CEIL_POWER_OF_TWO(mi_params->mi_rows, seq_params->mib_size_log2);
+ if (tiles->uniform_spacing) {
+ tiles->log2_rows = AOMMAX(tile_cfg->tile_rows, tiles->min_log2_rows);
+ tiles->log2_rows = AOMMIN(tiles->log2_rows, tiles->max_log2_rows);
+ } else if (tile_cfg->tile_heights[0] < 0) {
+ auto_tile_size_balancing(cm, sb_rows, tile_cfg->tile_rows, 0);
+ } else {
+ int size_sb, j = 0;
+ for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) {
+ tiles->row_start_sb[i] = start_sb;
+ size_sb = tile_cfg->tile_heights[j++];
+ if (j >= tile_cfg->tile_height_count) j = 0;
+ start_sb += AOMMIN(size_sb, tiles->max_height_sb);
+ }
+ tiles->rows = i;
+ tiles->row_start_sb[i] = sb_rows;
+ }
+ av1_calculate_tile_rows(seq_params, mi_params->mi_rows, tiles);
+}
+
+void av1_update_frame_size(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+ // Setup mi_params here in case we need more mi's.
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+ mi_params->set_mb_mi(mi_params, cm->width, cm->height,
+ cpi->sf.part_sf.default_min_partition_size);
+
+ av1_init_macroblockd(cm, xd);
+
+ if (!cpi->ppi->seq_params_locked)
+ set_sb_size(cm->seq_params,
+ av1_select_sb_size(&cpi->oxcf, cm->width, cm->height,
+ cpi->ppi->number_spatial_layers));
+
+ set_tile_info(cm, &cpi->oxcf.tile_cfg);
+}
+
+static INLINE int does_level_match(int width, int height, double fps,
+ int lvl_width, int lvl_height,
+ double lvl_fps, int lvl_dim_mult) {
+ const int64_t lvl_luma_pels = (int64_t)lvl_width * lvl_height;
+ const double lvl_display_sample_rate = lvl_luma_pels * lvl_fps;
+ const int64_t luma_pels = (int64_t)width * height;
+ const double display_sample_rate = luma_pels * fps;
+ return luma_pels <= lvl_luma_pels &&
+ display_sample_rate <= lvl_display_sample_rate &&
+ width <= lvl_width * lvl_dim_mult &&
+ height <= lvl_height * lvl_dim_mult;
+}
+
+static void set_bitstream_level_tier(AV1_PRIMARY *const ppi, int width,
+ int height, double init_framerate) {
+ SequenceHeader *const seq_params = &ppi->seq_params;
+ const AV1LevelParams *const level_params = &ppi->level_params;
+ // TODO(any): This is a placeholder function that only addresses dimensions
+ // and max display sample rates.
+ // Need to add checks for max bit rate, max decoded luma sample rate, header
+ // rate, etc. that are not covered by this function.
+ AV1_LEVEL level = SEQ_LEVEL_MAX;
+ if (does_level_match(width, height, init_framerate, 512, 288, 30.0, 4)) {
+ level = SEQ_LEVEL_2_0;
+ } else if (does_level_match(width, height, init_framerate, 704, 396, 30.0,
+ 4)) {
+ level = SEQ_LEVEL_2_1;
+ } else if (does_level_match(width, height, init_framerate, 1088, 612, 30.0,
+ 4)) {
+ level = SEQ_LEVEL_3_0;
+ } else if (does_level_match(width, height, init_framerate, 1376, 774, 30.0,
+ 4)) {
+ level = SEQ_LEVEL_3_1;
+ } else if (does_level_match(width, height, init_framerate, 2048, 1152, 30.0,
+ 3)) {
+ level = SEQ_LEVEL_4_0;
+ } else if (does_level_match(width, height, init_framerate, 2048, 1152, 60.0,
+ 3)) {
+ level = SEQ_LEVEL_4_1;
+ } else if (does_level_match(width, height, init_framerate, 4096, 2176, 30.0,
+ 2)) {
+ level = SEQ_LEVEL_5_0;
+ } else if (does_level_match(width, height, init_framerate, 4096, 2176, 60.0,
+ 2)) {
+ level = SEQ_LEVEL_5_1;
+ } else if (does_level_match(width, height, init_framerate, 4096, 2176, 120.0,
+ 2)) {
+ level = SEQ_LEVEL_5_2;
+ } else if (does_level_match(width, height, init_framerate, 8192, 4352, 30.0,
+ 2)) {
+ level = SEQ_LEVEL_6_0;
+ } else if (does_level_match(width, height, init_framerate, 8192, 4352, 60.0,
+ 2)) {
+ level = SEQ_LEVEL_6_1;
+ } else if (does_level_match(width, height, init_framerate, 8192, 4352, 120.0,
+ 2)) {
+ level = SEQ_LEVEL_6_2;
+ }
+#if CONFIG_CWG_C013
+  // TODO(bohanli): the target level currently only works for the 0th
+  // operating point, so scalable coding is not supported.
+ else if (level_params->target_seq_level_idx[0] >= SEQ_LEVEL_7_0 &&
+ level_params->target_seq_level_idx[0] <= SEQ_LEVEL_8_3) {
+ // Only use level 7.x to 8.x when explicitly asked to.
+ if (does_level_match(width, height, init_framerate, 16384, 8704, 30.0, 2)) {
+ level = SEQ_LEVEL_7_0;
+ } else if (does_level_match(width, height, init_framerate, 16384, 8704,
+ 60.0, 2)) {
+ level = SEQ_LEVEL_7_1;
+ } else if (does_level_match(width, height, init_framerate, 16384, 8704,
+ 120.0, 2)) {
+ level = SEQ_LEVEL_7_2;
+ } else if (does_level_match(width, height, init_framerate, 32768, 17408,
+ 30.0, 2)) {
+ level = SEQ_LEVEL_8_0;
+ } else if (does_level_match(width, height, init_framerate, 32768, 17408,
+ 60.0, 2)) {
+ level = SEQ_LEVEL_8_1;
+ } else if (does_level_match(width, height, init_framerate, 32768, 17408,
+ 120.0, 2)) {
+ level = SEQ_LEVEL_8_2;
+ }
+ }
+#endif
+
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+ assert(is_valid_seq_level_idx(level_params->target_seq_level_idx[i]) ||
+ level_params->target_seq_level_idx[i] == SEQ_LEVEL_KEEP_STATS);
+ // If a higher target level is specified, it is then used rather than the
+ // inferred one from resolution and framerate.
+ seq_params->seq_level_idx[i] =
+ level_params->target_seq_level_idx[i] < SEQ_LEVELS &&
+ level_params->target_seq_level_idx[i] > level
+ ? level_params->target_seq_level_idx[i]
+ : level;
+ // Set the maximum parameters for bitrate and buffer size for this profile,
+ // level, and tier
+ seq_params->op_params[i].bitrate = av1_max_level_bitrate(
+ seq_params->profile, seq_params->seq_level_idx[i], seq_params->tier[i]);
+ // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the
+ // check
+ if (seq_params->op_params[i].bitrate == 0)
+ aom_internal_error(
+ &ppi->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "AV1 does not support this combination of profile, level, and tier.");
+    // Buffer size in bits is bitrate in bits/s * 1 s
+ seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate;
+ }
+}
+
+void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi,
+ const AV1EncoderConfig *oxcf,
+ int disable_frame_id_numbers) {
+ SequenceHeader *const seq = &ppi->seq_params;
+ const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+ const ToolCfg *const tool_cfg = &oxcf->tool_cfg;
+
+ seq->still_picture =
+ !tool_cfg->force_video_mode && (oxcf->input_cfg.limit == 1);
+ seq->reduced_still_picture_hdr =
+ seq->still_picture && !tool_cfg->full_still_picture_hdr;
+ seq->force_screen_content_tools = 2;
+ seq->force_integer_mv = 2;
+ seq->order_hint_info.enable_order_hint = tool_cfg->enable_order_hint;
+ seq->frame_id_numbers_present_flag =
+ !seq->reduced_still_picture_hdr &&
+ !oxcf->tile_cfg.enable_large_scale_tile &&
+ tool_cfg->error_resilient_mode && !disable_frame_id_numbers;
+ if (seq->reduced_still_picture_hdr) {
+ seq->order_hint_info.enable_order_hint = 0;
+ seq->force_screen_content_tools = 2;
+ seq->force_integer_mv = 2;
+ }
+ seq->order_hint_info.order_hint_bits_minus_1 =
+ seq->order_hint_info.enable_order_hint
+ ? DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1
+ : -1;
+
+ seq->max_frame_width = frm_dim_cfg->forced_max_frame_width
+ ? frm_dim_cfg->forced_max_frame_width
+ : frm_dim_cfg->width;
+ seq->max_frame_height = frm_dim_cfg->forced_max_frame_height
+ ? frm_dim_cfg->forced_max_frame_height
+ : frm_dim_cfg->height;
+ seq->num_bits_width =
+ (seq->max_frame_width > 1) ? get_msb(seq->max_frame_width - 1) + 1 : 1;
+ seq->num_bits_height =
+ (seq->max_frame_height > 1) ? get_msb(seq->max_frame_height - 1) + 1 : 1;
+ assert(seq->num_bits_width <= 16);
+ assert(seq->num_bits_height <= 16);
+
+ seq->frame_id_length = FRAME_ID_LENGTH;
+ seq->delta_frame_id_length = DELTA_FRAME_ID_LENGTH;
+
+ seq->enable_dual_filter = tool_cfg->enable_dual_filter;
+ seq->order_hint_info.enable_dist_wtd_comp =
+ oxcf->comp_type_cfg.enable_dist_wtd_comp;
+ seq->order_hint_info.enable_dist_wtd_comp &=
+ seq->order_hint_info.enable_order_hint;
+ seq->order_hint_info.enable_ref_frame_mvs = tool_cfg->ref_frame_mvs_present;
+ seq->order_hint_info.enable_ref_frame_mvs &=
+ seq->order_hint_info.enable_order_hint;
+ seq->enable_superres = oxcf->superres_cfg.enable_superres;
+ seq->enable_cdef = tool_cfg->cdef_control != CDEF_NONE ? 1 : 0;
+ seq->enable_restoration = tool_cfg->enable_restoration;
+ seq->enable_warped_motion = oxcf->motion_mode_cfg.enable_warped_motion;
+ seq->enable_interintra_compound = tool_cfg->enable_interintra_comp;
+ seq->enable_masked_compound = oxcf->comp_type_cfg.enable_masked_comp;
+ seq->enable_intra_edge_filter = oxcf->intra_mode_cfg.enable_intra_edge_filter;
+ seq->enable_filter_intra = oxcf->intra_mode_cfg.enable_filter_intra;
+
+ set_bitstream_level_tier(ppi, frm_dim_cfg->width, frm_dim_cfg->height,
+ oxcf->input_cfg.init_framerate);
+
+ if (seq->operating_points_cnt_minus_1 == 0) {
+ seq->operating_point_idc[0] = 0;
+ } else {
+    // Set operating_point_idc[] such that the i=0 point corresponds to the
+    // highest quality operating point (all layers), and subsequent
+    // operating points (i > 0) are lower quality, corresponding to skipping
+    // the decoding of enhancement layers (temporal first).
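+    // For example, with 2 spatial and 3 temporal layers, the i=0 point gets
+    // spatial mask 0b11 (bits 8..15) and temporal mask 0b111 (bits 0..7),
+    // i.e. operating_point_idc[0] = 0x307; later points drop layers from
+    // these masks.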
+ int i = 0;
+ assert(seq->operating_points_cnt_minus_1 ==
+ (int)(ppi->number_spatial_layers * ppi->number_temporal_layers - 1));
+ for (unsigned int sl = 0; sl < ppi->number_spatial_layers; sl++) {
+ for (unsigned int tl = 0; tl < ppi->number_temporal_layers; tl++) {
+ seq->operating_point_idc[i] =
+ (~(~0u << (ppi->number_spatial_layers - sl)) << 8) |
+ ~(~0u << (ppi->number_temporal_layers - tl));
+ i++;
+ }
+ }
+ }
+}
+
+static void init_config_sequence(struct AV1_PRIMARY *ppi,
+ const AV1EncoderConfig *oxcf) {
+ SequenceHeader *const seq_params = &ppi->seq_params;
+ const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
+ const ColorCfg *const color_cfg = &oxcf->color_cfg;
+
+ ppi->use_svc = 0;
+ ppi->number_spatial_layers = 1;
+ ppi->number_temporal_layers = 1;
+
+ seq_params->profile = oxcf->profile;
+ seq_params->bit_depth = oxcf->tool_cfg.bit_depth;
+ seq_params->use_highbitdepth = oxcf->use_highbitdepth;
+ seq_params->color_primaries = color_cfg->color_primaries;
+ seq_params->transfer_characteristics = color_cfg->transfer_characteristics;
+ seq_params->matrix_coefficients = color_cfg->matrix_coefficients;
+ seq_params->monochrome = oxcf->tool_cfg.enable_monochrome;
+ seq_params->chroma_sample_position = color_cfg->chroma_sample_position;
+ seq_params->color_range = color_cfg->color_range;
+ seq_params->timing_info_present = dec_model_cfg->timing_info_present;
+ seq_params->timing_info.num_units_in_display_tick =
+ dec_model_cfg->timing_info.num_units_in_display_tick;
+ seq_params->timing_info.time_scale = dec_model_cfg->timing_info.time_scale;
+ seq_params->timing_info.equal_picture_interval =
+ dec_model_cfg->timing_info.equal_picture_interval;
+ seq_params->timing_info.num_ticks_per_picture =
+ dec_model_cfg->timing_info.num_ticks_per_picture;
+
+ seq_params->display_model_info_present_flag =
+ dec_model_cfg->display_model_info_present_flag;
+ seq_params->decoder_model_info_present_flag =
+ dec_model_cfg->decoder_model_info_present_flag;
+ if (dec_model_cfg->decoder_model_info_present_flag) {
+ // set the decoder model parameters in schedule mode
+ seq_params->decoder_model_info.num_units_in_decoding_tick =
+ dec_model_cfg->num_units_in_decoding_tick;
+ ppi->buffer_removal_time_present = 1;
+ av1_set_aom_dec_model_info(&seq_params->decoder_model_info);
+ av1_set_dec_model_op_parameters(&seq_params->op_params[0]);
+ } else if (seq_params->timing_info_present &&
+ seq_params->timing_info.equal_picture_interval &&
+ !seq_params->decoder_model_info_present_flag) {
+ // set the decoder model parameters in resource availability mode
+ av1_set_resource_availability_parameters(&seq_params->op_params[0]);
+ } else {
+ seq_params->op_params[0].initial_display_delay =
+ 10; // Default value (not signaled)
+ }
+
+ if (seq_params->monochrome) {
+ seq_params->subsampling_x = 1;
+ seq_params->subsampling_y = 1;
+ } else if (seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
+ seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
+ seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ seq_params->subsampling_x = 0;
+ seq_params->subsampling_y = 0;
+ } else {
+ if (seq_params->profile == 0) {
+ seq_params->subsampling_x = 1;
+ seq_params->subsampling_y = 1;
+ } else if (seq_params->profile == 1) {
+ seq_params->subsampling_x = 0;
+ seq_params->subsampling_y = 0;
+ } else {
+ if (seq_params->bit_depth == AOM_BITS_12) {
+ seq_params->subsampling_x = oxcf->input_cfg.chroma_subsampling_x;
+ seq_params->subsampling_y = oxcf->input_cfg.chroma_subsampling_y;
+ } else {
+ seq_params->subsampling_x = 1;
+ seq_params->subsampling_y = 0;
+ }
+ }
+ }
+ av1_change_config_seq(ppi, oxcf, NULL);
+}
+
+static void init_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
+ AV1_COMMON *const cm = &cpi->common;
+ ResizePendingParams *resize_pending_params = &cpi->resize_pending_params;
+
+ cpi->oxcf = *oxcf;
+ cpi->framerate = oxcf->input_cfg.init_framerate;
+
+ cm->width = oxcf->frm_dim_cfg.width;
+ cm->height = oxcf->frm_dim_cfg.height;
+ cpi->is_dropped_frame = false;
+
+ alloc_compressor_data(cpi);
+
+ cpi->data_alloc_width = cm->width;
+ cpi->data_alloc_height = cm->height;
+ cpi->frame_size_related_setup_done = false;
+
+ // Single thread case: use counts in common.
+ cpi->td.counts = &cpi->counts;
+
+ // Init SVC parameters.
+ cpi->svc.number_spatial_layers = 1;
+ cpi->svc.number_temporal_layers = 1;
+ cm->spatial_layer_id = 0;
+ cm->temporal_layer_id = 0;
+ // Init rtc_ref parameters.
+ cpi->ppi->rtc_ref.set_ref_frame_config = 0;
+ cpi->ppi->rtc_ref.non_reference_frame = 0;
+ cpi->ppi->rtc_ref.ref_frame_comp[0] = 0;
+ cpi->ppi->rtc_ref.ref_frame_comp[1] = 0;
+ cpi->ppi->rtc_ref.ref_frame_comp[2] = 0;
+
+  // av1_change_config() performs all of the setup that is shared with
+  // later reconfiguration.
+ av1_change_config(cpi, oxcf, false);
+
+ cpi->ref_frame_flags = 0;
+
+ // Reset resize pending flags
+ resize_pending_params->width = 0;
+ resize_pending_params->height = 0;
+
+  // Set up the identity scale factor.
+ av1_setup_scale_factors_for_frame(&cm->sf_identity, 1, 1, 1, 1);
+
+ init_buffer_indices(&cpi->force_intpel_info, cm->remapped_ref_idx);
+
+ av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
+}
+
+void av1_change_config_seq(struct AV1_PRIMARY *ppi,
+ const AV1EncoderConfig *oxcf,
+ bool *is_sb_size_changed) {
+ SequenceHeader *const seq_params = &ppi->seq_params;
+ const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+ const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
+ const ColorCfg *const color_cfg = &oxcf->color_cfg;
+
+ if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile;
+ seq_params->bit_depth = oxcf->tool_cfg.bit_depth;
+ seq_params->color_primaries = color_cfg->color_primaries;
+ seq_params->transfer_characteristics = color_cfg->transfer_characteristics;
+ seq_params->matrix_coefficients = color_cfg->matrix_coefficients;
+ seq_params->monochrome = oxcf->tool_cfg.enable_monochrome;
+ seq_params->chroma_sample_position = color_cfg->chroma_sample_position;
+ seq_params->color_range = color_cfg->color_range;
+
+ assert(IMPLIES(seq_params->profile <= PROFILE_1,
+ seq_params->bit_depth <= AOM_BITS_10));
+
+ seq_params->timing_info_present = dec_model_cfg->timing_info_present;
+ seq_params->timing_info.num_units_in_display_tick =
+ dec_model_cfg->timing_info.num_units_in_display_tick;
+ seq_params->timing_info.time_scale = dec_model_cfg->timing_info.time_scale;
+ seq_params->timing_info.equal_picture_interval =
+ dec_model_cfg->timing_info.equal_picture_interval;
+ seq_params->timing_info.num_ticks_per_picture =
+ dec_model_cfg->timing_info.num_ticks_per_picture;
+
+ seq_params->display_model_info_present_flag =
+ dec_model_cfg->display_model_info_present_flag;
+ seq_params->decoder_model_info_present_flag =
+ dec_model_cfg->decoder_model_info_present_flag;
+ if (dec_model_cfg->decoder_model_info_present_flag) {
+ // set the decoder model parameters in schedule mode
+ seq_params->decoder_model_info.num_units_in_decoding_tick =
+ dec_model_cfg->num_units_in_decoding_tick;
+ ppi->buffer_removal_time_present = 1;
+ av1_set_aom_dec_model_info(&seq_params->decoder_model_info);
+ av1_set_dec_model_op_parameters(&seq_params->op_params[0]);
+ } else if (seq_params->timing_info_present &&
+ seq_params->timing_info.equal_picture_interval &&
+ !seq_params->decoder_model_info_present_flag) {
+ // set the decoder model parameters in resource availability mode
+ av1_set_resource_availability_parameters(&seq_params->op_params[0]);
+ } else {
+ seq_params->op_params[0].initial_display_delay =
+ 10; // Default value (not signaled)
+ }
+
+ av1_update_film_grain_parameters_seq(ppi, oxcf);
+
+ int sb_size = seq_params->sb_size;
+ // Superblock size should not be updated after the first key frame.
+ if (!ppi->seq_params_locked) {
+ set_sb_size(seq_params, av1_select_sb_size(oxcf, frm_dim_cfg->width,
+ frm_dim_cfg->height,
+ ppi->number_spatial_layers));
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i)
+ seq_params->tier[i] = (oxcf->tier_mask >> i) & 1;
+ }
+ if (is_sb_size_changed != NULL && sb_size != seq_params->sb_size)
+ *is_sb_size_changed = true;
+
+ // Init sequence level coding tools
+ // This should not be called after the first key frame.
+ if (!ppi->seq_params_locked) {
+ seq_params->operating_points_cnt_minus_1 =
+ (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1)
+ ? ppi->number_spatial_layers * ppi->number_temporal_layers - 1
+ : 0;
+ av1_init_seq_coding_tools(
+ ppi, oxcf, ppi->use_svc || ppi->rtc_ref.set_ref_frame_config);
+ }
+ seq_params->timing_info_present &= !seq_params->reduced_still_picture_hdr;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ highbd_set_var_fns(ppi);
+#endif
+
+ set_primary_rc_buffer_sizes(oxcf, ppi);
+}
+
+void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
+ bool is_sb_size_changed) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = cm->seq_params;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ MACROBLOCK *const x = &cpi->td.mb;
+ AV1LevelParams *const level_params = &cpi->ppi->level_params;
+ RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg;
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+ FeatureFlags *const features = &cm->features;
+
+  // In the case of LAP, lag_in_frames is set according to the number of lap
+  // buffers calculated at init time. Store LAP's lag_in_frames here and
+  // restore it below to prevent it from being overridden by the new config.
+ int lap_lag_in_frames = -1;
+ if (cpi->ppi->lap_enabled && cpi->compressor_stage == LAP_STAGE) {
+ lap_lag_in_frames = cpi->oxcf.gf_cfg.lag_in_frames;
+ }
+
+ cpi->oxcf = *oxcf;
+
+ av1_update_film_grain_parameters(cpi, oxcf);
+
+ // When user provides superres_mode = AOM_SUPERRES_AUTO, we still initialize
+ // superres mode for current encoding = AOM_SUPERRES_NONE. This is to ensure
+ // that any analysis (e.g. TPL) happening outside the main encoding loop still
+ // happens at full resolution.
+ // This value will later be set appropriately just before main encoding loop.
+ cpi->superres_mode = oxcf->superres_cfg.superres_mode == AOM_SUPERRES_AUTO
+ ? AOM_SUPERRES_NONE
+ : oxcf->superres_cfg.superres_mode; // default
+ x->e_mbd.bd = (int)seq_params->bit_depth;
+ x->e_mbd.global_motion = cm->global_motion;
+
+ memcpy(level_params->target_seq_level_idx, cpi->oxcf.target_seq_level_idx,
+ sizeof(level_params->target_seq_level_idx));
+ level_params->keep_level_stats = 0;
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+ if (level_params->target_seq_level_idx[i] < SEQ_LEVELS ||
+ level_params->target_seq_level_idx[i] == SEQ_LEVEL_KEEP_STATS) {
+ level_params->keep_level_stats |= 1u << i;
+ if (!level_params->level_info[i]) {
+ CHECK_MEM_ERROR(cm, level_params->level_info[i],
+ aom_calloc(1, sizeof(*level_params->level_info[i])));
+ }
+ }
+ }
+
+ // TODO(huisu@): level targeting currently only works for the 0th operating
+ // point, so scalable coding is not supported yet.
+ if (level_params->target_seq_level_idx[0] < SEQ_LEVELS) {
+ // Adjust encoder config in order to meet target level.
+ config_target_level(cpi, level_params->target_seq_level_idx[0],
+ seq_params->tier[0]);
+ }
+
+ if (has_no_stats_stage(cpi) && (rc_cfg->mode == AOM_Q)) {
+ p_rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+ } else if (!is_one_pass_rt_params(cpi) ||
+ cm->current_frame.frame_number == 0) {
+    // For rtc mode: the logic for setting baseline_gf_interval is done in
+    // av1_get_one_pass_rt_params(), and it should not be reset here in
+    // change_config(), except right after init_config (first frame).
+ p_rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+ }
+
+ refresh_frame->golden_frame = false;
+ refresh_frame->bwd_ref_frame = false;
+
+ features->refresh_frame_context =
+ (oxcf->tool_cfg.frame_parallel_decoding_mode)
+ ? REFRESH_FRAME_CONTEXT_DISABLED
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ if (oxcf->tile_cfg.enable_large_scale_tile)
+ features->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+ if (x->palette_buffer == NULL) {
+ CHECK_MEM_ERROR(cm, x->palette_buffer,
+ aom_memalign(16, sizeof(*x->palette_buffer)));
+ }
+
+ if (x->tmp_conv_dst == NULL) {
+ CHECK_MEM_ERROR(
+ cm, x->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst)));
+ x->e_mbd.tmp_conv_dst = x->tmp_conv_dst;
+ }
+ // The buffers 'tmp_pred_bufs[]' and 'comp_rd_buffer' are used in inter frames
+ // to store intermediate inter mode prediction results and are not required
+ // for allintra encoding mode. Hence, the memory allocations for these buffers
+ // are avoided for allintra encoding mode.
+ if (cpi->oxcf.kf_cfg.key_freq_max != 0) {
+ if (x->comp_rd_buffer.pred0 == NULL)
+ alloc_compound_type_rd_buffers(cm->error, &x->comp_rd_buffer);
+
+ for (int i = 0; i < 2; ++i) {
+ if (x->tmp_pred_bufs[i] == NULL) {
+ CHECK_MEM_ERROR(cm, x->tmp_pred_bufs[i],
+ aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*x->tmp_pred_bufs[i])));
+ x->e_mbd.tmp_obmc_bufs[i] = x->tmp_pred_bufs[i];
+ }
+ }
+ }
+
+ av1_reset_segment_features(cm);
+
+ av1_set_high_precision_mv(cpi, 1, 0);
+
+ // Under a configuration change, where maximum_buffer_size may change,
+ // keep buffer level clipped to the maximum allowed buffer size.
+ p_rc->bits_off_target =
+ AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size);
+ p_rc->buffer_level = AOMMIN(p_rc->buffer_level, p_rc->maximum_buffer_size);
+
+  // Set up the frame rate and related rate control parameters.
+ av1_new_framerate(cpi, cpi->framerate);
+
+ // Set absolute upper and lower quality limits
+ rc->worst_quality = rc_cfg->worst_allowed_q;
+ rc->best_quality = rc_cfg->best_allowed_q;
+
+ // If lossless has been requested make sure average Q accumulators are reset.
+ if (is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+ int i;
+ for (i = 0; i < FRAME_TYPES; ++i) {
+ p_rc->avg_frame_qindex[i] = 0;
+ }
+ }
+
+ features->interp_filter =
+ oxcf->tile_cfg.enable_large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
+ features->switchable_motion_mode = is_switchable_motion_mode_allowed(
+ features->allow_warped_motion, oxcf->motion_mode_cfg.enable_obmc);
+
+ if (frm_dim_cfg->render_width > 0 && frm_dim_cfg->render_height > 0) {
+ cm->render_width = frm_dim_cfg->render_width;
+ cm->render_height = frm_dim_cfg->render_height;
+ } else {
+ cm->render_width = frm_dim_cfg->width;
+ cm->render_height = frm_dim_cfg->height;
+ }
+ cm->width = frm_dim_cfg->width;
+ cm->height = frm_dim_cfg->height;
+
+ if (cm->width > cpi->data_alloc_width ||
+ cm->height > cpi->data_alloc_height || is_sb_size_changed) {
+ av1_free_context_buffers(cm);
+ av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
+ av1_free_sms_tree(&cpi->td);
+ av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
+ cpi->td.firstpass_ctx = NULL;
+ alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->data_alloc_width = cm->width;
+ cpi->data_alloc_height = cm->height;
+ cpi->frame_size_related_setup_done = false;
+ }
+ av1_update_frame_size(cpi);
+
+ rc->is_src_frame_alt_ref = 0;
+
+ if (!cpi->ppi->rtc_ref.set_ref_frame_config)
+ cpi->ext_flags.refresh_frame.update_pending = 0;
+ cpi->ext_flags.refresh_frame_context_pending = 0;
+
+ if (cpi->ppi->use_svc)
+ av1_update_layer_context_change_config(cpi, rc_cfg->target_bandwidth);
+
+ check_reset_rc_flag(cpi);
+
+  // Restore the value of lag_in_frames for the LAP stage.
+ if (lap_lag_in_frames != -1) {
+ cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames;
+ }
+
+#if CONFIG_REALTIME_ONLY
+ assert(!oxcf->tool_cfg.enable_global_motion);
+ cpi->image_pyramid_levels = 0;
+#else
+ if (oxcf->tool_cfg.enable_global_motion) {
+ cpi->image_pyramid_levels =
+ global_motion_pyr_levels[default_global_motion_method];
+ } else {
+ cpi->image_pyramid_levels = 0;
+ }
+#endif // CONFIG_REALTIME_ONLY
+}
+
+static INLINE void init_frame_info(FRAME_INFO *frame_info,
+ const AV1_COMMON *const cm) {
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ frame_info->frame_width = cm->width;
+ frame_info->frame_height = cm->height;
+ frame_info->mi_cols = mi_params->mi_cols;
+ frame_info->mi_rows = mi_params->mi_rows;
+ frame_info->mb_cols = mi_params->mb_cols;
+ frame_info->mb_rows = mi_params->mb_rows;
+ frame_info->num_mbs = mi_params->MBs;
+ frame_info->bit_depth = seq_params->bit_depth;
+ frame_info->subsampling_x = seq_params->subsampling_x;
+ frame_info->subsampling_y = seq_params->subsampling_y;
+}
+
+static INLINE void init_frame_index_set(FRAME_INDEX_SET *frame_index_set) {
+ frame_index_set->show_frame_count = 0;
+}
+
+static INLINE void update_counters_for_show_frame(AV1_COMP *const cpi) {
+ assert(cpi->common.show_frame);
+ cpi->frame_index_set.show_frame_count++;
+ cpi->common.current_frame.frame_number++;
+}
+
+AV1_PRIMARY *av1_create_primary_compressor(
+ struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers,
+ const AV1EncoderConfig *oxcf) {
+ AV1_PRIMARY *volatile const ppi = aom_memalign(32, sizeof(AV1_PRIMARY));
+ if (!ppi) return NULL;
+ av1_zero(*ppi);
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(ppi->error.jmp)) {
+ ppi->error.setjmp = 0;
+ av1_remove_primary_compressor(ppi);
+ return 0;
+ }
+ ppi->error.setjmp = 1;
+
+ ppi->seq_params_locked = 0;
+ ppi->lap_enabled = num_lap_buffers > 0;
+ ppi->output_pkt_list = pkt_list_head;
+ ppi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+ ppi->frames_left = oxcf->input_cfg.limit;
+ ppi->num_fp_contexts = 1;
+
+ init_config_sequence(ppi, oxcf);
+
+#if CONFIG_ENTROPY_STATS
+ av1_zero(ppi->aggregate_fc);
+#endif // CONFIG_ENTROPY_STATS
+
+ av1_primary_rc_init(oxcf, &ppi->p_rc);
+
+  // Scenecut detection mode 2 is used for two pass, and for LAP when
+  // lag_in_frames >= 33.
+ ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_2;
+ if (ppi->lap_enabled) {
+ if ((num_lap_buffers <
+ (MAX_GF_LENGTH_LAP + SCENE_CUT_KEY_TEST_INTERVAL + 1)) &&
+ num_lap_buffers >= (MAX_GF_LENGTH_LAP + 3)) {
+      /*
+       * For lag in frames >= 19 and < 33, enable scenecut
+       * with limited future frame prediction.
+       */
+ ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_1;
+ } else if (num_lap_buffers < (MAX_GF_LENGTH_LAP + 3)) {
+ // Disable scenecut when lag_in_frames < 19.
+ ppi->p_rc.enable_scenecut_detection = DISABLE_SCENECUT;
+ }
+ }
+
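+// BFP() binds the SAD, variance, sub-pixel variance and batched (x4d/x3d)
+// SAD kernels that motion search uses for block size BT.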
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF, JSDAF, JSVAF) \
+ ppi->fn_ptr[BT].sdf = SDF; \
+ ppi->fn_ptr[BT].sdaf = SDAF; \
+ ppi->fn_ptr[BT].vf = VF; \
+ ppi->fn_ptr[BT].svf = SVF; \
+ ppi->fn_ptr[BT].svaf = SVAF; \
+ ppi->fn_ptr[BT].sdx4df = SDX4DF; \
+ ppi->fn_ptr[BT].jsdaf = JSDAF; \
+ ppi->fn_ptr[BT].jsvaf = JSVAF; \
+ ppi->fn_ptr[BT].sdx3df = SDX3DF;
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
+ BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16,
+ aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16,
+ aom_sad4x16x4d, aom_sad4x16x3d, aom_dist_wtd_sad4x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance4x16)
+
+ BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4,
+ aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4,
+ aom_sad16x4x4d, aom_sad16x4x3d, aom_dist_wtd_sad16x4_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x4)
+
+ BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32,
+ aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32,
+ aom_sad8x32x4d, aom_sad8x32x3d, aom_dist_wtd_sad8x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x32)
+
+ BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8,
+ aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8,
+ aom_sad32x8x4d, aom_sad32x8x3d, aom_dist_wtd_sad32x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x8)
+
+ BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64,
+ aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64,
+ aom_sad16x64x4d, aom_sad16x64x3d, aom_dist_wtd_sad16x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x64)
+
+ BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16,
+ aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16,
+ aom_sad64x16x4d, aom_sad64x16x3d, aom_dist_wtd_sad64x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x16)
+#endif // !CONFIG_REALTIME_ONLY
+
+ BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
+ aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
+ aom_sad128x128x4d, aom_sad128x128x3d, aom_dist_wtd_sad128x128_avg,
+ aom_dist_wtd_sub_pixel_avg_variance128x128)
+
+ BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
+ aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64,
+ aom_sad128x64x4d, aom_sad128x64x3d, aom_dist_wtd_sad128x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance128x64)
+
+ BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
+ aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128,
+ aom_sad64x128x4d, aom_sad64x128x3d, aom_dist_wtd_sad64x128_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x128)
+
+ BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
+ aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16,
+ aom_sad32x16x4d, aom_sad32x16x3d, aom_dist_wtd_sad32x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x16)
+
+ BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32,
+ aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32,
+ aom_sad16x32x4d, aom_sad16x32x3d, aom_dist_wtd_sad16x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x32)
+
+ BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32,
+ aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32,
+ aom_sad64x32x4d, aom_sad64x32x3d, aom_dist_wtd_sad64x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x32)
+
+ BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64,
+ aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64,
+ aom_sad32x64x4d, aom_sad32x64x3d, aom_dist_wtd_sad32x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x64)
+
+ BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32,
+ aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32,
+ aom_sad32x32x4d, aom_sad32x32x3d, aom_dist_wtd_sad32x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x32)
+
+ BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64,
+ aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64,
+ aom_sad64x64x4d, aom_sad64x64x3d, aom_dist_wtd_sad64x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x64)
+
+ BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16,
+ aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16,
+ aom_sad16x16x4d, aom_sad16x16x3d, aom_dist_wtd_sad16x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x16)
+
+ BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8,
+ aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8,
+ aom_sad16x8x4d, aom_sad16x8x3d, aom_dist_wtd_sad16x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x8)
+
+ BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16,
+ aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16,
+ aom_sad8x16x4d, aom_sad8x16x3d, aom_dist_wtd_sad8x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x16)
+
+ BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8,
+ aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d,
+ aom_sad8x8x3d, aom_dist_wtd_sad8x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x8)
+
+ BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4,
+ aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d,
+ aom_sad8x4x3d, aom_dist_wtd_sad8x4_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x4)
+
+ BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8,
+ aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d,
+ aom_sad4x8x3d, aom_dist_wtd_sad4x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance4x8)
+
+ BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4,
+ aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d,
+ aom_sad4x4x3d, aom_dist_wtd_sad4x4_avg,
+ aom_dist_wtd_sub_pixel_avg_variance4x4)
+
+#if !CONFIG_REALTIME_ONLY
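+// OBFP() binds the OBMC (overlapped block motion compensation) SAD, variance
+// and sub-pixel variance kernels for block size BT.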
+#define OBFP(BT, OSDF, OVF, OSVF) \
+ ppi->fn_ptr[BT].osdf = OSDF; \
+ ppi->fn_ptr[BT].ovf = OVF; \
+ ppi->fn_ptr[BT].osvf = OSVF;
+
+ OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128,
+ aom_obmc_sub_pixel_variance128x128)
+ OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64,
+ aom_obmc_sub_pixel_variance128x64)
+ OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128,
+ aom_obmc_sub_pixel_variance64x128)
+ OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64,
+ aom_obmc_sub_pixel_variance64x64)
+ OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32,
+ aom_obmc_sub_pixel_variance64x32)
+ OBFP(BLOCK_32X64, aom_obmc_sad32x64, aom_obmc_variance32x64,
+ aom_obmc_sub_pixel_variance32x64)
+ OBFP(BLOCK_32X32, aom_obmc_sad32x32, aom_obmc_variance32x32,
+ aom_obmc_sub_pixel_variance32x32)
+ OBFP(BLOCK_32X16, aom_obmc_sad32x16, aom_obmc_variance32x16,
+ aom_obmc_sub_pixel_variance32x16)
+ OBFP(BLOCK_16X32, aom_obmc_sad16x32, aom_obmc_variance16x32,
+ aom_obmc_sub_pixel_variance16x32)
+ OBFP(BLOCK_16X16, aom_obmc_sad16x16, aom_obmc_variance16x16,
+ aom_obmc_sub_pixel_variance16x16)
+ OBFP(BLOCK_16X8, aom_obmc_sad16x8, aom_obmc_variance16x8,
+ aom_obmc_sub_pixel_variance16x8)
+ OBFP(BLOCK_8X16, aom_obmc_sad8x16, aom_obmc_variance8x16,
+ aom_obmc_sub_pixel_variance8x16)
+ OBFP(BLOCK_8X8, aom_obmc_sad8x8, aom_obmc_variance8x8,
+ aom_obmc_sub_pixel_variance8x8)
+ OBFP(BLOCK_4X8, aom_obmc_sad4x8, aom_obmc_variance4x8,
+ aom_obmc_sub_pixel_variance4x8)
+ OBFP(BLOCK_8X4, aom_obmc_sad8x4, aom_obmc_variance8x4,
+ aom_obmc_sub_pixel_variance8x4)
+ OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4,
+ aom_obmc_sub_pixel_variance4x4)
+ OBFP(BLOCK_4X16, aom_obmc_sad4x16, aom_obmc_variance4x16,
+ aom_obmc_sub_pixel_variance4x16)
+ OBFP(BLOCK_16X4, aom_obmc_sad16x4, aom_obmc_variance16x4,
+ aom_obmc_sub_pixel_variance16x4)
+ OBFP(BLOCK_8X32, aom_obmc_sad8x32, aom_obmc_variance8x32,
+ aom_obmc_sub_pixel_variance8x32)
+ OBFP(BLOCK_32X8, aom_obmc_sad32x8, aom_obmc_variance32x8,
+ aom_obmc_sub_pixel_variance32x8)
+ OBFP(BLOCK_16X64, aom_obmc_sad16x64, aom_obmc_variance16x64,
+ aom_obmc_sub_pixel_variance16x64)
+ OBFP(BLOCK_64X16, aom_obmc_sad64x16, aom_obmc_variance64x16,
+ aom_obmc_sub_pixel_variance64x16)
+#endif // !CONFIG_REALTIME_ONLY
+
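+// MBFP() binds the masked-compound SAD and masked sub-pixel variance kernels
+// for block size BT.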
+#define MBFP(BT, MCSDF, MCSVF) \
+ ppi->fn_ptr[BT].msdf = MCSDF; \
+ ppi->fn_ptr[BT].msvf = MCSVF;
+
+ MBFP(BLOCK_128X128, aom_masked_sad128x128,
+ aom_masked_sub_pixel_variance128x128)
+ MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_sub_pixel_variance128x64)
+ MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_sub_pixel_variance64x128)
+ MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_sub_pixel_variance64x64)
+ MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_sub_pixel_variance64x32)
+ MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_sub_pixel_variance32x64)
+ MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_sub_pixel_variance32x32)
+ MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_sub_pixel_variance32x16)
+ MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_sub_pixel_variance16x32)
+ MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_sub_pixel_variance16x16)
+ MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_sub_pixel_variance16x8)
+ MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_sub_pixel_variance8x16)
+ MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_sub_pixel_variance8x8)
+ MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_sub_pixel_variance4x8)
+ MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4)
+ MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4)
+
+#if !CONFIG_REALTIME_ONLY
+ MBFP(BLOCK_4X16, aom_masked_sad4x16, aom_masked_sub_pixel_variance4x16)
+ MBFP(BLOCK_16X4, aom_masked_sad16x4, aom_masked_sub_pixel_variance16x4)
+ MBFP(BLOCK_8X32, aom_masked_sad8x32, aom_masked_sub_pixel_variance8x32)
+ MBFP(BLOCK_32X8, aom_masked_sad32x8, aom_masked_sub_pixel_variance32x8)
+ MBFP(BLOCK_16X64, aom_masked_sad16x64, aom_masked_sub_pixel_variance16x64)
+ MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16)
+#endif
+
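+// SDSFP() binds the downsampled ("skip") SAD kernels, which operate on every
+// other row to cheaply approximate the full SAD.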
+#define SDSFP(BT, SDSF, SDSX4DF) \
+ ppi->fn_ptr[BT].sdsf = SDSF; \
+ ppi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+ SDSFP(BLOCK_128X128, aom_sad_skip_128x128, aom_sad_skip_128x128x4d)
+ SDSFP(BLOCK_128X64, aom_sad_skip_128x64, aom_sad_skip_128x64x4d)
+ SDSFP(BLOCK_64X128, aom_sad_skip_64x128, aom_sad_skip_64x128x4d)
+ SDSFP(BLOCK_64X64, aom_sad_skip_64x64, aom_sad_skip_64x64x4d)
+ SDSFP(BLOCK_64X32, aom_sad_skip_64x32, aom_sad_skip_64x32x4d)
+
+ SDSFP(BLOCK_32X64, aom_sad_skip_32x64, aom_sad_skip_32x64x4d)
+ SDSFP(BLOCK_32X32, aom_sad_skip_32x32, aom_sad_skip_32x32x4d)
+ SDSFP(BLOCK_32X16, aom_sad_skip_32x16, aom_sad_skip_32x16x4d)
+
+ SDSFP(BLOCK_16X32, aom_sad_skip_16x32, aom_sad_skip_16x32x4d)
+ SDSFP(BLOCK_16X16, aom_sad_skip_16x16, aom_sad_skip_16x16x4d)
+ SDSFP(BLOCK_16X8, aom_sad_skip_16x8, aom_sad_skip_16x8x4d)
+ SDSFP(BLOCK_8X16, aom_sad_skip_8x16, aom_sad_skip_8x16x4d)
+ SDSFP(BLOCK_8X8, aom_sad_skip_8x8, aom_sad_skip_8x8x4d)
+
+ SDSFP(BLOCK_4X8, aom_sad_skip_4x8, aom_sad_skip_4x8x4d)
+
+#if !CONFIG_REALTIME_ONLY
+ SDSFP(BLOCK_64X16, aom_sad_skip_64x16, aom_sad_skip_64x16x4d)
+ SDSFP(BLOCK_16X64, aom_sad_skip_16x64, aom_sad_skip_16x64x4d)
+ SDSFP(BLOCK_32X8, aom_sad_skip_32x8, aom_sad_skip_32x8x4d)
+ SDSFP(BLOCK_8X32, aom_sad_skip_8x32, aom_sad_skip_8x32x4d)
+ SDSFP(BLOCK_4X16, aom_sad_skip_4x16, aom_sad_skip_4x16x4d)
+#endif
+#undef SDSFP
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ highbd_set_var_fns(ppi);
+#endif
+
+ {
+    // As cm->mi_params is a part of the frame-level context (cpi), it is
+    // unavailable at this point. mi_params is created as a local temporary
+    // variable, to be passed into the functions used for allocating tpl
+    // buffers. The values in this variable are populated according to the
+    // initial width and height of the frame.
+ CommonModeInfoParams mi_params;
+ enc_set_mb_mi(&mi_params, oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+ BLOCK_4X4);
+
+ const BLOCK_SIZE bsize = BLOCK_16X16;
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ const int num_cols = (mi_params.mi_cols + w - 1) / w;
+ const int num_rows = (mi_params.mi_rows + h - 1) / h;
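+    // One scaling factor is stored per 16x16 area; e.g. a 1920x1080 frame
+    // has mi_cols = 480 and w = 4, giving 120 columns by 68 rows.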
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, ppi->tpl_sb_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*ppi->tpl_sb_rdmult_scaling_factors)));
+
+#if CONFIG_INTERNAL_STATS
+ ppi->b_calculate_blockiness = 1;
+ ppi->b_calculate_consistency = 1;
+
+ for (int i = 0; i <= STAT_ALL; i++) {
+ ppi->psnr[0].stat[i] = 0;
+ ppi->psnr[1].stat[i] = 0;
+
+ ppi->fastssim.stat[i] = 0;
+ ppi->psnrhvs.stat[i] = 0;
+ }
+
+ ppi->psnr[0].worst = 100.0;
+ ppi->psnr[1].worst = 100.0;
+ ppi->worst_ssim = 100.0;
+ ppi->worst_ssim_hbd = 100.0;
+
+ ppi->count[0] = 0;
+ ppi->count[1] = 0;
+ ppi->total_bytes = 0;
+
+ if (ppi->b_calculate_psnr) {
+ ppi->total_sq_error[0] = 0;
+ ppi->total_samples[0] = 0;
+ ppi->total_sq_error[1] = 0;
+ ppi->total_samples[1] = 0;
+ ppi->total_recode_hits = 0;
+ ppi->summed_quality = 0;
+ ppi->summed_weights = 0;
+ ppi->summed_quality_hbd = 0;
+ ppi->summed_weights_hbd = 0;
+ }
+
+ ppi->fastssim.worst = 100.0;
+ ppi->psnrhvs.worst = 100.0;
+
+ if (ppi->b_calculate_blockiness) {
+ ppi->total_blockiness = 0;
+ ppi->worst_blockiness = 0.0;
+ }
+
+ ppi->total_inconsistency = 0;
+ ppi->worst_consistency = 100.0;
+ if (ppi->b_calculate_consistency) {
+ AOM_CHECK_MEM_ERROR(&ppi->error, ppi->ssim_vars,
+ aom_malloc(sizeof(*ppi->ssim_vars) * 4 *
+ mi_params.mi_rows * mi_params.mi_cols));
+ }
+#endif
+ }
+
+ ppi->error.setjmp = 0;
+ return ppi;
+}
+
+AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
+ BufferPool *const pool, COMPRESSOR_STAGE stage,
+ int lap_lag_in_frames) {
+ AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP));
+
+ if (!cpi) return NULL;
+
+ av1_zero(*cpi);
+
+ cpi->ppi = ppi;
+
+ AV1_COMMON *volatile const cm = &cpi->common;
+ cm->seq_params = &ppi->seq_params;
+ cm->error =
+ (struct aom_internal_error_info *)aom_calloc(1, sizeof(*cm->error));
+ if (!cm->error) {
+ aom_free(cpi);
+ return NULL;
+ }
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(cm->error->jmp)) {
+ cm->error->setjmp = 0;
+ av1_remove_compressor(cpi);
+ return NULL;
+ }
+
+ cm->error->setjmp = 1;
+ cpi->compressor_stage = stage;
+
+ cpi->do_frame_data_update = true;
+
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+ mi_params->free_mi = enc_free_mi;
+ mi_params->setup_mi = enc_setup_mi;
+ mi_params->set_mb_mi =
+ (oxcf->pass == AOM_RC_FIRST_PASS || cpi->compressor_stage == LAP_STAGE)
+ ? stat_stage_set_mb_mi
+ : enc_set_mb_mi;
+
+ mi_params->mi_alloc_bsize = BLOCK_4X4;
+
+ CHECK_MEM_ERROR(cm, cm->fc,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(
+ cm, cm->default_frame_context,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context)));
+ memset(cm->fc, 0, sizeof(*cm->fc));
+ memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context));
+
+ cpi->common.buffer_pool = pool;
+
+ init_config(cpi, oxcf);
+ if (cpi->compressor_stage == LAP_STAGE) {
+ cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames;
+ }
+
+ av1_rc_init(&cpi->oxcf, &cpi->rc);
+
+ init_frame_info(&cpi->frame_info, cm);
+ init_frame_index_set(&cpi->frame_index_set);
+
+ cm->current_frame.frame_number = 0;
+ cpi->rc.frame_number_encoded = 0;
+ cpi->rc.prev_frame_is_dropped = 0;
+ cpi->rc.max_consec_drop = INT_MAX;
+ cpi->rc.drop_count_consec = 0;
+ cm->current_frame_id = -1;
+ cpi->tile_data = NULL;
+ cpi->last_show_frame_buf = NULL;
+ realloc_segmentation_maps(cpi);
+
+ cpi->refresh_frame.alt_ref_frame = false;
+
+#if CONFIG_SPEED_STATS
+ cpi->tx_search_count = 0;
+#endif // CONFIG_SPEED_STATS
+
+ cpi->time_stamps.first_ts_start = INT64_MAX;
+
+#ifdef OUTPUT_YUV_REC
+ yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+#ifdef OUTPUT_YUV_DENOISED
+ yuv_denoised_file = fopen("denoised.yuv", "wb");
+#endif
+
+#if !CONFIG_REALTIME_ONLY
+ if (is_stat_consumption_stage(cpi)) {
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ const int packets = (int)(oxcf->twopass_stats_in.sz / packet_sz);
+
+ if (!cpi->ppi->lap_enabled) {
+      /* Re-initialize to the stats buffer, populated by the application in
+       * the case of two pass. */
+ cpi->ppi->twopass.stats_buf_ctx->stats_in_start =
+ oxcf->twopass_stats_in.buf;
+ cpi->twopass_frame.stats_in =
+ cpi->ppi->twopass.stats_buf_ctx->stats_in_start;
+ cpi->ppi->twopass.stats_buf_ctx->stats_in_end =
+ &cpi->ppi->twopass.stats_buf_ctx->stats_in_start[packets - 1];
+
+ // The buffer size is packets - 1 because the last packet is total_stats.
+ av1_firstpass_info_init(&cpi->ppi->twopass.firstpass_info,
+ oxcf->twopass_stats_in.buf, packets - 1);
+ av1_init_second_pass(cpi);
+ } else {
+ av1_firstpass_info_init(&cpi->ppi->twopass.firstpass_info, NULL, 0);
+ av1_init_single_pass_lap(cpi);
+ }
+ }
+#endif
+
+ // The buffer "obmc_buffer" is used in inter frames for fast obmc search.
+ // Hence, the memory allocation for the same is avoided for allintra encoding
+ // mode.
+ if (cpi->oxcf.kf_cfg.key_freq_max != 0)
+ alloc_obmc_buffers(&cpi->td.mb.obmc_buffer, cm->error);
+
+ for (int x = 0; x < 2; x++)
+ for (int y = 0; y < 2; y++)
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y],
+ (uint32_t *)aom_malloc(
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0])));
+
+ cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0;
+
+ av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
+ av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
+
+ int max_mi_cols = mi_params->mi_cols;
+ int max_mi_rows = mi_params->mi_rows;
+ if (oxcf->frm_dim_cfg.forced_max_frame_width) {
+ max_mi_cols = size_in_mi(oxcf->frm_dim_cfg.forced_max_frame_width);
+ }
+ if (oxcf->frm_dim_cfg.forced_max_frame_height) {
+ max_mi_rows = size_in_mi(oxcf->frm_dim_cfg.forced_max_frame_height);
+ }
+
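+  // consec_zero_mv is stored at 8x8 granularity: mi units are 4x4, so one
+  // entry covers a 2x2 group of mi units, hence the division by 4.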
+ const int consec_zero_mv_alloc_size = (max_mi_rows * max_mi_cols) >> 2;
+ CHECK_MEM_ERROR(
+ cm, cpi->consec_zero_mv,
+ aom_calloc(consec_zero_mv_alloc_size, sizeof(*cpi->consec_zero_mv)));
+ cpi->consec_zero_mv_alloc_size = consec_zero_mv_alloc_size;
+
+ cpi->mb_weber_stats = NULL;
+ cpi->mb_delta_q = NULL;
+ cpi->palette_pixel_num = 0;
+ cpi->scaled_last_source_available = 0;
+
+ {
+ const BLOCK_SIZE bsize = BLOCK_16X16;
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ const int num_cols = (max_mi_cols + w - 1) / w;
+ const int num_rows = (max_mi_rows + h - 1) / h;
+ CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->ssim_rdmult_scaling_factors)));
+ CHECK_MEM_ERROR(cm, cpi->tpl_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->tpl_rdmult_scaling_factors)));
+ }
+
+#if CONFIG_TUNE_VMAF
+ {
+ const BLOCK_SIZE bsize = BLOCK_64X64;
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ const int num_cols = (mi_params->mi_cols + w - 1) / w;
+ const int num_rows = (mi_params->mi_rows + h - 1) / h;
+ CHECK_MEM_ERROR(cm, cpi->vmaf_info.rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->vmaf_info.rdmult_scaling_factors)));
+ for (int i = 0; i < MAX_ARF_LAYERS; i++) {
+ cpi->vmaf_info.last_frame_unsharp_amount[i] = -1.0;
+ cpi->vmaf_info.last_frame_ysse[i] = -1.0;
+ cpi->vmaf_info.last_frame_vmaf[i] = -1.0;
+ }
+ cpi->vmaf_info.original_qindex = -1;
+ cpi->vmaf_info.vmaf_model = NULL;
+ }
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ {
+ const int w = mi_size_wide[butteraugli_rdo_bsize];
+ const int h = mi_size_high[butteraugli_rdo_bsize];
+ const int num_cols = (mi_params->mi_cols + w - 1) / w;
+ const int num_rows = (mi_params->mi_rows + h - 1) / h;
+ CHECK_MEM_ERROR(
+ cm, cpi->butteraugli_info.rdmult_scaling_factors,
+ aom_malloc(num_rows * num_cols *
+ sizeof(*cpi->butteraugli_info.rdmult_scaling_factors)));
+ memset(&cpi->butteraugli_info.source, 0,
+ sizeof(cpi->butteraugli_info.source));
+ memset(&cpi->butteraugli_info.resized_source, 0,
+ sizeof(cpi->butteraugli_info.resized_source));
+ cpi->butteraugli_info.recon_set = false;
+ }
+#endif
+
+#if CONFIG_SALIENCY_MAP
+ {
+ CHECK_MEM_ERROR(cm, cpi->saliency_map,
+ (uint8_t *)aom_calloc(cm->height * cm->width,
+ sizeof(*cpi->saliency_map)));
+    // Buffer initialization is based on MIN_MIB_SIZE_LOG2 to ensure that the
+    // cpi->sm_scaling_factor buffer is allocated big enough, since the actual
+    // superblock size to be used is not known yet.
+ const int min_mi_w_sb = (1 << MIN_MIB_SIZE_LOG2);
+ const int min_mi_h_sb = (1 << MIN_MIB_SIZE_LOG2);
+ const int max_sb_cols =
+ (cm->mi_params.mi_cols + min_mi_w_sb - 1) / min_mi_w_sb;
+ const int max_sb_rows =
+ (cm->mi_params.mi_rows + min_mi_h_sb - 1) / min_mi_h_sb;
+ CHECK_MEM_ERROR(cm, cpi->sm_scaling_factor,
+ (double *)aom_calloc(max_sb_rows * max_sb_cols,
+ sizeof(*cpi->sm_scaling_factor)));
+ }
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ av1_zero(cpi->partition_stats);
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+ // Initialize the members of DeltaQuantParams with INT_MAX to ensure that
+ // the quantizer tables are correctly initialized using the default deltaq
+ // parameters when av1_init_quantizer is called for the first time.
+ DeltaQuantParams *const prev_deltaq_params =
+ &cpi->enc_quant_dequant_params.prev_deltaq_params;
+ prev_deltaq_params->y_dc_delta_q = INT_MAX;
+ prev_deltaq_params->u_dc_delta_q = INT_MAX;
+ prev_deltaq_params->v_dc_delta_q = INT_MAX;
+ prev_deltaq_params->u_ac_delta_q = INT_MAX;
+ prev_deltaq_params->v_ac_delta_q = INT_MAX;
+
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+ av1_qm_init(&cm->quant_params, av1_num_planes(cm));
+
+ av1_loop_filter_init(cm);
+ cm->superres_scale_denominator = SCALE_NUMERATOR;
+ cm->superres_upscaled_width = oxcf->frm_dim_cfg.width;
+ cm->superres_upscaled_height = oxcf->frm_dim_cfg.height;
+#if !CONFIG_REALTIME_ONLY
+ av1_loop_restoration_precal();
+#endif
+
+ cpi->third_pass_ctx = NULL;
+ if (cpi->oxcf.pass == AOM_RC_THIRD_PASS) {
+ av1_init_thirdpass_ctx(cm, &cpi->third_pass_ctx, NULL);
+ }
+
+ cpi->second_pass_log_stream = NULL;
+ cpi->use_ducky_encode = 0;
+
+ cm->error->setjmp = 0;
+ return cpi;
+}
+
+#if CONFIG_INTERNAL_STATS
+#define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
+
+#define SNPRINT2(H, T, V) \
+ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
+#endif // CONFIG_INTERNAL_STATS
+
+void av1_remove_primary_compressor(AV1_PRIMARY *ppi) {
+ if (!ppi) return;
+#if !CONFIG_REALTIME_ONLY
+ av1_tf_info_free(&ppi->tf_info);
+#endif // !CONFIG_REALTIME_ONLY
+
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+ aom_free(ppi->level_params.level_info[i]);
+ }
+ av1_lookahead_destroy(ppi->lookahead);
+
+ aom_free(ppi->tpl_sb_rdmult_scaling_factors);
+ ppi->tpl_sb_rdmult_scaling_factors = NULL;
+
+ TplParams *const tpl_data = &ppi->tpl_data;
+ aom_free(tpl_data->txfm_stats_list);
+
+ for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
+ aom_free(tpl_data->tpl_stats_pool[frame]);
+ aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]);
+ tpl_data->tpl_stats_pool[frame] = NULL;
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ av1_tpl_dealloc(&tpl_data->tpl_mt_sync);
+#endif
+
+ av1_terminate_workers(ppi);
+ free_thread_data(ppi);
+
+ aom_free(ppi->p_mt_info.tile_thr_data);
+ ppi->p_mt_info.tile_thr_data = NULL;
+ aom_free(ppi->p_mt_info.workers);
+ ppi->p_mt_info.workers = NULL;
+ ppi->p_mt_info.num_workers = 0;
+
+ aom_free(ppi);
+}
+
+void av1_remove_compressor(AV1_COMP *cpi) {
+ if (!cpi) return;
+#if CONFIG_RATECTRL_LOG
+ if (cpi->oxcf.pass == 3) {
+ rc_log_show(&cpi->rc_log);
+ }
+#endif // CONFIG_RATECTRL_LOG
+
+ AV1_COMMON *cm = &cpi->common;
+ if (cm->current_frame.frame_number > 0) {
+#if CONFIG_SPEED_STATS
+ if (!is_stat_generation_stage(cpi)) {
+ fprintf(stdout, "tx_search_count = %d\n", cpi->tx_search_count);
+ }
+#endif // CONFIG_SPEED_STATS
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+ if (!is_stat_generation_stage(cpi)) {
+ av1_print_fr_partition_timing_stats(&cpi->partition_stats,
+ "fr_part_timing_data.csv");
+ }
+#endif
+ }
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ av1_denoiser_free(&(cpi->denoiser));
+#endif
+
+ if (cm->error) {
+ // Help detect use after free of the error detail string.
+ memset(cm->error->detail, 'A', sizeof(cm->error->detail) - 1);
+ cm->error->detail[sizeof(cm->error->detail) - 1] = '\0';
+ aom_free(cm->error);
+ }
+ aom_free(cpi->td.tctx);
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *const enc_row_mt_mutex_ = mt_info->enc_row_mt.mutex_;
+ pthread_cond_t *const enc_row_mt_cond_ = mt_info->enc_row_mt.cond_;
+ pthread_mutex_t *const gm_mt_mutex_ = mt_info->gm_sync.mutex_;
+ pthread_mutex_t *const tpl_error_mutex_ = mt_info->tpl_row_mt.mutex_;
+ pthread_mutex_t *const pack_bs_mt_mutex_ = mt_info->pack_bs_sync.mutex_;
+ if (enc_row_mt_mutex_ != NULL) {
+ pthread_mutex_destroy(enc_row_mt_mutex_);
+ aom_free(enc_row_mt_mutex_);
+ }
+ if (enc_row_mt_cond_ != NULL) {
+ pthread_cond_destroy(enc_row_mt_cond_);
+ aom_free(enc_row_mt_cond_);
+ }
+ if (gm_mt_mutex_ != NULL) {
+ pthread_mutex_destroy(gm_mt_mutex_);
+ aom_free(gm_mt_mutex_);
+ }
+ if (tpl_error_mutex_ != NULL) {
+ pthread_mutex_destroy(tpl_error_mutex_);
+ aom_free(tpl_error_mutex_);
+ }
+ if (pack_bs_mt_mutex_ != NULL) {
+ pthread_mutex_destroy(pack_bs_mt_mutex_);
+ aom_free(pack_bs_mt_mutex_);
+ }
+#endif
+ av1_row_mt_mem_dealloc(cpi);
+
+ if (mt_info->num_workers > 1) {
+ av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync);
+ av1_loop_filter_dealloc(&mt_info->lf_row_sync);
+ av1_cdef_mt_dealloc(&mt_info->cdef_sync);
+#if !CONFIG_REALTIME_ONLY
+ av1_loop_restoration_dealloc(&mt_info->lr_row_sync);
+ av1_tf_mt_dealloc(&mt_info->tf_sync);
+#endif
+ }
+
+ av1_free_thirdpass_ctx(cpi->third_pass_ctx);
+
+ av1_close_second_pass_log(cpi);
+
+ dealloc_compressor_data(cpi);
+
+ av1_ext_part_delete(&cpi->ext_part_controller);
+
+ av1_remove_common(cm);
+
+ aom_free(cpi);
+
+#ifdef OUTPUT_YUV_REC
+ fclose(yuv_rec_file);
+#endif
+
+#ifdef OUTPUT_YUV_DENOISED
+ fclose(yuv_denoised_file);
+#endif
+}
+
+static void generate_psnr_packet(AV1_COMP *cpi) {
+ struct aom_codec_cx_pkt pkt;
+ int i;
+ PSNR_STATS psnr;
+#if CONFIG_AV1_HIGHBITDEPTH
+ const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+ const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+ aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr,
+ bit_depth, in_bit_depth);
+#else
+ aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+#endif
+
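+  // Index 0 holds the combined value across all planes; indices 1..3 are the
+  // Y, U and V planes.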
+ for (i = 0; i < 4; ++i) {
+ pkt.data.psnr.samples[i] = psnr.samples[i];
+ pkt.data.psnr.sse[i] = psnr.sse[i];
+ pkt.data.psnr.psnr[i] = psnr.psnr[i];
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) &&
+ (in_bit_depth < bit_depth)) {
+ for (i = 0; i < 4; ++i) {
+ pkt.data.psnr.samples_hbd[i] = psnr.samples_hbd[i];
+ pkt.data.psnr.sse_hbd[i] = psnr.sse_hbd[i];
+ pkt.data.psnr.psnr_hbd[i] = psnr.psnr_hbd[i];
+ }
+ }
+#endif
+
+ pkt.kind = AOM_CODEC_PSNR_PKT;
+ aom_codec_pkt_list_add(cpi->ppi->output_pkt_list, &pkt);
+}
+
+int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags) {
+ if (ref_frame_flags > ((1 << INTER_REFS_PER_FRAME) - 1)) return -1;
+
+ *ext_ref_frame_flags = ref_frame_flags;
+ return 0;
+}
+
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx);
+ if (cfg) {
+ aom_yv12_copy_frame(cfg, sd, num_planes);
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx);
+ if (cfg) {
+ aom_yv12_copy_frame(sd, cfg, num_planes);
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+#ifdef OUTPUT_YUV_REC
+void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
+ uint8_t *src = s->y_buffer;
+ int h = cm->height;
+ if (yuv_rec_file == NULL) return;
+ if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
+
+ do {
+ fwrite(src16, s->y_width, 2, yuv_rec_file);
+ src16 += s->y_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->u_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->v_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);
+ return;
+ }
+
+ do {
+ fwrite(src, s->y_width, 1, yuv_rec_file);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);
+}
+#endif // OUTPUT_YUV_REC
+
+void av1_set_mv_search_params(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
+ const int max_mv_def = AOMMAX(cm->width, cm->height);
+
+ // Default based on max resolution.
+ mv_search_params->mv_step_param = av1_init_search_range(max_mv_def);
+
+ if (cpi->sf.mv_sf.auto_mv_step_size) {
+ if (frame_is_intra_only(cm)) {
+ // Initialize max_mv_magnitude for use in the first INTER frame
+ // after a key/intra-only frame.
+ mv_search_params->max_mv_magnitude = max_mv_def;
+ } else {
+ // Use adaptive mv steps based on previous frame stats for show frames and
+ // internal arfs.
+ FRAME_UPDATE_TYPE cur_update_type =
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+ int use_auto_mv_step =
+ (cm->show_frame || cur_update_type == INTNL_ARF_UPDATE) &&
+ mv_search_params->max_mv_magnitude != -1 &&
+ cpi->sf.mv_sf.auto_mv_step_size >= 2;
+ if (use_auto_mv_step) {
+ // Allow mv_steps to correspond to twice the max mv magnitude found
+ // in the previous frame, capped by the default max_mv_magnitude based
+ // on resolution.
+ mv_search_params->mv_step_param = av1_init_search_range(
+ AOMMIN(max_mv_def, 2 * mv_search_params->max_mv_magnitude));
+ }
+ // Reset max_mv_magnitude based on update flag.
+ if (cpi->do_frame_data_update) mv_search_params->max_mv_magnitude = -1;
+ }
+ }
+}
+
+void av1_set_screen_content_options(AV1_COMP *cpi, FeatureFlags *features) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+ if (cm->seq_params->force_screen_content_tools != 2) {
+ features->allow_screen_content_tools = features->allow_intrabc =
+ cm->seq_params->force_screen_content_tools;
+ return;
+ }
+
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ features->allow_screen_content_tools = 1;
+ features->allow_intrabc = cpi->oxcf.mode == REALTIME ? 0 : 1;
+ cpi->is_screen_content_type = 1;
+ cpi->use_screen_content_tools = 1;
+ return;
+ }
+
+ if (cpi->oxcf.mode == REALTIME) {
+ features->allow_screen_content_tools = features->allow_intrabc = 0;
+ return;
+ }
+
+  // Screen content tools are not evaluated in non-RD encoding mode when the
+  // content type has not been set explicitly, i.e., when
+  // cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN, use_nonrd_pick_mode = 1
+  // and hybrid_intra_pickmode = 0. Hence, screen content detection is
+  // disabled here.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+ !cpi->sf.rt_sf.hybrid_intra_pickmode) {
+ features->allow_screen_content_tools = features->allow_intrabc = 0;
+ return;
+ }
+
+ // Estimate if the source frame is screen content, based on the portion of
+ // blocks that have few luma colors.
+ const uint8_t *src = cpi->unfiltered_source->y_buffer;
+ assert(src != NULL);
+ const int use_hbd = cpi->unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int stride = cpi->unfiltered_source->y_stride;
+ const int width = cpi->unfiltered_source->y_width;
+ const int height = cpi->unfiltered_source->y_height;
+ const int64_t area = (int64_t)width * height;
+ const int bd = cm->seq_params->bit_depth;
+ const int blk_w = 16;
+ const int blk_h = 16;
+ // These threshold values are selected experimentally.
+ const int color_thresh = 4;
+ const unsigned int var_thresh = 0;
+ // Counts of blocks with no more than color_thresh colors.
+ int64_t counts_1 = 0;
+ // Counts of blocks with no more than color_thresh colors and variance larger
+ // than var_thresh.
+ int64_t counts_2 = 0;
+
+ for (int r = 0; r + blk_h <= height; r += blk_h) {
+ for (int c = 0; c + blk_w <= width; c += blk_w) {
+ int count_buf[1 << 8]; // Maximum (1 << 8) bins for hbd path.
+ const uint8_t *const this_src = src + r * stride + c;
+ int n_colors;
+ if (use_hbd)
+ av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd, NULL,
+ count_buf, &n_colors, NULL);
+ else
+ av1_count_colors(this_src, stride, blk_w, blk_h, count_buf, &n_colors);
+ if (n_colors > 1 && n_colors <= color_thresh) {
+ ++counts_1;
+ struct buf_2d buf;
+ buf.stride = stride;
+ buf.buf = (uint8_t *)this_src;
+ const unsigned int var = av1_get_perpixel_variance(
+ cpi, xd, &buf, BLOCK_16X16, AOM_PLANE_Y, use_hbd);
+ if (var > var_thresh) ++counts_2;
+ }
+ }
+ }
+
+ // The threshold values are selected experimentally.
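+  // i.e. allow screen content tools when low-color blocks cover more than
+  // 1/10 of the frame area, and allow IntraBC when high-variance low-color
+  // blocks cover more than 1/12 of it.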
+ features->allow_screen_content_tools = counts_1 * blk_h * blk_w * 10 > area;
+  // IntraBC would force loop filters off, so we use stricter rules that also
+  // require that the block has high variance.
+ features->allow_intrabc = features->allow_screen_content_tools &&
+ counts_2 * blk_h * blk_w * 12 > area;
+ cpi->use_screen_content_tools = features->allow_screen_content_tools;
+ cpi->is_screen_content_type =
+ features->allow_intrabc || (counts_1 * blk_h * blk_w * 10 > area * 4 &&
+ counts_2 * blk_h * blk_w * 30 > area);
+}
+
+static void init_motion_estimation(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
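+  // Round the width up to a multiple of 8 before deriving the luma stride.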
+ const int aligned_width = (cm->width + 7) & ~7;
+ const int y_stride =
+ aom_calc_y_stride(aligned_width, cpi->oxcf.border_in_pixels);
+ const int y_stride_src = ((cpi->oxcf.frm_dim_cfg.width != cm->width ||
+ cpi->oxcf.frm_dim_cfg.height != cm->height) ||
+ av1_superres_scaled(cm))
+ ? y_stride
+ : cpi->ppi->lookahead->buf->img.y_stride;
+ int fpf_y_stride =
+ cm->cur_frame != NULL ? cm->cur_frame->buf.y_stride : y_stride;
+
+  // Update if search_site_cfg is uninitialized or the current frame has a new
+  // stride.
+ const int should_update =
+ !mv_search_params->search_site_cfg[SS_CFG_SRC][DIAMOND].stride ||
+ !mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD][DIAMOND].stride ||
+ (y_stride !=
+ mv_search_params->search_site_cfg[SS_CFG_SRC][DIAMOND].stride);
+
+ if (!should_update) {
+ return;
+ }
+
+ // Initialization of search_site_cfg for NUM_DISTINCT_SEARCH_METHODS.
+ for (SEARCH_METHODS i = DIAMOND; i < NUM_DISTINCT_SEARCH_METHODS; i++) {
+ const int level = ((i == NSTEP_8PT) || (i == CLAMPED_DIAMOND)) ? 1 : 0;
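+    // level 1 selects the NSTEP_8PT / CLAMPED_DIAMOND variants of the base
+    // search-site layouts.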
+ av1_init_motion_compensation[i](
+ &mv_search_params->search_site_cfg[SS_CFG_SRC][i], y_stride, level);
+ av1_init_motion_compensation[i](
+ &mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD][i], y_stride_src,
+ level);
+ }
+
+ // First pass search site config initialization.
+ av1_init_motion_fpf(&mv_search_params->search_site_cfg[SS_CFG_FPF][DIAMOND],
+ fpf_y_stride);
+ for (SEARCH_METHODS i = NSTEP; i < NUM_DISTINCT_SEARCH_METHODS; i++) {
+ memcpy(&mv_search_params->search_site_cfg[SS_CFG_FPF][i],
+ &mv_search_params->search_site_cfg[SS_CFG_FPF][DIAMOND],
+ sizeof(search_site_config));
+ }
+}
+
+static void init_ref_frame_bufs(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int i;
+ if (cm->cur_frame) {
+ cm->cur_frame->ref_count--;
+ cm->cur_frame = NULL;
+ }
+ for (i = 0; i < REF_FRAMES; ++i) {
+ if (cm->ref_frame_map[i]) {
+ cm->ref_frame_map[i]->ref_count--;
+ cm->ref_frame_map[i] = NULL;
+ }
+ }
+#ifndef NDEBUG
+ BufferPool *const pool = cm->buffer_pool;
+ for (i = 0; i < pool->num_frame_bufs; ++i) {
+ assert(pool->frame_bufs[i].ref_count == 0);
+ }
+#endif
+}
+
+// TODO(chengchen): consider renaming this function, as it is needed by the
+// encoder to set up critical parameters but no longer deals with the initial
+// width.
+aom_codec_err_t av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
+ int subsampling_x, int subsampling_y) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = cm->seq_params;
+
+ if (!cpi->frame_size_related_setup_done ||
+ seq_params->use_highbitdepth != use_highbitdepth ||
+ seq_params->subsampling_x != subsampling_x ||
+ seq_params->subsampling_y != subsampling_y) {
+ seq_params->subsampling_x = subsampling_x;
+ seq_params->subsampling_y = subsampling_y;
+ seq_params->use_highbitdepth = use_highbitdepth;
+
+ av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed);
+ av1_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed);
+
+ if (!is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+ if (!av1_tf_info_alloc(&cpi->ppi->tf_info, cpi))
+ return AOM_CODEC_MEM_ERROR;
+#endif // !CONFIG_REALTIME_ONLY
+ }
+ init_ref_frame_bufs(cpi);
+
+ init_motion_estimation(cpi); // TODO(agrange) This can be removed.
+
+ cpi->initial_mbs = cm->mi_params.MBs;
+ cpi->frame_size_related_setup_done = true;
+ }
+ return AOM_CODEC_OK;
+}
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+static void setup_denoiser_buffer(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (cpi->oxcf.noise_sensitivity > 0 &&
+ !cpi->denoiser.frame_buffer_initialized) {
+ if (av1_denoiser_alloc(
+ cm, &cpi->svc, &cpi->denoiser, cpi->ppi->use_svc,
+ cpi->oxcf.noise_sensitivity, cm->width, cm->height,
+ cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+ cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate denoiser");
+ }
+}
+#endif
+
+// Returns 1 if the requested width or height was <= 0.
+static int set_size_literal(AV1_COMP *cpi, int width, int height) {
+ AV1_COMMON *cm = &cpi->common;
+ aom_codec_err_t err = av1_check_initial_width(
+ cpi, cm->seq_params->use_highbitdepth, cm->seq_params->subsampling_x,
+ cm->seq_params->subsampling_y);
+ if (err != AOM_CODEC_OK) {
+ aom_internal_error(cm->error, err, "av1_check_initial_width() failed");
+ }
+
+ if (width <= 0 || height <= 0) return 1;
+
+ cm->width = width;
+ cm->height = height;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ setup_denoiser_buffer(cpi);
+#endif
+
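+  // Reallocate the per-frame data only when the new dimensions exceed what
+  // was previously allocated; shrinking reuses the existing buffers.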
+ if (cm->width > cpi->data_alloc_width ||
+ cm->height > cpi->data_alloc_height) {
+ av1_free_context_buffers(cm);
+ av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
+ av1_free_sms_tree(&cpi->td);
+ av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
+ cpi->td.firstpass_ctx = NULL;
+ alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->data_alloc_width = cm->width;
+ cpi->data_alloc_height = cm->height;
+ cpi->frame_size_related_setup_done = false;
+ }
+ alloc_mb_mode_info_buffers(cpi);
+ av1_update_frame_size(cpi);
+
+ return 0;
+}
+
+void av1_set_frame_size(AV1_COMP *cpi, int width, int height) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ int ref_frame;
+
+ if (width != cm->width || height != cm->height) {
+ // There has been a change in the encoded frame size
+ set_size_literal(cpi, width, height);
+ // Recalculate 'all_lossless' in case super-resolution was (un)selected.
+ cm->features.all_lossless =
+ cm->features.coded_lossless && !av1_superres_scaled(cm);
+
+ av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ // Reset the denoiser on the resized frame.
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ av1_denoiser_free(&(cpi->denoiser));
+ setup_denoiser_buffer(cpi);
+ }
+#endif
+ }
+ if (is_stat_consumption_stage(cpi)) {
+ av1_set_target_rate(cpi, cm->width, cm->height);
+ }
+
+ alloc_frame_mvs(cm, cm->cur_frame);
+
+ // Allocate above context buffers
+ CommonContexts *const above_contexts = &cm->above_contexts;
+ if (above_contexts->num_planes < av1_num_planes(cm) ||
+ above_contexts->num_mi_cols < cm->mi_params.mi_cols ||
+ above_contexts->num_tile_rows < cm->tiles.rows) {
+ av1_free_above_context_buffers(above_contexts);
+ if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows,
+ cm->mi_params.mi_cols,
+ av1_num_planes(cm)))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate context buffers");
+ }
+
+ AV1EncoderConfig *oxcf = &cpi->oxcf;
+ oxcf->border_in_pixels = av1_get_enc_border_size(
+ av1_is_resize_needed(oxcf), oxcf->kf_cfg.key_freq_max == 0,
+ cm->seq_params->sb_size);
+
+ // Reset the frame pointers to the current frame size.
+ if (aom_realloc_frame_buffer(
+ &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+ NULL, cpi->image_pyramid_levels, 0))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+
+ if (!is_stat_generation_stage(cpi)) av1_init_cdef_worker(cpi);
+
+#if !CONFIG_REALTIME_ONLY
+ if (is_restoration_used(cm)) {
+ for (int i = 0; i < num_planes; ++i)
+ cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
+
+ const bool is_sgr_enabled = !cpi->sf.lpf_sf.disable_sgr_filter;
+ av1_alloc_restoration_buffers(cm, is_sgr_enabled);
+ // Store the allocated restoration buffers in MT object.
+ if (cpi->ppi->p_mt_info.num_workers > 1) {
+ av1_init_lr_mt_buffers(cpi);
+ }
+ }
+#endif
+
+ init_motion_estimation(cpi);
+
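+  // Set up scale factors that map each reference frame's size to the current
+  // coded size. A reference is usable only if its scaling ratio falls within
+  // the range permitted by av1_is_valid_scale().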
+ int has_valid_ref_frame = 0;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ if (buf != NULL) {
+ struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame);
+ av1_setup_scale_factors_for_frame(sf, buf->buf.y_crop_width,
+ buf->buf.y_crop_height, cm->width,
+ cm->height);
+ has_valid_ref_frame |= av1_is_valid_scale(sf);
+ if (av1_is_scaled(sf)) aom_extend_frame_borders(&buf->buf, num_planes);
+ }
+ }
+ if (!frame_is_intra_only(cm) && !has_valid_ref_frame) {
+ aom_internal_error(
+ cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Can't find at least one reference frame with valid size");
+ }
+
+ av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height,
+ cm->width, cm->height);
+
+ set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+}
+
+static INLINE int extend_borders_mt(const AV1_COMP *cpi,
+ MULTI_THREADED_MODULES stage, int plane) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (cpi->mt_info.num_mod_workers[stage] < 2) return 0;
+ switch (stage) {
+ // TODO(deepa.kg@ittiam.com): When cdef and loop-restoration are disabled,
+ // multi-thread frame border extension along with loop filter frame.
+ // As loop-filtering of a superblock row modifies the pixels of the
+ // above superblock row, border extension requires that loop filtering
+ // of the current and above superblock row is complete.
+ case MOD_LPF: return 0;
+ case MOD_CDEF:
+ return is_cdef_used(cm) && !cpi->ppi->rtc_ref.non_reference_frame &&
+ !is_restoration_used(cm) && !av1_superres_scaled(cm);
+ case MOD_LR:
+ return is_restoration_used(cm) &&
+ (cm->rst_info[plane].frame_restoration_type != RESTORE_NONE);
+ default: assert(0);
+ }
+ return 0;
+}
+
+/*!\brief Select and apply CDEF filters and switchable restoration filters
+ *
+ * \ingroup high_level_algo
+ */
+static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm,
+ MACROBLOCKD *xd, int use_restoration,
+ int use_cdef,
+ unsigned int skip_apply_postproc_filters) {
+#if !CONFIG_REALTIME_ONLY
+ if (use_restoration)
+ av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 0);
+#else
+ (void)use_restoration;
+#endif
+
+ if (use_cdef) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, cdef_time);
+#endif
+ const int num_workers = cpi->mt_info.num_mod_workers[MOD_CDEF];
+ // Find CDEF parameters
+ av1_cdef_search(cpi);
+
+ // Apply the filter
+ if ((skip_apply_postproc_filters & SKIP_APPLY_CDEF) == 0) {
+ assert(!cpi->ppi->rtc_ref.non_reference_frame);
+ if (num_workers > 1) {
+ // Extension of frame borders is multi-threaded along with cdef.
+ const int do_extend_border =
+ extend_borders_mt(cpi, MOD_CDEF, /* plane */ 0);
+ av1_cdef_frame_mt(cm, xd, cpi->mt_info.cdef_worker,
+ cpi->mt_info.workers, &cpi->mt_info.cdef_sync,
+ num_workers, av1_cdef_init_fb_row_mt,
+ do_extend_border);
+ } else {
+ av1_cdef_frame(&cm->cur_frame->buf, cm, xd, av1_cdef_init_fb_row);
+ }
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, cdef_time);
+#endif
+ }
+
+ const int use_superres = av1_superres_scaled(cm);
+ if (use_superres) {
+ if ((skip_apply_postproc_filters & SKIP_APPLY_SUPERRES) == 0) {
+ av1_superres_post_encode(cpi);
+ }
+ }
+
+#if !CONFIG_REALTIME_ONLY
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, loop_restoration_time);
+#endif
+ if (use_restoration) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ const int num_workers = mt_info->num_mod_workers[MOD_LR];
+ av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1);
+ av1_pick_filter_restoration(cpi->source, cpi);
+ if ((skip_apply_postproc_filters & SKIP_APPLY_RESTORATION) == 0 &&
+ (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[2].frame_restoration_type != RESTORE_NONE)) {
+ if (num_workers > 1) {
+ // Extension of frame borders is multi-threaded along with loop
+ // restoration filter.
+ const int do_extend_border = 1;
+ av1_loop_restoration_filter_frame_mt(
+ &cm->cur_frame->buf, cm, 0, mt_info->workers, num_workers,
+ &mt_info->lr_row_sync, &cpi->lr_ctxt, do_extend_border);
+ } else {
+ av1_loop_restoration_filter_frame(&cm->cur_frame->buf, cm, 0,
+ &cpi->lr_ctxt);
+ }
+ }
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, loop_restoration_time);
+#endif
+#endif // !CONFIG_REALTIME_ONLY
+}
+
+static void extend_frame_borders(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ // TODO(debargha): Fix mv search range on encoder side
+ for (int plane = 0; plane < av1_num_planes(cm); ++plane) {
+ const bool extend_border_done = extend_borders_mt(cpi, MOD_CDEF, plane) ||
+ extend_borders_mt(cpi, MOD_LR, plane);
+ if (!extend_border_done) {
+ const YV12_BUFFER_CONFIG *const ybf = &cm->cur_frame->buf;
+ aom_extend_frame_borders_plane_row(ybf, plane, 0,
+ ybf->crop_heights[plane > 0]);
+ }
+ }
+}
+
+/*!\brief Select and apply deblocking filters, cdef filters, and restoration
+ * filters.
+ *
+ * \ingroup high_level_algo
+ */
+static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ const int num_workers = mt_info->num_mod_workers[MOD_LPF];
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+ cpi->td.mb.rdmult = cpi->rd.RDMULT;
+
+ assert(IMPLIES(is_lossless_requested(&cpi->oxcf.rc_cfg),
+ cm->features.coded_lossless && cm->features.all_lossless));
+
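+  // Decide which in-loop filters run for this frame. Each filter's parameter
+  // search still runs, but its application may be skipped via
+  // skip_apply_postproc_filters.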
+ const int use_loopfilter =
+ is_loopfilter_used(cm) && !cpi->mt_info.pipeline_lpf_mt_with_enc;
+ const int use_cdef = is_cdef_used(cm);
+ const int use_superres = av1_superres_scaled(cm);
+ const int use_restoration = is_restoration_used(cm);
+
+ const unsigned int skip_apply_postproc_filters =
+ derive_skip_apply_postproc_filters(cpi, use_loopfilter, use_cdef,
+ use_superres, use_restoration);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, loop_filter_time);
+#endif
+ if (use_loopfilter) {
+ av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_sf.lpf_pick);
+ struct loopfilter *lf = &cm->lf;
+ if ((lf->filter_level[0] || lf->filter_level[1]) &&
+ (skip_apply_postproc_filters & SKIP_APPLY_LOOPFILTER) == 0) {
+ assert(!cpi->ppi->rtc_ref.non_reference_frame);
+      // lpf_opt_level = 1: Enables dual/quad loop-filtering. This is set when
+      // the transform size search depth in inter blocks is limited to one, as
+      // quad loop filtering assumes that all the transform blocks within a
+      // 16x8/8x16/16x16 prediction block are of the same size.
+      // lpf_opt_level = 2: Filters both chroma planes together, in addition
+      // to enabling dual/quad loop-filtering. This is enabled when the lpf
+      // pick method is LPF_PICK_FROM_Q, as the u and v plane filter levels
+      // are equal.
+ int lpf_opt_level = get_lpf_opt_level(&cpi->sf);
+ av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0,
+ mt_info->workers, num_workers,
+ &mt_info->lf_row_sync, lpf_opt_level);
+ }
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, loop_filter_time);
+#endif
+
+ cdef_restoration_frame(cpi, cm, xd, use_restoration, use_cdef,
+ skip_apply_postproc_filters);
+}
+
+static void update_motion_stat(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ RATE_CONTROL *const rc = &cpi->rc;
+ SVC *const svc = &cpi->svc;
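+  // avg_cnt_zeromv is the percentage of mi units coded with zero motion in
+  // this frame; avg_frame_low_motion is its running average, weighted 3/4
+  // previous value to 1/4 new value.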
+ const int avg_cnt_zeromv =
+ 100 * cpi->rc.cnt_zeromv / (mi_params->mi_rows * mi_params->mi_cols);
+ if (!cpi->ppi->use_svc ||
+ (cpi->ppi->use_svc &&
+ !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
+ rc->avg_frame_low_motion =
+ (rc->avg_frame_low_motion == 0)
+ ? avg_cnt_zeromv
+ : (3 * rc->avg_frame_low_motion + avg_cnt_zeromv) / 4;
+ // For SVC: set avg_frame_low_motion (only computed on top spatial layer)
+ // to all lower spatial layers.
+ if (cpi->ppi->use_svc &&
+ svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+ for (int i = 0; i < svc->number_spatial_layers - 1; ++i) {
+ const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ RATE_CONTROL *const lrc = &lc->rc;
+ lrc->avg_frame_low_motion = rc->avg_frame_low_motion;
+ }
+ }
+ }
+}
+
+/*!\brief Encode a frame without the recode loop, usually used in one-pass
+ * encoding and realtime coding.
+ *
+ * \ingroup high_level_algo
+ *
+ * \param[in] cpi Top-level encoder structure
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval #AOM_CODEC_ERROR
+ */
+static int encode_without_recode(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const QuantizationCfg *const q_cfg = &cpi->oxcf.q_cfg;
+ SVC *const svc = &cpi->svc;
+ const int resize_pending = is_frame_resize_pending(cpi);
+ int top_index = 0, bottom_index = 0, q = 0;
+ YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source;
+ InterpFilter filter_scaler =
+ cpi->ppi->use_svc ? svc->downsample_filter_type[svc->spatial_layer_id]
+ : EIGHTTAP_SMOOTH;
+ int phase_scaler = cpi->ppi->use_svc
+ ? svc->downsample_filter_phase[svc->spatial_layer_id]
+ : 0;
+
+ set_size_independent_vars(cpi);
+ av1_setup_frame_size(cpi);
+ cm->prev_frame = get_primary_ref_frame_buf(cm);
+ av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+ av1_set_mv_search_params(cpi);
+
+ if (cm->current_frame.frame_number == 0 &&
+ (cpi->ppi->use_svc || cpi->oxcf.rc_cfg.drop_frames_water_mark > 0) &&
+ cpi->svc.temporal_layer_id == 0) {
+ const SequenceHeader *seq_params = cm->seq_params;
+ if (aom_alloc_frame_buffer(
+ &cpi->svc.source_last_TL0, cpi->oxcf.frm_dim_cfg.width,
+ cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0)) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate buffer for source_last_TL0");
+ }
+ }
+
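+  // For non-SVC, choose the downscaling filter based on the resize ratio.
+  // A phase_scaler of 8 selects a centered (half-sample) sampling offset.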
+ if (!cpi->ppi->use_svc) {
+ phase_scaler = 8;
+ // 2:1 scaling.
+ if ((cm->width << 1) == unscaled->y_crop_width &&
+ (cm->height << 1) == unscaled->y_crop_height) {
+ filter_scaler = BILINEAR;
+ // For lower resolutions use eighttap_smooth.
+ if (cm->width * cm->height <= 320 * 180) filter_scaler = EIGHTTAP_SMOOTH;
+ } else if ((cm->width << 2) == unscaled->y_crop_width &&
+ (cm->height << 2) == unscaled->y_crop_height) {
+ // 4:1 scaling.
+ filter_scaler = EIGHTTAP_SMOOTH;
+ } else if ((cm->width << 2) == 3 * unscaled->y_crop_width &&
+ (cm->height << 2) == 3 * unscaled->y_crop_height) {
+ // 4:3 scaling.
+ filter_scaler = EIGHTTAP_REGULAR;
+ }
+ }
+
+ allocate_gradient_info_for_hog(cpi);
+
+ allocate_src_var_of_4x4_sub_block_buf(cpi);
+
+ const SPEED_FEATURES *sf = &cpi->sf;
+ if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION)
+ variance_partition_alloc(cpi);
+
+ if (cm->current_frame.frame_type == KEY_FRAME ||
+ ((sf->inter_sf.extra_prune_warped && cpi->refresh_frame.golden_frame)))
+ copy_frame_prob_info(cpi);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ printf("\n Encoding a frame: \n");
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+ av1_setup_butteraugli_rdmult(cpi);
+ }
+#endif
+
+ cpi->source = av1_realloc_and_scale_if_required(
+ cm, unscaled, &cpi->scaled_source, filter_scaler, phase_scaler, true,
+ false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+ if (frame_is_intra_only(cm) || resize_pending != 0) {
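+    // consec_zero_mv stores one counter per 8x8 block; the mi grid is in
+    // 4x4 units, hence the division by 4.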
+ const int current_size =
+ (cm->mi_params.mi_rows * cm->mi_params.mi_cols) >> 2;
+ if (cpi->consec_zero_mv &&
+ (cpi->consec_zero_mv_alloc_size < current_size)) {
+ aom_free(cpi->consec_zero_mv);
+ cpi->consec_zero_mv_alloc_size = 0;
+ CHECK_MEM_ERROR(cm, cpi->consec_zero_mv,
+ aom_malloc(current_size * sizeof(*cpi->consec_zero_mv)));
+ cpi->consec_zero_mv_alloc_size = current_size;
+ }
+ assert(cpi->consec_zero_mv != NULL);
+ memset(cpi->consec_zero_mv, 0, current_size * sizeof(*cpi->consec_zero_mv));
+ }
+
+ if (cpi->scaled_last_source_available) {
+ cpi->last_source = &cpi->scaled_last_source;
+ cpi->scaled_last_source_available = 0;
+ } else if (cpi->unscaled_last_source != NULL) {
+ cpi->last_source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source, filter_scaler,
+ phase_scaler, true, false, cpi->oxcf.border_in_pixels,
+ cpi->image_pyramid_levels);
+ }
+
+ if (cpi->sf.rt_sf.use_temporal_noise_estimate) {
+ av1_update_noise_estimate(cpi);
+ }
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && cpi->ppi->use_svc)
+ av1_denoiser_reset_on_first_frame(cpi);
+#endif
+
+  // For 1 spatial layer encoding: if the (non-LAST) reference has a
+  // different resolution from the source then disable that reference. This
+  // is to avoid a significant increase in encode time from scaling the
+  // references in av1_scale_references. Note GOLDEN is forced to update on
+  // the (first/trigger) resized frame and ALTREF will be refreshed ~4 frames
+  // later, so both references become available again after a few frames.
+  // For superres: don't disable the golden reference.
+ if (svc->number_spatial_layers == 1) {
+ if (!cpi->oxcf.superres_cfg.enable_superres) {
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) {
+ const YV12_BUFFER_CONFIG *const ref =
+ get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)
+ cpi->ref_frame_flags ^= AOM_GOLD_FLAG;
+ }
+ }
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]) {
+ const YV12_BUFFER_CONFIG *const ref =
+ get_ref_frame_yv12_buf(cm, ALTREF_FRAME);
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)
+ cpi->ref_frame_flags ^= AOM_ALT_FLAG;
+ }
+ }
+
+ int scale_references = 0;
+#if CONFIG_FPMT_TEST
+ scale_references =
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 1 : 0;
+#endif // CONFIG_FPMT_TEST
+ if (scale_references ||
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+ if (!frame_is_intra_only(cm)) {
+ av1_scale_references(cpi, filter_scaler, phase_scaler, 1);
+ }
+ }
+
+ av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
+ q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+ av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+ av1_set_variance_partition_thresholds(cpi, q, 0);
+ av1_setup_frame(cpi);
+
+ // Check if this high_source_sad (scene/slide change) frame should be
+ // encoded at high/max QP, and if so, set the q and adjust some rate
+ // control parameters.
+ if (cpi->sf.rt_sf.overshoot_detection_cbr == FAST_DETECTION_MAXQ &&
+ cpi->rc.high_source_sad) {
+ if (av1_encodedframe_overshoot_cbr(cpi, &q)) {
+ av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
+ q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+ av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+ av1_set_variance_partition_thresholds(cpi, q, 0);
+ if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+ cm->features.primary_ref_frame == PRIMARY_REF_NONE)
+ av1_setup_frame(cpi);
+ }
+ }
+
+ if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ) {
+ suppress_active_map(cpi);
+ av1_cyclic_refresh_setup(cpi);
+ }
+ av1_apply_active_map(cpi);
+ if (cm->seg.enabled) {
+ if (!cm->seg.update_data && cm->prev_frame) {
+ segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ cm->seg.enabled = cm->prev_frame->seg.enabled;
+ } else {
+ av1_calculate_segdata(&cm->seg);
+ }
+ } else {
+ memset(&cm->seg, 0, sizeof(cm->seg));
+ }
+ segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+ cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+  // This is for the RTC temporal filtering case.
+ if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf &&
+ cm->current_frame.frame_type != KEY_FRAME) {
+ const SequenceHeader *seq_params = cm->seq_params;
+
+ if (cpi->orig_source.buffer_alloc_sz == 0 ||
+ cpi->last_source->y_width != cpi->source->y_width ||
+ cpi->last_source->y_height != cpi->source->y_height) {
+ // Allocate a source buffer to store the true source for psnr calculation.
+ if (aom_alloc_frame_buffer(
+ &cpi->orig_source, cpi->oxcf.frm_dim_cfg.width,
+ cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate scaled buffer");
+ }
+
+ aom_yv12_copy_y(cpi->source, &cpi->orig_source);
+ aom_yv12_copy_u(cpi->source, &cpi->orig_source);
+ aom_yv12_copy_v(cpi->source, &cpi->orig_source);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_encode_frame_time);
+#endif
+
+ // Set the motion vector precision based on mv stats from the last coded
+ // frame.
+ if (!frame_is_intra_only(cm)) av1_pick_and_set_high_precision_mv(cpi, q);
+
+  // Transform / motion compensation: build the reconstructed frame.
+ av1_encode_frame(cpi);
+
+ if (!cpi->rc.rtc_external_ratectrl && !frame_is_intra_only(cm))
+ update_motion_stat(cpi);
+
+ // Adjust the refresh of the golden (longer-term) reference based on QP
+ // selected for this frame. This is for CBR with 1 layer/non-svc RTC mode.
+ if (!frame_is_intra_only(cm) && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+ cpi->oxcf.mode == REALTIME && svc->number_spatial_layers == 1 &&
+ svc->number_temporal_layers == 1 && !cpi->rc.rtc_external_ratectrl &&
+ sf->rt_sf.gf_refresh_based_on_qp)
+ av1_adjust_gf_refresh_qp_one_pass_rt(cpi);
+
+ // For non-svc: if scaling is required, copy scaled_source
+ // into scaled_last_source.
+ if (cm->current_frame.frame_number > 1 && !cpi->ppi->use_svc &&
+ cpi->scaled_source.y_buffer != NULL &&
+ cpi->scaled_last_source.y_buffer != NULL &&
+ cpi->scaled_source.y_crop_width == cpi->scaled_last_source.y_crop_width &&
+ cpi->scaled_source.y_crop_height ==
+ cpi->scaled_last_source.y_crop_height &&
+ (cm->width != cpi->unscaled_source->y_crop_width ||
+ cm->height != cpi->unscaled_source->y_crop_height)) {
+ cpi->scaled_last_source_available = 1;
+ aom_yv12_copy_y(&cpi->scaled_source, &cpi->scaled_last_source);
+ aom_yv12_copy_u(&cpi->scaled_source, &cpi->scaled_last_source);
+ aom_yv12_copy_v(&cpi->scaled_source, &cpi->scaled_last_source);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_encode_frame_time);
+#endif
+#if CONFIG_INTERNAL_STATS
+ ++cpi->frame_recode_hits;
+#endif
+
+ return AOM_CODEC_OK;
+}
+
+#if !CONFIG_REALTIME_ONLY
+
+/*!\brief Recode loop for encoding one frame. A frame may be encoded multiple
+ * times in order to approach a target bitrate or to adjust the usage of
+ * global motion.
+ *
+ * \ingroup high_level_algo
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[out]   size            Bitstream size
+ * \param[out]   dest            Bitstream output buffer
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * \retval #AOM_CODEC_ERROR
+ */
+static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ GlobalMotionInfo *const gm_info = &cpi->gm_info;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const QuantizationCfg *const q_cfg = &oxcf->q_cfg;
+ const int allow_recode = (cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE);
+ // Must allow recode if minimum compression ratio is set.
+ assert(IMPLIES(oxcf->rc_cfg.min_cr > 0, allow_recode));
+
+ set_size_independent_vars(cpi);
+ if (is_stat_consumption_stage_twopass(cpi) &&
+ cpi->sf.interp_sf.adaptive_interp_filter_search)
+ cpi->interp_search_flags.interp_filter_search_mask =
+ av1_setup_interp_filter_search_mask(cpi);
+
+ av1_setup_frame_size(cpi);
+
+ if (av1_superres_in_recode_allowed(cpi) &&
+ cpi->superres_mode != AOM_SUPERRES_NONE &&
+ cm->superres_scale_denominator == SCALE_NUMERATOR) {
+ // Superres mode is currently enabled, but the denominator selected will
+ // disable superres. So no need to continue, as we will go through another
+ // recode loop for full-resolution after this anyway.
+ return -1;
+ }
+
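+  // [q_low, q_high] are the active recode search bounds for q. They start at
+  // the rate-control limits for this frame and are narrowed by
+  // recode_loop_update_q() on each iteration.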
+ int top_index = 0, bottom_index = 0;
+ int q = 0, q_low = 0, q_high = 0;
+ av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+ q_low = bottom_index;
+ q_high = top_index;
+
+ av1_set_mv_search_params(cpi);
+
+ allocate_gradient_info_for_hog(cpi);
+
+ allocate_src_var_of_4x4_sub_block_buf(cpi);
+
+ if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION)
+ variance_partition_alloc(cpi);
+
+ if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ printf("\n Encoding a frame: \n");
+#endif
+
+#if !CONFIG_RD_COMMAND
+  // Determine whether to use screen content tools using two fast encoding
+  // passes.
+ if (!cpi->sf.hl_sf.disable_extra_sc_testing && !cpi->use_ducky_encode)
+ av1_determine_sc_tools_with_encoding(cpi, q);
+#endif // !CONFIG_RD_COMMAND
+
+#if CONFIG_TUNE_VMAF
+ if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ av1_vmaf_neg_preprocessing(cpi, cpi->unscaled_source);
+ }
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ cpi->butteraugli_info.recon_set = false;
+ int original_q = 0;
+#endif
+
+ cpi->num_frame_recode = 0;
+
+ // Loop variables
+ int loop = 0;
+ int loop_count = 0;
+ int overshoot_seen = 0;
+ int undershoot_seen = 0;
+ int low_cr_seen = 0;
+ int last_loop_allow_hp = 0;
+
+ do {
+ loop = 0;
+ int do_mv_stats_collection = 1;
+
+    // If the frame was scaled, redo the global motion search if it has
+    // already been done for a different size.
+ if (loop_count > 0 && cpi->source && gm_info->search_done) {
+ if (cpi->source->y_crop_width != cm->width ||
+ cpi->source->y_crop_height != cm->height) {
+ gm_info->search_done = 0;
+ }
+ }
+ cpi->source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_source, &cpi->scaled_source, EIGHTTAP_REGULAR, 0,
+ false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ if (oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+ if (loop_count == 0) {
+ original_q = q;
+        // TODO(sdeng): different q here does not make a big difference. Use a
+        // faster pass instead.
+ q = 96;
+ av1_setup_butteraugli_source(cpi);
+ } else {
+ q = original_q;
+ }
+ }
+#endif
+
+ if (cpi->unscaled_last_source != NULL) {
+ cpi->last_source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+ EIGHTTAP_REGULAR, 0, false, false, cpi->oxcf.border_in_pixels,
+ cpi->image_pyramid_levels);
+ }
+
+ int scale_references = 0;
+#if CONFIG_FPMT_TEST
+ scale_references =
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE ? 1 : 0;
+#endif // CONFIG_FPMT_TEST
+ if (scale_references ||
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+ if (!frame_is_intra_only(cm)) {
+ if (loop_count > 0) {
+ release_scaled_references(cpi);
+ }
+ av1_scale_references(cpi, EIGHTTAP_REGULAR, 0, 0);
+ }
+ }
+
+#if CONFIG_TUNE_VMAF
+ if (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+ oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ cpi->vmaf_info.original_qindex = q;
+ q = av1_get_vmaf_base_qindex(cpi, q);
+ }
+#endif
+
+#if CONFIG_RD_COMMAND
+ RD_COMMAND *rd_command = &cpi->rd_command;
+ RD_OPTION option = rd_command->option_ls[rd_command->frame_index];
+ if (option == RD_OPTION_SET_Q || option == RD_OPTION_SET_Q_RDMULT) {
+ q = rd_command->q_index_ls[rd_command->frame_index];
+ }
+#endif // CONFIG_RD_COMMAND
+
+#if CONFIG_BITRATE_ACCURACY
+#if CONFIG_THREE_PASS
+ if (oxcf->pass == AOM_RC_THIRD_PASS && cpi->vbr_rc_info.ready == 1) {
+ int frame_coding_idx =
+ av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index);
+ if (frame_coding_idx < cpi->vbr_rc_info.total_frame_count) {
+ q = cpi->vbr_rc_info.q_index_list[frame_coding_idx];
+ } else {
+ // TODO(angiebird): Investigate why sometimes there is an extra frame
+ // after the last GOP.
+ q = cpi->vbr_rc_info.base_q_index;
+ }
+ }
+#else
+ if (cpi->vbr_rc_info.q_index_list_ready) {
+ q = cpi->vbr_rc_info.q_index_list[cpi->gf_frame_index];
+ }
+#endif // CONFIG_THREE_PASS
+#endif // CONFIG_BITRATE_ACCURACY
+
+#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+ // TODO(angiebird): Move this into a function.
+ if (oxcf->pass == AOM_RC_THIRD_PASS) {
+ int frame_coding_idx =
+ av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index);
+ double qstep_ratio = cpi->vbr_rc_info.qstep_ratio_list[frame_coding_idx];
+ FRAME_UPDATE_TYPE update_type =
+ cpi->vbr_rc_info.update_type_list[frame_coding_idx];
+ rc_log_frame_encode_param(&cpi->rc_log, frame_coding_idx, qstep_ratio, q,
+ update_type);
+ }
+#endif // CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+
+ if (cpi->use_ducky_encode) {
+ const DuckyEncodeFrameInfo *frame_info =
+ &cpi->ducky_encode_info.frame_info;
+ if (frame_info->qp_mode == DUCKY_ENCODE_FRAME_MODE_QINDEX) {
+ q = frame_info->q_index;
+ cm->delta_q_info.delta_q_present_flag = frame_info->delta_q_enabled;
+ }
+ }
+
+ av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
+ q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+ av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+
+ av1_set_variance_partition_thresholds(cpi, q, 0);
+
+ // printf("Frame %d/%d: q = %d, frame_type = %d superres_denom = %d\n",
+ // cm->current_frame.frame_number, cm->show_frame, q,
+ // cm->current_frame.frame_type, cm->superres_scale_denominator);
+
+ if (loop_count == 0) {
+ av1_setup_frame(cpi);
+ } else if (get_primary_ref_frame_buf(cm) == NULL) {
+ // Base q-index may have changed, so we need to assign proper default coef
+ // probs before every iteration.
+ av1_default_coef_probs(cm);
+ av1_setup_frame_contexts(cm);
+ }
+
+ if (q_cfg->aq_mode == VARIANCE_AQ) {
+ av1_vaq_frame_setup(cpi);
+ } else if (q_cfg->aq_mode == COMPLEXITY_AQ) {
+ av1_setup_in_frame_q_adj(cpi);
+ }
+
+ if (cm->seg.enabled) {
+ if (!cm->seg.update_data && cm->prev_frame) {
+ segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ cm->seg.enabled = cm->prev_frame->seg.enabled;
+ } else {
+ av1_calculate_segdata(&cm->seg);
+ }
+ } else {
+ memset(&cm->seg, 0, sizeof(cm->seg));
+ }
+ segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+ cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_encode_frame_time);
+#endif
+ // Set the motion vector precision based on mv stats from the last coded
+ // frame.
+ if (!frame_is_intra_only(cm)) {
+ av1_pick_and_set_high_precision_mv(cpi, q);
+
+      // If the precision has changed during different iterations of the
+      // loop, then we need to reset the global motion vectors.
+ if (loop_count > 0 &&
+ cm->features.allow_high_precision_mv != last_loop_allow_hp) {
+ gm_info->search_done = 0;
+ }
+ last_loop_allow_hp = cm->features.allow_high_precision_mv;
+ }
+
+    // Transform / motion compensation: build the reconstructed frame.
+ av1_encode_frame(cpi);
+
+ // Disable mv_stats collection for parallel frames based on update flag.
+ if (!cpi->do_frame_data_update) do_mv_stats_collection = 0;
+
+ // Reset the mv_stats in case we are interrupted by an intraframe or an
+ // overlay frame.
+ if (cpi->mv_stats.valid && do_mv_stats_collection) av1_zero(cpi->mv_stats);
+
+ // Gather the mv_stats for the next frame
+ if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA &&
+ av1_frame_allows_smart_mv(cpi) && do_mv_stats_collection) {
+ av1_collect_mv_stats(cpi, q);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_encode_frame_time);
+#endif
+
+#if CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+ const int do_dummy_pack = 1;
+#else  // CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+    // Dummy pack of the bitstream using up-to-date stats to get an
+    // accurate estimate of the output frame size, to determine whether we
+    // need to recode.
+ const int do_dummy_pack =
+ (cpi->sf.hl_sf.recode_loop >= ALLOW_RECODE_KFARFGF &&
+ oxcf->rc_cfg.mode != AOM_Q) ||
+ oxcf->rc_cfg.min_cr > 0;
+#endif  // CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+ if (do_dummy_pack) {
+ av1_finalize_encoded_frame(cpi);
+ int largest_tile_id = 0; // Output from bitstream: unused here
+ rc->coefficient_size = 0;
+ if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+      // Bits used for this frame (*size is in bytes).
+ rc->projected_frame_size = (int)(*size) << 3;
+#if CONFIG_RD_COMMAND
+ PSNR_STATS psnr;
+ aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+ printf("q %d rdmult %d rate %d dist %" PRIu64 "\n", q, cpi->rd.RDMULT,
+ rc->projected_frame_size, psnr.sse[0]);
+ ++rd_command->frame_index;
+ if (rd_command->frame_index == rd_command->frame_count) {
+ return AOM_CODEC_ERROR;
+ }
+#endif // CONFIG_RD_COMMAND
+
+#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+ if (oxcf->pass == AOM_RC_THIRD_PASS) {
+ int frame_coding_idx =
+ av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index);
+ rc_log_frame_entropy(&cpi->rc_log, frame_coding_idx,
+ rc->projected_frame_size, rc->coefficient_size);
+ }
+#endif // CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+ }
+
+#if CONFIG_TUNE_VMAF
+ if (oxcf->tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+ oxcf->tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ q = cpi->vmaf_info.original_qindex;
+ }
+#endif
+ if (allow_recode) {
+ // Update q and decide whether to do a recode loop
+ recode_loop_update_q(cpi, &loop, &q, &q_low, &q_high, top_index,
+ bottom_index, &undershoot_seen, &overshoot_seen,
+ &low_cr_seen, loop_count);
+ }
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ if (loop_count == 0 && oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+ loop = 1;
+ av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.4);
+ }
+#endif
+
+ if (cpi->use_ducky_encode) {
+ // Ducky encode currently does not support recode loop.
+ loop = 0;
+ }
+#if CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+    // Turn off the recode loop when CONFIG_BITRATE_ACCURACY or
+    // CONFIG_RD_COMMAND is on.
+    loop = 0;
+#endif // CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+
+ if (loop) {
+ ++loop_count;
+ cpi->num_frame_recode =
+ (cpi->num_frame_recode < (NUM_RECODES_PER_FRAME - 1))
+ ? (cpi->num_frame_recode + 1)
+ : (NUM_RECODES_PER_FRAME - 1);
+#if CONFIG_INTERNAL_STATS
+ ++cpi->frame_recode_hits;
+#endif
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (loop) printf("\n Recoding:");
+#endif
+ } while (loop);
+
+ return AOM_CODEC_OK;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// TODO(jingning, paulwilkins): Set up high grain level to test
+// hardware decoders. Need to adapt the actual noise variance
+// according to the difference between reconstructed frame and the
+// source signal.
+static void set_grain_syn_params(AV1_COMMON *cm) {
+ aom_film_grain_t *film_grain_params = &cm->film_grain_params;
+ film_grain_params->apply_grain = 1;
+ film_grain_params->update_parameters = 1;
+ film_grain_params->random_seed = rand() & 0xffff;
+
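+  // A single scaling point (128, 100) per plane makes the grain scaling
+  // function effectively constant at strength 100, and ar_coeff_lag = 0
+  // disables autoregressive filtering of the grain pattern.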
+ film_grain_params->num_y_points = 1;
+ film_grain_params->scaling_points_y[0][0] = 128;
+ film_grain_params->scaling_points_y[0][1] = 100;
+
+ if (!cm->seq_params->monochrome) {
+ film_grain_params->num_cb_points = 1;
+ film_grain_params->scaling_points_cb[0][0] = 128;
+ film_grain_params->scaling_points_cb[0][1] = 100;
+
+ film_grain_params->num_cr_points = 1;
+ film_grain_params->scaling_points_cr[0][0] = 128;
+ film_grain_params->scaling_points_cr[0][1] = 100;
+ } else {
+ film_grain_params->num_cb_points = 0;
+ film_grain_params->num_cr_points = 0;
+ }
+
+ film_grain_params->chroma_scaling_from_luma = 0;
+
+ film_grain_params->scaling_shift = 1;
+ film_grain_params->ar_coeff_lag = 0;
+ film_grain_params->ar_coeff_shift = 1;
+ film_grain_params->overlap_flag = 1;
+ film_grain_params->grain_scale_shift = 0;
+}
+
+/*!\brief Recode loop or a single loop for encoding one frame, followed by
+ * in-loop deblocking filters, CDEF filters, and restoration filters.
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[out]   size            Bitstream size
+ * \param[out]   dest            Bitstream output buffer
+ * \param[out]   sse             Total distortion of the frame
+ * \param[out]   rate            Total rate of the frame
+ * \param[out]   largest_tile_id Tile id of the largest tile
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval #AOM_CODEC_ERROR
+ */
+static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size,
+ uint8_t *dest, int64_t *sse,
+ int64_t *rate,
+ int *largest_tile_id) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_with_or_without_recode_time);
+#endif
+ for (int i = 0; i < NUM_RECODES_PER_FRAME; i++) {
+ cpi->do_update_frame_probs_txtype[i] = 0;
+ cpi->do_update_frame_probs_obmc[i] = 0;
+ cpi->do_update_frame_probs_warp[i] = 0;
+ cpi->do_update_frame_probs_interpfilter[i] = 0;
+ }
+
+ cpi->do_update_vbr_bits_off_target_fast = 0;
+ int err;
+#if CONFIG_REALTIME_ONLY
+ err = encode_without_recode(cpi);
+#else
+ if (cpi->sf.hl_sf.recode_loop == DISALLOW_RECODE)
+ err = encode_without_recode(cpi);
+ else
+ err = encode_with_recode_loop(cpi, size, dest);
+#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_with_or_without_recode_time);
+#endif
+ if (err != AOM_CODEC_OK) {
+ if (err == -1) {
+      // Special case, as described in encode_with_recode_loop(): encoding
+      // was skipped.
+ err = AOM_CODEC_OK;
+ if (sse != NULL) *sse = INT64_MAX;
+ if (rate != NULL) *rate = INT64_MAX;
+ *largest_tile_id = 0;
+ }
+ return err;
+ }
+
+#ifdef OUTPUT_YUV_DENOISED
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ if (oxcf->noise_sensitivity > 0 && denoise_svc(cpi)) {
+ aom_write_yuv_frame(yuv_denoised_file,
+ &cpi->denoiser.running_avg_y[INTRA_FRAME]);
+ }
+#endif
+
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = cm->seq_params;
+
+  // Special case code to reduce pulsing when key frames are forced at a
+  // fixed interval. Note the reconstruction error if this is the frame
+  // before the forced key frame.
+ if (cpi->ppi->p_rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (seq_params->use_highbitdepth) {
+ cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
+ } else {
+ cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+ }
+#else
+ cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+ }
+
+ cm->cur_frame->buf.color_primaries = seq_params->color_primaries;
+ cm->cur_frame->buf.transfer_characteristics =
+ seq_params->transfer_characteristics;
+ cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients;
+ cm->cur_frame->buf.monochrome = seq_params->monochrome;
+ cm->cur_frame->buf.chroma_sample_position =
+ seq_params->chroma_sample_position;
+ cm->cur_frame->buf.color_range = seq_params->color_range;
+ cm->cur_frame->buf.render_width = cm->render_width;
+ cm->cur_frame->buf.render_height = cm->render_height;
+
+ if (!cpi->mt_info.pipeline_lpf_mt_with_enc)
+ set_postproc_filter_default_params(&cpi->common);
+
+ if (!cm->features.allow_intrabc) {
+ loopfilter_frame(cpi, cm);
+ }
+
+ if (cpi->oxcf.mode != ALLINTRA && !cpi->ppi->rtc_ref.non_reference_frame) {
+ extend_frame_borders(cpi);
+ }
+
+#ifdef OUTPUT_YUV_REC
+ aom_write_one_yuv_frame(cm, &cm->cur_frame->buf);
+#endif
+
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_FILM) {
+ set_grain_syn_params(cm);
+ }
+
+ av1_finalize_encoded_frame(cpi);
+ // Build the bitstream
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_pack_bitstream_final_time);
+#endif
+ cpi->rc.coefficient_size = 0;
+ if (av1_pack_bitstream(cpi, dest, size, largest_tile_id) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_pack_bitstream_final_time);
+#endif
+
+ // Compute sse and rate.
+ if (sse != NULL) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ *sse = (seq_params->use_highbitdepth)
+ ? aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf)
+ : aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#else
+ *sse = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+ }
+ if (rate != NULL) {
+ const int64_t bits = (*size << 3);
+ *rate = (bits << 5); // To match scale.
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->use_ducky_encode) {
+ PSNR_STATS psnr;
+ aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+ DuckyEncodeFrameResult *frame_result = &cpi->ducky_encode_info.frame_result;
+ frame_result->global_order_idx = cm->cur_frame->display_order_hint;
+ frame_result->q_index = cm->quant_params.base_qindex;
+ frame_result->rdmult = cpi->rd.RDMULT;
+ frame_result->rate = (int)(*size) * 8;
+ frame_result->dist = psnr.sse[0];
+ frame_result->psnr = psnr.psnr[0];
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ return AOM_CODEC_OK;
+}
+
+static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size,
+ uint8_t *dest,
+ int *largest_tile_id) {
+ const AV1_COMMON *const cm = &cpi->common;
+ assert(cm->seq_params->enable_superres);
+ assert(av1_superres_in_recode_allowed(cpi));
+ aom_codec_err_t err = AOM_CODEC_OK;
+ av1_save_all_coding_context(cpi);
+
+ int64_t sse1 = INT64_MAX;
+ int64_t rate1 = INT64_MAX;
+ int largest_tile_id1 = 0;
+ int64_t sse2 = INT64_MAX;
+ int64_t rate2 = INT64_MAX;
+ int largest_tile_id2;
+ double proj_rdcost1 = DBL_MAX;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const FRAME_UPDATE_TYPE update_type =
+ gf_group->update_type[cpi->gf_frame_index];
+ const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth;
+
+ // Encode with superres.
+ if (cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_ALL) {
+ SuperResCfg *const superres_cfg = &cpi->oxcf.superres_cfg;
+ int64_t superres_sses[SCALE_NUMERATOR];
+ int64_t superres_rates[SCALE_NUMERATOR];
+ int superres_largest_tile_ids[SCALE_NUMERATOR];
+    // Use superres for key frames and alt-ref frames only.
+ if (update_type != OVERLAY_UPDATE && update_type != INTNL_OVERLAY_UPDATE) {
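+      // With SCALE_NUMERATOR == 8 this sweeps denominators 9..16, i.e.
+      // horizontal scaling factors from 8/9 down to 8/16 (one half).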
+ for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+ ++denom) {
+ superres_cfg->superres_scale_denominator = denom;
+ superres_cfg->superres_kf_scale_denominator = denom;
+ const int this_index = denom - (SCALE_NUMERATOR + 1);
+
+ cpi->superres_mode = AOM_SUPERRES_AUTO; // Super-res on for this loop.
+ err = encode_with_recode_loop_and_filter(
+ cpi, size, dest, &superres_sses[this_index],
+ &superres_rates[this_index],
+ &superres_largest_tile_ids[this_index]);
+ cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res).
+ if (err != AOM_CODEC_OK) return err;
+ restore_all_coding_context(cpi);
+ }
+ // Reset.
+ superres_cfg->superres_scale_denominator = SCALE_NUMERATOR;
+ superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR;
+ } else {
+ for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+ ++denom) {
+ const int this_index = denom - (SCALE_NUMERATOR + 1);
+ superres_sses[this_index] = INT64_MAX;
+ superres_rates[this_index] = INT64_MAX;
+ }
+ }
+ // Encode without superres.
+ assert(cpi->superres_mode == AOM_SUPERRES_NONE);
+ err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2,
+ &largest_tile_id2);
+ if (err != AOM_CODEC_OK) return err;
+
+    // Note: Both use a common rdmult based on the base qindex of the
+    // full-resolution encode.
+ const int64_t rdmult = av1_compute_rd_mult_based_on_qindex(
+ bit_depth, update_type, cm->quant_params.base_qindex);
+
+ // Find the best rdcost among all superres denoms.
+ int best_denom = -1;
+ for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
+ ++denom) {
+ const int this_index = denom - (SCALE_NUMERATOR + 1);
+ const int64_t this_sse = superres_sses[this_index];
+ const int64_t this_rate = superres_rates[this_index];
+ const int this_largest_tile_id = superres_largest_tile_ids[this_index];
+ const double this_rdcost = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ rdmult, this_rate, this_sse, bit_depth);
+ if (this_rdcost < proj_rdcost1) {
+ sse1 = this_sse;
+ rate1 = this_rate;
+ largest_tile_id1 = this_largest_tile_id;
+ proj_rdcost1 = this_rdcost;
+ best_denom = denom;
+ }
+ }
+ const double proj_rdcost2 =
+ RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate2, sse2, bit_depth);
+ // Re-encode with superres if it's better.
+ if (proj_rdcost1 < proj_rdcost2) {
+ restore_all_coding_context(cpi);
+ // TODO(urvang): We should avoid rerunning the recode loop by saving
+ // previous output+state, or running encode only for the selected 'q' in
+ // previous step.
+ // Again, temporarily force the best denom.
+ superres_cfg->superres_scale_denominator = best_denom;
+ superres_cfg->superres_kf_scale_denominator = best_denom;
+ int64_t sse3 = INT64_MAX;
+ int64_t rate3 = INT64_MAX;
+ cpi->superres_mode =
+ AOM_SUPERRES_AUTO; // Super-res on for this recode loop.
+ err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3,
+ largest_tile_id);
+ cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res).
+ assert(sse1 == sse3);
+ assert(rate1 == rate3);
+ assert(largest_tile_id1 == *largest_tile_id);
+ // Reset.
+ superres_cfg->superres_scale_denominator = SCALE_NUMERATOR;
+ superres_cfg->superres_kf_scale_denominator = SCALE_NUMERATOR;
+ } else {
+ *largest_tile_id = largest_tile_id2;
+ }
+ } else {
+ assert(cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_DUAL);
+ cpi->superres_mode =
+ AOM_SUPERRES_AUTO; // Super-res on for this recode loop.
+ err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse1, &rate1,
+ &largest_tile_id1);
+ cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res).
+ if (err != AOM_CODEC_OK) return err;
+ restore_all_coding_context(cpi);
+ // Encode without superres.
+ assert(cpi->superres_mode == AOM_SUPERRES_NONE);
+ err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2,
+ &largest_tile_id2);
+ if (err != AOM_CODEC_OK) return err;
+
+    // Note: Both use a common rdmult based on the base qindex of the
+    // full-resolution encode.
+ const int64_t rdmult = av1_compute_rd_mult_based_on_qindex(
+ bit_depth, update_type, cm->quant_params.base_qindex);
+ proj_rdcost1 =
+ RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate1, sse1, bit_depth);
+ const double proj_rdcost2 =
+ RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate2, sse2, bit_depth);
+ // Re-encode with superres if it's better.
+ if (proj_rdcost1 < proj_rdcost2) {
+ restore_all_coding_context(cpi);
+ // TODO(urvang): We should avoid rerunning the recode loop by saving
+ // previous output+state, or running encode only for the selected 'q' in
+ // previous step.
+ int64_t sse3 = INT64_MAX;
+ int64_t rate3 = INT64_MAX;
+ cpi->superres_mode =
+ AOM_SUPERRES_AUTO; // Super-res on for this recode loop.
+ err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3,
+ largest_tile_id);
+ cpi->superres_mode = AOM_SUPERRES_NONE; // Reset to default (full-res).
+ assert(sse1 == sse3);
+ assert(rate1 == rate3);
+ assert(largest_tile_id1 == *largest_tile_id);
+ } else {
+ *largest_tile_id = largest_tile_id2;
+ }
+ }
+
+ return err;
+}
+
+// Conditions to disable cdf_update mode in selective mode for real-time.
+// Handles the cases of SVC layers, scene changes, and resizing.
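+// Returns 1 if the CDF update should be disabled for the current frame.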
+static AOM_INLINE int selective_disable_cdf_rtc(const AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ // For single layer.
+ if (cpi->svc.number_spatial_layers == 1 &&
+ cpi->svc.number_temporal_layers == 1) {
+    // Don't disable on intra_only, scene change (high_source_sad = 1),
+    // or resized frame. To avoid quality loss, force enable for ~30 frames
+    // after a key frame or scene/slide change, and after 8 frames since the
+    // last update if frame_source_sad > 0.
+ if (frame_is_intra_only(cm) || is_frame_resize_pending(cpi) ||
+ rc->high_source_sad || rc->frames_since_key < 30 ||
+ (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->cyclic_refresh->counter_encode_maxq_scene_change < 30) ||
+ (cpi->frames_since_last_update > 8 && cpi->rc.frame_source_sad > 0))
+ return 0;
+ else
+ return 1;
+ } else if (cpi->svc.number_temporal_layers > 1) {
+ // Disable only on top temporal enhancement layer for now.
+ return cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1;
+ }
+ return 1;
+}
+
+#if !CONFIG_REALTIME_ONLY
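+// Subtracts the first-pass stats of one frame from the accumulated section
+// totals, field by field.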
+static void subtract_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame) {
+ section->frame -= frame->frame;
+ section->weight -= frame->weight;
+ section->intra_error -= frame->intra_error;
+ section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy;
+ section->coded_error -= frame->coded_error;
+ section->sr_coded_error -= frame->sr_coded_error;
+ section->pcnt_inter -= frame->pcnt_inter;
+ section->pcnt_motion -= frame->pcnt_motion;
+ section->pcnt_second_ref -= frame->pcnt_second_ref;
+ section->pcnt_neutral -= frame->pcnt_neutral;
+ section->intra_skip_pct -= frame->intra_skip_pct;
+ section->inactive_zone_rows -= frame->inactive_zone_rows;
+ section->inactive_zone_cols -= frame->inactive_zone_cols;
+ section->MVr -= frame->MVr;
+ section->mvr_abs -= frame->mvr_abs;
+ section->MVc -= frame->MVc;
+ section->mvc_abs -= frame->mvc_abs;
+ section->MVrv -= frame->MVrv;
+ section->MVcv -= frame->MVcv;
+ section->mv_in_out_count -= frame->mv_in_out_count;
+ section->new_mv_count -= frame->new_mv_count;
+ section->count -= frame->count;
+ section->duration -= frame->duration;
+}
+
+static void calculate_frame_avg_haar_energy(AV1_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ const FIRSTPASS_STATS *const total_stats =
+ twopass->stats_buf_ctx->total_stats;
+
+ if (is_one_pass_rt_params(cpi) ||
+ (cpi->oxcf.q_cfg.deltaq_mode != DELTA_Q_PERCEPTUAL) ||
+ (is_fp_wavelet_energy_invalid(total_stats) == 0))
+ return;
+
+ const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.mi_params.MBs;
+ const YV12_BUFFER_CONFIG *const unfiltered_source = cpi->unfiltered_source;
+ const uint8_t *const src = unfiltered_source->y_buffer;
+ const int hbd = unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int stride = unfiltered_source->y_stride;
+ const BLOCK_SIZE fp_block_size =
+ get_fp_block_size(cpi->is_screen_content_type);
+ const int fp_block_size_width = block_size_wide[fp_block_size];
+ const int fp_block_size_height = block_size_high[fp_block_size];
+ const int num_unit_cols =
+ get_num_blocks(unfiltered_source->y_crop_width, fp_block_size_width);
+ const int num_unit_rows =
+ get_num_blocks(unfiltered_source->y_crop_height, fp_block_size_height);
+ const int num_8x8_cols = num_unit_cols * (fp_block_size_width / 8);
+ const int num_8x8_rows = num_unit_rows * (fp_block_size_height / 8);
+ int64_t frame_avg_wavelet_energy = av1_haar_ac_sad_mxn_uint8_input(
+ src, stride, hbd, num_8x8_rows, num_8x8_cols);
+
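+  // Store the per-MB average wavelet energy on a log scale; log1p() keeps
+  // the result finite when the energy is zero.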
+ cpi->twopass_frame.frame_avg_haar_energy =
+ log1p((double)frame_avg_wavelet_energy / num_mbs);
+}
+#endif
+
+extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc,
+ const char *filename);
+
+/*!\brief Run the final pass encoding for 1-pass/2-pass encoding mode, and pack
+ * the bitstream
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[out]   size            Bitstream size
+ * \param[out]   dest            Bitstream output buffer
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval #AOM_CODEC_ERROR
+ */
+static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
+ uint8_t *dest) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = cm->seq_params;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ struct segmentation *const seg = &cm->seg;
+ FeatureFlags *const features = &cm->features;
+ const TileConfig *const tile_cfg = &oxcf->tile_cfg;
+ assert(cpi->source != NULL);
+ cpi->td.mb.e_mbd.cur_buf = cpi->source;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_frame_to_data_rate_time);
+#endif
+
+#if !CONFIG_REALTIME_ONLY
+ calculate_frame_avg_haar_energy(cpi);
+#endif
+
+ // frame type has been decided outside of this function call
+ cm->cur_frame->frame_type = current_frame->frame_type;
+
+ cm->tiles.large_scale = tile_cfg->enable_large_scale_tile;
+ cm->tiles.single_tile_decoding = tile_cfg->enable_single_tile_decoding;
+
+ features->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm);
+ // features->allow_ref_frame_mvs needs to be written into the frame header
+ // while cm->tiles.large_scale is 1, therefore, "cm->tiles.large_scale=1" case
+ // is separated from frame_might_allow_ref_frame_mvs().
+ features->allow_ref_frame_mvs &= !cm->tiles.large_scale;
+
+ features->allow_warped_motion = oxcf->motion_mode_cfg.allow_warped_motion &&
+ frame_might_allow_warped_motion(cm);
+
+ cpi->last_frame_type = current_frame->frame_type;
+
+ if (frame_is_intra_only(cm)) {
+ cpi->frames_since_last_update = 0;
+ }
+
+ if (frame_is_sframe(cm)) {
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+    // An S-frame will wipe out any previously encoded altref, so we cannot
+    // place an overlay frame.
+ gf_group->update_type[gf_group->size] = GF_UPDATE;
+ }
+
+ if (encode_show_existing_frame(cm)) {
+#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+ // TODO(angiebird): Move this into a function.
+ if (oxcf->pass == AOM_RC_THIRD_PASS) {
+ int frame_coding_idx =
+ av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, cpi->gf_frame_index);
+ rc_log_frame_encode_param(
+ &cpi->rc_log, frame_coding_idx, 1, 255,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index]);
+ }
+#endif
+ av1_finalize_encoded_frame(cpi);
+ // Build the bitstream
+ int largest_tile_id = 0; // Output from bitstream: unused here
+ cpi->rc.coefficient_size = 0;
+ if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+
+ if (seq_params->frame_id_numbers_present_flag &&
+ current_frame->frame_type == KEY_FRAME) {
+ // Displaying a forward key-frame, so reset the ref buffer IDs
+ int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
+ for (int i = 0; i < REF_FRAMES; i++)
+ cm->ref_frame_id[i] = display_frame_id;
+ }
+
+#if DUMP_RECON_FRAMES == 1
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ av1_dump_filtered_recon_frames(cpi);
+#endif // DUMP_RECON_FRAMES
+
+    // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
+    // to verify that there is no mismatch between encoder and decoder.
+ if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ av1_denoiser_update_ref_frame(cpi);
+#endif
+
+ // Since we allocate a spot for the OVERLAY frame in the gf group, we need
+ // to do post-encoding update accordingly.
+ av1_set_target_rate(cpi, cm->width, cm->height);
+
+ if (is_psnr_calc_enabled(cpi)) {
+ cpi->source =
+ realloc_and_scale_source(cpi, cm->cur_frame->buf.y_crop_width,
+ cm->cur_frame->buf.y_crop_height);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->use_ducky_encode) {
+ PSNR_STATS psnr;
+ aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+ DuckyEncodeFrameResult *frame_result =
+ &cpi->ducky_encode_info.frame_result;
+ frame_result->global_order_idx = cm->cur_frame->display_order_hint;
+ frame_result->q_index = cm->quant_params.base_qindex;
+ frame_result->rdmult = cpi->rd.RDMULT;
+ frame_result->rate = (int)(*size) * 8;
+ frame_result->dist = psnr.sse[0];
+ frame_result->psnr = psnr.psnr[0];
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ update_counters_for_show_frame(cpi);
+ return AOM_CODEC_OK;
+ }
+
+  // Work out whether to use force_integer_mv for this frame.
+ if (!is_stat_generation_stage(cpi) &&
+ cpi->common.features.allow_screen_content_tools &&
+ !frame_is_intra_only(cm) && !cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ if (cpi->common.seq_params->force_integer_mv == 2) {
+      // Adaptive mode: see what the previously encoded frame did.
+ if (cpi->unscaled_last_source != NULL) {
+ features->cur_frame_force_integer_mv = av1_is_integer_mv(
+ cpi->source, cpi->unscaled_last_source, &cpi->force_intpel_info);
+ } else {
+ cpi->common.features.cur_frame_force_integer_mv = 0;
+ }
+ } else {
+ cpi->common.features.cur_frame_force_integer_mv =
+ cpi->common.seq_params->force_integer_mv;
+ }
+ } else {
+ cpi->common.features.cur_frame_force_integer_mv = 0;
+ }
+
+  // This is used by av1_pack_bitstream, so it also needs to be set here for
+  // the row-mt case, where the encoding code works on a temporary structure.
+  cpi->td.mb.e_mbd.cur_frame_force_integer_mv =
+      features->cur_frame_force_integer_mv;
+
+  // Set default state for segment-based loop filter update flags.
+ cm->lf.mode_ref_delta_update = 0;
+
+  // Set various flags etc. to their special state if this is a key frame.
+ if (frame_is_intra_only(cm) || frame_is_sframe(cm)) {
+ // Reset the loop filter deltas and segmentation map.
+ av1_reset_segment_features(cm);
+
+    // If segmentation is enabled, force a map update for key frames.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+ }
+ }
+ if (tile_cfg->mtu == 0) {
+ cpi->num_tg = tile_cfg->num_tile_groups;
+ } else {
+ // Use a default value for the purposes of weighting costs in probability
+ // updates
+ cpi->num_tg = DEFAULT_MAX_NUM_TG;
+ }
+
+ // For 1 pass CBR mode: check if we are dropping this frame.
+ if (has_no_stats_stage(cpi) && oxcf->rc_cfg.mode == AOM_CBR) {
+ // Always drop for spatial enhancement layer if layer bandwidth is 0.
+ // Otherwise check for frame-dropping based on buffer level in
+ // av1_rc_drop_frame().
+ if ((cpi->svc.spatial_layer_id > 0 &&
+ cpi->oxcf.rc_cfg.target_bandwidth == 0) ||
+ av1_rc_drop_frame(cpi)) {
+ cpi->is_dropped_frame = true;
+ }
+ if (cpi->is_dropped_frame) {
+ av1_setup_frame_size(cpi);
+ av1_set_mv_search_params(cpi);
+ av1_rc_postencode_update_drop_frame(cpi);
+ release_scaled_references(cpi);
+ cpi->ppi->gf_group.is_frame_dropped[cpi->gf_frame_index] = true;
+ // A dropped frame might not be shown but it always takes a slot in the gf
+ // group. Therefore, even when it is not shown, we still need to update
+ // the relevant frame counters.
+ if (cm->show_frame) {
+ update_counters_for_show_frame(cpi);
+ }
+ return AOM_CODEC_OK;
+ }
+ }
+
+ if (oxcf->tune_cfg.tuning == AOM_TUNE_SSIM) {
+ av1_set_mb_ssim_rdmult_scaling(cpi);
+ }
+#if CONFIG_SALIENCY_MAP
+ else if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_SALIENCY_MAP &&
+ !(cpi->source->flags & YV12_FLAG_HIGHBITDEPTH)) {
+ if (av1_set_saliency_map(cpi) == 0) {
+ return AOM_CODEC_MEM_ERROR;
+ }
+#if !CONFIG_REALTIME_ONLY
+ double motion_ratio = av1_setup_motion_ratio(cpi);
+#else
+ double motion_ratio = 1.0;
+#endif
+ if (av1_setup_sm_rdmult_scaling_factor(cpi, motion_ratio) == 0) {
+ return AOM_CODEC_MEM_ERROR;
+ }
+ }
+#endif
+#if CONFIG_TUNE_VMAF
+ else if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+ oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN ||
+ oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ av1_set_mb_vmaf_rdmult_scaling(cpi);
+ }
+#endif
+
+ if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI &&
+ cpi->sf.rt_sf.use_nonrd_pick_mode == 0) {
+ av1_init_mb_wiener_var_buffer(cpi);
+ av1_set_mb_wiener_variance(cpi);
+ }
+
+ if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) {
+ av1_init_mb_ur_var_buffer(cpi);
+ av1_set_mb_ur_variance(cpi);
+ }
+
+#if CONFIG_INTERNAL_STATS
+ memset(cpi->mode_chosen_counts, 0,
+ MAX_MODES * sizeof(*cpi->mode_chosen_counts));
+#endif
+
+ if (seq_params->frame_id_numbers_present_flag) {
+ /* Non-normative definition of current_frame_id ("frame counter" with
+ * wraparound) */
+ if (cm->current_frame_id == -1) {
+ int lsb, msb;
+ /* quasi-random initialization of current_frame_id for a key frame */
+ if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) {
+ lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff;
+ msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff;
+ } else {
+ lsb = cpi->source->y_buffer[0] & 0xff;
+ msb = cpi->source->y_buffer[1] & 0xff;
+ }
+ cm->current_frame_id =
+ ((msb << 8) + lsb) % (1 << seq_params->frame_id_length);
+
+      // S-frames are meant for stitching together streams of different
+      // resolutions, so current_frame_id must be the same across the
+      // different streams of the same content rather than random. 0x37 is an
+      // arbitrarily chosen starting point.
+ if (oxcf->kf_cfg.sframe_dist != 0) cm->current_frame_id = 0x37;
+ } else {
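+      // Increment current_frame_id with wraparound. Adding
+      // (1 << frame_id_length) before the modulo guards against a negative
+      // intermediate value.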
+ cm->current_frame_id =
+ (cm->current_frame_id + 1 + (1 << seq_params->frame_id_length)) %
+ (1 << seq_params->frame_id_length);
+ }
+ }
+
+ switch (oxcf->algo_cfg.cdf_update_mode) {
+    case 0:  // No CDF update for any frames (4~6% compression loss).
+ features->disable_cdf_update = 1;
+ break;
+ case 1: // Enable CDF update for all frames.
+ if (cpi->sf.rt_sf.disable_cdf_update_non_reference_frame &&
+ cpi->ppi->rtc_ref.non_reference_frame && cpi->rc.frames_since_key > 2)
+ features->disable_cdf_update = 1;
+ else if (cpi->sf.rt_sf.selective_cdf_update)
+ features->disable_cdf_update = selective_disable_cdf_rtc(cpi);
+ else
+ features->disable_cdf_update = 0;
+ break;
+ case 2:
+ // Strategically determine at which frames to do CDF update.
+      // Currently, CDF update is enabled only for all-intra and no-show
+      // frames (1.5% compression loss) in good-quality or all-intra mode.
+ if (oxcf->mode == GOOD || oxcf->mode == ALLINTRA) {
+ features->disable_cdf_update =
+ (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1;
+ } else {
+ features->disable_cdf_update = selective_disable_cdf_rtc(cpi);
+ }
+ break;
+ }
+
+ // Disable cdf update for the INTNL_ARF_UPDATE frame with
+ // frame_parallel_level 1.
+ if (!cpi->do_frame_data_update &&
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+ assert(cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1);
+ features->disable_cdf_update = 1;
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->oxcf.tool_cfg.enable_global_motion && !frame_is_intra_only(cm)) {
+ // Flush any stale global motion information, which may be left over
+ // from a previous frame
+ aom_invalidate_pyramid(cpi->source->y_pyramid);
+ av1_invalidate_corner_list(cpi->source->corners);
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ int largest_tile_id = 0;
+ if (av1_superres_in_recode_allowed(cpi)) {
+ if (encode_with_and_without_superres(cpi, size, dest, &largest_tile_id) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+ } else {
+ const aom_superres_mode orig_superres_mode = cpi->superres_mode; // save
+ cpi->superres_mode = cpi->oxcf.superres_cfg.superres_mode;
+ if (encode_with_recode_loop_and_filter(cpi, size, dest, NULL, NULL,
+ &largest_tile_id) != AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+ cpi->superres_mode = orig_superres_mode; // restore
+ }
+
+ // Update reference frame ids for reference frames this frame will overwrite
+ if (seq_params->frame_id_numbers_present_flag) {
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if ((current_frame->refresh_frame_flags >> i) & 1) {
+ cm->ref_frame_id[i] = cm->current_frame_id;
+ }
+ }
+ }
+
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+ cpi->svc.num_encoded_top_layer++;
+
+#if DUMP_RECON_FRAMES == 1
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ av1_dump_filtered_recon_frames(cpi);
+#endif // DUMP_RECON_FRAMES
+
+ if (cm->seg.enabled) {
+ if (cm->seg.update_map == 0 && cm->last_frame_seg_map) {
+ memcpy(cm->cur_frame->seg_map, cm->last_frame_seg_map,
+ cm->cur_frame->mi_cols * cm->cur_frame->mi_rows *
+ sizeof(*cm->cur_frame->seg_map));
+ }
+ }
+
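+  // Scaled references are normally released only by non-parallel
+  // (frame_parallel_level 0) frames; the FPMT simulation path below releases
+  // them for every frame.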
+ int release_scaled_refs = 0;
+#if CONFIG_FPMT_TEST
+ release_scaled_refs =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 1 : 0;
+#endif // CONFIG_FPMT_TEST
+ if (release_scaled_refs ||
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+ if (frame_is_intra_only(cm) == 0) {
+ release_scaled_references(cpi);
+ }
+ }
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ av1_denoiser_update_ref_frame(cpi);
+#endif
+
+  // NOTE: Save the new show-frame buffer index for --test-code=warn, i.e., to
+  // verify that there is no mismatch between encoder and decoder.
+ if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame;
+
+ if (features->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ *cm->fc = cpi->tile_data[largest_tile_id].tctx;
+ av1_reset_cdf_symbol_counters(cm->fc);
+ }
+ if (!cm->tiles.large_scale) {
+ cm->cur_frame->frame_context = *cm->fc;
+ }
+
+ if (tile_cfg->enable_ext_tile_debug) {
+ // (yunqing) This test ensures the correctness of large scale tile coding.
+ if (cm->tiles.large_scale && is_stat_consumption_stage(cpi)) {
+ char fn[20] = "./fc";
+ fn[4] = current_frame->frame_number / 100 + '0';
+ fn[5] = (current_frame->frame_number % 100) / 10 + '0';
+ fn[6] = (current_frame->frame_number % 10) + '0';
+ fn[7] = '\0';
+ av1_print_frame_contexts(cm->fc, fn);
+ }
+ }
+
+ cpi->last_frame_type = current_frame->frame_type;
+
+ if (cm->features.disable_cdf_update) {
+ cpi->frames_since_last_update++;
+ } else {
+ cpi->frames_since_last_update = 1;
+ }
+
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+ cpi->svc.prev_number_spatial_layers = cpi->svc.number_spatial_layers;
+
+ // Clear the one shot update flags for segmentation map and mode/ref loop
+ // filter deltas.
+ cm->seg.update_map = 0;
+ cm->seg.update_data = 0;
+ cm->lf.mode_ref_delta_update = 0;
+
+ if (cm->show_frame) {
+ update_counters_for_show_frame(cpi);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_frame_to_data_rate_time);
+#endif
+
+ return AOM_CODEC_OK;
+}
+
+int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
+ const EncodeFrameInput *const frame_input,
+ const EncodeFrameParams *const frame_params,
+ EncodeFrameResults *const frame_results) {
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+
+ cpi->unscaled_source = frame_input->source;
+ cpi->source = frame_input->source;
+ cpi->unscaled_last_source = frame_input->last_source;
+
+ current_frame->refresh_frame_flags = frame_params->refresh_frame_flags;
+ cm->features.error_resilient_mode = frame_params->error_resilient_mode;
+ cm->features.primary_ref_frame = frame_params->primary_ref_frame;
+ cm->current_frame.frame_type = frame_params->frame_type;
+ cm->show_frame = frame_params->show_frame;
+ cpi->ref_frame_flags = frame_params->ref_frame_flags;
+ cpi->speed = frame_params->speed;
+ cm->show_existing_frame = frame_params->show_existing_frame;
+ cpi->existing_fb_idx_to_show = frame_params->existing_fb_idx_to_show;
+
+ memcpy(cm->remapped_ref_idx, frame_params->remapped_ref_idx,
+ REF_FRAMES * sizeof(*cm->remapped_ref_idx));
+
+ memcpy(&cpi->refresh_frame, &frame_params->refresh_frame,
+ sizeof(cpi->refresh_frame));
+
+ if (current_frame->frame_type == KEY_FRAME &&
+ cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+ current_frame->frame_number = 0;
+ }
+
+ current_frame->order_hint =
+ current_frame->frame_number + frame_params->order_offset;
+
+ current_frame->display_order_hint = current_frame->order_hint;
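+  // Keep the full (unwrapped) value in display_order_hint; the order_hint
+  // coded in the bitstream wraps at 1 << order_hint_bits.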
+ current_frame->order_hint %=
+ (1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1));
+
+ current_frame->pyramid_level = get_true_pyr_level(
+ cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index],
+ current_frame->display_order_hint, cpi->ppi->gf_group.max_layer_depth);
+
+ if (is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->oxcf.q_cfg.use_fixed_qp_offsets)
+ av1_noop_first_pass_frame(cpi, frame_input->ts_duration);
+ else
+ av1_first_pass(cpi, frame_input->ts_duration);
+#endif
+ } else if (cpi->oxcf.pass == AOM_RC_ONE_PASS ||
+ cpi->oxcf.pass >= AOM_RC_SECOND_PASS) {
+ if (encode_frame_to_data_rate(cpi, &frame_results->size, dest) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+
+ return AOM_CODEC_OK;
+}
+
+#if CONFIG_DENOISE
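+// Runs noise estimation/modeling on the source frame and, when grain is
+// detected, appends the estimated film grain parameters to the encoder's
+// film grain table. Whether the source itself is also denoised is controlled
+// by oxcf.enable_dnl_denoising.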
+static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd,
+ int block_size, float noise_level,
+ int64_t time_stamp, int64_t end_time) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (!cpi->denoise_and_model) {
+ cpi->denoise_and_model = aom_denoise_and_model_alloc(
+ cm->seq_params->bit_depth, block_size, noise_level);
+ if (!cpi->denoise_and_model) {
+ aom_set_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating denoise and model");
+ return -1;
+ }
+ }
+ if (!cpi->film_grain_table) {
+ cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
+ if (!cpi->film_grain_table) {
+ aom_set_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating grain table");
+ return -1;
+ }
+ memset(cpi->film_grain_table, 0, sizeof(*cpi->film_grain_table));
+ }
+ if (aom_denoise_and_model_run(cpi->denoise_and_model, sd,
+ &cm->film_grain_params,
+ cpi->oxcf.enable_dnl_denoising)) {
+ if (cm->film_grain_params.apply_grain) {
+ aom_film_grain_table_append(cpi->film_grain_table, time_stamp, end_time,
+ &cm->film_grain_params);
+ }
+ }
+ return 0;
+}
+#endif
+
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ int res = 0;
+ const int subsampling_x = sd->subsampling_x;
+ const int subsampling_y = sd->subsampling_y;
+ const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+
+#if CONFIG_TUNE_VMAF
+ if (!is_stat_generation_stage(cpi) &&
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING) {
+ av1_vmaf_frame_preprocessing(cpi, sd);
+ }
+ if (!is_stat_generation_stage(cpi) &&
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN) {
+ av1_vmaf_blk_preprocessing(cpi, sd);
+ }
+#endif
+
+#if CONFIG_INTERNAL_STATS
+ struct aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+#endif
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ setup_denoiser_buffer(cpi);
+#endif
+
+#if CONFIG_DENOISE
+  // Even if denoise_noise_level is > 0, we do not need to denoise on pass 1
+  // of 2 if enable_dnl_denoising is disabled, since the 2nd pass will encode
+  // the original (non-denoised) frame.
+ if (cpi->oxcf.noise_level > 0 && !(cpi->oxcf.pass == AOM_RC_FIRST_PASS &&
+ !cpi->oxcf.enable_dnl_denoising)) {
+#if !CONFIG_REALTIME_ONLY
+    // Choose a synthetic noise level for still images, for enhanced
+    // perceptual quality, based on an estimated noise level in the source,
+    // but only if the command-line noise level is set to > 0.
+ if (cpi->oxcf.mode == ALLINTRA) {
+ // No noise synthesis if source is very clean.
+ // Uses a low edge threshold to focus on smooth areas.
+ // Increase output noise setting a little compared to measured value.
+ double y_noise_level = 0.0;
+ av1_estimate_noise_level(sd, &y_noise_level, AOM_PLANE_Y, AOM_PLANE_Y,
+ cm->seq_params->bit_depth, 16);
+ cpi->oxcf.noise_level = (float)(y_noise_level - 0.1);
+ cpi->oxcf.noise_level = (float)AOMMAX(0.0, cpi->oxcf.noise_level);
+ if (cpi->oxcf.noise_level > 0.0) {
+ cpi->oxcf.noise_level += (float)0.5;
+ }
+ cpi->oxcf.noise_level = (float)AOMMIN(5.0, cpi->oxcf.noise_level);
+ }
+#endif
+
+ if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size,
+ cpi->oxcf.noise_level, time_stamp, end_time) < 0)
+ res = -1;
+ }
+#endif // CONFIG_DENOISE
+
+ if (av1_lookahead_push(cpi->ppi->lookahead, sd, time_stamp, end_time,
+ use_highbitdepth, cpi->image_pyramid_levels,
+ frame_flags)) {
+ aom_set_error(cm->error, AOM_CODEC_ERROR, "av1_lookahead_push() failed");
+ res = -1;
+ }
+#if CONFIG_INTERNAL_STATS
+ aom_usec_timer_mark(&timer);
+ cpi->ppi->total_time_receive_data += aom_usec_timer_elapsed(&timer);
+#endif
+
+  // Note: Regarding profile setting, the following checks are added to help
+  // choose a proper profile for the input video. The criterion is that every
+  // bitstream must be designated with the lowest profile that matches its
+  // content, e.g., a bitstream containing 4:4:4 video must be designated as
+  // High Profile in the sequence header, and likewise a bitstream containing
+  // 4:2:2 video must be designated as Professional Profile.
+ if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome &&
+ (subsampling_x != 1 || subsampling_y != 1)) {
+ aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM,
+ "Non-4:2:0 color format requires profile 1 or 2");
+ res = -1;
+ }
+ if ((seq_params->profile == PROFILE_1) &&
+ !(subsampling_x == 0 && subsampling_y == 0)) {
+ aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM,
+ "Profile 1 requires 4:4:4 color format");
+ res = -1;
+ }
+ if ((seq_params->profile == PROFILE_2) &&
+ (seq_params->bit_depth <= AOM_BITS_10) &&
+ !(subsampling_x == 1 && subsampling_y == 0)) {
+ aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM,
+ "Profile 2 bit-depth <= 10 requires 4:2:2 color format");
+ res = -1;
+ }
+
+ return res;
+}
+
+#if CONFIG_ENTROPY_STATS
+void print_entropy_stats(AV1_PRIMARY *const ppi) {
+ if (!ppi->cpi) return;
+
+ if (ppi->cpi->oxcf.pass != 1 &&
+ ppi->cpi->common.current_frame.frame_number > 0) {
+ fprintf(stderr, "Writing counts.stt\n");
+ FILE *f = fopen("counts.stt", "wb");
+ fwrite(&ppi->aggregate_fc, sizeof(ppi->aggregate_fc), 1, f);
+ fclose(f);
+ }
+}
+#endif // CONFIG_ENTROPY_STATS
+
+#if CONFIG_INTERNAL_STATS
+extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch,
+ int width, int height);
+
+static void adjust_image_stat(double y, double u, double v, double all,
+ ImageStat *s) {
+ s->stat[STAT_Y] += y;
+ s->stat[STAT_U] += u;
+ s->stat[STAT_V] += v;
+ s->stat[STAT_ALL] += all;
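+  // These metrics are higher-is-better, so the worst value seen so far is
+  // the minimum.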
+ s->worst = AOMMIN(s->worst, all);
+}
+
+static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
+ AV1_PRIMARY *const ppi = cpi->ppi;
+ AV1_COMMON *const cm = &cpi->common;
+ double samples = 0.0;
+ const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+ const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+
+ if (cpi->ppi->use_svc &&
+ cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
+ return;
+
+#if CONFIG_INTER_STATS_ONLY
+ if (cm->current_frame.frame_type == KEY_FRAME) return; // skip key frame
+#endif
+ cpi->bytes += frame_bytes;
+ if (cm->show_frame) {
+ const YV12_BUFFER_CONFIG *orig = cpi->source;
+ const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
+ double y, u, v, frame_all;
+
+ ppi->count[0]++;
+ ppi->count[1]++;
+ if (cpi->ppi->b_calculate_psnr) {
+ PSNR_STATS psnr;
+ double weight[2] = { 0.0, 0.0 };
+ double frame_ssim2[2] = { 0.0, 0.0 };
+#if CONFIG_AV1_HIGHBITDEPTH
+ aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
+#else
+ aom_calc_psnr(orig, recon, &psnr);
+#endif
+ adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0],
+ &(ppi->psnr[0]));
+ ppi->total_sq_error[0] += psnr.sse[0];
+ ppi->total_samples[0] += psnr.samples[0];
+ samples = psnr.samples[0];
+
+ aom_calc_ssim(orig, recon, bit_depth, in_bit_depth,
+ cm->seq_params->use_highbitdepth, weight, frame_ssim2);
+
+ ppi->worst_ssim = AOMMIN(ppi->worst_ssim, frame_ssim2[0]);
+ ppi->summed_quality += frame_ssim2[0] * weight[0];
+ ppi->summed_weights += weight[0];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ // Compute PSNR based on stream bit depth
+ if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) &&
+ (in_bit_depth < bit_depth)) {
+ adjust_image_stat(psnr.psnr_hbd[1], psnr.psnr_hbd[2], psnr.psnr_hbd[3],
+ psnr.psnr_hbd[0], &ppi->psnr[1]);
+ ppi->total_sq_error[1] += psnr.sse_hbd[0];
+ ppi->total_samples[1] += psnr.samples_hbd[0];
+
+ ppi->worst_ssim_hbd = AOMMIN(ppi->worst_ssim_hbd, frame_ssim2[1]);
+ ppi->summed_quality_hbd += frame_ssim2[1] * weight[1];
+ ppi->summed_weights_hbd += weight[1];
+ }
+#endif
+
+#if 0
+ {
+ FILE *f = fopen("q_used.stt", "a");
+ double y2 = psnr.psnr[1];
+ double u2 = psnr.psnr[2];
+ double v2 = psnr.psnr[3];
+ double frame_psnr2 = psnr.psnr[0];
+ fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
+ cm->current_frame.frame_number, y2, u2, v2,
+ frame_psnr2, frame_ssim2);
+ fclose(f);
+ }
+#endif
+ }
+ if (ppi->b_calculate_blockiness) {
+ if (!cm->seq_params->use_highbitdepth) {
+ const double frame_blockiness =
+ av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer,
+ recon->y_stride, orig->y_width, orig->y_height);
+ ppi->worst_blockiness = AOMMAX(ppi->worst_blockiness, frame_blockiness);
+ ppi->total_blockiness += frame_blockiness;
+ }
+
+ if (ppi->b_calculate_consistency) {
+ if (!cm->seq_params->use_highbitdepth) {
+ const double this_inconsistency = aom_get_ssim_metrics(
+ orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride,
+ orig->y_width, orig->y_height, ppi->ssim_vars, &ppi->metrics, 1);
+
+ const double peak = (double)((1 << in_bit_depth) - 1);
+ const double consistency =
+ aom_sse_to_psnr(samples, peak, ppi->total_inconsistency);
+ if (consistency > 0.0)
+ ppi->worst_consistency =
+ AOMMIN(ppi->worst_consistency, consistency);
+ ppi->total_inconsistency += this_inconsistency;
+ }
+ }
+ }
+
+ frame_all =
+ aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
+ adjust_image_stat(y, u, v, frame_all, &ppi->fastssim);
+ frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
+ adjust_image_stat(y, u, v, frame_all, &ppi->psnrhvs);
+ }
+}
+
+void print_internal_stats(AV1_PRIMARY *ppi) {
+ if (!ppi->cpi) return;
+ AV1_COMP *const cpi = ppi->cpi;
+
+ if (ppi->cpi->oxcf.pass != 1 &&
+ ppi->cpi->common.current_frame.frame_number > 0) {
+ char headings[512] = { 0 };
+ char results[512] = { 0 };
+ FILE *f = fopen("opsnr.stt", "a");
+ double time_encoded =
+ (cpi->time_stamps.prev_ts_end - cpi->time_stamps.first_ts_start) /
+ 10000000.000;
+ double total_encode_time =
+ (ppi->total_time_receive_data + ppi->total_time_compress_data) /
+ 1000.000;
+ const double dr =
+ (double)ppi->total_bytes * (double)8 / (double)1000 / time_encoded;
+ const double peak =
+ (double)((1 << ppi->cpi->oxcf.input_cfg.input_bit_depth) - 1);
+ const double target_rate =
+ (double)ppi->cpi->oxcf.rc_cfg.target_bandwidth / 1000;
+ const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
+
+ if (ppi->b_calculate_psnr) {
+ const double total_psnr = aom_sse_to_psnr(
+ (double)ppi->total_samples[0], peak, (double)ppi->total_sq_error[0]);
+ const double total_ssim =
+ 100 * pow(ppi->summed_quality / ppi->summed_weights, 8.0);
+ snprintf(headings, sizeof(headings),
+ "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+ "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+ "WstPsnr\tWstSsim\tWstFast\tWstHVS\t"
+ "AVPsrnY\tAPsnrCb\tAPsnrCr");
+ snprintf(results, sizeof(results),
+ "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f",
+ dr, ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr,
+ ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr,
+ total_ssim, total_ssim,
+ ppi->fastssim.stat[STAT_ALL] / ppi->count[0],
+ ppi->psnrhvs.stat[STAT_ALL] / ppi->count[0], ppi->psnr[0].worst,
+ ppi->worst_ssim, ppi->fastssim.worst, ppi->psnrhvs.worst,
+ ppi->psnr[0].stat[STAT_Y] / ppi->count[0],
+ ppi->psnr[0].stat[STAT_U] / ppi->count[0],
+ ppi->psnr[0].stat[STAT_V] / ppi->count[0]);
+
+ if (ppi->b_calculate_blockiness) {
+ SNPRINT(headings, "\t Block\tWstBlck");
+ SNPRINT2(results, "\t%7.3f", ppi->total_blockiness / ppi->count[0]);
+ SNPRINT2(results, "\t%7.3f", ppi->worst_blockiness);
+ }
+
+ if (ppi->b_calculate_consistency) {
+ double consistency =
+ aom_sse_to_psnr((double)ppi->total_samples[0], peak,
+ (double)ppi->total_inconsistency);
+
+ SNPRINT(headings, "\tConsist\tWstCons");
+ SNPRINT2(results, "\t%7.3f", consistency);
+ SNPRINT2(results, "\t%7.3f", ppi->worst_consistency);
+ }
+
+ SNPRINT(headings, "\t Time\tRcErr\tAbsErr");
+ SNPRINT2(results, "\t%8.0f", total_encode_time);
+ SNPRINT2(results, " %7.2f", rate_err);
+ SNPRINT2(results, " %7.2f", fabs(rate_err));
+
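+      // 6:1:1 weighted average of the Y, Cb and Cr PSNRs.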
+ SNPRINT(headings, "\tAPsnr611");
+ SNPRINT2(results, " %7.3f",
+ (6 * ppi->psnr[0].stat[STAT_Y] + ppi->psnr[0].stat[STAT_U] +
+ ppi->psnr[0].stat[STAT_V]) /
+ (ppi->count[0] * 8));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ const uint32_t in_bit_depth = ppi->cpi->oxcf.input_cfg.input_bit_depth;
+ const uint32_t bit_depth = ppi->seq_params.bit_depth;
+      // cpi->source->flags is not available here, but total_samples[1] will
+      // be non-zero only if cpi->source->flags & YV12_FLAG_HIGHBITDEPTH was
+      // true in compute_internal_stats().
+ if ((ppi->total_samples[1] > 0) && (in_bit_depth < bit_depth)) {
+ const double peak_hbd = (double)((1 << bit_depth) - 1);
+ const double total_psnr_hbd =
+ aom_sse_to_psnr((double)ppi->total_samples[1], peak_hbd,
+ (double)ppi->total_sq_error[1]);
+ const double total_ssim_hbd =
+ 100 * pow(ppi->summed_quality_hbd / ppi->summed_weights_hbd, 8.0);
+ SNPRINT(headings,
+ "\t AVGPsnrH GLBPsnrH AVPsnrPH GLPsnrPH"
+ " AVPsnrYH APsnrCbH APsnrCrH WstPsnrH"
+ " AOMSSIMH VPSSIMPH WstSsimH");
+ SNPRINT2(results, "\t%7.3f",
+ ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]);
+ SNPRINT2(results, " %7.3f", total_psnr_hbd);
+ SNPRINT2(results, " %7.3f",
+ ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]);
+ SNPRINT2(results, " %7.3f", total_psnr_hbd);
+ SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_Y] / ppi->count[1]);
+ SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_U] / ppi->count[1]);
+ SNPRINT2(results, " %7.3f", ppi->psnr[1].stat[STAT_V] / ppi->count[1]);
+ SNPRINT2(results, " %7.3f", ppi->psnr[1].worst);
+ SNPRINT2(results, " %7.3f", total_ssim_hbd);
+ SNPRINT2(results, " %7.3f", total_ssim_hbd);
+ SNPRINT2(results, " %7.3f", ppi->worst_ssim_hbd);
+ }
+#endif
+ fprintf(f, "%s\n", headings);
+ fprintf(f, "%s\n", results);
+ }
+
+ fclose(f);
+
+ aom_free(ppi->ssim_vars);
+ ppi->ssim_vars = NULL;
+ }
+}
+#endif // CONFIG_INTERNAL_STATS
+
+static AOM_INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+ if (cpi->common.show_frame && cpi->rc.frames_to_key) {
+#if !CONFIG_REALTIME_ONLY
+ FIRSTPASS_INFO *firstpass_info = &cpi->ppi->twopass.firstpass_info;
+ if (firstpass_info->past_stats_count > FIRSTPASS_INFO_STATS_PAST_MIN) {
+ av1_firstpass_info_move_cur_index_and_pop(firstpass_info);
+ } else {
+      // When there are not enough past stats, we move the current
+      // index without popping the past stats.
+ av1_firstpass_info_move_cur_index(firstpass_info);
+ }
+#endif
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+ cpi->rc.frames_since_key++;
+ cpi->rc.frames_to_key--;
+ cpi->rc.frames_to_fwd_kf--;
+ }
+ }
+}
+
+static AOM_INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+  // TODO(weitinglin): Updating this counter for is_frame_droppable
+  // is a work-around to handle the condition when a frame is dropped.
+  // We should fix the cpi->common.show_frame flag
+  // instead of checking the other condition to update the counter properly.
+ if (cpi->common.show_frame ||
+ is_frame_droppable(&cpi->ppi->rtc_ref, &cpi->ext_flags.refresh_frame)) {
+ // Decrement count down till next gf
+ if (cpi->rc.frames_till_gf_update_due > 0)
+ cpi->rc.frames_till_gf_update_due--;
+ }
+}
+
+static AOM_INLINE void update_gf_group_index(AV1_COMP *cpi) {
+  // Increment the gf group index ready for the next frame.
+  ++cpi->gf_frame_index;
+  // Reset gf_frame_index in case it reaches MAX_STATIC_GF_GROUP_LENGTH
+  // for real time encoding on the top spatial layer.
+  if (is_one_pass_rt_params(cpi) &&
+      cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
+      cpi->gf_frame_index == MAX_STATIC_GF_GROUP_LENGTH) {
+    cpi->gf_frame_index = 0;
+  }
+}
+
+static void update_fb_of_context_type(const AV1_COMP *const cpi,
+ int *const fb_of_context_type) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int current_frame_ref_type = get_current_frame_ref_type(cpi);
+
+ if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+ cpi->ext_flags.use_primary_ref_none) {
+ for (int i = 0; i < REF_FRAMES; i++) {
+ fb_of_context_type[i] = -1;
+ }
+ fb_of_context_type[current_frame_ref_type] =
+ cm->show_frame ? get_ref_frame_map_idx(cm, GOLDEN_FRAME)
+ : get_ref_frame_map_idx(cm, ALTREF_FRAME);
+ }
+
+ if (!encode_show_existing_frame(cm)) {
+ // Refresh fb_of_context_type[]: see encoder.h for explanation
+ if (cm->current_frame.frame_type == KEY_FRAME) {
+ // All ref frames are refreshed, pick one that will live long enough
+ fb_of_context_type[current_frame_ref_type] = 0;
+ } else {
+      // If more than one frame is refreshed, it doesn't matter which one we
+      // pick, so pick the first. LST sometimes doesn't refresh any: this is
+      // ok.
+
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if (cm->current_frame.refresh_frame_flags & (1 << i)) {
+ fb_of_context_type[current_frame_ref_type] = i;
+ break;
+ }
+ }
+ }
+ }
+}
+
+static void update_rc_counts(AV1_COMP *cpi) {
+ update_keyframe_counters(cpi);
+ update_frames_till_gf_update(cpi);
+ update_gf_group_index(cpi);
+}
+
+static void update_end_of_frame_stats(AV1_COMP *cpi) {
+ if (cpi->do_frame_data_update) {
+ // Store current frame loopfilter levels in ppi, if update flag is set.
+ if (!cpi->common.show_existing_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct loopfilter *const lf = &cm->lf;
+ cpi->ppi->filter_level[0] = lf->filter_level[0];
+ cpi->ppi->filter_level[1] = lf->filter_level[1];
+ cpi->ppi->filter_level_u = lf->filter_level_u;
+ cpi->ppi->filter_level_v = lf->filter_level_v;
+ }
+ }
+ // Store frame level mv_stats from cpi to ppi.
+ cpi->ppi->mv_stats = cpi->mv_stats;
+}
+
+// Updates frame level stats related to global motion
+static AOM_INLINE void update_gm_stats(AV1_COMP *cpi) {
+ FRAME_UPDATE_TYPE update_type =
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+ int i, is_gm_present = 0;
+
+ // Check if the current frame has any valid global motion model across its
+ // reference frames
+ for (i = 0; i < REF_FRAMES; i++) {
+ if (cpi->common.global_motion[i].wmtype != IDENTITY) {
+ is_gm_present = 1;
+ break;
+ }
+ }
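+  // valid_gm_model_found[] uses INT32_MAX as a "not yet set" sentinel: the
+  // first frame of each update type overwrites it, and later frames OR into
+  // it.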
+ int update_actual_stats = 1;
+#if CONFIG_FPMT_TEST
+ update_actual_stats =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+ if (!update_actual_stats) {
+ if (cpi->ppi->temp_valid_gm_model_found[update_type] == INT32_MAX) {
+ cpi->ppi->temp_valid_gm_model_found[update_type] = is_gm_present;
+ } else {
+ cpi->ppi->temp_valid_gm_model_found[update_type] |= is_gm_present;
+ }
+ int show_existing_between_parallel_frames =
+ (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+ if (cpi->do_frame_data_update == 1 &&
+ !show_existing_between_parallel_frames) {
+ for (i = 0; i < FRAME_UPDATE_TYPES; i++) {
+ cpi->ppi->valid_gm_model_found[i] =
+ cpi->ppi->temp_valid_gm_model_found[i];
+ }
+ }
+ }
+#endif
+ if (update_actual_stats) {
+ if (cpi->ppi->valid_gm_model_found[update_type] == INT32_MAX) {
+ cpi->ppi->valid_gm_model_found[update_type] = is_gm_present;
+ } else {
+ cpi->ppi->valid_gm_model_found[update_type] |= is_gm_present;
+ }
+ }
+}
+
+void av1_post_encode_updates(AV1_COMP *const cpi,
+ const AV1_COMP_DATA *const cpi_data) {
+ AV1_PRIMARY *const ppi = cpi->ppi;
+ AV1_COMMON *const cm = &cpi->common;
+
+ update_gm_stats(cpi);
+
+#if !CONFIG_REALTIME_ONLY
+ // Update the total stats remaining structure.
+ if (cpi->twopass_frame.this_frame != NULL &&
+ ppi->twopass.stats_buf_ctx->total_left_stats) {
+ subtract_stats(ppi->twopass.stats_buf_ctx->total_left_stats,
+ cpi->twopass_frame.this_frame);
+ }
+#endif
+
+#if CONFIG_OUTPUT_FRAME_SIZE
+ FILE *f = fopen("frame_sizes.csv", "a");
+ fprintf(f, "%d,", 8 * (int)cpi_data->frame_size);
+ fprintf(f, "%d\n", cm->quant_params.base_qindex);
+ fclose(f);
+#endif // CONFIG_OUTPUT_FRAME_SIZE
+
+ if (!is_stat_generation_stage(cpi) && !cpi->is_dropped_frame) {
+ // Before calling refresh_reference_frames(), copy ppi->ref_frame_map_copy
+ // to cm->ref_frame_map for frame_parallel_level 2 frame in a parallel
+ // encode set of lower layer frames.
+ // TODO(Remya): Move ref_frame_map from AV1_COMMON to AV1_PRIMARY to avoid
+ // copy.
+ if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 2 &&
+ ppi->gf_group.frame_parallel_level[cpi->gf_frame_index - 1] == 1 &&
+ ppi->gf_group.update_type[cpi->gf_frame_index - 1] ==
+ INTNL_ARF_UPDATE) {
+ memcpy(cm->ref_frame_map, ppi->ref_frame_map_copy,
+ sizeof(cm->ref_frame_map));
+ }
+ refresh_reference_frames(cpi);
+ // For frame_parallel_level 1 frame in a parallel encode set of lower layer
+ // frames, store the updated cm->ref_frame_map in ppi->ref_frame_map_copy.
+ if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1 &&
+ ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+ memcpy(ppi->ref_frame_map_copy, cm->ref_frame_map,
+ sizeof(cm->ref_frame_map));
+ }
+ av1_rc_postencode_update(cpi, cpi_data->frame_size);
+ }
+
+ if (cpi_data->pop_lookahead == 1) {
+ av1_lookahead_pop(cpi->ppi->lookahead, cpi_data->flush,
+ cpi->compressor_stage);
+ }
+ if (cpi->common.show_frame) {
+ cpi->ppi->ts_start_last_show_frame = cpi_data->ts_frame_start;
+ cpi->ppi->ts_end_last_show_frame = cpi_data->ts_frame_end;
+ }
+ if (ppi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) {
+ // Initialize level info. at the beginning of each sequence.
+ if (cm->current_frame.frame_type == KEY_FRAME &&
+ ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+ av1_init_level_info(cpi);
+ }
+ av1_update_level_info(cpi, cpi_data->frame_size, cpi_data->ts_frame_start,
+ cpi_data->ts_frame_end);
+ }
+
+ if (!is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+ if (!has_no_stats_stage(cpi)) av1_twopass_postencode_update(cpi);
+#endif
+ update_fb_of_context_type(cpi, ppi->fb_of_context_type);
+ update_rc_counts(cpi);
+ update_end_of_frame_stats(cpi);
+ }
+
+ if (cpi->oxcf.pass == AOM_RC_THIRD_PASS && cpi->third_pass_ctx) {
+ av1_pop_third_pass_info(cpi->third_pass_ctx);
+ }
+
+ if (ppi->rtc_ref.set_ref_frame_config) {
+ av1_svc_update_buffer_slot_refreshed(cpi);
+ av1_svc_set_reference_was_previous(cpi);
+ }
+
+ if (ppi->use_svc) av1_save_layer_context(cpi);
+
+  // Note: frame_size == 0 indicates a dropped frame, for which PSNR is not
+  // calculated.
+ if (ppi->b_calculate_psnr && cpi_data->frame_size > 0) {
+ if (cm->show_existing_frame ||
+ (!is_stat_generation_stage(cpi) && cm->show_frame)) {
+ generate_psnr_packet(cpi);
+ }
+ }
+
+#if CONFIG_INTERNAL_STATS
+ if (!is_stat_generation_stage(cpi)) {
+ compute_internal_stats(cpi, (int)cpi_data->frame_size);
+ }
+#endif // CONFIG_INTERNAL_STATS
+
+  // Write frame info. Subtract 1 from frame index since it was incremented in
+  // update_rc_counts.
+ av1_write_second_pass_per_frame_info(cpi, cpi->gf_frame_index - 1);
+}
+
+int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ AV1_COMMON *const cm = &cpi->common;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(cm->error->jmp)) {
+ cm->error->setjmp = 0;
+ return cm->error->error_code;
+ }
+ cm->error->setjmp = 1;
+
+#if CONFIG_INTERNAL_STATS
+ cpi->frame_recode_hits = 0;
+ cpi->time_compress_data = 0;
+ cpi->bytes = 0;
+#endif
+#if CONFIG_ENTROPY_STATS
+ if (cpi->compressor_stage == ENCODE_STAGE) {
+ av1_zero(cpi->counts);
+ }
+#endif
+
+#if CONFIG_BITSTREAM_DEBUG
+ assert(cpi->oxcf.max_threads <= 1 &&
+ "bitstream debug tool does not support multithreading");
+ bitstream_queue_record_write();
+
+ if (cm->seq_params->order_hint_info.enable_order_hint) {
+ aom_bitstream_queue_set_frame_write(cm->current_frame.order_hint * 2 +
+ cm->show_frame);
+ } else {
+ // This is currently used in RTC encoding. cm->show_frame is always 1.
+ aom_bitstream_queue_set_frame_write(cm->current_frame.frame_number);
+ }
+#endif
+ if (cpi->ppi->use_svc) {
+ av1_one_pass_cbr_svc_start_layer(cpi);
+ }
+
+ cpi->is_dropped_frame = false;
+ cm->showable_frame = 0;
+ cpi_data->frame_size = 0;
+ cpi->available_bs_size = cpi_data->cx_data_sz;
+#if CONFIG_INTERNAL_STATS
+ struct aom_usec_timer cmptimer;
+ aom_usec_timer_start(&cmptimer);
+#endif
+ av1_set_high_precision_mv(cpi, 1, 0);
+
+ // Normal defaults
+ cm->features.refresh_frame_context =
+ oxcf->tool_cfg.frame_parallel_decoding_mode
+ ? REFRESH_FRAME_CONTEXT_DISABLED
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ if (oxcf->tile_cfg.enable_large_scale_tile)
+ cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+ if (assign_cur_frame_new_fb(cm) == NULL) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Failed to allocate new cur_frame");
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ // Accumulate 2nd pass time in 2-pass case or 1 pass time in 1-pass case.
+ if (cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0)
+ start_timing(cpi, av1_encode_strategy_time);
+#endif
+
+ const int result = av1_encode_strategy(
+ cpi, &cpi_data->frame_size, cpi_data->cx_data, &cpi_data->lib_flags,
+ &cpi_data->ts_frame_start, &cpi_data->ts_frame_end,
+ cpi_data->timestamp_ratio, &cpi_data->pop_lookahead, cpi_data->flush);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0)
+ end_timing(cpi, av1_encode_strategy_time);
+
+ // Print out timing information.
+ // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of
+ // show_existing_frame and lag-in-frames.
+ if ((cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0) &&
+ cpi->frame_component_time[0] > 100) {
+ int i;
+ uint64_t frame_total = 0, total = 0;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ FRAME_UPDATE_TYPE frame_update_type =
+ get_frame_update_type(gf_group, cpi->gf_frame_index);
+
+ fprintf(stderr,
+ "\n Frame number: %d, Frame type: %s, Show Frame: %d, Frame Update "
+ "Type: %d, Q: %d\n",
+ cm->current_frame.frame_number,
+ get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame,
+ frame_update_type, cm->quant_params.base_qindex);
+ for (i = 0; i < kTimingComponents; i++) {
+ cpi->component_time[i] += cpi->frame_component_time[i];
+ // Use av1_encode_strategy_time (i = 0) as the total time.
+ if (i == 0) {
+ frame_total = cpi->frame_component_time[0];
+ total = cpi->component_time[0];
+ }
+ fprintf(stderr,
+ " %50s: %15" PRId64 " us [%6.2f%%] (total: %15" PRId64
+ " us [%6.2f%%])\n",
+ get_component_name(i), cpi->frame_component_time[i],
+ (float)((float)cpi->frame_component_time[i] * 100.0 /
+ (float)frame_total),
+ cpi->component_time[i],
+ (float)((float)cpi->component_time[i] * 100.0 / (float)total));
+ cpi->frame_component_time[i] = 0;
+ }
+ }
+#endif
+
+  // Reset the flag to 0 after encoding.
+ cpi->rc.use_external_qp_one_pass = 0;
+
+ if (result == -1) {
+ cm->error->setjmp = 0;
+ // Returning -1 indicates no frame encoded; more input is required
+ return -1;
+ }
+ if (result != AOM_CODEC_OK) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Failed to encode frame");
+ }
+#if CONFIG_INTERNAL_STATS
+ aom_usec_timer_mark(&cmptimer);
+ cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer);
+#endif // CONFIG_INTERNAL_STATS
+
+#if CONFIG_SPEED_STATS
+ if (!is_stat_generation_stage(cpi) && !cm->show_existing_frame) {
+ cpi->tx_search_count += cpi->td.mb.txfm_search_info.tx_search_count;
+ cpi->td.mb.txfm_search_info.tx_search_count = 0;
+ }
+#endif // CONFIG_SPEED_STATS
+
+ cm->error->setjmp = 0;
+ return AOM_CODEC_OK;
+}
+
+// Populates cpi->scaled_ref_buf corresponding to frames in a parallel encode
+// set. Also sets the bitmask 'ref_buffers_used_map'.
+void av1_scale_references_fpmt(AV1_COMP *cpi, int *ref_buffers_used_map) {
+ AV1_COMMON *cm = &cpi->common;
+ MV_REFERENCE_FRAME ref_frame;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1).
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+ const YV12_BUFFER_CONFIG *const ref =
+ get_ref_frame_yv12_buf(cm, ref_frame);
+
+ if (ref == NULL) {
+ cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+ continue;
+ }
+
+ // FPMT does not support scaling yet.
+ assert(ref->y_crop_width == cm->width &&
+ ref->y_crop_height == cm->height);
+
+ RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame);
+ cpi->scaled_ref_buf[ref_frame - 1] = buf;
+ for (int i = 0; i < cm->buffer_pool->num_frame_bufs; ++i) {
+ if (&cm->buffer_pool->frame_bufs[i] == buf) {
+ *ref_buffers_used_map |= (1 << i);
+ }
+ }
+ } else {
+ if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+ }
+ }
+}
+
+// Increments the ref_count of frame buffers referenced by cpi->scaled_ref_buf
+// corresponding to frames in a parallel encode set.
+void av1_increment_scaled_ref_counts_fpmt(BufferPool *buffer_pool,
+ int ref_buffers_used_map) {
+ for (int i = 0; i < buffer_pool->num_frame_bufs; ++i) {
+ if (ref_buffers_used_map & (1 << i)) {
+ ++buffer_pool->frame_bufs[i].ref_count;
+ }
+ }
+}
+
+// Releases cpi->scaled_ref_buf corresponding to frames in a parallel encode
+// set.
+void av1_release_scaled_references_fpmt(AV1_COMP *cpi) {
+ // TODO(isbs): only refresh the necessary frames, rather than all of them
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ RefCntBuffer *const buf = cpi->scaled_ref_buf[i];
+ if (buf != NULL) {
+ cpi->scaled_ref_buf[i] = NULL;
+ }
+ }
+}
+
+// Decrements the ref_count of frame buffers referenced by cpi->scaled_ref_buf
+// corresponding to frames in a parallel encode set.
+void av1_decrement_ref_counts_fpmt(BufferPool *buffer_pool,
+ int ref_buffers_used_map) {
+ for (int i = 0; i < buffer_pool->num_frame_bufs; ++i) {
+ if (ref_buffers_used_map & (1 << i)) {
+ --buffer_pool->frame_bufs[i].ref_count;
+ }
+ }
+}
+
+// Initialize parallel frame contexts with screen content decisions.
+void av1_init_sc_decisions(AV1_PRIMARY *const ppi) {
+ AV1_COMP *const first_cpi = ppi->cpi;
+ for (int i = 1; i < ppi->num_fp_contexts; ++i) {
+ AV1_COMP *cur_cpi = ppi->parallel_cpi[i];
+ cur_cpi->common.features.allow_screen_content_tools =
+ first_cpi->common.features.allow_screen_content_tools;
+ cur_cpi->common.features.allow_intrabc =
+ first_cpi->common.features.allow_intrabc;
+ cur_cpi->use_screen_content_tools = first_cpi->use_screen_content_tools;
+ cur_cpi->is_screen_content_type = first_cpi->is_screen_content_type;
+ }
+}
+
+AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi,
+ AV1_COMP_DATA *const first_cpi_data) {
+ int cpi_idx = 0;
+
+ // Loop over parallel_cpi to find the cpi that processed the current
+ // gf_frame_index ahead of time.
+ for (int i = 1; i < ppi->num_fp_contexts; i++) {
+ if (ppi->cpi->gf_frame_index == ppi->parallel_cpi[i]->gf_frame_index) {
+ cpi_idx = i;
+ break;
+ }
+ }
+
+ assert(cpi_idx > 0);
+ assert(!ppi->parallel_cpi[cpi_idx]->common.show_existing_frame);
+
+ // Release the previously-used frame-buffer.
+ if (ppi->cpi->common.cur_frame != NULL) {
+ --ppi->cpi->common.cur_frame->ref_count;
+ ppi->cpi->common.cur_frame = NULL;
+ }
+
+ // Swap the appropriate parallel_cpi with the parallel_cpi[0].
+ ppi->cpi = ppi->parallel_cpi[cpi_idx];
+ ppi->parallel_cpi[cpi_idx] = ppi->parallel_cpi[0];
+ ppi->parallel_cpi[0] = ppi->cpi;
+
+ // Copy appropriate parallel_frames_data to local data.
+ {
+ AV1_COMP_DATA *data = &ppi->parallel_frames_data[cpi_idx - 1];
+ assert(data->frame_size > 0);
+ assert(first_cpi_data->cx_data_sz > data->frame_size);
+
+ first_cpi_data->lib_flags = data->lib_flags;
+ first_cpi_data->ts_frame_start = data->ts_frame_start;
+ first_cpi_data->ts_frame_end = data->ts_frame_end;
+ memcpy(first_cpi_data->cx_data, data->cx_data, data->frame_size);
+ first_cpi_data->frame_size = data->frame_size;
+ if (ppi->cpi->common.show_frame) {
+ first_cpi_data->pop_lookahead = 1;
+ }
+ }
+
+ return ppi->cpi;
+}
+
+// Initializes frames belonging to a parallel encode set.
+int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data,
+ AV1_PRIMARY *const ppi,
+ int *ref_buffers_used_map) {
+ AV1_COMP *const first_cpi = ppi->cpi;
+ GF_GROUP *const gf_group = &ppi->gf_group;
+ int gf_index_start = first_cpi->gf_frame_index;
+ assert(gf_group->frame_parallel_level[gf_index_start] == 1);
+ int parallel_frame_count = 0;
+ int cur_frame_num = first_cpi->common.current_frame.frame_number;
+ int show_frame_count = first_cpi->frame_index_set.show_frame_count;
+ int frames_since_key = first_cpi->rc.frames_since_key;
+ int frames_to_key = first_cpi->rc.frames_to_key;
+ int frames_to_fwd_kf = first_cpi->rc.frames_to_fwd_kf;
+ int cur_frame_disp = cur_frame_num + gf_group->arf_src_offset[gf_index_start];
+ const FIRSTPASS_STATS *stats_in = first_cpi->twopass_frame.stats_in;
+
+ assert(*ref_buffers_used_map == 0);
+
+  // Release the frame buffer previously used by a frame_parallel_level 1
+  // frame.
+ if (first_cpi->common.cur_frame != NULL) {
+ --first_cpi->common.cur_frame->ref_count;
+ first_cpi->common.cur_frame = NULL;
+ }
+
+ RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
+ RefFrameMapPair first_ref_frame_map_pairs[REF_FRAMES];
+ init_ref_map_pair(first_cpi, first_ref_frame_map_pairs);
+ memcpy(ref_frame_map_pairs, first_ref_frame_map_pairs,
+ sizeof(RefFrameMapPair) * REF_FRAMES);
+
+ // Store the reference refresh index of frame_parallel_level 1 frame in a
+ // parallel encode set of lower layer frames.
+ if (gf_group->update_type[gf_index_start] == INTNL_ARF_UPDATE) {
+ first_cpi->ref_refresh_index = av1_calc_refresh_idx_for_intnl_arf(
+ first_cpi, ref_frame_map_pairs, gf_index_start);
+ assert(first_cpi->ref_refresh_index != INVALID_IDX &&
+ first_cpi->ref_refresh_index < REF_FRAMES);
+ first_cpi->refresh_idx_available = true;
+ // Update ref_frame_map_pairs.
+ ref_frame_map_pairs[first_cpi->ref_refresh_index].disp_order =
+ gf_group->display_idx[gf_index_start];
+ ref_frame_map_pairs[first_cpi->ref_refresh_index].pyr_level =
+ gf_group->layer_depth[gf_index_start];
+ }
+
+ // Set do_frame_data_update flag as false for frame_parallel_level 1 frame.
+ first_cpi->do_frame_data_update = false;
+ if (gf_group->arf_src_offset[gf_index_start] == 0) {
+ first_cpi->time_stamps.prev_ts_start = ppi->ts_start_last_show_frame;
+ first_cpi->time_stamps.prev_ts_end = ppi->ts_end_last_show_frame;
+ }
+
+ av1_get_ref_frames(first_ref_frame_map_pairs, cur_frame_disp, first_cpi,
+ gf_index_start, 1, first_cpi->common.remapped_ref_idx);
+
+ av1_scale_references_fpmt(first_cpi, ref_buffers_used_map);
+ parallel_frame_count++;
+
+ // Iterate through the GF_GROUP to find the remaining frame_parallel_level 2
+ // frames which are part of the current parallel encode set and initialize the
+ // required cpi elements.
+ for (int i = gf_index_start + 1; i < gf_group->size; i++) {
+ // Update frame counters if previous frame was show frame or show existing
+ // frame.
+ if (gf_group->arf_src_offset[i - 1] == 0) {
+ cur_frame_num++;
+ show_frame_count++;
+ if (frames_to_fwd_kf <= 0)
+ frames_to_fwd_kf = first_cpi->oxcf.kf_cfg.fwd_kf_dist;
+ if (frames_to_key) {
+ frames_since_key++;
+ frames_to_key--;
+ frames_to_fwd_kf--;
+ }
+ stats_in++;
+ }
+ cur_frame_disp = cur_frame_num + gf_group->arf_src_offset[i];
+ if (gf_group->frame_parallel_level[i] == 2) {
+ AV1_COMP *cur_cpi = ppi->parallel_cpi[parallel_frame_count];
+ AV1_COMP_DATA *cur_cpi_data =
+ &ppi->parallel_frames_data[parallel_frame_count - 1];
+ cur_cpi->gf_frame_index = i;
+ cur_cpi->framerate = first_cpi->framerate;
+ cur_cpi->common.current_frame.frame_number = cur_frame_num;
+ cur_cpi->common.current_frame.frame_type = gf_group->frame_type[i];
+ cur_cpi->frame_index_set.show_frame_count = show_frame_count;
+ cur_cpi->rc.frames_since_key = frames_since_key;
+ cur_cpi->rc.frames_to_key = frames_to_key;
+ cur_cpi->rc.frames_to_fwd_kf = frames_to_fwd_kf;
+ cur_cpi->rc.active_worst_quality = first_cpi->rc.active_worst_quality;
+ cur_cpi->rc.avg_frame_bandwidth = first_cpi->rc.avg_frame_bandwidth;
+ cur_cpi->rc.max_frame_bandwidth = first_cpi->rc.max_frame_bandwidth;
+ cur_cpi->rc.min_frame_bandwidth = first_cpi->rc.min_frame_bandwidth;
+ cur_cpi->rc.intervals_till_gf_calculate_due =
+ first_cpi->rc.intervals_till_gf_calculate_due;
+ cur_cpi->mv_search_params.max_mv_magnitude =
+ first_cpi->mv_search_params.max_mv_magnitude;
+ if (gf_group->update_type[cur_cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+ cur_cpi->common.lf.mode_ref_delta_enabled = 1;
+ }
+ cur_cpi->do_frame_data_update = false;
+ // Initialize prev_ts_start and prev_ts_end for show frame(s) and show
+ // existing frame(s).
+ if (gf_group->arf_src_offset[i] == 0) {
+ // Choose source of prev frame.
+ int src_index = gf_group->src_offset[i];
+ struct lookahead_entry *prev_source = av1_lookahead_peek(
+ ppi->lookahead, src_index - 1, cur_cpi->compressor_stage);
+ // Save timestamps of prev frame.
+ cur_cpi->time_stamps.prev_ts_start = prev_source->ts_start;
+ cur_cpi->time_stamps.prev_ts_end = prev_source->ts_end;
+ }
+ cur_cpi->time_stamps.first_ts_start =
+ first_cpi->time_stamps.first_ts_start;
+
+ memcpy(cur_cpi->common.ref_frame_map, first_cpi->common.ref_frame_map,
+ sizeof(first_cpi->common.ref_frame_map));
+ cur_cpi_data->lib_flags = 0;
+ cur_cpi_data->timestamp_ratio = first_cpi_data->timestamp_ratio;
+ cur_cpi_data->flush = first_cpi_data->flush;
+ cur_cpi_data->frame_size = 0;
+ if (gf_group->update_type[gf_index_start] == INTNL_ARF_UPDATE) {
+ // If the first frame in a parallel encode set is INTNL_ARF_UPDATE
+ // frame, initialize lib_flags of frame_parallel_level 2 frame in the
+ // set with that of frame_parallel_level 1 frame.
+ cur_cpi_data->lib_flags = first_cpi_data->lib_flags;
+ // Store the reference refresh index of frame_parallel_level 2 frame in
+ // a parallel encode set of lower layer frames.
+ cur_cpi->ref_refresh_index =
+ av1_calc_refresh_idx_for_intnl_arf(cur_cpi, ref_frame_map_pairs, i);
+ cur_cpi->refresh_idx_available = true;
+ // Skip the reference frame which will be refreshed by
+ // frame_parallel_level 1 frame in a parallel encode set of lower layer
+ // frames.
+ cur_cpi->ref_idx_to_skip = first_cpi->ref_refresh_index;
+ } else {
+ cur_cpi->ref_idx_to_skip = INVALID_IDX;
+ cur_cpi->ref_refresh_index = INVALID_IDX;
+ cur_cpi->refresh_idx_available = false;
+ }
+ cur_cpi->twopass_frame.stats_in = stats_in;
+
+ av1_get_ref_frames(first_ref_frame_map_pairs, cur_frame_disp, cur_cpi, i,
+ 1, cur_cpi->common.remapped_ref_idx);
+ av1_scale_references_fpmt(cur_cpi, ref_buffers_used_map);
+ parallel_frame_count++;
+ }
+
+ // Set do_frame_data_update to true for the last frame_parallel_level 2
+ // frame in the current parallel encode set.
+ if (i == (gf_group->size - 1) ||
+ (gf_group->frame_parallel_level[i + 1] == 0 &&
+ (gf_group->update_type[i + 1] == ARF_UPDATE ||
+ gf_group->update_type[i + 1] == INTNL_ARF_UPDATE)) ||
+ gf_group->frame_parallel_level[i + 1] == 1) {
+ ppi->parallel_cpi[parallel_frame_count - 1]->do_frame_data_update = true;
+ break;
+ }
+ }
+
+ av1_increment_scaled_ref_counts_fpmt(first_cpi->common.buffer_pool,
+ *ref_buffers_used_map);
+
+ // Return the number of frames in the parallel encode set.
+ return parallel_frame_count;
+}
+
+int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
+ AV1_COMMON *cm = &cpi->common;
+ if (!cm->show_frame) {
+ return -1;
+ } else {
+ int ret;
+ if (cm->cur_frame != NULL && !cpi->oxcf.algo_cfg.skip_postproc_filtering) {
+ *dest = cm->cur_frame->buf;
+ dest->y_width = cm->width;
+ dest->y_height = cm->height;
+ dest->uv_width = cm->width >> cm->seq_params->subsampling_x;
+ dest->uv_height = cm->height >> cm->seq_params->subsampling_y;
+ ret = 0;
+ } else {
+ ret = -1;
+ }
+ return ret;
+ }
+}
+
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) {
+ if (cpi->last_show_frame_buf == NULL ||
+ cpi->oxcf.algo_cfg.skip_postproc_filtering)
+ return -1;
+
+ *frame = cpi->last_show_frame_buf->buf;
+ return 0;
+}
+
+aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *new_frame,
+ YV12_BUFFER_CONFIG *sd) {
+ const int num_planes = av1_num_planes(cm);
+ if (!equal_dimensions_and_border(new_frame, sd))
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ else
+ aom_yv12_copy_frame(new_frame, sd, num_planes);
+
+ return cm->error->error_code;
+}
+
+int av1_set_internal_size(AV1EncoderConfig *const oxcf,
+ ResizePendingParams *resize_pending_params,
+ AOM_SCALING_MODE horiz_mode,
+ AOM_SCALING_MODE vert_mode) {
+ int hr = 0, hs = 0, vr = 0, vs = 0;
+
+ // Checks for invalid AOM_SCALING_MODE values.
+ if (horiz_mode > AOME_ONETHREE || vert_mode > AOME_ONETHREE) return -1;
+
+ Scale2Ratio(horiz_mode, &hr, &hs);
+ Scale2Ratio(vert_mode, &vr, &vs);
+
+  // Always round up to the next whole number (ceiling division).
+ resize_pending_params->width = (hs - 1 + oxcf->frm_dim_cfg.width * hr) / hs;
+ resize_pending_params->height = (vs - 1 + oxcf->frm_dim_cfg.height * vr) / vs;
+
+ if (horiz_mode != AOME_NORMAL || vert_mode != AOME_NORMAL) {
+ oxcf->resize_cfg.resize_mode = RESIZE_FIXED;
+ oxcf->algo_cfg.enable_tpl_model = 0;
+ }
+ return 0;
+}
+
+int av1_get_quantizer(AV1_COMP *cpi) {
+ return cpi->common.quant_params.base_qindex;
+}
+
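+// Converts a frame's OBUs from the Section 5 layout (each OBU header has
+// obu_has_size_field set) to the length-delimited layout used by Annex B:
+// each OBU is prefixed with a leb128-coded size of (header + payload) and
+// the obu_has_size_field bit is cleared. The conversion is done in place.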
+int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) {
+ size_t output_size = 0;
+ size_t total_bytes_read = 0;
+ size_t remaining_size = *frame_size;
+ uint8_t *buff_ptr = buffer;
+
+  // Go through each OBU.
+ while (total_bytes_read < *frame_size) {
+ uint8_t saved_obu_header[2];
+ uint64_t obu_payload_size;
+ size_t length_of_payload_size;
+ size_t length_of_obu_size;
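+    // Bit 2 of the first header byte is the obu_extension_flag; if set, the
+    // header carries an extra extension byte.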
+ uint32_t obu_header_size = (buff_ptr[0] >> 2) & 0x1 ? 2 : 1;
+ size_t obu_bytes_read = obu_header_size; // bytes read for current obu
+
+ // save the obu header (1 or 2 bytes)
+ memmove(saved_obu_header, buff_ptr, obu_header_size);
+ // clear the obu_has_size_field
+ saved_obu_header[0] = saved_obu_header[0] & (~0x2);
+
+ // get the payload_size and length of payload_size
+ if (aom_uleb_decode(buff_ptr + obu_header_size, remaining_size,
+ &obu_payload_size, &length_of_payload_size) != 0) {
+ return AOM_CODEC_ERROR;
+ }
+ obu_bytes_read += length_of_payload_size;
+
+    // calculate how many bytes the leb128-coded size of (header + payload)
+    // will occupy
+ length_of_obu_size =
+ aom_uleb_size_in_bytes((uint64_t)(obu_header_size + obu_payload_size));
+
+ // move the rest of data to new location
+ memmove(buff_ptr + length_of_obu_size + obu_header_size,
+ buff_ptr + obu_bytes_read, remaining_size - obu_bytes_read);
+ obu_bytes_read += (size_t)obu_payload_size;
+
+ // write the new obu size
+ const uint64_t obu_size = obu_header_size + obu_payload_size;
+ size_t coded_obu_size;
+ if (aom_uleb_encode(obu_size, sizeof(obu_size), buff_ptr,
+ &coded_obu_size) != 0) {
+ return AOM_CODEC_ERROR;
+ }
+
+ // write the saved (modified) obu_header following obu size
+ memmove(buff_ptr + length_of_obu_size, saved_obu_header, obu_header_size);
+
+ total_bytes_read += obu_bytes_read;
+ remaining_size -= obu_bytes_read;
+ buff_ptr += length_of_obu_size + obu_size;
+ output_size += length_of_obu_size + (size_t)obu_size;
+ }
+
+ *frame_size = output_size;
+ return AOM_CODEC_OK;
+}
+
+static void rtc_set_updates_ref_frame_config(
+ ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags,
+ RTC_REF *const rtc_ref) {
+ ext_refresh_frame_flags->update_pending = 1;
+ ext_refresh_frame_flags->last_frame = rtc_ref->refresh[rtc_ref->ref_idx[0]];
+ ext_refresh_frame_flags->golden_frame = rtc_ref->refresh[rtc_ref->ref_idx[3]];
+ ext_refresh_frame_flags->bwd_ref_frame =
+ rtc_ref->refresh[rtc_ref->ref_idx[4]];
+ ext_refresh_frame_flags->alt2_ref_frame =
+ rtc_ref->refresh[rtc_ref->ref_idx[5]];
+ ext_refresh_frame_flags->alt_ref_frame =
+ rtc_ref->refresh[rtc_ref->ref_idx[6]];
+ rtc_ref->non_reference_frame = 1;
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if (rtc_ref->refresh[i] == 1) {
+ rtc_ref->non_reference_frame = 0;
+ break;
+ }
+ }
+}
+
+static int rtc_set_references_external_ref_frame_config(AV1_COMP *cpi) {
+ // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+ // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ int ref = AOM_REFFRAME_ALL;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ if (!cpi->ppi->rtc_ref.reference[i]) ref ^= (1 << i);
+ }
+ return ref;
+}
+
+void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
+ // TODO(yunqingwang): For what references to use, external encoding flags
+ // should be consistent with internal reference frame selection. Need to
+  // ensure that there is no conflict between the two. In the AV1 encoder, the
+  // priority ranking of the 7 reference frames is: LAST, ALTREF, LAST2,
+  // LAST3, GOLDEN, BWDREF, ALTREF2.
+
+ ExternalFlags *const ext_flags = &cpi->ext_flags;
+ ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+ &ext_flags->refresh_frame;
+ ext_flags->ref_frame_flags = AOM_REFFRAME_ALL;
+ if (flags &
+ (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+ AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
+ AOM_EFLAG_NO_REF_ARF2)) {
+ int ref = AOM_REFFRAME_ALL;
+
+ if (flags & AOM_EFLAG_NO_REF_LAST) ref ^= AOM_LAST_FLAG;
+ if (flags & AOM_EFLAG_NO_REF_LAST2) ref ^= AOM_LAST2_FLAG;
+ if (flags & AOM_EFLAG_NO_REF_LAST3) ref ^= AOM_LAST3_FLAG;
+
+ if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG;
+
+ if (flags & AOM_EFLAG_NO_REF_ARF) {
+ ref ^= AOM_ALT_FLAG;
+ ref ^= AOM_BWD_FLAG;
+ ref ^= AOM_ALT2_FLAG;
+ } else {
+ if (flags & AOM_EFLAG_NO_REF_BWD) ref ^= AOM_BWD_FLAG;
+ if (flags & AOM_EFLAG_NO_REF_ARF2) ref ^= AOM_ALT2_FLAG;
+ }
+
+ av1_use_as_reference(&ext_flags->ref_frame_flags, ref);
+ } else {
+ if (cpi->ppi->rtc_ref.set_ref_frame_config) {
+ int ref = rtc_set_references_external_ref_frame_config(cpi);
+ av1_use_as_reference(&ext_flags->ref_frame_flags, ref);
+ }
+ }
+
+ if (flags &
+ (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF)) {
+ int upd = AOM_REFFRAME_ALL;
+
+ // Refreshing LAST/LAST2/LAST3 is handled by 1 common flag.
+ if (flags & AOM_EFLAG_NO_UPD_LAST) upd ^= AOM_LAST_FLAG;
+
+ if (flags & AOM_EFLAG_NO_UPD_GF) upd ^= AOM_GOLD_FLAG;
+
+ if (flags & AOM_EFLAG_NO_UPD_ARF) {
+ upd ^= AOM_ALT_FLAG;
+ upd ^= AOM_BWD_FLAG;
+ upd ^= AOM_ALT2_FLAG;
+ }
+
+ ext_refresh_frame_flags->last_frame = (upd & AOM_LAST_FLAG) != 0;
+ ext_refresh_frame_flags->golden_frame = (upd & AOM_GOLD_FLAG) != 0;
+ ext_refresh_frame_flags->alt_ref_frame = (upd & AOM_ALT_FLAG) != 0;
+ ext_refresh_frame_flags->bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0;
+ ext_refresh_frame_flags->alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0;
+ ext_refresh_frame_flags->update_pending = 1;
+ } else {
+ if (cpi->ppi->rtc_ref.set_ref_frame_config)
+ rtc_set_updates_ref_frame_config(ext_refresh_frame_flags,
+ &cpi->ppi->rtc_ref);
+ else
+ ext_refresh_frame_flags->update_pending = 0;
+ }
+
+ ext_flags->use_ref_frame_mvs = cpi->oxcf.tool_cfg.enable_ref_frame_mvs &
+ ((flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0);
+ ext_flags->use_error_resilient = cpi->oxcf.tool_cfg.error_resilient_mode |
+ ((flags & AOM_EFLAG_ERROR_RESILIENT) != 0);
+ ext_flags->use_s_frame =
+ cpi->oxcf.kf_cfg.enable_sframe | ((flags & AOM_EFLAG_SET_S_FRAME) != 0);
+ ext_flags->use_primary_ref_none =
+ (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0;
+
+ if (flags & AOM_EFLAG_NO_UPD_ENTROPY) {
+ update_entropy(&ext_flags->refresh_frame_context,
+ &ext_flags->refresh_frame_context_pending, 0);
+ }
+}
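+
+// Usage sketch (illustrative): calling av1_apply_encoding_flags() with
+// flags = AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_UPD_GF removes AOM_LAST_FLAG
+// from ext_flags->ref_frame_flags, clears golden_frame in the refresh flags,
+// and sets update_pending to 1.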
+
+aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi) {
+ if (!ppi) return NULL;
+
+ uint8_t header_buf[512] = { 0 };
+ const uint32_t sequence_header_size =
+ av1_write_sequence_header_obu(&ppi->seq_params, &header_buf[0]);
+ assert(sequence_header_size <= sizeof(header_buf));
+ if (sequence_header_size == 0) return NULL;
+
+ const size_t obu_header_size = 1;
+ const size_t size_field_size = aom_uleb_size_in_bytes(sequence_header_size);
+ const size_t payload_offset = obu_header_size + size_field_size;
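+  // Resulting layout (illustrative): [OBU header (1 byte)][leb128 size field]
+  // [sequence header payload]. The payload is written at offset 0 first, then
+  // moved up to make room for the header and the size field.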
+
+ if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL;
+ memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size);
+
+ if (av1_write_obu_header(&ppi->level_params, &ppi->cpi->frame_header_count,
+ OBU_SEQUENCE_HEADER, 0,
+ &header_buf[0]) != obu_header_size) {
+ return NULL;
+ }
+
+ size_t coded_size_field_size = 0;
+ if (aom_uleb_encode(sequence_header_size, size_field_size,
+ &header_buf[obu_header_size],
+ &coded_size_field_size) != 0) {
+ return NULL;
+ }
+ assert(coded_size_field_size == size_field_size);
+
+ aom_fixed_buf_t *global_headers =
+ (aom_fixed_buf_t *)malloc(sizeof(*global_headers));
+ if (!global_headers) return NULL;
+
+ const size_t global_header_buf_size =
+ obu_header_size + size_field_size + sequence_header_size;
+
+ global_headers->buf = malloc(global_header_buf_size);
+ if (!global_headers->buf) {
+ free(global_headers);
+ return NULL;
+ }
+
+ memcpy(global_headers->buf, &header_buf[0], global_header_buf_size);
+ global_headers->sz = global_header_buf_size;
+ return global_headers;
+}
diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h
new file mode 100644
index 0000000000..5f6f67eda8
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder.h
@@ -0,0 +1,4512 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Declares top-level encoder structures and functions.
+ */
+#ifndef AOM_AV1_ENCODER_ENCODER_H_
+#define AOM_AV1_ENCODER_ENCODER_H_
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aomcx.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/resize.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/timing.h"
+
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/external_partition.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/global_motion.h"
+#include "av1/encoder/level.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/pickcdef.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/svc_layercontext.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/thirdpass.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/av1_noise_estimate.h"
+#include "av1/encoder/bitstream.h"
+
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_dsp/variance.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/noise_model.h"
+#endif
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+#if CONFIG_AV1_TEMPORAL_DENOISING
+#include "av1/encoder/av1_temporal_denoiser.h"
+#endif
+#if CONFIG_TUNE_BUTTERAUGLI
+#include "av1/encoder/tune_butteraugli.h"
+#endif
+
+#include "aom/internal/aom_codec_internal.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TODO(yunqing, any): Added suppression tag to quiet Doxygen warnings. Need to
+// adjust it while we work on documentation.
+/*!\cond */
+// Number of frames required to test for scene cut detection
+#define SCENE_CUT_KEY_TEST_INTERVAL 16
+
+// Lookahead index threshold to enable temporal filtering for second arf.
+#define TF_LOOKAHEAD_IDX_THR 7
+
+#define HDR_QP_LEVELS 10
+#define CHROMA_CB_QP_SCALE 1.04
+#define CHROMA_CR_QP_SCALE 1.04
+#define CHROMA_QP_SCALE -0.46
+#define CHROMA_QP_OFFSET 9.26
+#define QP_SCALE_FACTOR 2.0
+#define DISABLE_HDR_LUMA_DELTAQ 1
+
+// Rational number with an int64 numerator
+// This structure holds a fractional value
+typedef struct aom_rational64 {
+ int64_t num; // fraction numerator
+ int den; // fraction denominator
+} aom_rational64_t;  // alias for struct aom_rational64
+
+enum {
+ // Good Quality Fast Encoding. The encoder balances quality with the amount of
+ // time it takes to encode the output. Speed setting controls how fast.
+ GOOD,
+ // Realtime Fast Encoding. Will force some restrictions on bitrate
+ // constraints.
+ REALTIME,
+ // All intra mode. All the frames are coded as intra frames.
+ ALLINTRA
+} UENUM1BYTE(MODE);
+
+enum {
+ FRAMEFLAGS_KEY = 1 << 0,
+ FRAMEFLAGS_GOLDEN = 1 << 1,
+ FRAMEFLAGS_BWDREF = 1 << 2,
+ // TODO(zoeliu): To determine whether a frame flag is needed for ALTREF2_FRAME
+ FRAMEFLAGS_ALTREF = 1 << 3,
+ FRAMEFLAGS_INTRAONLY = 1 << 4,
+ FRAMEFLAGS_SWITCH = 1 << 5,
+ FRAMEFLAGS_ERROR_RESILIENT = 1 << 6,
+} UENUM1BYTE(FRAMETYPE_FLAGS);
+
+#if CONFIG_FPMT_TEST
+enum {
+ PARALLEL_ENCODE = 0,
+ PARALLEL_SIMULATION_ENCODE,
+ NUM_FPMT_TEST_ENCODES
+} UENUM1BYTE(FPMT_TEST_ENC_CFG);
+#endif // CONFIG_FPMT_TEST
+// 0 level frames are sometimes used for rate control purposes, but for
+// reference mapping purposes, the minimum level should be 1.
+#define MIN_PYR_LEVEL 1
+static INLINE int get_true_pyr_level(int frame_level, int frame_order,
+ int max_layer_depth) {
+ if (frame_order == 0) {
+ // Keyframe case
+ return MIN_PYR_LEVEL;
+ } else if (frame_level == MAX_ARF_LAYERS) {
+ // Leaves
+ return max_layer_depth;
+ } else if (frame_level == (MAX_ARF_LAYERS + 1)) {
+ // Altrefs
+ return MIN_PYR_LEVEL;
+ }
+ return AOMMAX(MIN_PYR_LEVEL, frame_level);
+}
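+
+// Worked example (illustrative): with max_layer_depth = 4, a keyframe
+// (frame_order == 0) maps to MIN_PYR_LEVEL, a leaf frame
+// (frame_level == MAX_ARF_LAYERS) maps to 4, and an altref
+// (frame_level == MAX_ARF_LAYERS + 1) maps back to MIN_PYR_LEVEL.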
+
+enum {
+ NO_AQ = 0,
+ VARIANCE_AQ = 1,
+ COMPLEXITY_AQ = 2,
+ CYCLIC_REFRESH_AQ = 3,
+ AQ_MODE_COUNT // This should always be the last member of the enum
+} UENUM1BYTE(AQ_MODE);
+enum {
+ NO_DELTA_Q = 0,
+ DELTA_Q_OBJECTIVE = 1, // Modulation to improve objective quality
+ DELTA_Q_PERCEPTUAL = 2, // Modulation to improve video perceptual quality
+ DELTA_Q_PERCEPTUAL_AI = 3, // Perceptual quality opt for all intra mode
+ DELTA_Q_USER_RATING_BASED = 4, // User rating based delta q mode
+ DELTA_Q_HDR = 5, // QP adjustment based on HDR block pixel average
+ DELTA_Q_MODE_COUNT // This should always be the last member of the enum
+} UENUM1BYTE(DELTAQ_MODE);
+
+enum {
+ RESIZE_NONE = 0, // No frame resizing allowed.
+ RESIZE_FIXED = 1, // All frames are coded at the specified scale.
+ RESIZE_RANDOM = 2, // All frames are coded at a random scale.
+ RESIZE_DYNAMIC = 3, // Frames coded at lower scale based on rate control.
+ RESIZE_MODES
+} UENUM1BYTE(RESIZE_MODE);
+
+enum {
+ SS_CFG_SRC = 0,
+ SS_CFG_LOOKAHEAD = 1,
+ SS_CFG_FPF = 2,
+ SS_CFG_TOTAL = 3
+} UENUM1BYTE(SS_CFG_OFFSET);
+
+enum {
+ DISABLE_SCENECUT, // For LAP, lag_in_frames < 19
+ ENABLE_SCENECUT_MODE_1, // For LAP, lag_in_frames >=19 and < 33
+ ENABLE_SCENECUT_MODE_2 // For twopass and LAP - lag_in_frames >=33
+} UENUM1BYTE(SCENECUT_MODE);
+
+#define MAX_VBR_CORPUS_COMPLEXITY 10000
+
+typedef enum {
+ MOD_FP, // First pass
+ MOD_TF, // Temporal filtering
+ MOD_TPL, // TPL
+ MOD_GME, // Global motion estimation
+ MOD_ENC, // Encode stage
+ MOD_LPF, // Deblocking loop filter
+ MOD_CDEF_SEARCH, // CDEF search
+ MOD_CDEF, // CDEF frame
+ MOD_LR, // Loop restoration filtering
+ MOD_PACK_BS, // Pack bitstream
+ MOD_FRAME_ENC, // Frame Parallel encode
+ MOD_AI, // All intra
+ NUM_MT_MODULES
+} MULTI_THREADED_MODULES;
+
+/*!\endcond */
+
+/*!\enum COST_UPDATE_TYPE
+ * \brief This enum controls how often the entropy costs should be updated.
+ * \warning In case of any modifications/additions done to the enum
+ * COST_UPDATE_TYPE, the enum INTERNAL_COST_UPDATE_TYPE needs to be updated as
+ * well.
+ */
+typedef enum {
+ COST_UPD_SB, /*!< Update every sb. */
+  COST_UPD_SBROW, /*!< Update every sb row inside a tile. */
+ COST_UPD_TILE, /*!< Update every tile. */
+ COST_UPD_OFF, /*!< Turn off cost updates. */
+ NUM_COST_UPDATE_TYPES, /*!< Number of cost update types. */
+} COST_UPDATE_TYPE;
+
+/*!\enum LOOPFILTER_CONTROL
+ * \brief This enum controls to which frames loopfilter is applied.
+ */
+typedef enum {
+ LOOPFILTER_NONE = 0, /*!< Disable loopfilter on all frames. */
+ LOOPFILTER_ALL = 1, /*!< Enable loopfilter for all frames. */
+ LOOPFILTER_REFERENCE = 2, /*!< Disable loopfilter on non reference frames. */
+ LOOPFILTER_SELECTIVELY =
+ 3, /*!< Disable loopfilter on frames with low motion. */
+} LOOPFILTER_CONTROL;
+
+/*!\enum SKIP_APPLY_POSTPROC_FILTER
+ * \brief This enum controls the application of post-processing filters on a
+ * reconstructed frame.
+ */
+typedef enum {
+ SKIP_APPLY_RESTORATION = 1 << 0,
+ SKIP_APPLY_SUPERRES = 1 << 1,
+ SKIP_APPLY_CDEF = 1 << 2,
+ SKIP_APPLY_LOOPFILTER = 1 << 3,
+} SKIP_APPLY_POSTPROC_FILTER;
+
+/*!
+ * \brief Encoder config related to resize.
+ */
+typedef struct {
+ /*!
+ * Indicates the frame resize mode to be used by the encoder.
+ */
+ RESIZE_MODE resize_mode;
+ /*!
+ * Indicates the denominator for resize of inter frames, assuming 8 as the
+ * numerator. Its value ranges between 8-16.
+ */
+ uint8_t resize_scale_denominator;
+ /*!
+ * Indicates the denominator for resize of key frames, assuming 8 as the
+ * numerator. Its value ranges between 8-16.
+ */
+ uint8_t resize_kf_scale_denominator;
+} ResizeCfg;
+
+/*!
+ * \brief Encoder config for coding block partitioning.
+ */
+typedef struct {
+ /*!
+   * Flag to indicate if rectangular partitions should be enabled.
+ */
+ bool enable_rect_partitions;
+ /*!
+ * Flag to indicate if AB partitions should be enabled.
+ */
+ bool enable_ab_partitions;
+ /*!
+ * Flag to indicate if 1:4 / 4:1 partitions should be enabled.
+ */
+ bool enable_1to4_partitions;
+ /*!
+   * Indicates the minimum partition size that should be allowed. Neither the
+   * width nor the height of a partition can be smaller than
+   * min_partition_size.
+ */
+ BLOCK_SIZE min_partition_size;
+ /*!
+   * Indicates the maximum partition size that should be allowed. Neither the
+   * width nor the height of a partition can be larger than
+   * max_partition_size.
+ */
+ BLOCK_SIZE max_partition_size;
+} PartitionCfg;
+
+/*!
+ * \brief Encoder flags for intra prediction.
+ */
+typedef struct {
+ /*!
+ * Flag to indicate if intra edge filtering process should be enabled.
+ */
+ bool enable_intra_edge_filter;
+ /*!
+ * Flag to indicate if recursive filtering based intra prediction should be
+ * enabled.
+ */
+ bool enable_filter_intra;
+ /*!
+ * Flag to indicate if smooth intra prediction modes should be enabled.
+ */
+ bool enable_smooth_intra;
+ /*!
+ * Flag to indicate if PAETH intra prediction mode should be enabled.
+ */
+ bool enable_paeth_intra;
+ /*!
+ * Flag to indicate if CFL uv intra mode should be enabled.
+ */
+ bool enable_cfl_intra;
+ /*!
+ * Flag to indicate if directional modes should be enabled.
+ */
+ bool enable_directional_intra;
+ /*!
+ * Flag to indicate if the subset of directional modes from D45 to D203 intra
+ * should be enabled. Has no effect if directional modes are disabled.
+ */
+ bool enable_diagonal_intra;
+ /*!
+ * Flag to indicate if delta angles for directional intra prediction should be
+ * enabled.
+ */
+ bool enable_angle_delta;
+ /*!
+   * Flag to indicate whether to automatically turn off several intra coding
+ * tools.
+ * This flag is only used when "--deltaq-mode=3" is true.
+ * When set to 1, the encoder will analyze the reconstruction quality
+ * as compared to the source image in the preprocessing pass.
+   * If the reconstruction quality is considered high enough, we disable
+ * the following intra coding tools, for better encoding speed:
+ * "--enable_smooth_intra",
+ * "--enable_paeth_intra",
+ * "--enable_cfl_intra",
+ * "--enable_diagonal_intra".
+ */
+ bool auto_intra_tools_off;
+} IntraModeCfg;
+
+/*!
+ * \brief Encoder flags for transform sizes and types.
+ */
+typedef struct {
+ /*!
+ * Flag to indicate if 64-pt transform should be enabled.
+ */
+ bool enable_tx64;
+ /*!
+ * Flag to indicate if flip and identity transform types should be enabled.
+ */
+ bool enable_flip_idtx;
+ /*!
+ * Flag to indicate if rectangular transform should be enabled.
+ */
+ bool enable_rect_tx;
+ /*!
+ * Flag to indicate whether or not to use a default reduced set for ext-tx
+ * rather than the potential full set of 16 transforms.
+ */
+ bool reduced_tx_type_set;
+ /*!
+ * Flag to indicate if transform type for intra blocks should be limited to
+ * DCT_DCT.
+ */
+ bool use_intra_dct_only;
+ /*!
+ * Flag to indicate if transform type for inter blocks should be limited to
+ * DCT_DCT.
+ */
+ bool use_inter_dct_only;
+ /*!
+ * Flag to indicate if intra blocks should use default transform type
+ * (mode-dependent) only.
+ */
+ bool use_intra_default_tx_only;
+ /*!
+ * Flag to indicate if transform size search should be enabled.
+ */
+ bool enable_tx_size_search;
+} TxfmSizeTypeCfg;
+
+/*!
+ * \brief Encoder flags for compound prediction modes.
+ */
+typedef struct {
+ /*!
+ * Flag to indicate if distance-weighted compound type should be enabled.
+ */
+ bool enable_dist_wtd_comp;
+ /*!
+ * Flag to indicate if masked (wedge/diff-wtd) compound type should be
+ * enabled.
+ */
+ bool enable_masked_comp;
+ /*!
+ * Flag to indicate if smooth interintra mode should be enabled.
+ */
+ bool enable_smooth_interintra;
+ /*!
+ * Flag to indicate if difference-weighted compound type should be enabled.
+ */
+ bool enable_diff_wtd_comp;
+ /*!
+ * Flag to indicate if inter-inter wedge compound type should be enabled.
+ */
+ bool enable_interinter_wedge;
+ /*!
+ * Flag to indicate if inter-intra wedge compound type should be enabled.
+ */
+ bool enable_interintra_wedge;
+} CompoundTypeCfg;
+
+/*!
+ * \brief Encoder config related to frame super-resolution.
+ */
+typedef struct {
+ /*!
+ * Indicates the qindex based threshold to be used when AOM_SUPERRES_QTHRESH
+ * mode is used for inter frames.
+ */
+ int superres_qthresh;
+ /*!
+ * Indicates the qindex based threshold to be used when AOM_SUPERRES_QTHRESH
+ * mode is used for key frames.
+ */
+ int superres_kf_qthresh;
+ /*!
+ * Indicates the denominator of the fraction that specifies the ratio between
+ * the superblock width before and after upscaling for inter frames. The
+ * numerator of this fraction is equal to the constant SCALE_NUMERATOR.
+ */
+ uint8_t superres_scale_denominator;
+ /*!
+ * Indicates the denominator of the fraction that specifies the ratio between
+ * the superblock width before and after upscaling for key frames. The
+ * numerator of this fraction is equal to the constant SCALE_NUMERATOR.
+ */
+ uint8_t superres_kf_scale_denominator;
+ /*!
+ * Indicates the Super-resolution mode to be used by the encoder.
+ */
+ aom_superres_mode superres_mode;
+ /*!
+ * Flag to indicate if super-resolution should be enabled for the sequence.
+ */
+ bool enable_superres;
+} SuperResCfg;
+
+/*!
+ * \brief Encoder config related to the coding of key frames.
+ */
+typedef struct {
+ /*!
+ * Indicates the minimum distance to a key frame.
+ */
+ int key_freq_min;
+
+ /*!
+ * Indicates the maximum distance to a key frame.
+ */
+ int key_freq_max;
+
+ /*!
+ * Indicates if temporal filtering should be applied on keyframe.
+ */
+ int enable_keyframe_filtering;
+
+ /*!
+ * Indicates the number of frames after which a frame may be coded as an
+ * S-Frame.
+ */
+ int sframe_dist;
+
+ /*!
+ * Indicates how an S-Frame should be inserted.
+   * 1: the considered frame will be made into an S-Frame only if it is an
+   *    altref frame.
+   * 2: the next altref frame will be made into an S-Frame.
+ */
+ int sframe_mode;
+
+ /*!
+   * Indicates if the encoder should autodetect scene cuts and set the
+   * keyframes.
+ */
+ bool auto_key;
+
+ /*!
+ * Indicates the forward key frame distance.
+ */
+ int fwd_kf_dist;
+
+ /*!
+ * Indicates if forward keyframe reference should be enabled.
+ */
+ bool fwd_kf_enabled;
+
+ /*!
+ * Indicates if S-Frames should be enabled for the sequence.
+ */
+ bool enable_sframe;
+
+ /*!
+ * Indicates if intra block copy prediction mode should be enabled or not.
+ */
+ bool enable_intrabc;
+} KeyFrameCfg;
+
+/*!
+ * \brief Encoder rate control configuration parameters
+ */
+typedef struct {
+ /*!\cond */
+ // BUFFERING PARAMETERS
+ /*!\endcond */
+ /*!
+ * Indicates the amount of data that will be buffered by the decoding
+ * application prior to beginning playback, and is expressed in units of
+   * time (milliseconds).
+ */
+ int64_t starting_buffer_level_ms;
+ /*!
+ * Indicates the amount of data that the encoder should try to maintain in the
+   * decoder's buffer, and is expressed in units of time (milliseconds).
+ */
+ int64_t optimal_buffer_level_ms;
+ /*!
+ * Indicates the maximum amount of data that may be buffered by the decoding
+   * application, and is expressed in units of time (milliseconds).
+ */
+ int64_t maximum_buffer_size_ms;
+
+ /*!
+ * Indicates the bandwidth to be used in bits per second.
+ */
+ int64_t target_bandwidth;
+
+ /*!
+ * Indicates average complexity of the corpus in single pass vbr based on
+ * LAP. 0 indicates that corpus complexity vbr mode is disabled.
+ */
+ unsigned int vbr_corpus_complexity_lap;
+ /*!
+ * Indicates the maximum allowed bitrate for any intra frame as % of bitrate
+ * target.
+ */
+ unsigned int max_intra_bitrate_pct;
+ /*!
+ * Indicates the maximum allowed bitrate for any inter frame as % of bitrate
+ * target.
+ */
+ unsigned int max_inter_bitrate_pct;
+ /*!
+ * Indicates the percentage of rate boost for golden frame in CBR mode.
+ */
+ unsigned int gf_cbr_boost_pct;
+ /*!
+ * min_cr / 100 indicates the target minimum compression ratio for each
+ * frame.
+ */
+ unsigned int min_cr;
+ /*!
+ * Indicates the frame drop threshold.
+ */
+ int drop_frames_water_mark;
+ /*!
+ * under_shoot_pct indicates the tolerance of the VBR algorithm to
+ * undershoot and is used as a trigger threshold for more aggressive
+   * adaptation of Q. Its value can range from 0-100.
+ */
+ int under_shoot_pct;
+ /*!
+ * over_shoot_pct indicates the tolerance of the VBR algorithm to overshoot
+ * and is used as a trigger threshold for more aggressive adaptation of Q.
+   * Its value can range from 0-1000.
+ */
+ int over_shoot_pct;
+ /*!
+ * Indicates the maximum qindex that can be used by the quantizer i.e. the
+ * worst quality qindex.
+ */
+ int worst_allowed_q;
+ /*!
+ * Indicates the minimum qindex that can be used by the quantizer i.e. the
+ * best quality qindex.
+ */
+ int best_allowed_q;
+ /*!
+ * Indicates the Constant/Constrained Quality level.
+ */
+ int cq_level;
+ /*!
+ * Indicates if the encoding mode is vbr, cbr, constrained quality or
+ * constant quality.
+ */
+ enum aom_rc_mode mode;
+ /*!
+ * Indicates the bias (expressed on a scale of 0 to 100) for determining
+ * target size for the current frame. The value 0 indicates the optimal CBR
+ * mode value should be used, and 100 indicates the optimal VBR mode value
+ * should be used.
+ */
+ int vbrbias;
+ /*!
+ * Indicates the minimum bitrate to be used for a single frame as a percentage
+ * of the target bitrate.
+ */
+ int vbrmin_section;
+ /*!
+ * Indicates the maximum bitrate to be used for a single frame as a percentage
+ * of the target bitrate.
+ */
+ int vbrmax_section;
+} RateControlCfg;
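+
+/*!\cond */
+// Illustrative sketch only (hypothetical helper; values are examples, not
+// encoder defaults): one plausible way the buffering and quality-bound fields
+// above could be populated for a low-latency CBR stream.
+static INLINE void example_fill_cbr_rc_cfg(RateControlCfg *rc_cfg) {
+  rc_cfg->mode = AOM_CBR;                  // constant bitrate mode
+  rc_cfg->target_bandwidth = 1000000;      // 1 Mbps
+  rc_cfg->starting_buffer_level_ms = 600;  // decoder pre-buffers 600 ms
+  rc_cfg->optimal_buffer_level_ms = 600;
+  rc_cfg->maximum_buffer_size_ms = 1000;
+  rc_cfg->under_shoot_pct = 50;  // trigger stronger Q adaptation at 50%
+  rc_cfg->over_shoot_pct = 50;
+  rc_cfg->best_allowed_q = 4;     // best-quality qindex bound
+  rc_cfg->worst_allowed_q = 255;  // worst-quality qindex bound
+}
+/*!\endcond */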
+
+/*!\cond */
+typedef struct {
+ // Indicates the number of frames lag before encoding is started.
+ int lag_in_frames;
+ // Indicates the minimum gf/arf interval to be used.
+ int min_gf_interval;
+ // Indicates the maximum gf/arf interval to be used.
+ int max_gf_interval;
+ // Indicates the minimum height for GF group pyramid structure to be used.
+ int gf_min_pyr_height;
+ // Indicates the maximum height for GF group pyramid structure to be used.
+ int gf_max_pyr_height;
+ // Indicates if automatic set and use of altref frames should be enabled.
+ bool enable_auto_arf;
+ // Indicates if automatic set and use of (b)ackward (r)ef (f)rames should be
+ // enabled.
+ bool enable_auto_brf;
+} GFConfig;
+
+typedef struct {
+ // Indicates the number of tile groups.
+ unsigned int num_tile_groups;
+ // Indicates the MTU size for a tile group. If mtu is non-zero,
+ // num_tile_groups is set to DEFAULT_MAX_NUM_TG.
+ unsigned int mtu;
+ // Indicates the number of tile columns in log2.
+ int tile_columns;
+ // Indicates the number of tile rows in log2.
+ int tile_rows;
+ // Indicates the number of widths in the tile_widths[] array.
+ int tile_width_count;
+ // Indicates the number of heights in the tile_heights[] array.
+ int tile_height_count;
+ // Indicates the tile widths, and may be empty.
+ int tile_widths[MAX_TILE_COLS];
+ // Indicates the tile heights, and may be empty.
+ int tile_heights[MAX_TILE_ROWS];
+ // Indicates if large scale tile coding should be used.
+ bool enable_large_scale_tile;
+ // Indicates if single tile decoding mode should be enabled.
+ bool enable_single_tile_decoding;
+ // Indicates if EXT_TILE_DEBUG should be enabled.
+ bool enable_ext_tile_debug;
+} TileConfig;
+
+typedef struct {
+ // Indicates the width of the input frame.
+ int width;
+ // Indicates the height of the input frame.
+ int height;
+ // If forced_max_frame_width is non-zero then it is used to force the maximum
+ // frame width written in write_sequence_header().
+ int forced_max_frame_width;
+  // If forced_max_frame_height is non-zero then it is used to force the
+  // maximum frame height written in write_sequence_header().
+ int forced_max_frame_height;
+ // Indicates the frame width after applying both super-resolution and resize
+ // to the coded frame.
+ int render_width;
+ // Indicates the frame height after applying both super-resolution and resize
+ // to the coded frame.
+ int render_height;
+} FrameDimensionCfg;
+
+typedef struct {
+ // Indicates if warped motion should be enabled.
+ bool enable_warped_motion;
+ // Indicates if warped motion should be evaluated or not.
+ bool allow_warped_motion;
+ // Indicates if OBMC motion should be enabled.
+ bool enable_obmc;
+} MotionModeCfg;
+
+typedef struct {
+ // Timing info for each frame.
+ aom_timing_info_t timing_info;
+ // Indicates the number of time units of a decoding clock.
+ uint32_t num_units_in_decoding_tick;
+ // Indicates if decoder model information is present in the coded sequence
+ // header.
+ bool decoder_model_info_present_flag;
+ // Indicates if display model information is present in the coded sequence
+ // header.
+ bool display_model_info_present_flag;
+ // Indicates if timing info for each frame is present.
+ bool timing_info_present;
+} DecoderModelCfg;
+
+typedef struct {
+ // Indicates the update frequency for coeff costs.
+ COST_UPDATE_TYPE coeff;
+ // Indicates the update frequency for mode costs.
+ COST_UPDATE_TYPE mode;
+ // Indicates the update frequency for mv costs.
+ COST_UPDATE_TYPE mv;
+ // Indicates the update frequency for dv costs.
+ COST_UPDATE_TYPE dv;
+} CostUpdateFreq;
+
+typedef struct {
+ // Indicates the maximum number of reference frames allowed per frame.
+ unsigned int max_reference_frames;
+ // Indicates if the reduced set of references should be enabled.
+ bool enable_reduced_reference_set;
+ // Indicates if one-sided compound should be enabled.
+ bool enable_onesided_comp;
+} RefFrameCfg;
+
+typedef struct {
+ // Indicates the color space that should be used.
+ aom_color_primaries_t color_primaries;
+ // Indicates the characteristics of transfer function to be used.
+ aom_transfer_characteristics_t transfer_characteristics;
+ // Indicates the matrix coefficients to be used for the transfer function.
+ aom_matrix_coefficients_t matrix_coefficients;
+ // Indicates the chroma 4:2:0 sample position info.
+ aom_chroma_sample_position_t chroma_sample_position;
+ // Indicates if a limited color range or full color range should be used.
+ aom_color_range_t color_range;
+} ColorCfg;
+
+typedef struct {
+ // Indicates if extreme motion vector unit test should be enabled or not.
+ unsigned int motion_vector_unit_test;
+ // Indicates if superblock multipass unit test should be enabled or not.
+ unsigned int sb_multipass_unit_test;
+} UnitTestCfg;
+
+typedef struct {
+ // Indicates the file path to the VMAF model.
+ const char *vmaf_model_path;
+ // Indicates the path to the film grain parameters.
+ const char *film_grain_table_filename;
+ // Indicates the visual tuning metric.
+ aom_tune_metric tuning;
+ // Indicates if the current content is screen or default type.
+ aom_tune_content content;
+ // Indicates the film grain parameters.
+ int film_grain_test_vector;
+ // Indicates the in-block distortion metric to use.
+ aom_dist_metric dist_metric;
+} TuneCfg;
+
+typedef struct {
+ // Indicates the framerate of the input video.
+ double init_framerate;
+ // Indicates the bit-depth of the input video.
+ unsigned int input_bit_depth;
+ // Indicates the maximum number of frames to be encoded.
+ unsigned int limit;
+  // Indicates the chroma subsampling x value.
+  unsigned int chroma_subsampling_x;
+  // Indicates the chroma subsampling y value.
+ unsigned int chroma_subsampling_y;
+} InputCfg;
+
+typedef struct {
+ // If true, encoder will use fixed QP offsets, that are either:
+ // - Given by the user, and stored in 'fixed_qp_offsets' array, OR
+ // - Picked automatically from cq_level.
+ int use_fixed_qp_offsets;
+ // Indicates the minimum flatness of the quantization matrix.
+ int qm_minlevel;
+ // Indicates the maximum flatness of the quantization matrix.
+ int qm_maxlevel;
+ // Indicates if adaptive quantize_b should be enabled.
+ int quant_b_adapt;
+ // Indicates the Adaptive Quantization mode to be used.
+ AQ_MODE aq_mode;
+ // Indicates the delta q mode to be used.
+ DELTAQ_MODE deltaq_mode;
+ // Indicates the delta q mode strength.
+ DELTAQ_MODE deltaq_strength;
+ // Indicates if delta quantization should be enabled in chroma planes.
+ bool enable_chroma_deltaq;
+  // Indicates if delta quantization should be enabled for HDR video.
+ bool enable_hdr_deltaq;
+ // Indicates if encoding with quantization matrices should be enabled.
+ bool using_qm;
+} QuantizationCfg;
+
+/*!\endcond */
+/*!
+ * \brief Algorithm configuration parameters.
+ */
+typedef struct {
+ /*!
+ * Controls the level at which rate-distortion optimization of transform
+ * coefficients favours sharpness in the block. Has no impact on RD when set
+ * to zero (default). For values 1-7, eob and skip block optimization are
+ * avoided and rdmult is adjusted in favour of block sharpness.
+ */
+ int sharpness;
+
+ /*!
+ * Indicates the trellis optimization mode of quantized coefficients.
+ * 0: disabled
+ * 1: enabled
+ * 2: enabled for rd search
+   * 3: enabled for estimate yrd search
+ */
+ int disable_trellis_quant;
+
+ /*!
+ * The maximum number of frames used to create an arf.
+ */
+ int arnr_max_frames;
+
+ /*!
+ * The temporal filter strength for arf used when creating ARFs.
+ */
+ int arnr_strength;
+
+ /*!
+ * Indicates the CDF update mode
+ * 0: no update
+   * 1: update on every frame (default)
+ * 2: selectively update
+ */
+ uint8_t cdf_update_mode;
+
+ /*!
+ * Indicates if RDO based on frame temporal dependency should be enabled.
+ */
+ bool enable_tpl_model;
+
+ /*!
+ * Indicates if coding of overlay frames for filtered ALTREF frames is
+ * enabled.
+ */
+ bool enable_overlay;
+
+ /*!
+ * Controls loop filtering
+ * 0: Loop filter is disabled for all frames
+ * 1: Loop filter is enabled for all frames
+ * 2: Loop filter is disabled for non-reference frames
+   * 3: Loop filter is disabled for frames with low motion
+ */
+ LOOPFILTER_CONTROL loopfilter_control;
+
+ /*!
+ * Indicates if the application of post-processing filters should be skipped
+ * on reconstructed frame.
+ */
+ bool skip_postproc_filtering;
+} AlgoCfg;
+/*!\cond */
+
+typedef struct {
+ // Indicates the codec bit-depth.
+ aom_bit_depth_t bit_depth;
+ // Indicates the superblock size that should be used by the encoder.
+ aom_superblock_size_t superblock_size;
+ // Indicates if loopfilter modulation should be enabled.
+ bool enable_deltalf_mode;
+ // Indicates how CDEF should be applied.
+ CDEF_CONTROL cdef_control;
+ // Indicates if loop restoration filter should be enabled.
+ bool enable_restoration;
+ // When enabled, video mode should be used even for single frame input.
+ bool force_video_mode;
+ // Indicates if the error resiliency features should be enabled.
+ bool error_resilient_mode;
+ // Indicates if frame parallel decoding feature should be enabled.
+ bool frame_parallel_decoding_mode;
+ // Indicates if the input should be encoded as monochrome.
+ bool enable_monochrome;
+ // When enabled, the encoder will use a full header even for still pictures.
+ // When disabled, a reduced header is used for still pictures.
+ bool full_still_picture_hdr;
+ // Indicates if dual interpolation filters should be enabled.
+ bool enable_dual_filter;
+ // Indicates if frame order hint should be enabled or not.
+ bool enable_order_hint;
+ // Indicates if ref_frame_mvs should be enabled at the sequence level.
+ bool ref_frame_mvs_present;
+ // Indicates if ref_frame_mvs should be enabled at the frame level.
+ bool enable_ref_frame_mvs;
+ // Indicates if interintra compound mode is enabled.
+ bool enable_interintra_comp;
+ // Indicates if global motion should be enabled.
+ bool enable_global_motion;
+ // Indicates if palette should be enabled.
+ bool enable_palette;
+} ToolCfg;
+
+/*!\endcond */
+/*!
+ * \brief Main encoder configuration data structure.
+ */
+typedef struct AV1EncoderConfig {
+ /*!\cond */
+ // Configuration related to the input video.
+ InputCfg input_cfg;
+
+ // Configuration related to frame-dimensions.
+ FrameDimensionCfg frm_dim_cfg;
+
+ /*!\endcond */
+ /*!
+ * Encoder algorithm configuration.
+ */
+ AlgoCfg algo_cfg;
+
+ /*!
+ * Configuration related to key-frames.
+ */
+ KeyFrameCfg kf_cfg;
+
+ /*!
+ * Rate control configuration
+ */
+ RateControlCfg rc_cfg;
+ /*!\cond */
+
+ // Configuration related to Quantization.
+ QuantizationCfg q_cfg;
+
+ // Internal frame size scaling.
+ ResizeCfg resize_cfg;
+
+ // Frame Super-Resolution size scaling.
+ SuperResCfg superres_cfg;
+
+ /*!\endcond */
+ /*!
+ * stats_in buffer contains all of the stats packets produced in the first
+ * pass, concatenated.
+ */
+ aom_fixed_buf_t twopass_stats_in;
+ /*!\cond */
+
+ // Configuration related to encoder toolsets.
+ ToolCfg tool_cfg;
+
+ // Configuration related to Group of frames.
+ GFConfig gf_cfg;
+
+ // Tile related configuration parameters.
+ TileConfig tile_cfg;
+
+ // Configuration related to Tune.
+ TuneCfg tune_cfg;
+
+ // Configuration related to color.
+ ColorCfg color_cfg;
+
+ // Configuration related to decoder model.
+ DecoderModelCfg dec_model_cfg;
+
+ // Configuration related to reference frames.
+ RefFrameCfg ref_frm_cfg;
+
+ // Configuration related to unit tests.
+ UnitTestCfg unit_test_cfg;
+
+ // Flags related to motion mode.
+ MotionModeCfg motion_mode_cfg;
+
+ // Flags related to intra mode search.
+ IntraModeCfg intra_mode_cfg;
+
+ // Flags related to transform size/type.
+ TxfmSizeTypeCfg txfm_cfg;
+
+ // Flags related to compound type.
+ CompoundTypeCfg comp_type_cfg;
+
+ // Partition related information.
+ PartitionCfg part_cfg;
+
+ // Configuration related to frequency of cost update.
+ CostUpdateFreq cost_upd_freq;
+
+#if CONFIG_DENOISE
+ // Indicates the noise level.
+ float noise_level;
+  // Indicates the denoiser's block size.
+  int noise_block_size;
+  // Indicates whether to apply denoising to the frame to be encoded.
+ int enable_dnl_denoising;
+#endif
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ // Noise sensitivity.
+ int noise_sensitivity;
+#endif
+ // Bit mask to specify which tier each of the 32 possible operating points
+ // conforms to.
+ unsigned int tier_mask;
+
+ // Indicates the number of pixels off the edge of a reference frame we're
+ // allowed to go when forming an inter prediction.
+ int border_in_pixels;
+
+ // Indicates the maximum number of threads that may be used by the encoder.
+ int max_threads;
+
+ // Indicates the speed preset to be used.
+ int speed;
+
+  // Indicates the target sequence level index for each operating point (OP).
+ AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+
+ // Indicates the bitstream profile to be used.
+ BITSTREAM_PROFILE profile;
+
+ /*!\endcond */
+ /*!
+   * Indicates the current encoder pass:
+   * AOM_RC_ONE_PASS = One pass encode,
+   * AOM_RC_FIRST_PASS = First pass of multiple-pass,
+   * AOM_RC_SECOND_PASS = Second pass of multiple-pass,
+   * AOM_RC_THIRD_PASS = Third pass of multiple-pass.
+ */
+ enum aom_enc_pass pass;
+ /*!\cond */
+
+ // Total number of encoding passes.
+ int passes;
+
+  // The name of the second pass output file when passes > 2.
+ const char *two_pass_output;
+
+  // The name of the second pass log file when passes > 2.
+ const char *second_pass_log;
+
+ // Indicates if the encoding is GOOD or REALTIME.
+ MODE mode;
+
+ // Indicates if row-based multi-threading should be enabled or not.
+ bool row_mt;
+
+ // Indicates if frame parallel multi-threading should be enabled or not.
+ bool fp_mt;
+
+  // Indicates if 16-bit frame buffers are to be used, i.e., the content is
+  // > 8-bit.
+ bool use_highbitdepth;
+
+  // Indicates the bitstream syntax mode. 0 indicates bitstream is saved as
+  // Section 5 bitstream, while 1 indicates the bitstream is saved in Annex B
+  // format.
+ bool save_as_annexb;
+
+ // The path for partition stats reading and writing, used in the experiment
+ // CONFIG_PARTITION_SEARCH_ORDER.
+ const char *partition_info_path;
+
+ // The flag that indicates whether we use an external rate distribution to
+ // guide adaptive quantization. It requires --deltaq-mode=3. The rate
+ // distribution map file name is stored in |rate_distribution_info|.
+ unsigned int enable_rate_guide_deltaq;
+
+ // The input file of rate distribution information used in all intra mode
+ // to determine delta quantization.
+ const char *rate_distribution_info;
+
+ // Exit the encoder when it fails to encode to a given level.
+ int strict_level_conformance;
+
+ // Max depth for the GOP after a key frame
+ int kf_max_pyr_height;
+
+ // A flag to control if we enable the superblock qp sweep for a given lambda
+ int sb_qp_sweep;
+ /*!\endcond */
+} AV1EncoderConfig;
+
+/*!\cond */
+static INLINE int is_lossless_requested(const RateControlCfg *const rc_cfg) {
+ return rc_cfg->best_allowed_q == 0 && rc_cfg->worst_allowed_q == 0;
+}
+/*!\endcond */
+
+/*!
+ * \brief Encoder-side probabilities for pruning of various AV1 tools
+ */
+typedef struct {
+ /*!
+ * obmc_probs[i][j] is the probability of OBMC being the best motion mode for
+ * jth block size and ith frame update type, averaged over past frames. If
+ * obmc_probs[i][j] < thresh, then OBMC search is pruned.
+ */
+ int obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL];
+
+ /*!
+ * warped_probs[i] is the probability of warped motion being the best motion
+ * mode for ith frame update type, averaged over past frames. If
+ * warped_probs[i] < thresh, then warped motion search is pruned.
+ */
+ int warped_probs[FRAME_UPDATE_TYPES];
+
+ /*!
+ * tx_type_probs[i][j][k] is the probability of kth tx_type being the best
+ * for jth transform size and ith frame update type, averaged over past
+ * frames. If tx_type_probs[i][j][k] < thresh, then transform search for that
+ * type is pruned.
+ */
+ int tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES];
+
+ /*!
+ * switchable_interp_probs[i][j][k] is the probability of kth interpolation
+ * filter being the best for jth filter context and ith frame update type,
+ * averaged over past frames. If switchable_interp_probs[i][j][k] < thresh,
+ * then interpolation filter search is pruned for that case.
+ */
+ int switchable_interp_probs[FRAME_UPDATE_TYPES][SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS];
+} FrameProbInfo;
+
+/*!\cond */
+
+typedef struct FRAME_COUNTS {
+// Note: This structure should only contain 'unsigned int' fields, or
+// aggregates built solely from 'unsigned int' fields/elements
+#if CONFIG_ENTROPY_STATS
+ unsigned int kf_y_mode[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][INTRA_MODES];
+ unsigned int angle_delta[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
+ unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
+ unsigned int uv_mode[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
+ unsigned int cfl_sign[CFL_JOINT_SIGNS];
+ unsigned int cfl_alpha[CFL_ALPHA_CONTEXTS][CFL_ALPHABET_SIZE];
+ unsigned int palette_y_mode[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
+ unsigned int palette_uv_mode[PALETTE_UV_MODE_CONTEXTS][2];
+ unsigned int palette_y_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ unsigned int palette_uv_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ unsigned int palette_y_color_index[PALETTE_SIZES]
+ [PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ unsigned int palette_uv_color_index[PALETTE_SIZES]
+ [PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+ unsigned int txb_skip[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS][2];
+ unsigned int eob_extra[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [EOB_COEF_CONTEXTS][2];
+ unsigned int dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS][2];
+ unsigned int coeff_lps[TX_SIZES][PLANE_TYPES][BR_CDF_SIZE - 1][LEVEL_CONTEXTS]
+ [2];
+ unsigned int eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS][2];
+ unsigned int eob_multi16[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][5];
+ unsigned int eob_multi32[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][6];
+ unsigned int eob_multi64[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][7];
+ unsigned int eob_multi128[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][8];
+ unsigned int eob_multi256[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][9];
+ unsigned int eob_multi512[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][10];
+ unsigned int eob_multi1024[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][11];
+ unsigned int coeff_lps_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [LEVEL_CONTEXTS][BR_CDF_SIZE];
+ unsigned int coeff_base_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [SIG_COEF_CONTEXTS][NUM_BASE_LEVELS + 2];
+ unsigned int coeff_base_eob_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [SIG_COEF_CONTEXTS_EOB][NUM_BASE_LEVELS + 1];
+ unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2];
+ unsigned int zeromv_mode[GLOBALMV_MODE_CONTEXTS][2];
+ unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2];
+ unsigned int drl_mode[DRL_MODE_CONTEXTS][2];
+ unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+ unsigned int wedge_idx[BLOCK_SIZES_ALL][16];
+ unsigned int interintra[BLOCK_SIZE_GROUPS][2];
+ unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+ unsigned int wedge_interintra[BLOCK_SIZES_ALL][2];
+ unsigned int compound_type[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES];
+ unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES];
+ unsigned int obmc[BLOCK_SIZES_ALL][2];
+ unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
+ unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
+ unsigned int comp_ref_type[COMP_REF_TYPE_CONTEXTS][2];
+ unsigned int uni_comp_ref[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1][2];
+ unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2];
+ unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2];
+ unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2];
+ unsigned int intrabc[2];
+
+ unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
+ unsigned int intra_tx_size[MAX_TX_CATS][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1];
+ unsigned int skip_mode[SKIP_MODE_CONTEXTS][2];
+ unsigned int skip_txfm[SKIP_CONTEXTS][2];
+ unsigned int compound_index[COMP_INDEX_CONTEXTS][2];
+ unsigned int comp_group_idx[COMP_GROUP_IDX_CONTEXTS][2];
+ unsigned int delta_q[DELTA_Q_PROBS][2];
+ unsigned int delta_lf_multi[FRAME_LF_COUNT][DELTA_LF_PROBS][2];
+ unsigned int delta_lf[DELTA_LF_PROBS][2];
+
+ unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+ unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES];
+ unsigned int filter_intra_mode[FILTER_INTRA_MODES];
+ unsigned int filter_intra[BLOCK_SIZES_ALL][2];
+ unsigned int switchable_restore[RESTORE_SWITCHABLE_TYPES];
+ unsigned int wiener_restore[2];
+ unsigned int sgrproj_restore[2];
+#endif // CONFIG_ENTROPY_STATS
+
+ unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS];
+} FRAME_COUNTS;
+
+#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
+
+typedef struct {
+ int ready;
+ double a;
+ double b;
+ double dist_mean;
+ double ld_mean;
+ double sse_mean;
+ double sse_sse_mean;
+ double sse_ld_mean;
+ int num;
+ double dist_sum;
+ double ld_sum;
+ double sse_sum;
+ double sse_sse_sum;
+ double sse_ld_sum;
+} InterModeRdModel;
+
+typedef struct {
+ int idx;
+ int64_t rd;
+} RdIdxPair;
+// TODO(angiebird): This is an estimated size. We still need to figure out what
+// the maximum number of modes is.
+#define MAX_INTER_MODES 1024
+// TODO(any): rename this struct to something else. There is already another
+// struct called inter_mode_info, which makes this terribly confusing.
+/*!\endcond */
+/*!
+ * \brief Struct used to hold inter mode data for fast tx search.
+ *
+ * This struct is used to perform a full transform search only on winning
+ * candidates searched with an estimate for transform coding RD.
+ */
+typedef struct inter_modes_info {
+ /*!
+ * The number of inter modes for which data was stored in each of the
+ * following arrays.
+ */
+ int num;
+ /*!
+ * Mode info struct for each of the candidate modes.
+ */
+ MB_MODE_INFO mbmi_arr[MAX_INTER_MODES];
+ /*!
+ * The rate for each of the candidate modes.
+ */
+ int mode_rate_arr[MAX_INTER_MODES];
+ /*!
+ * The sse of the predictor for each of the candidate modes.
+ */
+ int64_t sse_arr[MAX_INTER_MODES];
+ /*!
+ * The estimated rd of the predictor for each of the candidate modes.
+ */
+ int64_t est_rd_arr[MAX_INTER_MODES];
+ /*!
+ * The rate and mode index for each of the candidate modes.
+ */
+ RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES];
+ /*!
+ * The full rd stats for each of the candidate modes.
+ */
+ RD_STATS rd_cost_arr[MAX_INTER_MODES];
+ /*!
+ * The full rd stats of luma only for each of the candidate modes.
+ */
+ RD_STATS rd_cost_y_arr[MAX_INTER_MODES];
+ /*!
+ * The full rd stats of chroma only for each of the candidate modes.
+ */
+ RD_STATS rd_cost_uv_arr[MAX_INTER_MODES];
+} InterModesInfo;
+
+/*!\cond */
+typedef struct {
+  // TODO(kyslov): consider changing to 64-bit
+
+  // This struct is used for computing variance in choose_partitioning(), where
+  // the max number of samples within a superblock is 32x32 (with 4x4 avg).
+  // With 8-bit bitdepth, uint32_t is enough for sum_square_error (2^8 * 2^8 *
+  // 32 * 32 = 2^26). For high bitdepth we need to consider changing this to
+  // 64 bits.
+ uint32_t sum_square_error;
+ int32_t sum_error;
+ int log2_count;
+ int variance;
+} VPartVar;
+
+typedef struct {
+ VPartVar none;
+ VPartVar horz[2];
+ VPartVar vert[2];
+} VPVariance;
+
+typedef struct {
+ VPVariance part_variances;
+ VPartVar split[4];
+} VP4x4;
+
+typedef struct {
+ VPVariance part_variances;
+ VP4x4 split[4];
+} VP8x8;
+
+typedef struct {
+ VPVariance part_variances;
+ VP8x8 split[4];
+} VP16x16;
+
+typedef struct {
+ VPVariance part_variances;
+ VP16x16 split[4];
+} VP32x32;
+
+typedef struct {
+ VPVariance part_variances;
+ VP32x32 split[4];
+} VP64x64;
+
+typedef struct {
+ VPVariance part_variances;
+ VP64x64 *split;
+} VP128x128;
+
+/*!\endcond */
+
+/*!
+ * \brief Thresholds for variance based partitioning.
+ */
+typedef struct {
+ /*!
+ * If block variance > threshold, then that block is forced to split.
+ * thresholds[0] - threshold for 128x128;
+ * thresholds[1] - threshold for 64x64;
+ * thresholds[2] - threshold for 32x32;
+ * thresholds[3] - threshold for 16x16;
+ * thresholds[4] - threshold for 8x8;
+ */
+ int64_t thresholds[5];
+
+ /*!
+ * MinMax variance threshold for 8x8 sub blocks of a 16x16 block. If actual
+ * minmax > threshold_minmax, the 16x16 is forced to split.
+ */
+ int64_t threshold_minmax;
+} VarBasedPartitionInfo;
+
+/*!
+ * \brief Encoder parameters for synchronization of row based multi-threading
+ */
+typedef struct {
+#if CONFIG_MULTITHREAD
+ /**
+ * \name Synchronization objects for top-right dependency.
+ */
+ /**@{*/
+ pthread_mutex_t *mutex_; /*!< Mutex lock object */
+ pthread_cond_t *cond_; /*!< Condition variable */
+ /**@}*/
+#endif // CONFIG_MULTITHREAD
+ /*!
+   * Buffer to track how many superblocks have completed encoding per row.
+ * num_finished_cols[i] stores the number of superblocks which finished
+ * encoding in the ith superblock row.
+ */
+ int *num_finished_cols;
+ /*!
+ * Denotes the superblock interval at which conditional signalling should
+ * happen. Also denotes the minimum number of extra superblocks of the top row
+ * to be complete to start encoding the current superblock. A value of 1
+ * indicates top-right dependency.
+ */
+ int sync_range;
+ /*!
+ * Denotes the additional number of superblocks in the previous row to be
+ * complete to start encoding the current superblock when intraBC tool is
+ * enabled. This additional top-right delay is required to satisfy the
+ * hardware constraints for intraBC tool when row multithreading is enabled.
+ */
+ int intrabc_extra_top_right_sb_delay;
+ /*!
+ * Number of superblock rows.
+ */
+ int rows;
+ /*!
+ * The superblock row (in units of MI blocks) to be processed next.
+ */
+ int next_mi_row;
+ /*!
+ * Number of threads processing the current tile.
+ */
+ int num_threads_working;
+} AV1EncRowMultiThreadSync;
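+
+// Illustrative dependency (sync_range == 1, intrabc_extra_top_right_sb_delay
+// == 0): encoding of superblock (row r, col c) may begin once superblock
+// (row r - 1, col c + 1) has finished, i.e. the classic top-right dependency.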
+
+/*!\cond */
+
+// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
+typedef struct TileDataEnc {
+ TileInfo tile_info;
+ DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
+ FRAME_CONTEXT *row_ctx;
+ uint64_t abs_sum_level;
+ uint8_t allow_update_cdf;
+ InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
+ AV1EncRowMultiThreadSync row_mt_sync;
+ MV firstpass_top_mv;
+} TileDataEnc;
+
+typedef struct RD_COUNTS {
+ int compound_ref_used_flag;
+ int skip_mode_used_flag;
+ int tx_type_used[TX_SIZES_ALL][TX_TYPES];
+ int obmc_used[BLOCK_SIZES_ALL][2];
+ int warped_used[2];
+ int newmv_or_intra_blocks;
+ uint64_t seg_tmp_pred_cost[2];
+} RD_COUNTS;
+
+typedef struct ThreadData {
+ MACROBLOCK mb;
+ MvCosts *mv_costs_alloc;
+ IntraBCMVCosts *dv_costs_alloc;
+ RD_COUNTS rd_counts;
+ FRAME_COUNTS *counts;
+ PC_TREE_SHARED_BUFFERS shared_coeff_buf;
+ SIMPLE_MOTION_DATA_TREE *sms_tree;
+ SIMPLE_MOTION_DATA_TREE *sms_root;
+ uint32_t *hash_value_buffer[2][2];
+ OBMCBuffer obmc_buffer;
+ PALETTE_BUFFER *palette_buffer;
+ CompoundTypeRdBuffers comp_rd_buffer;
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint64_t abs_sum_level;
+ uint8_t *tmp_pred_bufs[2];
+ uint8_t *wiener_tmp_pred_buf;
+ int intrabc_used;
+ int deltaq_used;
+ int coefficient_size;
+ int max_mv_magnitude;
+ int interp_filter_selected[SWITCHABLE];
+ FRAME_CONTEXT *tctx;
+ VP64x64 *vt64x64;
+ int32_t num_64x64_blocks;
+ PICK_MODE_CONTEXT *firstpass_ctx;
+ TemporalFilterData tf_data;
+ TplBuffers tpl_tmp_buffers;
+ TplTxfmStats tpl_txfm_stats;
+ GlobalMotionData gm_data;
+ // Pointer to the array of structures to store gradient information of each
+  // pixel in a superblock. The buffer consists of MAX_SB_SQUARE pixel-level
+ // structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV).
+ PixelLevelGradientInfo *pixel_gradient_info;
+ // Pointer to the array of structures to store source variance information of
+ // each 4x4 sub-block in a superblock. Block4x4VarInfo structure is used to
+ // store source variance and log of source variance of each 4x4 sub-block
+ // for subsequent retrieval.
+ Block4x4VarInfo *src_var_info_of_4x4_sub_blocks;
+ // Pointer to pc tree root.
+ PC_TREE *pc_root;
+} ThreadData;
+
+struct EncWorkerData;
+
+/*!\endcond */
+
+/*!
+ * \brief Encoder data related to row-based multi-threading
+ */
+typedef struct {
+ /*!
+ * Number of tile rows for which row synchronization memory is allocated.
+ */
+ int allocated_tile_rows;
+ /*!
+ * Number of tile cols for which row synchronization memory is allocated.
+ */
+ int allocated_tile_cols;
+ /*!
+ * Number of rows for which row synchronization memory is allocated
+ * per tile. During first-pass/look-ahead stage this equals the
+ * maximum number of macroblock rows in a tile. During encode stage,
+ * this equals the maximum number of superblock rows in a tile.
+ */
+ int allocated_rows;
+ /*!
+ * Number of columns for which entropy context memory is allocated
+ * per tile. During encode stage, this equals the maximum number of
+ * superblock columns in a tile minus 1. The entropy context memory
+ * is not allocated during first-pass/look-ahead stage.
+ */
+ int allocated_cols;
+
+ /*!
+ * thread_id_to_tile_id[i] indicates the tile id assigned to the ith thread.
+ */
+ int thread_id_to_tile_id[MAX_NUM_THREADS];
+
+ /*!
+ * num_tile_cols_done[i] indicates the number of tile columns whose encoding
+ * is complete in the ith superblock row.
+ */
+ int *num_tile_cols_done;
+
+ /*!
+ * Number of superblock rows in a frame for which 'num_tile_cols_done' is
+ * allocated.
+ */
+ int allocated_sb_rows;
+
+ /*!
+ * Initialized to false, set to true by the worker thread that encounters an
+ * error in order to abort the processing of other worker threads.
+ */
+ bool row_mt_exit;
+
+ /*!
+ * Initialized to false, set to true during first pass encoding by the worker
+ * thread that encounters an error in order to abort the processing of other
+ * worker threads.
+ */
+ bool firstpass_mt_exit;
+
+ /*!
+ * Initialized to false, set to true in cal_mb_wiener_var_hook() by the worker
+ * thread that encounters an error in order to abort the processing of other
+ * worker threads.
+ */
+ bool mb_wiener_mt_exit;
+
+#if CONFIG_MULTITHREAD
+ /*!
+ * Mutex lock used while dispatching jobs.
+ */
+ pthread_mutex_t *mutex_;
+ /*!
+ * Condition variable used to dispatch loopfilter jobs.
+ */
+ pthread_cond_t *cond_;
+#endif
+
+ /**
+ * \name Row synchronization related function pointers.
+ */
+ /**@{*/
+ /*!
+ * Reader.
+ */
+ void (*sync_read_ptr)(AV1EncRowMultiThreadSync *const, int, int);
+ /*!
+ * Writer.
+ */
+ void (*sync_write_ptr)(AV1EncRowMultiThreadSync *const, int, int, int);
+ /**@}*/
+} AV1EncRowMultiThreadInfo;
+
+/*!
+ * \brief Encoder data related to multi-threading for allintra deltaq-mode=3
+ */
+typedef struct {
+#if CONFIG_MULTITHREAD
+ /*!
+ * Mutex lock used while dispatching jobs.
+ */
+ pthread_mutex_t *mutex_;
+ /*!
+ * Condition variable used to dispatch loopfilter jobs.
+ */
+ pthread_cond_t *cond_;
+#endif
+
+ /**
+ * \name Row synchronization related function pointers for all intra mode
+ */
+ /**@{*/
+ /*!
+ * Reader.
+ */
+ void (*intra_sync_read_ptr)(AV1EncRowMultiThreadSync *const, int, int);
+ /*!
+ * Writer.
+ */
+ void (*intra_sync_write_ptr)(AV1EncRowMultiThreadSync *const, int, int, int);
+ /**@}*/
+} AV1EncAllIntraMultiThreadInfo;
+
+/*!
+ * \brief Max number of recodes used to track the frame probabilities.
+ */
+#define NUM_RECODES_PER_FRAME 10
+
+/*!
+ * \brief Max number of frames that can be encoded in a parallel encode set.
+ */
+#define MAX_PARALLEL_FRAMES 4
+
+/*!
+ * \brief Buffers to be backed up during parallel encode set to be restored
+ * later.
+ */
+typedef struct RestoreStateBuffers {
+ /*!
+ * Backup of original CDEF srcbuf.
+ */
+ uint16_t *cdef_srcbuf;
+
+ /*!
+ * Backup of original CDEF colbuf.
+ */
+ uint16_t *cdef_colbuf[MAX_MB_PLANE];
+
+ /*!
+ * Backup of original LR rst_tmpbuf.
+ */
+ int32_t *rst_tmpbuf;
+
+ /*!
+ * Backup of original LR rlbs.
+ */
+ RestorationLineBuffers *rlbs;
+} RestoreStateBuffers;
+
+/*!
+ * \brief Parameters related to restoration types.
+ */
+typedef struct {
+ /*!
+ * Stores the best coefficients for Wiener restoration.
+ */
+ WienerInfo wiener;
+
+ /*!
+ * Stores the best coefficients for Sgrproj restoration.
+ */
+ SgrprojInfo sgrproj;
+
+ /*!
+ * The rtype to use for this unit given a frame rtype as index. Indices:
+ * WIENER, SGRPROJ, SWITCHABLE.
+ */
+ RestorationType best_rtype[RESTORE_TYPES - 1];
+} RestUnitSearchInfo;
+
+/*!
+ * \brief Structure to hold search parameter per restoration unit and
+ * intermediate buffer of Wiener filter used in pick filter stage of Loop
+ * restoration.
+ */
+typedef struct {
+ /*!
+ * Array of pointers to 'RestUnitSearchInfo' which holds data related to
+ * restoration types.
+ */
+ RestUnitSearchInfo *rusi[MAX_MB_PLANE];
+
+ /*!
+ * Buffer used to hold dgd-avg data during SIMD call of Wiener filter.
+ */
+ int16_t *dgd_avg;
+} AV1LrPickStruct;
+
+/*!
+ * \brief Primary Encoder parameters related to multi-threading.
+ */
+typedef struct PrimaryMultiThreadInfo {
+ /*!
+ * Number of workers created for multi-threading.
+ */
+ int num_workers;
+
+ /*!
+ * Number of workers used for different MT modules.
+ */
+ int num_mod_workers[NUM_MT_MODULES];
+
+ /*!
+ * Synchronization object used to launch job in the worker thread.
+ */
+ AVxWorker *workers;
+
+ /*!
+ * Data specific to each worker in encoder multi-threading.
+ * tile_thr_data[i] stores the worker data of the ith thread.
+ */
+ struct EncWorkerData *tile_thr_data;
+
+ /*!
+ * CDEF row multi-threading data.
+ */
+ AV1CdefWorkerData *cdef_worker;
+
+ /*!
+ * Primary(Level 1) Synchronization object used to launch job in the worker
+ * thread.
+ */
+ AVxWorker *p_workers[MAX_PARALLEL_FRAMES];
+
+ /*!
+ * Number of primary workers created for multi-threading.
+ */
+ int p_num_workers;
+
+ /*!
+ * Tracks the number of workers in encode stage multi-threading.
+ */
+ int prev_num_enc_workers;
+} PrimaryMultiThreadInfo;
+
+/*!
+ * \brief Encoder parameters related to multi-threading.
+ */
+typedef struct MultiThreadInfo {
+ /*!
+ * Number of workers created for multi-threading.
+ */
+ int num_workers;
+
+ /*!
+ * Number of workers used for different MT modules.
+ */
+ int num_mod_workers[NUM_MT_MODULES];
+
+ /*!
+ * Synchronization object used to launch job in the worker thread.
+ */
+ AVxWorker *workers;
+
+ /*!
+ * Data specific to each worker in encoder multi-threading.
+ * tile_thr_data[i] stores the worker data of the ith thread.
+ */
+ struct EncWorkerData *tile_thr_data;
+
+ /*!
+ * When set, indicates that row based multi-threading of the encoder is
+ * enabled.
+ */
+ bool row_mt_enabled;
+
+ /*!
+ * When set, indicates that multi-threading for bitstream packing is enabled.
+ */
+ bool pack_bs_mt_enabled;
+
+ /*!
+ * Encoder row multi-threading data.
+ */
+ AV1EncRowMultiThreadInfo enc_row_mt;
+
+ /*!
+ * Encoder multi-threading data for allintra mode in the preprocessing stage
+ * when --deltaq-mode=3.
+ */
+ AV1EncAllIntraMultiThreadInfo intra_mt;
+
+ /*!
+ * Tpl row multi-threading data.
+ */
+ AV1TplRowMultiThreadInfo tpl_row_mt;
+
+ /*!
+ * Loop Filter multi-threading object.
+ */
+ AV1LfSync lf_row_sync;
+
+ /*!
+ * Loop Restoration multi-threading object.
+ */
+ AV1LrSync lr_row_sync;
+
+ /*!
+ * Pack bitstream multi-threading object.
+ */
+ AV1EncPackBSSync pack_bs_sync;
+
+ /*!
+ * Global Motion multi-threading object.
+ */
+ AV1GlobalMotionSync gm_sync;
+
+ /*!
+ * Temporal Filter multi-threading object.
+ */
+ AV1TemporalFilterSync tf_sync;
+
+ /*!
+ * CDEF search multi-threading object.
+ */
+ AV1CdefSync cdef_sync;
+
+ /*!
+ * Pointer to CDEF row multi-threading data for the frame.
+ */
+ AV1CdefWorkerData *cdef_worker;
+
+ /*!
+ * Buffers to be stored/restored before/after parallel encode.
+ */
+ RestoreStateBuffers restore_state_buf;
+
+ /*!
+ * In multi-threaded realtime encoding with row-mt enabled, pipeline
+ * loop-filtering after encoding.
+ */
+ int pipeline_lpf_mt_with_enc;
+} MultiThreadInfo;
+
+/*!\cond */
+
+typedef struct ActiveMap {
+ int enabled;
+ int update;
+ unsigned char *map;
+} ActiveMap;
+
+/*!\endcond */
+
+/*!
+ * \brief Encoder info used for decision on forcing integer motion vectors.
+ */
+typedef struct {
+ /*!
+ * cs_rate_array[i] is the fraction of blocks in a frame which either match
+ * with the collocated block or are smooth, where i is the rate_index.
+ */
+ double cs_rate_array[32];
+ /*!
+ * rate_index is used to index cs_rate_array.
+ */
+ int rate_index;
+ /*!
+ * rate_size is the total number of entries populated in cs_rate_array.
+ */
+ int rate_size;
+} ForceIntegerMVInfo;
+
+/*!\cond */
+
+#if CONFIG_INTERNAL_STATS
+// types of stats
+enum {
+ STAT_Y,
+ STAT_U,
+ STAT_V,
+ STAT_ALL,
+ NUM_STAT_TYPES // This should always be the last member of the enum
+} UENUM1BYTE(StatType);
+
+typedef struct IMAGE_STAT {
+ double stat[NUM_STAT_TYPES];
+ double worst;
+} ImageStat;
+#endif // CONFIG_INTERNAL_STATS
+
+typedef struct {
+ int ref_count;
+ YV12_BUFFER_CONFIG buf;
+} EncRefCntBuffer;
+
+/*!\endcond */
+
+/*!
+ * \brief Buffer to store mode information at mi_alloc_bsize (4x4 or 8x8) level
+ *
+ * This is used for bitstream preparation.
+ */
+typedef struct {
+ /*!
+ * frame_base[mi_row * stride + mi_col] stores the mode information of
+ * block (mi_row,mi_col).
+ */
+ MB_MODE_INFO_EXT_FRAME *frame_base;
+ /*!
+ * Size of frame_base buffer.
+ */
+ int alloc_size;
+ /*!
+ * Stride of frame_base buffer.
+ */
+ int stride;
+} MBMIExtFrameBufferInfo;
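+
+/*
+ * Illustrative access (a sketch, with 'info' a hypothetical pointer to this
+ * struct): the buffer is addressed in raster order, so the entry for the
+ * block at (mi_row, mi_col) is
+ *
+ *   info->frame_base[mi_row * info->stride + mi_col]
+ */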
+
+/*!\cond */
+
+#if CONFIG_COLLECT_PARTITION_STATS
+typedef struct FramePartitionTimingStats {
+ int partition_decisions[6][EXT_PARTITION_TYPES];
+ int partition_attempts[6][EXT_PARTITION_TYPES];
+ int64_t partition_times[6][EXT_PARTITION_TYPES];
+
+ int partition_redo;
+} FramePartitionTimingStats;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+#include "aom_ports/aom_timer.h"
+// Adjust the following to add new components.
+enum {
+ av1_encode_strategy_time,
+ av1_get_one_pass_rt_params_time,
+ av1_get_second_pass_params_time,
+ denoise_and_encode_time,
+ apply_filtering_time,
+ av1_tpl_setup_stats_time,
+ encode_frame_to_data_rate_time,
+ encode_with_or_without_recode_time,
+ loop_filter_time,
+ cdef_time,
+ loop_restoration_time,
+ av1_pack_bitstream_final_time,
+ av1_encode_frame_time,
+ av1_compute_global_motion_time,
+ av1_setup_motion_field_time,
+ encode_sb_row_time,
+
+ rd_pick_partition_time,
+ rd_use_partition_time,
+ choose_var_based_partitioning_time,
+ av1_prune_partitions_time,
+ none_partition_search_time,
+ split_partition_search_time,
+ rectangular_partition_search_time,
+ ab_partitions_search_time,
+ rd_pick_4partition_time,
+ encode_sb_time,
+
+ rd_pick_sb_modes_time,
+ av1_rd_pick_intra_mode_sb_time,
+ av1_rd_pick_inter_mode_sb_time,
+ set_params_rd_pick_inter_mode_time,
+ skip_inter_mode_time,
+ handle_inter_mode_time,
+ evaluate_motion_mode_for_winner_candidates_time,
+ do_tx_search_time,
+ handle_intra_mode_time,
+ refine_winner_mode_tx_time,
+ av1_search_palette_mode_time,
+ handle_newmv_time,
+ compound_type_rd_time,
+ interpolation_filter_search_time,
+ motion_mode_rd_time,
+
+ nonrd_use_partition_time,
+ pick_sb_modes_nonrd_time,
+ hybrid_intra_mode_search_time,
+ nonrd_pick_inter_mode_sb_time,
+ encode_b_nonrd_time,
+
+ kTimingComponents,
+} UENUM1BYTE(TIMING_COMPONENT);
+
+static INLINE char const *get_component_name(int index) {
+ switch (index) {
+ case av1_encode_strategy_time: return "av1_encode_strategy_time";
+ case av1_get_one_pass_rt_params_time:
+ return "av1_get_one_pass_rt_params_time";
+ case av1_get_second_pass_params_time:
+ return "av1_get_second_pass_params_time";
+ case denoise_and_encode_time: return "denoise_and_encode_time";
+ case apply_filtering_time: return "apply_filtering_time";
+ case av1_tpl_setup_stats_time: return "av1_tpl_setup_stats_time";
+ case encode_frame_to_data_rate_time:
+ return "encode_frame_to_data_rate_time";
+ case encode_with_or_without_recode_time:
+ return "encode_with_or_without_recode_time";
+ case loop_filter_time: return "loop_filter_time";
+ case cdef_time: return "cdef_time";
+ case loop_restoration_time: return "loop_restoration_time";
+ case av1_pack_bitstream_final_time: return "av1_pack_bitstream_final_time";
+ case av1_encode_frame_time: return "av1_encode_frame_time";
+ case av1_compute_global_motion_time:
+ return "av1_compute_global_motion_time";
+ case av1_setup_motion_field_time: return "av1_setup_motion_field_time";
+ case encode_sb_row_time: return "encode_sb_row_time";
+
+ case rd_pick_partition_time: return "rd_pick_partition_time";
+ case rd_use_partition_time: return "rd_use_partition_time";
+ case choose_var_based_partitioning_time:
+ return "choose_var_based_partitioning_time";
+ case av1_prune_partitions_time: return "av1_prune_partitions_time";
+ case none_partition_search_time: return "none_partition_search_time";
+ case split_partition_search_time: return "split_partition_search_time";
+ case rectangular_partition_search_time:
+ return "rectangular_partition_search_time";
+ case ab_partitions_search_time: return "ab_partitions_search_time";
+ case rd_pick_4partition_time: return "rd_pick_4partition_time";
+ case encode_sb_time: return "encode_sb_time";
+
+ case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time";
+ case av1_rd_pick_intra_mode_sb_time:
+ return "av1_rd_pick_intra_mode_sb_time";
+ case av1_rd_pick_inter_mode_sb_time:
+ return "av1_rd_pick_inter_mode_sb_time";
+ case set_params_rd_pick_inter_mode_time:
+ return "set_params_rd_pick_inter_mode_time";
+ case skip_inter_mode_time: return "skip_inter_mode_time";
+ case handle_inter_mode_time: return "handle_inter_mode_time";
+ case evaluate_motion_mode_for_winner_candidates_time:
+ return "evaluate_motion_mode_for_winner_candidates_time";
+ case do_tx_search_time: return "do_tx_search_time";
+ case handle_intra_mode_time: return "handle_intra_mode_time";
+ case refine_winner_mode_tx_time: return "refine_winner_mode_tx_time";
+ case av1_search_palette_mode_time: return "av1_search_palette_mode_time";
+ case handle_newmv_time: return "handle_newmv_time";
+ case compound_type_rd_time: return "compound_type_rd_time";
+ case interpolation_filter_search_time:
+ return "interpolation_filter_search_time";
+ case motion_mode_rd_time: return "motion_mode_rd_time";
+
+ case nonrd_use_partition_time: return "nonrd_use_partition_time";
+ case pick_sb_modes_nonrd_time: return "pick_sb_modes_nonrd_time";
+ case hybrid_intra_mode_search_time: return "hybrid_intra_mode_search_time";
+ case nonrd_pick_inter_mode_sb_time: return "nonrd_pick_inter_mode_sb_time";
+ case encode_b_nonrd_time: return "encode_b_nonrd_time";
+
+ default: assert(0);
+ }
+ return "error";
+}
+#endif
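+
+/*
+ * Illustrative instrumentation pattern (a sketch): start_timing() and
+ * end_timing() helpers bracket the component being measured, and the
+ * accumulated time is keyed by the enum values above, e.g.
+ *
+ *   start_timing(cpi, av1_encode_frame_time);
+ *   av1_encode_frame(cpi);
+ *   end_timing(cpi, av1_encode_frame_time);
+ *   // Accumulated time is then available in
+ *   // cpi->frame_component_time[av1_encode_frame_time].
+ */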
+
+// The maximum number of internal ARFs except ALTREF_FRAME
+#define MAX_INTERNAL_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
+
+/*!\endcond */
+
+/*!
+ * \brief Parameters related to global motion search
+ */
+typedef struct {
+ /*!
+ * Flag to indicate if global motion search needs to be rerun.
+ */
+ bool search_done;
+
+ /*!
+ * Array of pointers to the frame buffers holding the reference frames.
+ * ref_buf[i] stores the pointer to the reference frame of the ith
+ * reference frame type.
+ */
+ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
+
+ /*!
+ * Holds the number of valid reference frames in past and future directions
+ * w.r.t. the current frame. num_ref_frames[i] stores the total number of
+ * valid reference frames in 'i' direction.
+ */
+ int num_ref_frames[MAX_DIRECTIONS];
+
+ /*!
+ * Array of structure which stores the valid reference frames in past and
+ * future directions and their corresponding distance from the source frame.
+ * reference_frames[i][j] holds the jth valid reference frame type in the
+ * direction 'i' and its temporal distance from the source frame.
+ */
+ FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1];
+
+ /**
+ * \name Dimensions for which segment map is allocated.
+ */
+ /**@{*/
+ int segment_map_w; /*!< segment map width */
+ int segment_map_h; /*!< segment map height */
+ /**@}*/
+} GlobalMotionInfo;
+
+/*!
+ * \brief Flags related to interpolation filter search
+ */
+typedef struct {
+ /*!
+ * Stores the default value of the skip flag depending on the chroma format.
+ * Set to 1 for monochrome and 3 for other color formats.
+ */
+ int default_interp_skip_flags;
+ /*!
+ * Filter mask to allow certain interp_filter type.
+ */
+ uint16_t interp_filter_search_mask;
+} InterpSearchFlags;
+
+/*!
+ * \brief Parameters for motion vector search process
+ */
+typedef struct {
+ /*!
+ * Largest MV component used in a frame.
+ * The value from the previous frame is used to set the full pixel search
+ * range for the current frame.
+ */
+ int max_mv_magnitude;
+ /*!
+ * Parameter indicating initial search window to be used in full-pixel search.
+ * Range [0, MAX_MVSEARCH_STEPS-2]. Lower value indicates larger window.
+ */
+ int mv_step_param;
+ /*!
+ * Pointer to sub-pixel search function.
+ * In encoder: av1_find_best_sub_pixel_tree
+ * av1_find_best_sub_pixel_tree_pruned
+ * av1_find_best_sub_pixel_tree_pruned_more
+ * In MV unit test: av1_return_max_sub_pixel_mv
+ * av1_return_min_sub_pixel_mv
+ */
+ fractional_mv_step_fp *find_fractional_mv_step;
+ /*!
+ * Search site configuration for full-pel MV search.
+ * search_site_cfg[SS_CFG_SRC]: used in tpl, rd/non-rd inter mode loop and
+ * simple motion search.
+ * search_site_cfg[SS_CFG_LOOKAHEAD]: used in intraBC and temporal filter.
+ * search_site_cfg[SS_CFG_FPF]: used during first pass and lookahead.
+ */
+ search_site_config search_site_cfg[SS_CFG_TOTAL][NUM_DISTINCT_SEARCH_METHODS];
+} MotionVectorSearchParams;
+
+/*!
+ * \brief Refresh frame flags for different types of frames.
+ *
+ * If the refresh flag is true for a particular reference frame, after the
+ * current frame is encoded, the reference frame gets refreshed (updated) to
+ * be the current frame. Note: Usually at most one flag will be set to true at
+ * a time. But, for key-frames, all flags are set to true at once.
+ */
+typedef struct {
+ bool golden_frame; /*!< Refresh flag for golden frame */
+ bool bwd_ref_frame; /*!< Refresh flag for bwd-ref frame */
+ bool alt_ref_frame; /*!< Refresh flag for alt-ref frame */
+} RefreshFrameInfo;
+
+/*!
+ * \brief Desired dimensions for an externally triggered resize.
+ *
+ * When resize is triggered externally, the desired dimensions are stored in
+ * this struct until used in the next frame to be coded. These values are
+ * effective only for one frame and are reset after they are used.
+ */
+typedef struct {
+ int width; /*!< Desired resized width */
+ int height; /*!< Desired resized height */
+} ResizePendingParams;
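+
+/*
+ * Illustrative lifecycle (a sketch): the external interface stores the
+ * request, and the next coded frame consumes it.
+ *
+ *   resize_pending_params->width = 640;
+ *   resize_pending_params->height = 360;
+ *   // ...the next frame is coded at (or toward) 640x360, after which both
+ *   // fields are reset so later frames are unaffected.
+ */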
+
+/*!
+ * \brief Reference frame distance related variables.
+ */
+typedef struct {
+ /*!
+ * True relative distance of reference frames w.r.t. the current frame.
+ */
+ int ref_relative_dist[INTER_REFS_PER_FRAME];
+ /*!
+ * The nearest reference w.r.t. current frame in the past.
+ */
+ int8_t nearest_past_ref;
+ /*!
+ * The nearest reference w.r.t. current frame in the future.
+ */
+ int8_t nearest_future_ref;
+} RefFrameDistanceInfo;
+
+/*!
+ * \brief Parameters used for winner mode processing.
+ *
+ * This is a basic two pass approach: in the first pass, we reduce the number of
+ * transform searches based on some thresholds during the rdopt process to find
+ * the "winner mode". In the second pass, we perform a more through tx search
+ * on the winner mode.
+ * There are some arrays in the struct, and their indices are used in the
+ * following manner:
+ * Index 0: Default mode evaluation, winner mode processing is not applicable
+ * (e.g., IntraBC).
+ * Index 1: Mode evaluation.
+ * Index 2: Winner mode evaluation.
+ * Indices 1 and 2 are only used when the respective speed feature is on.
+ */
+typedef struct {
+ /*!
+ * Threshold to determine if trellis optimization is to be enabled
+ * based on:
+ * 0 : dist threshold
+ * 1 : satd threshold
+ * Corresponds to enable_winner_mode_for_coeff_opt speed feature.
+ */
+ unsigned int coeff_opt_thresholds[MODE_EVAL_TYPES][2];
+
+ /*!
+ * Determines the tx size search method during rdopt.
+ * Corresponds to enable_winner_mode_for_tx_size_srch speed feature.
+ */
+ TX_SIZE_SEARCH_METHOD tx_size_search_methods[MODE_EVAL_TYPES];
+
+ /*!
+ * Controls how often we should approximate prediction error with tx
+ * coefficients. If it's 0, then never. If 1, then it's during the tx_type
+ * search only. If 2, then always.
+ * Corresponds to tx_domain_dist_level speed feature.
+ */
+ unsigned int use_transform_domain_distortion[MODE_EVAL_TYPES];
+
+ /*!
+ * Threshold to approximate pixel domain distortion with transform domain
+ * distortion. This is only used if use_transform_domain_distortion is on.
+ * Corresponds to enable_winner_mode_for_use_tx_domain_dist speed feature.
+ */
+ unsigned int tx_domain_dist_threshold[MODE_EVAL_TYPES];
+
+ /*!
+ * Controls how often we should try to skip the transform process based on
+ * the result from the DCT.
+ * Corresponds to use_skip_flag_prediction speed feature.
+ */
+ unsigned int skip_txfm_level[MODE_EVAL_TYPES];
+
+ /*!
+ * Predict DC only txfm blocks for default, mode and winner mode evaluation.
+ * Index 0: Default mode evaluation, Winner mode processing is not applicable.
+ * Index 1: Mode evaluation, Index 2: Winner mode evaluation
+ */
+ unsigned int predict_dc_level[MODE_EVAL_TYPES];
+} WinnerModeParams;
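+
+/*
+ * Illustrative lookup (a sketch; 'is_winner_mode' is hypothetical): each
+ * array is indexed by the evaluation type described above, so index 2 is
+ * used during winner mode processing and index 1 during regular mode
+ * evaluation.
+ *
+ *   const int eval_type = is_winner_mode ? 2 : 1;
+ *   const TX_SIZE_SEARCH_METHOD method =
+ *       winner_mode_params->tx_size_search_methods[eval_type];
+ */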
+
+/*!
+ * \brief Frame refresh flags set by the external interface.
+ *
+ * Flags set by external interface to determine which reference buffers are
+ * refreshed by this frame. When set, the encoder will update the particular
+ * reference frame buffer with the contents of the current frame.
+ */
+typedef struct {
+ bool last_frame; /*!< Refresh flag for last frame */
+ bool golden_frame; /*!< Refresh flag for golden frame */
+ bool bwd_ref_frame; /*!< Refresh flag for bwd-ref frame */
+ bool alt2_ref_frame; /*!< Refresh flag for alt2-ref frame */
+ bool alt_ref_frame; /*!< Refresh flag for alt-ref frame */
+ /*!
+ * Flag indicating if the update of refresh frame flags is pending.
+ */
+ bool update_pending;
+} ExtRefreshFrameFlagsInfo;
+
+/*!
+ * \brief Flags signalled by the external interface at frame level.
+ */
+typedef struct {
+ /*!
+ * Bit mask to disable certain reference frame types.
+ */
+ int ref_frame_flags;
+
+ /*!
+ * Frame refresh flags set by the external interface.
+ */
+ ExtRefreshFrameFlagsInfo refresh_frame;
+
+ /*!
+ * Flag to enable the update of frame contexts at the end of a frame decode.
+ */
+ bool refresh_frame_context;
+
+ /*!
+ * Flag to indicate that update of refresh_frame_context from external
+ * interface is pending.
+ */
+ bool refresh_frame_context_pending;
+
+ /*!
+ * Flag to enable temporal MV prediction.
+ */
+ bool use_ref_frame_mvs;
+
+ /*!
+ * Indicates whether the current frame is to be coded as error resilient.
+ */
+ bool use_error_resilient;
+
+ /*!
+ * Indicates whether the current frame is to be coded as s-frame.
+ */
+ bool use_s_frame;
+
+ /*!
+ * Indicates whether the current frame's primary_ref_frame is set to
+ * PRIMARY_REF_NONE.
+ */
+ bool use_primary_ref_none;
+} ExternalFlags;
+
+/*!\cond */
+
+typedef struct {
+ // Some misc info
+ int high_prec;
+ int q;
+ int order;
+
+ // MV counters
+ int inter_count;
+ int intra_count;
+ int default_mvs;
+ int mv_joint_count[4];
+ int last_bit_zero;
+ int last_bit_nonzero;
+
+ // Keep track of the rates
+ int total_mv_rate;
+ int hp_total_mv_rate;
+ int lp_total_mv_rate;
+
+ // Texture info
+ int horz_text;
+ int vert_text;
+ int diag_text;
+
+ // Whether the current struct contains valid data
+ int valid;
+} MV_STATS;
+
+typedef struct WeberStats {
+ int64_t mb_wiener_variance;
+ int64_t src_variance;
+ int64_t rec_variance;
+ int16_t src_pix_max;
+ int16_t rec_pix_max;
+ int64_t distortion;
+ int64_t satd;
+ double max_scale;
+} WeberStats;
+
+typedef struct {
+ struct loopfilter lf;
+ CdefInfo cdef_info;
+ YV12_BUFFER_CONFIG copy_buffer;
+ RATE_CONTROL rc;
+ MV_STATS mv_stats;
+} CODING_CONTEXT;
+
+typedef struct {
+ int frame_width;
+ int frame_height;
+ int mi_rows;
+ int mi_cols;
+ int mb_rows;
+ int mb_cols;
+ int num_mbs;
+ aom_bit_depth_t bit_depth;
+ int subsampling_x;
+ int subsampling_y;
+} FRAME_INFO;
+
+/*!
+ * \brief This structure stores different types of frame indices.
+ */
+typedef struct {
+ int show_frame_count;
+} FRAME_INDEX_SET;
+
+/*!\endcond */
+
+/*!
+ * \brief Segmentation related information for the current frame.
+ */
+typedef struct {
+ /*!
+ * 3-bit number containing the segment affiliation for each 4x4 block in the
+ * frame. map[y * stride + x] contains the segment id of the 4x4 block at
+ * (x,y) position.
+ */
+ uint8_t *map;
+ /*!
+ * Flag to indicate if current frame has lossless segments or not.
+ * 1: frame has at least one lossless segment.
+ * 0: frame has no lossless segments.
+ */
+ bool has_lossless_segment;
+} EncSegmentationInfo;
+
+/*!
+ * \brief Frame time stamps.
+ */
+typedef struct {
+ /*!
+ * Start time stamp of the previous frame
+ */
+ int64_t prev_ts_start;
+ /*!
+ * End time stamp of the previous frame
+ */
+ int64_t prev_ts_end;
+ /*!
+ * Start time stamp of the first frame
+ */
+ int64_t first_ts_start;
+} TimeStamps;
+
+/*!
+ * Pointers to the memory allocated for frame level transform coeff related
+ * info.
+ */
+typedef struct {
+ /*!
+ * Pointer to the transformed coefficients buffer.
+ */
+ tran_low_t *tcoeff;
+ /*!
+ * Pointer to the eobs buffer.
+ */
+ uint16_t *eobs;
+ /*!
+ * Pointer to the entropy_ctx buffer.
+ */
+ uint8_t *entropy_ctx;
+} CoeffBufferPool;
+
+#if !CONFIG_REALTIME_ONLY
+/*!\cond */
+// DUCKY_ENCODE_FRAME_MODE is the C version of EncodeFrameMode
+enum {
+ DUCKY_ENCODE_FRAME_MODE_NONE, // Let native AV1 determine q index and rdmult
+ DUCKY_ENCODE_FRAME_MODE_QINDEX, // DuckyEncode determines q index and AV1
+ // determines rdmult
+ DUCKY_ENCODE_FRAME_MODE_QINDEX_RDMULT, // DuckyEncode determines q index and
+ // rdmult
+} UENUM1BYTE(DUCKY_ENCODE_FRAME_MODE);
+
+enum {
+ DUCKY_ENCODE_GOP_MODE_NONE, // native AV1 decides GOP
+ DUCKY_ENCODE_GOP_MODE_RCL, // rate control lib decides GOP
+} UENUM1BYTE(DUCKY_ENCODE_GOP_MODE);
+
+typedef struct DuckyEncodeFrameInfo {
+ DUCKY_ENCODE_FRAME_MODE qp_mode;
+ DUCKY_ENCODE_GOP_MODE gop_mode;
+ int q_index;
+ int rdmult;
+ // These two arrays are equivalent to std::vector<SuperblockEncodeParameters>
+ int *superblock_encode_qindex;
+ int *superblock_encode_rdmult;
+ int delta_q_enabled;
+} DuckyEncodeFrameInfo;
+
+typedef struct DuckyEncodeFrameResult {
+ int global_order_idx;
+ int q_index;
+ int rdmult;
+ int rate;
+ int64_t dist;
+ double psnr;
+} DuckyEncodeFrameResult;
+
+typedef struct DuckyEncodeInfo {
+ DuckyEncodeFrameInfo frame_info;
+ DuckyEncodeFrameResult frame_result;
+} DuckyEncodeInfo;
+/*!\endcond */
+#endif
+
+/*!\cond */
+typedef struct RTC_REF {
+ /*!
+ * Index mapping for the per-reference arrays below: LAST_FRAME(0),
+ * LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), BWDREF_FRAME(4),
+ * ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ */
+ int reference[INTER_REFS_PER_FRAME];
+ int ref_idx[INTER_REFS_PER_FRAME];
+ int refresh[REF_FRAMES];
+ int set_ref_frame_config;
+ int non_reference_frame;
+ int ref_frame_comp[3];
+ int gld_idx_1layer;
+ /*!
+ * Frame number of the last frame that refreshed the buffer slot.
+ */
+ unsigned int buffer_time_index[REF_FRAMES];
+ /*!
+ * Spatial layer id of the last frame that refreshed the buffer slot.
+ */
+ unsigned char buffer_spatial_layer[REF_FRAMES];
+ /*!
+ * Flag to indicate whether closest reference was the previous frame.
+ */
+ bool reference_was_previous_frame;
+ /*!
+ * Flag to indicate this frame is based on longer term reference only,
+ * for recovery from past loss, and it should be biased for improved coding.
+ */
+ bool bias_recovery_frame;
+} RTC_REF;
+/*!\endcond */
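+
+/*
+ * Illustrative configuration (a sketch): an RTC frame that predicts from
+ * LAST_FRAME (mapped to buffer slot 0) and GOLDEN_FRAME (mapped to buffer
+ * slot 3) and refreshes slot 0 would set, roughly:
+ *
+ *   rtc_ref->set_ref_frame_config = 1;
+ *   rtc_ref->reference[0] = 1;  // LAST_FRAME is used
+ *   rtc_ref->ref_idx[0] = 0;    // ...and maps to buffer slot 0
+ *   rtc_ref->reference[3] = 1;  // GOLDEN_FRAME is used
+ *   rtc_ref->ref_idx[3] = 3;    // ...and maps to buffer slot 3
+ *   rtc_ref->refresh[0] = 1;    // this frame refreshes slot 0
+ */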
+
+/*!
+ * \brief Structure to hold data corresponding to an encoded frame.
+ */
+typedef struct AV1_COMP_DATA {
+ /*!
+ * Buffer to store packed bitstream data of a frame.
+ */
+ unsigned char *cx_data;
+
+ /*!
+ * Allocated size of the cx_data buffer.
+ */
+ size_t cx_data_sz;
+
+ /*!
+ * Size of data written in the cx_data buffer.
+ */
+ size_t frame_size;
+
+ /*!
+ * Flags for the frame.
+ */
+ unsigned int lib_flags;
+
+ /*!
+ * Time stamp for start of frame.
+ */
+ int64_t ts_frame_start;
+
+ /*!
+ * Time stamp for end of frame.
+ */
+ int64_t ts_frame_end;
+
+ /*!
+ * Flag to indicate flush call.
+ */
+ int flush;
+
+ /*!
+ * Time base for sequence.
+ */
+ const aom_rational64_t *timestamp_ratio;
+
+ /*!
+ * Whether to pop the source for this frame from the input buffer queue.
+ */
+ int pop_lookahead;
+
+ /*!
+ * Display order hint of frame whose packed data is in cx_data buffer.
+ */
+ int frame_display_order_hint;
+} AV1_COMP_DATA;
+
+/*!
+ * \brief Top level primary encoder structure
+ */
+typedef struct AV1_PRIMARY {
+ /*!
+ * Array of frame level encoder stage top level structures
+ */
+ struct AV1_COMP *parallel_cpi[MAX_PARALLEL_FRAMES];
+
+ /*!
+ * Array of structures to hold data of frames encoded in a given parallel
+ * encode set.
+ */
+ struct AV1_COMP_DATA parallel_frames_data[MAX_PARALLEL_FRAMES - 1];
+#if CONFIG_FPMT_TEST
+ /*!
+ * Flag which enables/disables simulation path for fpmt unit test.
+ * 0 - FPMT integration
+ * 1 - FPMT simulation
+ */
+ FPMT_TEST_ENC_CFG fpmt_unit_test_cfg;
+
+ /*!
+ * Temporary variable simulating the delayed frame_probability update.
+ */
+ FrameProbInfo temp_frame_probs;
+
+ /*!
+ * Temporary variable holding the updated frame probability across
+ * frames. Copy its value to temp_frame_probs for frame_parallel_level 0
+ * frames or last frame in parallel encode set.
+ */
+ FrameProbInfo temp_frame_probs_simulation;
+
+ /*!
+ * Temporary variable simulating the delayed update of valid global motion
+ * model across frames.
+ */
+ int temp_valid_gm_model_found[FRAME_UPDATE_TYPES];
+#endif // CONFIG_FPMT_TEST
+ /*!
+ * Copy of cm->ref_frame_map maintained to facilitate sequential update of
+ * ref_frame_map by lower layer depth frames encoded ahead of time in a
+ * parallel encode set.
+ */
+ RefCntBuffer *ref_frame_map_copy[REF_FRAMES];
+
+ /*!
+ * Start time stamp of the last encoded show frame
+ */
+ int64_t ts_start_last_show_frame;
+
+ /*!
+ * End time stamp of the last encoded show frame
+ */
+ int64_t ts_end_last_show_frame;
+
+ /*!
+ * Number of frame level contexts(cpis)
+ */
+ int num_fp_contexts;
+
+ /*!
+ * Loopfilter levels of the previous encoded frame.
+ */
+ int filter_level[2];
+
+ /*!
+ * Chrominance component loopfilter level of the previous encoded frame.
+ */
+ int filter_level_u;
+
+ /*!
+ * Chrominance component loopfilter level of the previous encoded frame.
+ */
+ int filter_level_v;
+
+ /*!
+ * Encode stage top level structure
+ * During frame parallel encode, this is the same as parallel_cpi[0]
+ */
+ struct AV1_COMP *cpi;
+
+ /*!
+ * Lookahead processing stage top level structure
+ */
+ struct AV1_COMP *cpi_lap;
+
+ /*!
+ * Look-ahead context.
+ */
+ struct lookahead_ctx *lookahead;
+
+ /*!
+ * Whether sequence parameters have already been transmitted and are locked.
+ * Once locked, av1_change_config cannot change the sequence parameters.
+ */
+ int seq_params_locked;
+
+ /*!
+ * Pointer to internal utility functions that manipulate aom_codec_* data
+ * structures.
+ */
+ struct aom_codec_pkt_list *output_pkt_list;
+
+ /*!
+ * When set, indicates that internal ARFs are enabled.
+ */
+ int internal_altref_allowed;
+
+ /*!
+ * Tells whether an OVERLAY frame shows an existing alt_ref frame.
+ */
+ int show_existing_alt_ref;
+
+ /*!
+ * Information related to a gf group.
+ */
+ GF_GROUP gf_group;
+
+ /*!
+ * Track prior gf group state.
+ */
+ GF_STATE gf_state;
+
+ /*!
+ * Flag indicating whether look ahead processing (LAP) is enabled.
+ */
+ int lap_enabled;
+
+ /*!
+ * Parameters for AV1 bitstream levels.
+ */
+ AV1LevelParams level_params;
+
+ /*!
+ * Calculates PSNR on each frame when set to 1.
+ */
+ int b_calculate_psnr;
+
+ /*!
+ * Number of frames left to be encoded; 0 if no limit is set.
+ */
+ int frames_left;
+
+ /*!
+ * Information related to two pass encoding.
+ */
+ TWO_PASS twopass;
+
+ /*!
+ * Rate control related parameters.
+ */
+ PRIMARY_RATE_CONTROL p_rc;
+
+ /*!
+ * Info and resources used by temporal filtering.
+ */
+ TEMPORAL_FILTER_INFO tf_info;
+
+ /*!
+ * Elements part of the sequence header, that are applicable for all the
+ * frames in the video.
+ */
+ SequenceHeader seq_params;
+
+ /*!
+ * Indicates whether to use SVC.
+ */
+ int use_svc;
+
+ /*!
+ * If true, buffer removal times are present.
+ */
+ bool buffer_removal_time_present;
+
+ /*!
+ * Number of temporal layers: may be > 1 for SVC (scalable video coding).
+ */
+ unsigned int number_temporal_layers;
+
+ /*!
+ * Number of spatial layers: may be > 1 for SVC (scalable vector coding).
+ */
+ unsigned int number_spatial_layers;
+
+ /*!
+ * Code and details about current error status.
+ */
+ struct aom_internal_error_info error;
+
+ /*!
+ * Function pointers to variants of sse/sad/variance computation functions.
+ * fn_ptr[i] indicates the list of function pointers corresponding to block
+ * size i.
+ */
+ aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
+
+ /*!
+ * tpl_sb_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of
+ * the ith 16 x 16 block in raster scan order.
+ */
+ double *tpl_sb_rdmult_scaling_factors;
+
+ /*!
+ * Parameters related to tpl.
+ */
+ TplParams tpl_data;
+
+ /*!
+ * Motion vector stats of the previous encoded frame.
+ */
+ MV_STATS mv_stats;
+
+#if CONFIG_INTERNAL_STATS
+ /*!\cond */
+ uint64_t total_time_receive_data;
+ uint64_t total_time_compress_data;
+
+ unsigned int total_mode_chosen_counts[MAX_MODES];
+
+ int count[2];
+ uint64_t total_sq_error[2];
+ uint64_t total_samples[2];
+ ImageStat psnr[2];
+
+ double total_blockiness;
+ double worst_blockiness;
+
+ int total_bytes;
+ double summed_quality;
+ double summed_weights;
+ double summed_quality_hbd;
+ double summed_weights_hbd;
+ unsigned int total_recode_hits;
+ double worst_ssim;
+ double worst_ssim_hbd;
+
+ ImageStat fastssim;
+ ImageStat psnrhvs;
+
+ int b_calculate_blockiness;
+ int b_calculate_consistency;
+
+ double total_inconsistency;
+ double worst_consistency;
+ Ssimv *ssim_vars;
+ Metrics metrics;
+ /*!\endcond */
+#endif
+
+#if CONFIG_ENTROPY_STATS
+ /*!
+ * Aggregates frame counts for the sequence.
+ */
+ FRAME_COUNTS aggregate_fc;
+#endif // CONFIG_ENTROPY_STATS
+
+ /*!
+ * For each type of reference frame, this contains the index of a reference
+ * frame buffer for a reference frame of the same type. We use this to
+ * choose our primary reference frame (which is the most recent reference
+ * frame of the same type as the current frame).
+ */
+ int fb_of_context_type[REF_FRAMES];
+
+ /*!
+ * Primary Multi-threading parameters.
+ */
+ PrimaryMultiThreadInfo p_mt_info;
+
+ /*!
+ * Probabilities for pruning of various AV1 tools.
+ */
+ FrameProbInfo frame_probs;
+
+ /*!
+ * Indicates if a valid global motion model has been found in the different
+ * frame update types of a GF group.
+ * valid_gm_model_found[i] indicates if a valid global motion model has been
+ * found for the frame update type with enum value equal to i.
+ */
+ int valid_gm_model_found[FRAME_UPDATE_TYPES];
+
+ /*!
+ * Struct for the reference structure for RTC.
+ */
+ RTC_REF rtc_ref;
+
+ /*!
+ * Struct for all intra mode row multi threading in the preprocess stage
+ * when --deltaq-mode=3.
+ */
+ AV1EncRowMultiThreadSync intra_row_mt_sync;
+} AV1_PRIMARY;
+
+/*!
+ * \brief Top level encoder structure.
+ */
+typedef struct AV1_COMP {
+ /*!
+ * Pointer to top level primary encoder structure
+ */
+ AV1_PRIMARY *ppi;
+
+ /*!
+ * Quantization and dequantization parameters for internal quantizer setup
+ * in the encoder.
+ */
+ EncQuantDequantParams enc_quant_dequant_params;
+
+ /*!
+ * Structure holding thread specific variables.
+ */
+ ThreadData td;
+
+ /*!
+ * Statistics collected at frame level.
+ */
+ FRAME_COUNTS counts;
+
+ /*!
+ * Holds buffer storing mode information at 4x4/8x8 level.
+ */
+ MBMIExtFrameBufferInfo mbmi_ext_info;
+
+ /*!
+ * Buffer holding the transform block related information.
+ * coeff_buffer_base[i] stores the transform block related information of the
+ * ith superblock in raster scan order.
+ */
+ CB_COEFF_BUFFER *coeff_buffer_base;
+
+ /*!
+ * Structure holding pointers to frame level memory allocated for transform
+ * block related information.
+ */
+ CoeffBufferPool coeff_buffer_pool;
+
+ /*!
+ * Structure holding variables common to encoder and decoder.
+ */
+ AV1_COMMON common;
+
+ /*!
+ * Encoder configuration related parameters.
+ */
+ AV1EncoderConfig oxcf;
+
+ /*!
+ * Stores the trellis optimization type at segment level.
+ * optimize_seg_arr[i] stores the trellis opt type for ith segment.
+ */
+ TRELLIS_OPT_TYPE optimize_seg_arr[MAX_SEGMENTS];
+
+ /*!
+ * Pointer to the frame buffer holding the source frame to be used during the
+ * current stage of encoding. It can be the raw input, temporally filtered
+ * input or scaled input.
+ */
+ YV12_BUFFER_CONFIG *source;
+
+ /*!
+ * Pointer to the frame buffer holding the last raw source frame.
+ * last_source is NULL for the following cases:
+ * 1) First frame
+ * 2) Alt-ref frames
+ * 3) All frames for all-intra frame encoding.
+ */
+ YV12_BUFFER_CONFIG *last_source;
+
+ /*!
+ * Pointer to the frame buffer holding the unscaled source frame.
+ * It can be either the raw input or temporally filtered input.
+ */
+ YV12_BUFFER_CONFIG *unscaled_source;
+
+ /*!
+ * Frame buffer holding the resized source frame (cropping / superres).
+ */
+ YV12_BUFFER_CONFIG scaled_source;
+
+ /*!
+ * Pointer to the frame buffer holding the unscaled last source frame.
+ */
+ YV12_BUFFER_CONFIG *unscaled_last_source;
+
+ /*!
+ * Frame buffer holding the resized last source frame.
+ */
+ YV12_BUFFER_CONFIG scaled_last_source;
+
+ /*!
+ * Pointer to the original source frame. This is used to determine if the
+ * content is screen.
+ */
+ YV12_BUFFER_CONFIG *unfiltered_source;
+
+ /*!
+ * Frame buffer holding the original source frame for PSNR calculation in the
+ * rtc temporal filtering case.
+ */
+ YV12_BUFFER_CONFIG orig_source;
+
+ /*!
+ * Skip tpl setup when tpl data from gop length decision can be reused.
+ */
+ int skip_tpl_setup_stats;
+
+ /*!
+ * Scaling factors used in the RD multiplier modulation.
+ * TODO(sdeng): consider merging the following arrays.
+ * tpl_rdmult_scaling_factors is a temporary buffer used to store the
+ * intermediate scaling factors which are used in the calculation of
+ * tpl_sb_rdmult_scaling_factors. tpl_rdmult_scaling_factors[i] stores the
+ * intermediate scaling factor of the ith 16 x 16 block in raster scan order.
+ */
+ double *tpl_rdmult_scaling_factors;
+
+ /*!
+ * Temporal filter context.
+ */
+ TemporalFilterCtx tf_ctx;
+
+ /*!
+ * Pointer to CDEF search context.
+ */
+ CdefSearchCtx *cdef_search_ctx;
+
+ /*!
+ * Variables related to forcing integer mv decisions for the current frame.
+ */
+ ForceIntegerMVInfo force_intpel_info;
+
+ /*!
+ * Pointer to the buffer holding the scaled reference frames.
+ * scaled_ref_buf[i] holds the scaled reference frame of type i.
+ */
+ RefCntBuffer *scaled_ref_buf[INTER_REFS_PER_FRAME];
+
+ /*!
+ * Pointer to the buffer holding the last show frame.
+ */
+ RefCntBuffer *last_show_frame_buf;
+
+ /*!
+ * Refresh frame flags for golden, bwd-ref and alt-ref frames.
+ */
+ RefreshFrameInfo refresh_frame;
+
+ /*!
+ * Flag to reduce the number of reference frame buffers used in rt.
+ */
+ int rt_reduce_num_ref_buffers;
+
+ /*!
+ * Flags signalled by the external interface at frame level.
+ */
+ ExternalFlags ext_flags;
+
+ /*!
+ * Temporary frame buffer used to store the non-loop filtered reconstructed
+ * frame during the search of loop filter level.
+ */
+ YV12_BUFFER_CONFIG last_frame_uf;
+
+ /*!
+ * Temporary frame buffer used to store the loop restored frame during loop
+ * restoration search.
+ */
+ YV12_BUFFER_CONFIG trial_frame_rst;
+
+ /*!
+ * Ambient reconstruction error target for forced key frames.
+ */
+ int64_t ambient_err;
+
+ /*!
+ * Parameters related to rate distortion optimization.
+ */
+ RD_OPT rd;
+
+ /*!
+ * Temporary coding context used to save and restore when encoding with and
+ * without super-resolution.
+ */
+ CODING_CONTEXT coding_context;
+
+ /*!
+ * Parameters related to global motion search.
+ */
+ GlobalMotionInfo gm_info;
+
+ /*!
+ * Parameters related to winner mode processing.
+ */
+ WinnerModeParams winner_mode_params;
+
+ /*!
+ * Frame time stamps.
+ */
+ TimeStamps time_stamps;
+
+ /*!
+ * Rate control related parameters.
+ */
+ RATE_CONTROL rc;
+
+ /*!
+ * Frame rate of the video.
+ */
+ double framerate;
+
+ /*!
+ * Bitmask indicating which reference buffers may be referenced by this frame.
+ */
+ int ref_frame_flags;
+
+ /*!
+ * speed is passed as a per-frame parameter into the encoder.
+ */
+ int speed;
+
+ /*!
+ * sf contains fine-grained config set internally based on speed.
+ */
+ SPEED_FEATURES sf;
+
+ /*!
+ * Parameters for motion vector search process.
+ */
+ MotionVectorSearchParams mv_search_params;
+
+ /*!
+ * When set, indicates that all reference frames are forward references,
+ * i.e., all the reference frames are output before the current frame.
+ */
+ int all_one_sided_refs;
+
+ /*!
+ * Segmentation related information for current frame.
+ */
+ EncSegmentationInfo enc_seg;
+
+ /*!
+ * Parameters related to cyclic refresh aq-mode.
+ */
+ CYCLIC_REFRESH *cyclic_refresh;
+ /*!
+ * Parameters related to active map. Active maps indicate
+ * if there is any activity on a 4x4 block basis.
+ */
+ ActiveMap active_map;
+
+ /*!
+ * The frame processing order within a GOP.
+ */
+ unsigned char gf_frame_index;
+
+#if CONFIG_INTERNAL_STATS
+ /*!\cond */
+ uint64_t time_compress_data;
+
+ unsigned int mode_chosen_counts[MAX_MODES];
+ int bytes;
+ unsigned int frame_recode_hits;
+ /*!\endcond */
+#endif
+
+#if CONFIG_SPEED_STATS
+ /*!
+ * For debugging: number of transform searches we have performed.
+ */
+ unsigned int tx_search_count;
+#endif // CONFIG_SPEED_STATS
+
+ /*!
+ * When set, indicates that the frame is droppable, i.e., this frame
+ * does not update any reference buffers.
+ */
+ int droppable;
+
+ /*!
+ * Stores the frame parameters during encoder initialization.
+ */
+ FRAME_INFO frame_info;
+
+ /*!
+ * Stores different types of frame indices.
+ */
+ FRAME_INDEX_SET frame_index_set;
+
+ /*!
+ * Stores cm->width from the last call of alloc_compressor_data(). Helps
+ * determine whether compressor data should be reallocated when cm->width
+ * changes.
+ */
+ int data_alloc_width;
+
+ /*!
+ * Stores cm->height from the last call of alloc_compressor_data(). Helps
+ * determine whether compressor data should be reallocated when cm->height
+ * changes.
+ */
+ int data_alloc_height;
+
+ /*!
+ * Number of MBs in the full-size frame; to be used to
+ * normalize the firstpass stats. This will differ from the
+ * number of MBs in the current frame when the frame is
+ * scaled.
+ */
+ int initial_mbs;
+
+ /*!
+ * Flag to indicate whether the frame size information has been set up and
+ * propagated to associated allocations.
+ */
+ bool frame_size_related_setup_done;
+
+ /*!
+ * The width of the most recently encoded frame.
+ * It is updated in the function "encoder_encode()".
+ */
+ int last_coded_width;
+
+ /*!
+ * The height of the most recently encoded frame.
+ * It is updated in the function "encoder_encode()".
+ */
+ int last_coded_height;
+
+ /*!
+ * Resize related parameters.
+ */
+ ResizePendingParams resize_pending_params;
+
+ /*!
+ * Pointer to struct holding adaptive data/contexts/models for the tile during
+ * encoding.
+ */
+ TileDataEnc *tile_data;
+ /*!
+ * Number of tiles for which memory has been allocated for tile_data.
+ */
+ int allocated_tiles;
+
+ /*!
+ * Structure to store the palette token related information.
+ */
+ TokenInfo token_info;
+
+ /*!
+ * VARIANCE_AQ segment map refresh.
+ */
+ int vaq_refresh;
+
+ /*!
+ * Thresholds for variance based partitioning.
+ */
+ VarBasedPartitionInfo vbp_info;
+
+ /*!
+ * Number of recodes in the frame.
+ */
+ int num_frame_recode;
+
+ /*!
+ * Current frame probability of parallel frames, across recodes.
+ */
+ FrameProbInfo frame_new_probs[NUM_RECODES_PER_FRAME];
+
+ /*!
+ * Retain condition for transform type frame_probability calculation
+ */
+ int do_update_frame_probs_txtype[NUM_RECODES_PER_FRAME];
+
+ /*!
+ * Retain condition for obmc frame_probability calculation
+ */
+ int do_update_frame_probs_obmc[NUM_RECODES_PER_FRAME];
+
+ /*!
+ * Retain condition for warped motion frame_probability calculation
+ */
+ int do_update_frame_probs_warp[NUM_RECODES_PER_FRAME];
+
+ /*!
+ * Retain condition for interpolation filter frame_probability calculation
+ */
+ int do_update_frame_probs_interpfilter[NUM_RECODES_PER_FRAME];
+
+#if CONFIG_FPMT_TEST
+ /*!
+ * Temporary variable for simulation.
+ * Previous frame's framerate.
+ */
+ double temp_framerate;
+#endif
+ /*!
+ * Updated framerate for the current parallel frame.
+ * cpi->framerate is updated with new_framerate during
+ * post encode updates for parallel frames.
+ */
+ double new_framerate;
+
+ /*!
+ * Retain condition for fast_extra_bits calculation.
+ */
+ int do_update_vbr_bits_off_target_fast;
+
+ /*!
+ * Multi-threading parameters.
+ */
+ MultiThreadInfo mt_info;
+
+ /*!
+ * Specifies the frame to be output. It is valid only if show_existing_frame
+ * is 1. When show_existing_frame is 0, existing_fb_idx_to_show is set to
+ * INVALID_IDX.
+ */
+ int existing_fb_idx_to_show;
+
+ /*!
+ * A flag to indicate if intrabc is ever used in current frame.
+ */
+ int intrabc_used;
+
+ /*!
+ * Marks which ref frames can be skipped when encoding the current frame
+ * during RDO.
+ */
+ int prune_ref_frame_mask;
+
+ /*!
+ * Loop Restoration context.
+ */
+ AV1LrStruct lr_ctxt;
+
+ /*!
+ * Loop Restoration context used during pick stage.
+ */
+ AV1LrPickStruct pick_lr_ctxt;
+
+ /*!
+ * Pointer to list of tables with film grain parameters.
+ */
+ aom_film_grain_table_t *film_grain_table;
+
+#if CONFIG_DENOISE
+ /*!
+ * Pointer to structure holding the denoised image buffers and the helper
+ * noise models.
+ */
+ struct aom_denoise_and_model_t *denoise_and_model;
+#endif
+
+ /*!
+ * Flags related to interpolation filter search.
+ */
+ InterpSearchFlags interp_search_flags;
+
+ /*!
+ * Turn on screen content tools flag.
+ * Note that screen content tools can also improve coding efficiency for
+ * some videos that are not screen content, e.g., videos with large flat
+ * regions or gaming videos that look like natural videos.
+ */
+ int use_screen_content_tools;
+
+ /*!
+ * A flag to indicate "real" screen content videos.
+ * For example, screen shares, screen editing.
+ * This type is true indicates |use_screen_content_tools| must be true.
+ * In addition, rate control strategy is adjusted when this flag is true.
+ */
+ int is_screen_content_type;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ /*!
+ * Accumulates the partition timing stat over the whole frame.
+ */
+ FramePartitionTimingStats partition_stats;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ /*!
+ * component_time[] entries are initialized to zero when the encoder starts.
+ */
+ uint64_t component_time[kTimingComponents];
+ /*!
+ * Stores timing for individual components between calls of start_timing()
+ * and end_timing().
+ */
+ struct aom_usec_timer component_timer[kTimingComponents];
+ /*!
+ * frame_component_time[] are initialized to zero at beginning of each frame.
+ */
+ uint64_t frame_component_time[kTimingComponents];
+#endif
+
+ /*!
+ * Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation.
+ */
+ int frame_header_count;
+
+ /*!
+ * Whether any non-zero delta_q was actually used.
+ */
+ int deltaq_used;
+
+ /*!
+ * Reference frame distance related variables.
+ */
+ RefFrameDistanceInfo ref_frame_dist_info;
+
+ /*!
+ * ssim_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of
+ * the ith 16 x 16 block in raster scan order. This scaling factor is used for
+ * RD multiplier modulation when SSIM tuning is enabled.
+ */
+ double *ssim_rdmult_scaling_factors;
+
+#if CONFIG_TUNE_VMAF
+ /*!
+ * Parameters for VMAF tuning.
+ */
+ TuneVMAFInfo vmaf_info;
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ /*!
+ * Parameters for Butteraugli tuning.
+ */
+ TuneButteraugliInfo butteraugli_info;
+#endif
+
+ /*!
+ * Parameters for scalable video coding.
+ */
+ SVC svc;
+
+ /*!
+ * Indicates whether current processing stage is encode stage or LAP stage.
+ */
+ COMPRESSOR_STAGE compressor_stage;
+
+ /*!
+ * Frame type of the last frame. May be used in some heuristics for speeding
+ * up the encoding.
+ */
+ FRAME_TYPE last_frame_type;
+
+ /*!
+ * Number of tile-groups.
+ */
+ int num_tg;
+
+ /*!
+ * Super-resolution mode currently being used by the encoder.
+ * This may / may not be same as user-supplied mode in oxcf->superres_mode
+ * (when we are recoding to try multiple options for example).
+ */
+ aom_superres_mode superres_mode;
+
+ /*!
+ * First pass related data.
+ */
+ FirstPassData firstpass_data;
+
+ /*!
+ * Temporal Noise Estimate
+ */
+ NOISE_ESTIMATE noise_estimate;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ /*!
+ * Temporal Denoiser
+ */
+ AV1_DENOISER denoiser;
+#endif
+
+ /*!
+ * Counts how many consecutive times a block uses small/zero mv for encoding,
+ * at 8x8 block granularity.
+ */
+ uint8_t *consec_zero_mv;
+
+ /*!
+ * Allocated memory size for |consec_zero_mv|.
+ */
+ int consec_zero_mv_alloc_size;
+
+ /*!
+ * Block size of first pass encoding
+ */
+ BLOCK_SIZE fp_block_size;
+
+ /*!
+ * Counter of encoded superblocks, used to differentiate block names.
+ * This number starts from 0 and increases whenever a superblock is encoded.
+ */
+ int sb_counter;
+
+ /*!
+ * Available bitstream buffer size in bytes
+ */
+ size_t available_bs_size;
+
+ /*!
+ * The controller of the external partition model.
+ * It is used to do partition type selection based on external models.
+ */
+ ExtPartController ext_part_controller;
+
+ /*!
+ * Motion vector stats of the current encoded frame, used to update the
+ * ppi->mv_stats during postencode.
+ */
+ MV_STATS mv_stats;
+ /*!
+ * Stores the reference refresh index for the current frame.
+ */
+ int ref_refresh_index;
+
+ /*!
+ * A flag to indicate if the reference refresh index is available for the
+ * current frame.
+ */
+ bool refresh_idx_available;
+
+ /*!
+ * Reference frame index corresponding to the frame to be excluded from being
+ * used as a reference by frame_parallel_level 2 frame in a parallel
+ * encode set of lower layer frames.
+ */
+ int ref_idx_to_skip;
+#if CONFIG_FPMT_TEST
+ /*!
+ * Stores the wanted frame buffer index for choosing primary ref frame by a
+ * frame_parallel_level 2 frame in a parallel encode set of lower layer
+ * frames.
+ */
+ int wanted_fb;
+#endif // CONFIG_FPMT_TEST
+
+ /*!
+ * A flag to indicate frames that will update their data to the primary
+ * context at the end of the encode. It is set for non-parallel frames and the
+ * last frame in encode order in a given parallel encode set.
+ */
+ bool do_frame_data_update;
+
+#if CONFIG_RD_COMMAND
+ /*!
+ * A structure for assigning external q_index / rdmult for experiments
+ */
+ RD_COMMAND rd_command;
+#endif // CONFIG_RD_COMMAND
+
+ /*!
+ * Buffer to store MB variance after Wiener filter.
+ */
+ WeberStats *mb_weber_stats;
+
+ /*!
+ * Buffer to store rate cost estimates for each macro block (8x8) in the
+ * preprocessing stage used in allintra mode.
+ */
+ int *prep_rate_estimates;
+
+ /*!
+ * Buffer to store rate cost estimates for each 16x16 block read
+ * from an external file, used in allintra mode.
+ */
+ double *ext_rate_distribution;
+
+ /*!
+ * The scale that equals sum_rate_uniform_quantizer / sum_ext_rate.
+ */
+ double ext_rate_scale;
+
+ /*!
+ * Block size used when computing MB variance after the Wiener filter.
+ */
+ BLOCK_SIZE weber_bsize;
+
+ /*!
+ * Frame level Wiener filter normalization.
+ */
+ int64_t norm_wiener_variance;
+
+ /*!
+ * Buffer to store delta-q values for delta-q mode 4.
+ */
+ int *mb_delta_q;
+
+ /*!
+ * Flag to indicate that current frame is dropped.
+ */
+ bool is_dropped_frame;
+
+#if CONFIG_BITRATE_ACCURACY
+ /*!
+ * Structure stores information needed for bitrate accuracy experiment.
+ */
+ VBR_RATECTRL_INFO vbr_rc_info;
+#endif
+
+#if CONFIG_RATECTRL_LOG
+ /*!
+ * Structure stores information of rate control decisions.
+ */
+ RATECTRL_LOG rc_log;
+#endif // CONFIG_RATECTRL_LOG
+
+ /*!
+ * Frame level twopass status and control data
+ */
+ TWO_PASS_FRAME twopass_frame;
+
+ /*!
+ * Context needed for third pass encoding.
+ */
+ THIRD_PASS_DEC_CTX *third_pass_ctx;
+
+ /*!
+ * File pointer to second pass log
+ */
+ FILE *second_pass_log_stream;
+
+ /*!
+ * Buffer to store 64x64 SAD
+ */
+ uint64_t *src_sad_blk_64x64;
+
+ /*!
+ * SSE between the current frame and the reconstructed last frame.
+ * It is only used for CBR mode, and is not used if the reference frame has
+ * a different frame size.
+ */
+ uint64_t rec_sse;
+
+ /*!
+ * A flag to indicate whether the encoder is controlled by DuckyEncode or not.
+ * 1: yes, 0: no.
+ */
+ int use_ducky_encode;
+
+#if !CONFIG_REALTIME_ONLY
+ /*! A structure that facilitates the communication between DuckyEncode and AV1
+ * encoder.
+ */
+ DuckyEncodeInfo ducky_encode_info;
+#endif // !CONFIG_REALTIME_ONLY
+ /*!
+ * Frames since last frame with cdf update.
+ */
+ int frames_since_last_update;
+
+ /*!
+ * Block level thresholds to force zeromv-skip at partition level.
+ */
+ unsigned int zeromv_skip_thresh_exit_part[BLOCK_SIZES_ALL];
+
+ /*!
+ * Number of downsampling pyramid levels to allocate for each frame.
+ * This is currently only used for global motion.
+ */
+ int image_pyramid_levels;
+
+#if CONFIG_SALIENCY_MAP
+ /*!
+ * Pixel level saliency map for each frame.
+ */
+ uint8_t *saliency_map;
+
+ /*!
+ * Superblock level rdmult scaling factor driven by saliency map.
+ */
+ double *sm_scaling_factor;
+#endif
+
+ /*!
+ * Number of pixels that choose palette mode for luma in the
+ * fast encoding pass in av1_determine_sc_tools_with_encoding().
+ */
+ int palette_pixel_num;
+
+ /*!
+ * Flag to indicate scaled_last_source is available,
+ * so scaling is not needed for last_source.
+ */
+ int scaled_last_source_available;
+} AV1_COMP;
+
+/*!
+ * \brief Input frames and last input frame
+ */
+typedef struct EncodeFrameInput {
+ /*!\cond */
+ YV12_BUFFER_CONFIG *source;
+ YV12_BUFFER_CONFIG *last_source;
+ int64_t ts_duration;
+ /*!\endcond */
+} EncodeFrameInput;
+
+/*!
+ * \brief Contains per-frame encoding parameters decided upon by
+ * av1_encode_strategy() and passed down to av1_encode().
+ */
+typedef struct EncodeFrameParams {
+ /*!
+ * Is error resilient mode enabled
+ */
+ int error_resilient_mode;
+ /*!
+ * Frame type (e.g., KF vs inter frame).
+ */
+ FRAME_TYPE frame_type;
+
+ /*!\cond */
+ int primary_ref_frame;
+ int order_offset;
+
+ /*!\endcond */
+ /*!
+ * Whether the current frame should be displayed after being decoded.
+ */
+ int show_frame;
+
+ /*!\cond */
+ int refresh_frame_flags;
+
+ int show_existing_frame;
+ int existing_fb_idx_to_show;
+
+ /*!\endcond */
+ /*!
+ * Bitmask of which reference buffers may be referenced by this frame.
+ */
+ int ref_frame_flags;
+
+ /*!
+ * Reference buffer assignment for this frame.
+ */
+ int remapped_ref_idx[REF_FRAMES];
+
+ /*!
+ * Flags which determine which reference buffers are refreshed by this
+ * frame.
+ */
+ RefreshFrameInfo refresh_frame;
+
+ /*!
+ * Speed level to use for this frame: Bigger number means faster.
+ */
+ int speed;
+} EncodeFrameParams;
+
+/*!\cond */
+
+// EncodeFrameResults contains information about the result of encoding a
+// single frame
+typedef struct {
+ size_t size; // Size of resulting bitstream
+} EncodeFrameResults;
+
+void av1_initialize_enc(unsigned int usage, enum aom_rc_mode end_usage);
+
+struct AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi,
+ const AV1EncoderConfig *oxcf,
+ BufferPool *const pool,
+ COMPRESSOR_STAGE stage,
+ int lap_lag_in_frames);
+
+struct AV1_PRIMARY *av1_create_primary_compressor(
+ struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers,
+ const AV1EncoderConfig *oxcf);
+
+void av1_remove_compressor(AV1_COMP *cpi);
+
+void av1_remove_primary_compressor(AV1_PRIMARY *ppi);
+
+#if CONFIG_ENTROPY_STATS
+void print_entropy_stats(AV1_PRIMARY *const ppi);
+#endif
+#if CONFIG_INTERNAL_STATS
+void print_internal_stats(AV1_PRIMARY *ppi);
+#endif
+
+void av1_change_config_seq(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
+ bool *sb_size_changed);
+
+void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
+ bool sb_size_changed);
+
+aom_codec_err_t av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
+ int subsampling_x, int subsampling_y);
+
+void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi,
+ const AV1EncoderConfig *oxcf, int use_svc);
+
+void av1_post_encode_updates(AV1_COMP *const cpi,
+ const AV1_COMP_DATA *const cpi_data);
+
+void av1_scale_references_fpmt(AV1_COMP *cpi, int *ref_buffers_used_map);
+
+void av1_increment_scaled_ref_counts_fpmt(BufferPool *buffer_pool,
+ int ref_buffers_used_map);
+
+void av1_release_scaled_references_fpmt(AV1_COMP *cpi);
+
+void av1_decrement_ref_counts_fpmt(BufferPool *buffer_pool,
+ int ref_buffers_used_map);
+
+void av1_init_sc_decisions(AV1_PRIMARY *const ppi);
+
+AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi,
+ AV1_COMP_DATA *const first_cpi_data);
+
+int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data,
+ AV1_PRIMARY *const ppi,
+ int *ref_buffers_used_map);
+/*!\endcond */
+
+/*!\brief Obtain the raw frame data
+ *
+ * \ingroup high_level_algo
+ * This function receives the raw frame data from input.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] frame_flags Flags to decide how to encode the frame
+ * \param[in,out] sd Contains raw frame data
+ * \param[in] time_stamp Time stamp of the frame
+ * \param[in] end_time_stamp End time stamp
+ *
+ * \return Returns a value to indicate if the frame data is received
+ * successfully.
+ * \note The caller can assume that a copy of this frame is made and not just a
+ * copy of the pointer.
+ */
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time_stamp);
+
+/*!\brief Encode a frame
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ * This function encodes the raw frame data, and outputs the frame bit stream
+ * to the designated buffer. The caller should use the output parameters
+ * cpi_data->ts_frame_start and cpi_data->ts_frame_end only when this function
+ * returns AOM_CODEC_OK.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in,out] cpi_data Data corresponding to a frame encode
+ *
+ * \return Returns a value to indicate if the encoding is done successfully.
+ * \retval #AOM_CODEC_OK
+ * \retval -1
+ * No frame encoded; more input is required.
+ * \retval "A nonzero (positive) aom_codec_err_t code"
+ * The encoding failed with the error. Sets the error code and error message
+ * in \c cpi->common.error.
+ */
+int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data);
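+
+/*
+ * Illustrative calling sequence (a sketch; the authoritative usage is the
+ * aom_codec_* wrapper in av1/av1_cx_iface.c). 'buf' and 'buf_sz' are
+ * hypothetical.
+ *
+ *   if (av1_receive_raw_frame(cpi, flags, &sd, ts_start, ts_end) == 0) {
+ *     AV1_COMP_DATA cpi_data = { 0 };
+ *     cpi_data.cx_data = buf;
+ *     cpi_data.cx_data_sz = buf_sz;
+ *     cpi_data.timestamp_ratio = &timestamp_ratio;
+ *     if (av1_get_compressed_data(cpi, &cpi_data) == AOM_CODEC_OK &&
+ *         cpi_data.frame_size > 0) {
+ *       // cpi_data.frame_size bytes of bitstream are now in 'buf'.
+ *     }
+ *   }
+ */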
+
+/*!\brief Run 1-pass/2-pass encoding
+ *
+ * \ingroup high_level_algo
+ * \callgraph
+ * \callergraph
+ */
+int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
+ const EncodeFrameInput *const frame_input,
+ const EncodeFrameParams *const frame_params,
+ EncodeFrameResults *const frame_results);
+
+/*!\cond */
+int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest);
+
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame);
+
+aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *new_frame,
+ YV12_BUFFER_CONFIG *sd);
+
+int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags);
+
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+
+void av1_set_frame_size(AV1_COMP *cpi, int width, int height);
+
+void av1_set_mv_search_params(AV1_COMP *cpi);
+
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_set_internal_size(AV1EncoderConfig *const oxcf,
+ ResizePendingParams *resize_pending_params,
+ AOM_SCALING_MODE horiz_mode,
+ AOM_SCALING_MODE vert_mode);
+
+int av1_get_quantizer(struct AV1_COMP *cpi);
+
+int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);
+
+void av1_alloc_mb_wiener_var_pred_buf(AV1_COMMON *cm, ThreadData *td);
+
+void av1_dealloc_mb_wiener_var_pred_buf(ThreadData *td);
+
+// Set screen content options.
+// This function estimates whether to use screen content tools, by counting
+// the portion of blocks that have few luma colors.
+// Modifies:
+// cpi->common.features.allow_screen_content_tools
+// cpi->common.features.allow_intrabc
+// cpi->use_screen_content_tools
+// cpi->is_screen_content_type
+// However, the estimation is not accurate and may misclassify videos.
+// A slower but more accurate approach that determines whether to use screen
+// content tools is employed later. See av1_determine_sc_tools_with_encoding().
+void av1_set_screen_content_options(struct AV1_COMP *cpi,
+ FeatureFlags *features);
+
+void av1_update_frame_size(AV1_COMP *cpi);
+
+typedef struct {
+ int pyr_level;
+ int disp_order;
+} RefFrameMapPair;
+
+static INLINE void init_ref_map_pair(
+ AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) {
+ if (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE) {
+ memset(ref_frame_map_pairs, -1, sizeof(*ref_frame_map_pairs) * REF_FRAMES);
+ return;
+ }
+ memset(ref_frame_map_pairs, 0, sizeof(*ref_frame_map_pairs) * REF_FRAMES);
+ for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) {
+ // Get reference frame buffer.
+ const RefCntBuffer *const buf = cpi->common.ref_frame_map[map_idx];
+ if (ref_frame_map_pairs[map_idx].disp_order == -1) continue;
+ if (buf == NULL) {
+ ref_frame_map_pairs[map_idx].disp_order = -1;
+ ref_frame_map_pairs[map_idx].pyr_level = -1;
+ continue;
+ } else if (buf->ref_count > 1) {
+ // Once the keyframe is coded, the slots in ref_frame_map will all
+ // point to the same frame. In that case, all subsequent pointers
+ // matching the current are considered "free" slots. This will find
+ // the next occurrence of the current pointer if ref_count indicates
+ // there are multiple instances of it and mark it as free.
+ for (int idx2 = map_idx + 1; idx2 < REF_FRAMES; ++idx2) {
+ const RefCntBuffer *const buf2 = cpi->common.ref_frame_map[idx2];
+ if (buf2 == buf) {
+ ref_frame_map_pairs[idx2].disp_order = -1;
+ ref_frame_map_pairs[idx2].pyr_level = -1;
+ }
+ }
+ }
+ ref_frame_map_pairs[map_idx].disp_order = (int)buf->display_order_hint;
+ ref_frame_map_pairs[map_idx].pyr_level = buf->pyramid_level;
+ }
+}
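+
+/*
+ * Typical usage (a sketch): build the pairs once before deciding which
+ * reference slot to refresh; slots whose disp_order is -1 are free.
+ *
+ *   RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
+ *   init_ref_map_pair(cpi, ref_frame_map_pairs);
+ */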
+
+#if CONFIG_FPMT_TEST
+static AOM_INLINE void calc_frame_data_update_flag(
+ GF_GROUP *const gf_group, int gf_frame_index,
+ bool *const do_frame_data_update) {
+ *do_frame_data_update = true;
+ // Set the flag to false for all frames in a given parallel encode set except
+ // the last frame in the set with frame_parallel_level = 2.
+ if (gf_group->frame_parallel_level[gf_frame_index] == 1) {
+ *do_frame_data_update = false;
+ } else if (gf_group->frame_parallel_level[gf_frame_index] == 2) {
+ // Check if this is the last frame in the set with frame_parallel_level = 2.
+ for (int i = gf_frame_index + 1; i < gf_group->size; i++) {
+ if ((gf_group->frame_parallel_level[i] == 0 &&
+ (gf_group->update_type[i] == ARF_UPDATE ||
+ gf_group->update_type[i] == INTNL_ARF_UPDATE)) ||
+ gf_group->frame_parallel_level[i] == 1) {
+ break;
+ } else if (gf_group->frame_parallel_level[i] == 2) {
+ *do_frame_data_update = false;
+ break;
+ }
+ }
+ }
+}
+#endif
+
+// AV1 uses 10,000,000 ticks per second as its timestamp unit.
+#define TICKS_PER_SEC 10000000LL
+
+static INLINE int64_t
+timebase_units_to_ticks(const aom_rational64_t *timestamp_ratio, int64_t n) {
+ return n * timestamp_ratio->num / timestamp_ratio->den;
+}
+
+static INLINE int64_t
+ticks_to_timebase_units(const aom_rational64_t *timestamp_ratio, int64_t n) {
+ int64_t round = timestamp_ratio->num / 2;
+ if (round > 0) --round;
+ return (n * timestamp_ratio->den + round) / timestamp_ratio->num;
+}
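+
+// Illustrative example (not part of the upstream source): assuming the
+// ratio is initialized as (timebase.num * TICKS_PER_SEC) / timebase.den,
+// a 1/30-second timebase gives num = 10000000 and den = 30. Then
+// timebase_units_to_ticks(r, 3) = 3 * 10000000 / 30 = 1000000 ticks, and
+// ticks_to_timebase_units(r, 1000000) = (1000000 * 30 + 4999999) / 10000000
+// = 3, so the round trip is stable.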
+
+static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const FRAME_UPDATE_TYPE update_type =
+ gf_group->update_type[cpi->gf_frame_index];
+
+ return frame_is_intra_only(&cpi->common) || update_type == ARF_UPDATE ||
+ update_type == GF_UPDATE;
+}
+
+// TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD.
+static INLINE int av1_use_hash_me(const AV1_COMP *const cpi) {
+ return (cpi->common.features.allow_screen_content_tools &&
+ cpi->common.features.allow_intrabc &&
+ frame_is_intra_only(&cpi->common));
+}
+
+static INLINE const YV12_BUFFER_CONFIG *get_ref_frame_yv12_buf(
+ const AV1_COMMON *const cm, MV_REFERENCE_FRAME ref_frame) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ return buf != NULL ? &buf->buf : NULL;
+}
+
+static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) {
+ assert(buf != NULL);
+ ensure_mv_buffer(buf, cm);
+ buf->width = cm->width;
+ buf->height = cm->height;
+}
+
+// Get the allocated token size for a tile. It does the same calculation as in
+// the frame token allocation.
+static INLINE unsigned int allocated_tokens(const TileInfo *tile,
+ int sb_size_log2, int num_planes) {
+ int tile_mb_rows =
+ ROUND_POWER_OF_TWO(tile->mi_row_end - tile->mi_row_start, 2);
+ int tile_mb_cols =
+ ROUND_POWER_OF_TWO(tile->mi_col_end - tile->mi_col_start, 2);
+
+ return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes);
+}
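+
+// Worked example (illustrative): a tile spanning 38 mi units vertically has
+// tile_mb_rows = ROUND_POWER_OF_TWO(38, 2) = (38 + 2) >> 2 = 10 rows of
+// 16x16 macroblocks, i.e. 4x4 mi units are converted to MB units by
+// rounding to the nearest multiple of 4.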
+
+static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col,
+ int mi_row, TokenExtra **tok, int sb_size_log2,
+ int num_planes) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+
+ const int tile_mb_cols =
+ (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+ const int tile_mb_row = (mi_row - tile_info->mi_row_start + 2) >> 2;
+
+ *tok = cpi->token_info.tile_tok[tile_row][tile_col] +
+ get_token_alloc(tile_mb_row, tile_mb_cols, sb_size_log2, num_planes);
+}
+
+void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags);
+
+#define ALT_MIN_LAG 3
+static INLINE int is_altref_enabled(int lag_in_frames, bool enable_auto_arf) {
+ return lag_in_frames >= ALT_MIN_LAG && enable_auto_arf;
+}
+
+static AOM_INLINE int can_disable_altref(const GFConfig *gf_cfg) {
+ return is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) &&
+ (gf_cfg->gf_min_pyr_height == 0);
+}
+
+// Helper function to compute the number of blocks (rounded up) along one
+// dimension of the frame.
+static INLINE int get_num_blocks(const int frame_length, const int mb_length) {
+ return (frame_length + mb_length - 1) / mb_length;
+}
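+
+// Example (illustrative): get_num_blocks(1924, 16) = (1924 + 15) / 16 = 121;
+// the ceiling division counts the partially covered block at the frame edge.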
+
+// Check if the encoder is in the statistics generation stage.
+static INLINE int is_stat_generation_stage(const AV1_COMP *const cpi) {
+ assert(IMPLIES(cpi->compressor_stage == LAP_STAGE,
+ cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->ppi->lap_enabled));
+ return (cpi->oxcf.pass == AOM_RC_FIRST_PASS ||
+ (cpi->compressor_stage == LAP_STAGE));
+}
+// Check if the encoder is in the two-pass statistics consumption stage.
+static INLINE int is_stat_consumption_stage_twopass(const AV1_COMP *const cpi) {
+ return (cpi->oxcf.pass >= AOM_RC_SECOND_PASS);
+}
+
+// Check if the encoder is in any statistics consumption stage.
+static INLINE int is_stat_consumption_stage(const AV1_COMP *const cpi) {
+ return (is_stat_consumption_stage_twopass(cpi) ||
+ (cpi->oxcf.pass == AOM_RC_ONE_PASS &&
+ (cpi->compressor_stage == ENCODE_STAGE) && cpi->ppi->lap_enabled));
+}
+
+// Decide whether 'dv_costs' needs to be allocated/stored during encoding.
+static AOM_INLINE bool av1_need_dv_costs(const AV1_COMP *const cpi) {
+ return !cpi->sf.rt_sf.use_nonrd_pick_mode &&
+ av1_allow_intrabc(&cpi->common) && !is_stat_generation_stage(cpi);
+}
+
+/*!\endcond */
+/*!\brief Check if the current stage has no statistics
+ *
+ *\ingroup two_pass_algo
+ *
+ * \param[in]    cpi     Top-level encoder instance structure
+ *
+ * \return 1 if there are no stats for the current stage, else 0
+ */
+static INLINE int has_no_stats_stage(const AV1_COMP *const cpi) {
+ assert(
+ IMPLIES(!cpi->ppi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE));
+ return (cpi->oxcf.pass == AOM_RC_ONE_PASS && !cpi->ppi->lap_enabled);
+}
+
+/*!\cond */
+
+static INLINE int is_one_pass_rt_params(const AV1_COMP *cpi) {
+ return has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+ cpi->oxcf.gf_cfg.lag_in_frames == 0;
+}
+
+// Use default/internal reference structure for single-layer RTC.
+static INLINE int use_rtc_reference_structure_one_layer(const AV1_COMP *cpi) {
+ return is_one_pass_rt_params(cpi) && cpi->ppi->number_spatial_layers == 1 &&
+ cpi->ppi->number_temporal_layers == 1 &&
+ !cpi->ppi->rtc_ref.set_ref_frame_config;
+}
+
+// Returns the size of the frame stats buffer.
+static INLINE int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) {
+  /* If lookahead is enabled, return num_lap_buffer + 1; else num_lag_buffer. */
+ return (num_lap_buffer > 0 ? num_lap_buffer + 1 : num_lag_buffer);
+}
+
+// TODO(zoeliu): To set up cpi->oxcf.gf_cfg.enable_auto_brf
+
+static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ MV_REFERENCE_FRAME ref0,
+ MV_REFERENCE_FRAME ref1) {
+ xd->block_ref_scale_factors[0] =
+ get_ref_scale_factors_const(cm, ref0 >= LAST_FRAME ? ref0 : 1);
+ xd->block_ref_scale_factors[1] =
+ get_ref_scale_factors_const(cm, ref1 >= LAST_FRAME ? ref1 : 1);
+}
+
+static INLINE int get_chessboard_index(int frame_index) {
+ return frame_index & 0x1;
+}
+
+static INLINE const int *cond_cost_list_const(const struct AV1_COMP *cpi,
+ const int *cost_list) {
+ const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE &&
+ cpi->sf.mv_sf.use_fullpel_costlist;
+ return use_cost_list ? cost_list : NULL;
+}
+
+static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) {
+ const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE &&
+ cpi->sf.mv_sf.use_fullpel_costlist;
+ return use_cost_list ? cost_list : NULL;
+}
+
+// Returns the compression ratio of the current frame.
+double av1_get_compression_ratio(const AV1_COMMON *const cm,
+ size_t encoded_frame_size);
+
+void av1_new_framerate(AV1_COMP *cpi, double framerate);
+
+void av1_setup_frame_size(AV1_COMP *cpi);
+
+#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
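+// Example (illustrative): with 3 temporal layers, LAYER_IDS_TO_IDX(1, 2, 3) =
+// 1 * 3 + 2 = 5, i.e. spatial layer 1 / temporal layer 2 maps to index 5 of a
+// flattened per-layer array.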
+
+// Returns 1 if a frame is scaled and 0 otherwise.
+static INLINE int av1_resize_scaled(const AV1_COMMON *cm) {
+ return cm->superres_upscaled_width != cm->render_width ||
+ cm->superres_upscaled_height != cm->render_height;
+}
+
+static INLINE int av1_frame_scaled(const AV1_COMMON *cm) {
+ return av1_superres_scaled(cm) || av1_resize_scaled(cm);
+}
+
+// Don't allow a show_existing_frame to coincide with an error resilient
+// frame. An exception can be made for a forward keyframe since it has no
+// previous dependencies.
+static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) {
+ return cm->show_existing_frame && (!cm->features.error_resilient_mode ||
+ cm->current_frame.frame_type == KEY_FRAME);
+}
+
+// Get index into the 'cpi->mbmi_ext_info.frame_base' array for the given
+// 'mi_row' and 'mi_col'.
+static INLINE int get_mi_ext_idx(const int mi_row, const int mi_col,
+ const BLOCK_SIZE mi_alloc_bsize,
+ const int mbmi_ext_stride) {
+ const int mi_ext_size_1d = mi_size_wide[mi_alloc_bsize];
+ const int mi_ext_row = mi_row / mi_ext_size_1d;
+ const int mi_ext_col = mi_col / mi_ext_size_1d;
+ return mi_ext_row * mbmi_ext_stride + mi_ext_col;
+}
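+
+// Example (illustrative): with mi_alloc_bsize == BLOCK_16X16 we have
+// mi_ext_size_1d == 4, so (mi_row, mi_col) == (8, 12) maps to ext row 2 and
+// ext col 3, i.e. index 2 * mbmi_ext_stride + 3.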
+
+// Lighter version of set_offsets that only sets the mode info
+// pointers.
+static INLINE void set_mode_info_offsets(
+ const CommonModeInfoParams *const mi_params,
+ const MBMIExtFrameBufferInfo *const mbmi_ext_info, MACROBLOCK *const x,
+ MACROBLOCKD *const xd, int mi_row, int mi_col) {
+ set_mi_offsets(mi_params, xd, mi_row, mi_col);
+ const int ext_idx = get_mi_ext_idx(mi_row, mi_col, mi_params->mi_alloc_bsize,
+ mbmi_ext_info->stride);
+ x->mbmi_ext_frame = mbmi_ext_info->frame_base + ext_idx;
+}
+
+// Check to see if the given partition size is allowed for a specified number
+// of mi block rows and columns remaining in the image.
+// If not, then return the largest allowed partition size.
+static INLINE BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
+ int cols_left, int *bh, int *bw) {
+ int int_size = (int)bsize;
+ if (rows_left <= 0 || cols_left <= 0) {
+ return AOMMIN(bsize, BLOCK_8X8);
+ } else {
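+    // Stepping int_size down by 3 moves between square BLOCK_SIZEs (e.g.
+    // BLOCK_128X128 -> BLOCK_64X64), since square sizes sit three apart in
+    // the BLOCK_SIZE enum, separated by their two rectangular variants.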
+ for (; int_size > 0; int_size -= 3) {
+ *bh = mi_size_high[int_size];
+ *bw = mi_size_wide[int_size];
+ if ((*bh <= rows_left) && (*bw <= cols_left)) {
+ break;
+ }
+ }
+ }
+ return (BLOCK_SIZE)int_size;
+}
+
+static const uint8_t av1_ref_frame_flag_list[REF_FRAMES] = { 0,
+ AOM_LAST_FLAG,
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+ AOM_GOLD_FLAG,
+ AOM_BWD_FLAG,
+ AOM_ALT2_FLAG,
+ AOM_ALT_FLAG };
+
+// When more than 'max_allowed_refs' are available, we reduce the number of
+// reference frames one at a time based on this order.
+static const MV_REFERENCE_FRAME disable_order[] = {
+ LAST3_FRAME,
+ LAST2_FRAME,
+ ALTREF2_FRAME,
+ BWDREF_FRAME,
+};
+
+static const MV_REFERENCE_FRAME
+ ref_frame_priority_order[INTER_REFS_PER_FRAME] = {
+ LAST_FRAME, ALTREF_FRAME, BWDREF_FRAME, GOLDEN_FRAME,
+ ALTREF2_FRAME, LAST2_FRAME, LAST3_FRAME,
+ };
+
+static INLINE int get_ref_frame_flags(const SPEED_FEATURES *const sf,
+ const int use_one_pass_rt_params,
+ const YV12_BUFFER_CONFIG **ref_frames,
+ const int ext_ref_frame_flags) {
+ // cpi->ext_flags.ref_frame_flags allows certain reference types to be
+ // disabled by the external interface. These are set by
+ // av1_apply_encoding_flags(). Start with what the external interface allows,
+ // then suppress any reference types which we have found to be duplicates.
+ int flags = ext_ref_frame_flags;
+
+ for (int i = 1; i < INTER_REFS_PER_FRAME; ++i) {
+ const YV12_BUFFER_CONFIG *const this_ref = ref_frames[i];
+ // If this_ref has appeared before, mark the corresponding ref frame as
+ // invalid. For one_pass_rt mode, only disable GOLDEN_FRAME if it's the
+ // same as LAST_FRAME or ALTREF_FRAME (if ALTREF is being used in nonrd).
+ int index =
+ (use_one_pass_rt_params && ref_frame_priority_order[i] == GOLDEN_FRAME)
+ ? (1 + sf->rt_sf.use_nonrd_altref_frame)
+ : i;
+ for (int j = 0; j < index; ++j) {
+ // If this_ref has appeared before (same as the reference corresponding
+ // to lower index j), remove it as a reference only if that reference
+ // (for index j) is actually used as a reference.
+ if (this_ref == ref_frames[j] &&
+ (flags & (1 << (ref_frame_priority_order[j] - 1)))) {
+ flags &= ~(1 << (ref_frame_priority_order[i] - 1));
+ break;
+ }
+ }
+ }
+ return flags;
+}
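+
+// Example (illustrative): flag bits are indexed by (reference - 1) since
+// LAST_FRAME == 1. If LAST2_FRAME's buffer is identical to LAST_FRAME's and
+// the LAST_FRAME bit is still set, the loop above clears
+// (1 << (LAST2_FRAME - 1)) so the duplicate reference is dropped.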
+
+// Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon
+// failure. When a non-NULL aom_fixed_buf_t pointer is returned, the caller
+// must free both the buf member of the aom_fixed_buf_t and the
+// aom_fixed_buf_t pointer itself, each via a call to free().
+//
+// Note: The OBU returned is in Low Overhead Bitstream Format. Specifically,
+// the obu_has_size_field bit is set, and the buffer contains the obu_size
+// field.
+aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi);
+
+#define MAX_GFUBOOST_FACTOR 10.0
+#define MIN_GFUBOOST_FACTOR 4.0
+
+static INLINE int is_frame_tpl_eligible(const GF_GROUP *const gf_group,
+ uint8_t index) {
+ const FRAME_UPDATE_TYPE update_type = gf_group->update_type[index];
+ return update_type == ARF_UPDATE || update_type == GF_UPDATE ||
+ update_type == KF_UPDATE;
+}
+
+static INLINE int is_frame_eligible_for_ref_pruning(const GF_GROUP *gf_group,
+ int selective_ref_frame,
+ int prune_ref_frames,
+ int gf_index) {
+ return (selective_ref_frame > 0) && (prune_ref_frames > 0) &&
+ !is_frame_tpl_eligible(gf_group, gf_index);
+}
+
+// Get update type of the current frame.
+static INLINE FRAME_UPDATE_TYPE get_frame_update_type(const GF_GROUP *gf_group,
+ int gf_frame_index) {
+ return gf_group->update_type[gf_frame_index];
+}
+
+static INLINE int av1_pixels_to_mi(int pixels) {
+ return ALIGN_POWER_OF_TWO(pixels, 3) >> MI_SIZE_LOG2;
+}
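+
+// Example (illustrative): av1_pixels_to_mi(1921) aligns to a multiple of 8
+// first (ALIGN_POWER_OF_TWO(1921, 3) = 1928) and then converts to 4x4 mi
+// units: 1928 >> MI_SIZE_LOG2 = 482.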
+
+static AOM_INLINE int is_psnr_calc_enabled(const AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ return cpi->ppi->b_calculate_psnr && !is_stat_generation_stage(cpi) &&
+ cm->show_frame;
+}
+
+static INLINE int is_frame_resize_pending(const AV1_COMP *const cpi) {
+ const ResizePendingParams *const resize_pending_params =
+ &cpi->resize_pending_params;
+ return (resize_pending_params->width && resize_pending_params->height &&
+ (cpi->common.width != resize_pending_params->width ||
+ cpi->common.height != resize_pending_params->height));
+}
+
+// Check if loop filter is used.
+static INLINE int is_loopfilter_used(const AV1_COMMON *const cm) {
+ return !cm->features.coded_lossless && !cm->tiles.large_scale;
+}
+
+// Check if CDEF is used.
+static INLINE int is_cdef_used(const AV1_COMMON *const cm) {
+ return cm->seq_params->enable_cdef && !cm->features.coded_lossless &&
+ !cm->tiles.large_scale;
+}
+
+// Check if loop restoration filter is used.
+static INLINE int is_restoration_used(const AV1_COMMON *const cm) {
+ return cm->seq_params->enable_restoration && !cm->features.all_lossless &&
+ !cm->tiles.large_scale;
+}
+
+// Checks if post-processing filters need to be applied.
+// NOTE: This function decides if the application of different post-processing
+// filters on the reconstructed frame can be skipped at the encoder side.
+// However the computation of different filter parameters that are signaled in
+// the bitstream is still required.
+static INLINE unsigned int derive_skip_apply_postproc_filters(
+ const AV1_COMP *cpi, int use_loopfilter, int use_cdef, int use_superres,
+ int use_restoration) {
+  // Though CDEF parameter selection should depend on deblocked/loop-filtered
+  // pixels for cdef_pick_method <= CDEF_FAST_SEARCH_LVL5, in SVC real-time
+  // encoding mode the CDEF strength values are calculated from pixel values
+  // that have not been loop-filtered. Hence this case is handled separately
+  // using the condition below.
+ if (cpi->ppi->rtc_ref.non_reference_frame)
+ return (SKIP_APPLY_LOOPFILTER | SKIP_APPLY_CDEF);
+
+ if (!cpi->oxcf.algo_cfg.skip_postproc_filtering || cpi->ppi->b_calculate_psnr)
+ return 0;
+ assert(cpi->oxcf.mode == ALLINTRA);
+
+ // The post-processing filters are applied one after the other in the
+ // following order: deblocking->cdef->superres->restoration. In case of
+ // ALLINTRA encoding, the reconstructed frame is not used as a reference
+ // frame. Hence, the application of these filters can be skipped when
+ // 1. filter parameters of the subsequent stages are not dependent on the
+ // filtered output of the current stage or
+ // 2. subsequent filtering stages are disabled
+ if (use_restoration) return SKIP_APPLY_RESTORATION;
+ if (use_superres) return SKIP_APPLY_SUPERRES;
+ if (use_cdef) {
+ // CDEF parameter selection is not dependent on the deblocked frame if
+ // cdef_pick_method is CDEF_PICK_FROM_Q. Hence the application of deblocking
+ // filters and cdef filters can be skipped in this case.
+ return (cpi->sf.lpf_sf.cdef_pick_method == CDEF_PICK_FROM_Q &&
+ use_loopfilter)
+ ? (SKIP_APPLY_LOOPFILTER | SKIP_APPLY_CDEF)
+ : SKIP_APPLY_CDEF;
+ }
+ if (use_loopfilter) return SKIP_APPLY_LOOPFILTER;
+
+ // If we reach here, all post-processing stages are disabled, so none need to
+ // be skipped.
+ return 0;
+}
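+
+// Example (illustrative): in ALLINTRA mode with loop restoration in use, only
+// SKIP_APPLY_RESTORATION is returned; deblocking, CDEF and superres must
+// still be applied because the restoration parameter search runs on their
+// filtered output, while applying restoration itself can be skipped since the
+// frame is never used as a reference.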
+
+static INLINE void set_postproc_filter_default_params(AV1_COMMON *cm) {
+ struct loopfilter *const lf = &cm->lf;
+ CdefInfo *const cdef_info = &cm->cdef_info;
+ RestorationInfo *const rst_info = cm->rst_info;
+
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ cdef_info->cdef_bits = 0;
+ cdef_info->cdef_strengths[0] = 0;
+ cdef_info->nb_cdef_strengths = 1;
+ cdef_info->cdef_uv_strengths[0] = 0;
+ rst_info[0].frame_restoration_type = RESTORE_NONE;
+ rst_info[1].frame_restoration_type = RESTORE_NONE;
+ rst_info[2].frame_restoration_type = RESTORE_NONE;
+}
+
+static INLINE int is_inter_tx_size_search_level_one(
+ const TX_SPEED_FEATURES *tx_sf) {
+ return (tx_sf->inter_tx_size_search_init_depth_rect >= 1 &&
+ tx_sf->inter_tx_size_search_init_depth_sqr >= 1);
+}
+
+static INLINE int get_lpf_opt_level(const SPEED_FEATURES *sf) {
+ int lpf_opt_level = 0;
+ if (is_inter_tx_size_search_level_one(&sf->tx_sf))
+ lpf_opt_level = (sf->lpf_sf.lpf_pick == LPF_PICK_FROM_Q) ? 2 : 1;
+ return lpf_opt_level;
+}
+
+// Enable switchable motion mode only if the warp or OBMC tool is allowed.
+static INLINE bool is_switchable_motion_mode_allowed(bool allow_warped_motion,
+ bool enable_obmc) {
+ return (allow_warped_motion || enable_obmc);
+}
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+static INLINE int denoise_svc(const struct AV1_COMP *const cpi) {
+ return (!cpi->ppi->use_svc ||
+ (cpi->ppi->use_svc &&
+ cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise));
+}
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+static INLINE void av1_print_fr_partition_timing_stats(
+ const FramePartitionTimingStats *part_stats, const char *filename) {
+ FILE *f = fopen(filename, "w");
+ if (!f) {
+ return;
+ }
+
+ fprintf(f, "bsize,redo,");
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "decision_%d,", part);
+ }
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "attempt_%d,", part);
+ }
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "time_%d,", part);
+ }
+ fprintf(f, "\n");
+
+ static const int bsizes[6] = { 128, 64, 32, 16, 8, 4 };
+
+ for (int bsize_idx = 0; bsize_idx < 6; bsize_idx++) {
+ fprintf(f, "%d,%d,", bsizes[bsize_idx], part_stats->partition_redo);
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "%d,", part_stats->partition_decisions[bsize_idx][part]);
+ }
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "%d,", part_stats->partition_attempts[bsize_idx][part]);
+ }
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "%ld,", part_stats->partition_times[bsize_idx][part]);
+ }
+ fprintf(f, "\n");
+ }
+ fclose(f);
+}
+#endif // CONFIG_COLLECT_PARTITION_STATS == 2
+
+#if CONFIG_COLLECT_PARTITION_STATS
+static INLINE int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) {
+ assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 ||
+ bsize == BLOCK_32X32 || bsize == BLOCK_16X16 || bsize == BLOCK_8X8 ||
+ bsize == BLOCK_4X4);
+ switch (bsize) {
+ case BLOCK_128X128: return 0;
+ case BLOCK_64X64: return 1;
+ case BLOCK_32X32: return 2;
+ case BLOCK_16X16: return 3;
+ case BLOCK_8X8: return 4;
+ case BLOCK_4X4: return 5;
+ default: assert(0 && "Invalid bsize for partition_stats."); return -1;
+ }
+}
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+static INLINE void start_timing(AV1_COMP *cpi, int component) {
+ aom_usec_timer_start(&cpi->component_timer[component]);
+}
+static INLINE void end_timing(AV1_COMP *cpi, int component) {
+ aom_usec_timer_mark(&cpi->component_timer[component]);
+ cpi->frame_component_time[component] +=
+ aom_usec_timer_elapsed(&cpi->component_timer[component]);
+}
+static INLINE char const *get_frame_type_enum(int type) {
+ switch (type) {
+ case 0: return "KEY_FRAME";
+ case 1: return "INTER_FRAME";
+ case 2: return "INTRA_ONLY_FRAME";
+ case 3: return "S_FRAME";
+ default: assert(0);
+ }
+ return "error";
+}
+#endif
+
+/*!\endcond */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODER_H_
diff --git a/third_party/aom/av1/encoder/encoder_alloc.h b/third_party/aom/av1/encoder/encoder_alloc.h
new file mode 100644
index 0000000000..ce48496d48
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder_alloc.h
@@ -0,0 +1,531 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODER_ALLOC_H_
+#define AOM_AV1_ENCODER_ENCODER_ALLOC_H_
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/global_motion_facade.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/pickcdef.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static AOM_INLINE void dealloc_context_buffers_ext(
+ MBMIExtFrameBufferInfo *mbmi_ext_info) {
+ aom_free(mbmi_ext_info->frame_base);
+ mbmi_ext_info->frame_base = NULL;
+ mbmi_ext_info->alloc_size = 0;
+}
+
+static AOM_INLINE void alloc_context_buffers_ext(
+ AV1_COMMON *cm, MBMIExtFrameBufferInfo *mbmi_ext_info) {
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+ const int mi_alloc_rows =
+ (mi_params->mi_rows + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+ const int mi_alloc_cols =
+ (mi_params->mi_cols + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+ const int new_ext_mi_size = mi_alloc_rows * mi_alloc_cols;
+
+ if (new_ext_mi_size > mbmi_ext_info->alloc_size) {
+ dealloc_context_buffers_ext(mbmi_ext_info);
+ CHECK_MEM_ERROR(
+ cm, mbmi_ext_info->frame_base,
+ aom_malloc(new_ext_mi_size * sizeof(*mbmi_ext_info->frame_base)));
+ mbmi_ext_info->alloc_size = new_ext_mi_size;
+ }
+ // The stride needs to be updated regardless of whether new allocation
+ // happened or not.
+ mbmi_ext_info->stride = mi_alloc_cols;
+}
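+
+// Sizing example (illustrative, assuming mi_alloc_bsize == BLOCK_16X16 so
+// mi_alloc_size_1d == 4): a 1920x1080 frame has 480x270 4x4 mi units, giving
+// mi_alloc_cols = (480 + 3) / 4 = 120 and mi_alloc_rows = (270 + 3) / 4 = 68,
+// i.e. 8160 frame_base entries.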
+
+static AOM_INLINE void alloc_compressor_data(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ // Setup mi_params
+ mi_params->set_mb_mi(mi_params, cm->width, cm->height,
+ cpi->sf.part_sf.default_min_partition_size);
+
+ if (!is_stat_generation_stage(cpi)) av1_alloc_txb_buf(cpi);
+
+ aom_free(cpi->td.mv_costs_alloc);
+ cpi->td.mv_costs_alloc = NULL;
+ // Avoid the memory allocation of 'mv_costs_alloc' for allintra encoding
+ // mode.
+ if (cpi->oxcf.kf_cfg.key_freq_max != 0) {
+ CHECK_MEM_ERROR(cm, cpi->td.mv_costs_alloc,
+ (MvCosts *)aom_calloc(1, sizeof(*cpi->td.mv_costs_alloc)));
+ cpi->td.mb.mv_costs = cpi->td.mv_costs_alloc;
+ }
+
+ av1_setup_shared_coeff_buffer(cm->seq_params, &cpi->td.shared_coeff_buf,
+ cm->error);
+ if (av1_setup_sms_tree(cpi, &cpi->td)) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate SMS tree");
+ }
+ cpi->td.firstpass_ctx =
+ av1_alloc_pmc(cpi, BLOCK_16X16, &cpi->td.shared_coeff_buf);
+ if (!cpi->td.firstpass_ctx)
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+}
+
+// Allocate mbmi buffers which are used to store mode information at block
+// level.
+static AOM_INLINE void alloc_mb_mode_info_buffers(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (av1_alloc_context_buffers(cm, cm->width, cm->height,
+ cpi->sf.part_sf.default_min_partition_size)) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate context buffers");
+ }
+
+ if (!is_stat_generation_stage(cpi))
+ alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info);
+}
+
+static AOM_INLINE void realloc_segmentation_maps(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ // Create the encoder segmentation map and set all entries to 0
+ aom_free(cpi->enc_seg.map);
+ CHECK_MEM_ERROR(cm, cpi->enc_seg.map,
+ aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
+
+ // Create a map used for cyclic background refresh.
+ if (cpi->cyclic_refresh) av1_cyclic_refresh_free(cpi->cyclic_refresh);
+ CHECK_MEM_ERROR(
+ cm, cpi->cyclic_refresh,
+ av1_cyclic_refresh_alloc(mi_params->mi_rows, mi_params->mi_cols));
+
+ // Create a map used to mark inactive areas.
+ aom_free(cpi->active_map.map);
+ CHECK_MEM_ERROR(cm, cpi->active_map.map,
+ aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
+}
+
+static AOM_INLINE void alloc_obmc_buffers(
+ OBMCBuffer *obmc_buffer, struct aom_internal_error_info *error) {
+ AOM_CHECK_MEM_ERROR(
+ error, obmc_buffer->wsrc,
+ (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->wsrc)));
+ AOM_CHECK_MEM_ERROR(
+ error, obmc_buffer->mask,
+ (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->mask)));
+ AOM_CHECK_MEM_ERROR(
+ error, obmc_buffer->above_pred,
+ (uint8_t *)aom_memalign(
+ 16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->above_pred)));
+ AOM_CHECK_MEM_ERROR(
+ error, obmc_buffer->left_pred,
+ (uint8_t *)aom_memalign(
+ 16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->left_pred)));
+}
+
+static AOM_INLINE void release_obmc_buffers(OBMCBuffer *obmc_buffer) {
+ aom_free(obmc_buffer->mask);
+ aom_free(obmc_buffer->above_pred);
+ aom_free(obmc_buffer->left_pred);
+ aom_free(obmc_buffer->wsrc);
+
+ obmc_buffer->mask = NULL;
+ obmc_buffer->above_pred = NULL;
+ obmc_buffer->left_pred = NULL;
+ obmc_buffer->wsrc = NULL;
+}
+
+static AOM_INLINE void alloc_compound_type_rd_buffers(
+ struct aom_internal_error_info *error, CompoundTypeRdBuffers *const bufs) {
+ AOM_CHECK_MEM_ERROR(
+ error, bufs->pred0,
+ (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)));
+ AOM_CHECK_MEM_ERROR(
+ error, bufs->pred1,
+ (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)));
+ AOM_CHECK_MEM_ERROR(
+ error, bufs->residual1,
+ (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)));
+ AOM_CHECK_MEM_ERROR(
+ error, bufs->diff10,
+ (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)));
+ AOM_CHECK_MEM_ERROR(error, bufs->tmp_best_mask_buf,
+ (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE *
+ sizeof(*bufs->tmp_best_mask_buf)));
+}
+
+static AOM_INLINE void release_compound_type_rd_buffers(
+ CompoundTypeRdBuffers *const bufs) {
+ aom_free(bufs->pred0);
+ aom_free(bufs->pred1);
+ aom_free(bufs->residual1);
+ aom_free(bufs->diff10);
+ aom_free(bufs->tmp_best_mask_buf);
+ av1_zero(*bufs); // Set all pointers to NULL for safety.
+}
+
+static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ TokenInfo *token_info = &cpi->token_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ const int num_planes = av1_num_planes(cm);
+ dealloc_context_buffers_ext(&cpi->mbmi_ext_info);
+
+ aom_free(cpi->tile_data);
+ cpi->tile_data = NULL;
+ cpi->allocated_tiles = 0;
+ enc_row_mt->allocated_tile_cols = 0;
+ enc_row_mt->allocated_tile_rows = 0;
+
+  // Delete segmentation map.
+ aom_free(cpi->enc_seg.map);
+ cpi->enc_seg.map = NULL;
+
+ av1_cyclic_refresh_free(cpi->cyclic_refresh);
+ cpi->cyclic_refresh = NULL;
+
+ aom_free(cpi->active_map.map);
+ cpi->active_map.map = NULL;
+
+ aom_free(cpi->ssim_rdmult_scaling_factors);
+ cpi->ssim_rdmult_scaling_factors = NULL;
+
+ aom_free(cpi->tpl_rdmult_scaling_factors);
+ cpi->tpl_rdmult_scaling_factors = NULL;
+
+#if CONFIG_TUNE_VMAF
+ aom_free(cpi->vmaf_info.rdmult_scaling_factors);
+ cpi->vmaf_info.rdmult_scaling_factors = NULL;
+ aom_close_vmaf_model(cpi->vmaf_info.vmaf_model);
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+ aom_free(cpi->butteraugli_info.rdmult_scaling_factors);
+ cpi->butteraugli_info.rdmult_scaling_factors = NULL;
+ aom_free_frame_buffer(&cpi->butteraugli_info.source);
+ aom_free_frame_buffer(&cpi->butteraugli_info.resized_source);
+#endif
+
+#if CONFIG_SALIENCY_MAP
+ aom_free(cpi->saliency_map);
+ aom_free(cpi->sm_scaling_factor);
+#endif
+
+ release_obmc_buffers(&cpi->td.mb.obmc_buffer);
+
+ aom_free(cpi->td.mv_costs_alloc);
+ cpi->td.mv_costs_alloc = NULL;
+ aom_free(cpi->td.dv_costs_alloc);
+ cpi->td.dv_costs_alloc = NULL;
+
+ aom_free(cpi->td.mb.sb_stats_cache);
+ cpi->td.mb.sb_stats_cache = NULL;
+
+ aom_free(cpi->td.mb.sb_fp_stats);
+ cpi->td.mb.sb_fp_stats = NULL;
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+ aom_free(cpi->td.mb.rdcost);
+ cpi->td.mb.rdcost = NULL;
+#endif
+
+ av1_free_pc_tree_recursive(cpi->td.pc_root, num_planes, 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ cpi->td.pc_root = NULL;
+
+ for (int i = 0; i < 2; i++)
+ for (int j = 0; j < 2; j++) {
+ aom_free(cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j]);
+ cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j] = NULL;
+ }
+
+ av1_hash_table_destroy(&cpi->td.mb.intrabc_hash_info.intrabc_hash_table);
+
+ aom_free(cm->tpl_mvs);
+ cm->tpl_mvs = NULL;
+
+ aom_free(cpi->td.pixel_gradient_info);
+ cpi->td.pixel_gradient_info = NULL;
+
+ aom_free(cpi->td.src_var_info_of_4x4_sub_blocks);
+ cpi->td.src_var_info_of_4x4_sub_blocks = NULL;
+
+ aom_free(cpi->td.vt64x64);
+ cpi->td.vt64x64 = NULL;
+
+ av1_free_pmc(cpi->td.firstpass_ctx, num_planes);
+ cpi->td.firstpass_ctx = NULL;
+
+ const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth;
+ // This call ensures that the buffers allocated by tf_alloc_and_reset_data()
+ // in av1_temporal_filter() for single-threaded encode are freed in case an
+  // error is encountered during temporal filtering (due to early termination,
+  // tf_dealloc_data() in av1_temporal_filter() would not be invoked).
+ tf_dealloc_data(&cpi->td.tf_data, is_highbitdepth);
+
+ // This call ensures that tpl_tmp_buffers for single-threaded encode are freed
+ // in case of an error during tpl.
+ tpl_dealloc_temp_buffers(&cpi->td.tpl_tmp_buffers);
+
+ // This call ensures that the global motion (gm) data buffers for
+ // single-threaded encode are freed in case of an error during gm.
+ gm_dealloc_data(&cpi->td.gm_data);
+
+ // This call ensures that CDEF search context buffers are deallocated in case
+ // of an error during cdef search.
+ av1_cdef_dealloc_data(cpi->cdef_search_ctx);
+ aom_free(cpi->cdef_search_ctx);
+ cpi->cdef_search_ctx = NULL;
+
+ av1_dealloc_mb_data(&cpi->td.mb, num_planes);
+
+ av1_dealloc_mb_wiener_var_pred_buf(&cpi->td);
+
+ av1_free_txb_buf(cpi);
+ av1_free_context_buffers(cm);
+
+ aom_free_frame_buffer(&cpi->last_frame_uf);
+#if !CONFIG_REALTIME_ONLY
+ av1_free_restoration_buffers(cm);
+ av1_free_firstpass_data(&cpi->firstpass_data);
+#endif
+
+ if (!is_stat_generation_stage(cpi)) {
+ av1_free_cdef_buffers(cm, &cpi->ppi->p_mt_info.cdef_worker,
+ &cpi->mt_info.cdef_sync);
+ }
+
+ for (int plane = 0; plane < num_planes; plane++) {
+ aom_free(cpi->pick_lr_ctxt.rusi[plane]);
+ cpi->pick_lr_ctxt.rusi[plane] = NULL;
+ }
+ aom_free(cpi->pick_lr_ctxt.dgd_avg);
+ cpi->pick_lr_ctxt.dgd_avg = NULL;
+
+ aom_free_frame_buffer(&cpi->trial_frame_rst);
+ aom_free_frame_buffer(&cpi->scaled_source);
+ aom_free_frame_buffer(&cpi->scaled_last_source);
+ aom_free_frame_buffer(&cpi->orig_source);
+ aom_free_frame_buffer(&cpi->svc.source_last_TL0);
+
+ free_token_info(token_info);
+
+ av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
+ av1_free_sms_tree(&cpi->td);
+
+ aom_free(cpi->td.mb.palette_buffer);
+ release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer);
+ aom_free(cpi->td.mb.tmp_conv_dst);
+ for (int j = 0; j < 2; ++j) {
+ aom_free(cpi->td.mb.tmp_pred_bufs[j]);
+ }
+
+#if CONFIG_DENOISE
+ if (cpi->denoise_and_model) {
+ aom_denoise_and_model_free(cpi->denoise_and_model);
+ cpi->denoise_and_model = NULL;
+ }
+#endif
+ if (cpi->film_grain_table) {
+ aom_film_grain_table_free(cpi->film_grain_table);
+ aom_free(cpi->film_grain_table);
+ cpi->film_grain_table = NULL;
+ }
+
+ if (cpi->ppi->use_svc) av1_free_svc_cyclic_refresh(cpi);
+ aom_free(cpi->svc.layer_context);
+ cpi->svc.layer_context = NULL;
+
+ aom_free(cpi->consec_zero_mv);
+ cpi->consec_zero_mv = NULL;
+ cpi->consec_zero_mv_alloc_size = 0;
+
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
+
+ aom_free(cpi->mb_weber_stats);
+ cpi->mb_weber_stats = NULL;
+
+ if (cpi->oxcf.enable_rate_guide_deltaq) {
+ aom_free(cpi->prep_rate_estimates);
+ cpi->prep_rate_estimates = NULL;
+
+ aom_free(cpi->ext_rate_distribution);
+ cpi->ext_rate_distribution = NULL;
+ }
+
+ aom_free(cpi->mb_delta_q);
+ cpi->mb_delta_q = NULL;
+}
+
+static AOM_INLINE void allocate_gradient_info_for_hog(AV1_COMP *cpi) {
+ if (!is_gradient_caching_for_hog_enabled(cpi)) return;
+
+ PixelLevelGradientInfo *pixel_gradient_info = cpi->td.pixel_gradient_info;
+ if (!pixel_gradient_info) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int plane_types = PLANE_TYPES >> cm->seq_params->monochrome;
+ CHECK_MEM_ERROR(
+ cm, pixel_gradient_info,
+ aom_malloc(sizeof(*pixel_gradient_info) * plane_types * MAX_SB_SQUARE));
+ cpi->td.pixel_gradient_info = pixel_gradient_info;
+ }
+
+ cpi->td.mb.pixel_gradient_info = pixel_gradient_info;
+}
+
+static AOM_INLINE void allocate_src_var_of_4x4_sub_block_buf(AV1_COMP *cpi) {
+ if (!is_src_var_for_4x4_sub_blocks_caching_enabled(cpi)) return;
+
+ Block4x4VarInfo *source_variance_info =
+ cpi->td.src_var_info_of_4x4_sub_blocks;
+ if (!source_variance_info) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const int mi_count_in_sb = mi_size_wide[sb_size] * mi_size_high[sb_size];
+ CHECK_MEM_ERROR(cm, source_variance_info,
+ aom_malloc(sizeof(*source_variance_info) * mi_count_in_sb));
+ cpi->td.src_var_info_of_4x4_sub_blocks = source_variance_info;
+ }
+
+ cpi->td.mb.src_var_info_of_4x4_sub_blocks = source_variance_info;
+}
+
+static AOM_INLINE void variance_partition_alloc(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_64x64_blocks = (cm->seq_params->sb_size == BLOCK_64X64) ? 1 : 4;
+ if (cpi->td.vt64x64) {
+ if (num_64x64_blocks != cpi->td.num_64x64_blocks) {
+ aom_free(cpi->td.vt64x64);
+ cpi->td.vt64x64 = NULL;
+ }
+ }
+ if (!cpi->td.vt64x64) {
+ CHECK_MEM_ERROR(cm, cpi->td.vt64x64,
+ aom_malloc(sizeof(*cpi->td.vt64x64) * num_64x64_blocks));
+ cpi->td.num_64x64_blocks = num_64x64_blocks;
+ }
+}
+
+static AOM_INLINE YV12_BUFFER_CONFIG *realloc_and_scale_source(
+ AV1_COMP *cpi, int scaled_width, int scaled_height) {
+ AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ if (scaled_width == cpi->unscaled_source->y_crop_width &&
+ scaled_height == cpi->unscaled_source->y_crop_height) {
+ return cpi->unscaled_source;
+ }
+
+ if (aom_realloc_frame_buffer(
+ &cpi->scaled_source, scaled_width, scaled_height,
+ cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+ cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->features.byte_alignment, NULL, NULL, NULL,
+ cpi->image_pyramid_levels, 0))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to reallocate scaled source buffer");
+ assert(cpi->scaled_source.y_crop_width == scaled_width);
+ assert(cpi->scaled_source.y_crop_height == scaled_height);
+ if (!av1_resize_and_extend_frame_nonnormative(
+ cpi->unscaled_source, &cpi->scaled_source,
+ (int)cm->seq_params->bit_depth, num_planes))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to reallocate buffers during resize");
+ return &cpi->scaled_source;
+}
+
+// Deallocate thread_data allocated for the auxiliary worker threads.
+static AOM_INLINE void free_thread_data(AV1_PRIMARY *ppi) {
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+ const int num_tf_workers =
+ AOMMIN(p_mt_info->num_mod_workers[MOD_TF], p_mt_info->num_workers);
+ const int num_tpl_workers =
+ AOMMIN(p_mt_info->num_mod_workers[MOD_TPL], p_mt_info->num_workers);
+ const int is_highbitdepth = ppi->seq_params.use_highbitdepth;
+ const int num_planes = ppi->seq_params.monochrome ? 1 : MAX_MB_PLANE;
+ for (int t = 1; t < p_mt_info->num_workers; ++t) {
+ EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[t];
+ thread_data->td = thread_data->original_td;
+ ThreadData *const td = thread_data->td;
+ if (!td) continue;
+ aom_free(td->tctx);
+ aom_free(td->palette_buffer);
+ aom_free(td->tmp_conv_dst);
+ release_compound_type_rd_buffers(&td->comp_rd_buffer);
+ for (int j = 0; j < 2; ++j) {
+ aom_free(td->tmp_pred_bufs[j]);
+ }
+ aom_free(td->pixel_gradient_info);
+ aom_free(td->src_var_info_of_4x4_sub_blocks);
+ release_obmc_buffers(&td->obmc_buffer);
+ aom_free(td->vt64x64);
+
+ for (int x = 0; x < 2; x++) {
+ for (int y = 0; y < 2; y++) {
+ aom_free(td->hash_value_buffer[x][y]);
+ td->hash_value_buffer[x][y] = NULL;
+ }
+ }
+ aom_free(td->mv_costs_alloc);
+ td->mv_costs_alloc = NULL;
+ aom_free(td->dv_costs_alloc);
+ td->dv_costs_alloc = NULL;
+ aom_free(td->counts);
+ av1_free_pmc(td->firstpass_ctx, num_planes);
+ td->firstpass_ctx = NULL;
+ av1_free_shared_coeff_buffer(&td->shared_coeff_buf);
+ av1_free_sms_tree(td);
+ // This call ensures that the buffers allocated by tf_alloc_and_reset_data()
+ // in prepare_tf_workers() for MT encode are freed in case an error is
+    // encountered during temporal filtering (due to early termination,
+    // tf_dealloc_thread_data() in av1_tf_do_filtering_mt() would not be
+    // invoked).
+ if (t < num_tf_workers) tf_dealloc_data(&td->tf_data, is_highbitdepth);
+ // This call ensures that tpl_tmp_buffers for MT encode are freed in case of
+ // an error during tpl.
+ if (t < num_tpl_workers) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers);
+ // This call ensures that the buffers in gm_data for MT encode are freed in
+ // case of an error during gm.
+ gm_dealloc_data(&td->gm_data);
+ av1_dealloc_mb_data(&td->mb, num_planes);
+ aom_free(td->mb.sb_stats_cache);
+ td->mb.sb_stats_cache = NULL;
+ aom_free(td->mb.sb_fp_stats);
+ td->mb.sb_fp_stats = NULL;
+#if CONFIG_PARTITION_SEARCH_ORDER
+ aom_free(td->mb.rdcost);
+ td->mb.rdcost = NULL;
+#endif
+ av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0, SEARCH_PARTITION);
+ td->pc_root = NULL;
+ av1_dealloc_mb_wiener_var_pred_buf(td);
+ aom_free(td);
+ thread_data->td = NULL;
+ thread_data->original_td = NULL;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODER_ALLOC_H_
diff --git a/third_party/aom/av1/encoder/encoder_utils.c b/third_party/aom/av1/encoder/encoder_utils.c
new file mode 100644
index 0000000000..c35873d207
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder_utils.c
@@ -0,0 +1,1503 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aomcx.h"
+
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/grain_test_vectors.h"
+#include "av1/encoder/mv_prec.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/superres_scale.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/var_based_part.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+#define MIN_BOOST_COMBINE_FACTOR 4.0
+#define MAX_BOOST_COMBINE_FACTOR 12.0
+
+const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES] = {
+ { { 221, 189, 214, 292, 0, 0, 0, 0, 0, 2, 38, 68, 0, 0, 0, 0 },
+ { 262, 203, 216, 239, 0, 0, 0, 0, 0, 1, 37, 66, 0, 0, 0, 0 },
+ { 315, 231, 239, 226, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 222, 188, 214, 287, 0, 0, 0, 0, 0, 2, 50, 61, 0, 0, 0, 0 },
+ { 256, 182, 205, 282, 0, 0, 0, 0, 0, 2, 21, 76, 0, 0, 0, 0 },
+ { 281, 214, 217, 222, 0, 0, 0, 0, 0, 1, 48, 41, 0, 0, 0, 0 },
+ { 263, 194, 225, 225, 0, 0, 0, 0, 0, 2, 15, 100, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 170, 192, 242, 293, 0, 0, 0, 0, 0, 1, 68, 58, 0, 0, 0, 0 },
+ { 199, 210, 213, 291, 0, 0, 0, 0, 0, 1, 14, 96, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { { 106, 69, 107, 278, 9, 15, 20, 45, 49, 23, 23, 88, 36, 74, 25, 57 },
+ { 105, 72, 81, 98, 45, 49, 47, 50, 56, 72, 30, 81, 33, 95, 27, 83 },
+ { 211, 105, 109, 120, 57, 62, 43, 49, 52, 58, 42, 116, 0, 0, 0, 0 },
+ { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 131, 57, 98, 172, 19, 40, 37, 64, 69, 22, 41, 52, 51, 77, 35, 59 },
+ { 176, 83, 93, 202, 22, 24, 28, 47, 50, 16, 12, 93, 26, 76, 17, 59 },
+ { 136, 72, 89, 95, 46, 59, 47, 56, 61, 68, 35, 51, 32, 82, 26, 69 },
+ { 122, 80, 87, 105, 49, 47, 46, 46, 57, 52, 13, 90, 19, 103, 15, 93 },
+ { 1009, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0 },
+ { 1011, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 202, 20, 84, 114, 14, 60, 41, 79, 99, 21, 41, 15, 50, 84, 34, 66 },
+ { 196, 44, 23, 72, 30, 22, 28, 57, 67, 13, 4, 165, 15, 148, 9, 131 },
+ { 882, 0, 0, 0, 0, 0, 0, 0, 0, 142, 0, 0, 0, 0, 0, 0 },
+ { 840, 0, 0, 0, 0, 0, 0, 0, 0, 184, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
+ { { 213, 110, 141, 269, 12, 16, 15, 19, 21, 11, 38, 68, 22, 29, 16, 24 },
+ { 216, 119, 128, 143, 38, 41, 26, 30, 31, 30, 42, 70, 23, 36, 19, 32 },
+ { 367, 149, 154, 154, 38, 35, 17, 21, 21, 10, 22, 36, 0, 0, 0, 0 },
+ { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 219, 96, 127, 191, 21, 40, 25, 32, 34, 18, 45, 45, 33, 39, 26, 33 },
+ { 296, 99, 122, 198, 23, 21, 19, 24, 25, 13, 20, 64, 23, 32, 18, 27 },
+ { 275, 128, 142, 143, 35, 48, 23, 30, 29, 18, 42, 36, 18, 23, 14, 20 },
+ { 239, 132, 166, 175, 36, 27, 19, 21, 24, 14, 13, 85, 9, 31, 8, 25 },
+ { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+ { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 309, 25, 79, 59, 25, 80, 34, 53, 61, 25, 49, 23, 43, 64, 36, 59 },
+ { 270, 57, 40, 54, 50, 42, 41, 53, 56, 28, 17, 81, 45, 86, 34, 70 },
+ { 1005, 0, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0 },
+ { 992, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { { 133, 63, 55, 83, 57, 87, 58, 72, 68, 16, 24, 35, 29, 105, 25, 114 },
+ { 131, 75, 74, 60, 71, 77, 65, 66, 73, 33, 21, 79, 20, 83, 18, 78 },
+ { 276, 95, 82, 58, 86, 93, 63, 60, 64, 17, 38, 92, 0, 0, 0, 0 },
+ { 1006, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 147, 49, 75, 78, 50, 97, 60, 67, 76, 17, 42, 35, 31, 93, 27, 80 },
+ { 157, 49, 58, 75, 61, 52, 56, 67, 69, 12, 15, 79, 24, 119, 11, 120 },
+ { 178, 69, 83, 77, 69, 85, 72, 77, 77, 20, 35, 40, 25, 48, 23, 46 },
+ { 174, 55, 64, 57, 73, 68, 62, 61, 75, 15, 12, 90, 17, 99, 16, 86 },
+ { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 },
+ { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 266, 31, 63, 64, 21, 52, 39, 54, 63, 30, 52, 31, 48, 89, 46, 75 },
+ { 272, 26, 32, 44, 29, 31, 32, 53, 51, 13, 13, 88, 22, 153, 16, 149 },
+ { 923, 0, 0, 0, 0, 0, 0, 0, 0, 101, 0, 0, 0, 0, 0, 0 },
+ { 969, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } },
+ { { 158, 92, 125, 298, 12, 15, 20, 29, 31, 12, 29, 67, 34, 44, 23, 35 },
+ { 147, 94, 103, 123, 45, 48, 38, 41, 46, 48, 37, 78, 33, 63, 27, 53 },
+ { 268, 126, 125, 136, 54, 53, 31, 38, 38, 33, 35, 87, 0, 0, 0, 0 },
+ { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 159, 72, 103, 194, 20, 35, 37, 50, 56, 21, 39, 40, 51, 61, 38, 48 },
+ { 259, 86, 95, 188, 32, 20, 25, 34, 37, 13, 12, 85, 25, 53, 17, 43 },
+ { 189, 99, 113, 123, 45, 59, 37, 46, 48, 44, 39, 41, 31, 47, 26, 37 },
+ { 175, 110, 113, 128, 58, 38, 33, 33, 43, 29, 13, 100, 14, 68, 12, 57 },
+ { 1017, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0 },
+ { 1019, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 208, 22, 84, 101, 21, 59, 44, 70, 90, 25, 59, 13, 64, 67, 49, 48 },
+ { 277, 52, 32, 63, 43, 26, 33, 48, 54, 11, 6, 130, 18, 119, 11, 101 },
+ { 963, 0, 0, 0, 0, 0, 0, 0, 0, 61, 0, 0, 0, 0, 0, 0 },
+ { 979, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }
+};
+
+const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 106, 90, 90, 97, 67, 59, 70, 28,
+ 30, 38, 16, 16, 16, 0, 0, 44, 50, 26, 25 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 98, 93, 97, 68, 82, 85, 33, 30,
+ 33, 16, 16, 16, 16, 0, 0, 43, 37, 26, 16 },
+ { 0, 0, 0, 91, 80, 76, 78, 55, 49, 24, 16,
+ 16, 16, 16, 16, 16, 0, 0, 29, 45, 16, 38 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 103, 89, 89, 89, 62, 63, 76, 34,
+ 35, 32, 19, 16, 16, 0, 0, 49, 55, 29, 19 }
+};
+
+const int default_warped_probs[FRAME_UPDATE_TYPES] = { 64, 64, 64, 64,
+ 64, 64, 64 };
+
+// TODO(yunqing): the default probs can be retrained later for better
+// performance.
+const int default_switchable_interp_probs[FRAME_UPDATE_TYPES]
+ [SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS] = {
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } },
+ { { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 },
+ { 512, 512, 512 } }
+ };
+
+static void configure_static_seg_features(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ struct segmentation *const seg = &cm->seg;
+
+ double avg_q;
+#if CONFIG_FPMT_TEST
+ avg_q = ((cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) &&
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE))
+ ? cpi->ppi->p_rc.temp_avg_q
+ : cpi->ppi->p_rc.avg_q;
+#else
+ avg_q = cpi->ppi->p_rc.avg_q;
+#endif
+
+ int high_q = (int)(avg_q > 48.0);
+ int qi_delta;
+
+ // Disable and clear down for KF
+ if (cm->current_frame.frame_type == KEY_FRAME) {
+ // Clear down the global segmentation map
+ memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+ seg->update_map = 0;
+ seg->update_data = 0;
+
+ // Disable segmentation
+ av1_disable_segmentation(seg);
+
+ // Clear down the segment features.
+ av1_clearall_segfeatures(seg);
+ } else if (cpi->refresh_frame.alt_ref_frame) {
+ // If this is an alt ref frame
+ // Clear down the global segmentation map
+ memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+ seg->update_map = 0;
+ seg->update_data = 0;
+
+ // Disable segmentation and individual segment features by default
+ av1_disable_segmentation(seg);
+ av1_clearall_segfeatures(seg);
+
+    // If segmentation was enabled, set those features needed for the
+    // arf itself.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+
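+      // Compute a qindex delta that moves the effective quantizer from avg_q
+      // down to roughly 0.875 * avg_q for the ARF.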
+ qi_delta = av1_compute_qdelta(rc, avg_q, avg_q * 0.875,
+ cm->seq_params->bit_depth);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2);
+
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
+
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+ }
+ } else if (seg->enabled) {
+ // All other frames if segmentation has been enabled
+
+ // First normal frame in a valid gf or alt ref group
+ if (rc->frames_since_golden == 0) {
+ // Set up segment features for normal frames in an arf group
+ // Disable segmentation and clear down features if alt ref
+ // is not active for this group
+
+ av1_disable_segmentation(seg);
+
+ memset(cpi->enc_seg.map, 0,
+ cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+
+ seg->update_map = 0;
+ seg->update_data = 0;
+
+ av1_clearall_segfeatures(seg);
+ } else if (rc->is_src_frame_alt_ref) {
+ // Special case where we are coding over the top of a previous
+ // alt ref frame.
+      // Segment coding disabled for compound prediction ("compred") testing.
+
+ // Enable ref frame features for segment 0 as well
+ av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
+ av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+
+ // All mbs should use ALTREF_FRAME
+ av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
+ av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+ av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME);
+ av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+
+ // Skip all MBs if high Q (0,0 mv and skip coeffs)
+ if (high_q) {
+ av1_enable_segfeature(seg, 0, SEG_LVL_SKIP);
+ av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+ }
+ // Enable data update
+ seg->update_data = 1;
+ } else {
+ // All other frames.
+
+        // No updates; leave things as they are.
+ seg->update_map = 0;
+ seg->update_data = 0;
+ }
+ }
+}
+
+void av1_apply_active_map(AV1_COMP *cpi) {
+ struct segmentation *const seg = &cpi->common.seg;
+ unsigned char *const seg_map = cpi->enc_seg.map;
+ const unsigned char *const active_map = cpi->active_map.map;
+ int i;
+
+ assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
+
+ if (frame_is_intra_only(&cpi->common)) {
+ cpi->active_map.enabled = 0;
+ cpi->active_map.update = 1;
+ }
+
+ if (cpi->active_map.update) {
+ if (cpi->active_map.enabled) {
+ const int num_mis =
+ cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols;
+ for (i = 0; i < num_mis; ++i)
+ if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
+ av1_enable_segmentation(seg);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
+
+ av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H,
+ -MAX_LOOP_FILTER);
+ av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V,
+ -MAX_LOOP_FILTER);
+ av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U,
+ -MAX_LOOP_FILTER);
+ av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V,
+ -MAX_LOOP_FILTER);
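+      // Note: a delta of -MAX_LOOP_FILTER clamps the effective loop filter
+      // level of the inactive segment to 0, so inactive regions are left
+      // unfiltered.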
+ } else {
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
+ if (seg->enabled) {
+ seg->update_data = 1;
+ seg->update_map = 1;
+ }
+ }
+ cpi->active_map.update = 0;
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void process_tpl_stats_frame(AV1_COMP *cpi) {
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ AV1_COMMON *const cm = &cpi->common;
+
+ assert(IMPLIES(gf_group->size > 0, cpi->gf_frame_index < gf_group->size));
+
+ const int tpl_idx = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+
+ if (tpl_frame->is_valid) {
+ int tpl_stride = tpl_frame->stride;
+ double intra_cost_base = 0;
+ double mc_dep_cost_base = 0;
+ double cbcmp_base = 1;
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+ const int row_step = step;
+ const int col_step_sr =
+ coded_to_superres_mi(step, cm->superres_scale_denominator);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+ for (int row = 0; row < cm->mi_params.mi_rows; row += row_step) {
+ for (int col = 0; col < mi_cols_sr; col += col_step_sr) {
+ TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+ row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+ double cbcmp = (double)(this_stats->srcrf_dist);
+ int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS);
+ intra_cost_base += log(dist_scaled) * cbcmp;
+ mc_dep_cost_base += log(dist_scaled + mc_dep_delta) * cbcmp;
+ cbcmp_base += cbcmp;
+ }
+ }
+
+ if (mc_dep_cost_base == 0) {
+ tpl_frame->is_valid = 0;
+ } else {
+ cpi->rd.r0 = exp((intra_cost_base - mc_dep_cost_base) / cbcmp_base);
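+      // Since intra_cost_base and mc_dep_cost_base are cbcmp-weighted sums
+      // of log(dist) and log(dist + mc_dep_delta), r0 is effectively the
+      // weighted geometric mean of dist / (dist + mc_dep_delta): values
+      // near 0 indicate strong temporal dependency, values near 1 almost
+      // none.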
+ if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
+ if (cpi->ppi->lap_enabled) {
+ double min_boost_factor = sqrt(cpi->ppi->p_rc.baseline_gf_interval);
+ const int gfu_boost = get_gfu_boost_from_r0_lap(
+ min_boost_factor, MAX_GFUBOOST_FACTOR, cpi->rd.r0,
+ cpi->ppi->p_rc.num_stats_required_for_gfu_boost);
+ // printf("old boost %d new boost %d\n", cpi->rc.gfu_boost,
+ // gfu_boost);
+ cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost(
+ min_boost_factor, MAX_BOOST_COMBINE_FACTOR,
+ cpi->ppi->p_rc.gfu_boost, gfu_boost,
+ cpi->ppi->p_rc.num_stats_used_for_gfu_boost);
+ } else {
+          // TPL may only look at a subset of frames in the gf group when the
+          // speed feature 'reduce_num_frames' is on, which affects the r0
+          // calculation. Thus, to compensate for TPL not using all frames, a
+          // factor to adjust r0 is used.
+ const int gfu_boost =
+ (int)(200.0 * cpi->ppi->tpl_data.r0_adjust_factor / cpi->rd.r0);
+ cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost(
+ MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
+ cpi->ppi->p_rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key);
+ }
+ }
+ }
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
+ int *top_index) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ // Setup variables that depend on the dimensions of the frame.
+ av1_set_speed_features_framesize_dependent(cpi, cpi->speed);
+
+#if !CONFIG_REALTIME_ONLY
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if (cpi->oxcf.algo_cfg.enable_tpl_model &&
+ av1_tpl_stats_ready(&cpi->ppi->tpl_data, cpi->gf_frame_index)) {
+ process_tpl_stats_frame(cpi);
+ av1_tpl_rdmult_setup(cpi);
+ }
+#endif
+
+ // Decide q and q bounds.
+ *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, cpi->gf_frame_index,
+ bottom_index, top_index);
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q &&
+ cpi->ppi->tpl_data.tpl_frame[cpi->gf_frame_index].is_valid &&
+ !is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+ const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+ const int tpl_q = av1_tpl_get_q_index(
+ &cpi->ppi->tpl_data, cpi->gf_frame_index, cpi->rc.active_worst_quality,
+ cm->seq_params->bit_depth);
+ *q = clamp(tpl_q, rc_cfg->best_allowed_q, rc_cfg->worst_allowed_q);
+ *top_index = *bottom_index = *q;
+ if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE)
+ cpi->ppi->p_rc.arf_q = *q;
+ }
+
+ if (cpi->oxcf.q_cfg.use_fixed_qp_offsets && cpi->oxcf.rc_cfg.mode == AOM_Q) {
+ if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
+ const double qratio_grad =
+ cpi->ppi->p_rc.baseline_gf_interval > 20 ? 0.2 : 0.3;
+ const double qstep_ratio =
+ 0.2 +
+ (1.0 - (double)cpi->rc.active_worst_quality / MAXQ) * qratio_grad;
+ *q = av1_get_q_index_from_qstep_ratio(
+ cpi->rc.active_worst_quality, qstep_ratio, cm->seq_params->bit_depth);
+ *top_index = *bottom_index = *q;
+ if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == GF_UPDATE)
+ cpi->ppi->p_rc.arf_q = *q;
+ } else if (gf_group->layer_depth[cpi->gf_frame_index] <
+ gf_group->max_layer_depth) {
+ int this_height = gf_group->layer_depth[cpi->gf_frame_index];
+ int arf_q = cpi->ppi->p_rc.arf_q;
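+    // e.g. (illustration): with arf_q = 128, cq_level = 200 and
+    // layer_depth = 3, the two halving steps below give (128 + 201) / 2 =
+    // 164 and then (164 + 201) / 2 = 182, moving q geometrically toward
+    // cq_level as layer depth increases.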
+ while (this_height > 1) {
+ arf_q = (arf_q + cpi->oxcf.rc_cfg.cq_level + 1) / 2;
+ --this_height;
+ }
+ *top_index = *bottom_index = *q = arf_q;
+ }
+ }
+#endif
+
+ // Configure experimental use of segmentation for enhanced coding of
+ // static regions if indicated.
+ // Only allowed in the second pass of a two pass encode, as it requires
+  // lagged coding, and only if the relevant speed feature flag is set.
+ if (is_stat_consumption_stage_twopass(cpi) &&
+ cpi->sf.hl_sf.static_segmentation)
+ configure_static_seg_features(cpi);
+}
+
+static void reset_film_grain_chroma_params(aom_film_grain_t *pars) {
+ pars->num_cr_points = 0;
+ pars->cr_mult = 0;
+ pars->cr_luma_mult = 0;
+ memset(pars->scaling_points_cr, 0, sizeof(pars->scaling_points_cr));
+ memset(pars->ar_coeffs_cr, 0, sizeof(pars->ar_coeffs_cr));
+ pars->num_cb_points = 0;
+ pars->cb_mult = 0;
+ pars->cb_luma_mult = 0;
+ pars->chroma_scaling_from_luma = 0;
+ memset(pars->scaling_points_cb, 0, sizeof(pars->scaling_points_cb));
+ memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb));
+}
+
+void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi,
+ const AV1EncoderConfig *oxcf) {
+ SequenceHeader *const seq_params = &ppi->seq_params;
+ const TuneCfg *const tune_cfg = &oxcf->tune_cfg;
+
+ if (tune_cfg->film_grain_test_vector || tune_cfg->film_grain_table_filename ||
+ tune_cfg->content == AOM_CONTENT_FILM) {
+ seq_params->film_grain_params_present = 1;
+ } else {
+#if CONFIG_DENOISE
+ seq_params->film_grain_params_present = (oxcf->noise_level > 0);
+#else
+ seq_params->film_grain_params_present = 0;
+#endif
+ }
+}
+
+void av1_update_film_grain_parameters(struct AV1_COMP *cpi,
+ const AV1EncoderConfig *oxcf) {
+ AV1_COMMON *const cm = &cpi->common;
+ const TuneCfg *const tune_cfg = &oxcf->tune_cfg;
+
+ if (cpi->film_grain_table) {
+ aom_film_grain_table_free(cpi->film_grain_table);
+ aom_free(cpi->film_grain_table);
+ cpi->film_grain_table = NULL;
+ }
+
+ if (tune_cfg->film_grain_test_vector) {
+ if (cm->current_frame.frame_type == KEY_FRAME) {
+ memcpy(&cm->film_grain_params,
+ film_grain_test_vectors + tune_cfg->film_grain_test_vector - 1,
+ sizeof(cm->film_grain_params));
+ if (oxcf->tool_cfg.enable_monochrome)
+ reset_film_grain_chroma_params(&cm->film_grain_params);
+ cm->film_grain_params.bit_depth = cm->seq_params->bit_depth;
+ if (cm->seq_params->color_range == AOM_CR_FULL_RANGE) {
+ cm->film_grain_params.clip_to_restricted_range = 0;
+ }
+ }
+ } else if (tune_cfg->film_grain_table_filename) {
+ CHECK_MEM_ERROR(cm, cpi->film_grain_table,
+ aom_calloc(1, sizeof(*cpi->film_grain_table)));
+
+ aom_film_grain_table_read(cpi->film_grain_table,
+ tune_cfg->film_grain_table_filename, cm->error);
+ } else if (tune_cfg->content == AOM_CONTENT_FILM) {
+ cm->film_grain_params.bit_depth = cm->seq_params->bit_depth;
+ if (oxcf->tool_cfg.enable_monochrome)
+ reset_film_grain_chroma_params(&cm->film_grain_params);
+ if (cm->seq_params->color_range == AOM_CR_FULL_RANGE)
+ cm->film_grain_params.clip_to_restricted_range = 0;
+ } else {
+ memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
+ }
+}
+
+void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter,
+ const int phase, const int use_optimized_scaler) {
+ AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MV_REFERENCE_FRAME ref_frame;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1).
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+ BufferPool *const pool = cm->buffer_pool;
+ const YV12_BUFFER_CONFIG *const ref =
+ get_ref_frame_yv12_buf(cm, ref_frame);
+
+ if (ref == NULL) {
+ cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+ continue;
+ }
+
+ // For RTC-SVC: if force_zero_mode_spatial_ref is enabled, check if the
+ // motion search can be skipped for the references: last, golden, altref.
+ // If so, we can skip scaling that reference.
+ if (cpi->ppi->use_svc && cpi->svc.force_zero_mode_spatial_ref &&
+ cpi->ppi->rtc_ref.set_ref_frame_config) {
+ if (ref_frame == LAST_FRAME && cpi->svc.skip_mvsearch_last) continue;
+ if (ref_frame == GOLDEN_FRAME && cpi->svc.skip_mvsearch_gf) continue;
+ if (ref_frame == ALTREF_FRAME && cpi->svc.skip_mvsearch_altref)
+ continue;
+ }
+ // For RTC with superres on: golden reference only needs to be scaled
+      // if it was refreshed in the previous frame.
+ if (is_one_pass_rt_params(cpi) &&
+ cpi->oxcf.superres_cfg.enable_superres && ref_frame == GOLDEN_FRAME &&
+ cpi->rc.frame_num_last_gf_refresh <
+ (int)cm->current_frame.frame_number - 1) {
+ continue;
+ }
+
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+ // Replace the reference buffer with a copy having a thicker border,
+ // if the reference buffer is higher resolution than the current
+ // frame, and the border is thin.
+ if ((ref->y_crop_width > cm->width ||
+ ref->y_crop_height > cm->height) &&
+ ref->border < AOM_BORDER_IN_PIXELS) {
+ RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame);
+ if (aom_yv12_realloc_with_new_border(
+ &ref_fb->buf, AOM_BORDER_IN_PIXELS,
+ cm->features.byte_alignment, cpi->image_pyramid_levels,
+ num_planes) != 0) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+ }
+ int force_scaling = 0;
+ RefCntBuffer *new_fb = cpi->scaled_ref_buf[ref_frame - 1];
+ if (new_fb == NULL) {
+ const int new_fb_idx = get_free_fb(cm);
+ if (new_fb_idx == INVALID_IDX) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Unable to find free frame buffer");
+ }
+ force_scaling = 1;
+ new_fb = &pool->frame_bufs[new_fb_idx];
+ }
+
+ if (force_scaling || new_fb->buf.y_crop_width != cm->width ||
+ new_fb->buf.y_crop_height != cm->height) {
+ if (aom_realloc_frame_buffer(
+ &new_fb->buf, cm->width, cm->height,
+ cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+ cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->features.byte_alignment, NULL, NULL, NULL, 0, 0)) {
+ if (force_scaling) {
+ // Release the reference acquired in the get_free_fb() call above.
+ --new_fb->ref_count;
+ }
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+ bool has_optimized_scaler = av1_has_optimized_scaler(
+ ref->y_crop_width, ref->y_crop_height, new_fb->buf.y_crop_width,
+ new_fb->buf.y_crop_height);
+ if (num_planes > 1) {
+ has_optimized_scaler =
+ has_optimized_scaler &&
+ av1_has_optimized_scaler(
+ ref->uv_crop_width, ref->uv_crop_height,
+ new_fb->buf.uv_crop_width, new_fb->buf.uv_crop_height);
+ }
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_optimized_scaler && has_optimized_scaler &&
+ cm->seq_params->bit_depth == AOM_BITS_8) {
+ av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase,
+ num_planes);
+ } else if (!av1_resize_and_extend_frame_nonnormative(
+ ref, &new_fb->buf, (int)cm->seq_params->bit_depth,
+ num_planes)) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate buffer during resize");
+ }
+#else
+ if (use_optimized_scaler && has_optimized_scaler) {
+ av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase,
+ num_planes);
+ } else if (!av1_resize_and_extend_frame_nonnormative(
+ ref, &new_fb->buf, (int)cm->seq_params->bit_depth,
+ num_planes)) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate buffer during resize");
+ }
+#endif
+ cpi->scaled_ref_buf[ref_frame - 1] = new_fb;
+ alloc_frame_mvs(cm, new_fb);
+ }
+ } else {
+ RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame);
+ buf->buf.y_crop_width = ref->y_crop_width;
+ buf->buf.y_crop_height = ref->y_crop_height;
+ cpi->scaled_ref_buf[ref_frame - 1] = buf;
+ ++buf->ref_count;
+ }
+ } else {
+ if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+ }
+ }
+}
+
+BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width,
+ int height, int number_spatial_layers) {
+ if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_64X64) {
+ return BLOCK_64X64;
+ }
+ if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_128X128) {
+ return BLOCK_128X128;
+ }
+#if CONFIG_TFLITE
+ if (oxcf->q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) return BLOCK_64X64;
+#endif
+ // Force 64x64 superblock size to increase resolution in perceptual
+ // AQ mode.
+ if (oxcf->mode == ALLINTRA &&
+ (oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI ||
+ oxcf->q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED)) {
+ return BLOCK_64X64;
+ }
+ assert(oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC);
+
+ if (number_spatial_layers > 1 ||
+ oxcf->resize_cfg.resize_mode != RESIZE_NONE) {
+ // Use the configured size (top resolution) for spatial layers or
+ // on resize.
+ return AOMMIN(oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height) > 720
+ ? BLOCK_128X128
+ : BLOCK_64X64;
+ } else if (oxcf->mode == REALTIME) {
+ if (oxcf->tune_cfg.content == AOM_CONTENT_SCREEN) {
+ const TileConfig *const tile_cfg = &oxcf->tile_cfg;
+ const int num_tiles =
+ (1 << tile_cfg->tile_columns) * (1 << tile_cfg->tile_rows);
+      // For multi-thread encode: if the number of (128x128) superblocks
+      // per tile is low, use the 64X64 superblock size.
+ if (oxcf->row_mt == 1 && oxcf->max_threads >= 4 &&
+ oxcf->max_threads >= num_tiles && AOMMIN(width, height) > 720 &&
+ (width * height) / (128 * 128 * num_tiles) <= 38)
+ return BLOCK_64X64;
+ else
+ return AOMMIN(width, height) >= 720 ? BLOCK_128X128 : BLOCK_64X64;
+ } else {
+ return AOMMIN(width, height) > 720 ? BLOCK_128X128 : BLOCK_64X64;
+ }
+ }
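+  // e.g. (illustration): 1920x1080 screen content with 2x2 tiles gives
+  // (1920 * 1080) / (128 * 128 * 4) = 31 <= 38 superblocks per tile, so
+  // with row_mt == 1 and max_threads >= 4 the branch above picks
+  // BLOCK_64X64.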
+
+ // TODO(any): Possibly could improve this with a heuristic.
+ // When superres / resize is on, 'cm->width / height' can change between
+ // calls, so we don't apply this heuristic there.
+ // Things break if superblock size changes between the first pass and second
+ // pass encoding, which is why this heuristic is not configured as a
+ // speed-feature.
+ if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE &&
+ oxcf->resize_cfg.resize_mode == RESIZE_NONE) {
+ int is_480p_or_lesser = AOMMIN(width, height) <= 480;
+ if (oxcf->speed >= 1 && is_480p_or_lesser) return BLOCK_64X64;
+
+ // For 1080p and lower resolutions, choose SB size adaptively based on
+ // resolution and speed level for multi-thread encode.
+ int is_1080p_or_lesser = AOMMIN(width, height) <= 1080;
+ if (!is_480p_or_lesser && is_1080p_or_lesser && oxcf->mode == GOOD &&
+ oxcf->row_mt == 1 && oxcf->max_threads > 1 && oxcf->speed >= 5)
+ return BLOCK_64X64;
+
+ // For allintra encode, since the maximum partition size is set to 32X32 for
+  // speed>=6, the superblock size is set to 64X64 instead of 128X128. This
+ // improves the multithread performance due to reduction in top right delay
+ // and thread sync wastage. Currently, this setting is selectively enabled
+ // only for speed>=9 and resolutions less than 4k since cost update
+ // frequency is set to INTERNAL_COST_UPD_OFF in these cases.
+ const int is_4k_or_larger = AOMMIN(width, height) >= 2160;
+ if (oxcf->mode == ALLINTRA && oxcf->speed >= 9 && !is_4k_or_larger)
+ return BLOCK_64X64;
+ }
+ return BLOCK_128X128;
+}
+
+void av1_setup_frame(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ // Set up entropy context depending on frame type. The decoder mandates
+ // the use of the default context, index 0, for keyframes and inter
+ // frames where the error_resilient_mode or intra_only flag is set. For
+ // other inter-frames the encoder currently uses only two contexts;
+ // context 1 for ALTREF frames and context 0 for the others.
+
+ if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+ cpi->ext_flags.use_primary_ref_none) {
+ av1_setup_past_independence(cm);
+ }
+
+ if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) ||
+ frame_is_sframe(cm)) {
+ if (!cpi->ppi->seq_params_locked) {
+ set_sb_size(cm->seq_params,
+ av1_select_sb_size(&cpi->oxcf, cm->width, cm->height,
+ cpi->ppi->number_spatial_layers));
+ }
+ } else {
+ const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm);
+ if (primary_ref_buf == NULL) {
+ av1_setup_past_independence(cm);
+ cm->seg.update_map = 1;
+ cm->seg.update_data = 1;
+ } else {
+ *cm->fc = primary_ref_buf->frame_context;
+ }
+ }
+
+ av1_zero(cm->cur_frame->interp_filter_selected);
+ cm->prev_frame = get_primary_ref_frame_buf(cm);
+ cpi->vaq_refresh = 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
+static int get_interp_filter_selected(const AV1_COMMON *const cm,
+ MV_REFERENCE_FRAME ref,
+ InterpFilter ifilter) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
+ if (buf == NULL) return 0;
+ return buf->interp_filter_selected[ifilter];
+}
+
+uint16_t av1_setup_interp_filter_search_mask(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ int ref_total[REF_FRAMES] = { 0 };
+ uint16_t mask = ALLOW_ALL_INTERP_FILT_MASK;
+
+ if (cpi->last_frame_type == KEY_FRAME || cpi->refresh_frame.alt_ref_frame)
+ return mask;
+
+ for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+ for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
+ ++ifilter) {
+ ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter);
+ }
+ }
+ int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] +
+ ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] +
+ ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]);
+
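+  // A filter is considered for pruning only when LAST picked it for fewer
+  // than 1 block in 30 (count * 30 <= total); it is then pruned if its
+  // usage across the other references, weighted 20/20/20 for
+  // LAST2/LAST3/GOLDEN and 10/10/10 for BWDREF/ALTREF2/ALTREF, stays below
+  // their combined unweighted total.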
+ for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
+ ++ifilter) {
+ int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30;
+ if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) {
+ int filter_score =
+ get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 +
+ get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 +
+ get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 +
+ get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 +
+ get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 +
+ get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10;
+ if (filter_score < ref_total_total) {
+ DUAL_FILTER_TYPE filt_type = ifilter + SWITCHABLE_FILTERS * ifilter;
+ reset_interp_filter_allowed_mask(&mask, filt_type);
+ }
+ }
+ }
+ return mask;
+}
+
+#define STRICT_PSNR_DIFF_THRESH 0.9
+// Encode key frame with/without screen content tools to determine whether
+// screen content tools should be enabled for this key frame group or not.
+// The first encoding is without screen content tools.
+// The second encoding is with screen content tools.
+// We compare the psnr and frame size to make the decision.
+static void screen_content_tools_determination(
+ AV1_COMP *cpi, const int allow_screen_content_tools_orig_decision,
+ const int allow_intrabc_orig_decision,
+ const int use_screen_content_tools_orig_decision,
+ const int is_screen_content_type_orig_decision, const int pass,
+ int *projected_size_pass, PSNR_STATS *psnr) {
+ AV1_COMMON *const cm = &cpi->common;
+ FeatureFlags *const features = &cm->features;
+
+#if CONFIG_FPMT_TEST
+ projected_size_pass[pass] =
+ ((cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) &&
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE))
+ ? cpi->ppi->p_rc.temp_projected_frame_size
+ : cpi->rc.projected_frame_size;
+#else
+ projected_size_pass[pass] = cpi->rc.projected_frame_size;
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+ const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+ aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass],
+ bit_depth, in_bit_depth);
+#else
+ aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass]);
+#endif
+ if (pass != 1) return;
+
+ const double psnr_diff = psnr[1].psnr[0] - psnr[0].psnr[0];
+  // Calculate the fraction of pixels that mode decision coded with palette
+  // mode in this frame.
+ const double palette_ratio =
+ (double)cpi->palette_pixel_num / (double)(cm->height * cm->width);
+ const int psnr_diff_is_large = (psnr_diff > STRICT_PSNR_DIFF_THRESH);
+ const int ratio_is_large =
+ ((palette_ratio >= 0.0001) && ((psnr_diff / palette_ratio) > 4));
+ const int is_sc_encoding_much_better = (psnr_diff_is_large || ratio_is_large);
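+  // e.g. (illustration): psnr_diff = 0.5 dB with palette_ratio = 0.1 fails
+  // the strict threshold (0.5 <= 0.9) but passes the ratio test
+  // (0.5 / 0.1 = 5 > 4), so screen content tools are still enabled.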
+ if (is_sc_encoding_much_better) {
+ // Use screen content tools, if we get coding gain.
+ features->allow_screen_content_tools = 1;
+ features->allow_intrabc = cpi->intrabc_used;
+ cpi->use_screen_content_tools = 1;
+ cpi->is_screen_content_type = 1;
+ } else {
+ // Use original screen content decision.
+ features->allow_screen_content_tools =
+ allow_screen_content_tools_orig_decision;
+ features->allow_intrabc = allow_intrabc_orig_decision;
+ cpi->use_screen_content_tools = use_screen_content_tools_orig_decision;
+ cpi->is_screen_content_type = is_screen_content_type_orig_decision;
+ }
+}
+
+// Set some encoding parameters to make the encoding process fast.
+// A fixed block partition size and a large q are used.
+static void set_encoding_params_for_screen_content(AV1_COMP *cpi,
+ const int pass) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (pass == 0) {
+ // In the first pass, encode without screen content tools.
+ // Use a high q, and a fixed block size for fast encoding.
+ cm->features.allow_screen_content_tools = 0;
+ cm->features.allow_intrabc = 0;
+ cpi->use_screen_content_tools = 0;
+ cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+ cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32;
+ return;
+ }
+ assert(pass == 1);
+ // In the second pass, encode with screen content tools.
+ // Use a high q, and a fixed block size for fast encoding.
+ cm->features.allow_screen_content_tools = 1;
+  // TODO(chengchen): turning intrabc on could lead to a data race issue.
+ // cm->allow_intrabc = 1;
+ cpi->use_screen_content_tools = 1;
+ cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+ cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32;
+}
+
+// Determines whether to use screen content tools for the key frame group.
+// This function modifies "cm->features.allow_screen_content_tools",
+// "cm->features.allow_intrabc" and "cpi->use_screen_content_tools".
+void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const QuantizationCfg *const q_cfg = &oxcf->q_cfg;
+ // Variables to help determine if we should allow screen content tools.
+ int projected_size_pass[3] = { 0 };
+ PSNR_STATS psnr[3];
+ const int is_key_frame = cm->current_frame.frame_type == KEY_FRAME;
+ const int allow_screen_content_tools_orig_decision =
+ cm->features.allow_screen_content_tools;
+ const int allow_intrabc_orig_decision = cm->features.allow_intrabc;
+ const int use_screen_content_tools_orig_decision =
+ cpi->use_screen_content_tools;
+ const int is_screen_content_type_orig_decision = cpi->is_screen_content_type;
+ // Turn off the encoding trial for forward key frame and superres.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode || oxcf->kf_cfg.fwd_kf_enabled ||
+ cpi->superres_mode != AOM_SUPERRES_NONE || oxcf->mode == REALTIME ||
+ use_screen_content_tools_orig_decision || !is_key_frame) {
+ return;
+ }
+
+ // TODO(chengchen): multiple encoding for the lossless mode is time consuming.
+ // Find a better way to determine whether screen content tools should be used
+ // for lossless coding.
+ // Use a high q and a fixed partition to do quick encoding.
+ const int q_for_screen_content_quick_run =
+ is_lossless_requested(&oxcf->rc_cfg) ? q_orig : AOMMAX(q_orig, 244);
+ const int partition_search_type_orig = cpi->sf.part_sf.partition_search_type;
+ const BLOCK_SIZE fixed_partition_block_size_orig =
+ cpi->sf.part_sf.fixed_partition_size;
+
+ // Setup necessary params for encoding, including frame source, etc.
+
+ cpi->source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter,
+ 0, false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+ if (cpi->unscaled_last_source != NULL) {
+ cpi->last_source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+ cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels,
+ cpi->image_pyramid_levels);
+ }
+
+ av1_setup_frame(cpi);
+
+ if (cm->seg.enabled) {
+ if (!cm->seg.update_data && cm->prev_frame) {
+ segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ cm->seg.enabled = cm->prev_frame->seg.enabled;
+ } else {
+ av1_calculate_segdata(&cm->seg);
+ }
+ } else {
+ memset(&cm->seg, 0, sizeof(cm->seg));
+ }
+ segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+ cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+ // The two encoding passes aim to help determine whether to use screen
+ // content tools, with a high q and fixed partition.
+ for (int pass = 0; pass < 2; ++pass) {
+ set_encoding_params_for_screen_content(cpi, pass);
+ av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel,
+ q_for_screen_content_quick_run,
+ q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+ av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+
+ av1_set_variance_partition_thresholds(cpi, q_for_screen_content_quick_run,
+ 0);
+    // Transform / motion compensation: build the reconstruction frame.
+ av1_encode_frame(cpi);
+ // Screen content decision
+ screen_content_tools_determination(
+ cpi, allow_screen_content_tools_orig_decision,
+ allow_intrabc_orig_decision, use_screen_content_tools_orig_decision,
+ is_screen_content_type_orig_decision, pass, projected_size_pass, psnr);
+ }
+
+ // Set partition speed feature back.
+ cpi->sf.part_sf.partition_search_type = partition_search_type_orig;
+ cpi->sf.part_sf.fixed_partition_size = fixed_partition_block_size_orig;
+
+ // Free token related info if screen content coding tools are not enabled.
+ if (!cm->features.allow_screen_content_tools)
+ free_token_info(&cpi->token_info);
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static void fix_interp_filter(InterpFilter *const interp_filter,
+ const FRAME_COUNTS *const counts) {
+ if (*interp_filter == SWITCHABLE) {
+ // Check to see if only one of the filters is actually used
+ int count[SWITCHABLE_FILTERS] = { 0 };
+ int num_filters_used = 0;
+ for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ for (int j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+ count[i] += counts->switchable_interp[j][i];
+ num_filters_used += (count[i] > 0);
+ }
+ if (num_filters_used == 1) {
+      // Only one filter is used, so set it at the frame level.
+ for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ if (count[i]) {
+ *interp_filter = i;
+ break;
+ }
+ }
+ }
+ }
+}
+
+void av1_finalize_encoded_frame(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+
+ if (!cm->seq_params->reduced_still_picture_hdr &&
+ encode_show_existing_frame(cm)) {
+ RefCntBuffer *const frame_to_show =
+ cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+
+ if (frame_to_show == NULL) {
+ aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Buffer does not contain a reconstructed frame");
+ }
+ assert(frame_to_show->ref_count > 0);
+ assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
+ }
+
+ if (!encode_show_existing_frame(cm) &&
+ cm->seq_params->film_grain_params_present &&
+ (cm->show_frame || cm->showable_frame)) {
+    // Copy the current frame's film grain params to its corresponding
+ // RefCntBuffer slot.
+ cm->cur_frame->film_grain_params = cm->film_grain_params;
+
+ // We must update the parameters if this is not an INTER_FRAME
+ if (current_frame->frame_type != INTER_FRAME)
+ cm->cur_frame->film_grain_params.update_parameters = 1;
+
+ // Iterate the random seed for the next frame.
+ cm->film_grain_params.random_seed += 3381;
+ if (cm->film_grain_params.random_seed == 0)
+ cm->film_grain_params.random_seed = 7391;
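+    // The odd increment 3381 cycles the 16-bit seed through its full range
+    // before repeating; a seed of 0 is remapped to 7391.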
+ }
+
+ // Initialise all tiles' contexts from the global frame context
+ for (int tile_col = 0; tile_col < cm->tiles.cols; tile_col++) {
+ for (int tile_row = 0; tile_row < cm->tiles.rows; tile_row++) {
+ const int tile_idx = tile_row * cm->tiles.cols + tile_col;
+ cpi->tile_data[tile_idx].tctx = *cm->fc;
+ }
+ }
+
+ if (!frame_is_intra_only(cm))
+ fix_interp_filter(&cm->features.interp_filter, cpi->td.counts);
+}
+
+int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture,
+ const YV12_BUFFER_CONFIG *last_picture,
+ ForceIntegerMVInfo *const force_intpel_info) {
+  // Check, via exact and hash-based block matching, whether integer MV
+  // should be forced for this frame.
+ int k;
+
+ const int block_size = FORCE_INT_MV_DECISION_BLOCK_SIZE;
+ const double threshold_current = 0.8;
+ const double threshold_average = 0.95;
+ const int max_history_size = 32;
+  int T = 0;  // total number of blocks
+  int C = 0;  // blocks that exactly match the collocated block
+  int S = 0;  // smooth blocks that do not match the collocated block
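+  // In effect: return 0 when this frame's (C + S) / T falls below
+  // threshold_current; return 1 when every block matches its collocated
+  // block (C == T); otherwise fall back to the history-averaged checks
+  // below.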
+
+ const int pic_width = cur_picture->y_width;
+ const int pic_height = cur_picture->y_height;
+ for (int i = 0; i + block_size <= pic_height; i += block_size) {
+ for (int j = 0; j + block_size <= pic_width; j += block_size) {
+ const int x_pos = j;
+ const int y_pos = i;
+ int match = 1;
+ T++;
+
+      // Check whether the collocated block matches the current block.
+ uint8_t *p_cur = cur_picture->y_buffer;
+ uint8_t *p_ref = last_picture->y_buffer;
+ int stride_cur = cur_picture->y_stride;
+ int stride_ref = last_picture->y_stride;
+ p_cur += (y_pos * stride_cur + x_pos);
+ p_ref += (y_pos * stride_ref + x_pos);
+
+ if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur);
+ uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref);
+ for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+ for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+ if (p16_cur[tmpX] != p16_ref[tmpX]) {
+ match = 0;
+ }
+ }
+ p16_cur += stride_cur;
+ p16_ref += stride_ref;
+ }
+ } else {
+ for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+ for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+ if (p_cur[tmpX] != p_ref[tmpX]) {
+ match = 0;
+ }
+ }
+ p_cur += stride_cur;
+ p_ref += stride_ref;
+ }
+ }
+
+ if (match) {
+ C++;
+ continue;
+ }
+
+ if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos,
+ y_pos) ||
+ av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) {
+ S++;
+ continue;
+ }
+ }
+ }
+
+ assert(T > 0);
+ double cs_rate = ((double)(C + S)) / ((double)(T));
+
+ force_intpel_info->cs_rate_array[force_intpel_info->rate_index] = cs_rate;
+
+ force_intpel_info->rate_index =
+ (force_intpel_info->rate_index + 1) % max_history_size;
+ force_intpel_info->rate_size++;
+ force_intpel_info->rate_size =
+ AOMMIN(force_intpel_info->rate_size, max_history_size);
+
+ if (cs_rate < threshold_current) {
+ return 0;
+ }
+
+ if (C == T) {
+ return 1;
+ }
+
+ double cs_average = 0.0;
+
+ for (k = 0; k < force_intpel_info->rate_size; k++) {
+ cs_average += force_intpel_info->cs_rate_array[k];
+ }
+ cs_average /= force_intpel_info->rate_size;
+
+ if (cs_average < threshold_average) {
+ return 0;
+ }
+
+ if ((T - C - S) < 0) {
+ return 1;
+ }
+
+ if (cs_average > 1.01) {
+ return 1;
+ }
+
+ return 0;
+}
+
+void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi) {
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+ const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ uint8_t *y_buffer = cpi->source->y_buffer;
+ const int y_stride = cpi->source->y_stride;
+ const int block_size = BLOCK_16X16;
+
+ const int num_mi_w = mi_size_wide[block_size];
+ const int num_mi_h = mi_size_high[block_size];
+ const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+ double log_sum = 0.0;
+
+ // Loop through each 16x16 block.
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ double var = 0.0, num_of_var = 0.0;
+ const int index = row * num_cols + col;
+
+ // Loop through each 8x8 block.
+ for (int mi_row = row * num_mi_h;
+ mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h;
+ mi_row += 2) {
+ for (int mi_col = col * num_mi_w;
+ mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w;
+ mi_col += 2) {
+ struct buf_2d buf;
+ const int row_offset_y = mi_row << 2;
+ const int col_offset_y = mi_col << 2;
+
+ buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
+ buf.stride = y_stride;
+
+ var += av1_get_perpixel_variance_facade(cpi, xd, &buf, BLOCK_8X8,
+ AOM_PLANE_Y);
+ num_of_var += 1.0;
+ }
+ }
+ var = var / num_of_var;
+
+ // Curve fitting with an exponential model on all 16x16 blocks from the
+ // midres dataset.
+ var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222;
+
+ // As per the above computation, var will be in the range of
+ // [17.492222, 84.527656], assuming the data type is of infinite
+ // precision. The following assert conservatively checks if var is in the
+ // range of [17.0, 85.0] to avoid any issues due to the precision of the
+ // relevant data type.
+ assert(var > 17.0 && var < 85.0);
+ cpi->ssim_rdmult_scaling_factors[index] = var;
+ log_sum += log(var);
+ }
+ }
+
+ // As log_sum holds the geometric mean, it will be in the range
+ // [17.492222, 84.527656]. Hence, in the below loop, the value of
+ // cpi->ssim_rdmult_scaling_factors[index] would be in the range
+ // [0.2069, 4.8323].
+ log_sum = exp(log_sum / (double)(num_rows * num_cols));
+
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ cpi->ssim_rdmult_scaling_factors[index] /= log_sum;
+ }
+ }
+}
+
+// Coding context that only needs to be saved when the recode loop includes
+// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+// restoration).
+static void save_extra_coding_context(AV1_COMP *cpi) {
+ CODING_CONTEXT *const cc = &cpi->coding_context;
+ AV1_COMMON *cm = &cpi->common;
+
+ cc->lf = cm->lf;
+ cc->cdef_info = cm->cdef_info;
+ cc->rc = cpi->rc;
+ cc->mv_stats = cpi->ppi->mv_stats;
+}
+
+void av1_save_all_coding_context(AV1_COMP *cpi) {
+ save_extra_coding_context(cpi);
+ if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
+}
+
+#if DUMP_RECON_FRAMES == 1
+
+// NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+void av1_dump_filtered_recon_frames(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ const YV12_BUFFER_CONFIG *recon_buf = &cm->cur_frame->buf;
+
+ if (recon_buf == NULL) {
+ printf("Frame %d is not ready.\n", current_frame->frame_number);
+ return;
+ }
+
+ static const int flag_list[REF_FRAMES] = { 0,
+ AOM_LAST_FLAG,
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+ AOM_GOLD_FLAG,
+ AOM_BWD_FLAG,
+ AOM_ALT2_FLAG,
+ AOM_ALT_FLAG };
+ printf(
+ "\n***Frame=%d (frame_offset=%d, show_frame=%d, "
+ "show_existing_frame=%d) "
+ "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[",
+ current_frame->frame_number, current_frame->order_hint, cm->show_frame,
+ cm->show_existing_frame);
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ const int ref_offset = buf != NULL ? (int)buf->order_hint : -1;
+ printf(" %d(%c)", ref_offset,
+ (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N');
+ }
+ printf(" ]\n");
+
+ if (!cm->show_frame) {
+ printf("Frame %d is a no show frame, so no image dump.\n",
+ current_frame->frame_number);
+ return;
+ }
+
+ int h;
+ char file_name[256] = "/tmp/enc_filtered_recon.yuv";
+ FILE *f_recon = NULL;
+
+ if (current_frame->frame_number == 0) {
+ if ((f_recon = fopen(file_name, "wb")) == NULL) {
+ printf("Unable to open file %s to write.\n", file_name);
+ return;
+ }
+ } else {
+ if ((f_recon = fopen(file_name, "ab")) == NULL) {
+ printf("Unable to open file %s to append.\n", file_name);
+ return;
+ }
+ }
+ printf(
+ "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, "
+ "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, "
+ "refresh_alt_ref_frame=%d, "
+ "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n",
+ current_frame->frame_number, cpi->gf_frame_index,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index],
+ current_frame->order_hint, cm->show_frame, cm->show_existing_frame,
+ cpi->rc.source_alt_ref_active, cpi->refresh_frame.alt_ref_frame,
+ recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height);
+#if 0
+ int ref_frame;
+ printf("get_ref_frame_map_idx: [");
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+ printf(" %d", get_ref_frame_map_idx(cm, ref_frame));
+ printf(" ]\n");
+#endif // 0
+
+ // --- Y ---
+ for (h = 0; h < cm->height; ++h) {
+ fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width,
+ f_recon);
+ }
+ // --- U ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+ f_recon);
+ }
+ // --- V ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+ f_recon);
+ }
+
+ fclose(f_recon);
+}
+#endif // DUMP_RECON_FRAMES
diff --git a/third_party/aom/av1/encoder/encoder_utils.h b/third_party/aom/av1/encoder/encoder_utils.h
new file mode 100644
index 0000000000..113f62aa59
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder_utils.h
@@ -0,0 +1,1141 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODER_UTILS_H_
+#define AOM_AV1_ENCODER_ENCODER_UTILS_H_
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define AM_SEGMENT_ID_INACTIVE 7
+#define AM_SEGMENT_ID_ACTIVE 0
+#define DUMP_RECON_FRAMES 0
+
+extern const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL]
+ [TX_TYPES];
+
+extern const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL];
+
+extern const int default_warped_probs[FRAME_UPDATE_TYPES];
+
+extern const int default_switchable_interp_probs[FRAME_UPDATE_TYPES]
+ [SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS];
+
+// Mark all inactive blocks as active. Other segmentation features may be set,
+// so memset cannot be used; instead, only inactive blocks should be reset.
+static AOM_INLINE void suppress_active_map(AV1_COMP *cpi) {
+ unsigned char *const seg_map = cpi->enc_seg.map;
+ int i;
+ const int num_mis =
+ cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols;
+ if (cpi->active_map.enabled || cpi->active_map.update)
+ for (i = 0; i < num_mis; ++i)
+ if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
+ seg_map[i] = AM_SEGMENT_ID_ACTIVE;
+}
+
+// Returns 'size' in the number of Mode Info (MI) units. 'size' is either the
+// width or height.
+static AOM_INLINE int size_in_mi(int size) {
+ // Ensure that the decoded width and height are both multiples of
+ // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
+ // subsampling is used).
+ // This simplifies the implementation of various experiments,
+// e.g. cdef, which operates on units of 8x8 luma pixels.
+ const int aligned_size = ALIGN_POWER_OF_TWO(size, 3);
+ return aligned_size >> MI_SIZE_LOG2;
+}
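+// e.g. (illustration): size_in_mi(1920) = 1920 >> MI_SIZE_LOG2 = 480 MI
+// units, since 1920 is already a multiple of 8; size_in_mi(1916) first
+// aligns up to 1920 and yields the same 480.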
+
+static AOM_INLINE void set_mb_mi(CommonModeInfoParams *mi_params, int width,
+ int height) {
+ mi_params->mi_cols = size_in_mi(width);
+ mi_params->mi_rows = size_in_mi(height);
+ mi_params->mi_stride = calc_mi_size(mi_params->mi_cols);
+
+ mi_params->mb_cols = ROUND_POWER_OF_TWO(mi_params->mi_cols, 2);
+ mi_params->mb_rows = ROUND_POWER_OF_TWO(mi_params->mi_rows, 2);
+ mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols;
+
+ const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+ mi_params->mi_alloc_stride =
+ (mi_params->mi_stride + mi_alloc_size_1d - 1) / mi_alloc_size_1d;
+
+ assert(mi_size_wide[mi_params->mi_alloc_bsize] ==
+ mi_size_high[mi_params->mi_alloc_bsize]);
+}
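+// e.g. (illustration): a 1920x1080 frame gives mi_cols = 480 and
+// mi_rows = 270, so mb_cols = ROUND_POWER_OF_TWO(480, 2) = 120,
+// mb_rows = 68 and MBs = 120 * 68 = 8160 16x16 macroblocks.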
+
+static AOM_INLINE void enc_free_mi(CommonModeInfoParams *mi_params) {
+ aom_free(mi_params->mi_alloc);
+ mi_params->mi_alloc = NULL;
+ mi_params->mi_alloc_size = 0;
+ aom_free(mi_params->mi_grid_base);
+ mi_params->mi_grid_base = NULL;
+ mi_params->mi_grid_size = 0;
+ aom_free(mi_params->tx_type_map);
+ mi_params->tx_type_map = NULL;
+}
+
+static AOM_INLINE void enc_set_mb_mi(CommonModeInfoParams *mi_params, int width,
+ int height,
+ BLOCK_SIZE min_partition_size) {
+ mi_params->mi_alloc_bsize = min_partition_size;
+
+ set_mb_mi(mi_params, width, height);
+}
+
+static AOM_INLINE void stat_stage_set_mb_mi(CommonModeInfoParams *mi_params,
+ int width, int height,
+ BLOCK_SIZE min_partition_size) {
+ (void)min_partition_size;
+ mi_params->mi_alloc_bsize = BLOCK_16X16;
+
+ set_mb_mi(mi_params, width, height);
+}
+
+static AOM_INLINE void enc_setup_mi(CommonModeInfoParams *mi_params) {
+ const int mi_grid_size =
+ mi_params->mi_stride * calc_mi_size(mi_params->mi_rows);
+ memset(mi_params->mi_alloc, 0,
+ mi_params->mi_alloc_size * sizeof(*mi_params->mi_alloc));
+ memset(mi_params->mi_grid_base, 0,
+ mi_grid_size * sizeof(*mi_params->mi_grid_base));
+ memset(mi_params->tx_type_map, 0,
+ mi_grid_size * sizeof(*mi_params->tx_type_map));
+}
+
+static AOM_INLINE void init_buffer_indices(
+ ForceIntegerMVInfo *const force_intpel_info, int *const remapped_ref_idx) {
+ int fb_idx;
+ for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx)
+ remapped_ref_idx[fb_idx] = fb_idx;
+ force_intpel_info->rate_index = 0;
+ force_intpel_info->rate_size = 0;
+}
+
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX3DF, JSDAF, JSVAF) \
+ ppi->fn_ptr[BT].sdf = SDF; \
+ ppi->fn_ptr[BT].sdaf = SDAF; \
+ ppi->fn_ptr[BT].vf = VF; \
+ ppi->fn_ptr[BT].svf = SVF; \
+ ppi->fn_ptr[BT].svaf = SVAF; \
+ ppi->fn_ptr[BT].sdx4df = SDX4DF; \
+ ppi->fn_ptr[BT].sdx3df = SDX3DF; \
+ ppi->fn_ptr[BT].jsdaf = JSDAF; \
+ ppi->fn_ptr[BT].jsvaf = JSVAF;
+
+#define HIGHBD_BFP_WRAPPER(WIDTH, HEIGHT, BD) \
+ HIGHBD_BFP( \
+ BLOCK_##WIDTH##X##HEIGHT, aom_highbd_sad##WIDTH##x##HEIGHT##_bits##BD, \
+ aom_highbd_sad##WIDTH##x##HEIGHT##_avg_bits##BD, \
+ aom_highbd_##BD##_variance##WIDTH##x##HEIGHT, \
+ aom_highbd_##BD##_sub_pixel_variance##WIDTH##x##HEIGHT, \
+ aom_highbd_##BD##_sub_pixel_avg_variance##WIDTH##x##HEIGHT, \
+ aom_highbd_sad##WIDTH##x##HEIGHT##x4d_bits##BD, \
+ aom_highbd_sad##WIDTH##x##HEIGHT##x3d_bits##BD, \
+ aom_highbd_dist_wtd_sad##WIDTH##x##HEIGHT##_avg_bits##BD, \
+ aom_highbd_##BD##_dist_wtd_sub_pixel_avg_variance##WIDTH##x##HEIGHT)
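+
+// e.g. (illustration): HIGHBD_BFP_WRAPPER(64, 64, 10) expands, among other
+// assignments, to
+//   ppi->fn_ptr[BLOCK_64X64].sdf = aom_highbd_sad64x64_bits10;
+// wiring the 10-bit wrappers defined below into the function pointer table.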
+
+#define MAKE_BFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \
+ }
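+
+// The _bits10 and _bits12 wrappers above (and in the variants below) shift
+// the SAD right by 2 and 4, i.e. by bit_depth - 8, normalizing distortion
+// from 10- and 12-bit sources back to the 8-bit scale used by the RD code.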
+
+#define MAKE_BFP_SADAVG_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+ 4; \
+ }
+
+#define MAKE_BFP_SAD4D_WRAPPER(fnname) \
+ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ } \
+ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 2; \
+ } \
+ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 4; \
+ }
+
+#define MAKE_BFP_JSADAVG_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred, \
+ const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+ jcp_param); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred, \
+ const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+ jcp_param) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred, \
+ const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+ jcp_param) >> \
+ 4; \
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x3d)
+
+#if !CONFIG_REALTIME_ONLY
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x3d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x3d)
+#endif
+
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x4_avg)
+#if !CONFIG_REALTIME_ONLY
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg)
+#endif
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
+ ppi->fn_ptr[BT].msdf = MCSDF; \
+ ppi->fn_ptr[BT].msvf = MCSVF;
+
+#define HIGHBD_MBFP_WRAPPER(WIDTH, HEIGHT, BD) \
+ HIGHBD_MBFP(BLOCK_##WIDTH##X##HEIGHT, \
+ aom_highbd_masked_sad##WIDTH##x##HEIGHT##_bits##BD, \
+ aom_highbd_##BD##_masked_sub_pixel_variance##WIDTH##x##HEIGHT)
+
+#define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \
+ int m_stride, int invert_mask) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+ second_pred_ptr, m, m_stride, invert_mask); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \
+ int m_stride, int invert_mask) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+ second_pred_ptr, m, m_stride, invert_mask) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \
+ int m_stride, int invert_mask) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+ second_pred_ptr, m, m_stride, invert_mask) >> \
+ 4; \
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4)
+#if !CONFIG_REALTIME_ONLY
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16)
+#endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+#define HIGHBD_SDSFP(BT, SDSF, SDSX4DF) \
+ ppi->fn_ptr[BT].sdsf = SDSF; \
+ ppi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+#define HIGHBD_SDSFP_WRAPPER(WIDTH, HEIGHT, BD) \
+ HIGHBD_SDSFP(BLOCK_##WIDTH##X##HEIGHT, \
+ aom_highbd_sad_skip_##WIDTH##x##HEIGHT##_bits##BD, \
+ aom_highbd_sad_skip_##WIDTH##x##HEIGHT##x4d##_bits##BD)
+
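+// Wrappers for the "skip" SAD functions, which compute SAD on every other
+// row for speed; as above, the 10- and 12-bit variants normalize the result
+// to an 8-bit scale.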
+#define MAKE_SDSF_SKIP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return fnname(src, src_stride, ref, ref_stride); \
+ } \
+ static unsigned int fnname##_bits10(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return fnname(src, src_stride, ref, ref_stride) >> 2; \
+ } \
+ static unsigned int fnname##_bits12(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return fnname(src, src_stride, ref, ref_stride) >> 4; \
+ }
+
+#define MAKE_SDSF_SKIP_SAD_4D_WRAPPER(fnname) \
+ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ } \
+ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 2; \
+ } \
+ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 4; \
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x128)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_128x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x128)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x32)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x8)
+
+#if !CONFIG_REALTIME_ONLY
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_64x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_32x8)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_16x64)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_4x16)
+MAKE_SDSF_SKIP_SAD_WRAPPER(aom_highbd_sad_skip_8x32)
+#endif
+
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x128x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_128x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x128x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x32x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x8x4d)
+
+#if !CONFIG_REALTIME_ONLY
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_64x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_32x8x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_16x64x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_4x16x4d)
+MAKE_SDSF_SKIP_SAD_4D_WRAPPER(aom_highbd_sad_skip_8x32x4d)
+#endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+#if !CONFIG_REALTIME_ONLY
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#define HIGHBD_OBFP_WRAPPER_8(WIDTH, HEIGHT) \
+ HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT, \
+ aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits8, \
+ aom_highbd_8_obmc_variance##WIDTH##x##HEIGHT, \
+ aom_highbd_8_obmc_sub_pixel_variance##WIDTH##x##HEIGHT)
+
+#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
+ ppi->fn_ptr[BT].osdf = OSDF; \
+ ppi->fn_ptr[BT].ovf = OVF; \
+ ppi->fn_ptr[BT].osvf = OSVF;
+
+#define HIGHBD_OBFP_WRAPPER(WIDTH, HEIGHT, BD) \
+ HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT, \
+ aom_highbd_obmc_sad##WIDTH##x##HEIGHT##_bits##BD, \
+ aom_highbd_##BD##_obmc_variance##WIDTH##x##HEIGHT, \
+ aom_highbd_##BD##_obmc_sub_pixel_variance##WIDTH##x##HEIGHT)
+
+#define MAKE_OBFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk); \
+ } \
+ static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk) >> 2; \
+ } \
+ static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk) >> 4; \
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16)
+#endif  // !CONFIG_REALTIME_ONLY
+
+static AOM_INLINE void highbd_set_var_fns(AV1_PRIMARY *const ppi) {
+ SequenceHeader *const seq_params = &ppi->seq_params;
+ if (seq_params->use_highbitdepth) {
+ switch (seq_params->bit_depth) {
+ case AOM_BITS_8:
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_BFP_WRAPPER(64, 16, 8)
+ HIGHBD_BFP_WRAPPER(16, 64, 8)
+ HIGHBD_BFP_WRAPPER(32, 8, 8)
+ HIGHBD_BFP_WRAPPER(8, 32, 8)
+ HIGHBD_BFP_WRAPPER(16, 4, 8)
+ HIGHBD_BFP_WRAPPER(4, 16, 8)
+#endif
+ HIGHBD_BFP_WRAPPER(32, 16, 8)
+ HIGHBD_BFP_WRAPPER(16, 32, 8)
+ HIGHBD_BFP_WRAPPER(64, 32, 8)
+ HIGHBD_BFP_WRAPPER(32, 64, 8)
+ HIGHBD_BFP_WRAPPER(32, 32, 8)
+ HIGHBD_BFP_WRAPPER(64, 64, 8)
+ HIGHBD_BFP_WRAPPER(16, 16, 8)
+ HIGHBD_BFP_WRAPPER(16, 8, 8)
+ HIGHBD_BFP_WRAPPER(8, 16, 8)
+ HIGHBD_BFP_WRAPPER(8, 8, 8)
+ HIGHBD_BFP_WRAPPER(8, 4, 8)
+ HIGHBD_BFP_WRAPPER(4, 8, 8)
+ HIGHBD_BFP_WRAPPER(4, 4, 8)
+ HIGHBD_BFP_WRAPPER(128, 128, 8)
+ HIGHBD_BFP_WRAPPER(128, 64, 8)
+ HIGHBD_BFP_WRAPPER(64, 128, 8)
+
+ HIGHBD_MBFP_WRAPPER(128, 128, 8)
+ HIGHBD_MBFP_WRAPPER(128, 64, 8)
+ HIGHBD_MBFP_WRAPPER(64, 128, 8)
+ HIGHBD_MBFP_WRAPPER(64, 64, 8)
+ HIGHBD_MBFP_WRAPPER(64, 32, 8)
+ HIGHBD_MBFP_WRAPPER(32, 64, 8)
+ HIGHBD_MBFP_WRAPPER(32, 32, 8)
+ HIGHBD_MBFP_WRAPPER(32, 16, 8)
+ HIGHBD_MBFP_WRAPPER(16, 32, 8)
+ HIGHBD_MBFP_WRAPPER(16, 16, 8)
+ HIGHBD_MBFP_WRAPPER(8, 16, 8)
+ HIGHBD_MBFP_WRAPPER(16, 8, 8)
+ HIGHBD_MBFP_WRAPPER(8, 8, 8)
+ HIGHBD_MBFP_WRAPPER(4, 8, 8)
+ HIGHBD_MBFP_WRAPPER(8, 4, 8)
+ HIGHBD_MBFP_WRAPPER(4, 4, 8)
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_MBFP_WRAPPER(64, 16, 8)
+ HIGHBD_MBFP_WRAPPER(16, 64, 8)
+ HIGHBD_MBFP_WRAPPER(32, 8, 8)
+ HIGHBD_MBFP_WRAPPER(8, 32, 8)
+ HIGHBD_MBFP_WRAPPER(16, 4, 8)
+ HIGHBD_MBFP_WRAPPER(4, 16, 8)
+#endif
+
+// OBMC is excluded from the realtime-only build.
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_OBFP_WRAPPER_8(128, 128)
+ HIGHBD_OBFP_WRAPPER_8(128, 64)
+ HIGHBD_OBFP_WRAPPER_8(64, 128)
+ HIGHBD_OBFP_WRAPPER_8(64, 64)
+ HIGHBD_OBFP_WRAPPER_8(64, 32)
+ HIGHBD_OBFP_WRAPPER_8(32, 64)
+ HIGHBD_OBFP_WRAPPER_8(32, 32)
+ HIGHBD_OBFP_WRAPPER_8(32, 16)
+ HIGHBD_OBFP_WRAPPER_8(16, 32)
+ HIGHBD_OBFP_WRAPPER_8(16, 16)
+ HIGHBD_OBFP_WRAPPER_8(8, 16)
+ HIGHBD_OBFP_WRAPPER_8(16, 8)
+ HIGHBD_OBFP_WRAPPER_8(8, 8)
+ HIGHBD_OBFP_WRAPPER_8(4, 8)
+ HIGHBD_OBFP_WRAPPER_8(8, 4)
+ HIGHBD_OBFP_WRAPPER_8(4, 4)
+ HIGHBD_OBFP_WRAPPER_8(64, 16)
+ HIGHBD_OBFP_WRAPPER_8(16, 64)
+ HIGHBD_OBFP_WRAPPER_8(32, 8)
+ HIGHBD_OBFP_WRAPPER_8(8, 32)
+ HIGHBD_OBFP_WRAPPER_8(16, 4)
+ HIGHBD_OBFP_WRAPPER_8(4, 16)
+#endif
+
+ HIGHBD_SDSFP_WRAPPER(128, 128, 8)
+ HIGHBD_SDSFP_WRAPPER(128, 64, 8)
+ HIGHBD_SDSFP_WRAPPER(64, 128, 8)
+ HIGHBD_SDSFP_WRAPPER(64, 64, 8)
+ HIGHBD_SDSFP_WRAPPER(64, 32, 8)
+ HIGHBD_SDSFP_WRAPPER(32, 64, 8)
+ HIGHBD_SDSFP_WRAPPER(32, 32, 8)
+ HIGHBD_SDSFP_WRAPPER(32, 16, 8)
+ HIGHBD_SDSFP_WRAPPER(16, 32, 8)
+ HIGHBD_SDSFP_WRAPPER(16, 16, 8)
+ HIGHBD_SDSFP_WRAPPER(16, 8, 8)
+ HIGHBD_SDSFP_WRAPPER(8, 16, 8)
+ HIGHBD_SDSFP_WRAPPER(8, 8, 8)
+ HIGHBD_SDSFP_WRAPPER(4, 8, 8)
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_SDSFP_WRAPPER(64, 16, 8)
+ HIGHBD_SDSFP_WRAPPER(32, 8, 8)
+ HIGHBD_SDSFP_WRAPPER(16, 64, 8)
+ HIGHBD_SDSFP_WRAPPER(8, 32, 8)
+ HIGHBD_SDSFP_WRAPPER(4, 16, 8)
+#endif
+ break;
+
+ case AOM_BITS_10:
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_BFP_WRAPPER(64, 16, 10)
+ HIGHBD_BFP_WRAPPER(16, 64, 10)
+ HIGHBD_BFP_WRAPPER(32, 8, 10)
+ HIGHBD_BFP_WRAPPER(8, 32, 10)
+ HIGHBD_BFP_WRAPPER(16, 4, 10)
+ HIGHBD_BFP_WRAPPER(4, 16, 10)
+#endif
+ HIGHBD_BFP_WRAPPER(32, 16, 10)
+ HIGHBD_BFP_WRAPPER(16, 32, 10)
+ HIGHBD_BFP_WRAPPER(64, 32, 10)
+ HIGHBD_BFP_WRAPPER(32, 64, 10)
+ HIGHBD_BFP_WRAPPER(32, 32, 10)
+ HIGHBD_BFP_WRAPPER(64, 64, 10)
+ HIGHBD_BFP_WRAPPER(16, 16, 10)
+ HIGHBD_BFP_WRAPPER(16, 8, 10)
+ HIGHBD_BFP_WRAPPER(8, 16, 10)
+ HIGHBD_BFP_WRAPPER(8, 8, 10)
+ HIGHBD_BFP_WRAPPER(8, 4, 10)
+ HIGHBD_BFP_WRAPPER(4, 8, 10)
+ HIGHBD_BFP_WRAPPER(4, 4, 10)
+ HIGHBD_BFP_WRAPPER(128, 128, 10)
+ HIGHBD_BFP_WRAPPER(128, 64, 10)
+ HIGHBD_BFP_WRAPPER(64, 128, 10)
+
+ HIGHBD_MBFP_WRAPPER(128, 128, 10)
+ HIGHBD_MBFP_WRAPPER(128, 64, 10)
+ HIGHBD_MBFP_WRAPPER(64, 128, 10)
+ HIGHBD_MBFP_WRAPPER(64, 64, 10)
+ HIGHBD_MBFP_WRAPPER(64, 32, 10)
+ HIGHBD_MBFP_WRAPPER(32, 64, 10)
+ HIGHBD_MBFP_WRAPPER(32, 32, 10)
+ HIGHBD_MBFP_WRAPPER(32, 16, 10)
+ HIGHBD_MBFP_WRAPPER(16, 32, 10)
+ HIGHBD_MBFP_WRAPPER(16, 16, 10)
+ HIGHBD_MBFP_WRAPPER(8, 16, 10)
+ HIGHBD_MBFP_WRAPPER(16, 8, 10)
+ HIGHBD_MBFP_WRAPPER(8, 8, 10)
+ HIGHBD_MBFP_WRAPPER(4, 8, 10)
+ HIGHBD_MBFP_WRAPPER(8, 4, 10)
+ HIGHBD_MBFP_WRAPPER(4, 4, 10)
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_MBFP_WRAPPER(64, 16, 10)
+ HIGHBD_MBFP_WRAPPER(16, 64, 10)
+ HIGHBD_MBFP_WRAPPER(32, 8, 10)
+ HIGHBD_MBFP_WRAPPER(8, 32, 10)
+ HIGHBD_MBFP_WRAPPER(16, 4, 10)
+ HIGHBD_MBFP_WRAPPER(4, 16, 10)
+#endif
+
+// OBMC is excluded from the realtime-only build.
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_OBFP_WRAPPER(128, 128, 10)
+ HIGHBD_OBFP_WRAPPER(128, 64, 10)
+ HIGHBD_OBFP_WRAPPER(64, 128, 10)
+ HIGHBD_OBFP_WRAPPER(64, 64, 10)
+ HIGHBD_OBFP_WRAPPER(64, 32, 10)
+ HIGHBD_OBFP_WRAPPER(32, 64, 10)
+ HIGHBD_OBFP_WRAPPER(32, 32, 10)
+ HIGHBD_OBFP_WRAPPER(32, 16, 10)
+ HIGHBD_OBFP_WRAPPER(16, 32, 10)
+ HIGHBD_OBFP_WRAPPER(16, 16, 10)
+ HIGHBD_OBFP_WRAPPER(8, 16, 10)
+ HIGHBD_OBFP_WRAPPER(16, 8, 10)
+ HIGHBD_OBFP_WRAPPER(8, 8, 10)
+ HIGHBD_OBFP_WRAPPER(4, 8, 10)
+ HIGHBD_OBFP_WRAPPER(8, 4, 10)
+ HIGHBD_OBFP_WRAPPER(4, 4, 10)
+ HIGHBD_OBFP_WRAPPER(64, 16, 10)
+ HIGHBD_OBFP_WRAPPER(16, 64, 10)
+ HIGHBD_OBFP_WRAPPER(32, 8, 10)
+ HIGHBD_OBFP_WRAPPER(8, 32, 10)
+ HIGHBD_OBFP_WRAPPER(16, 4, 10)
+ HIGHBD_OBFP_WRAPPER(4, 16, 10)
+#endif
+
+ HIGHBD_SDSFP_WRAPPER(128, 128, 10)
+ HIGHBD_SDSFP_WRAPPER(128, 64, 10)
+ HIGHBD_SDSFP_WRAPPER(64, 128, 10)
+ HIGHBD_SDSFP_WRAPPER(64, 64, 10)
+ HIGHBD_SDSFP_WRAPPER(64, 32, 10)
+ HIGHBD_SDSFP_WRAPPER(32, 64, 10)
+ HIGHBD_SDSFP_WRAPPER(32, 32, 10)
+ HIGHBD_SDSFP_WRAPPER(32, 16, 10)
+ HIGHBD_SDSFP_WRAPPER(16, 32, 10)
+ HIGHBD_SDSFP_WRAPPER(16, 16, 10)
+ HIGHBD_SDSFP_WRAPPER(16, 8, 10)
+ HIGHBD_SDSFP_WRAPPER(8, 16, 10)
+ HIGHBD_SDSFP_WRAPPER(8, 8, 10)
+ HIGHBD_SDSFP_WRAPPER(4, 8, 10)
+
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_SDSFP_WRAPPER(64, 16, 10)
+ HIGHBD_SDSFP_WRAPPER(32, 8, 10)
+ HIGHBD_SDSFP_WRAPPER(16, 64, 10)
+ HIGHBD_SDSFP_WRAPPER(8, 32, 10)
+ HIGHBD_SDSFP_WRAPPER(4, 16, 10)
+#endif
+ break;
+
+ case AOM_BITS_12:
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_BFP_WRAPPER(64, 16, 12)
+ HIGHBD_BFP_WRAPPER(16, 64, 12)
+ HIGHBD_BFP_WRAPPER(32, 8, 12)
+ HIGHBD_BFP_WRAPPER(8, 32, 12)
+ HIGHBD_BFP_WRAPPER(16, 4, 12)
+ HIGHBD_BFP_WRAPPER(4, 16, 12)
+#endif
+ HIGHBD_BFP_WRAPPER(32, 16, 12)
+ HIGHBD_BFP_WRAPPER(16, 32, 12)
+ HIGHBD_BFP_WRAPPER(64, 32, 12)
+ HIGHBD_BFP_WRAPPER(32, 64, 12)
+ HIGHBD_BFP_WRAPPER(32, 32, 12)
+ HIGHBD_BFP_WRAPPER(64, 64, 12)
+ HIGHBD_BFP_WRAPPER(16, 16, 12)
+ HIGHBD_BFP_WRAPPER(16, 8, 12)
+ HIGHBD_BFP_WRAPPER(8, 16, 12)
+ HIGHBD_BFP_WRAPPER(8, 8, 12)
+ HIGHBD_BFP_WRAPPER(8, 4, 12)
+ HIGHBD_BFP_WRAPPER(4, 8, 12)
+ HIGHBD_BFP_WRAPPER(4, 4, 12)
+ HIGHBD_BFP_WRAPPER(128, 128, 12)
+ HIGHBD_BFP_WRAPPER(128, 64, 12)
+ HIGHBD_BFP_WRAPPER(64, 128, 12)
+
+ HIGHBD_MBFP_WRAPPER(128, 128, 12)
+ HIGHBD_MBFP_WRAPPER(128, 64, 12)
+ HIGHBD_MBFP_WRAPPER(64, 128, 12)
+ HIGHBD_MBFP_WRAPPER(64, 64, 12)
+ HIGHBD_MBFP_WRAPPER(64, 32, 12)
+ HIGHBD_MBFP_WRAPPER(32, 64, 12)
+ HIGHBD_MBFP_WRAPPER(32, 32, 12)
+ HIGHBD_MBFP_WRAPPER(32, 16, 12)
+ HIGHBD_MBFP_WRAPPER(16, 32, 12)
+ HIGHBD_MBFP_WRAPPER(16, 16, 12)
+ HIGHBD_MBFP_WRAPPER(8, 16, 12)
+ HIGHBD_MBFP_WRAPPER(16, 8, 12)
+ HIGHBD_MBFP_WRAPPER(8, 8, 12)
+ HIGHBD_MBFP_WRAPPER(4, 8, 12)
+ HIGHBD_MBFP_WRAPPER(8, 4, 12)
+ HIGHBD_MBFP_WRAPPER(4, 4, 12)
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_MBFP_WRAPPER(64, 16, 12)
+ HIGHBD_MBFP_WRAPPER(16, 64, 12)
+ HIGHBD_MBFP_WRAPPER(32, 8, 12)
+ HIGHBD_MBFP_WRAPPER(8, 32, 12)
+ HIGHBD_MBFP_WRAPPER(16, 4, 12)
+ HIGHBD_MBFP_WRAPPER(4, 16, 12)
+#endif
+
+// OBMC is excluded from the realtime-only build.
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_OBFP_WRAPPER(128, 128, 12)
+ HIGHBD_OBFP_WRAPPER(128, 64, 12)
+ HIGHBD_OBFP_WRAPPER(64, 128, 12)
+ HIGHBD_OBFP_WRAPPER(64, 64, 12)
+ HIGHBD_OBFP_WRAPPER(64, 32, 12)
+ HIGHBD_OBFP_WRAPPER(32, 64, 12)
+ HIGHBD_OBFP_WRAPPER(32, 32, 12)
+ HIGHBD_OBFP_WRAPPER(32, 16, 12)
+ HIGHBD_OBFP_WRAPPER(16, 32, 12)
+ HIGHBD_OBFP_WRAPPER(16, 16, 12)
+ HIGHBD_OBFP_WRAPPER(8, 16, 12)
+ HIGHBD_OBFP_WRAPPER(16, 8, 12)
+ HIGHBD_OBFP_WRAPPER(8, 8, 12)
+ HIGHBD_OBFP_WRAPPER(4, 8, 12)
+ HIGHBD_OBFP_WRAPPER(8, 4, 12)
+ HIGHBD_OBFP_WRAPPER(4, 4, 12)
+ HIGHBD_OBFP_WRAPPER(64, 16, 12)
+ HIGHBD_OBFP_WRAPPER(16, 64, 12)
+ HIGHBD_OBFP_WRAPPER(32, 8, 12)
+ HIGHBD_OBFP_WRAPPER(8, 32, 12)
+ HIGHBD_OBFP_WRAPPER(16, 4, 12)
+ HIGHBD_OBFP_WRAPPER(4, 16, 12)
+#endif
+
+ HIGHBD_SDSFP_WRAPPER(128, 128, 12)
+ HIGHBD_SDSFP_WRAPPER(128, 64, 12)
+ HIGHBD_SDSFP_WRAPPER(64, 128, 12)
+ HIGHBD_SDSFP_WRAPPER(64, 64, 12)
+ HIGHBD_SDSFP_WRAPPER(64, 32, 12)
+ HIGHBD_SDSFP_WRAPPER(32, 64, 12)
+ HIGHBD_SDSFP_WRAPPER(32, 32, 12)
+ HIGHBD_SDSFP_WRAPPER(32, 16, 12)
+ HIGHBD_SDSFP_WRAPPER(16, 32, 12)
+ HIGHBD_SDSFP_WRAPPER(16, 16, 12)
+ HIGHBD_SDSFP_WRAPPER(16, 8, 12)
+ HIGHBD_SDSFP_WRAPPER(8, 16, 12)
+ HIGHBD_SDSFP_WRAPPER(8, 8, 12)
+ HIGHBD_SDSFP_WRAPPER(4, 8, 12)
+
+#if !CONFIG_REALTIME_ONLY
+ HIGHBD_SDSFP_WRAPPER(64, 16, 12)
+ HIGHBD_SDSFP_WRAPPER(32, 8, 12)
+ HIGHBD_SDSFP_WRAPPER(16, 64, 12)
+ HIGHBD_SDSFP_WRAPPER(8, 32, 12)
+ HIGHBD_SDSFP_WRAPPER(4, 16, 12)
+#endif
+ break;
+
+ default:
+ assert(0 &&
+               "seq_params->bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ }
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static AOM_INLINE void copy_frame_prob_info(AV1_COMP *cpi) {
+ FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs;
+ if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
+ av1_copy(frame_probs->tx_type_probs, default_tx_type_probs);
+ }
+ if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+ cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) {
+ av1_copy(frame_probs->obmc_probs, default_obmc_probs);
+ }
+ if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+ av1_copy(frame_probs->warped_probs, default_warped_probs);
+ }
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+ av1_copy(frame_probs->switchable_interp_probs,
+ default_switchable_interp_probs);
+ }
+
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ FrameProbInfo *const temp_frame_probs = &cpi->ppi->temp_frame_probs;
+ if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
+ av1_copy(temp_frame_probs->tx_type_probs, default_tx_type_probs);
+ }
+ if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+ cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) {
+ av1_copy(temp_frame_probs->obmc_probs, default_obmc_probs);
+ }
+ if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+ av1_copy(temp_frame_probs->warped_probs, default_warped_probs);
+ }
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+ av1_copy(temp_frame_probs->switchable_interp_probs,
+ default_switchable_interp_probs);
+ }
+
+ FrameProbInfo *const temp_frame_probs_simulation =
+ &cpi->ppi->temp_frame_probs_simulation;
+ if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
+ av1_copy(temp_frame_probs_simulation->tx_type_probs,
+ default_tx_type_probs);
+ }
+ if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+ cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) {
+ av1_copy(temp_frame_probs_simulation->obmc_probs, default_obmc_probs);
+ }
+ if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
+ av1_copy(temp_frame_probs_simulation->warped_probs, default_warped_probs);
+ }
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+ av1_copy(temp_frame_probs_simulation->switchable_interp_probs,
+ default_switchable_interp_probs);
+ }
+ }
+#endif  // CONFIG_FPMT_TEST
+}
+
+static AOM_INLINE void restore_cdef_coding_context(CdefInfo *const dst,
+ const CdefInfo *const src) {
+ dst->cdef_bits = src->cdef_bits;
+ dst->cdef_damping = src->cdef_damping;
+ av1_copy(dst->cdef_strengths, src->cdef_strengths);
+ av1_copy(dst->cdef_uv_strengths, src->cdef_uv_strengths);
+ dst->nb_cdef_strengths = src->nb_cdef_strengths;
+}
+
+// Coding context that only needs to be restored when the recode loop includes
+// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+// restoration).
+static AOM_INLINE void restore_extra_coding_context(AV1_COMP *cpi) {
+ CODING_CONTEXT *const cc = &cpi->coding_context;
+ AV1_COMMON *cm = &cpi->common;
+ cm->lf = cc->lf;
+ restore_cdef_coding_context(&cm->cdef_info, &cc->cdef_info);
+ cpi->rc = cc->rc;
+ cpi->ppi->mv_stats = cc->mv_stats;
+}
+
+static AOM_INLINE int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ return a->y_height == b->y_height && a->y_width == b->y_width &&
+ a->uv_height == b->uv_height && a->uv_width == b->uv_width &&
+ a->y_stride == b->y_stride && a->uv_stride == b->uv_stride &&
+ a->border == b->border &&
+ (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
+ (b->flags & YV12_FLAG_HIGHBITDEPTH);
+}
+
+static AOM_INLINE int update_entropy(bool *ext_refresh_frame_context,
+ bool *ext_refresh_frame_context_pending,
+ bool update) {
+ *ext_refresh_frame_context = update;
+ *ext_refresh_frame_context_pending = 1;
+ return 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
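+// Blends a prior boost with a TPL boost. The blend weight grows with
+// sqrt(frames_to_key) and is clamped to [min_factor, max_factor], so longer
+// keyframe intervals weight the prior boost more heavily while shorter ones
+// favor the TPL boost.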
+static AOM_INLINE int combine_prior_with_tpl_boost(double min_factor,
+ double max_factor,
+ int prior_boost,
+ int tpl_boost,
+ int frames_to_key) {
+ double factor = sqrt((double)frames_to_key);
+ double range = max_factor - min_factor;
+ factor = AOMMIN(factor, max_factor);
+ factor = AOMMAX(factor, min_factor);
+ factor -= min_factor;
+ int boost =
+ (int)((factor * prior_boost + (range - factor) * tpl_boost) / range);
+ return boost;
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+static AOM_INLINE void set_size_independent_vars(AV1_COMP *cpi) {
+ int i;
+ AV1_COMMON *const cm = &cpi->common;
+ FeatureFlags *const features = &cm->features;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ cm->global_motion[i] = default_warp_params;
+ }
+ cpi->gm_info.search_done = 0;
+
+ av1_set_speed_features_framesize_independent(cpi, cpi->speed);
+ av1_set_rd_speed_thresholds(cpi);
+ features->interp_filter = SWITCHABLE;
+ features->switchable_motion_mode = is_switchable_motion_mode_allowed(
+ features->allow_warped_motion, cpi->oxcf.motion_mode_cfg.enable_obmc);
+}
+
+static AOM_INLINE void release_scaled_references(AV1_COMP *cpi) {
+  // A scaled reference only needs to be released under certain conditions:
+  // when the reference will be refreshed, or when the scaled copy has the
+  // same resolution as the reference (i.e. no scaling was actually needed).
+  // For now this is applied only to the golden frame in non-SVC RTC mode.
+ AV1_COMMON *const cm = &cpi->common;
+ const bool refresh_golden = (cpi->refresh_frame.golden_frame) ? 1 : 0;
+ bool release_golden = true;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ RefCntBuffer *const buf = cpi->scaled_ref_buf[i];
+ const int golden_ref = (i == GOLDEN_FRAME - 1);
+ if (golden_ref && is_one_pass_rt_params(cpi) && !cpi->ppi->use_svc &&
+ buf != NULL) {
+ const RefCntBuffer *const ref = get_ref_frame_buf(cm, GOLDEN_FRAME);
+ const bool same_resoln = buf->buf.y_crop_width == ref->buf.y_crop_width &&
+ buf->buf.y_crop_height == ref->buf.y_crop_height;
+ release_golden = refresh_golden || same_resoln;
+ }
+ if (buf != NULL && (!golden_ref || (golden_ref && release_golden))) {
+ --buf->ref_count;
+ cpi->scaled_ref_buf[i] = NULL;
+ }
+ }
+}
+
+static AOM_INLINE void restore_all_coding_context(AV1_COMP *cpi) {
+ restore_extra_coding_context(cpi);
+ if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
+}
+
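+// Returns 1 when the RTC one-layer reference structure allows the encoder to
+// work with one fewer reference buffer (the last slot is never refreshed).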
+static AOM_INLINE int reduce_num_ref_buffers(const AV1_COMP *cpi) {
+ const SequenceHeader *const seq_params = cpi->common.seq_params;
+ return is_one_pass_rt_params(cpi) &&
+ use_rtc_reference_structure_one_layer(cpi) &&
+ (seq_params->order_hint_info.enable_order_hint == 0) &&
+ cpi->rt_reduce_num_ref_buffers;
+}
+
+// Refresh reference frame buffers according to refresh_frame_flags.
+static AOM_INLINE void refresh_reference_frames(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+  // All buffers are refreshed for shown keyframes and S-frames.
+  // In the RT case, the golden frame refreshes slot 6 and the other reference
+  // frames refresh slots 0 to 5. Slot 7 is never refreshed by any reference
+  // frame, so only 7 buffers are refreshed for keyframes and S-frames
+  // instead of 8.
+ int num_ref_buffers = REF_FRAMES;
+ if (reduce_num_ref_buffers(cpi)) {
+ const int refresh_all_bufs =
+ (cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET ||
+ frame_is_sframe(cm));
+ assert(IMPLIES(((cm->current_frame.refresh_frame_flags >> 7) & 1) == 1,
+ refresh_all_bufs));
+ (void)refresh_all_bufs;
+ num_ref_buffers--;
+ }
+
+ for (int ref_frame = 0; ref_frame < num_ref_buffers; ref_frame++) {
+ if (((cm->current_frame.refresh_frame_flags >> ref_frame) & 1) == 1) {
+ assign_frame_buffer_p(&cm->ref_frame_map[ref_frame], cm->cur_frame);
+ }
+ }
+}
+
+void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi,
+ const AV1EncoderConfig *oxcf);
+void av1_update_film_grain_parameters(struct AV1_COMP *cpi,
+ const AV1EncoderConfig *oxcf);
+
+void av1_scale_references(AV1_COMP *cpi, const InterpFilter filter,
+ const int phase, const int use_optimized_scaler);
+
+void av1_setup_frame(AV1_COMP *cpi);
+
+BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width,
+ int height, int number_spatial_layers);
+
+void av1_apply_active_map(AV1_COMP *cpi);
+
+#if !CONFIG_REALTIME_ONLY
+uint16_t av1_setup_interp_filter_search_mask(AV1_COMP *cpi);
+
+void av1_determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig);
+#endif
+
+void av1_set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
+ int *top_index);
+
+void av1_finalize_encoded_frame(AV1_COMP *const cpi);
+
+int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture,
+ const YV12_BUFFER_CONFIG *last_picture,
+ ForceIntegerMVInfo *const force_intpel_info);
+
+void av1_set_mb_ssim_rdmult_scaling(AV1_COMP *cpi);
+
+void av1_save_all_coding_context(AV1_COMP *cpi);
+
+#if DUMP_RECON_FRAMES == 1
+void av1_dump_filtered_recon_frames(AV1_COMP *cpi);
+#endif
+
+static AOM_INLINE int av1_get_enc_border_size(bool resize, bool all_intra,
+ BLOCK_SIZE sb_size) {
+  // In all-intra encoding mode, inter-frame motion search is not applicable
+  // and intraBC motion vectors are restricted to the tile boundaries, so a
+  // smaller frame border (AOM_ENC_ALLINTRA_BORDER) suffices.
+ if (resize) {
+ return AOM_BORDER_IN_PIXELS;
+ }
+ if (all_intra) {
+ return AOM_ENC_ALLINTRA_BORDER;
+ }
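+  // Otherwise scale the border with the superblock size so that motion
+  // search around the frame edges has enough padding.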
+ return block_size_wide[sb_size] + 32;
+}
+
+static AOM_INLINE bool av1_is_resize_needed(const AV1EncoderConfig *oxcf) {
+ const ResizeCfg *resize_cfg = &oxcf->resize_cfg;
+ const SuperResCfg *superres_cfg = &oxcf->superres_cfg;
+ return resize_cfg->resize_mode || superres_cfg->superres_mode;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODER_UTILS_H_
diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c
new file mode 100644
index 0000000000..5fe2a497c7
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodetxb.c
@@ -0,0 +1,886 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encodetxb.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/blockd.h"
+#include "av1/common/idct.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/scan.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/hash.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
+
+void av1_alloc_txb_buf(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool;
+ const int num_sb_rows =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2);
+ const int num_sb_cols =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2);
+ const int size = num_sb_rows * num_sb_cols;
+ const int num_planes = av1_num_planes(cm);
+ const int subsampling_x = cm->seq_params->subsampling_x;
+ const int subsampling_y = cm->seq_params->subsampling_y;
+ const int luma_max_sb_square =
+ 1 << num_pels_log2_lookup[cm->seq_params->sb_size];
+ const int chroma_max_sb_square =
+ luma_max_sb_square >> (subsampling_x + subsampling_y);
+ const int num_tcoeffs =
+ size * (luma_max_sb_square + (num_planes - 1) * chroma_max_sb_square);
+ const int txb_unit_size = TX_SIZE_W_MIN * TX_SIZE_H_MIN;
+
+ av1_free_txb_buf(cpi);
+ // TODO(jingning): This should be further reduced.
+ CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base,
+ aom_malloc(sizeof(*cpi->coeff_buffer_base) * size));
+ CHECK_MEM_ERROR(
+ cm, coeff_buf_pool->tcoeff,
+ aom_memalign(32, sizeof(*coeff_buf_pool->tcoeff) * num_tcoeffs));
+ CHECK_MEM_ERROR(
+ cm, coeff_buf_pool->eobs,
+ aom_malloc(sizeof(*coeff_buf_pool->eobs) * num_tcoeffs / txb_unit_size));
+ CHECK_MEM_ERROR(cm, coeff_buf_pool->entropy_ctx,
+ aom_malloc(sizeof(*coeff_buf_pool->entropy_ctx) *
+ num_tcoeffs / txb_unit_size));
+
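+  // Carve the pooled coefficient, eob, and entropy-context allocations into
+  // per-superblock, per-plane slices.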
+ tran_low_t *tcoeff_ptr = coeff_buf_pool->tcoeff;
+ uint16_t *eob_ptr = coeff_buf_pool->eobs;
+ uint8_t *entropy_ctx_ptr = coeff_buf_pool->entropy_ctx;
+ for (int i = 0; i < size; i++) {
+ for (int plane = 0; plane < num_planes; plane++) {
+ const int max_sb_square =
+ (plane == AOM_PLANE_Y) ? luma_max_sb_square : chroma_max_sb_square;
+ cpi->coeff_buffer_base[i].tcoeff[plane] = tcoeff_ptr;
+ cpi->coeff_buffer_base[i].eobs[plane] = eob_ptr;
+ cpi->coeff_buffer_base[i].entropy_ctx[plane] = entropy_ctx_ptr;
+ tcoeff_ptr += max_sb_square;
+ eob_ptr += max_sb_square / txb_unit_size;
+ entropy_ctx_ptr += max_sb_square / txb_unit_size;
+ }
+ }
+}
+
+void av1_free_txb_buf(AV1_COMP *cpi) {
+ CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool;
+ aom_free(cpi->coeff_buffer_base);
+ cpi->coeff_buffer_base = NULL;
+ aom_free(coeff_buf_pool->tcoeff);
+ coeff_buf_pool->tcoeff = NULL;
+ aom_free(coeff_buf_pool->eobs);
+ coeff_buf_pool->eobs = NULL;
+ aom_free(coeff_buf_pool->entropy_ctx);
+ coeff_buf_pool->entropy_ctx = NULL;
+}
+
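+// Writes (level + 1) as an Exp-Golomb code: (length - 1) zero bits followed
+// by the binary representation of (level + 1), MSB first. For example,
+// level = 4 gives x = 5 (binary 101), so the emitted bits are 0 0 1 0 1.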
+static void write_golomb(aom_writer *w, int level) {
+ int x = level + 1;
+ int i = x;
+ int length = 0;
+
+ while (i) {
+ i >>= 1;
+ ++length;
+ }
+ assert(length > 0);
+
+ for (i = 0; i < length - 1; ++i) aom_write_bit(w, 0);
+
+ for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01);
+}
+
+static const int8_t eob_to_pos_small[33] = {
+ 0, 1, 2, // 0-2
+ 3, 3, // 3-4
+ 4, 4, 4, 4, // 5-8
+ 5, 5, 5, 5, 5, 5, 5, 5, // 9-16
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32
+};
+
+static const int8_t eob_to_pos_large[17] = {
+  6,                               // placeholder
+ 7, // 33-64
+ 8, 8, // 65-128
+ 9, 9, 9, 9, // 129-256
+ 10, 10, 10, 10, 10, 10, 10, 10, // 257-512
+ 11 // 513-
+};
+
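+// Maps an eob value to its position token; *extra receives the offset of eob
+// within the token's group. For example, eob = 10 lies in the 9-16 group
+// (token 5), so *extra = 10 - 9 = 1.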
+int av1_get_eob_pos_token(const int eob, int *const extra) {
+ int t;
+
+ if (eob < 33) {
+ t = eob_to_pos_small[eob];
+ } else {
+ const int e = AOMMIN((eob - 1) >> 5, 16);
+ t = eob_to_pos_large[e];
+ }
+
+ *extra = eob - av1_eob_group_start[t];
+
+ return t;
+}
+
+#if CONFIG_ENTROPY_STATS
+void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size,
+ TX_CLASS tx_class, PLANE_TYPE plane,
+ FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts,
+ uint8_t allow_update_cdf) {
+#else
+void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class,
+ PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx,
+ uint8_t allow_update_cdf) {
+#endif
+ int eob_extra;
+ const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra);
+ TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+
+ switch (eob_multi_size) {
+ case 0:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi16[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->eob_flag_cdf16[plane][eob_multi_ctx], eob_pt - 1, 5);
+ break;
+ case 1:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi32[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->eob_flag_cdf32[plane][eob_multi_ctx], eob_pt - 1, 6);
+ break;
+ case 2:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi64[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->eob_flag_cdf64[plane][eob_multi_ctx], eob_pt - 1, 7);
+ break;
+ case 3:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi128[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->eob_flag_cdf128[plane][eob_multi_ctx], eob_pt - 1,
+ 8);
+ }
+ break;
+ case 4:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi256[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->eob_flag_cdf256[plane][eob_multi_ctx], eob_pt - 1,
+ 9);
+ }
+ break;
+ case 5:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi512[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->eob_flag_cdf512[plane][eob_multi_ctx], eob_pt - 1,
+ 10);
+ }
+ break;
+ case 6:
+ default:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi1024[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->eob_flag_cdf1024[plane][eob_multi_ctx], eob_pt - 1,
+ 11);
+ }
+ break;
+ }
+
+ if (av1_eob_offset_bits[eob_pt] > 0) {
+ int eob_ctx = eob_pt - 3;
+ int eob_shift = av1_eob_offset_bits[eob_pt] - 1;
+ int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+#if CONFIG_ENTROPY_STATS
+ counts->eob_extra[cdf_idx][txs_ctx][plane][eob_pt][bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->eob_extra_cdf[txs_ctx][plane][eob_ctx], bit, 2);
+ }
+}
+
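+// Derives the non-zero-map context for one coefficient. The EOB coefficient
+// uses a context determined only by its scan-position bucket (0-3); all other
+// coefficients derive their context from the magnitudes of already-coded
+// neighbors in the padded levels buffer.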
+static INLINE int get_nz_map_ctx(const uint8_t *const levels,
+ const int coeff_idx, const int bhl,
+ const int width, const int scan_idx,
+ const int is_eob, const TX_SIZE tx_size,
+ const TX_CLASS tx_class) {
+ if (is_eob) {
+ if (scan_idx == 0) return 0;
+ if (scan_idx <= (width << bhl) / 8) return 1;
+ if (scan_idx <= (width << bhl) / 4) return 2;
+ return 3;
+ }
+ const int stats =
+ get_nz_mag(levels + get_padded_idx(coeff_idx, bhl), bhl, tx_class);
+ return get_nz_map_ctx_from_stats(stats, coeff_idx, bhl, tx_size, tx_class);
+}
+
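+// Fills the padded `levels` buffer with clamped absolute coefficient values.
+// Each run of `height` entries is followed by TX_PAD_HOR zeros, and the tail
+// of the buffer is zeroed up front, so context derivation can read
+// neighboring levels without bounds checks.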
+void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
+ const int stride = height + TX_PAD_HOR;
+ uint8_t *ls = levels;
+
+ memset(levels + stride * width, 0,
+ sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
+
+ for (int i = 0; i < width; i++) {
+ for (int j = 0; j < height; j++) {
+ *ls++ = (uint8_t)clamp(abs(coeff[i * height + j]), 0, INT8_MAX);
+ }
+ for (int j = 0; j < TX_PAD_HOR; j++) {
+ *ls++ = 0;
+ }
+ }
+}
+
+void av1_get_nz_map_contexts_c(const uint8_t *const levels,
+ const int16_t *const scan, const uint16_t eob,
+ const TX_SIZE tx_size, const TX_CLASS tx_class,
+ int8_t *const coeff_contexts) {
+ const int bhl = get_txb_bhl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ for (int i = 0; i < eob; ++i) {
+ const int pos = scan[i];
+ coeff_contexts[pos] = get_nz_map_ctx(levels, pos, bhl, width, i,
+ i == eob - 1, tx_size, tx_class);
+ }
+}
+
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
+ aom_writer *w, int blk_row, int blk_col, int plane,
+ int block, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] /
+ (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+ const uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
+ const uint16_t eob = eob_txb[block];
+ const uint8_t *entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
+ const int txb_skip_ctx = entropy_ctx[block] & TXB_SKIP_CTX_MASK;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, eob == 0, ec_ctx->txb_skip_cdf[txs_ctx][txb_skip_ctx], 2);
+ if (eob == 0) return;
+
+ const TX_TYPE tx_type =
+ av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+  // Only the y plane's tx_type is transmitted.
+ if (plane == 0) {
+ av1_write_tx_type(cm, xd, tx_type, tx_size, w);
+ }
+
+ int eob_extra;
+ const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra);
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+ switch (eob_multi_size) {
+ case 0:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], 5);
+ break;
+ case 1:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], 6);
+ break;
+ case 2:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], 7);
+ break;
+ case 3:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], 8);
+ break;
+ case 4:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], 9);
+ break;
+ case 5:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], 10);
+ break;
+ default:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11);
+ break;
+ }
+
+ const int eob_offset_bits = av1_eob_offset_bits[eob_pt];
+ if (eob_offset_bits > 0) {
+ const int eob_ctx = eob_pt - 3;
+ int eob_shift = eob_offset_bits - 1;
+ int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+ aom_write_symbol(w, bit,
+ ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2);
+ for (int i = 1; i < eob_offset_bits; i++) {
+ eob_shift = eob_offset_bits - 1 - i;
+ bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+ aom_write_bit(w, bit);
+ }
+ }
+
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, height);
+ const tran_low_t *tcoeff_txb =
+ cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type];
+ const tran_low_t *tcoeff = tcoeff_txb + BLOCK_OFFSET(block);
+ av1_txb_init_levels(tcoeff, width, height, levels);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const int16_t *const scan = scan_order->scan;
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
+
+ const int bhl = get_txb_bhl(tx_size);
+ for (int c = eob - 1; c >= 0; --c) {
+ const int pos = scan[c];
+ const int coeff_ctx = coeff_contexts[pos];
+ const tran_low_t v = tcoeff[pos];
+ const tran_low_t level = abs(v);
+
+ if (c == eob - 1) {
+ aom_write_symbol(
+ w, AOMMIN(level, 3) - 1,
+ ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx], 3);
+ } else {
+ aom_write_symbol(w, AOMMIN(level, 3),
+ ec_ctx->coeff_base_cdf[txs_ctx][plane_type][coeff_ctx],
+ 4);
+ }
+ if (level > NUM_BASE_LEVELS) {
+      // The level exceeds the base levels; code the base-range symbols.
+ const int base_range = level - 1 - NUM_BASE_LEVELS;
+ const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class);
+ aom_cdf_prob *cdf =
+ ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx];
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+ aom_write_symbol(w, k, cdf, BR_CDF_SIZE);
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+ }
+
+ // Loop to code all signs in the transform block,
+ // starting with the sign of DC (if applicable)
+ for (int c = 0; c < eob; ++c) {
+ const tran_low_t v = tcoeff[scan[c]];
+ const tran_low_t level = abs(v);
+ const int sign = (v < 0) ? 1 : 0;
+ if (level) {
+ if (c == 0) {
+ const int dc_sign_ctx =
+ (entropy_ctx[block] >> DC_SIGN_CTX_SHIFT) & DC_SIGN_CTX_MASK;
+ aom_write_symbol(w, sign, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx],
+ 2);
+ } else {
+ aom_write_bit(w, sign);
+ }
+ if (level > COEFF_BASE_RANGE + NUM_BASE_LEVELS)
+ write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS);
+ }
+ }
+}
+
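+// Writes the coefficients of every transform block in an intra macroblock,
+// iterating over 64x64 processing units, then planes, then transform blocks.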
+void av1_write_intra_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+ aom_writer *w, BLOCK_SIZE bsize) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int num_planes = av1_num_planes(cm);
+ int block[MAX_MB_PLANE] = { 0 };
+ int row, col;
+ assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
+ xd->plane[0].subsampling_y));
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+ int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+ int mu_blocks_high = mi_size_high[max_unit_bsize];
+ mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+ for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
+ for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+ const int step = stepr * stepc;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int unit_height = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y);
+ const int unit_width = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x);
+ for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+ blk_row += stepr) {
+ for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+ blk_col += stepc) {
+ av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane,
+ block[plane], tx_size);
+ block[plane] += step;
+ }
+ }
+ }
+ }
+ }
+}
+
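+// Sums the absolute quantized levels (capped at COEFF_CONTEXT_MASK) and folds
+// the DC sign into the upper bits; the result seeds the above/left entropy
+// contexts consumed by neighboring transform blocks.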
+uint8_t av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+ const SCAN_ORDER *scan_order, int eob) {
+ const int16_t *const scan = scan_order->scan;
+ int cul_level = 0;
+ int c;
+
+ if (eob == 0) return 0;
+ for (c = 0; c < eob; ++c) {
+ cul_level += abs(qcoeff[scan[c]]);
+ if (cul_level > COEFF_CONTEXT_MASK) break;
+ }
+
+ cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
+ set_dc_sign(&cul_level, qcoeff[0]);
+
+ return (uint8_t)cul_level;
+}
+
+static void update_tx_type_count(const AV1_COMP *cpi, const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int blk_row, int blk_col,
+ int plane, TX_SIZE tx_size,
+ FRAME_COUNTS *counts,
+ uint8_t allow_update_cdf) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ int is_inter = is_inter_block(mbmi);
+ const int reduced_tx_set_used = cm->features.reduced_tx_set_used;
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+#if !CONFIG_ENTROPY_STATS
+ (void)counts;
+#endif // !CONFIG_ENTROPY_STATS
+
+  // Only the y plane's tx_type is updated.
+ if (plane > 0) return;
+ const TX_TYPE tx_type = av1_get_tx_type(xd, PLANE_TYPE_Y, blk_row, blk_col,
+ tx_size, reduced_tx_set_used);
+ if (is_inter) {
+ if (cpi->oxcf.txfm_cfg.use_inter_dct_only) {
+ assert(tx_type == DCT_DCT);
+ }
+ } else {
+ if (cpi->oxcf.txfm_cfg.use_intra_dct_only) {
+ assert(tx_type == DCT_DCT);
+ } else if (cpi->oxcf.txfm_cfg.use_intra_default_tx_only) {
+ const TX_TYPE default_type = get_default_tx_type(
+ PLANE_TYPE_Y, xd, tx_size, cpi->use_screen_content_tools);
+ (void)default_type;
+      // TODO(kyslov): We don't always respect the use_intra_default_tx_only
+      // flag in the NonRD and REALTIME cases. Specifically, we ignore it in
+      // hybrid intra mode search, when picking an intra mode in nonRD inter
+      // mode search, and in RD REALTIME mode when we limit TX type usage.
+      // We need to fix the txfm cfg for these cases; until then, relax the
+      // assert.
+ assert(tx_type == default_type || cpi->sf.rt_sf.use_nonrd_pick_mode ||
+ cpi->oxcf.mode == REALTIME);
+ }
+ }
+
+ if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
+ cm->quant_params.base_qindex > 0 && !mbmi->skip_txfm &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const int eset = get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
+ if (eset > 0) {
+ const TxSetType tx_set_type =
+ av1_get_ext_tx_set_type(tx_size, is_inter, reduced_tx_set_used);
+ if (is_inter) {
+ if (allow_update_cdf) {
+ update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]],
+ av1_ext_tx_ind[tx_set_type][tx_type],
+ av1_num_ext_tx_set[tx_set_type]);
+ }
+#if CONFIG_ENTROPY_STATS
+ ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]]
+ [av1_ext_tx_ind[tx_set_type][tx_type]];
+#endif // CONFIG_ENTROPY_STATS
+ } else {
+ PREDICTION_MODE intra_dir;
+ if (mbmi->filter_intra_mode_info.use_filter_intra)
+ intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info
+ .filter_intra_mode];
+ else
+ intra_dir = mbmi->mode;
+#if CONFIG_ENTROPY_STATS
+ ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][intra_dir]
+ [av1_ext_tx_ind[tx_set_type][tx_type]];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(
+ fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][intra_dir],
+ av1_ext_tx_ind[tx_set_type][tx_type],
+ av1_num_ext_tx_set[tx_set_type]);
+ }
+ }
+ }
+ }
+}
+
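+// Updates entropy-coding CDFs/counts for one transform block and records its
+// coefficients, eob, and contexts into the coded-block buffers consumed later
+// by the pack-bitstream stage.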
+void av1_update_and_record_txb_context(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct tokenize_b_args *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const int eob = p->eobs[block];
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *qcoeff = p->qcoeff + block_offset;
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const TX_TYPE tx_type =
+ av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ tran_low_t *tcoeff;
+ assert(args->dry_run != DRY_RUN_COSTCOEFFS);
+ if (args->dry_run == OUTPUT_ENABLED) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane,
+ pd->above_entropy_context + blk_col,
+ pd->left_entropy_context + blk_row, &txb_ctx);
+ const int bhl = get_txb_bhl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const uint8_t allow_update_cdf = args->allow_update_cdf;
+ const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#if CONFIG_ENTROPY_STATS
+ int cdf_idx = cm->coef_cdf_category;
+ ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx],
+ eob == 0, 2);
+ }
+
+ CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
+ const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] /
+ (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+ uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
+ uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
+ entropy_ctx[block] = txb_ctx.txb_skip_ctx;
+ eob_txb[block] = eob;
+
+ if (eob == 0) {
+ av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col,
+ blk_row);
+ return;
+ }
+ const int segment_id = mbmi->segment_id;
+ const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+ tran_low_t *tcoeff_txb =
+ cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type];
+ tcoeff = tcoeff_txb + block_offset;
+ memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
+
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, height);
+ av1_txb_init_levels(tcoeff, width, height, levels);
+ update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size,
+ td->counts, allow_update_cdf);
+
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const int16_t *const scan = scan_order->scan;
+
+ // record tx type usage
+ td->rd_counts.tx_type_used[tx_size][tx_type]++;
+
+#if CONFIG_ENTROPY_STATS
+ av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx,
+ td->counts, allow_update_cdf);
+#else
+ av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx,
+ allow_update_cdf);
+#endif
+
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class,
+ coeff_contexts);
+
+ for (int c = eob - 1; c >= 0; --c) {
+ const int pos = scan[c];
+ const int coeff_ctx = coeff_contexts[pos];
+ const tran_low_t v = qcoeff[pos];
+ const tran_low_t level = abs(v);
+      /* abs_sum_level is needed to decide the job scheduling order for
+       * pack-bitstream multi-threading. It is not needed when
+       * multi-threading is disabled. */
+ if (cpi->mt_info.pack_bs_mt_enabled) td->abs_sum_level += level;
+
+ if (allow_update_cdf) {
+ if (c == eob - 1) {
+ assert(coeff_ctx < 4);
+ update_cdf(
+ ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx],
+ AOMMIN(level, 3) - 1, 3);
+ } else {
+ update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx],
+ AOMMIN(level, 3), 4);
+ }
+ }
+ if (c == eob - 1) {
+ assert(coeff_ctx < 4);
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type]
+ [coeff_ctx][AOMMIN(level, 3) - 1];
+ } else {
+ ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type]
+ [coeff_ctx][AOMMIN(level, 3)];
+#endif
+ }
+ if (level > NUM_BASE_LEVELS) {
+ const int base_range = level - 1 - NUM_BASE_LEVELS;
+ const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class);
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)]
+ [plane_type][br_ctx],
+ k, BR_CDF_SIZE);
+ }
+ for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) {
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type]
+ [lps][br_ctx][lps == k];
+#endif // CONFIG_ENTROPY_STATS
+ if (lps == k) break;
+ }
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)]
+ [plane_type][br_ctx][k];
+#endif
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+ }
+ // Update the context needed to code the DC sign (if applicable)
+ if (tcoeff[0] != 0) {
+ const int dc_sign = (tcoeff[0] < 0) ? 1 : 0;
+ const int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2);
+ entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT;
+ }
+ } else {
+ tcoeff = qcoeff;
+ }
+ const uint8_t cul_level =
+ av1_get_txb_entropy_context(tcoeff, scan_order, eob);
+ av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level,
+ blk_col, blk_row);
+}
+
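+// Record-only variant of av1_update_and_record_txb_context: stores the
+// coefficients, eobs, and entropy contexts without updating any CDFs (counts
+// are still gathered under CONFIG_ENTROPY_STATS).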
+void av1_record_txb_context(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct tokenize_b_args *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const int eob = p->eobs[block];
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *qcoeff = p->qcoeff + block_offset;
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const TX_TYPE tx_type =
+ av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ tran_low_t *tcoeff;
+ assert(args->dry_run != DRY_RUN_COSTCOEFFS);
+ if (args->dry_run == OUTPUT_ENABLED) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane,
+ pd->above_entropy_context + blk_col,
+ pd->left_entropy_context + blk_row, &txb_ctx);
+#if CONFIG_ENTROPY_STATS
+ const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size);
+ const int bhl = get_txb_bhl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ int cdf_idx = cm->coef_cdf_category;
+ ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
+#endif // CONFIG_ENTROPY_STATS
+
+ CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
+ const int txb_offset = x->mbmi_ext_frame->cb_offset[plane_type] /
+ (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+ uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
+ uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
+ entropy_ctx[block] = txb_ctx.txb_skip_ctx;
+ eob_txb[block] = eob;
+
+ if (eob == 0) {
+ av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col,
+ blk_row);
+ return;
+ }
+ const int segment_id = mbmi->segment_id;
+ const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+ tran_low_t *tcoeff_txb =
+ cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset[plane_type];
+ tcoeff = tcoeff_txb + block_offset;
+ memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
+
+#if CONFIG_ENTROPY_STATS
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, height);
+ av1_txb_init_levels(tcoeff, width, height, levels);
+ update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size,
+ td->counts, 0 /*allow_update_cdf*/);
+
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const bool do_coeff_scan = true;
+#else
+ const bool do_coeff_scan = cpi->mt_info.pack_bs_mt_enabled;
+#endif
+ const int16_t *const scan = scan_order->scan;
+
+ // record tx type usage
+ td->rd_counts.tx_type_used[tx_size][tx_type]++;
+
+#if CONFIG_ENTROPY_STATS
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx,
+ td->counts, 0 /*allow_update_cdf*/);
+
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class,
+ coeff_contexts);
+#endif
+
+ for (int c = eob - 1; (c >= 0) && do_coeff_scan; --c) {
+ const int pos = scan[c];
+ const tran_low_t v = qcoeff[pos];
+ const tran_low_t level = abs(v);
+      /* abs_sum_level is needed to decide the job scheduling order for
+       * pack-bitstream multi-threading. It is not needed when
+       * multi-threading is disabled. */
+ if (cpi->mt_info.pack_bs_mt_enabled) td->abs_sum_level += level;
+
+#if CONFIG_ENTROPY_STATS
+ const int coeff_ctx = coeff_contexts[pos];
+ if (c == eob - 1) {
+ assert(coeff_ctx < 4);
+ ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type]
+ [coeff_ctx][AOMMIN(level, 3) - 1];
+ } else {
+ ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type]
+ [coeff_ctx][AOMMIN(level, 3)];
+ }
+ if (level > NUM_BASE_LEVELS) {
+ const int base_range = level - 1 - NUM_BASE_LEVELS;
+ const int br_ctx = get_br_ctx(levels, pos, bhl, tx_class);
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+ for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) {
+ ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type]
+ [lps][br_ctx][lps == k];
+ if (lps == k) break;
+ }
+ ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)]
+ [plane_type][br_ctx][k];
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+#endif
+ }
+ // Update the context needed to code the DC sign (if applicable)
+ if (tcoeff[0] != 0) {
+ const int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+#if CONFIG_ENTROPY_STATS
+ const int dc_sign = (tcoeff[0] < 0) ? 1 : 0;
+ ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign];
+#endif // CONFIG_ENTROPY_STATS
+ entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT;
+ }
+ } else {
+ tcoeff = qcoeff;
+ }
+ const uint8_t cul_level =
+ av1_get_txb_entropy_context(tcoeff, scan_order, eob);
+ av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level,
+ blk_col, blk_row);
+}
+
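+// Walks every transform block of an intra macroblock and dispatches to the
+// CDF-updating or record-only visitor depending on allow_update_cdf.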
+void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ uint8_t allow_update_cdf) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run };
+ if (mbmi->skip_txfm) {
+ av1_reset_entropy_context(xd, bsize, num_planes);
+ return;
+ }
+ const foreach_transformed_block_visitor visit =
+ allow_update_cdf ? av1_update_and_record_txb_context
+ : av1_record_txb_context;
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+ av1_foreach_transformed_block_in_plane(xd, plane_bsize, plane, visit, &arg);
+ }
+}
+
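+// Returns the coefficient buffer of the superblock containing (mi_row,
+// mi_col).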
+CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row,
+ int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const int stride =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2);
+ const int offset =
+ (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
+ return cpi->coeff_buffer_base + offset;
+}
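+
+/* A worked example of the offset arithmetic above, assuming a hypothetical
+ * 1920x1080 frame coded with 128x128 superblocks (mib_size_log2 == 5, i.e.
+ * 32 mi units per superblock side):
+ *
+ *   mi_cols = 1920 / 4 = 480
+ *   stride  = CEIL_POWER_OF_TWO(480, 5) = (480 + 31) >> 5 = 15
+ *   For mi_row = 64, mi_col = 96:
+ *   offset  = (64 >> 5) * 15 + (96 >> 5) = 2 * 15 + 3 = 33
+ *
+ * i.e. the returned pointer is the 34th CB_COEFF_BUFFER in raster order. */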
diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h
new file mode 100644
index 0000000000..67b94046b4
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodetxb.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODETXB_H_
+#define AOM_AV1_ENCODER_ENCODETXB_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "aom_dsp/bitwriter.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+#define TXB_SKIP_CTX_MASK 15
+#define DC_SIGN_CTX_SHIFT 4
+#define DC_SIGN_CTX_MASK 3
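+
+/* A minimal sketch of how these masks unpack the per-block entropy context
+ * byte; dc_sign_ctx is packed with DC_SIGN_CTX_SHIFT in
+ * av1_update_and_record_txb_context(). The variable names below are
+ * illustrative only:
+ *
+ *   const uint8_t ctx = entropy_ctx[block];
+ *   const int txb_skip_ctx = ctx & TXB_SKIP_CTX_MASK;
+ *   const int dc_sign_ctx = (ctx >> DC_SIGN_CTX_SHIFT) & DC_SIGN_CTX_MASK;
+ */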
+
+int av1_get_eob_pos_token(const int eob, int *const extra);
+
+/*!\endcond */
+/*!\brief Allocate the memory resources for all the macroblocks in the current
+ * coding frame.
+ * \ingroup coefficient_coding
+ *
+ * Each macroblock will need a \ref CB_COEFF_BUFFER to store information for
+ * rate-distortion optimization and entropy coding of transform coefficients.
+ *
+ * \param[in] cpi Top-level encoder structure
+ */
+void av1_alloc_txb_buf(AV1_COMP *cpi);
+/*!\brief Free the memory resources for all the macroblocks in the current
+ * coding frame.
+ * \ingroup coefficient_coding
+ *
+ * See \ref av1_alloc_txb_buf and \ref CB_COEFF_BUFFER for more details.
+ *
+ * \param[in] cpi Top-level encoder structure
+ */
+void av1_free_txb_buf(AV1_COMP *cpi);
+
+/*!\brief Write quantized coefficients in a transform block into bitstream using
+ * entropy coding.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function will write the quantized coefficients in a transform block into
+ * the bitstream using entropy coding.
+ *
+ * The coding steps are as follows.
+ *
+ * 1) Code the end of block position "eob", which is the scan index of the
+ * last non-zero coefficient plus one.
+ *
+ * 2) Code the lower magnitude level (<= COEFF_BASE_RANGE + NUM_BASE_LEVELS)
+ * for each coefficient in reversed scan order.
+ *
+ * 3) Code the sign and higher magnitude level
+ * (> COEFF_BASE_RANGE + NUM_BASE_LEVELS) in forward scan order.
+ *
+ * \param[in] cm Top-level structure shared by encoder and
+ * decoder
+ * \param[in] x Pointer to structure holding the data for the
+ * current encoding macroblock
+ * \param[in] w Entropy coding write pointer
+ * \param[in] blk_row The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane
+ * \param[in] blk_col The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane
+ * \param[in] plane The index of the current plane
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It's defined by the number of 4x4 units that have been coded
+ * before the current transform block
+ * \param[in] tx_size The given transform size
+ */
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
+ aom_writer *w, int blk_row, int blk_col, int plane,
+ int block, TX_SIZE tx_size);
+
+/*!\brief Write quantized coefficients of all transform blocks in an intra
+ * macroblock into the bitstream using entropy coding.
+ *
+ * \ingroup coefficient_coding
+ *
+ * All transform blocks in the intra macroblock share the same transform size.
+ *
+ * This function uses \ref av1_write_coeffs_txb() to code each transform block in
+ * raster order.
+ *
+ * \param[in] cm Top-level structure shared by encoder and
+ * decoder
+ * \param[in] x Pointer to structure holding the data for the
+ * current encoding macroblock
+ * \param[in] w Entropy coding write pointer
+ * \param[in] bsize Block size of the current macroblock
+ */
+void av1_write_intra_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
+ aom_writer *w, BLOCK_SIZE bsize);
+
+/*!\brief Pack the context info of the current transform block into a uint8_t.
+ * \ingroup coefficient_coding
+ *
+ * This context info will be collected and consolidated by its neighbor
+ * transform blocks for coding transform block skip flag (tx_skip) and
+ * the sign of DC coefficient (dc_sign).
+ *
+ * \param[in] qcoeff Buffer of quantized coefficients
+ * \param[in] scan_order Coding order of coefficients in the transform
+ * block
+ * \param[in] eob The scan index of last non-zero coefficient plus
+ * one
+ */
+uint8_t av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+ const SCAN_ORDER *scan_order, int eob);
+
+/*!\brief Update the probability model (cdf) and the entropy context related to
+ * coefficient coding for all transform blocks in the intra macroblock.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function will go through each transform block in the intra macroblock
+ * and call \ref av1_update_and_record_txb_context to update the probability
+ * model and entropy context properly.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] td Top-level multithreading structure
+ * \param[in] dry_run Whether this is a dry run.
+ * \param[in] bsize Block size of the current macroblock
+ * \param[in] allow_update_cdf Allowed to update probability model (cdf) or
+ * not.
+ */
+void av1_update_intra_mb_txb_context(const AV1_COMP *cpi, ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ uint8_t allow_update_cdf);
+
+/*!\brief Update the probability model (cdf) and the entropy context related to
+ * coefficient coding for a transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function has two modes: a regular mode and a dry run.
+ *
+ * Regular mode:
+ *
+ * The probability model (cdf) for each coding symbol in the
+ * transform block will be updated.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * Dry run:
+ *
+ * The probability model update will be skipped.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * \param[in] plane The index of the current plane.
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It's defined by the number of 4x4 units that have been coded
+ * before the current transform block.
+ * \param[in] blk_row The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in] blk_col The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in] plane_bsize Block size for this plane. When the video source
+ * uses chroma subsampling, the block size of UV planes will be smaller than the
+ * block size of Y plane.
+ * \param[in] tx_size The given transform size.
+ * \param[in] arg This parameter will be translated into
+ * tokenize_b_args, in which RUN_TYPE indicates using regular mode or dry run.
+ */
+void av1_update_and_record_txb_context(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg);
+
+/*!\brief Update the entropy context related to coefficient coding for a
+ * transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function has two modes: a regular mode and a dry run.
+ *
+ * Regular mode:
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * Dry run:
+ *
+ * The probability model update will be skipped.
+ *
+ * The entropy context of this transform block will be updated.
+ *
+ * \param[in] plane The index of the current plane.
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It's defined by the number of 4x4 units that have been coded
+ * before the current transform block.
+ * \param[in] blk_row The row index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in] blk_col The col index of the current transform block
+ * in the macroblock. Each unit has 4 pixels in y plane.
+ * \param[in] plane_bsize Block size for this plane. When the video source
+ * uses chroma subsampling, the block size of UV planes will be smaller than the
+ * block size of Y plane.
+ * \param[in] tx_size The given transform size.
+ * \param[in] arg This parameter will be translated into
+ * tokenize_b_args, in which RUN_TYPE indicates using regular mode or dry run.
+ */
+void av1_record_txb_context(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
+
+/*!\brief Get the corresponding \ref CB_COEFF_BUFFER of the current macroblock.
+ *
+ * \ingroup coefficient_coding
+ *
+ * The macroblock's location is described by mi_row and mi_col, row and column
+ * mi indexes in the coding frame.
+ *
+ * Each mi unit is a 4x4 pixel block.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] mi_row Row mi index of the current transform block
+ * in the frame.
+ * \param[in] mi_col Column mi index of the current transform
+ * block in the frame.
+ * \return CB_COEFF_BUFFER* Pointer of \ref CB_COEFF_BUFFER associated
+ * to this macroblock.
+ */
+CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row,
+ int mi_col);
+
+/*!\brief Returns the entropy cost associated with skipping the current
+ * transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * \param[in] coeff_costs Table of entropy cost for coefficient coding.
+ * \param[in] txb_ctx Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in] plane The index of the current plane
+ * \param[in] tx_size The transform size
+ */
+static INLINE int av1_cost_skip_txb(const CoeffCosts *coeff_costs,
+ const TXB_CTX *const txb_ctx, int plane,
+ TX_SIZE tx_size) {
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const LV_MAP_COEFF_COST *const coeff_costs_ =
+ &coeff_costs->coeff_costs[txs_ctx][plane_type];
+ return coeff_costs_->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+}
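+
+/* The [1] index above selects the cost of signaling txb_skip = 1. A sketch of
+ * how a caller might weigh skipping against coding the block (hypothetical
+ * usage, not an upstream helper):
+ *
+ *   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ *   const PLANE_TYPE plane_type = get_plane_type(plane);
+ *   const int skip_cost =
+ *       av1_cost_skip_txb(coeff_costs, &txb_ctx, plane, tx_size);
+ *   const int no_skip_cost =
+ *       coeff_costs->coeff_costs[txs_ctx][plane_type]
+ *           .txb_skip_cost[txb_ctx.txb_skip_ctx][0];
+ *   // Compare skip_cost against no_skip_cost plus the coefficient rate.
+ */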
+
+/*!\cond */
+// These numbers are empirically obtained.
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
+ { 17, 13 },
+ { 16, 10 },
+};
+/*!\endcond */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODETXB_H_
diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c
new file mode 100644
index 0000000000..d6a806d504
--- /dev/null
+++ b/third_party/aom/av1/encoder/ethread.c
@@ -0,0 +1,3469 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "av1/common/warped_motion.h"
+#include "av1/common/thread_common.h"
+
+#include "av1/encoder/allintra_vis.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/ethread.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/firstpass.h"
+#endif
+#include "av1/encoder/global_motion.h"
+#include "av1/encoder/global_motion_facade.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/rdopt.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/tpl_model.h"
+
+static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
+ td->rd_counts.compound_ref_used_flag |=
+ td_t->rd_counts.compound_ref_used_flag;
+ td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag;
+
+ for (int i = 0; i < TX_SIZES_ALL; i++) {
+ for (int j = 0; j < TX_TYPES; j++)
+ td->rd_counts.tx_type_used[i][j] += td_t->rd_counts.tx_type_used[i][j];
+ }
+
+ for (int i = 0; i < BLOCK_SIZES_ALL; i++) {
+ for (int j = 0; j < 2; j++) {
+ td->rd_counts.obmc_used[i][j] += td_t->rd_counts.obmc_used[i][j];
+ }
+ }
+
+ for (int i = 0; i < 2; i++) {
+ td->rd_counts.warped_used[i] += td_t->rd_counts.warped_used[i];
+ }
+
+ td->rd_counts.seg_tmp_pred_cost[0] += td_t->rd_counts.seg_tmp_pred_cost[0];
+ td->rd_counts.seg_tmp_pred_cost[1] += td_t->rd_counts.seg_tmp_pred_cost[1];
+
+ td->rd_counts.newmv_or_intra_blocks += td_t->rd_counts.newmv_or_intra_blocks;
+}
+
+static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+ const int mib_size = cm->seq_params->mib_size;
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int row = 0; row < cm->tiles.rows; row++) {
+ for (int col = 0; col < cm->tiles.cols; col++) {
+ TileDataEnc *tile_data = &cpi->tile_data[row * cm->tiles.cols + col];
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
+ mi_row += mib_size) {
+ if (mi_row == tile_info->mi_row_start)
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ for (int mi_col = tile_info->mi_col_start;
+ mi_col < tile_info->mi_col_end; mi_col += mib_size) {
+ const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col;
+ MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str;
+ MB_MODE_INFO *mbmi = mi[0];
+ if (mbmi->skip_txfm == 1 &&
+ (mbmi->bsize == cm->seq_params->sb_size)) {
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
+ mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
+ mbmi->delta_lf_from_base = xd->delta_lf_from_base;
+ } else {
+ if (cm->delta_q_info.delta_lf_multi) {
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
+ xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
+ } else {
+ xd->delta_lf_from_base = mbmi->delta_lf_from_base;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+ int c) {
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+}
+
+void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+ int c, int cols) {
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+ (void)cols;
+}
+
+void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c) {
+#if CONFIG_MULTITHREAD
+ const int nsync = row_mt_sync->sync_range;
+
+ if (r) {
+ pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1];
+ pthread_mutex_lock(mutex);
+
+ while (c > row_mt_sync->num_finished_cols[r - 1] - nsync -
+ row_mt_sync->intrabc_extra_top_right_sb_delay) {
+ pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+#endif // CONFIG_MULTITHREAD
+}
+
+void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c,
+ int cols) {
+#if CONFIG_MULTITHREAD
+ const int nsync = row_mt_sync->sync_range;
+ int cur;
+  // Only signal when there are enough encoded blocks for the next row to run.
+ int sig = 1;
+
+ if (c < cols - 1) {
+ cur = c;
+ if (c % nsync) sig = 0;
+ } else {
+ cur = cols + nsync + row_mt_sync->intrabc_extra_top_right_sb_delay;
+ }
+
+ if (sig) {
+ pthread_mutex_lock(&row_mt_sync->mutex_[r]);
+
+ // When a thread encounters an error, num_finished_cols[r] is set to maximum
+ // column number. In this case, the AOMMAX operation here ensures that
+ // num_finished_cols[r] is not overwritten with a smaller value thus
+ // preventing the infinite waiting of threads in the relevant sync_read()
+ // function.
+ row_mt_sync->num_finished_cols[r] =
+ AOMMAX(row_mt_sync->num_finished_cols[r], cur);
+
+ pthread_cond_signal(&row_mt_sync->cond_[r]);
+ pthread_mutex_unlock(&row_mt_sync->mutex_[r]);
+ }
+#else
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+ (void)cols;
+#endif // CONFIG_MULTITHREAD
+}
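+
+// A minimal sketch of the read/write pairing these two functions implement
+// for the top-right dependency. The real calls are made through the
+// sync_read_ptr/sync_write_ptr function pointers inside av1_encode_sb_row();
+// encode_one_sb() is a hypothetical stand-in:
+//
+//   for (int c = 0; c < sb_cols; ++c) {
+//     av1_row_mt_sync_read(row_mt_sync, r, c);            // wait on row r - 1
+//     encode_one_sb(r, c);
+//     av1_row_mt_sync_write(row_mt_sync, r, c, sb_cols);  // publish progress
+//   }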
+
+// Allocate memory for row synchronization
+static void row_mt_sync_mem_alloc(AV1EncRowMultiThreadSync *row_mt_sync,
+ AV1_COMMON *cm, int rows) {
+#if CONFIG_MULTITHREAD
+ int i;
+
+ CHECK_MEM_ERROR(cm, row_mt_sync->mutex_,
+ aom_malloc(sizeof(*row_mt_sync->mutex_) * rows));
+ if (row_mt_sync->mutex_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&row_mt_sync->mutex_[i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, row_mt_sync->cond_,
+ aom_malloc(sizeof(*row_mt_sync->cond_) * rows));
+ if (row_mt_sync->cond_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&row_mt_sync->cond_[i], NULL);
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+
+ CHECK_MEM_ERROR(cm, row_mt_sync->num_finished_cols,
+ aom_malloc(sizeof(*row_mt_sync->num_finished_cols) * rows));
+
+ row_mt_sync->rows = rows;
+ // Set up nsync.
+ row_mt_sync->sync_range = 1;
+}
+
+// Deallocate row based multi-threading synchronization related mutex and data
+void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) {
+ if (row_mt_sync != NULL) {
+#if CONFIG_MULTITHREAD
+ int i;
+
+ if (row_mt_sync->mutex_ != NULL) {
+ for (i = 0; i < row_mt_sync->rows; ++i) {
+ pthread_mutex_destroy(&row_mt_sync->mutex_[i]);
+ }
+ aom_free(row_mt_sync->mutex_);
+ }
+ if (row_mt_sync->cond_ != NULL) {
+ for (i = 0; i < row_mt_sync->rows; ++i) {
+ pthread_cond_destroy(&row_mt_sync->cond_[i]);
+ }
+ aom_free(row_mt_sync->cond_);
+ }
+#endif // CONFIG_MULTITHREAD
+ aom_free(row_mt_sync->num_finished_cols);
+
+    // Clear the structure, as this call may result from a dynamic change in
+    // the tile configuration, in which case it will be followed by an
+    // _alloc() call that may fail.
+ av1_zero(*row_mt_sync);
+ }
+}
+
+static AOM_INLINE int get_sb_rows_in_frame(AV1_COMMON *cm) {
+ return CEIL_POWER_OF_TWO(cm->mi_params.mi_rows,
+ cm->seq_params->mib_size_log2);
+}
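+
+// For example, for a hypothetical 1080p frame (mi_rows = 1080 / 4 = 270) with
+// 128x128 superblocks (mib_size_log2 = 5), this evaluates to
+// CEIL_POWER_OF_TWO(270, 5) = (270 + 31) >> 5 = 9 superblock rows.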
+
+static void row_mt_mem_alloc(AV1_COMP *cpi, int max_rows, int max_cols,
+ int alloc_row_ctx) {
+ struct AV1Common *cm = &cpi->common;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int tile_col, tile_row;
+
+ av1_row_mt_mem_dealloc(cpi);
+
+ // Allocate memory for row based multi-threading
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+
+ row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_rows);
+
+ if (alloc_row_ctx) {
+ assert(max_cols > 0);
+ const int num_row_ctx = AOMMAX(1, (max_cols - 1));
+ CHECK_MEM_ERROR(cm, this_tile->row_ctx,
+ (FRAME_CONTEXT *)aom_memalign(
+ 16, num_row_ctx * sizeof(*this_tile->row_ctx)));
+ }
+ }
+ }
+ const int sb_rows = get_sb_rows_in_frame(cm);
+ CHECK_MEM_ERROR(
+ cm, enc_row_mt->num_tile_cols_done,
+ aom_malloc(sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows));
+
+ enc_row_mt->allocated_rows = max_rows;
+ enc_row_mt->allocated_cols = max_cols - 1;
+ enc_row_mt->allocated_sb_rows = sb_rows;
+}
+
+void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ const int tile_cols = enc_row_mt->allocated_tile_cols;
+ const int tile_rows = enc_row_mt->allocated_tile_rows;
+ int tile_col, tile_row;
+
+ // Free row based multi-threading sync memory
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+
+ av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
+
+ if (cpi->oxcf.algo_cfg.cdf_update_mode) {
+ aom_free(this_tile->row_ctx);
+ this_tile->row_ctx = NULL;
+ }
+ }
+ }
+ aom_free(enc_row_mt->num_tile_cols_done);
+ enc_row_mt->num_tile_cols_done = NULL;
+ enc_row_mt->allocated_rows = 0;
+ enc_row_mt->allocated_cols = 0;
+ enc_row_mt->allocated_sb_rows = 0;
+}
+
+static AOM_INLINE void assign_tile_to_thread(int *thread_id_to_tile_id,
+ int num_tiles, int num_workers) {
+ int tile_id = 0;
+ int i;
+
+ for (i = 0; i < num_workers; i++) {
+ thread_id_to_tile_id[i] = tile_id++;
+ if (tile_id == num_tiles) tile_id = 0;
+ }
+}
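+
+// The assignment above is a plain round-robin. For example, with num_tiles = 3
+// and num_workers = 5, thread_id_to_tile_id becomes { 0, 1, 2, 0, 1 }.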
+
+static AOM_INLINE int get_next_job(TileDataEnc *const tile_data,
+ int *current_mi_row, int mib_size) {
+ AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+ const int mi_row_end = tile_data->tile_info.mi_row_end;
+
+ if (row_mt_sync->next_mi_row < mi_row_end) {
+ *current_mi_row = row_mt_sync->next_mi_row;
+ row_mt_sync->num_threads_working++;
+ row_mt_sync->next_mi_row += mib_size;
+ return 1;
+ }
+ return 0;
+}
+
+static AOM_INLINE void switch_tile_and_get_next_job(
+ AV1_COMMON *const cm, TileDataEnc *const tile_data, int *cur_tile_id,
+ int *current_mi_row, int *end_of_frame, int is_firstpass,
+ const BLOCK_SIZE fp_block_size) {
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+
+  int tile_id = -1;  // Stores the tile ID with the minimum processing done
+ int max_mis_to_encode = 0;
+ int min_num_threads_working = INT_MAX;
+
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &tile_data[tile_index];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+
+#if CONFIG_REALTIME_ONLY
+ int num_b_rows_in_tile =
+ av1_get_sb_rows_in_tile(cm, &this_tile->tile_info);
+ int num_b_cols_in_tile =
+ av1_get_sb_cols_in_tile(cm, &this_tile->tile_info);
+#else
+ int num_b_rows_in_tile =
+ is_firstpass
+ ? av1_get_unit_rows_in_tile(&this_tile->tile_info, fp_block_size)
+ : av1_get_sb_rows_in_tile(cm, &this_tile->tile_info);
+ int num_b_cols_in_tile =
+ is_firstpass
+ ? av1_get_unit_cols_in_tile(&this_tile->tile_info, fp_block_size)
+ : av1_get_sb_cols_in_tile(cm, &this_tile->tile_info);
+#endif
+ int theoretical_limit_on_threads =
+ AOMMIN((num_b_cols_in_tile + 1) >> 1, num_b_rows_in_tile);
+ int num_threads_working = row_mt_sync->num_threads_working;
+
+ if (num_threads_working < theoretical_limit_on_threads) {
+ int num_mis_to_encode =
+ this_tile->tile_info.mi_row_end - row_mt_sync->next_mi_row;
+
+ // Tile to be processed by this thread is selected on the basis of
+ // availability of jobs:
+ // 1) If jobs are available, tile to be processed is chosen on the
+ // basis of minimum number of threads working for that tile. If two or
+ // more tiles have same number of threads working for them, then the
+ // tile with maximum number of jobs available will be chosen.
+ // 2) If no jobs are available, then end_of_frame is reached.
+ if (num_mis_to_encode > 0) {
+ if (num_threads_working < min_num_threads_working) {
+ min_num_threads_working = num_threads_working;
+ max_mis_to_encode = 0;
+ }
+ if (num_threads_working == min_num_threads_working &&
+ num_mis_to_encode > max_mis_to_encode) {
+ tile_id = tile_index;
+ max_mis_to_encode = num_mis_to_encode;
+ }
+ }
+ }
+ }
+ }
+ if (tile_id == -1) {
+ *end_of_frame = 1;
+ } else {
+ // Update the current tile id to the tile id that will be processed next,
+ // which will be the least processed tile.
+ *cur_tile_id = tile_id;
+ const int unit_height = mi_size_high[fp_block_size];
+ get_next_job(&tile_data[tile_id], current_mi_row,
+ is_firstpass ? unit_height : cm->seq_params->mib_size);
+ }
+}
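+
+// A worked example of the theoretical thread limit used above: because of the
+// top-right dependency, active rows stay staggered by roughly two columns, so
+// a tile with num_b_cols_in_tile = 16 and num_b_rows_in_tile = 10 can keep at
+// most AOMMIN((16 + 1) >> 1, 10) = 8 threads busy.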
+
+#if !CONFIG_REALTIME_ONLY
+static void set_firstpass_encode_done(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
+ const int unit_height = mi_size_high[fp_block_size];
+
+ // In case of multithreading of firstpass encode, due to top-right
+ // dependency, the worker on a firstpass row waits for the completion of the
+ // firstpass processing of the top and top-right fp_blocks. Hence, in case a
+ // thread (main/worker) encounters an error, update the firstpass processing
+ // of every row in the frame to indicate that it is complete in order to avoid
+ // dependent workers waiting indefinitely.
+ for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *const tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ TileInfo *tile = &tile_data->tile_info;
+ AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+ const int unit_cols_in_tile =
+ av1_get_unit_cols_in_tile(tile, fp_block_size);
+ for (int mi_row = tile->mi_row_start, unit_row_in_tile = 0;
+ mi_row < tile->mi_row_end;
+ mi_row += unit_height, unit_row_in_tile++) {
+ enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile,
+ unit_cols_in_tile - 1, unit_cols_in_tile);
+ }
+ }
+ }
+}
+
+static int fp_enc_row_mt_worker_hook(void *arg1, void *unused) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *const cpi = thread_data->cpi;
+ int thread_id = thread_data->thread_id;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
+ (void)unused;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+ enc_row_mt->firstpass_mt_exit = true;
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ set_firstpass_encode_done(cpi);
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ AV1_COMMON *const cm = &cpi->common;
+ int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
+ assert(cur_tile_id != -1);
+
+ const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
+ const int unit_height = mi_size_high[fp_block_size];
+ int end_of_frame = 0;
+ while (1) {
+ int current_mi_row = -1;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+ bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit;
+ if (!firstpass_mt_exit && !get_next_job(&cpi->tile_data[cur_tile_id],
+ &current_mi_row, unit_height)) {
+ // No jobs are available for the current tile. Query for the status of
+ // other tiles and get the next job if available
+ switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
+ &current_mi_row, &end_of_frame, 1,
+ fp_block_size);
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ // When firstpass_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
+ if (firstpass_mt_exit || end_of_frame) break;
+
+ TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+ ThreadData *td = thread_data->td;
+
+ assert(current_mi_row != -1 &&
+ current_mi_row < this_tile->tile_info.mi_row_end);
+
+ const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+ av1_first_pass_row(cpi, td, this_tile, current_mi_row >> unit_height_log2,
+ fp_block_size);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+ row_mt_sync->num_threads_working--;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+#endif
+
+static void launch_loop_filter_rows(AV1_COMMON *cm, EncWorkerData *thread_data,
+ AV1EncRowMultiThreadInfo *enc_row_mt,
+ int mib_size_log2) {
+ AV1LfSync *const lf_sync = (AV1LfSync *)thread_data->lf_sync;
+ const int sb_rows = get_sb_rows_in_frame(cm);
+ AV1LfMTInfo *cur_job_info;
+ bool row_mt_exit = false;
+ (void)enc_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
+
+ while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) {
+ LFWorkerData *const lf_data = (LFWorkerData *)thread_data->lf_data;
+ const int lpf_opt_level = cur_job_info->lpf_opt_level;
+ (void)sb_rows;
+#if CONFIG_MULTITHREAD
+ const int cur_sb_row = cur_job_info->mi_row >> mib_size_log2;
+ const int next_sb_row = AOMMIN(sb_rows - 1, cur_sb_row + 1);
+ // Wait for current and next superblock row to finish encoding.
+ pthread_mutex_lock(enc_row_mt_mutex_);
+ while (!enc_row_mt->row_mt_exit &&
+ (enc_row_mt->num_tile_cols_done[cur_sb_row] < cm->tiles.cols ||
+ enc_row_mt->num_tile_cols_done[next_sb_row] < cm->tiles.cols)) {
+ pthread_cond_wait(enc_row_mt->cond_, enc_row_mt_mutex_);
+ }
+ row_mt_exit = enc_row_mt->row_mt_exit;
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ if (row_mt_exit) return;
+
+ av1_thread_loop_filter_rows(
+ lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->xd,
+ cur_job_info->mi_row, cur_job_info->plane, cur_job_info->dir,
+ lpf_opt_level, lf_sync, &thread_data->error_info, lf_data->params_buf,
+ lf_data->tx_buf, mib_size_log2);
+ }
+}
+
+static void set_encoding_done(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ const int mib_size = cm->seq_params->mib_size;
+
+ // In case of row-multithreading, due to top-right dependency, the worker on
+ // an SB row waits for the completion of the encode of the top and top-right
+  // SBs. Hence, in case a thread (main/worker) encounters an error, mark the
+  // encoding of every SB row in the frame as complete in order to keep the
+  // dependent workers of every tile from waiting indefinitely.
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileDataEnc *const this_tile =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+ const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+ for (int mi_row = tile_info->mi_row_start, sb_row_in_tile = 0;
+ mi_row < tile_info->mi_row_end;
+ mi_row += mib_size, sb_row_in_tile++) {
+ enc_row_mt->sync_write_ptr(row_mt_sync, sb_row_in_tile,
+ sb_cols_in_tile - 1, sb_cols_in_tile);
+ }
+ }
+ }
+}
+
+static bool lpf_mt_with_enc_enabled(int pipeline_lpf_mt_with_enc,
+ const int filter_level[2]) {
+ return pipeline_lpf_mt_with_enc && (filter_level[0] || filter_level[1]);
+}
+
+static int enc_row_mt_worker_hook(void *arg1, void *unused) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *const cpi = thread_data->cpi;
+ int thread_id = thread_data->thread_id;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
+#endif
+ (void)unused;
+
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ AV1LfSync *const lf_sync = thread_data->lf_sync;
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ xd->error_info = error_info;
+ AV1_COMMON *volatile const cm = &cpi->common;
+ volatile const bool do_pipelined_lpf_mt_with_enc = lpf_mt_with_enc_enabled(
+ cpi->mt_info.pipeline_lpf_mt_with_enc, cm->lf.filter_level);
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+ enc_row_mt->row_mt_exit = true;
+ // Wake up all the workers waiting in launch_loop_filter_rows() to exit in
+ // case of an error.
+ pthread_cond_broadcast(enc_row_mt->cond_);
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ set_encoding_done(cpi);
+
+ if (do_pipelined_lpf_mt_with_enc) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(lf_sync->job_mutex);
+ lf_sync->lf_mt_exit = true;
+ pthread_mutex_unlock(lf_sync->job_mutex);
+#endif
+ av1_set_vert_loop_filter_done(&cpi->common, lf_sync,
+ cpi->common.seq_params->mib_size_log2);
+ }
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
+
+ // Preallocate the pc_tree for realtime coding to reduce the cost of memory
+ // allocation.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
+ if (!thread_data->td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ } else {
+ thread_data->td->pc_root = NULL;
+ }
+
+ assert(cur_tile_id != -1);
+
+ const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
+ int end_of_frame = 0;
+ bool row_mt_exit = false;
+
+  // When the master thread does not have a valid job to process, xd->tile_ctx
+  // is not set and contains a NULL pointer. This can result in a NULL pointer
+  // access violation if it is accessed beyond the encode stage. Hence,
+  // thread_data->td->mb.e_mbd.tile_ctx is initialized with the common frame
+  // context to avoid NULL pointer access in subsequent stages.
+ thread_data->td->mb.e_mbd.tile_ctx = cm->fc;
+ while (1) {
+ int current_mi_row = -1;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+ row_mt_exit = enc_row_mt->row_mt_exit;
+    // The row_mt_exit check here can be avoided, as it is checked again after
+    // sync_read_ptr() in encode_sb_row(). However, checking it here allows
+    // the worker to return early, before calling get_next_job().
+ if (!row_mt_exit &&
+ !get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
+ cm->seq_params->mib_size)) {
+ // No jobs are available for the current tile. Query for the status of
+ // other tiles and get the next job if available
+ switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
+ &current_mi_row, &end_of_frame, 0,
+ fp_block_size);
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ // When row_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
+ if (row_mt_exit) {
+ error_info->setjmp = 0;
+ return 1;
+ }
+
+ if (end_of_frame) break;
+
+ TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+ const TileInfo *const tile_info = &this_tile->tile_info;
+ const int tile_row = tile_info->tile_row;
+ const int tile_col = tile_info->tile_col;
+ ThreadData *td = thread_data->td;
+ const int sb_row = current_mi_row >> mib_size_log2;
+
+ assert(current_mi_row != -1 && current_mi_row <= tile_info->mi_row_end);
+
+ td->mb.e_mbd.tile_ctx = td->tctx;
+ td->mb.tile_pb_ctx = &this_tile->tctx;
+ td->abs_sum_level = 0;
+
+ if (this_tile->allow_update_cdf) {
+ td->mb.row_ctx = this_tile->row_ctx;
+ if (current_mi_row == tile_info->mi_row_start)
+ memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
+ } else {
+ memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
+ }
+
+ av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
+ &td->mb.e_mbd);
+
+ cfl_init(&td->mb.e_mbd.cfl, cm->seq_params);
+ if (td->mb.txfm_search_info.mb_rd_record != NULL) {
+ av1_crc32c_calculator_init(
+ &td->mb.txfm_search_info.mb_rd_record->crc_calculator);
+ }
+
+ av1_encode_sb_row(cpi, td, tile_row, tile_col, current_mi_row);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex_);
+#endif
+ this_tile->abs_sum_level += td->abs_sum_level;
+ row_mt_sync->num_threads_working--;
+ enc_row_mt->num_tile_cols_done[sb_row]++;
+#if CONFIG_MULTITHREAD
+ pthread_cond_broadcast(enc_row_mt->cond_);
+ pthread_mutex_unlock(enc_row_mt_mutex_);
+#endif
+ }
+ if (do_pipelined_lpf_mt_with_enc) {
+ // Loop-filter a superblock row if encoding of the current and next
+ // superblock row is complete.
+ // TODO(deepa.kg @ittiam.com) Evaluate encoder speed by interleaving
+ // encoding and loop filter stage.
+ launch_loop_filter_rows(cm, thread_data, enc_row_mt, mib_size_log2);
+ }
+ av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ thread_data->td->pc_root = NULL;
+ error_info->setjmp = 0;
+ return 1;
+}
+
+static int enc_worker_hook(void *arg1, void *unused) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *const cpi = thread_data->cpi;
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ const AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int t;
+
+ (void)unused;
+
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ // Preallocate the pc_tree for realtime coding to reduce the cost of memory
+ // allocation.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ thread_data->td->pc_root = av1_alloc_pc_tree_node(cm->seq_params->sb_size);
+ if (!thread_data->td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ } else {
+ thread_data->td->pc_root = NULL;
+ }
+
+ for (t = thread_data->start; t < tile_rows * tile_cols;
+ t += cpi->mt_info.num_workers) {
+ int tile_row = t / tile_cols;
+ int tile_col = t % tile_cols;
+
+ TileDataEnc *const this_tile =
+ &cpi->tile_data[tile_row * cm->tiles.cols + tile_col];
+ thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
+ thread_data->td->mb.tile_pb_ctx = &this_tile->tctx;
+ av1_encode_tile(cpi, thread_data->td, tile_row, tile_col);
+ }
+
+ av1_free_pc_tree_recursive(thread_data->td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ thread_data->td->pc_root = NULL;
+ error_info->setjmp = 0;
+ return 1;
+}
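+
+// In the tile-based path above, worker t starts at tile index t and strides by
+// num_workers. For example, with 6 tiles and 4 workers, worker 0 encodes tiles
+// {0, 4}, worker 1 encodes {1, 5}, and workers 2 and 3 encode {2} and {3}.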
+
+void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi) {
+ cpi->mt_info.workers = ppi->p_mt_info.workers;
+ cpi->mt_info.num_workers = ppi->p_mt_info.num_workers;
+ cpi->mt_info.tile_thr_data = ppi->p_mt_info.tile_thr_data;
+ int i;
+ for (i = MOD_FP; i < NUM_MT_MODULES; i++) {
+ cpi->mt_info.num_mod_workers[i] =
+ AOMMIN(cpi->mt_info.num_workers, ppi->p_mt_info.num_mod_workers[i]);
+ }
+}
+
+void av1_init_cdef_worker(AV1_COMP *cpi) {
+ // The allocation is done only for level 0 parallel frames. No change
+ // in config is supported in the middle of a parallel encode set, since the
+ // rest of the MT modules also do not support dynamic change of config.
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) return;
+ PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info;
+ int num_cdef_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_CDEF);
+
+ av1_alloc_cdef_buffers(&cpi->common, &p_mt_info->cdef_worker,
+ &cpi->mt_info.cdef_sync, num_cdef_workers, 1);
+ cpi->mt_info.cdef_worker = p_mt_info->cdef_worker;
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_init_lr_mt_buffers(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ AV1LrSync *lr_sync = &cpi->mt_info.lr_row_sync;
+ if (lr_sync->sync_range) {
+ if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+ return;
+ int num_lr_workers =
+ av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_LR);
+ assert(num_lr_workers <= lr_sync->num_workers);
+ lr_sync->lrworkerdata[num_lr_workers - 1].rst_tmpbuf = cm->rst_tmpbuf;
+ lr_sync->lrworkerdata[num_lr_workers - 1].rlbs = cm->rlbs;
+ }
+}
+#endif
+
+#if CONFIG_MULTITHREAD
+void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+
+ if (setjmp(cm->error->jmp)) {
+ cm->error->setjmp = 0;
+ aom_internal_error_copy(&cpi->ppi->error, cm->error);
+ }
+ cm->error->setjmp = 1;
+ // Initialize enc row MT object.
+ if (is_first_pass || cpi->oxcf.row_mt == 1) {
+ AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt;
+ if (enc_row_mt->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, enc_row_mt->mutex_,
+ aom_malloc(sizeof(*(enc_row_mt->mutex_))));
+ if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL);
+ }
+ if (enc_row_mt->cond_ == NULL) {
+ CHECK_MEM_ERROR(cm, enc_row_mt->cond_,
+ aom_malloc(sizeof(*(enc_row_mt->cond_))));
+ if (enc_row_mt->cond_) pthread_cond_init(enc_row_mt->cond_, NULL);
+ }
+ }
+
+ if (!is_first_pass) {
+ // Initialize global motion MT object.
+ AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync;
+ if (gm_sync->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, gm_sync->mutex_,
+ aom_malloc(sizeof(*(gm_sync->mutex_))));
+ if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL);
+ }
+#if !CONFIG_REALTIME_ONLY
+ // Initialize temporal filtering MT object.
+ AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync;
+ if (tf_sync->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, tf_sync->mutex_,
+ aom_malloc(sizeof(*tf_sync->mutex_)));
+ if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL);
+ }
+#endif // !CONFIG_REALTIME_ONLY
+ // Initialize CDEF MT object.
+ AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
+ if (cdef_sync->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, cdef_sync->mutex_,
+ aom_malloc(sizeof(*(cdef_sync->mutex_))));
+ if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
+ }
+
+ // Initialize loop filter MT object.
+ AV1LfSync *lf_sync = &mt_info->lf_row_sync;
+ // Number of superblock rows
+ const int sb_rows =
+ CEIL_POWER_OF_TWO(cm->height >> MI_SIZE_LOG2, MAX_MIB_SIZE_LOG2);
+ PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info;
+ int num_lf_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LPF);
+
+ if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
+ num_lf_workers > lf_sync->num_workers) {
+ av1_loop_filter_dealloc(lf_sync);
+ av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_lf_workers);
+ }
+
+ // Initialize tpl MT object.
+ AV1TplRowMultiThreadInfo *tpl_row_mt = &mt_info->tpl_row_mt;
+ if (tpl_row_mt->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, tpl_row_mt->mutex_,
+ aom_malloc(sizeof(*(tpl_row_mt->mutex_))));
+ if (tpl_row_mt->mutex_) pthread_mutex_init(tpl_row_mt->mutex_, NULL);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (is_restoration_used(cm)) {
+ // Initialize loop restoration MT object.
+ AV1LrSync *lr_sync = &mt_info->lr_row_sync;
+ int rst_unit_size = cpi->sf.lpf_sf.min_lr_unit_size;
+ int num_rows_lr = av1_lr_count_units(rst_unit_size, cm->height);
+ int num_lr_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LR);
+ if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows ||
+ num_lr_workers > lr_sync->num_workers ||
+ MAX_MB_PLANE > lr_sync->num_planes) {
+ av1_loop_restoration_dealloc(lr_sync);
+ av1_loop_restoration_alloc(lr_sync, cm, num_lr_workers, num_rows_lr,
+ MAX_MB_PLANE, cm->width);
+ }
+ }
+#endif
+
+ // Initialization of pack bitstream MT object.
+ AV1EncPackBSSync *pack_bs_sync = &mt_info->pack_bs_sync;
+ if (pack_bs_sync->mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, pack_bs_sync->mutex_,
+ aom_malloc(sizeof(*pack_bs_sync->mutex_)));
+ if (pack_bs_sync->mutex_) pthread_mutex_init(pack_bs_sync->mutex_, NULL);
+ }
+ }
+ cm->error->setjmp = 0;
+}
+#endif // CONFIG_MULTITHREAD
+
+// Computes the number of workers to be considered while allocating memory for a
+// multi-threaded module under FPMT.
+int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info,
+ MULTI_THREADED_MODULES mod_name) {
+ int num_mod_workers = p_mt_info->num_mod_workers[mod_name];
+ if (p_mt_info->num_mod_workers[MOD_FRAME_ENC] > 1) {
+ // TODO(anyone): Change num_mod_workers to num_mod_workers[MOD_FRAME_ENC].
+ // As frame parallel jobs will only perform multi-threading for the encode
+ // stage, we can limit the allocations according to num_enc_workers per
+    // frame parallel encode (a.k.a. num_mod_workers[MOD_FRAME_ENC]).
+ num_mod_workers = p_mt_info->num_workers;
+ }
+ return num_mod_workers;
+}
+
+void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass) {
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+
+ assert(p_mt_info->workers != NULL);
+ assert(p_mt_info->tile_thr_data != NULL);
+
+ int num_workers = p_mt_info->num_workers;
+ int num_enc_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_ENC);
+ assert(num_enc_workers <= num_workers);
+ for (int i = num_workers - 1; i >= 0; i--) {
+ EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i];
+
+ if (i > 0) {
+ // Allocate thread data.
+ ThreadData *td;
+ AOM_CHECK_MEM_ERROR(&ppi->error, td, aom_memalign(32, sizeof(*td)));
+ av1_zero(*td);
+ thread_data->original_td = thread_data->td = td;
+
+ // Set up shared coeff buffers.
+ av1_setup_shared_coeff_buffer(&ppi->seq_params, &td->shared_coeff_buf,
+ &ppi->error);
+ AOM_CHECK_MEM_ERROR(&ppi->error, td->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+ sizeof(*td->tmp_conv_dst)));
+
+ if (i < p_mt_info->num_mod_workers[MOD_FP]) {
+ // Set up firstpass PICK_MODE_CONTEXT.
+ td->firstpass_ctx =
+ av1_alloc_pmc(ppi->cpi, BLOCK_16X16, &td->shared_coeff_buf);
+ if (!td->firstpass_ctx)
+ aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+
+ if (!is_first_pass && i < num_enc_workers) {
+ // Set up sms_tree.
+ if (av1_setup_sms_tree(ppi->cpi, td)) {
+ aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate SMS tree");
+ }
+
+ for (int x = 0; x < 2; x++)
+ for (int y = 0; y < 2; y++)
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, td->hash_value_buffer[x][y],
+ (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*td->hash_value_buffer[0][0])));
+
+ // Allocate frame counters in thread data.
+ AOM_CHECK_MEM_ERROR(&ppi->error, td->counts,
+ aom_calloc(1, sizeof(*td->counts)));
+
+ // Allocate buffers used by palette coding mode.
+ AOM_CHECK_MEM_ERROR(&ppi->error, td->palette_buffer,
+ aom_memalign(16, sizeof(*td->palette_buffer)));
+
+ // The buffers 'tmp_pred_bufs[]', 'comp_rd_buffer' and 'obmc_buffer' are
+ // used in inter frames to store intermediate inter mode prediction
+ // results and are not required for allintra encoding mode. Hence, the
+ // memory allocations for these buffers are avoided for allintra
+ // encoding mode.
+ if (ppi->cpi->oxcf.kf_cfg.key_freq_max != 0) {
+ alloc_obmc_buffers(&td->obmc_buffer, &ppi->error);
+
+ alloc_compound_type_rd_buffers(&ppi->error, &td->comp_rd_buffer);
+
+ for (int j = 0; j < 2; ++j) {
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, td->tmp_pred_bufs[j],
+ aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*td->tmp_pred_bufs[j])));
+ }
+ }
+
+ if (is_gradient_caching_for_hog_enabled(ppi->cpi)) {
+ const int plane_types = PLANE_TYPES >> ppi->seq_params.monochrome;
+ AOM_CHECK_MEM_ERROR(&ppi->error, td->pixel_gradient_info,
+ aom_malloc(sizeof(*td->pixel_gradient_info) *
+ plane_types * MAX_SB_SQUARE));
+ }
+
+ if (is_src_var_for_4x4_sub_blocks_caching_enabled(ppi->cpi)) {
+ const BLOCK_SIZE sb_size = ppi->cpi->common.seq_params->sb_size;
+ const int mi_count_in_sb =
+ mi_size_wide[sb_size] * mi_size_high[sb_size];
+
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, td->src_var_info_of_4x4_sub_blocks,
+ aom_malloc(sizeof(*td->src_var_info_of_4x4_sub_blocks) *
+ mi_count_in_sb));
+ }
+
+ if (ppi->cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
+ const int num_64x64_blocks =
+ (ppi->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, td->vt64x64,
+ aom_malloc(sizeof(*td->vt64x64) * num_64x64_blocks));
+ }
+ }
+ }
+
+ if (!is_first_pass && ppi->cpi->oxcf.row_mt == 1 && i < num_enc_workers) {
+ if (i == 0) {
+ for (int j = 0; j < ppi->num_fp_contexts; j++) {
+ AOM_CHECK_MEM_ERROR(&ppi->error, ppi->parallel_cpi[j]->td.tctx,
+ (FRAME_CONTEXT *)aom_memalign(
+ 16, sizeof(*ppi->parallel_cpi[j]->td.tctx)));
+ }
+ } else {
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, thread_data->td->tctx,
+ (FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx)));
+ }
+ }
+ }
+
+ // Record the number of workers in encode stage multi-threading for which
+ // allocation is done.
+ p_mt_info->prev_num_enc_workers = num_enc_workers;
+}
+
+void av1_create_workers(AV1_PRIMARY *ppi, int num_workers) {
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ assert(p_mt_info->num_workers == 0);
+
+ AOM_CHECK_MEM_ERROR(&ppi->error, p_mt_info->workers,
+ aom_malloc(num_workers * sizeof(*p_mt_info->workers)));
+
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, p_mt_info->tile_thr_data,
+ aom_calloc(num_workers, sizeof(*p_mt_info->tile_thr_data)));
+
+ for (int i = 0; i < num_workers; ++i) {
+ AVxWorker *const worker = &p_mt_info->workers[i];
+ EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i];
+
+ winterface->init(worker);
+ worker->thread_name = "aom enc worker";
+
+ thread_data->thread_id = i;
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ if (i > 0) {
+ // Create threads
+ if (!winterface->reset(worker))
+ aom_internal_error(&ppi->error, AOM_CODEC_ERROR,
+ "Tile encoder thread creation failed");
+ }
+ winterface->sync(worker);
+
+ ++p_mt_info->num_workers;
+ }
+}
+
+// This function will change the state and free the mutex of corresponding
+// workers and terminate the object. The object cannot be re-used unless a call
+// to reset() is made.
+void av1_terminate_workers(AV1_PRIMARY *ppi) {
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+ for (int t = 0; t < p_mt_info->num_workers; ++t) {
+ AVxWorker *const worker = &p_mt_info->workers[t];
+ aom_get_worker_interface()->end(worker);
+ }
+}
+
+// This function returns 1 if frame parallel encode is supported for
+// the current configuration. Returns 0 otherwise.
+static AOM_INLINE int is_fpmt_config(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
+ // FPMT is enabled for AOM_Q and AOM_VBR.
+ // TODO(Tarun): Test and enable resize config.
+ if (oxcf->rc_cfg.mode == AOM_CBR || oxcf->rc_cfg.mode == AOM_CQ) {
+ return 0;
+ }
+ if (ppi->use_svc) {
+ return 0;
+ }
+ if (oxcf->tile_cfg.enable_large_scale_tile) {
+ return 0;
+ }
+ if (oxcf->dec_model_cfg.timing_info_present) {
+ return 0;
+ }
+ if (oxcf->mode != GOOD) {
+ return 0;
+ }
+ if (oxcf->tool_cfg.error_resilient_mode) {
+ return 0;
+ }
+ if (oxcf->resize_cfg.resize_mode) {
+ return 0;
+ }
+ if (oxcf->pass != AOM_RC_SECOND_PASS) {
+ return 0;
+ }
+ if (oxcf->max_threads < 2) {
+ return 0;
+ }
+ if (!oxcf->fp_mt) {
+ return 0;
+ }
+
+ return 1;
+}
+
+int av1_check_fpmt_config(AV1_PRIMARY *const ppi,
+ AV1EncoderConfig *const oxcf) {
+ if (is_fpmt_config(ppi, oxcf)) return 1;
+ // Reset frame parallel configuration for unsupported config
+ if (ppi->num_fp_contexts > 1) {
+ for (int i = 1; i < ppi->num_fp_contexts; i++) {
+ // Release the previously-used frame-buffer
+ if (ppi->parallel_cpi[i]->common.cur_frame != NULL) {
+ --ppi->parallel_cpi[i]->common.cur_frame->ref_count;
+ ppi->parallel_cpi[i]->common.cur_frame = NULL;
+ }
+ }
+
+ int cur_gf_index = ppi->cpi->gf_frame_index;
+ int reset_size = AOMMAX(0, ppi->gf_group.size - cur_gf_index);
+ av1_zero_array(&ppi->gf_group.frame_parallel_level[cur_gf_index],
+ reset_size);
+ av1_zero_array(&ppi->gf_group.is_frame_non_ref[cur_gf_index], reset_size);
+ av1_zero_array(&ppi->gf_group.src_offset[cur_gf_index], reset_size);
+ memset(&ppi->gf_group.skip_frame_refresh[cur_gf_index][0], INVALID_IDX,
+ sizeof(ppi->gf_group.skip_frame_refresh[cur_gf_index][0]) *
+ reset_size * REF_FRAMES);
+ memset(&ppi->gf_group.skip_frame_as_ref[cur_gf_index], INVALID_IDX,
+ sizeof(ppi->gf_group.skip_frame_as_ref[cur_gf_index]) * reset_size);
+ ppi->num_fp_contexts = 1;
+ }
+ return 0;
+}
+
+// A large thread count used as an upper bound when computing the maximum
+// number of enc workers possible for each resolution.
+#define MAX_THREADS 100
+
+// Computes the max number of enc workers possible for each resolution.
+static AOM_INLINE int compute_max_num_enc_workers(
+ CommonModeInfoParams *const mi_params, int mib_size_log2) {
+ int num_sb_rows = CEIL_POWER_OF_TWO(mi_params->mi_rows, mib_size_log2);
+ int num_sb_cols = CEIL_POWER_OF_TWO(mi_params->mi_cols, mib_size_log2);
+
+ return AOMMIN((num_sb_cols + 1) >> 1, num_sb_rows);
+}
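+
+// For example, for a hypothetical 1920x1080 frame with 128x128 superblocks
+// (mib_size_log2 = 5): num_sb_rows = CEIL_POWER_OF_TWO(270, 5) = 9 and
+// num_sb_cols = CEIL_POWER_OF_TWO(480, 5) = 15, so the function returns
+// AOMMIN((15 + 1) >> 1, 9) = 8.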
+
+// Computes the number of frame parallel(fp) contexts to be created
+// based on the number of max_enc_workers.
+int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
+ ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] = 0;
+ if (!av1_check_fpmt_config(ppi, oxcf)) {
+ return 1;
+ }
+ int max_num_enc_workers = compute_max_num_enc_workers(
+ &ppi->cpi->common.mi_params, ppi->cpi->common.seq_params->mib_size_log2);
+ // Scaling factors and rounding factors used to tune worker_per_frame
+ // computation.
+ int rounding_factor[2] = { 2, 4 };
+ int scaling_factor[2] = { 4, 8 };
+ int is_480p_or_lesser =
+ AOMMIN(oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height) <= 480;
+ int is_sb_64 = 0;
+ if (ppi->cpi != NULL)
+ is_sb_64 = ppi->cpi->common.seq_params->sb_size == BLOCK_64X64;
+  // In the default case, a parallel frame encode is given at least 1/4th of
+  // the theoretical limit of max enc workers. For resolutions larger than
+  // 480p with a 64x64 SB size, optimal performance is obtained with a limit
+  // of 1/8.
+ int index = (!is_480p_or_lesser && is_sb_64) ? 1 : 0;
+ int workers_per_frame =
+ AOMMAX(1, (max_num_enc_workers + rounding_factor[index]) /
+ scaling_factor[index]);
+ int max_threads = oxcf->max_threads;
+ int num_fp_contexts = max_threads / workers_per_frame;
+ // Based on empirical results, FPMT gains with multi-tile are significant when
+ // more parallel frames are available. Use FPMT with multi-tile encode only
+ // when sufficient threads are available for parallel encode of
+ // MAX_PARALLEL_FRAMES frames.
+ if (oxcf->tile_cfg.tile_columns > 0 || oxcf->tile_cfg.tile_rows > 0) {
+ if (num_fp_contexts < MAX_PARALLEL_FRAMES) num_fp_contexts = 1;
+ }
+
+ num_fp_contexts = AOMMAX(1, AOMMIN(num_fp_contexts, MAX_PARALLEL_FRAMES));
+ // Limit recalculated num_fp_contexts to ppi->num_fp_contexts.
+ num_fp_contexts = (ppi->num_fp_contexts == 1)
+ ? num_fp_contexts
+ : AOMMIN(num_fp_contexts, ppi->num_fp_contexts);
+ if (num_fp_contexts > 1) {
+ ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] =
+ AOMMIN(max_num_enc_workers * num_fp_contexts, oxcf->max_threads);
+ }
+ return num_fp_contexts;
+}
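+
+// Continuing the hypothetical 1080p, 128x128-SB example above
+// (max_num_enc_workers = 8, resolution above 480p, SB size not 64x64, so
+// index = 0): workers_per_frame = AOMMAX(1, (8 + 2) / 4) = 2, and with
+// oxcf->max_threads = 8 this gives num_fp_contexts = 8 / 2 = 4 before the
+// clamps against MAX_PARALLEL_FRAMES and ppi->num_fp_contexts.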
+
+// Computes the number of workers to process each of the parallel frames.
+static AOM_INLINE int compute_num_workers_per_frame(
+ const int num_workers, const int parallel_frame_count) {
+ // Number of level 2 workers per frame context (floor division).
+ int workers_per_frame = (num_workers / parallel_frame_count);
+ return workers_per_frame;
+}
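+
+// Note that prepare_fpmt_workers() below calls this with the *remaining*
+// worker and frame counts, so the floor division self-corrects: for example,
+// 10 workers over 3 frames are assigned as compute_num_workers_per_frame(10,
+// 3) = 3, then (7, 2) = 3, then (4, 1) = 4.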
+
+static AOM_INLINE void restore_workers_after_fpmt(
+ AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared);
+
+// Prepare level 1 workers. This function is only called for
+// parallel_frame_count > 1. This function populates the mt_info structure of
+// frame level contexts appropriately by dividing the total number of available
+// workers amongst the frames as level 2 workers. It also populates the hook and
+// data members of level 1 workers.
+static AOM_INLINE void prepare_fpmt_workers(AV1_PRIMARY *ppi,
+ AV1_COMP_DATA *first_cpi_data,
+ AVxWorkerHook hook,
+ int parallel_frame_count) {
+ assert(parallel_frame_count <= ppi->num_fp_contexts &&
+ parallel_frame_count > 1);
+
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+ int num_workers = p_mt_info->num_workers;
+
+ volatile int frame_idx = 0;
+ volatile int i = 0;
+ while (i < num_workers) {
+ // Assign level 1 worker
+ AVxWorker *frame_worker = p_mt_info->p_workers[frame_idx] =
+ &p_mt_info->workers[i];
+ AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx];
+ MultiThreadInfo *mt_info = &cur_cpi->mt_info;
+ // This 'aom_internal_error_info' pointer is not derived from the local
+ // pointer ('AV1_COMMON *const cm') to silence the compiler warning
+ // "variable 'cm' might be clobbered by 'longjmp' or 'vfork' [-Wclobbered]".
+ struct aom_internal_error_info *const error = cur_cpi->common.error;
+
+ // The jmp_buf is valid only within the scope of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error->jmp)) {
+ error->setjmp = 0;
+ restore_workers_after_fpmt(ppi, parallel_frame_count, i);
+ aom_internal_error_copy(&ppi->error, error);
+ }
+ error->setjmp = 1;
+
+ AV1_COMMON *const cm = &cur_cpi->common;
+ // Assign start of level 2 worker pool
+ mt_info->workers = &p_mt_info->workers[i];
+ mt_info->tile_thr_data = &p_mt_info->tile_thr_data[i];
+ // Assign number of workers for each frame in the parallel encode set.
+ mt_info->num_workers = compute_num_workers_per_frame(
+ num_workers - i, parallel_frame_count - frame_idx);
+ for (int j = MOD_FP; j < NUM_MT_MODULES; j++) {
+ mt_info->num_mod_workers[j] =
+ AOMMIN(mt_info->num_workers, p_mt_info->num_mod_workers[j]);
+ }
+ if (p_mt_info->cdef_worker != NULL) {
+ mt_info->cdef_worker = &p_mt_info->cdef_worker[i];
+
+ // Back up the original cdef_worker pointers.
+ mt_info->restore_state_buf.cdef_srcbuf = mt_info->cdef_worker->srcbuf;
+ const int num_planes = av1_num_planes(cm);
+ for (int plane = 0; plane < num_planes; plane++)
+ mt_info->restore_state_buf.cdef_colbuf[plane] =
+ mt_info->cdef_worker->colbuf[plane];
+ }
+#if !CONFIG_REALTIME_ONLY
+ if (is_restoration_used(cm)) {
+ // Back up the original LR buffers before update.
+ int idx = i + mt_info->num_workers - 1;
+ assert(idx < mt_info->lr_row_sync.num_workers);
+ mt_info->restore_state_buf.rst_tmpbuf =
+ mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf;
+ mt_info->restore_state_buf.rlbs =
+ mt_info->lr_row_sync.lrworkerdata[idx].rlbs;
+
+ // Update LR buffers.
+ mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf = cm->rst_tmpbuf;
+ mt_info->lr_row_sync.lrworkerdata[idx].rlbs = cm->rlbs;
+ }
+#endif
+
+ i += mt_info->num_workers;
+
+ // At this stage, only the thread-specific CDEF buffers for the current
+ // frame's 'common' and 'cdef_sync' need to be allocated. 'cdef_worker' has
+ // already been allocated across parallel frames.
+ av1_alloc_cdef_buffers(cm, &p_mt_info->cdef_worker, &mt_info->cdef_sync,
+ p_mt_info->num_workers, 0);
+
+ frame_worker->hook = hook;
+ frame_worker->data1 = cur_cpi;
+ frame_worker->data2 = (frame_idx == 0)
+ ? first_cpi_data
+ : &ppi->parallel_frames_data[frame_idx - 1];
+ frame_idx++;
+ error->setjmp = 0;
+ }
+ p_mt_info->p_num_workers = parallel_frame_count;
+}
+
+// Launch level 1 workers to perform frame parallel encode.
+static AOM_INLINE void launch_fpmt_workers(AV1_PRIMARY *ppi) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int num_workers = ppi->p_mt_info.p_num_workers;
+
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = ppi->p_mt_info.p_workers[i];
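+ // Worker 0 runs its hook synchronously on the calling thread via
+ // execute(); the remaining workers run on their own threads via launch().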
+ if (i == 0)
+ winterface->execute(worker);
+ else
+ winterface->launch(worker);
+ }
+}
+
+// Restore worker states after parallel encode.
+static AOM_INLINE void restore_workers_after_fpmt(
+ AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared) {
+ assert(parallel_frame_count <= ppi->num_fp_contexts &&
+ parallel_frame_count > 1);
+ (void)parallel_frame_count;
+
+ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+
+ int frame_idx = 0;
+ int i = 0;
+ while (i < num_fpmt_workers_prepared) {
+ AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx];
+ MultiThreadInfo *mt_info = &cur_cpi->mt_info;
+ const AV1_COMMON *const cm = &cur_cpi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ // Restore the original cdef_worker pointers.
+ if (p_mt_info->cdef_worker != NULL) {
+ mt_info->cdef_worker->srcbuf = mt_info->restore_state_buf.cdef_srcbuf;
+ for (int plane = 0; plane < num_planes; plane++)
+ mt_info->cdef_worker->colbuf[plane] =
+ mt_info->restore_state_buf.cdef_colbuf[plane];
+ }
+#if !CONFIG_REALTIME_ONLY
+ if (is_restoration_used(cm)) {
+ // Restore the original LR buffers.
+ int idx = i + mt_info->num_workers - 1;
+ assert(idx < mt_info->lr_row_sync.num_workers);
+ mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf =
+ mt_info->restore_state_buf.rst_tmpbuf;
+ mt_info->lr_row_sync.lrworkerdata[idx].rlbs =
+ mt_info->restore_state_buf.rlbs;
+ }
+#endif
+
+ frame_idx++;
+ i += mt_info->num_workers;
+ }
+}
+
+// Synchronize level 1 workers.
+static AOM_INLINE void sync_fpmt_workers(AV1_PRIMARY *ppi,
+ int frames_in_parallel_set) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int num_workers = ppi->p_mt_info.p_num_workers;
+ int had_error = 0;
+ // Points to the error info of the earliest display order frame in the
+ // parallel set.
+ const struct aom_internal_error_info *error;
+
+ // Encoding ends.
+ for (int i = num_workers - 1; i >= 0; --i) {
+ AVxWorker *const worker = ppi->p_mt_info.p_workers[i];
+ if (!winterface->sync(worker)) {
+ had_error = 1;
+ error = ppi->parallel_cpi[i]->common.error;
+ }
+ }
+
+ restore_workers_after_fpmt(ppi, frames_in_parallel_set,
+ ppi->p_mt_info.num_workers);
+
+ if (had_error) aom_internal_error_copy(&ppi->error, error);
+}
+
+static int get_compressed_data_hook(void *arg1, void *arg2) {
+ AV1_COMP *cpi = (AV1_COMP *)arg1;
+ AV1_COMP_DATA *cpi_data = (AV1_COMP_DATA *)arg2;
+ int status = av1_get_compressed_data(cpi, cpi_data);
+
+ // av1_get_compressed_data() returns AOM_CODEC_OK (0) on success, while a
+ // worker hook returns nonzero on success, hence the negation.
+ return !status;
+}
+
+ // This function encodes the raw frame data for each frame in the parallel
+ // encode set, and outputs the frame bitstream to the designated buffers.
+void av1_compress_parallel_frames(AV1_PRIMARY *const ppi,
+ AV1_COMP_DATA *const first_cpi_data) {
+ // Bitmask for the frame buffers referenced by cpi->scaled_ref_buf
+ // corresponding to frames in the current parallel encode set.
+ int ref_buffers_used_map = 0;
+ int frames_in_parallel_set = av1_init_parallel_frame_context(
+ first_cpi_data, ppi, &ref_buffers_used_map);
+ prepare_fpmt_workers(ppi, first_cpi_data, get_compressed_data_hook,
+ frames_in_parallel_set);
+ launch_fpmt_workers(ppi);
+ sync_fpmt_workers(ppi, frames_in_parallel_set);
+
+ // Release cpi->scaled_ref_buf corresponding to frames in the current parallel
+ // encode set.
+ for (int i = 0; i < frames_in_parallel_set; ++i) {
+ av1_release_scaled_references_fpmt(ppi->parallel_cpi[i]);
+ }
+ av1_decrement_ref_counts_fpmt(ppi->cpi->common.buffer_pool,
+ ref_buffers_used_map);
+}
+
+static AOM_INLINE void launch_workers(MultiThreadInfo *const mt_info,
+ int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &mt_info->workers[i];
+ worker->had_error = 0;
+ if (i == 0)
+ winterface->execute(worker);
+ else
+ winterface->launch(worker);
+ }
+}
+
+static AOM_INLINE void sync_enc_workers(MultiThreadInfo *const mt_info,
+ AV1_COMMON *const cm, int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ const AVxWorker *const worker_main = &mt_info->workers[0];
+ int had_error = worker_main->had_error;
+ struct aom_internal_error_info error_info;
+
+ // Read the error_info of main thread.
+ if (had_error) {
+ error_info = ((EncWorkerData *)worker_main->data1)->error_info;
+ }
+
+ // Encoding ends.
+ for (int i = num_workers - 1; i > 0; i--) {
+ AVxWorker *const worker = &mt_info->workers[i];
+ if (!winterface->sync(worker)) {
+ had_error = 1;
+ error_info = ((EncWorkerData *)worker->data1)->error_info;
+ }
+ }
+
+ if (had_error) aom_internal_error_copy(cm->error, &error_info);
+
+ // Restore xd->error_info of the main thread back to cm->error so that the
+ // multithreaded code, when executed using a single thread, has a valid
+ // xd->error_info.
+ MACROBLOCKD *const xd = &((EncWorkerData *)worker_main->data1)->td->mb.e_mbd;
+ xd->error_info = cm->error;
+}
+
+static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi,
+ int num_workers) {
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &cpi->mt_info.workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+ cpi->intrabc_used |= thread_data->td->intrabc_used;
+ cpi->deltaq_used |= thread_data->td->deltaq_used;
+ // Accumulate rtc counters.
+ if (!frame_is_intra_only(&cpi->common))
+ av1_accumulate_rtc_counters(cpi, &thread_data->td->mb);
+ cpi->palette_pixel_num += thread_data->td->mb.palette_pixels;
+ if (thread_data->td != &cpi->td) {
+ // Keep these conditional expressions in sync with the corresponding ones
+ // in prepare_enc_workers().
+ if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+ aom_free(thread_data->td->mv_costs_alloc);
+ thread_data->td->mv_costs_alloc = NULL;
+ }
+ if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+ aom_free(thread_data->td->dv_costs_alloc);
+ thread_data->td->dv_costs_alloc = NULL;
+ }
+ }
+ av1_dealloc_mb_data(&thread_data->td->mb, av1_num_planes(&cpi->common));
+
+ // Accumulate counters.
+ if (i > 0) {
+ av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
+ accumulate_rd_opt(&cpi->td, thread_data->td);
+ cpi->td.mb.txfm_search_info.txb_split_count +=
+ thread_data->td->mb.txfm_search_info.txb_split_count;
+#if CONFIG_SPEED_STATS
+ cpi->td.mb.txfm_search_info.tx_search_count +=
+ thread_data->td->mb.txfm_search_info.tx_search_count;
+#endif // CONFIG_SPEED_STATS
+ }
+ }
+}
+
+static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1_COMMON *const cm = &cpi->common;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &mt_info->workers[i];
+ EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ thread_data->td->intrabc_used = 0;
+ thread_data->td->deltaq_used = 0;
+ thread_data->td->abs_sum_level = 0;
+ thread_data->td->rd_counts.seg_tmp_pred_cost[0] = 0;
+ thread_data->td->rd_counts.seg_tmp_pred_cost[1] = 0;
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ thread_data->td->rd_counts = cpi->td.rd_counts;
+ thread_data->td->mb.obmc_buffer = thread_data->td->obmc_buffer;
+
+ for (int x = 0; x < 2; x++) {
+ for (int y = 0; y < 2; y++) {
+ memcpy(thread_data->td->hash_value_buffer[x][y],
+ cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y],
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*thread_data->td->hash_value_buffer[0][0]));
+ thread_data->td->mb.intrabc_hash_info.hash_value_buffer[x][y] =
+ thread_data->td->hash_value_buffer[x][y];
+ }
+ }
+ // Keep these conditional expressions in sync with the corresponding ones
+ // in accumulate_counters_enc_workers().
+ if (cpi->sf.inter_sf.mv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->mv_costs_alloc,
+ (MvCosts *)aom_malloc(sizeof(*thread_data->td->mv_costs_alloc)));
+ thread_data->td->mb.mv_costs = thread_data->td->mv_costs_alloc;
+ memcpy(thread_data->td->mb.mv_costs, cpi->td.mb.mv_costs,
+ sizeof(MvCosts));
+ }
+ if (cpi->sf.intra_sf.dv_cost_upd_level != INTERNAL_COST_UPD_OFF) {
+ // Reset dv_costs to NULL for worker threads when dv cost update is
+ // enabled so that only dv_cost_upd_level needs to be checked before the
+ // aom_free() call for the same.
+ thread_data->td->mb.dv_costs = NULL;
+ if (av1_need_dv_costs(cpi)) {
+ CHECK_MEM_ERROR(cm, thread_data->td->dv_costs_alloc,
+ (IntraBCMVCosts *)aom_malloc(
+ sizeof(*thread_data->td->dv_costs_alloc)));
+ thread_data->td->mb.dv_costs = thread_data->td->dv_costs_alloc;
+ memcpy(thread_data->td->mb.dv_costs, cpi->td.mb.dv_costs,
+ sizeof(IntraBCMVCosts));
+ }
+ }
+ }
+ av1_alloc_mb_data(cpi, &thread_data->td->mb);
+
+ // Reset rtc counters.
+ av1_init_rtc_counters(&thread_data->td->mb);
+
+ thread_data->td->mb.palette_pixels = 0;
+
+ if (thread_data->td->counts != &cpi->counts) {
+ memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
+ }
+
+ if (i > 0) {
+ thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
+ thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer;
+ thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->mb.tmp_pred_bufs[j] =
+ thread_data->td->tmp_pred_bufs[j];
+ }
+ thread_data->td->mb.pixel_gradient_info =
+ thread_data->td->pixel_gradient_info;
+
+ thread_data->td->mb.src_var_info_of_4x4_sub_blocks =
+ thread_data->td->src_var_info_of_4x4_sub_blocks;
+
+ thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] =
+ thread_data->td->mb.tmp_pred_bufs[j];
+ }
+ }
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &mt_info->workers[i];
+ EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ // Before encoding a frame, copy the thread data from cpi.
+ thread_data->td->mb = cpi->td.mb;
+ }
+ av1_alloc_src_diff_buf(cm, &thread_data->td->mb);
+ }
+}
+#endif
+
+ // Computes the number of workers for row multi-threading of the encoding
+ // stage.
+static AOM_INLINE int compute_num_enc_row_mt_workers(const AV1_COMMON *cm,
+ int max_threads) {
+ TileInfo tile_info;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int total_num_threads_row_mt = 0;
+ for (int row = 0; row < tile_rows; row++) {
+ for (int col = 0; col < tile_cols; col++) {
+ av1_tile_init(&tile_info, cm, row, col);
+ const int num_sb_rows_in_tile = av1_get_sb_rows_in_tile(cm, &tile_info);
+ const int num_sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, &tile_info);
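+ // Due to the top-right dependency, a thread on an sb row must stay about
+ // two sb columns behind the row above, so a tile keeps at most
+ // ceil(num_sb_cols_in_tile / 2) threads busy. e.g. a tile of 15x9 sbs
+ // supports AOMMIN((15 + 1) >> 1, 9) = 8 threads.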
+ total_num_threads_row_mt +=
+ AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile);
+ }
+ }
+ return AOMMIN(max_threads, total_num_threads_row_mt);
+}
+
+ // Computes the number of workers for tile multi-threading of the encoding
+ // stage.
+static AOM_INLINE int compute_num_enc_tile_mt_workers(const AV1_COMMON *cm,
+ int max_threads) {
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ return AOMMIN(max_threads, tile_cols * tile_rows);
+}
+
+ // Finds the maximum number of workers across all MT stages.
+int av1_get_max_num_workers(const AV1_COMP *cpi) {
+ int max_num_workers = 0;
+ for (int i = MOD_FP; i < NUM_MT_MODULES; i++)
+ max_num_workers =
+ AOMMAX(cpi->ppi->p_mt_info.num_mod_workers[i], max_num_workers);
+ assert(max_num_workers >= 1);
+ return AOMMIN(max_num_workers, cpi->oxcf.max_threads);
+}
+
+ // Computes the number of workers for the encoding stage (row/tile
+ // multi-threading).
+int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers) {
+ if (max_workers <= 1) return 1;
+ if (cpi->oxcf.row_mt)
+ return compute_num_enc_row_mt_workers(&cpi->common, max_workers);
+ else
+ return compute_num_enc_tile_mt_workers(&cpi->common, max_workers);
+}
+
+void av1_encode_tiles_mt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int num_workers = mt_info->num_mod_workers[MOD_ENC];
+
+ assert(IMPLIES(cpi->tile_data == NULL,
+ cpi->allocated_tiles < tile_cols * tile_rows));
+ if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi);
+
+ av1_init_tile_data(cpi);
+ num_workers = AOMMIN(num_workers, mt_info->num_workers);
+
+ prepare_enc_workers(cpi, enc_worker_hook, num_workers);
+ launch_workers(&cpi->mt_info, num_workers);
+ sync_enc_workers(&cpi->mt_info, cm, num_workers);
+ accumulate_counters_enc_workers(cpi, num_workers);
+}
+
+ // Accumulate frame counts. FRAME_COUNTS consists solely of 'unsigned int'
+ // members, so we treat it as an array, and sum over the whole length.
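+ // This assumes FRAME_COUNTS contains no padding between its members.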
+void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts,
+ const FRAME_COUNTS *counts) {
+ unsigned int *const acc = (unsigned int *)acc_counts;
+ const unsigned int *const cnt = (const unsigned int *)counts;
+
+ const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int);
+
+ for (unsigned int i = 0; i < n_counts; i++) acc[i] += cnt[i];
+}
+
+ // Computes the maximum number of sb rows and sb cols across all tiles, which
+ // are used to allocate memory for multi-threaded encoding with row-mt=1.
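+ // e.g. a tile spanning 68 mi rows with mib_size_log2 = 5 (128x128
+ // superblocks) has CEIL_POWER_OF_TWO(68, 5) = (68 + 31) >> 5 = 3 sb rows.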
+static AOM_INLINE void compute_max_sb_rows_cols(const AV1_COMMON *cm,
+ int *max_sb_rows_in_tile,
+ int *max_sb_cols_in_tile) {
+ const int tile_rows = cm->tiles.rows;
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const int num_mi_rows = cm->mi_params.mi_rows;
+ const int *const row_start_sb = cm->tiles.row_start_sb;
+ for (int row = 0; row < tile_rows; row++) {
+ const int mi_row_start = row_start_sb[row] << mib_size_log2;
+ const int mi_row_end =
+ AOMMIN(row_start_sb[row + 1] << mib_size_log2, num_mi_rows);
+ const int num_sb_rows_in_tile =
+ CEIL_POWER_OF_TWO(mi_row_end - mi_row_start, mib_size_log2);
+ *max_sb_rows_in_tile = AOMMAX(*max_sb_rows_in_tile, num_sb_rows_in_tile);
+ }
+
+ const int tile_cols = cm->tiles.cols;
+ const int num_mi_cols = cm->mi_params.mi_cols;
+ const int *const col_start_sb = cm->tiles.col_start_sb;
+ for (int col = 0; col < tile_cols; col++) {
+ const int mi_col_start = col_start_sb[col] << mib_size_log2;
+ const int mi_col_end =
+ AOMMIN(col_start_sb[col + 1] << mib_size_log2, num_mi_cols);
+ const int num_sb_cols_in_tile =
+ CEIL_POWER_OF_TWO(mi_col_end - mi_col_start, mib_size_log2);
+ *max_sb_cols_in_tile = AOMMAX(*max_sb_cols_in_tile, num_sb_cols_in_tile);
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+ // Computes the number of workers for the firstpass stage (row/tile
+ // multi-threading).
+int av1_fp_compute_num_enc_workers(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int total_num_threads_row_mt = 0;
+ TileInfo tile_info;
+
+ if (cpi->oxcf.max_threads <= 1) return 1;
+
+ for (int row = 0; row < tile_rows; row++) {
+ for (int col = 0; col < tile_cols; col++) {
+ av1_tile_init(&tile_info, cm, row, col);
+ const int num_mb_rows_in_tile =
+ av1_get_unit_rows_in_tile(&tile_info, cpi->fp_block_size);
+ const int num_mb_cols_in_tile =
+ av1_get_unit_cols_in_tile(&tile_info, cpi->fp_block_size);
+ total_num_threads_row_mt +=
+ AOMMIN((num_mb_cols_in_tile + 1) >> 1, num_mb_rows_in_tile);
+ }
+ }
+ return AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt);
+}
+
+ // Computes the maximum number of mb_rows for row multi-threading of the
+ // firstpass stage.
+static AOM_INLINE int fp_compute_max_mb_rows(const AV1_COMMON *cm,
+ BLOCK_SIZE fp_block_size) {
+ const int tile_rows = cm->tiles.rows;
+ const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const int num_mi_rows = cm->mi_params.mi_rows;
+ const int *const row_start_sb = cm->tiles.row_start_sb;
+ int max_mb_rows = 0;
+
+ for (int row = 0; row < tile_rows; row++) {
+ const int mi_row_start = row_start_sb[row] << mib_size_log2;
+ const int mi_row_end =
+ AOMMIN(row_start_sb[row + 1] << mib_size_log2, num_mi_rows);
+ const int num_mb_rows_in_tile =
+ CEIL_POWER_OF_TWO(mi_row_end - mi_row_start, unit_height_log2);
+ max_mb_rows = AOMMAX(max_mb_rows, num_mb_rows_in_tile);
+ }
+ return max_mb_rows;
+}
+#endif
+
+static void lpf_pipeline_mt_init(AV1_COMP *cpi, int num_workers) {
+ // Pipelining of loop-filtering after encoding is enabled when the
+ // loop-filter level is chosen based on quantizer and frame type. It is
+ // disabled in the case of 'LOOPFILTER_SELECTIVELY', as the stats collected
+ // during the encoding stage decide the filter level. Pipelining is also
+ // disabled for non-reference frames and for frames with the intra block
+ // copy tool enabled.
+ AV1_COMMON *cm = &cpi->common;
+ const int use_loopfilter = is_loopfilter_used(cm);
+ const int use_superres = av1_superres_scaled(cm);
+ const int use_cdef = is_cdef_used(cm);
+ const int use_restoration = is_restoration_used(cm);
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+
+ const unsigned int skip_apply_postproc_filters =
+ derive_skip_apply_postproc_filters(cpi, use_loopfilter, use_cdef,
+ use_superres, use_restoration);
+ mt_info->pipeline_lpf_mt_with_enc =
+ (cpi->oxcf.mode == REALTIME) && (cpi->oxcf.speed >= 5) &&
+ (cpi->sf.lpf_sf.lpf_pick == LPF_PICK_FROM_Q) &&
+ (cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY) &&
+ !cpi->ppi->rtc_ref.non_reference_frame && !cm->features.allow_intrabc &&
+ ((skip_apply_postproc_filters & SKIP_APPLY_LOOPFILTER) == 0);
+
+ if (!mt_info->pipeline_lpf_mt_with_enc) return;
+
+ set_postproc_filter_default_params(cm);
+
+ if (!use_loopfilter) return;
+
+ const LPF_PICK_METHOD method = cpi->sf.lpf_sf.lpf_pick;
+ assert(method == LPF_PICK_FROM_Q);
+ assert(cpi->oxcf.algo_cfg.loopfilter_control != LOOPFILTER_SELECTIVELY);
+
+ av1_pick_filter_level(cpi->source, cpi, method);
+
+ struct loopfilter *lf = &cm->lf;
+ const int plane_start = 0;
+ const int plane_end = av1_num_planes(cm);
+ int planes_to_lf[MAX_MB_PLANE];
+ if (lpf_mt_with_enc_enabled(cpi->mt_info.pipeline_lpf_mt_with_enc,
+ lf->filter_level)) {
+ set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end);
+ int lpf_opt_level = get_lpf_opt_level(&cpi->sf);
+ assert(lpf_opt_level == 2);
+
+ const int start_mi_row = 0;
+ const int end_mi_row = start_mi_row + cm->mi_params.mi_rows;
+
+ av1_loop_filter_frame_init(cm, plane_start, plane_end);
+
+ assert(mt_info->num_mod_workers[MOD_ENC] ==
+ mt_info->num_mod_workers[MOD_LPF]);
+ loop_filter_frame_mt_init(cm, start_mi_row, end_mi_row, planes_to_lf,
+ mt_info->num_mod_workers[MOD_LPF],
+ &mt_info->lf_row_sync, lpf_opt_level,
+ cm->seq_params->mib_size_log2);
+
+ for (int i = num_workers - 1; i >= 0; i--) {
+ EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+ // Initialize loopfilter data
+ thread_data->lf_sync = &mt_info->lf_row_sync;
+ thread_data->lf_data = &thread_data->lf_sync->lfdata[i];
+ loop_filter_data_reset(thread_data->lf_data, &cm->cur_frame->buf, cm, xd);
+ }
+ }
+}
+
+void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ const int sb_rows_in_frame = get_sb_rows_in_frame(cm);
+ int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id;
+ int max_sb_rows_in_tile = 0, max_sb_cols_in_tile = 0;
+ int num_workers = mt_info->num_mod_workers[MOD_ENC];
+
+ compute_max_sb_rows_cols(cm, &max_sb_rows_in_tile, &max_sb_cols_in_tile);
+ const bool alloc_row_mt_mem =
+ (enc_row_mt->allocated_tile_cols != tile_cols ||
+ enc_row_mt->allocated_tile_rows != tile_rows ||
+ enc_row_mt->allocated_rows != max_sb_rows_in_tile ||
+ enc_row_mt->allocated_cols != (max_sb_cols_in_tile - 1) ||
+ enc_row_mt->allocated_sb_rows != sb_rows_in_frame);
+ const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows;
+
+ assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data));
+ if (alloc_tile_data) {
+ av1_alloc_tile_data(cpi);
+ }
+
+ assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem));
+ if (alloc_row_mt_mem) {
+ row_mt_mem_alloc(cpi, max_sb_rows_in_tile, max_sb_cols_in_tile,
+ cpi->oxcf.algo_cfg.cdf_update_mode);
+ }
+
+ num_workers = AOMMIN(num_workers, mt_info->num_workers);
+ lpf_pipeline_mt_init(cpi, num_workers);
+
+ av1_init_tile_data(cpi);
+
+ memset(thread_id_to_tile_id, -1,
+ sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS);
+ memset(enc_row_mt->num_tile_cols_done, 0,
+ sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows_in_frame);
+ enc_row_mt->row_mt_exit = false;
+
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+
+ // Initialize num_finished_cols to -1 for all rows.
+ memset(row_mt_sync->num_finished_cols, -1,
+ sizeof(*row_mt_sync->num_finished_cols) * max_sb_rows_in_tile);
+ row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start;
+ row_mt_sync->num_threads_working = 0;
+ row_mt_sync->intrabc_extra_top_right_sb_delay =
+ av1_get_intrabc_extra_top_right_sb_delay(cm);
+
+ av1_inter_mode_data_init(this_tile);
+ av1_zero_above_context(cm, &cpi->td.mb.e_mbd,
+ this_tile->tile_info.mi_col_start,
+ this_tile->tile_info.mi_col_end, tile_row);
+ }
+ }
+
+ assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
+ num_workers);
+ prepare_enc_workers(cpi, enc_row_mt_worker_hook, num_workers);
+ launch_workers(&cpi->mt_info, num_workers);
+ sync_enc_workers(&cpi->mt_info, cm, num_workers);
+ if (cm->delta_q_info.delta_lf_present_flag) update_delta_lf_for_row_mt(cpi);
+ accumulate_counters_enc_workers(cpi, num_workers);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void dealloc_thread_data_src_diff_buf(AV1_COMP *cpi, int num_workers) {
+ for (int i = num_workers - 1; i >= 0; --i) {
+ EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i];
+ if (thread_data->td != &cpi->td)
+ av1_dealloc_src_diff_buf(&thread_data->td->mb,
+ av1_num_planes(&cpi->common));
+ }
+}
+
+void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id;
+ int num_workers = 0;
+ int max_mb_rows = 0;
+
+ max_mb_rows = fp_compute_max_mb_rows(cm, cpi->fp_block_size);
+ const bool alloc_row_mt_mem = enc_row_mt->allocated_tile_cols != tile_cols ||
+ enc_row_mt->allocated_tile_rows != tile_rows ||
+ enc_row_mt->allocated_rows != max_mb_rows;
+ const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows;
+
+ assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data));
+ if (alloc_tile_data) {
+ av1_alloc_tile_data(cpi);
+ }
+
+ assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem));
+ if (alloc_row_mt_mem) {
+ row_mt_mem_alloc(cpi, max_mb_rows, -1, 0);
+ }
+
+ av1_init_tile_data(cpi);
+
+ // For pass = 1, compute the number of workers needed. For single-pass
+ // encode (pass = 0), the number of workers is already computed.
+ if (mt_info->num_mod_workers[MOD_FP] == 0)
+ num_workers = av1_fp_compute_num_enc_workers(cpi);
+ else
+ num_workers = mt_info->num_mod_workers[MOD_FP];
+
+ memset(thread_id_to_tile_id, -1,
+ sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS);
+ enc_row_mt->firstpass_mt_exit = false;
+
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ int tile_index = tile_row * tile_cols + tile_col;
+ TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
+ AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
+
+ // Initialize num_finished_cols to -1 for all rows.
+ memset(row_mt_sync->num_finished_cols, -1,
+ sizeof(*row_mt_sync->num_finished_cols) * max_mb_rows);
+ row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start;
+ row_mt_sync->num_threads_working = 0;
+
+ // intraBC mode is not evaluated during first-pass encoding. Hence, no
+ // additional top-right delay is required.
+ row_mt_sync->intrabc_extra_top_right_sb_delay = 0;
+ }
+ }
+
+ num_workers = AOMMIN(num_workers, mt_info->num_workers);
+ assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
+ num_workers);
+ fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers);
+ launch_workers(&cpi->mt_info, num_workers);
+ sync_enc_workers(&cpi->mt_info, cm, num_workers);
+ dealloc_thread_data_src_diff_buf(cpi, num_workers);
+}
+
+void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+ int r, int c) {
+ (void)tpl_mt_sync;
+ (void)r;
+ (void)c;
+}
+
+void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+ int r, int c, int cols) {
+ (void)tpl_mt_sync;
+ (void)r;
+ (void)c;
+ (void)cols;
+}
+
+void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r,
+ int c) {
+#if CONFIG_MULTITHREAD
+ int nsync = tpl_row_mt_sync->sync_range;
+
+ if (r) {
+ pthread_mutex_t *const mutex = &tpl_row_mt_sync->mutex_[r - 1];
+ pthread_mutex_lock(mutex);
+
+ while (c > tpl_row_mt_sync->num_finished_cols[r - 1] - nsync)
+ pthread_cond_wait(&tpl_row_mt_sync->cond_[r - 1], mutex);
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)tpl_row_mt_sync;
+ (void)r;
+ (void)c;
+#endif // CONFIG_MULTITHREAD
+}
+
+void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r,
+ int c, int cols) {
+#if CONFIG_MULTITHREAD
+ int nsync = tpl_row_mt_sync->sync_range;
+ int cur;
+ // Only signal when there are enough encoded blocks for the next row to run.
+ int sig = 1;
+
+ if (c < cols - 1) {
+ cur = c;
+ if (c % nsync) sig = 0;
+ } else {
+ cur = cols + nsync;
+ }
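+ // The last column signals with cur = cols + nsync so that sync_read() of
+ // the next row can never block on this row again.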
+
+ if (sig) {
+ pthread_mutex_lock(&tpl_row_mt_sync->mutex_[r]);
+
+ // When a thread encounters an error, num_finished_cols[r] is set to the
+ // maximum column number. In this case, the AOMMAX operation here ensures
+ // that num_finished_cols[r] is not overwritten with a smaller value, thus
+ // preventing threads from waiting indefinitely in the corresponding
+ // sync_read() function.
+ tpl_row_mt_sync->num_finished_cols[r] =
+ AOMMAX(tpl_row_mt_sync->num_finished_cols[r], cur);
+
+ pthread_cond_signal(&tpl_row_mt_sync->cond_[r]);
+ pthread_mutex_unlock(&tpl_row_mt_sync->mutex_[r]);
+ }
+#else
+ (void)tpl_row_mt_sync;
+ (void)r;
+ (void)c;
+ (void)cols;
+#endif // CONFIG_MULTITHREAD
+}
+
+static AOM_INLINE void set_mode_estimation_done(AV1_COMP *cpi) {
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const BLOCK_SIZE bsize =
+ convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+ const int mi_height = mi_size_high[bsize];
+ AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt;
+ const int tplb_cols_in_tile =
+ ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]);
+ // In the case of tpl row-multithreading, due to the top-right dependency,
+ // the worker on an mb_row waits for the completion of the tpl processing of
+ // the top and top-right blocks. Hence, if a thread (main/worker) encounters
+ // an error, mark the tpl processing of every mb_row in the frame as
+ // complete in order to avoid dependent workers waiting indefinitely.
+ for (int mi_row = 0, tplb_row = 0; mi_row < mi_params->mi_rows;
+ mi_row += mi_height, tplb_row++) {
+ (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+ tplb_cols_in_tile - 1, tplb_cols_in_tile);
+ }
+}
+
+// Each worker calls tpl_worker_hook() and computes the tpl data.
+static int tpl_worker_hook(void *arg1, void *unused) {
+ (void)unused;
+ EncWorkerData *thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *cpi = thread_data->cpi;
+ AV1_COMMON *cm = &cpi->common;
+ MACROBLOCK *x = &thread_data->td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ TplTxfmStats *tpl_txfm_stats = &thread_data->td->tpl_txfm_stats;
+ TplBuffers *tpl_tmp_buffers = &thread_data->td->tpl_tmp_buffers;
+ CommonModeInfoParams *mi_params = &cm->mi_params;
+ int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working;
+
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+ AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt;
+ (void)tpl_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *tpl_error_mutex_ = tpl_row_mt->mutex_;
+#endif
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(tpl_error_mutex_);
+ tpl_row_mt->tpl_mt_exit = true;
+ pthread_mutex_unlock(tpl_error_mutex_);
+#endif
+ set_mode_estimation_done(cpi);
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+ TX_SIZE tx_size = max_txsize_lookup[bsize];
+ int mi_height = mi_size_high[bsize];
+
+ av1_init_tpl_txfm_stats(tpl_txfm_stats);
+
+ for (int mi_row = thread_data->start * mi_height; mi_row < mi_params->mi_rows;
+ mi_row += num_active_workers * mi_height) {
+ // Motion estimation row boundary
+ av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height,
+ cpi->oxcf.border_in_pixels);
+ xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+ xd->mb_to_bottom_edge =
+ GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
+ av1_mc_flow_dispenser_row(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row,
+ bsize, tx_size);
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Deallocate tpl synchronization related mutex and data.
+void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync) {
+ assert(tpl_sync != NULL);
+
+#if CONFIG_MULTITHREAD
+ if (tpl_sync->mutex_ != NULL) {
+ for (int i = 0; i < tpl_sync->rows; ++i)
+ pthread_mutex_destroy(&tpl_sync->mutex_[i]);
+ aom_free(tpl_sync->mutex_);
+ }
+ if (tpl_sync->cond_ != NULL) {
+ for (int i = 0; i < tpl_sync->rows; ++i)
+ pthread_cond_destroy(&tpl_sync->cond_[i]);
+ aom_free(tpl_sync->cond_);
+ }
+#endif // CONFIG_MULTITHREAD
+
+ aom_free(tpl_sync->num_finished_cols);
+ // Clear the structure, as this call may originate from a resize, in which
+ // case it will be followed by an _alloc() that may fail.
+ av1_zero(*tpl_sync);
+}
+
+// Allocate memory for tpl row synchronization.
+void av1_tpl_alloc(AV1TplRowMultiThreadSync *tpl_sync, AV1_COMMON *cm,
+ int mb_rows) {
+ tpl_sync->rows = mb_rows;
+#if CONFIG_MULTITHREAD
+ {
+ CHECK_MEM_ERROR(cm, tpl_sync->mutex_,
+ aom_malloc(sizeof(*tpl_sync->mutex_) * mb_rows));
+ if (tpl_sync->mutex_) {
+ for (int i = 0; i < mb_rows; ++i)
+ pthread_mutex_init(&tpl_sync->mutex_[i], NULL);
+ }
+
+ CHECK_MEM_ERROR(cm, tpl_sync->cond_,
+ aom_malloc(sizeof(*tpl_sync->cond_) * mb_rows));
+ if (tpl_sync->cond_) {
+ for (int i = 0; i < mb_rows; ++i)
+ pthread_cond_init(&tpl_sync->cond_[i], NULL);
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+ CHECK_MEM_ERROR(cm, tpl_sync->num_finished_cols,
+ aom_malloc(sizeof(*tpl_sync->num_finished_cols) * mb_rows));
+
+ // Set up nsync.
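+ // A sync range of 1 lets row r start column c as soon as row r - 1 has
+ // finished column c + 1 (the top-right dependency).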
+ tpl_sync->sync_range = 1;
+}
+
+// Each worker is prepared by assigning the hook function and individual thread
+// data.
+static AOM_INLINE void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *worker = &mt_info->workers[i];
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ // OBMC buffers are used only to init MS params and remain unused when
+ // called from tpl, hence set the buffers to defaults.
+ av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
+ if (!tpl_alloc_temp_buffers(&thread_data->td->tpl_tmp_buffers,
+ cpi->ppi->tpl_data.tpl_bsize_1d)) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating tpl data");
+ }
+ thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+ }
+ }
+}
+
+#if CONFIG_BITRATE_ACCURACY
+// Accumulate transform stats after tpl.
+static void tpl_accumulate_txfm_stats(ThreadData *main_td,
+ const MultiThreadInfo *mt_info,
+ int num_workers) {
+ TplTxfmStats *accumulated_stats = &main_td->tpl_txfm_stats;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &mt_info->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+ ThreadData *td = thread_data->td;
+ if (td != main_td) {
+ const TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats;
+ av1_accumulate_tpl_txfm_stats(tpl_txfm_stats, accumulated_stats);
+ }
+ }
+}
+#endif // CONFIG_BITRATE_ACCURACY
+
+// Implements multi-threading for tpl.
+void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ CommonModeInfoParams *mi_params = &cm->mi_params;
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ TplParams *tpl_data = &cpi->ppi->tpl_data;
+ AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync;
+ int mb_rows = mi_params->mb_rows;
+ int num_workers =
+ AOMMIN(mt_info->num_mod_workers[MOD_TPL], mt_info->num_workers);
+
+ if (mb_rows != tpl_sync->rows) {
+ av1_tpl_dealloc(tpl_sync);
+ av1_tpl_alloc(tpl_sync, cm, mb_rows);
+ }
+ tpl_sync->num_threads_working = num_workers;
+ mt_info->tpl_row_mt.tpl_mt_exit = false;
+
+ // Initialize cur_mb_col to -1 for all MB rows.
+ memset(tpl_sync->num_finished_cols, -1,
+ sizeof(*tpl_sync->num_finished_cols) * mb_rows);
+
+ prepare_tpl_workers(cpi, tpl_worker_hook, num_workers);
+ launch_workers(&cpi->mt_info, num_workers);
+ sync_enc_workers(&cpi->mt_info, cm, num_workers);
+#if CONFIG_BITRATE_ACCURACY
+ tpl_accumulate_txfm_stats(&cpi->td, &cpi->mt_info, num_workers);
+#endif // CONFIG_BITRATE_ACCURACY
+ for (int i = num_workers - 1; i >= 0; i--) {
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+ ThreadData *td = thread_data->td;
+ if (td != &cpi->td) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers);
+ }
+}
+
+// Deallocate memory for temporal filter multi-thread synchronization.
+void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync) {
+ assert(tf_sync != NULL);
+#if CONFIG_MULTITHREAD
+ if (tf_sync->mutex_ != NULL) {
+ pthread_mutex_destroy(tf_sync->mutex_);
+ aom_free(tf_sync->mutex_);
+ }
+#endif // CONFIG_MULTITHREAD
+ tf_sync->next_tf_row = 0;
+}
+
+ // Checks if a job is available. If a job is available, populates
+ // current_mb_row, advances next_tf_row, and returns 1; else returns 0.
+static AOM_INLINE int tf_get_next_job(AV1TemporalFilterSync *tf_mt_sync,
+ int *current_mb_row, int mb_rows) {
+ int do_next_row = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *tf_mutex_ = tf_mt_sync->mutex_;
+ pthread_mutex_lock(tf_mutex_);
+#endif
+ if (!tf_mt_sync->tf_mt_exit && tf_mt_sync->next_tf_row < mb_rows) {
+ *current_mb_row = tf_mt_sync->next_tf_row;
+ tf_mt_sync->next_tf_row++;
+ do_next_row = 1;
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(tf_mutex_);
+#endif
+ return do_next_row;
+}
+
+// Hook function for each thread in temporal filter multi-threading.
+static int tf_worker_hook(void *arg1, void *unused) {
+ (void)unused;
+ EncWorkerData *thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *cpi = thread_data->cpi;
+ ThreadData *td = thread_data->td;
+ TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+ AV1TemporalFilterSync *tf_sync = &cpi->mt_info.tf_sync;
+ const struct scale_factors *scale = &cpi->tf_ctx.sf;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *tf_mutex_ = tf_sync->mutex_;
+#endif
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(tf_mutex_);
+ tf_sync->tf_mt_exit = true;
+ pthread_mutex_unlock(tf_mutex_);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ const int num_planes = av1_num_planes(&cpi->common);
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+ MACROBLOCKD *mbd = &td->mb.e_mbd;
+ uint8_t *input_buffer[MAX_MB_PLANE];
+ MB_MODE_INFO **input_mb_mode_info;
+ tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes);
+ tf_setup_macroblockd(mbd, &td->tf_data, scale);
+
+ int current_mb_row = -1;
+
+ while (tf_get_next_job(tf_sync, &current_mb_row, tf_ctx->mb_rows))
+ av1_tf_do_filtering_row(cpi, td, current_mb_row);
+
+ tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes);
+
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Assigns temporal filter hook function and thread data to each worker.
+static void prepare_tf_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers, int is_highbitdepth) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ mt_info->tf_sync.next_tf_row = 0;
+ mt_info->tf_sync.tf_mt_exit = false;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *worker = &mt_info->workers[i];
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ // OBMC buffers are used only to init MS params and remain unused when
+ // called from tf, hence set the buffers to defaults.
+ av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
+ if (!tf_alloc_and_reset_data(&thread_data->td->tf_data,
+ cpi->tf_ctx.num_pels, is_highbitdepth)) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating temporal filter data");
+ }
+ }
+ }
+}
+
+// Deallocate thread specific data for temporal filter.
+static void tf_dealloc_thread_data(AV1_COMP *cpi, int num_workers,
+ int is_highbitdepth) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+ ThreadData *td = thread_data->td;
+ if (td != &cpi->td) tf_dealloc_data(&td->tf_data, is_highbitdepth);
+ }
+}
+
+// Accumulate sse and sum after temporal filtering.
+static void tf_accumulate_frame_diff(AV1_COMP *cpi, int num_workers) {
+ FRAME_DIFF *total_diff = &cpi->td.tf_data.diff;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &cpi->mt_info.workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+ ThreadData *td = thread_data->td;
+ FRAME_DIFF *diff = &td->tf_data.diff;
+ if (td != &cpi->td) {
+ total_diff->sse += diff->sse;
+ total_diff->sum += diff->sum;
+ }
+ }
+}
+
+// Implements multi-threading for temporal filter.
+void av1_tf_do_filtering_mt(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth;
+
+ int num_workers =
+ AOMMIN(mt_info->num_mod_workers[MOD_TF], mt_info->num_workers);
+
+ prepare_tf_workers(cpi, tf_worker_hook, num_workers, is_highbitdepth);
+ launch_workers(mt_info, num_workers);
+ sync_enc_workers(mt_info, cm, num_workers);
+ tf_accumulate_frame_diff(cpi, num_workers);
+ tf_dealloc_thread_data(cpi, num_workers, is_highbitdepth);
+}
+
+ // Checks if a job is available in the current direction. If a job is
+ // available, populates frame_idx and returns 1; else returns 0.
+static AOM_INLINE int get_next_gm_job(AV1_COMP *cpi, int *frame_idx,
+ int cur_dir) {
+ GlobalMotionInfo *gm_info = &cpi->gm_info;
+ JobInfo *job_info = &cpi->mt_info.gm_sync.job_info;
+
+ int total_refs = gm_info->num_ref_frames[cur_dir];
+ int8_t cur_frame_to_process = job_info->next_frame_to_process[cur_dir];
+
+ if (cur_frame_to_process < total_refs && !job_info->early_exit[cur_dir]) {
+ *frame_idx = gm_info->reference_frames[cur_dir][cur_frame_to_process].frame;
+ job_info->next_frame_to_process[cur_dir] += 1;
+ return 1;
+ }
+ return 0;
+}
+
+// Switches the current direction and calls the function get_next_gm_job() if
+// the speed feature 'prune_ref_frame_for_gm_search' is not set.
+static AOM_INLINE void switch_direction(AV1_COMP *cpi, int *frame_idx,
+ int *cur_dir) {
+ if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search) return;
+ // Switch the direction and get next job
+ *cur_dir = !(*cur_dir);
+ get_next_gm_job(cpi, frame_idx, *(cur_dir));
+}
+
+// Hook function for each thread in global motion multi-threading.
+static int gm_mt_worker_hook(void *arg1, void *unused) {
+ (void)unused;
+
+ EncWorkerData *thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *cpi = thread_data->cpi;
+ GlobalMotionInfo *gm_info = &cpi->gm_info;
+ AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync;
+ JobInfo *job_info = &gm_sync->job_info;
+ int thread_id = thread_data->thread_id;
+ GlobalMotionData *gm_thread_data = &thread_data->td->gm_data;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *gm_mt_mutex_ = gm_sync->mutex_;
+#endif
+
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(gm_mt_mutex_);
+ gm_sync->gm_mt_exit = true;
+ pthread_mutex_unlock(gm_mt_mutex_);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ int cur_dir = job_info->thread_id_to_dir[thread_id];
+ bool gm_mt_exit = false;
+ while (1) {
+ int ref_buf_idx = -1;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(gm_mt_mutex_);
+#endif
+
+ gm_mt_exit = gm_sync->gm_mt_exit;
+ // Populates ref_buf_idx (the reference frame type) for which global motion
+ // estimation will be done.
+ if (!gm_mt_exit && !get_next_gm_job(cpi, &ref_buf_idx, cur_dir)) {
+ // No jobs are available for the current direction. Switch
+ // to other direction and get the next job, if available.
+ switch_direction(cpi, &ref_buf_idx, &cur_dir);
+ }
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(gm_mt_mutex_);
+#endif
+
+ // When gm_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
+ if (gm_mt_exit || ref_buf_idx == -1) break;
+
+ // Compute global motion for the given ref_buf_idx.
+ av1_compute_gm_for_valid_ref_frames(
+ cpi, error_info, gm_info->ref_buf, ref_buf_idx,
+ gm_thread_data->motion_models, gm_thread_data->segment_map,
+ gm_info->segment_map_w, gm_info->segment_map_h);
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(gm_mt_mutex_);
+#endif
+ // If the global motion w.r.t. the current ref frame is
+ // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t.
+ // the remaining ref frames in that direction.
+ if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search &&
+ cpi->common.global_motion[ref_buf_idx].wmtype <= TRANSLATION)
+ job_info->early_exit[cur_dir] = 1;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(gm_mt_mutex_);
+#endif
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Assigns global motion hook function and thread data to each worker.
+static AOM_INLINE void prepare_gm_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ mt_info->gm_sync.gm_mt_exit = false;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *worker = &mt_info->workers[i];
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ if (thread_data->td != &cpi->td)
+ gm_alloc_data(cpi, &thread_data->td->gm_data);
+ }
+}
+
+// Assigns available threads to past/future direction.
+static AOM_INLINE void assign_thread_to_dir(int8_t *thread_id_to_dir,
+ int num_workers) {
+ int8_t frame_dir_idx = 0;
+
+ for (int i = 0; i < num_workers; i++) {
+ thread_id_to_dir[i] = frame_dir_idx++;
+ if (frame_dir_idx == MAX_DIRECTIONS) frame_dir_idx = 0;
+ }
+}
+
+ // Computes the number of workers for global motion multi-threading.
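+ // With prune_ref_frame_for_gm_search enabled, refs within a direction are
+ // evaluated in order with a possible early exit, so presumably only one
+ // worker per direction is useful.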
+static AOM_INLINE int compute_gm_workers(const AV1_COMP *cpi) {
+ int total_refs =
+ cpi->gm_info.num_ref_frames[0] + cpi->gm_info.num_ref_frames[1];
+ int num_gm_workers = cpi->sf.gm_sf.prune_ref_frame_for_gm_search
+ ? AOMMIN(MAX_DIRECTIONS, total_refs)
+ : total_refs;
+ num_gm_workers = AOMMIN(num_gm_workers, cpi->mt_info.num_workers);
+ return (num_gm_workers);
+}
+
+// Frees the memory allocated for each worker in global motion multi-threading.
+static AOM_INLINE void gm_dealloc_thread_data(AV1_COMP *cpi, int num_workers) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int j = 0; j < num_workers; j++) {
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[j];
+ ThreadData *td = thread_data->td;
+ if (td != &cpi->td) gm_dealloc_data(&td->gm_data);
+ }
+}
+
+// Implements multi-threading for global motion.
+void av1_global_motion_estimation_mt(AV1_COMP *cpi) {
+ JobInfo *job_info = &cpi->mt_info.gm_sync.job_info;
+
+ av1_zero(*job_info);
+
+ int num_workers = compute_gm_workers(cpi);
+
+ assign_thread_to_dir(job_info->thread_id_to_dir, num_workers);
+ prepare_gm_workers(cpi, gm_mt_worker_hook, num_workers);
+ launch_workers(&cpi->mt_info, num_workers);
+ sync_enc_workers(&cpi->mt_info, &cpi->common, num_workers);
+ gm_dealloc_thread_data(cpi, num_workers);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static AOM_INLINE int get_next_job_allintra(
+ AV1EncRowMultiThreadSync *const row_mt_sync, const int mi_row_end,
+ int *current_mi_row, int mib_size) {
+ if (row_mt_sync->next_mi_row < mi_row_end) {
+ *current_mi_row = row_mt_sync->next_mi_row;
+ row_mt_sync->num_threads_working++;
+ row_mt_sync->next_mi_row += mib_size;
+ return 1;
+ }
+ return 0;
+}
+
+static AOM_INLINE void prepare_wiener_var_workers(AV1_COMP *const cpi,
+ AVxWorkerHook hook,
+ const int num_workers) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *const worker = &mt_info->workers[i];
+ EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ thread_data->thread_id = i;
+ // The preprocessing stage does not use tiles, so the starting tile for
+ // each thread is set to 0.
+ thread_data->start = 0;
+
+ thread_data->cpi = cpi;
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ av1_alloc_mb_wiener_var_pred_buf(&cpi->common, thread_data->td);
+ }
+ }
+}
+
+static void set_mb_wiener_var_calc_done(AV1_COMP *const cpi) {
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const int mb_step = mi_size_wide[bsize];
+ assert(MB_WIENER_MT_UNIT_SIZE < BLOCK_SIZES_ALL);
+ const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE];
+ const int mt_unit_cols =
+ (mi_params->mi_cols + (mt_unit_step >> 1)) / mt_unit_step;
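+ // i.e. mi_cols / mt_unit_step rounded to the nearest integer.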
+ const AV1EncAllIntraMultiThreadInfo *const intra_mt = &cpi->mt_info.intra_mt;
+ AV1EncRowMultiThreadSync *const intra_row_mt_sync =
+ &cpi->ppi->intra_row_mt_sync;
+
+ // Update the wiener variance computation of every row in the frame to
+ // indicate that it is complete in order to avoid dependent workers waiting
+ // indefinitely.
+ for (int mi_row = 0, mt_thread_id = 0; mi_row < mi_params->mi_rows;
+ mi_row += mb_step, ++mt_thread_id) {
+ intra_mt->intra_sync_write_ptr(intra_row_mt_sync, mt_thread_id,
+ mt_unit_cols - 1, mt_unit_cols);
+ }
+}
+
+static int cal_mb_wiener_var_hook(void *arg1, void *unused) {
+ (void)unused;
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *const cpi = thread_data->cpi;
+ MACROBLOCK *x = &thread_data->td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const int mb_step = mi_size_wide[bsize];
+ AV1EncRowMultiThreadSync *const intra_row_mt_sync =
+ &cpi->ppi->intra_row_mt_sync;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
+ (void)enc_row_mt;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *enc_row_mt_mutex = enc_row_mt->mutex_;
+#endif
+
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex);
+ enc_row_mt->mb_wiener_mt_exit = true;
+ pthread_mutex_unlock(enc_row_mt_mutex);
+#endif
+ set_mb_wiener_var_calc_done(cpi);
+ return 0;
+ }
+ error_info->setjmp = 1;
+ DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
+ double sum_rec_distortion = 0;
+ double sum_est_rate = 0;
+ while (1) {
+ int current_mi_row = -1;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex);
+#endif
+ int has_jobs = enc_row_mt->mb_wiener_mt_exit
+ ? 0
+ : get_next_job_allintra(intra_row_mt_sync,
+ cpi->common.mi_params.mi_rows,
+ &current_mi_row, mb_step);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex);
+#endif
+ if (!has_jobs) break;
+ // TODO(chengchen): properly accumulate the distortion and rate.
+ av1_calc_mb_wiener_var_row(cpi, x, xd, current_mi_row, src_diff, coeff,
+ qcoeff, dqcoeff, &sum_rec_distortion,
+ &sum_est_rate,
+ thread_data->td->wiener_tmp_pred_buf);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(enc_row_mt_mutex);
+#endif
+ intra_row_mt_sync->num_threads_working--;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(enc_row_mt_mutex);
+#endif
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+static void dealloc_mb_wiener_var_mt_data(AV1_COMP *cpi, int num_workers) {
+ av1_row_mt_sync_mem_dealloc(&cpi->ppi->intra_row_mt_sync);
+
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int j = 0; j < num_workers; ++j) {
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[j];
+ ThreadData *td = thread_data->td;
+ if (td != &cpi->td) av1_dealloc_mb_wiener_var_pred_buf(td);
+ }
+}
+
+ // This function is the multi-threaded version of the wiener variance
+ // computation.
+ // Note that the wiener variance is used only in allintra mode (1 pass), and
+ // it is computed before the frame is encoded, so there is no need to
+ // consider the number of tiles; instead, all available threads are
+ // allocated to the computation.
+void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers,
+ double *sum_rec_distortion,
+ double *sum_est_rate) {
+ (void)sum_rec_distortion;
+ (void)sum_est_rate;
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadSync *const intra_row_mt_sync =
+ &cpi->ppi->intra_row_mt_sync;
+
+ // TODO(chengchen): the memory usage could be improved.
+ const int mi_rows = cm->mi_params.mi_rows;
+ row_mt_sync_mem_alloc(intra_row_mt_sync, cm, mi_rows);
+
+ intra_row_mt_sync->intrabc_extra_top_right_sb_delay = 0;
+ intra_row_mt_sync->num_threads_working = num_workers;
+ intra_row_mt_sync->next_mi_row = 0;
+ memset(intra_row_mt_sync->num_finished_cols, -1,
+ sizeof(*intra_row_mt_sync->num_finished_cols) * mi_rows);
+ mt_info->enc_row_mt.mb_wiener_mt_exit = false;
+
+ prepare_wiener_var_workers(cpi, cal_mb_wiener_var_hook, num_workers);
+ launch_workers(mt_info, num_workers);
+ sync_enc_workers(mt_info, cm, num_workers);
+ dealloc_mb_wiener_var_mt_data(cpi, num_workers);
+}
+
+// Compare and order tiles based on absolute sum of tx coeffs.
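+ // Tiles are ordered in descending order of abs_sum_level; ties break toward
+ // the smaller tile index so the ordering is deterministic.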
+static int compare_tile_order(const void *a, const void *b) {
+ const PackBSTileOrder *const tile_a = (const PackBSTileOrder *)a;
+ const PackBSTileOrder *const tile_b = (const PackBSTileOrder *)b;
+
+ if (tile_a->abs_sum_level > tile_b->abs_sum_level)
+ return -1;
+ else if (tile_a->abs_sum_level == tile_b->abs_sum_level)
+ return (tile_a->tile_idx > tile_b->tile_idx ? 1 : -1);
+ else
+ return 1;
+}
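+
+// Illustrative example (hypothetical values): tiles with abs_sum_level
+// { 5, 9, 5 } and tile_idx { 0, 1, 2 } sort to the order 1, 0, 2, i.e. the
+// largest coefficient sum first, with ties broken by ascending tile index so
+// that the resulting order is deterministic.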
+
+// Get next tile index to be processed for pack bitstream
+static AOM_INLINE int get_next_pack_bs_tile_idx(
+ AV1EncPackBSSync *const pack_bs_sync, const int num_tiles) {
+ assert(pack_bs_sync->next_job_idx <= num_tiles);
+ if (pack_bs_sync->next_job_idx == num_tiles) return -1;
+
+ return pack_bs_sync->pack_bs_tile_order[pack_bs_sync->next_job_idx++]
+ .tile_idx;
+}
+
+// Calculates bitstream chunk size based on total buffer size and tile or tile
+// group size.
+static AOM_INLINE size_t get_bs_chunk_size(int tg_or_tile_size,
+ const int frame_or_tg_size,
+ size_t *remain_buf_size,
+ size_t max_buf_size,
+ int is_last_chunk) {
+ size_t this_chunk_size;
+ assert(*remain_buf_size > 0);
+ if (is_last_chunk) {
+ this_chunk_size = *remain_buf_size;
+ *remain_buf_size = 0;
+ } else {
+ const uint64_t size_scale = (uint64_t)max_buf_size * tg_or_tile_size;
+ this_chunk_size = (size_t)(size_scale / frame_or_tg_size);
+ *remain_buf_size -= this_chunk_size;
+ assert(*remain_buf_size > 0);
+ }
+ assert(this_chunk_size > 0);
+ return this_chunk_size;
+}
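+
+// Worked example for the proration above (illustrative numbers): with
+// max_buf_size = 1000 bytes, frame_or_tg_size = 100 mi units and
+// tg_or_tile_size = 25 mi units, a non-last chunk gets
+// (1000 * 25) / 100 = 250 bytes and *remain_buf_size drops by 250. The last
+// chunk simply takes whatever remains, so any rounding loss from the integer
+// division is absorbed by the final chunk.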
+
+// Initializes params required for pack bitstream tile.
+static void init_tile_pack_bs_params(AV1_COMP *const cpi, uint8_t *const dst,
+ struct aom_write_bit_buffer *saved_wb,
+ PackBSParams *const pack_bs_params_arr,
+ uint8_t obu_extn_header) {
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+ const int num_tiles = tiles->cols * tiles->rows;
+ // Fixed size tile groups for the moment
+ const int num_tg_hdrs = cpi->num_tg;
+ // Tile group size in terms of number of tiles.
+ const int tg_size_in_tiles = (num_tiles + num_tg_hdrs - 1) / num_tg_hdrs;
+ uint8_t *tile_dst = dst;
+ uint8_t *tile_data_curr = dst;
+  // The maximum tile group count cannot exceed MAX_TILES.
+ int tg_size_mi[MAX_TILES] = { 0 }; // Size of tile group in mi units
+ int tile_idx;
+ int tg_idx = 0;
+ int tile_count_in_tg = 0;
+ int new_tg = 1;
+
+ // Populate pack bitstream params of all tiles.
+ for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+ const TileInfo *const tile_info = &cpi->tile_data[tile_idx].tile_info;
+ PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+ // Calculate tile size in mi units.
+ const int tile_size_mi = (tile_info->mi_col_end - tile_info->mi_col_start) *
+ (tile_info->mi_row_end - tile_info->mi_row_start);
+ int is_last_tile_in_tg = 0;
+ tile_count_in_tg++;
+ if (tile_count_in_tg == tg_size_in_tiles || tile_idx == (num_tiles - 1))
+ is_last_tile_in_tg = 1;
+
+ // Populate pack bitstream params of this tile.
+ pack_bs_params->curr_tg_hdr_size = 0;
+ pack_bs_params->obu_extn_header = obu_extn_header;
+ pack_bs_params->saved_wb = saved_wb;
+ pack_bs_params->obu_header_size = 0;
+ pack_bs_params->is_last_tile_in_tg = is_last_tile_in_tg;
+ pack_bs_params->new_tg = new_tg;
+ pack_bs_params->tile_col = tile_info->tile_col;
+ pack_bs_params->tile_row = tile_info->tile_row;
+ pack_bs_params->tile_size_mi = tile_size_mi;
+ tg_size_mi[tg_idx] += tile_size_mi;
+
+ if (new_tg) new_tg = 0;
+ if (is_last_tile_in_tg) {
+ tile_count_in_tg = 0;
+ new_tg = 1;
+ tg_idx++;
+ }
+ }
+
+ assert(cpi->available_bs_size > 0);
+ size_t tg_buf_size[MAX_TILES] = { 0 };
+ size_t max_buf_size = cpi->available_bs_size;
+ size_t remain_buf_size = max_buf_size;
+ const int frame_size_mi = cm->mi_params.mi_rows * cm->mi_params.mi_cols;
+
+ tile_idx = 0;
+ // Prepare obu, tile group and frame header of each tile group.
+ for (tg_idx = 0; tg_idx < cpi->num_tg; tg_idx++) {
+ PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+ int is_last_tg = tg_idx == cpi->num_tg - 1;
+ // Prorate bitstream buffer size based on tile group size and available
+ // buffer size. This buffer will be used to store headers and tile data.
+ tg_buf_size[tg_idx] =
+ get_bs_chunk_size(tg_size_mi[tg_idx], frame_size_mi, &remain_buf_size,
+ max_buf_size, is_last_tg);
+
+ pack_bs_params->dst = tile_dst;
+ pack_bs_params->tile_data_curr = tile_dst;
+
+    // Write the obu, tile group and frame headers at the first tile in the
+    // tile group.
+ av1_write_obu_tg_tile_headers(cpi, xd, pack_bs_params, tile_idx);
+ tile_dst += tg_buf_size[tg_idx];
+
+ // Exclude headers from tile group buffer size.
+ tg_buf_size[tg_idx] -= pack_bs_params->curr_tg_hdr_size;
+ tile_idx += tg_size_in_tiles;
+ }
+
+ tg_idx = 0;
+ // Calculate bitstream buffer size of each tile in the tile group.
+ for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+ PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+
+ if (pack_bs_params->new_tg) {
+ max_buf_size = tg_buf_size[tg_idx];
+ remain_buf_size = max_buf_size;
+ }
+
+    // Prorate the bitstream buffer size of this tile based on tile size and
+    // available buffer size. The header size is not accounted for in this
+    // proration.
+ const size_t tile_buf_size = get_bs_chunk_size(
+ pack_bs_params->tile_size_mi, tg_size_mi[tg_idx], &remain_buf_size,
+ max_buf_size, pack_bs_params->is_last_tile_in_tg);
+ pack_bs_params->tile_buf_size = tile_buf_size;
+
+ // Update base address of bitstream buffer for tile and tile group.
+ if (pack_bs_params->new_tg) {
+ tile_dst = pack_bs_params->dst;
+ tile_data_curr = pack_bs_params->tile_data_curr;
+      // Account for the header size in the first tile of a tile group.
+ pack_bs_params->tile_buf_size += pack_bs_params->curr_tg_hdr_size;
+ } else {
+ pack_bs_params->dst = tile_dst;
+ pack_bs_params->tile_data_curr = tile_data_curr;
+ }
+
+ if (pack_bs_params->is_last_tile_in_tg) tg_idx++;
+ tile_dst += pack_bs_params->tile_buf_size;
+ }
+}
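+
+// Summary of the two-level proration above (illustrative): with
+// num_tiles = 4 and cpi->num_tg = 2, tg_size_in_tiles = 2. The first loop
+// over tile groups splits the total bitstream buffer between the two groups
+// in proportion to their size in mi units; the second loop over tiles splits
+// each group's buffer between its two tiles the same way, charging the tile
+// group header to the first tile of each group.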
+
+// Worker hook function for pack bitstream multi-threading.
+static int pack_bs_worker_hook(void *arg1, void *arg2) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ PackBSParams *const pack_bs_params = (PackBSParams *)arg2;
+ AV1_COMP *const cpi = thread_data->cpi;
+ AV1_COMMON *const cm = &cpi->common;
+ AV1EncPackBSSync *const pack_bs_sync = &cpi->mt_info.pack_bs_sync;
+ const CommonTileParams *const tiles = &cm->tiles;
+ const int num_tiles = tiles->cols * tiles->rows;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *const pack_bs_mutex = pack_bs_sync->mutex_;
+#endif
+ MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ xd->error_info = error_info;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pack_bs_mutex);
+ pack_bs_sync->pack_bs_mt_exit = true;
+ pthread_mutex_unlock(pack_bs_mutex);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ while (1) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pack_bs_mutex);
+#endif
+ const int tile_idx =
+ pack_bs_sync->pack_bs_mt_exit
+ ? -1
+ : get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pack_bs_mutex);
+#endif
+ // When pack_bs_mt_exit is set to true, other workers need not pursue any
+ // further jobs.
+ if (tile_idx == -1) break;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+ thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
+
+ av1_pack_tile_info(cpi, thread_data->td, &pack_bs_params[tile_idx]);
+ }
+
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Prepares thread data and workers for pack bitstream multi-threading.
+static void prepare_pack_bs_workers(AV1_COMP *const cpi,
+ PackBSParams *const pack_bs_params,
+ AVxWorkerHook hook, const int num_workers) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *worker = &mt_info->workers[i];
+ EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+ if (i == 0) {
+ thread_data->td = &cpi->td;
+ } else {
+ thread_data->td = thread_data->original_td;
+ }
+
+ if (thread_data->td != &cpi->td) thread_data->td->mb = cpi->td.mb;
+
+ thread_data->cpi = cpi;
+ thread_data->start = i;
+ thread_data->thread_id = i;
+ av1_reset_pack_bs_thread_data(thread_data->td);
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = pack_bs_params;
+ }
+
+ AV1_COMMON *const cm = &cpi->common;
+ AV1EncPackBSSync *const pack_bs_sync = &mt_info->pack_bs_sync;
+ const uint16_t num_tiles = cm->tiles.rows * cm->tiles.cols;
+ pack_bs_sync->next_job_idx = 0;
+ pack_bs_sync->pack_bs_mt_exit = false;
+
+ PackBSTileOrder *const pack_bs_tile_order = pack_bs_sync->pack_bs_tile_order;
+ // Reset tile order data of pack bitstream
+ av1_zero_array(pack_bs_tile_order, num_tiles);
+
+ // Populate pack bitstream tile order structure
+ for (uint16_t tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+ pack_bs_tile_order[tile_idx].abs_sum_level =
+ cpi->tile_data[tile_idx].abs_sum_level;
+ pack_bs_tile_order[tile_idx].tile_idx = tile_idx;
+ }
+
+  // Sort tiles in descending order of the absolute sum of their transform
+  // coefficient levels.
+ qsort(pack_bs_tile_order, num_tiles, sizeof(*pack_bs_tile_order),
+ compare_tile_order);
+}
+
+// Accumulates data after pack bitstream processing.
+static void accumulate_pack_bs_data(
+ AV1_COMP *const cpi, const PackBSParams *const pack_bs_params_arr,
+ uint8_t *const dst, uint32_t *total_size, const FrameHeaderInfo *fh_info,
+ int *const largest_tile_id, unsigned int *max_tile_size,
+ uint32_t *const obu_header_size, uint8_t **tile_data_start,
+ const int num_workers) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonTileParams *const tiles = &cm->tiles;
+ const int tile_count = tiles->cols * tiles->rows;
+ // Fixed size tile groups for the moment
+ size_t curr_tg_data_size = 0;
+ int is_first_tg = 1;
+ uint8_t *curr_tg_start = dst;
+ size_t src_offset = 0;
+ size_t dst_offset = 0;
+
+ for (int tile_idx = 0; tile_idx < tile_count; tile_idx++) {
+ // PackBSParams stores all parameters required to pack tile and header
+ // info.
+ const PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+ uint32_t tile_size = 0;
+
+ if (pack_bs_params->new_tg) {
+ curr_tg_start = dst + *total_size;
+ curr_tg_data_size = pack_bs_params->curr_tg_hdr_size;
+ *tile_data_start += pack_bs_params->curr_tg_hdr_size;
+ *obu_header_size = pack_bs_params->obu_header_size;
+ }
+ curr_tg_data_size +=
+ pack_bs_params->buf.size + (pack_bs_params->is_last_tile_in_tg ? 0 : 4);
+
+ if (pack_bs_params->buf.size > *max_tile_size) {
+ *largest_tile_id = tile_idx;
+ *max_tile_size = (unsigned int)pack_bs_params->buf.size;
+ }
+ tile_size +=
+ (uint32_t)pack_bs_params->buf.size + *pack_bs_params->total_size;
+
+ // Pack all the chunks of tile bitstreams together
+ if (tile_idx != 0) memmove(dst + dst_offset, dst + src_offset, tile_size);
+
+ if (pack_bs_params->is_last_tile_in_tg)
+ av1_write_last_tile_info(
+ cpi, fh_info, pack_bs_params->saved_wb, &curr_tg_data_size,
+ curr_tg_start, &tile_size, tile_data_start, largest_tile_id,
+ &is_first_tg, *obu_header_size, pack_bs_params->obu_extn_header);
+ src_offset += pack_bs_params->tile_buf_size;
+ dst_offset += tile_size;
+ *total_size += tile_size;
+ }
+
+ // Accumulate thread data
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ for (int idx = num_workers - 1; idx >= 0; idx--) {
+ ThreadData const *td = mt_info->tile_thr_data[idx].td;
+ av1_accumulate_pack_bs_thread_data(cpi, td);
+ }
+}
+
+void av1_write_tile_obu_mt(
+ AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
+ struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
+ const FrameHeaderInfo *fh_info, int *const largest_tile_id,
+ unsigned int *max_tile_size, uint32_t *const obu_header_size,
+ uint8_t **tile_data_start, const int num_workers) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+
+ PackBSParams pack_bs_params[MAX_TILES];
+ uint32_t tile_size[MAX_TILES] = { 0 };
+
+ for (int tile_idx = 0; tile_idx < MAX_TILES; tile_idx++)
+ pack_bs_params[tile_idx].total_size = &tile_size[tile_idx];
+
+ init_tile_pack_bs_params(cpi, dst, saved_wb, pack_bs_params, obu_extn_header);
+ prepare_pack_bs_workers(cpi, pack_bs_params, pack_bs_worker_hook,
+ num_workers);
+ launch_workers(mt_info, num_workers);
+ sync_enc_workers(mt_info, &cpi->common, num_workers);
+ accumulate_pack_bs_data(cpi, pack_bs_params, dst, total_size, fh_info,
+ largest_tile_id, max_tile_size, obu_header_size,
+ tile_data_start, num_workers);
+}
+
+// Deallocate memory for CDEF search multi-thread synchronization.
+void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync) {
+ (void)cdef_sync;
+ assert(cdef_sync != NULL);
+#if CONFIG_MULTITHREAD
+ if (cdef_sync->mutex_ != NULL) {
+ pthread_mutex_destroy(cdef_sync->mutex_);
+ aom_free(cdef_sync->mutex_);
+ }
+#endif // CONFIG_MULTITHREAD
+}
+
+// Updates the row and column indices of the next job to be processed.
+// Also sets the end_of_frame flag when all blocks have been processed.
+static void update_next_job_info(AV1CdefSync *cdef_sync, int nvfb, int nhfb) {
+ cdef_sync->fbc++;
+ if (cdef_sync->fbc == nhfb) {
+ cdef_sync->fbr++;
+ if (cdef_sync->fbr == nvfb) {
+ cdef_sync->end_of_frame = 1;
+ } else {
+ cdef_sync->fbc = 0;
+ }
+ }
+}
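+
+// Illustrative traversal: with nhfb = 2 and nvfb = 2, starting from
+// (fbr, fbc) = (0, 0), successive calls advance through (0, 1), (1, 0) and
+// (1, 1) in raster order; the call after the last block sets end_of_frame.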
+
+// Initializes cdef_sync parameters.
+static AOM_INLINE void cdef_reset_job_info(AV1CdefSync *cdef_sync) {
+#if CONFIG_MULTITHREAD
+ if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
+#endif // CONFIG_MULTITHREAD
+ cdef_sync->end_of_frame = 0;
+ cdef_sync->fbr = 0;
+ cdef_sync->fbc = 0;
+ cdef_sync->cdef_mt_exit = false;
+}
+
+// Checks if a job is available. If a job is available, populates the next
+// job information and returns 1; otherwise returns 0.
+static AOM_INLINE int cdef_get_next_job(AV1CdefSync *cdef_sync,
+ CdefSearchCtx *cdef_search_ctx,
+ volatile int *cur_fbr,
+ volatile int *cur_fbc,
+ volatile int *sb_count) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(cdef_sync->mutex_);
+#endif // CONFIG_MULTITHREAD
+ int do_next_block = 0;
+ const int nvfb = cdef_search_ctx->nvfb;
+ const int nhfb = cdef_search_ctx->nhfb;
+
+  // If a block is skipped, do not process it; instead, check the skip
+  // condition for the next block.
+ while (!cdef_sync->cdef_mt_exit && !cdef_sync->end_of_frame &&
+ cdef_sb_skip(cdef_search_ctx->mi_params, cdef_sync->fbr,
+ cdef_sync->fbc)) {
+ update_next_job_info(cdef_sync, nvfb, nhfb);
+ }
+
+  // Populate the information needed for the current job and update the row
+  // and column indices of the next block to be processed.
+ if (!cdef_sync->cdef_mt_exit && cdef_sync->end_of_frame == 0) {
+ do_next_block = 1;
+ *cur_fbr = cdef_sync->fbr;
+ *cur_fbc = cdef_sync->fbc;
+ *sb_count = cdef_search_ctx->sb_count;
+ cdef_search_ctx->sb_count++;
+ update_next_job_info(cdef_sync, nvfb, nhfb);
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(cdef_sync->mutex_);
+#endif // CONFIG_MULTITHREAD
+ return do_next_block;
+}
+
+// Hook function for each thread in CDEF search multi-threading.
+static int cdef_filter_block_worker_hook(void *arg1, void *arg2) {
+ EncWorkerData *thread_data = (EncWorkerData *)arg1;
+ AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg2;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *cdef_mutex_ = cdef_sync->mutex_;
+#endif
+ struct aom_internal_error_info *const error_info = &thread_data->error_info;
+ CdefSearchCtx *cdef_search_ctx = thread_data->cpi->cdef_search_ctx;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(error_info->jmp)) {
+ error_info->setjmp = 0;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(cdef_mutex_);
+ cdef_sync->cdef_mt_exit = true;
+ pthread_mutex_unlock(cdef_mutex_);
+#endif
+ return 0;
+ }
+ error_info->setjmp = 1;
+
+ volatile int cur_fbr, cur_fbc, sb_count;
+ while (cdef_get_next_job(cdef_sync, cdef_search_ctx, &cur_fbr, &cur_fbc,
+ &sb_count)) {
+ av1_cdef_mse_calc_block(cdef_search_ctx, error_info, cur_fbr, cur_fbc,
+ sb_count);
+ }
+ error_info->setjmp = 0;
+ return 1;
+}
+
+// Assigns CDEF search hook function and thread data to each worker.
+static void prepare_cdef_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ for (int i = num_workers - 1; i >= 0; i--) {
+ AVxWorker *worker = &mt_info->workers[i];
+ EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
+
+ thread_data->cpi = cpi;
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = &mt_info->cdef_sync;
+ }
+}
+
+// Implements multi-threading for CDEF search.
+void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi) {
+ MultiThreadInfo *mt_info = &cpi->mt_info;
+ AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
+ const int num_workers = mt_info->num_mod_workers[MOD_CDEF_SEARCH];
+
+ cdef_reset_job_info(cdef_sync);
+ prepare_cdef_workers(cpi, cdef_filter_block_worker_hook, num_workers);
+ launch_workers(mt_info, num_workers);
+ sync_enc_workers(mt_info, &cpi->common, num_workers);
+}
+
+// Computes num_workers for temporal filter multi-threading.
+static AOM_INLINE int compute_num_tf_workers(const AV1_COMP *cpi) {
+  // For single-pass encoding, choosing the number of workers based on the
+  // temporal filter block size was not found to improve speed. Hence the
+  // thread assignment for single-pass encoding is based on
+  // compute_num_enc_workers().
+ if (cpi->oxcf.pass < AOM_RC_SECOND_PASS)
+ return (av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads));
+
+ if (cpi->oxcf.max_threads <= 1) return 1;
+
+ const int frame_height = cpi->common.height;
+ const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+ const int mb_height = block_size_high[block_size];
+ const int mb_rows = get_num_blocks(frame_height, mb_height);
+ return AOMMIN(cpi->oxcf.max_threads, mb_rows);
+}
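+
+// Worked example (illustrative, assuming TF_BLOCK_SIZE corresponds to a
+// 32x32 block): for a 1080p frame in the second pass,
+// mb_rows = (1080 + 31) / 32 = 34, so at most 34 workers are used even if
+// max_threads is larger.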
+
+// Computes num_workers for tpl multi-threading.
+static AOM_INLINE int compute_num_tpl_workers(AV1_COMP *cpi) {
+ return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for loop filter multi-threading.
+static AOM_INLINE int compute_num_lf_workers(AV1_COMP *cpi) {
+ return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for cdef multi-threading.
+static AOM_INLINE int compute_num_cdef_workers(AV1_COMP *cpi) {
+ return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for loop-restoration multi-threading.
+static AOM_INLINE int compute_num_lr_workers(AV1_COMP *cpi) {
+ return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for pack bitstream multi-threading.
+static AOM_INLINE int compute_num_pack_bs_workers(AV1_COMP *cpi) {
+ if (cpi->oxcf.max_threads <= 1) return 1;
+ return compute_num_enc_tile_mt_workers(&cpi->common, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for all intra multi-threading.
+static AOM_INLINE int compute_num_ai_workers(AV1_COMP *cpi) {
+ if (cpi->oxcf.max_threads <= 1) return 1;
+  // The multi-threading implementation of deltaq-mode = 3 in allintra mode
+  // is based on row multi-threading.
+ cpi->weber_bsize = BLOCK_8X8;
+ const BLOCK_SIZE bsize = cpi->weber_bsize;
+ const int mb_step = mi_size_wide[bsize];
+ const int num_mb_rows = cpi->common.mi_params.mi_rows / mb_step;
+ return AOMMIN(num_mb_rows, cpi->oxcf.max_threads);
+}
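+
+// Worked example (illustrative numbers): a 1080p frame has
+// mi_rows = 1080 / 4 = 270 mi units, and BLOCK_8X8 gives mb_step = 2, so
+// num_mb_rows = 135 and the worker count is min(135, max_threads).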
+
+static int compute_num_mod_workers(AV1_COMP *cpi,
+ MULTI_THREADED_MODULES mod_name) {
+ int num_mod_workers = 0;
+ switch (mod_name) {
+ case MOD_FP:
+ if (cpi->oxcf.pass >= AOM_RC_SECOND_PASS)
+ num_mod_workers = 0;
+ else
+ num_mod_workers =
+ av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+ break;
+ case MOD_TF: num_mod_workers = compute_num_tf_workers(cpi); break;
+ case MOD_TPL: num_mod_workers = compute_num_tpl_workers(cpi); break;
+ case MOD_GME: num_mod_workers = 1; break;
+ case MOD_ENC:
+ num_mod_workers = av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+ break;
+ case MOD_LPF: num_mod_workers = compute_num_lf_workers(cpi); break;
+ case MOD_CDEF_SEARCH:
+ num_mod_workers = compute_num_cdef_workers(cpi);
+ break;
+ case MOD_CDEF: num_mod_workers = compute_num_cdef_workers(cpi); break;
+ case MOD_LR: num_mod_workers = compute_num_lr_workers(cpi); break;
+ case MOD_PACK_BS: num_mod_workers = compute_num_pack_bs_workers(cpi); break;
+ case MOD_FRAME_ENC:
+ num_mod_workers = cpi->ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC];
+ break;
+ case MOD_AI:
+ if (cpi->oxcf.pass == AOM_RC_ONE_PASS) {
+ num_mod_workers = compute_num_ai_workers(cpi);
+ } else {
+ num_mod_workers = 0;
+ }
+ break;
+ default: assert(0); break;
+ }
+ return (num_mod_workers);
+}
+// Computes the number of workers for each MT module in the encoder.
+void av1_compute_num_workers_for_mt(AV1_COMP *cpi) {
+ for (int i = MOD_FP; i < NUM_MT_MODULES; i++) {
+ cpi->ppi->p_mt_info.num_mod_workers[i] =
+ compute_num_mod_workers(cpi, (MULTI_THREADED_MODULES)i);
+ }
+}
diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h
new file mode 100644
index 0000000000..468e120776
--- /dev/null
+++ b/third_party/aom/av1/encoder/ethread.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ETHREAD_H_
+#define AOM_AV1_ENCODER_ETHREAD_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct ThreadData;
+
+typedef struct EncWorkerData {
+ struct AV1_COMP *cpi;
+ struct ThreadData *td;
+ struct ThreadData *original_td;
+ struct aom_internal_error_info error_info;
+ AV1LfSync *lf_sync;
+ LFWorkerData *lf_data;
+ int start;
+ int thread_id;
+} EncWorkerData;
+
+void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c);
+void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c,
+ int cols);
+
+void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+ int c);
+void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
+ int c, int cols);
+
+void av1_encode_tiles_mt(struct AV1_COMP *cpi);
+void av1_encode_tiles_row_mt(struct AV1_COMP *cpi);
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi);
+
+int av1_fp_compute_num_enc_workers(AV1_COMP *cpi);
+#endif
+
+void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts,
+ const struct FRAME_COUNTS *counts);
+
+void av1_row_mt_mem_dealloc(AV1_COMP *cpi);
+
+void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync);
+
+void av1_global_motion_estimation_mt(AV1_COMP *cpi);
+
+#if !CONFIG_REALTIME_ONLY
+void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+ int r, int c);
+void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
+ int r, int c, int cols);
+
+void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_mt_sync, int r,
+ int c);
+void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_mt_sync, int r,
+ int c, int cols);
+
+void av1_mc_flow_dispenser_mt(AV1_COMP *cpi);
+
+void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync);
+
+#endif // !CONFIG_REALTIME_ONLY
+
+void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers,
+ double *sum_rec_distortion,
+ double *sum_est_rate);
+
+void av1_tf_do_filtering_mt(AV1_COMP *cpi);
+
+void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync);
+
+void av1_compute_num_workers_for_mt(AV1_COMP *cpi);
+
+int av1_get_max_num_workers(const AV1_COMP *cpi);
+
+void av1_create_workers(AV1_PRIMARY *ppi, int num_workers);
+
+void av1_terminate_workers(AV1_PRIMARY *ppi);
+
+void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi);
+
+void av1_init_cdef_worker(AV1_COMP *cpi);
+
+#if !CONFIG_REALTIME_ONLY
+void av1_init_lr_mt_buffers(AV1_COMP *cpi);
+#endif
+
+#if CONFIG_MULTITHREAD
+void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass);
+#endif // CONFIG_MULTITHREAD
+
+int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info,
+ MULTI_THREADED_MODULES mod_name);
+
+void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass);
+
+void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi);
+
+void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync);
+
+void av1_write_tile_obu_mt(
+ AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
+ struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
+ const FrameHeaderInfo *fh_info, int *const largest_tile_id,
+ unsigned int *max_tile_size, uint32_t *const obu_header_size,
+ uint8_t **tile_data_start, const int num_workers);
+
+int av1_compute_num_enc_workers(const AV1_COMP *cpi, int max_workers);
+
+int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf);
+
+int av1_check_fpmt_config(AV1_PRIMARY *const ppi, AV1EncoderConfig *const oxcf);
+
+void av1_compress_parallel_frames(AV1_PRIMARY *const ppi,
+ AV1_COMP_DATA *const first_cpi_data);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ETHREAD_H_
diff --git a/third_party/aom/av1/encoder/extend.c b/third_party/aom/av1/encoder/extend.c
new file mode 100644
index 0000000000..e1b1e69ca7
--- /dev/null
+++ b/third_party/aom/av1/encoder/extend.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/common.h"
+#include "av1/encoder/extend.h"
+
+static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
+ uint8_t *dst, int dst_pitch, int w, int h,
+ int extend_top, int extend_left,
+ int extend_bottom, int extend_right,
+ int chroma_step) {
+ int i, linesize;
+  // Copy the left-most and right-most columns out.
+ const uint8_t *src_ptr1 = src;
+ const uint8_t *src_ptr2 = src + (w - 1) * chroma_step;
+ uint8_t *dst_ptr1 = dst - extend_left;
+ uint8_t *dst_ptr2 = dst + w;
+
+ for (i = 0; i < h; i++) {
+ memset(dst_ptr1, src_ptr1[0], extend_left);
+ if (chroma_step == 1) {
+ memcpy(dst_ptr1 + extend_left, src_ptr1, w);
+ } else {
+ for (int j = 0; j < w; j++) {
+ dst_ptr1[extend_left + j] = src_ptr1[chroma_step * j];
+ }
+ }
+ memset(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_pitch;
+ src_ptr2 += src_pitch;
+ dst_ptr1 += dst_pitch;
+ dst_ptr2 += dst_pitch;
+ }
+
+ // Now copy the top and bottom lines into each line of the respective
+ // borders
+ src_ptr1 = dst - extend_left;
+ src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+ dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+  dst_ptr2 = dst + dst_pitch * h - extend_left;
+ linesize = extend_left + extend_right + w;
+ assert(linesize <= dst_pitch);
+
+ for (i = 0; i < extend_top; i++) {
+ memcpy(dst_ptr1, src_ptr1, linesize);
+ dst_ptr1 += dst_pitch;
+ }
+
+ for (i = 0; i < extend_bottom; i++) {
+ memcpy(dst_ptr2, src_ptr2, linesize);
+ dst_ptr2 += dst_pitch;
+ }
+}
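+
+// In the routine above, every border pixel replicates its nearest edge
+// pixel: the left/right borders repeat the first/last sample of each row,
+// then the completed top and bottom rows (including their extended corners)
+// are copied into the top and bottom borders. A chroma_step of 2 handles
+// semi-planar (NV12) input, where U and V samples are interleaved in the
+// source plane.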
+
+static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
+ uint8_t *dst8, int dst_pitch, int w,
+ int h, int extend_top, int extend_left,
+ int extend_bottom, int extend_right) {
+ int i, linesize;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+  // Copy the left-most and right-most columns out.
+ const uint16_t *src_ptr1 = src;
+ const uint16_t *src_ptr2 = src + w - 1;
+ uint16_t *dst_ptr1 = dst - extend_left;
+ uint16_t *dst_ptr2 = dst + w;
+
+ for (i = 0; i < h; i++) {
+ aom_memset16(dst_ptr1, src_ptr1[0], extend_left);
+ memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(src_ptr1[0]));
+ aom_memset16(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_pitch;
+ src_ptr2 += src_pitch;
+ dst_ptr1 += dst_pitch;
+ dst_ptr2 += dst_pitch;
+ }
+
+ // Now copy the top and bottom lines into each line of the respective
+ // borders
+ src_ptr1 = dst - extend_left;
+ src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+ dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+  dst_ptr2 = dst + dst_pitch * h - extend_left;
+ linesize = extend_left + extend_right + w;
+ assert(linesize <= dst_pitch);
+
+ for (i = 0; i < extend_top; i++) {
+ memcpy(dst_ptr1, src_ptr1, linesize * sizeof(src_ptr1[0]));
+ dst_ptr1 += dst_pitch;
+ }
+
+ for (i = 0; i < extend_bottom; i++) {
+ memcpy(dst_ptr2, src_ptr2, linesize * sizeof(src_ptr2[0]));
+ dst_ptr2 += dst_pitch;
+ }
+}
+
+void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst) {
+ // Extend src frame in buffer
+ const int et_y = dst->border;
+ const int el_y = dst->border;
+ const int er_y =
+ AOMMAX(src->y_width + dst->border, ALIGN_POWER_OF_TWO(src->y_width, 6)) -
+ src->y_crop_width;
+ const int eb_y = AOMMAX(src->y_height + dst->border,
+ ALIGN_POWER_OF_TWO(src->y_height, 6)) -
+ src->y_crop_height;
+ const int uv_width_subsampling = src->subsampling_x;
+ const int uv_height_subsampling = src->subsampling_y;
+ const int et_uv = et_y >> uv_height_subsampling;
+ const int el_uv = el_y >> uv_width_subsampling;
+ const int eb_uv = eb_y >> uv_height_subsampling;
+ const int er_uv = er_y >> uv_width_subsampling;
+
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, src->y_crop_width,
+ src->y_crop_height, et_y, el_y, eb_y, er_y);
+ if (!src->monochrome) {
+ highbd_copy_and_extend_plane(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+ highbd_copy_and_extend_plane(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+ }
+ return;
+ }
+
+ copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, src->y_crop_width, src->y_crop_height,
+ et_y, el_y, eb_y, er_y, 1);
+ if (!src->monochrome) {
+    // Detect semi-planar (NV12) format.
+ const int chroma_step = src->v_buffer ? 1 : 2;
+ const uint8_t *src_v_buffer =
+ src->v_buffer ? src->v_buffer : src->u_buffer + 1;
+ copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, src->uv_crop_width,
+ src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv,
+ chroma_step);
+ copy_and_extend_plane(src_v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, src->uv_crop_width,
+ src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv,
+ chroma_step);
+ }
+}
diff --git a/third_party/aom/av1/encoder/extend.h b/third_party/aom/av1/encoder/extend.h
new file mode 100644
index 0000000000..b8cc5b9d28
--- /dev/null
+++ b/third_party/aom/av1/encoder/extend.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_EXTEND_H_
+#define AOM_AV1_ENCODER_EXTEND_H_
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_EXTEND_H_
diff --git a/third_party/aom/av1/encoder/external_partition.c b/third_party/aom/av1/encoder/external_partition.c
new file mode 100644
index 0000000000..79f8b4c8a4
--- /dev/null
+++ b/third_party/aom/av1/encoder/external_partition.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common.h"
+#include "av1/encoder/external_partition.h"
+
+aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs,
+ aom_ext_part_config_t config,
+ ExtPartController *ext_part_controller) {
+ if (ext_part_controller == NULL) {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+ ext_part_controller->funcs = funcs;
+ ext_part_controller->config = config;
+ const aom_ext_part_status_t status = ext_part_controller->funcs.create_model(
+ ext_part_controller->funcs.priv, &ext_part_controller->config,
+ &ext_part_controller->model);
+ if (status == AOM_EXT_PART_ERROR) {
+ return AOM_CODEC_ERROR;
+ } else if (status == AOM_EXT_PART_TEST) {
+ ext_part_controller->test_mode = 1;
+ ext_part_controller->ready = 0;
+ return AOM_CODEC_OK;
+ }
+ assert(status == AOM_EXT_PART_OK);
+ ext_part_controller->ready = 1;
+ return AOM_CODEC_OK;
+}
+
+aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller) {
+ if (ext_part_controller == NULL) {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+ av1_zero(ext_part_controller);
+ return AOM_CODEC_OK;
+}
+
+aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller) {
+ if (ext_part_controller == NULL) {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+ if (ext_part_controller->ready) {
+ const aom_ext_part_status_t status =
+ ext_part_controller->funcs.delete_model(ext_part_controller->model);
+ if (status != AOM_EXT_PART_OK) {
+ return AOM_CODEC_ERROR;
+ }
+ }
+ return av1_ext_part_init(ext_part_controller);
+}
+
+bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller,
+ aom_partition_decision_t *decision) {
+ assert(ext_part_controller != NULL);
+ assert(ext_part_controller->ready);
+ assert(decision != NULL);
+ const aom_ext_part_status_t status =
+ ext_part_controller->funcs.get_partition_decision(
+ ext_part_controller->model, decision);
+ if (status != AOM_EXT_PART_OK) return false;
+ return true;
+}
+
+bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller,
+ const aom_partition_stats_t *stats) {
+ assert(ext_part_controller != NULL);
+ assert(ext_part_controller->ready);
+ assert(stats != NULL);
+ const aom_ext_part_status_t status =
+ ext_part_controller->funcs.send_partition_stats(
+ ext_part_controller->model, stats);
+ if (status != AOM_EXT_PART_OK) return false;
+ return true;
+}
+
+bool av1_ext_part_send_features(ExtPartController *ext_part_controller,
+ const aom_partition_features_t *features) {
+ assert(ext_part_controller != NULL);
+ assert(ext_part_controller->ready);
+ assert(features != NULL);
+ const aom_ext_part_status_t status = ext_part_controller->funcs.send_features(
+ ext_part_controller->model, features);
+ if (status != AOM_EXT_PART_OK) return false;
+ return true;
+}
+
+aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode(
+ const ExtPartController *ext_part_controller) {
+ return ext_part_controller->funcs.decision_mode;
+}
diff --git a/third_party/aom/av1/encoder/external_partition.h b/third_party/aom/av1/encoder/external_partition.h
new file mode 100644
index 0000000000..f74973e9eb
--- /dev/null
+++ b/third_party/aom/av1/encoder/external_partition.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
+#define AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
+
+#include <stdbool.h>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_external_partition.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!\cond */
+
+typedef struct ExtPartController {
+ int ready;
+ int test_mode;
+ aom_ext_part_config_t config;
+ aom_ext_part_model_t model;
+ aom_ext_part_funcs_t funcs;
+} ExtPartController;
+
+aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs,
+ aom_ext_part_config_t config,
+ ExtPartController *ext_part_controller);
+
+aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller);
+
+aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller);
+
+bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller,
+ aom_partition_decision_t *decision);
+
+bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller,
+ const aom_partition_stats_t *stats);
+
+bool av1_ext_part_send_features(ExtPartController *ext_part_controller,
+ const aom_partition_features_t *features);
+
+aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode(
+ const ExtPartController *ext_part_controller);
+
+/*!\endcond */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c
new file mode 100644
index 0000000000..e20b6c177e
--- /dev/null
+++ b/third_party/aom/av1/encoder/firstpass.c
@@ -0,0 +1,1600 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/variance.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_scale/yv12config.h"
+
+#include "av1/common/entropymv.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h" // av1_setup_dst_planes()
+#include "av1/common/reconintra.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/dwt.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/reconinter_enc.h"
+
+#define OUTPUT_FPF 0
+
+#define FIRST_PASS_Q 10.0
+#define INTRA_MODE_PENALTY 1024
+#define NEW_MV_MODE_PENALTY 32
+#define DARK_THRESH 64
+
+#define NCOUNT_INTRA_THRESH 8192
+#define NCOUNT_INTRA_FACTOR 3
+
+#define INVALID_FP_STATS_TO_PREDICT_FLAT_GOP -1
+
+static AOM_INLINE void output_stats(FIRSTPASS_STATS *stats,
+ struct aom_codec_pkt_list *pktlist) {
+ struct aom_codec_cx_pkt pkt;
+ pkt.kind = AOM_CODEC_STATS_PKT;
+ pkt.data.twopass_stats.buf = stats;
+ pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
+ if (pktlist != NULL) aom_codec_pkt_list_add(pktlist, &pkt);
+
+// TEMP debug code
+#if OUTPUT_FPF
+ {
+ FILE *fpfile;
+ fpfile = fopen("firstpass.stt", "a");
+
+ fprintf(fpfile,
+ "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
+ "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
+ "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf\n",
+ stats->frame, stats->weight, stats->intra_error, stats->coded_error,
+ stats->sr_coded_error, stats->pcnt_inter, stats->pcnt_motion,
+ stats->pcnt_second_ref, stats->pcnt_neutral, stats->intra_skip_pct,
+ stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr,
+ stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv,
+ stats->MVcv, stats->mv_in_out_count, stats->new_mv_count,
+ stats->count, stats->duration);
+ fclose(fpfile);
+ }
+#endif
+}
+
+void av1_twopass_zero_stats(FIRSTPASS_STATS *section) {
+ section->frame = 0.0;
+ section->weight = 0.0;
+ section->intra_error = 0.0;
+ section->frame_avg_wavelet_energy = 0.0;
+ section->coded_error = 0.0;
+ section->log_intra_error = 0.0;
+ section->log_coded_error = 0.0;
+ section->sr_coded_error = 0.0;
+ section->pcnt_inter = 0.0;
+ section->pcnt_motion = 0.0;
+ section->pcnt_second_ref = 0.0;
+ section->pcnt_neutral = 0.0;
+ section->intra_skip_pct = 0.0;
+ section->inactive_zone_rows = 0.0;
+ section->inactive_zone_cols = 0.0;
+ section->MVr = 0.0;
+ section->mvr_abs = 0.0;
+ section->MVc = 0.0;
+ section->mvc_abs = 0.0;
+ section->MVrv = 0.0;
+ section->MVcv = 0.0;
+ section->mv_in_out_count = 0.0;
+ section->new_mv_count = 0.0;
+ section->count = 0.0;
+ section->duration = 1.0;
+ section->is_flash = 0;
+ section->noise_var = 0;
+ section->cor_coeff = 1.0;
+}
+
+void av1_accumulate_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame) {
+ section->frame += frame->frame;
+ section->weight += frame->weight;
+ section->intra_error += frame->intra_error;
+ section->log_intra_error += log1p(frame->intra_error);
+ section->log_coded_error += log1p(frame->coded_error);
+ section->frame_avg_wavelet_energy += frame->frame_avg_wavelet_energy;
+ section->coded_error += frame->coded_error;
+ section->sr_coded_error += frame->sr_coded_error;
+ section->pcnt_inter += frame->pcnt_inter;
+ section->pcnt_motion += frame->pcnt_motion;
+ section->pcnt_second_ref += frame->pcnt_second_ref;
+ section->pcnt_neutral += frame->pcnt_neutral;
+ section->intra_skip_pct += frame->intra_skip_pct;
+ section->inactive_zone_rows += frame->inactive_zone_rows;
+ section->inactive_zone_cols += frame->inactive_zone_cols;
+ section->MVr += frame->MVr;
+ section->mvr_abs += frame->mvr_abs;
+ section->MVc += frame->MVc;
+ section->mvc_abs += frame->mvc_abs;
+ section->MVrv += frame->MVrv;
+ section->MVcv += frame->MVcv;
+ section->mv_in_out_count += frame->mv_in_out_count;
+ section->new_mv_count += frame->new_mv_count;
+ section->count += frame->count;
+ section->duration += frame->duration;
+}
+
+static int get_unit_rows(const BLOCK_SIZE fp_block_size, const int mb_rows) {
+ const int height_mi_log2 = mi_size_high_log2[fp_block_size];
+ const int mb_height_mi_log2 = mi_size_high_log2[BLOCK_16X16];
+ if (height_mi_log2 > mb_height_mi_log2) {
+ return mb_rows >> (height_mi_log2 - mb_height_mi_log2);
+ }
+
+ return mb_rows << (mb_height_mi_log2 - height_mi_log2);
+}
+
+static int get_unit_cols(const BLOCK_SIZE fp_block_size, const int mb_cols) {
+ const int width_mi_log2 = mi_size_wide_log2[fp_block_size];
+ const int mb_width_mi_log2 = mi_size_wide_log2[BLOCK_16X16];
+ if (width_mi_log2 > mb_width_mi_log2) {
+ return mb_cols >> (width_mi_log2 - mb_width_mi_log2);
+ }
+
+ return mb_cols << (mb_width_mi_log2 - width_mi_log2);
+}
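+
+// Worked example for the two helpers above (illustrative): with
+// fp_block_size = BLOCK_8X8, the block is 2 mi units high/wide (log2 = 1)
+// versus 4 mi units (log2 = 2) for BLOCK_16X16, so each 16x16 macroblock
+// row/column maps to two first-pass units: unit_rows = mb_rows << 1.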
+
+// TODO(chengchen): can we simplify it even if resize has to be considered?
+static int get_num_mbs(const BLOCK_SIZE fp_block_size,
+ const int num_mbs_16X16) {
+ const int width_mi_log2 = mi_size_wide_log2[fp_block_size];
+ const int height_mi_log2 = mi_size_high_log2[fp_block_size];
+ const int mb_width_mi_log2 = mi_size_wide_log2[BLOCK_16X16];
+ const int mb_height_mi_log2 = mi_size_high_log2[BLOCK_16X16];
+ // TODO(chengchen): Now this function assumes a square block is used.
+ // It does not support rectangular block sizes.
+ assert(width_mi_log2 == height_mi_log2);
+ if (width_mi_log2 > mb_width_mi_log2) {
+ return num_mbs_16X16 >> ((width_mi_log2 - mb_width_mi_log2) +
+ (height_mi_log2 - mb_height_mi_log2));
+ }
+
+ return num_mbs_16X16 << ((mb_width_mi_log2 - width_mi_log2) +
+ (mb_height_mi_log2 - height_mi_log2));
+}
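+
+// Continuing the example (illustrative): for fp_block_size = BLOCK_8X8 the
+// combined shift is (2 - 1) + (2 - 1) = 2, so num_mbs = num_mbs_16X16 << 2,
+// i.e. four 8x8 units per 16x16 macroblock.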
+
+void av1_end_first_pass(AV1_COMP *cpi) {
+ if (cpi->ppi->twopass.stats_buf_ctx->total_stats && !cpi->ppi->lap_enabled)
+ output_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats,
+ cpi->ppi->output_pkt_list);
+}
+
+static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
+ switch (bsize) {
+ case BLOCK_8X8: return aom_mse8x8;
+ case BLOCK_16X8: return aom_mse16x8;
+ case BLOCK_8X16: return aom_mse8x16;
+ default: return aom_mse16x16;
+ }
+}
+
+static unsigned int get_prediction_error(BLOCK_SIZE bsize,
+ const struct buf_2d *src,
+ const struct buf_2d *ref) {
+ unsigned int sse;
+ const aom_variance_fn_t fn = get_block_variance_fn(bsize);
+ fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
+ return sse;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+ int bd) {
+ switch (bd) {
+ default:
+ switch (bsize) {
+ case BLOCK_8X8: return aom_highbd_8_mse8x8;
+ case BLOCK_16X8: return aom_highbd_8_mse16x8;
+ case BLOCK_8X16: return aom_highbd_8_mse8x16;
+ default: return aom_highbd_8_mse16x16;
+ }
+ case 10:
+ switch (bsize) {
+ case BLOCK_8X8: return aom_highbd_10_mse8x8;
+ case BLOCK_16X8: return aom_highbd_10_mse16x8;
+ case BLOCK_8X16: return aom_highbd_10_mse8x16;
+ default: return aom_highbd_10_mse16x16;
+ }
+ case 12:
+ switch (bsize) {
+ case BLOCK_8X8: return aom_highbd_12_mse8x8;
+ case BLOCK_16X8: return aom_highbd_12_mse16x8;
+ case BLOCK_8X16: return aom_highbd_12_mse8x16;
+ default: return aom_highbd_12_mse16x16;
+ }
+ }
+}
+
+static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
+ const struct buf_2d *src,
+ const struct buf_2d *ref,
+ int bd) {
+ unsigned int sse;
+ const aom_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
+ fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
+ return sse;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Refine the motion search range according to the frame dimensions for the
+// first pass.
+static int get_search_range(int width, int height) {
+ int sr = 0;
+ const int dim = AOMMIN(width, height);
+
+ while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr;
+ return sr;
+}
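+
+// Worked example (assuming, for illustration, MAX_FULL_PEL_VAL = 2047): a
+// 640x480 frame gives dim = 480, and the loop increments sr until
+// 480 << sr is no longer below 2047, giving sr = 3
+// (480 -> 960 -> 1920 -> 3840).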
+
+static AOM_INLINE const search_site_config *
+av1_get_first_pass_search_site_config(const AV1_COMP *cpi, MACROBLOCK *x,
+ SEARCH_METHODS search_method) {
+ const int ref_stride = x->e_mbd.plane[0].pre[0].stride;
+
+  // For AVIF applications, even the source frames can have changing
+  // resolution, so we need to manually check the strides :(
+ // AV1_COMP::mv_search_params.search_site_config is a compressor level cache
+ // that's shared by multiple threads. In most cases where all frames have the
+ // same resolution, the cache contains the search site config that we need.
+ const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params;
+ if (ref_stride == mv_search_params->search_site_cfg[SS_CFG_FPF]->stride) {
+ return mv_search_params->search_site_cfg[SS_CFG_FPF];
+ }
+
+ // If the cache does not contain the correct stride, then we will need to rely
+ // on the thread level config MACROBLOCK::search_site_cfg_buf. If even the
+ // thread level config doesn't match, then we need to update it.
+ search_method = search_method_lookup[search_method];
+ assert(search_method_lookup[search_method] == search_method &&
+ "The search_method_lookup table should be idempotent.");
+ if (ref_stride != x->search_site_cfg_buf[search_method].stride) {
+ av1_refresh_search_site_config(x->search_site_cfg_buf, search_method,
+ ref_stride);
+ }
+
+ return x->search_site_cfg_buf;
+}
+
+static AOM_INLINE void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
+ const MV *ref_mv,
+ FULLPEL_MV *best_mv,
+ int *best_motion_err) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv);
+ int tmp_err;
+ const BLOCK_SIZE bsize = xd->mi[0]->bsize;
+ const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
+ const int sr = get_search_range(cm->width, cm->height);
+ const int step_param = cpi->sf.fp_sf.reduce_mv_step_param + sr;
+
+ const search_site_config *first_pass_search_sites =
+ av1_get_first_pass_search_site_config(cpi, x, NSTEP);
+ const int fine_search_interval =
+ cpi->is_screen_content_type && cm->features.allow_intrabc;
+ FULLPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv,
+ start_mv, first_pass_search_sites, NSTEP,
+ fine_search_interval);
+
+ FULLPEL_MV this_best_mv;
+ FULLPEL_MV_STATS best_mv_stats;
+ tmp_err = av1_full_pixel_search(start_mv, &ms_params, step_param, NULL,
+ &this_best_mv, &best_mv_stats, NULL);
+
+ if (tmp_err < INT_MAX) {
+ aom_variance_fn_ptr_t v_fn_ptr = cpi->ppi->fn_ptr[bsize];
+ const MSBuffers *ms_buffers = &ms_params.ms_buffers;
+ tmp_err = av1_get_mvpred_sse(&ms_params.mv_cost_params, this_best_mv,
+ &v_fn_ptr, ms_buffers->src, ms_buffers->ref) +
+ new_mv_mode_penalty;
+ }
+
+ if (tmp_err < *best_motion_err) {
+ *best_motion_err = tmp_err;
+ *best_mv = this_best_mv;
+ }
+}
+
+static BLOCK_SIZE get_bsize(const CommonModeInfoParams *const mi_params,
+ const BLOCK_SIZE fp_block_size, const int unit_row,
+ const int unit_col) {
+ const int unit_width = mi_size_wide[fp_block_size];
+ const int unit_height = mi_size_high[fp_block_size];
+ const int is_half_width =
+ unit_width * unit_col + unit_width / 2 >= mi_params->mi_cols;
+ const int is_half_height =
+ unit_height * unit_row + unit_height / 2 >= mi_params->mi_rows;
+ const int max_dimension =
+ AOMMAX(block_size_wide[fp_block_size], block_size_high[fp_block_size]);
+ int square_block_size = 0;
+ // 4X4, 8X8, 16X16, 32X32, 64X64, 128X128
+ switch (max_dimension) {
+ case 4: square_block_size = 0; break;
+ case 8: square_block_size = 1; break;
+ case 16: square_block_size = 2; break;
+ case 32: square_block_size = 3; break;
+ case 64: square_block_size = 4; break;
+ case 128: square_block_size = 5; break;
+ default: assert(0 && "First pass block size is not supported!"); break;
+ }
+ if (is_half_width && is_half_height) {
+ return subsize_lookup[PARTITION_SPLIT][square_block_size];
+ } else if (is_half_width) {
+ return subsize_lookup[PARTITION_VERT][square_block_size];
+ } else if (is_half_height) {
+ return subsize_lookup[PARTITION_HORZ][square_block_size];
+ } else {
+ return fp_block_size;
+ }
+}
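+
+// Boundary example (illustrative): for a 16x16 first-pass unit whose right
+// half falls outside the frame, is_half_width is set and the lookup returns
+// the PARTITION_VERT subsize, an 8x16 block; a corner unit that is
+// half-sized in both dimensions maps to the PARTITION_SPLIT subsize, 8x8.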
+
+static int find_fp_qindex(aom_bit_depth_t bit_depth) {
+ return av1_find_qindex(FIRST_PASS_Q, bit_depth, 0, QINDEX_RANGE - 1);
+}
+
+static double raw_motion_error_stdev(int *raw_motion_err_list,
+ int raw_motion_err_counts) {
+ int64_t sum_raw_err = 0;
+ double raw_err_avg = 0;
+ double raw_err_stdev = 0;
+ if (raw_motion_err_counts == 0) return 0;
+
+ int i;
+ for (i = 0; i < raw_motion_err_counts; i++) {
+ sum_raw_err += raw_motion_err_list[i];
+ }
+ raw_err_avg = (double)sum_raw_err / raw_motion_err_counts;
+ for (i = 0; i < raw_motion_err_counts; i++) {
+ raw_err_stdev += (raw_motion_err_list[i] - raw_err_avg) *
+ (raw_motion_err_list[i] - raw_err_avg);
+ }
+  // Calculate the standard deviation of the motion error over all the inter
+  // blocks with 0,0 motion, using the last source frame as the reference.
+ raw_err_stdev = sqrt(raw_err_stdev / raw_motion_err_counts);
+ return raw_err_stdev;
+}
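+
+// The helper above computes the population standard deviation,
+// sqrt(sum_i (x_i - mean)^2 / N), of the raw 0,0-motion errors gathered
+// across the frame's inter blocks.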
+
+static AOM_INLINE int calc_wavelet_energy(const AV1EncoderConfig *oxcf) {
+ return oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL;
+}
+typedef struct intra_pred_block_pass1_args {
+ const SequenceHeader *seq_params;
+ MACROBLOCK *x;
+} intra_pred_block_pass1_args;
+
+static INLINE void copy_rect(uint8_t *dst, int dstride, const uint8_t *src,
+ int sstride, int width, int height, int use_hbd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_hbd) {
+ aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src), sstride,
+ CONVERT_TO_SHORTPTR(dst), dstride, width, height);
+ } else {
+ aom_convolve_copy(src, sstride, dst, dstride, width, height);
+ }
+#else
+ (void)use_hbd;
+ aom_convolve_copy(src, sstride, dst, dstride, width, height);
+#endif
+}
+
+static void first_pass_intra_pred_and_calc_diff(int plane, int block,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ (void)block;
+ struct intra_pred_block_pass1_args *const args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+ MACROBLOCK_PLANE *const p = &x->plane[plane];
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const SequenceHeader *seq_params = args->seq_params;
+ const int src_stride = p->src.stride;
+ uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2];
+
+ av1_predict_intra_block(
+ xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width,
+ pd->height, tx_size, mbmi->mode, 0, 0, FILTER_INTRA_MODES, src,
+ src_stride, dst, dst_stride, blk_col, blk_row, plane);
+
+ av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+}
+
+static void first_pass_predict_intra_block_for_luma_plane(
+ const SequenceHeader *seq_params, MACROBLOCK *x, BLOCK_SIZE bsize) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int plane = AOM_PLANE_Y;
+ const MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst = pd->dst.buf;
+ const MACROBLOCK_PLANE *const p = &x->plane[plane];
+ const int src_stride = p->src.stride;
+ const uint8_t *src = p->src.buf;
+
+ intra_pred_block_pass1_args args = { seq_params, x };
+ av1_foreach_transformed_block_in_plane(
+ xd, plane_bsize, plane, first_pass_intra_pred_and_calc_diff, &args);
+
+  // Copy the source data to the recon buffer, as the recon buffer will be
+  // used as a reference frame subsequently.
+ copy_rect(dst, dst_stride, src, src_stride, block_size_wide[bsize],
+ block_size_high[bsize], seq_params->use_highbitdepth);
+}
+
+#define UL_INTRA_THRESH 50
+#define INVALID_ROW -1
+// Computes and returns the intra pred error of a block.
+// Intra pred error: the sum of squared errors of the intra-predicted
+// residual.
+// Inputs:
+//   cpi: the encoder settings; only a few parameters in it are used.
+//   this_frame: the current frame buffer.
+//   tile: tile information (not used in first pass; already initialized to
+//     zero).
+// unit_row: row index in the unit of first pass block size.
+// unit_col: column index in the unit of first pass block size.
+// y_offset: the offset of y frame buffer, indicating the starting point of
+// the current block.
+// uv_offset: the offset of u and v frame buffer, indicating the starting
+// point of the current block.
+// fp_block_size: first pass block size.
+// qindex: quantization step size to encode the frame.
+// stats: frame encoding stats.
+// Modifies:
+// stats->intra_skip_count
+// stats->image_data_start_row
+// stats->intra_factor
+// stats->brightness_factor
+// stats->intra_error
+// stats->frame_avg_wavelet_energy
+// Returns:
+// this_intra_error.
+static int firstpass_intra_prediction(
+ AV1_COMP *cpi, ThreadData *td, YV12_BUFFER_CONFIG *const this_frame,
+ const TileInfo *const tile, const int unit_row, const int unit_col,
+ const int y_offset, const int uv_offset, const BLOCK_SIZE fp_block_size,
+ const int qindex, FRAME_STATS *const stats) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int unit_scale = mi_size_wide[fp_block_size];
+ const int num_planes = av1_num_planes(cm);
+ const BLOCK_SIZE bsize =
+ get_bsize(mi_params, fp_block_size, unit_row, unit_col);
+
+ set_mi_offsets(mi_params, xd, unit_row * unit_scale, unit_col * unit_scale);
+ xd->plane[0].dst.buf = this_frame->y_buffer + y_offset;
+ if (num_planes > 1) {
+ xd->plane[1].dst.buf = this_frame->u_buffer + uv_offset;
+ xd->plane[2].dst.buf = this_frame->v_buffer + uv_offset;
+ }
+ xd->left_available = (unit_col != 0);
+ xd->mi[0]->bsize = bsize;
+ xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+ set_mi_row_col(xd, tile, unit_row * unit_scale, mi_size_high[bsize],
+ unit_col * unit_scale, mi_size_wide[bsize], mi_params->mi_rows,
+ mi_params->mi_cols);
+ set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes);
+ xd->mi[0]->segment_id = 0;
+ xd->lossless[xd->mi[0]->segment_id] = (qindex == 0);
+ xd->mi[0]->mode = DC_PRED;
+ xd->mi[0]->tx_size = TX_4X4;
+
+ if (cpi->sf.fp_sf.disable_recon)
+ first_pass_predict_intra_block_for_luma_plane(seq_params, x, bsize);
+ else
+ av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0);
+ int this_intra_error = aom_get_mb_ss(x->plane[0].src_diff);
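+  // Note: SSE scales with the square of the sample scale. A b-bit sample is
+  // (1 << (b - 8)) times its 8-bit equivalent, so the squared error grows by
+  // (1 << (2 * (b - 8))): 16x for 10-bit (hence >> 4) and 256x for 12-bit
+  // (hence >> 8). The shifts below renormalize to the 8-bit scale.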
+ if (seq_params->use_highbitdepth) {
+ switch (seq_params->bit_depth) {
+ case AOM_BITS_8: break;
+ case AOM_BITS_10: this_intra_error >>= 4; break;
+ case AOM_BITS_12: this_intra_error >>= 8; break;
+ default:
+ assert(0 &&
+ "seq_params->bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+ }
+
+ if (this_intra_error < UL_INTRA_THRESH) {
+ ++stats->intra_skip_count;
+ } else if ((unit_col > 0) && (stats->image_data_start_row == INVALID_ROW)) {
+ stats->image_data_start_row = unit_row;
+ }
+
+ double log_intra = log1p(this_intra_error);
+ if (log_intra < 10.0) {
+ stats->intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
+ } else {
+ stats->intra_factor += 1.0;
+ }
+
+ int level_sample;
+ if (seq_params->use_highbitdepth) {
+ level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+ } else {
+ level_sample = x->plane[0].src.buf[0];
+ }
+
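+  // The sample value itself is linear in bit depth, so renormalizing it to
+  // the 8-bit range only needs a shift by (b - 8) bits: 2 for 10-bit, 4 for
+  // 12-bit (contrast with the squared-error shifts above).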
+ if (seq_params->use_highbitdepth) {
+ switch (seq_params->bit_depth) {
+ case AOM_BITS_8: break;
+ case AOM_BITS_10: level_sample >>= 2; break;
+ case AOM_BITS_12: level_sample >>= 4; break;
+ default:
+ assert(0 &&
+ "seq_params->bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+ }
+ if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) {
+ stats->brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
+ } else {
+ stats->brightness_factor += 1.0;
+ }
+
+ // Intrapenalty below deals with situations where the intra and inter
+ // error scores are very low (e.g. a plain black frame).
+ // We do not have special cases in first pass for 0,0 and nearest etc so
+ // all inter modes carry an overhead cost estimate for the mv.
+ // When the error score is very low this causes us to pick all or lots of
+ // INTRA modes and throw lots of key frames.
+ // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+ this_intra_error += INTRA_MODE_PENALTY;
+
+ // Accumulate the intra error.
+ stats->intra_error += (int64_t)this_intra_error;
+
+ // Stats based on wavelet energy is used in the following cases :
+ // 1. ML model which predicts if a flat structure (golden-frame only structure
+ // without ALT-REF and Internal-ARFs) is better. This ML model is enabled in
+ // constant quality mode under certain conditions.
+ // 2. Delta qindex mode is set as DELTA_Q_PERCEPTUAL.
+ // Thus, wavelet energy calculation is enabled for the above cases.
+ if (calc_wavelet_energy(&cpi->oxcf)) {
+ const int hbd = is_cur_buf_hbd(xd);
+ const int stride = x->plane[0].src.stride;
+ const int num_8x8_rows = block_size_high[fp_block_size] / 8;
+ const int num_8x8_cols = block_size_wide[fp_block_size] / 8;
+ const uint8_t *buf = x->plane[0].src.buf;
+ stats->frame_avg_wavelet_energy += av1_haar_ac_sad_mxn_uint8_input(
+ buf, stride, hbd, num_8x8_rows, num_8x8_cols);
+ } else {
+ stats->frame_avg_wavelet_energy = INVALID_FP_STATS_TO_PREDICT_FLAT_GOP;
+ }
+
+ return this_intra_error;
+}
+
+// Returns the sum of squared error between the source and reference blocks.
+static int get_prediction_error_bitdepth(const int is_high_bitdepth,
+ const int bitdepth,
+ const BLOCK_SIZE block_size,
+ const struct buf_2d *src,
+ const struct buf_2d *ref) {
+ (void)is_high_bitdepth;
+ (void)bitdepth;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_high_bitdepth) {
+ return highbd_get_prediction_error(block_size, src, ref, bitdepth);
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ return get_prediction_error(block_size, src, ref);
+}
+
+// Accumulates motion vector stats.
+// Modifies member variables of "stats".
+static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv,
+ const int mb_row, const int mb_col,
+ const int mb_rows, const int mb_cols,
+ MV *last_non_zero_mv, FRAME_STATS *stats) {
+ if (is_zero_mv(&best_mv)) return;
+
+ ++stats->mv_count;
+  // Non-zero vector: was it different from the last non-zero vector?
+ if (!is_equal_mv(&best_mv, last_non_zero_mv)) ++stats->new_mv_count;
+ *last_non_zero_mv = best_mv;
+
+ // Does the row vector point inwards or outwards?
+ if (mb_row < mb_rows / 2) {
+ if (mv.row > 0) {
+ --stats->sum_in_vectors;
+ } else if (mv.row < 0) {
+ ++stats->sum_in_vectors;
+ }
+ } else if (mb_row > mb_rows / 2) {
+ if (mv.row > 0) {
+ ++stats->sum_in_vectors;
+ } else if (mv.row < 0) {
+ --stats->sum_in_vectors;
+ }
+ }
+
+ // Does the col vector point inwards or outwards?
+ if (mb_col < mb_cols / 2) {
+ if (mv.col > 0) {
+ --stats->sum_in_vectors;
+ } else if (mv.col < 0) {
+ ++stats->sum_in_vectors;
+ }
+ } else if (mb_col > mb_cols / 2) {
+ if (mv.col > 0) {
+ ++stats->sum_in_vectors;
+ } else if (mv.col < 0) {
+ --stats->sum_in_vectors;
+ }
+ }
+}
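+
+// Note on the scheme above: each non-zero MV casts one row vote and one
+// column vote of +/-1 into sum_in_vectors (blocks exactly on the center row
+// or column abstain), so a single MV contributes at most +/-2. This is why
+// update_firstpass_stats computes
+// mv_in_out_count = sum_in_vectors / (mv_count * 2).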
+
+// Computes and returns the inter prediction error from the last frame.
+// Computes inter prediction errors from the golden and alt ref frames and
+// updates stats accordingly.
+// Inputs:
+// cpi: the encoder setting. Only a few params in it will be used.
+// last_frame: the frame buffer of the last frame.
+// golden_frame: the frame buffer of the golden frame.
+// unit_row: row index in the unit of first pass block size.
+// unit_col: column index in the unit of first pass block size.
+// recon_yoffset: the y offset of the reconstructed frame buffer,
+// indicating the starting point of the current block.
+//   recon_uvoffset: the u/v offset of the reconstructed frame buffer,
+//                   indicating the starting point of the current block.
+// src_yoffset: the y offset of the source frame buffer.
+// fp_block_size: first pass block size.
+// this_intra_error: the intra prediction error of this block.
+//   raw_motion_err_counts: the number of raw motion errors recorded so far
+//                          (the write index into raw_motion_err_list).
+// raw_motion_err_list: the array that records the raw motion error.
+// ref_mv: the reference used to start the motion search
+// best_mv: the best mv found
+// last_non_zero_mv: the last non zero mv found in this tile row.
+// stats: frame encoding stats.
+// Modifies:
+// raw_motion_err_list
+//   best_mv
+//   last_non_zero_mv
+//   stats: many of its member fields.
+// Returns:
+// this_inter_error
+static int firstpass_inter_prediction(
+ AV1_COMP *cpi, ThreadData *td, const YV12_BUFFER_CONFIG *const last_frame,
+ const YV12_BUFFER_CONFIG *const golden_frame, const int unit_row,
+ const int unit_col, const int recon_yoffset, const int recon_uvoffset,
+ const int src_yoffset, const BLOCK_SIZE fp_block_size,
+ const int this_intra_error, const int raw_motion_err_counts,
+ int *raw_motion_err_list, const MV ref_mv, MV *best_mv,
+ MV *last_non_zero_mv, FRAME_STATS *stats) {
+ int this_inter_error = this_intra_error;
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int is_high_bitdepth = is_cur_buf_hbd(xd);
+ const int bitdepth = xd->bd;
+ const int unit_scale = mi_size_wide[fp_block_size];
+ const BLOCK_SIZE bsize =
+ get_bsize(mi_params, fp_block_size, unit_row, unit_col);
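+  // First pass blocks are square, so block_size_wide[] also gives the height.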
+ const int fp_block_size_height = block_size_wide[fp_block_size];
+ const int unit_width = mi_size_wide[fp_block_size];
+ const int unit_rows = get_unit_rows(fp_block_size, mi_params->mb_rows);
+ const int unit_cols = get_unit_cols(fp_block_size, mi_params->mb_cols);
+ // Assume 0,0 motion with no mv overhead.
+ FULLPEL_MV mv = kZeroFullMv;
+ xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset;
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ av1_set_mv_col_limits(mi_params, &x->mv_limits, unit_col * unit_width,
+ fp_block_size_height >> MI_SIZE_LOG2,
+ cpi->oxcf.border_in_pixels);
+
+ int motion_error =
+ get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
+ &x->plane[0].src, &xd->plane[0].pre[0]);
+
+ // Compute the motion error of the 0,0 motion using the last source
+ // frame as the reference. Skip the further motion search on
+ // reconstructed frame if this error is small.
+  // TODO(chiyotsai): The unscaled last source might have different dimensions
+  // than the current source. See BUG=aomedia:3413
+ struct buf_2d unscaled_last_source_buf_2d;
+ unscaled_last_source_buf_2d.buf =
+ cpi->unscaled_last_source->y_buffer + src_yoffset;
+ unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride;
+ const int raw_motion_error = get_prediction_error_bitdepth(
+ is_high_bitdepth, bitdepth, bsize, &x->plane[0].src,
+ &unscaled_last_source_buf_2d);
+ raw_motion_err_list[raw_motion_err_counts] = raw_motion_error;
+ const FIRST_PASS_SPEED_FEATURES *const fp_sf = &cpi->sf.fp_sf;
+
+ if (raw_motion_error > fp_sf->skip_motion_search_threshold) {
+ // Test last reference frame using the previous best mv as the
+ // starting point (best reference) for the search.
+ first_pass_motion_search(cpi, x, &ref_mv, &mv, &motion_error);
+
+ // If the current best reference mv is not centered on 0,0 then do a
+ // 0,0 based search as well.
+ if ((fp_sf->skip_zeromv_motion_search == 0) && !is_zero_mv(&ref_mv)) {
+ FULLPEL_MV tmp_mv = kZeroFullMv;
+ int tmp_err = INT_MAX;
+ first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err);
+
+ if (tmp_err < motion_error) {
+ motion_error = tmp_err;
+ mv = tmp_mv;
+ }
+ }
+ }
+
+ // Motion search in 2nd reference frame.
+ int gf_motion_error = motion_error;
+ if ((current_frame->frame_number > 1) && golden_frame != NULL) {
+ FULLPEL_MV tmp_mv = kZeroFullMv;
+ // Assume 0,0 motion with no mv overhead.
+ av1_setup_pre_planes(xd, 0, golden_frame, 0, 0, NULL, 1);
+ xd->plane[0].pre[0].buf += recon_yoffset;
+ gf_motion_error =
+ get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
+ &x->plane[0].src, &xd->plane[0].pre[0]);
+ first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &gf_motion_error);
+ }
+ if (gf_motion_error < motion_error && gf_motion_error < this_intra_error) {
+ ++stats->second_ref_count;
+ }
+  // When accumulating a score for the 2nd reference frame, take the
+  // best of the motion predicted score and the intra coded error
+  // (just as is done when accumulating "coded_error" for the last
+  // frame).
+ if ((current_frame->frame_number > 1) && golden_frame != NULL) {
+ stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error);
+ } else {
+ // TODO(chengchen): I believe logically this should also be changed to
+ // stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error).
+ stats->sr_coded_error += motion_error;
+ }
+
+ // Reset to last frame as reference buffer.
+ xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset;
+ if (av1_num_planes(&cpi->common) > 1) {
+ xd->plane[1].pre[0].buf = last_frame->u_buffer + recon_uvoffset;
+ xd->plane[2].pre[0].buf = last_frame->v_buffer + recon_uvoffset;
+ }
+
+ // Start by assuming that intra mode is best.
+ *best_mv = kZeroMv;
+
+ if (motion_error <= this_intra_error) {
+ // Keep a count of cases where the inter and intra were very close
+ // and very low. This helps with scene cut detection for example in
+ // cropped clips with black bars at the sides or top and bottom.
+ if (((this_intra_error - INTRA_MODE_PENALTY) * 9 <= motion_error * 10) &&
+ (this_intra_error < (2 * INTRA_MODE_PENALTY))) {
+ stats->neutral_count += 1.0;
+ // Also track cases where the intra is not much worse than the inter
+ // and use this in limiting the GF/arf group length.
+ } else if ((this_intra_error > NCOUNT_INTRA_THRESH) &&
+ (this_intra_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+ stats->neutral_count +=
+ (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_intra_error);
+ }
+
+ *best_mv = get_mv_from_fullmv(&mv);
+ this_inter_error = motion_error;
+ xd->mi[0]->mode = NEWMV;
+ xd->mi[0]->mv[0].as_mv = *best_mv;
+ xd->mi[0]->tx_size = TX_4X4;
+ xd->mi[0]->ref_frame[0] = LAST_FRAME;
+ xd->mi[0]->ref_frame[1] = NONE_FRAME;
+
+ if (fp_sf->disable_recon == 0) {
+ av1_enc_build_inter_predictor(cm, xd, unit_row * unit_scale,
+ unit_col * unit_scale, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ av1_encode_sby_pass1(cpi, x, bsize);
+ }
+ stats->sum_mvr += best_mv->row;
+ stats->sum_mvr_abs += abs(best_mv->row);
+ stats->sum_mvc += best_mv->col;
+ stats->sum_mvc_abs += abs(best_mv->col);
+ stats->sum_mvrs += best_mv->row * best_mv->row;
+ stats->sum_mvcs += best_mv->col * best_mv->col;
+ ++stats->inter_count;
+
+ accumulate_mv_stats(*best_mv, mv, unit_row, unit_col, unit_rows, unit_cols,
+ last_non_zero_mv, stats);
+ }
+
+ return this_inter_error;
+}
+
+// Normalize the first pass stats.
+// Error / counters are normalized to each MB.
+// MVs are normalized to the width/height of the frame.
+static void normalize_firstpass_stats(FIRSTPASS_STATS *fps,
+ double num_mbs_16x16, double f_w,
+ double f_h) {
+ fps->coded_error /= num_mbs_16x16;
+ fps->sr_coded_error /= num_mbs_16x16;
+ fps->intra_error /= num_mbs_16x16;
+ fps->frame_avg_wavelet_energy /= num_mbs_16x16;
+ fps->log_coded_error = log1p(fps->coded_error);
+ fps->log_intra_error = log1p(fps->intra_error);
+ fps->MVr /= f_h;
+ fps->mvr_abs /= f_h;
+ fps->MVc /= f_w;
+ fps->mvc_abs /= f_w;
+ fps->MVrv /= (f_h * f_h);
+ fps->MVcv /= (f_w * f_w);
+ fps->new_mv_count /= num_mbs_16x16;
+}
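+
+// After normalization, e.g. fps->MVr is expressed as a fraction of the frame
+// height and fps->MVrv as a fraction of the squared frame height, which makes
+// the stats comparable across resolutions.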
+
+// Updates the first pass stats of this frame.
+// Input:
+// cpi: the encoder setting. Only a few params in it will be used.
+// stats: stats accumulated for this frame.
+//   raw_err_stdev: the standard deviation of the (0,0) motion error over
+//                  all the inter blocks, using the last source frame as
+//                  the reference.
+// frame_number: current frame number.
+// ts_duration: Duration of the frame / collection of frames.
+// Updates:
+// twopass->total_stats: the accumulated stats.
+// twopass->stats_buf_ctx->stats_in_end: the pointer to the current stats,
+// update its value and its position
+// in the buffer.
+static void update_firstpass_stats(AV1_COMP *cpi,
+ const FRAME_STATS *const stats,
+ const double raw_err_stdev,
+ const int frame_number,
+ const int64_t ts_duration,
+ const BLOCK_SIZE fp_block_size) {
+ TWO_PASS *twopass = &cpi->ppi->twopass;
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end;
+ FIRSTPASS_STATS fps;
+  // The minimum error here ensures some bit allocation to frames even
+ // in static regions. The allocation per MB declines for larger formats
+ // where the typical "real" energy per MB also falls.
+ // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+ // number of mbs is proportional to the image area.
+ const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : mi_params->MBs;
+  // Number of actual units used in the first pass; the unit can be a square
+  // block size other than 16X16.
+ const int num_mbs = get_num_mbs(fp_block_size, num_mbs_16X16);
+ const double min_err = 200 * sqrt(num_mbs);
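+  // For scale: at 1080p with 16x16 units, num_mbs is roughly 8100, giving a
+  // floor of about 200 * sqrt(8100) = 18000 in the same (>> 8)-scaled error
+  // domain as the accumulated errors below.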
+
+ fps.weight = stats->intra_factor * stats->brightness_factor;
+ fps.frame = frame_number;
+ fps.coded_error = (double)(stats->coded_error >> 8) + min_err;
+ fps.sr_coded_error = (double)(stats->sr_coded_error >> 8) + min_err;
+ fps.intra_error = (double)(stats->intra_error >> 8) + min_err;
+ fps.frame_avg_wavelet_energy = (double)stats->frame_avg_wavelet_energy;
+ fps.count = 1.0;
+ fps.pcnt_inter = (double)stats->inter_count / num_mbs;
+ fps.pcnt_second_ref = (double)stats->second_ref_count / num_mbs;
+ fps.pcnt_neutral = (double)stats->neutral_count / num_mbs;
+ fps.intra_skip_pct = (double)stats->intra_skip_count / num_mbs;
+ fps.inactive_zone_rows = (double)stats->image_data_start_row;
+ fps.inactive_zone_cols = 0.0; // Placeholder: not currently supported.
+ fps.raw_error_stdev = raw_err_stdev;
+ fps.is_flash = 0;
+ fps.noise_var = 0.0;
+ fps.cor_coeff = 1.0;
+ fps.log_coded_error = 0.0;
+ fps.log_intra_error = 0.0;
+
+ if (stats->mv_count > 0) {
+ fps.MVr = (double)stats->sum_mvr / stats->mv_count;
+ fps.mvr_abs = (double)stats->sum_mvr_abs / stats->mv_count;
+ fps.MVc = (double)stats->sum_mvc / stats->mv_count;
+ fps.mvc_abs = (double)stats->sum_mvc_abs / stats->mv_count;
+ fps.MVrv = ((double)stats->sum_mvrs -
+ ((double)stats->sum_mvr * stats->sum_mvr / stats->mv_count)) /
+ stats->mv_count;
+ fps.MVcv = ((double)stats->sum_mvcs -
+ ((double)stats->sum_mvc * stats->sum_mvc / stats->mv_count)) /
+ stats->mv_count;
+ fps.mv_in_out_count = (double)stats->sum_in_vectors / (stats->mv_count * 2);
+ fps.new_mv_count = stats->new_mv_count;
+ fps.pcnt_motion = (double)stats->mv_count / num_mbs;
+ } else {
+ fps.MVr = 0.0;
+ fps.mvr_abs = 0.0;
+ fps.MVc = 0.0;
+ fps.mvc_abs = 0.0;
+ fps.MVrv = 0.0;
+ fps.MVcv = 0.0;
+ fps.mv_in_out_count = 0.0;
+ fps.new_mv_count = 0.0;
+ fps.pcnt_motion = 0.0;
+ }
+
+ // TODO(paulwilkins): Handle the case when duration is set to 0, or
+ // something less than the full time between subsequent values of
+ // cpi->source_time_stamp.
+ fps.duration = (double)ts_duration;
+
+ normalize_firstpass_stats(&fps, num_mbs_16X16, cm->width, cm->height);
+
+ // We will store the stats inside the persistent twopass struct (and NOT the
+ // local variable 'fps'), and then cpi->output_pkt_list will point to it.
+ *this_frame_stats = fps;
+ if (!cpi->ppi->lap_enabled) {
+ output_stats(this_frame_stats, cpi->ppi->output_pkt_list);
+ } else {
+ av1_firstpass_info_push(&twopass->firstpass_info, this_frame_stats);
+ }
+ if (cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL) {
+ av1_accumulate_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats, &fps);
+ }
+ twopass->stats_buf_ctx->stats_in_end++;
+  // When ducky encode is on, we always use a linear buffer for stats_buf_ctx.
+  if (cpi->use_ducky_encode == 0) {
+    // TODO(angiebird): Figure out why the first pass uses a circular buffer.
+    /* In the case of two pass encoding, the first pass uses it as a circular
+     * buffer; when LAP is enabled it is used as a linear buffer. */
+ if ((cpi->oxcf.pass == AOM_RC_FIRST_PASS) &&
+ (twopass->stats_buf_ctx->stats_in_end >=
+ twopass->stats_buf_ctx->stats_in_buf_end)) {
+ twopass->stats_buf_ctx->stats_in_end =
+ twopass->stats_buf_ctx->stats_in_start;
+ }
+ }
+}
+
+static void print_reconstruction_frame(
+ const YV12_BUFFER_CONFIG *const last_frame, int frame_number,
+ int do_print) {
+ if (!do_print) return;
+
+ char filename[512];
+ FILE *recon_file;
+ snprintf(filename, sizeof(filename), "enc%04d.yuv", frame_number);
+
+ if (frame_number == 0) {
+ recon_file = fopen(filename, "wb");
+ } else {
+ recon_file = fopen(filename, "ab");
+ }
+
+  // Guard against fopen() failure before writing.
+  if (recon_file != NULL) {
+    fwrite(last_frame->buffer_alloc, last_frame->frame_size, 1, recon_file);
+    fclose(recon_file);
+  }
+}
+
+static FRAME_STATS accumulate_frame_stats(FRAME_STATS *mb_stats, int mb_rows,
+ int mb_cols) {
+ FRAME_STATS stats = { 0 };
+ int i, j;
+
+ stats.image_data_start_row = INVALID_ROW;
+ for (j = 0; j < mb_rows; j++) {
+ for (i = 0; i < mb_cols; i++) {
+ FRAME_STATS mb_stat = mb_stats[j * mb_cols + i];
+ stats.brightness_factor += mb_stat.brightness_factor;
+ stats.coded_error += mb_stat.coded_error;
+ stats.frame_avg_wavelet_energy += mb_stat.frame_avg_wavelet_energy;
+ if (stats.image_data_start_row == INVALID_ROW &&
+ mb_stat.image_data_start_row != INVALID_ROW) {
+ stats.image_data_start_row = mb_stat.image_data_start_row;
+ }
+ stats.inter_count += mb_stat.inter_count;
+ stats.intra_error += mb_stat.intra_error;
+ stats.intra_factor += mb_stat.intra_factor;
+ stats.intra_skip_count += mb_stat.intra_skip_count;
+ stats.mv_count += mb_stat.mv_count;
+ stats.neutral_count += mb_stat.neutral_count;
+ stats.new_mv_count += mb_stat.new_mv_count;
+ stats.second_ref_count += mb_stat.second_ref_count;
+ stats.sr_coded_error += mb_stat.sr_coded_error;
+ stats.sum_in_vectors += mb_stat.sum_in_vectors;
+ stats.sum_mvc += mb_stat.sum_mvc;
+ stats.sum_mvc_abs += mb_stat.sum_mvc_abs;
+ stats.sum_mvcs += mb_stat.sum_mvcs;
+ stats.sum_mvr += mb_stat.sum_mvr;
+ stats.sum_mvr_abs += mb_stat.sum_mvr_abs;
+ stats.sum_mvrs += mb_stat.sum_mvrs;
+ }
+ }
+ return stats;
+}
+
+static void setup_firstpass_data(AV1_COMMON *const cm,
+ FirstPassData *firstpass_data,
+ const int unit_rows, const int unit_cols) {
+ CHECK_MEM_ERROR(cm, firstpass_data->raw_motion_err_list,
+ aom_calloc(unit_rows * unit_cols,
+ sizeof(*firstpass_data->raw_motion_err_list)));
+ CHECK_MEM_ERROR(
+ cm, firstpass_data->mb_stats,
+ aom_calloc(unit_rows * unit_cols, sizeof(*firstpass_data->mb_stats)));
+ for (int j = 0; j < unit_rows; j++) {
+ for (int i = 0; i < unit_cols; i++) {
+ firstpass_data->mb_stats[j * unit_cols + i].image_data_start_row =
+ INVALID_ROW;
+ }
+ }
+}
+
+void av1_free_firstpass_data(FirstPassData *firstpass_data) {
+ aom_free(firstpass_data->raw_motion_err_list);
+ firstpass_data->raw_motion_err_list = NULL;
+ aom_free(firstpass_data->mb_stats);
+ firstpass_data->mb_stats = NULL;
+}
+
+int av1_get_unit_rows_in_tile(const TileInfo *tile,
+ const BLOCK_SIZE fp_block_size) {
+ const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+ const int mi_rows = tile->mi_row_end - tile->mi_row_start;
+ const int unit_rows = CEIL_POWER_OF_TWO(mi_rows, unit_height_log2);
+
+ return unit_rows;
+}
+
+int av1_get_unit_cols_in_tile(const TileInfo *tile,
+ const BLOCK_SIZE fp_block_size) {
+ const int unit_width_log2 = mi_size_wide_log2[fp_block_size];
+ const int mi_cols = tile->mi_col_end - tile->mi_col_start;
+ const int unit_cols = CEIL_POWER_OF_TWO(mi_cols, unit_width_log2);
+
+ return unit_cols;
+}
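+
+// Example: for BLOCK_16X16 first pass units (mi_size_high_log2 ==
+// mi_size_wide_log2 == 2), a tile spanning 17 MI rows yields
+// CEIL_POWER_OF_TWO(17, 2) == ceil(17 / 4) == 5 unit rows; a partial unit at
+// the tile edge counts as a full first-pass unit.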
+
+#define FIRST_PASS_ALT_REF_DISTANCE 16
+static void first_pass_tile(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data,
+ const BLOCK_SIZE fp_block_size) {
+ TileInfo *tile = &tile_data->tile_info;
+ const int unit_height = mi_size_high[fp_block_size];
+ const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+ for (int mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+ mi_row += unit_height) {
+ av1_first_pass_row(cpi, td, tile_data, mi_row >> unit_height_log2,
+ fp_block_size);
+ }
+}
+
+static void first_pass_tiles(AV1_COMP *cpi, const BLOCK_SIZE fp_block_size) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+
+ av1_alloc_src_diff_buf(cm, &cpi->td.mb);
+ for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *const tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ first_pass_tile(cpi, &cpi->td, tile_data, fp_block_size);
+ }
+ }
+}
+
+void av1_first_pass_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+ const int unit_row, const BLOCK_SIZE fp_block_size) {
+ MACROBLOCK *const x = &td->mb;
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TileInfo *tile = &tile_data->tile_info;
+ const int qindex = find_fp_qindex(seq_params->bit_depth);
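+  // First pass blocks are square, so block_size_high[] and block_size_wide[]
+  // are interchangeable in the unit dimensions below.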
+ const int fp_block_size_width = block_size_high[fp_block_size];
+ const int fp_block_size_height = block_size_wide[fp_block_size];
+ const int unit_width = mi_size_wide[fp_block_size];
+ const int unit_width_log2 = mi_size_wide_log2[fp_block_size];
+ const int unit_height_log2 = mi_size_high_log2[fp_block_size];
+ const int unit_cols = mi_params->mb_cols * 4 / unit_width;
+ int raw_motion_err_counts = 0;
+ int unit_row_in_tile = unit_row - (tile->mi_row_start >> unit_height_log2);
+ int unit_col_start = tile->mi_col_start >> unit_width_log2;
+ int unit_cols_in_tile = av1_get_unit_cols_in_tile(tile, fp_block_size);
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+ AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
+
+ const YV12_BUFFER_CONFIG *last_frame =
+ av1_get_scaled_ref_frame(cpi, LAST_FRAME);
+ if (!last_frame) {
+ last_frame = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ }
+ const YV12_BUFFER_CONFIG *golden_frame =
+ av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+ if (!golden_frame) {
+ golden_frame = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+ }
+ YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf;
+
+ PICK_MODE_CONTEXT *ctx = td->firstpass_ctx;
+ FRAME_STATS *mb_stats =
+ cpi->firstpass_data.mb_stats + unit_row * unit_cols + unit_col_start;
+ int *raw_motion_err_list = cpi->firstpass_data.raw_motion_err_list +
+ unit_row * unit_cols + unit_col_start;
+ MV *first_top_mv = &tile_data->firstpass_top_mv;
+
+ for (int i = 0; i < num_planes; ++i) {
+ x->plane[i].coeff = ctx->coeff[i];
+ x->plane[i].qcoeff = ctx->qcoeff[i];
+ x->plane[i].eobs = ctx->eobs[i];
+ x->plane[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ x->plane[i].dqcoeff = ctx->dqcoeff[i];
+ }
+
+ const int src_y_stride = cpi->source->y_stride;
+ const int recon_y_stride = this_frame->y_stride;
+ const int recon_uv_stride = this_frame->uv_stride;
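+  // The chroma unit height is halved when the chroma planes are vertically
+  // subsampled (uv_height < y_height).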
+ const int uv_mb_height =
+ fp_block_size_height >> (this_frame->y_height > this_frame->uv_height);
+
+ MV best_ref_mv = kZeroMv;
+ MV last_mv;
+
+ // Reset above block coeffs.
+ xd->up_available = (unit_row_in_tile != 0);
+ int recon_yoffset = (unit_row * recon_y_stride * fp_block_size_height) +
+ (unit_col_start * fp_block_size_width);
+ int src_yoffset = (unit_row * src_y_stride * fp_block_size_height) +
+ (unit_col_start * fp_block_size_width);
+ int recon_uvoffset = (unit_row * recon_uv_stride * uv_mb_height) +
+ (unit_col_start * uv_mb_height);
+
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ av1_set_mv_row_limits(
+ mi_params, &x->mv_limits, (unit_row << unit_height_log2),
+ (fp_block_size_height >> MI_SIZE_LOG2), cpi->oxcf.border_in_pixels);
+
+ av1_setup_src_planes(x, cpi->source, unit_row << unit_height_log2,
+ tile->mi_col_start, num_planes, fp_block_size);
+
+  // Zero the 16x16 src_diff block first. This ensures a correct
+  // this_intra_error for block sizes smaller than 16x16.
+ av1_zero_array(x->plane[0].src_diff, 256);
+
+ for (int unit_col_in_tile = 0; unit_col_in_tile < unit_cols_in_tile;
+ unit_col_in_tile++) {
+ const int unit_col = unit_col_start + unit_col_in_tile;
+
+ enc_row_mt->sync_read_ptr(row_mt_sync, unit_row_in_tile, unit_col_in_tile);
+
+#if CONFIG_MULTITHREAD
+ if (cpi->ppi->p_mt_info.num_workers > 1) {
+ pthread_mutex_lock(enc_row_mt->mutex_);
+ bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit;
+ pthread_mutex_unlock(enc_row_mt->mutex_);
+ // Exit in case any worker has encountered an error.
+ if (firstpass_mt_exit) return;
+ }
+#endif
+
+ if (unit_col_in_tile == 0) {
+ last_mv = *first_top_mv;
+ }
+ int this_intra_error = firstpass_intra_prediction(
+ cpi, td, this_frame, tile, unit_row, unit_col, recon_yoffset,
+ recon_uvoffset, fp_block_size, qindex, mb_stats);
+
+ if (!frame_is_intra_only(cm)) {
+ const int this_inter_error = firstpass_inter_prediction(
+ cpi, td, last_frame, golden_frame, unit_row, unit_col, recon_yoffset,
+ recon_uvoffset, src_yoffset, fp_block_size, this_intra_error,
+ raw_motion_err_counts, raw_motion_err_list, best_ref_mv, &best_ref_mv,
+ &last_mv, mb_stats);
+ if (unit_col_in_tile == 0) {
+ *first_top_mv = last_mv;
+ }
+ mb_stats->coded_error += this_inter_error;
+ ++raw_motion_err_counts;
+ } else {
+ mb_stats->sr_coded_error += this_intra_error;
+ mb_stats->coded_error += this_intra_error;
+ }
+
+ // Adjust to the next column of MBs.
+ x->plane[0].src.buf += fp_block_size_width;
+ if (num_planes > 1) {
+ x->plane[1].src.buf += uv_mb_height;
+ x->plane[2].src.buf += uv_mb_height;
+ }
+
+ recon_yoffset += fp_block_size_width;
+ src_yoffset += fp_block_size_width;
+ recon_uvoffset += uv_mb_height;
+ mb_stats++;
+
+ enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile, unit_col_in_tile,
+ unit_cols_in_tile);
+ }
+}
+
+void av1_noop_first_pass_frame(AV1_COMP *cpi, const int64_t ts_duration) {
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ int max_mb_rows = mi_params->mb_rows;
+ int max_mb_cols = mi_params->mb_cols;
+ if (cpi->oxcf.frm_dim_cfg.forced_max_frame_width) {
+ int max_mi_cols = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_width);
+ max_mb_cols = ROUND_POWER_OF_TWO(max_mi_cols, 2);
+ }
+ if (cpi->oxcf.frm_dim_cfg.forced_max_frame_height) {
+ int max_mi_rows = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_height);
+ max_mb_rows = ROUND_POWER_OF_TWO(max_mi_rows, 2);
+ }
+ const int unit_rows = get_unit_rows(BLOCK_16X16, max_mb_rows);
+ const int unit_cols = get_unit_cols(BLOCK_16X16, max_mb_cols);
+ setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols);
+ FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats;
+ FRAME_STATS stats = accumulate_frame_stats(mb_stats, unit_rows, unit_cols);
+ av1_free_firstpass_data(&cpi->firstpass_data);
+ update_firstpass_stats(cpi, &stats, 1.0, current_frame->frame_number,
+ ts_duration, BLOCK_16X16);
+}
+
+void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int qindex = find_fp_qindex(seq_params->bit_depth);
+ const int ref_frame_flags_backup = cpi->ref_frame_flags;
+ cpi->ref_frame_flags = av1_ref_frame_flag_list[LAST_FRAME] |
+ av1_ref_frame_flag_list[GOLDEN_FRAME];
+
+ // Detect if the key frame is screen content type.
+ if (frame_is_intra_only(cm)) {
+ FeatureFlags *const features = &cm->features;
+ assert(cpi->source != NULL);
+ xd->cur_buf = cpi->source;
+ av1_set_screen_content_options(cpi, features);
+ }
+
+ // Prepare the speed features
+ av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed);
+
+ // Unit size for the first pass encoding.
+ const BLOCK_SIZE fp_block_size =
+ get_fp_block_size(cpi->is_screen_content_type);
+
+ int max_mb_rows = mi_params->mb_rows;
+ int max_mb_cols = mi_params->mb_cols;
+ if (cpi->oxcf.frm_dim_cfg.forced_max_frame_width) {
+ int max_mi_cols = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_width);
+ max_mb_cols = ROUND_POWER_OF_TWO(max_mi_cols, 2);
+ }
+ if (cpi->oxcf.frm_dim_cfg.forced_max_frame_height) {
+ int max_mi_rows = size_in_mi(cpi->oxcf.frm_dim_cfg.forced_max_frame_height);
+ max_mb_rows = ROUND_POWER_OF_TWO(max_mi_rows, 2);
+ }
+
+ // Number of rows in the unit size.
+ // Note max_mb_rows and max_mb_cols are in the unit of 16x16.
+ const int unit_rows = get_unit_rows(fp_block_size, max_mb_rows);
+ const int unit_cols = get_unit_cols(fp_block_size, max_mb_cols);
+
+ // Set fp_block_size, for the convenience of multi-thread usage.
+ cpi->fp_block_size = fp_block_size;
+
+ setup_firstpass_data(cm, &cpi->firstpass_data, unit_rows, unit_cols);
+ int *raw_motion_err_list = cpi->firstpass_data.raw_motion_err_list;
+ FRAME_STATS *mb_stats = cpi->firstpass_data.mb_stats;
+
+ // multi threading info
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
+
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ if (cpi->allocated_tiles < tile_cols * tile_rows) {
+ av1_alloc_tile_data(cpi);
+ }
+
+ av1_init_tile_data(cpi);
+
+ const YV12_BUFFER_CONFIG *last_frame = NULL;
+ const YV12_BUFFER_CONFIG *golden_frame = NULL;
+ if (!frame_is_intra_only(cm)) {
+ av1_scale_references(cpi, EIGHTTAP_REGULAR, 0, 0);
+ last_frame = av1_is_scaled(get_ref_scale_factors_const(cm, LAST_FRAME))
+ ? av1_get_scaled_ref_frame(cpi, LAST_FRAME)
+ : get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ golden_frame = av1_is_scaled(get_ref_scale_factors_const(cm, GOLDEN_FRAME))
+ ? av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME)
+ : get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+ }
+
+ YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf;
+ // First pass code requires valid last and new frame buffers.
+ assert(this_frame != NULL);
+ assert(frame_is_intra_only(cm) || (last_frame != NULL));
+
+ av1_setup_frame_size(cpi);
+ av1_set_mv_search_params(cpi);
+
+ set_mi_offsets(mi_params, xd, 0, 0);
+ xd->mi[0]->bsize = fp_block_size;
+
+ // Do not use periodic key frames.
+ cpi->rc.frames_to_key = INT_MAX;
+
+ av1_set_quantizer(
+ cm, cpi->oxcf.q_cfg.qm_minlevel, cpi->oxcf.q_cfg.qm_maxlevel, qindex,
+ cpi->oxcf.q_cfg.enable_chroma_deltaq, cpi->oxcf.q_cfg.enable_hdr_deltaq);
+
+ av1_setup_block_planes(xd, seq_params->subsampling_x,
+ seq_params->subsampling_y, num_planes);
+
+ av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, fp_block_size);
+ av1_setup_dst_planes(xd->plane, seq_params->sb_size, this_frame, 0, 0, 0,
+ num_planes);
+
+ if (!frame_is_intra_only(cm)) {
+ av1_setup_pre_planes(xd, 0, last_frame, 0, 0, NULL, num_planes);
+ }
+
+ set_mi_offsets(mi_params, xd, 0, 0);
+
+  // Don't store luma in the first pass since chroma is not computed.
+ xd->cfl.store_y = 0;
+ av1_frame_init_quantizer(cpi);
+
+ av1_default_coef_probs(cm);
+ av1_init_mode_probs(cm->fc);
+ av1_init_mv_probs(cm);
+ av1_initialize_rd_consts(cpi);
+
+ enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy;
+ enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy;
+
+ if (mt_info->num_workers > 1) {
+ enc_row_mt->sync_read_ptr = av1_row_mt_sync_read;
+ enc_row_mt->sync_write_ptr = av1_row_mt_sync_write;
+ av1_fp_encode_tiles_row_mt(cpi);
+ } else {
+ first_pass_tiles(cpi, fp_block_size);
+ }
+
+ FRAME_STATS stats = accumulate_frame_stats(mb_stats, unit_rows, unit_cols);
+ int total_raw_motion_err_count =
+ frame_is_intra_only(cm) ? 0 : unit_rows * unit_cols;
+ const double raw_err_stdev =
+ raw_motion_error_stdev(raw_motion_err_list, total_raw_motion_err_count);
+ av1_free_firstpass_data(&cpi->firstpass_data);
+ av1_dealloc_src_diff_buf(&cpi->td.mb, av1_num_planes(cm));
+
+  // Clamp the image start to rows/2. This number of rows is discarded top
+  // and bottom as dead data, so rows / 2 means the frame is blank.
+ if ((stats.image_data_start_row > unit_rows / 2) ||
+ (stats.image_data_start_row == INVALID_ROW)) {
+ stats.image_data_start_row = unit_rows / 2;
+ }
+ // Exclude any image dead zone
+ if (stats.image_data_start_row > 0) {
+ stats.intra_skip_count =
+ AOMMAX(0, stats.intra_skip_count -
+ (stats.image_data_start_row * unit_cols * 2));
+ }
+
+ TWO_PASS *twopass = &cpi->ppi->twopass;
+ const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : mi_params->MBs;
+  // Number of actual units used in the first pass; the unit can be a square
+  // block size other than 16X16.
+ const int num_mbs = get_num_mbs(fp_block_size, num_mbs_16X16);
+ stats.intra_factor = stats.intra_factor / (double)num_mbs;
+ stats.brightness_factor = stats.brightness_factor / (double)num_mbs;
+ FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end;
+ update_firstpass_stats(cpi, &stats, raw_err_stdev,
+ current_frame->frame_number, ts_duration,
+ fp_block_size);
+
+ // Copy the previous Last Frame back into gf buffer if the prediction is good
+ // enough... but also don't allow it to lag too far.
+ if ((twopass->sr_update_lag > 3) ||
+ ((current_frame->frame_number > 0) &&
+ (this_frame_stats->pcnt_inter > 0.20) &&
+ ((this_frame_stats->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame_stats->coded_error)) > 2.0))) {
+ if (golden_frame != NULL) {
+ assign_frame_buffer_p(
+ &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)],
+ cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]);
+ }
+ twopass->sr_update_lag = 1;
+ } else {
+ ++twopass->sr_update_lag;
+ }
+
+ aom_extend_frame_borders(this_frame, num_planes);
+
+ // The frame we just compressed now becomes the last frame.
+ assign_frame_buffer_p(
+ &cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)], cm->cur_frame);
+
+ // Special case for the first frame. Copy into the GF buffer as a second
+ // reference.
+ if (current_frame->frame_number == 0 &&
+ get_ref_frame_map_idx(cm, GOLDEN_FRAME) != INVALID_IDX) {
+ assign_frame_buffer_p(
+ &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)],
+ cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]);
+ }
+
+ print_reconstruction_frame(last_frame, current_frame->frame_number,
+ /*do_print=*/0);
+
+ ++current_frame->frame_number;
+ cpi->ref_frame_flags = ref_frame_flags_backup;
+ if (!frame_is_intra_only(cm)) {
+ release_scaled_references(cpi);
+ }
+}
+
+aom_codec_err_t av1_firstpass_info_init(FIRSTPASS_INFO *firstpass_info,
+ FIRSTPASS_STATS *ext_stats_buf,
+ int ext_stats_buf_size) {
+ assert(IMPLIES(ext_stats_buf == NULL, ext_stats_buf_size == 0));
+ if (ext_stats_buf == NULL) {
+ firstpass_info->stats_buf = firstpass_info->static_stats_buf;
+ firstpass_info->stats_buf_size =
+ sizeof(firstpass_info->static_stats_buf) /
+ sizeof(firstpass_info->static_stats_buf[0]);
+ firstpass_info->start_index = 0;
+ firstpass_info->cur_index = 0;
+ firstpass_info->stats_count = 0;
+ firstpass_info->future_stats_count = 0;
+ firstpass_info->past_stats_count = 0;
+ av1_zero(firstpass_info->total_stats);
+ if (ext_stats_buf_size == 0) {
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ } else {
+ firstpass_info->stats_buf = ext_stats_buf;
+ firstpass_info->stats_buf_size = ext_stats_buf_size;
+ firstpass_info->start_index = 0;
+ firstpass_info->cur_index = 0;
+ firstpass_info->stats_count = firstpass_info->stats_buf_size;
+ firstpass_info->future_stats_count = firstpass_info->stats_count;
+ firstpass_info->past_stats_count = 0;
+ av1_zero(firstpass_info->total_stats);
+ for (int i = 0; i < firstpass_info->stats_count; ++i) {
+ av1_accumulate_stats(&firstpass_info->total_stats,
+ &firstpass_info->stats_buf[i]);
+ }
+ }
+ return AOM_CODEC_OK;
+}
+
+aom_codec_err_t av1_firstpass_info_move_cur_index(
+ FIRSTPASS_INFO *firstpass_info) {
+ assert(firstpass_info->future_stats_count +
+ firstpass_info->past_stats_count ==
+ firstpass_info->stats_count);
+ if (firstpass_info->future_stats_count > 1) {
+ firstpass_info->cur_index =
+ (firstpass_info->cur_index + 1) % firstpass_info->stats_buf_size;
+ --firstpass_info->future_stats_count;
+ ++firstpass_info->past_stats_count;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+}
+
+aom_codec_err_t av1_firstpass_info_pop(FIRSTPASS_INFO *firstpass_info) {
+ if (firstpass_info->stats_count > 0 && firstpass_info->past_stats_count > 0) {
+ const int next_start =
+ (firstpass_info->start_index + 1) % firstpass_info->stats_buf_size;
+ firstpass_info->start_index = next_start;
+ --firstpass_info->stats_count;
+ --firstpass_info->past_stats_count;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+}
+
+aom_codec_err_t av1_firstpass_info_move_cur_index_and_pop(
+ FIRSTPASS_INFO *firstpass_info) {
+ aom_codec_err_t ret = av1_firstpass_info_move_cur_index(firstpass_info);
+ if (ret != AOM_CODEC_OK) return ret;
+ ret = av1_firstpass_info_pop(firstpass_info);
+ return ret;
+}
+
+aom_codec_err_t av1_firstpass_info_push(FIRSTPASS_INFO *firstpass_info,
+ const FIRSTPASS_STATS *input_stats) {
+ if (firstpass_info->stats_count < firstpass_info->stats_buf_size) {
+ const int next_index =
+ (firstpass_info->start_index + firstpass_info->stats_count) %
+ firstpass_info->stats_buf_size;
+ firstpass_info->stats_buf[next_index] = *input_stats;
+ ++firstpass_info->stats_count;
+ ++firstpass_info->future_stats_count;
+ av1_accumulate_stats(&firstpass_info->total_stats, input_stats);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+}
+
+const FIRSTPASS_STATS *av1_firstpass_info_peek(
+ const FIRSTPASS_INFO *firstpass_info, int offset_from_cur) {
+ if (offset_from_cur >= -firstpass_info->past_stats_count &&
+ offset_from_cur < firstpass_info->future_stats_count) {
+ const int index = (firstpass_info->cur_index + offset_from_cur) %
+ firstpass_info->stats_buf_size;
+ return &firstpass_info->stats_buf[index];
+ } else {
+ return NULL;
+ }
+}
+
+int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info,
+ int offset_from_cur) {
+ if (offset_from_cur < firstpass_info->future_stats_count) {
+ return firstpass_info->future_stats_count - offset_from_cur;
+ }
+ return 0;
+}
+
+int av1_firstpass_info_past_count(const FIRSTPASS_INFO *firstpass_info,
+ int offset_from_cur) {
+ if (offset_from_cur >= -firstpass_info->past_stats_count) {
+ return offset_from_cur + firstpass_info->past_stats_count;
+ }
+ return 0;
+}
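+
+// Illustrative usage of the FIRSTPASS_INFO ring buffer API above (a sketch,
+// not code called by the encoder):
+//
+//   FIRSTPASS_INFO info;
+//   av1_firstpass_info_init(&info, NULL, 0);   // use the static buffer
+//   FIRSTPASS_STATS s = { 0 };
+//   av1_firstpass_info_push(&info, &s);        // stats_count = 1
+//   av1_firstpass_info_push(&info, &s);        // stats_count = 2
+//   const FIRSTPASS_STATS *cur = av1_firstpass_info_peek(&info, 0);
+//   (void)cur;                                 // current frame's stats
+//   av1_firstpass_info_move_cur_index(&info);  // 1 past, 1 future
+//   av1_firstpass_info_pop(&info);             // retire the past entry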
diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h
new file mode 100644
index 0000000000..d01363a80e
--- /dev/null
+++ b/third_party/aom/av1/encoder/firstpass.h
@@ -0,0 +1,603 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_FIRSTPASS_H_
+#define AOM_AV1_ENCODER_FIRSTPASS_H_
+
+#include <stdbool.h>
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001)
+
+#define MIN_ZERO_MOTION 0.95
+#define MAX_SR_CODED_ERROR 40
+#define MAX_RAW_ERR_VAR 2000
+#define MIN_MV_IN_OUT 0.4
+
+#define VLOW_MOTION_THRESHOLD 950
+struct ThreadData;
+
+/*!
+ * \brief The structure of accumulated frame stats in the first pass.
+ *
+ * Errors (coded_error, intra_error, etc.) and counters (new_mv_count) are
+ * normalized to each MB. MV related stats (MVc, MVr, etc.) are normalized to
+ * the frame width and height. See function normalize_firstpass_stats.
+ */
+typedef struct FIRSTPASS_STATS {
+ /*!
+ * Frame number in display order, if stats are for a single frame.
+ * No real meaning for a collection of frames.
+ */
+ double frame;
+ /*!
+ * Weight assigned to this frame (or total weight for the collection of
+ * frames) currently based on intra factor and brightness factor. This is used
+   * to distribute bits between easier and harder frames.
+ */
+ double weight;
+ /*!
+ * Intra prediction error.
+ */
+ double intra_error;
+ /*!
+ * Average wavelet energy computed using Discrete Wavelet Transform (DWT).
+ */
+ double frame_avg_wavelet_energy;
+ /*!
+ * Best of intra pred error and inter pred error using last frame as ref.
+ */
+ double coded_error;
+ /*!
+ * Best of intra pred error and inter pred error using golden frame as ref.
+ */
+ double sr_coded_error;
+ /*!
+ * Percentage of blocks with inter pred error < intra pred error.
+ */
+ double pcnt_inter;
+ /*!
+ * Percentage of blocks using (inter prediction and) non-zero motion vectors.
+ */
+ double pcnt_motion;
+ /*!
+ * Percentage of blocks where golden frame was better than last or intra:
+ * inter pred error using golden frame < inter pred error using last frame and
+ * inter pred error using golden frame < intra pred error
+ */
+ double pcnt_second_ref;
+ /*!
+ * Percentage of blocks where intra and inter prediction errors were very
+   * close. Note that this is a 'weighted count'; that is, blocks may be
+ * weighted by how close the two errors were.
+ */
+ double pcnt_neutral;
+ /*!
+ * Percentage of blocks that have almost no intra error residual
+ * (i.e. are in effect completely flat and untextured in the intra
+ * domain). In natural videos this is uncommon, but it is much more
+ * common in animations, graphics and screen content, so may be used
+ * as a signal to detect these types of content.
+ */
+ double intra_skip_pct;
+ /*!
+ * Image mask rows top and bottom.
+ */
+ double inactive_zone_rows;
+ /*!
+ * Image mask columns at left and right edges.
+ */
+ double inactive_zone_cols;
+ /*!
+ * Average of row motion vectors.
+ */
+ double MVr;
+ /*!
+ * Mean of absolute value of row motion vectors.
+ */
+ double mvr_abs;
+ /*!
+ * Mean of column motion vectors.
+ */
+ double MVc;
+ /*!
+ * Mean of absolute value of column motion vectors.
+ */
+ double mvc_abs;
+ /*!
+ * Variance of row motion vectors.
+ */
+ double MVrv;
+ /*!
+ * Variance of column motion vectors.
+ */
+ double MVcv;
+ /*!
+   * Value in range [-1,1] indicating the fraction of row and column motion
+   * vectors that point inwards (negative MV value) or outwards (positive MV
+   * value); see accumulate_mv_stats() in firstpass.c for the exact sign
+   * convention. A magnitude of 1 indicates that all row/column MVs point the
+   * same way.
+ */
+ double mv_in_out_count;
+ /*!
+ * Count of unique non-zero motion vectors.
+ */
+ double new_mv_count;
+ /*!
+ * Duration of the frame / collection of frames.
+ */
+ double duration;
+ /*!
+ * 1.0 if stats are for a single frame, OR
+ * Number of frames in this collection for which the stats are accumulated.
+ */
+ double count;
+ /*!
+ * standard deviation for (0, 0) motion prediction error
+ */
+ double raw_error_stdev;
+ /*!
+ * Whether the frame contains a flash
+ */
+ int64_t is_flash;
+ /*!
+ * Estimated noise variance
+ */
+ double noise_var;
+ /*!
+ * Correlation coefficient with the previous frame
+ */
+ double cor_coeff;
+ /*!
+ * log of intra_error
+ */
+ double log_intra_error;
+ /*!
+ * log of coded_error
+ */
+ double log_coded_error;
+} FIRSTPASS_STATS;
+
+// We want to keep one past stats entry for key frame detection
+// in test_candidate_kf().
+#define FIRSTPASS_INFO_STATS_PAST_MIN 1
+
+// The size of static buffer used in FIRSTPASS_INFO.
+#define FIRSTPASS_INFO_STATIC_BUF_SIZE \
+ (MAX_LAP_BUFFERS + FIRSTPASS_INFO_STATS_PAST_MIN)
+
+/*!
+ * \brief Data structure used for managing first pass stats
+ */
+typedef struct {
+ /*!
+ * A static buffer that will be used when no ext_stats_buf is assigned. The
+ * ext_stats_buf is assigned through av1_firstpass_info_init() when the user
+   * already has pre-existing firstpass stats stored in an external buffer.
+   * The ext_stats_buf is usually used in two pass mode. When using one pass
+   * mode, we generate "firstpass" stats and encode the video in the same
+   * pass. In this scenario, the stats are pushed to and popped from
+   * static_stats_buf.
+ */
+ FIRSTPASS_STATS static_stats_buf[FIRSTPASS_INFO_STATIC_BUF_SIZE];
+ /*!
+ * A pointer to first pass stats.
+ * Note that this buffer will be used as ring buffer.
+ */
+ FIRSTPASS_STATS *stats_buf;
+ /*!
+ * size of stats_buf
+ */
+ int stats_buf_size;
+ /*!
+   * Start index of the available frame stats.
+   * Note that start_index doesn't always point to the current frame's
+   * stats, because past stats are kept as well. To access the current
+   * frame's stats, use cur_index.
+ */
+ int start_index;
+
+ /*!
+   * Count of the available stats stored in stats_buf.
+   * The following condition always holds:
+   * stats_count == future_stats_count + past_stats_count
+ */
+ int stats_count;
+
+ /*!
+   * Index of the current frame's stats.
+ */
+ int cur_index;
+
+ /*!
+   * Count of the available future stats, including the current stats.
+ */
+ int future_stats_count;
+
+ /*!
+   * Count of the available past stats, EXCLUDING the current stats.
+ */
+ int past_stats_count;
+
+ /*!
+ * Accumulation of the stats being pushed into firstpass_info
+ */
+ FIRSTPASS_STATS total_stats;
+} FIRSTPASS_INFO;
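+
+// Invariants maintained by the push/move/pop operations in firstpass.c (the
+// first is asserted in av1_firstpass_info_move_cur_index(); the second is
+// implied by how the indices are updated):
+//   stats_count == future_stats_count + past_stats_count
+//   cur_index == (start_index + past_stats_count) % stats_buf_size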
+
+/*!\brief Init firstpass_info
+ *
+ * If using ext_stats_buf, the buffer needs to stay available during encoding
+ * process.
+ *
+ * \ingroup rate_control
+ * \param[out] firstpass_info struct of firstpass_info.
+ * \param[in]    ext_stats_buf      external stats buffer. Pass in NULL to
+ *                                  use the internal static_stats_buf.
+ * \param[in]    ext_stats_buf_size external stats buffer size. Pass in 0 to
+ *                                  use the internal static_stats_buf.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_init(FIRSTPASS_INFO *firstpass_info,
+ FIRSTPASS_STATS *ext_stats_buf,
+ int ext_stats_buf_size);
+
+/*!\brief Move cur_index by 1
+ *
+ * \ingroup rate_control
+ * \param[out] firstpass_info struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_move_cur_index(
+ FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Pop a stats entry from firstpass_info
+ *
+ * \ingroup rate_control
+ * \param[out] firstpass_info struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_pop(FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Move cur_index by 1 and pop a stats entry from firstpass_info
+ *
+ * \ingroup rate_control
+ * \param[out] firstpass_info struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_move_cur_index_and_pop(
+ FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Push a stats entry into firstpass_info
+ *
+ * Note that the input stats will be copied into firstpass_info.
+ * \ingroup rate_control
+ * \param[out] firstpass_info struct of firstpass_info.
+ * \param[in] input_stats input stats
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_push(FIRSTPASS_INFO *firstpass_info,
+ const FIRSTPASS_STATS *input_stats);
+
+/*!\brief Peek at a stats entry in firstpass_info
+ *
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in] firstpass_info struct of firstpass_info.
+ * \param[in] offset_from_cur index offset from cur_index.
+ * \return pointer to the stats. The pointer will be NULL if
+ *         offset_from_cur is invalid.
+ */
+const FIRSTPASS_STATS *av1_firstpass_info_peek(
+ const FIRSTPASS_INFO *firstpass_info, int offset_from_cur);
+
+/*!\brief Count the future stats from the target in firstpass_info
+ * Note that the target stats will be counted as well.
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in] firstpass_info struct of firstpass_info.
+ * \param[in]  offset_from_cur   target stats's index offset
+ *                               from cur_index.
+ * \return Number of stats in the future after the target stats
+ * including itself.
+ */
+int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info,
+ int offset_from_cur);
+
+/*!\brief Count the past stats before the target in firstpass_info
+ * Note that the target stats will NOT be counted.
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in] firstpass_info struct of firstpass_info.
+ * \param[in] offset_from_cur target stats's index offset
+ * from cur_index.
+ * \return Number of stats in the past before the target stats
+ * excluding itself.
+ */
+int av1_firstpass_info_past_count(const FIRSTPASS_INFO *firstpass_info,
+ int offset_from_cur);
+
+/*!\cond */
+#define FC_ANIMATION_THRESH 0.15
+enum {
+ FC_NORMAL = 0,
+ FC_GRAPHICS_ANIMATION = 1,
+ FRAME_CONTENT_TYPES = 2
+} UENUM1BYTE(FRAME_CONTENT_TYPE);
+/*!\endcond */
+
+/*!
+ * \brief Data related to the current GF/ARF group and the
+ * individual frames within the group
+ */
+typedef struct GF_GROUP {
+ /*!\cond */
+ // Frame update type, e.g. ARF/GF/LF/Overlay
+ FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH];
+ unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH];
+ // The number of frames displayed so far within the GOP at a given coding
+ // frame.
+ unsigned char cur_frame_idx[MAX_STATIC_GF_GROUP_LENGTH];
+ int layer_depth[MAX_STATIC_GF_GROUP_LENGTH];
+ int arf_boost[MAX_STATIC_GF_GROUP_LENGTH];
+ int max_layer_depth;
+ int max_layer_depth_allowed;
+ // This is currently only populated for AOM_Q mode
+ int q_val[MAX_STATIC_GF_GROUP_LENGTH];
+ int rdmult_val[MAX_STATIC_GF_GROUP_LENGTH];
+ int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH];
+ // The frame coding type - inter/intra frame
+ FRAME_TYPE frame_type[MAX_STATIC_GF_GROUP_LENGTH];
+ // The reference frame buffer control - update or reset
+ REFBUF_STATE refbuf_state[MAX_STATIC_GF_GROUP_LENGTH];
+  int arf_index;  // The index of the ARF in the GF group; -1 if there is no ARF.
+ int size; // The total length of a GOP
+
+ // The offset into lookahead_ctx for choosing
+ // source of frame parallel encodes.
+ int src_offset[MAX_STATIC_GF_GROUP_LENGTH];
+ // Stores the display order hint of each frame in the current GF_GROUP.
+ int display_idx[MAX_STATIC_GF_GROUP_LENGTH];
+
+ // The reference frame list maps the reference frame indexes to its
+ // buffer index in the decoded buffer. A value of -1 means the
+ // corresponding reference frame index doesn't point towards any
+ // previously decoded frame.
+ int8_t ref_frame_list[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES];
+ // Update frame index
+ int update_ref_idx[MAX_STATIC_GF_GROUP_LENGTH];
+ // The map_idx of primary reference
+ int primary_ref_idx[MAX_STATIC_GF_GROUP_LENGTH];
+
+ // Indicates the level of parallelism in frame parallel encodes.
+ // 0 : frame is independently encoded (not part of parallel encodes).
+ // 1 : frame is the first in encode order in a given parallel encode set.
+ // 2 : frame occurs later in encode order in a given parallel encode set.
+ int frame_parallel_level[MAX_STATIC_GF_GROUP_LENGTH];
+ // Indicates whether a frame should act as non-reference frame.
+ bool is_frame_non_ref[MAX_STATIC_GF_GROUP_LENGTH];
+ // Indicates whether a frame is dropped.
+ bool is_frame_dropped[MAX_STATIC_GF_GROUP_LENGTH];
+
+ // Stores the display order hint of the frames not to be
+ // refreshed by the current frame.
+ int skip_frame_refresh[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES];
+ // Stores the display order hint of the frame to be excluded during reference
+ // assignment.
+ int skip_frame_as_ref[MAX_STATIC_GF_GROUP_LENGTH];
+ /*!\endcond */
+} GF_GROUP;
+/*!\cond */
+
+typedef struct {
+ // Track if the last frame in a GOP has higher quality.
+ int arf_gf_boost_lst;
+} GF_STATE;
+
+typedef struct {
+ FIRSTPASS_STATS *stats_in_start;
+ FIRSTPASS_STATS *stats_in_end;
+ FIRSTPASS_STATS *stats_in_buf_end;
+ FIRSTPASS_STATS *total_stats;
+ FIRSTPASS_STATS *total_left_stats;
+} STATS_BUFFER_CTX;
+
+/*!\endcond */
+
+/*!
+ * \brief Two pass status and control data.
+ */
+typedef struct {
+ /*!\cond */
+ unsigned int section_intra_rating;
+ // Circular queue of first pass stats stored for most recent frames.
+ // cpi->output_pkt_list[i].data.twopass_stats.buf points to actual data stored
+ // here.
+ FIRSTPASS_STATS *frame_stats_arr[MAX_LAP_BUFFERS + 1];
+ int frame_stats_next_idx; // Index to next unused element in frame_stats_arr.
+ STATS_BUFFER_CTX *stats_buf_ctx;
+ FIRSTPASS_INFO firstpass_info; // This is the first pass data structure
+ // intended to replace stats_in
+ int first_pass_done;
+ int64_t bits_left;
+ double modified_error_min;
+ double modified_error_max;
+ double modified_error_left;
+
+ // Projected total bits available for a key frame group of frames
+ int64_t kf_group_bits;
+
+ // Error score of frames still to be coded in kf group
+ double kf_group_error_left;
+
+ // Over time correction for bits per macro block estimation
+ double bpm_factor;
+
+ // Record of target and actual bits spent in current ARF group
+ int rolling_arf_group_target_bits;
+ int rolling_arf_group_actual_bits;
+
+ int sr_update_lag;
+
+ int kf_zeromotion_pct;
+ int last_kfgroup_zeromotion_pct;
+ int extend_minq;
+ int extend_maxq;
+ /*!\endcond */
+} TWO_PASS;
+
+/*!
+ * \brief Frame level Two pass status and control data.
+ */
+typedef struct {
+ /*!\cond */
+ const FIRSTPASS_STATS *stats_in;
+ // Pointer to the stats of the current frame.
+ const FIRSTPASS_STATS *this_frame;
+ double mb_av_energy;
+ // An indication of the content type of the current frame
+ FRAME_CONTENT_TYPE fr_content_type;
+ double frame_avg_haar_energy;
+ /*!\endcond */
+} TWO_PASS_FRAME;
+
+/*!\cond */
+
+// This structure contains several key parameters to be accumulated for this
+// frame.
+typedef struct {
+ // Intra prediction error.
+ int64_t intra_error;
+ // Average wavelet energy computed using Discrete Wavelet Transform (DWT).
+ int64_t frame_avg_wavelet_energy;
+ // Best of intra pred error and inter pred error using last frame as ref.
+ int64_t coded_error;
+ // Best of intra pred error and inter pred error using golden frame as ref.
+ int64_t sr_coded_error;
+ // Count of motion vector.
+ int mv_count;
+ // Count of blocks that pick inter prediction (inter pred error is smaller
+ // than intra pred error).
+ int inter_count;
+ // Count of blocks that pick second ref (golden frame).
+ int second_ref_count;
+ // Count of blocks where the inter and intra are very close and very low.
+ double neutral_count;
+ // Count of blocks where intra error is very small.
+ int intra_skip_count;
+ // Start row.
+ int image_data_start_row;
+ // Count of unique non-zero motion vectors.
+ int new_mv_count;
+ // Sum of inward motion vectors.
+ int sum_in_vectors;
+ // Sum of motion vector row.
+ int sum_mvr;
+ // Sum of motion vector column.
+ int sum_mvc;
+ // Sum of absolute value of motion vector row.
+ int sum_mvr_abs;
+ // Sum of absolute value of motion vector column.
+ int sum_mvc_abs;
+ // Sum of the square of motion vector row.
+ int64_t sum_mvrs;
+ // Sum of the square of motion vector column.
+ int64_t sum_mvcs;
+ // A factor calculated using intra pred error.
+ double intra_factor;
+ // A factor that measures brightness.
+ double brightness_factor;
+} FRAME_STATS;
+
+// This structure contains first pass data.
+typedef struct {
+ // Buffer holding frame stats for all MACROBLOCKs.
+ // mb_stats[i] stores the FRAME_STATS of the ith
+ // MB in raster scan order.
+ FRAME_STATS *mb_stats;
+ // Buffer to store the prediction error of the (0,0) motion
+ // vector using the last source frame as the reference.
+ // raw_motion_err_list[i] stores the raw_motion_err of
+ // the ith MB in raster scan order.
+ int *raw_motion_err_list;
+} FirstPassData;
+
+struct AV1_COMP;
+struct EncodeFrameParams;
+struct AV1EncoderConfig;
+struct TileDataEnc;
+
+static INLINE int is_fp_wavelet_energy_invalid(
+ const FIRSTPASS_STATS *fp_stats) {
+ assert(fp_stats != NULL);
+ return (fp_stats->frame_avg_wavelet_energy < 0);
+}
+
+static INLINE BLOCK_SIZE get_fp_block_size(int is_screen_content_type) {
+ return (is_screen_content_type ? BLOCK_8X8 : BLOCK_16X16);
+}
+
+int av1_get_unit_rows_in_tile(const TileInfo *tile,
+ const BLOCK_SIZE fp_block_size);
+int av1_get_unit_cols_in_tile(const TileInfo *tile,
+ const BLOCK_SIZE fp_block_size);
+
+void av1_first_pass_row(struct AV1_COMP *cpi, struct ThreadData *td,
+ struct TileDataEnc *tile_data, const int mb_row,
+ const BLOCK_SIZE fp_block_size);
+void av1_end_first_pass(struct AV1_COMP *cpi);
+
+void av1_free_firstpass_data(FirstPassData *firstpass_data);
+
+void av1_twopass_zero_stats(FIRSTPASS_STATS *section);
+void av1_accumulate_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame);
+/*!\endcond */
+
+/*!\brief AV1 first pass encoding.
+ *
+ * \ingroup rate_control
+ * This function implements the first encoding pass of the two-pass encoding
+ * mode. It encodes the whole video and collects essential information.
+ * Two-pass encoding is an encoding mode in the reference software (libaom)
+ * of AV1 for high-performance encoding. The first pass is a fast encoding
+ * process that collects essential information to help the second pass make
+ * encoding decisions and improve coding quality. The collected stats are used
+ * in rate control, for example, to determine frame cuts and the position of
+ * the alternative reference frame (ARF).
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] ts_duration Duration of the frame / collection of frames
+ *
+ * \remark Nothing is returned. Instead, the "TWO_PASS" structure inside "cpi"
+ * is modified to store information computed in this function.
+ */
+void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration);
+
+void av1_noop_first_pass_frame(struct AV1_COMP *cpi, const int64_t ts_duration);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_FIRSTPASS_H_
diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c
new file mode 100644
index 0000000000..73910de121
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion.c
@@ -0,0 +1,575 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/encoder/global_motion.h"
+
+#include "av1/common/convolve.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/segmentation.h"
+
+#define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR)
+
+// Border over which to compute the global motion
+#define ERRORADV_BORDER 0
+
+/* clang-format off */
+// Error metric used for global motion evaluation.
+// For 8-bit input, the pixel error used to index this table will always
+// be between -255 and +255. But for 10- and 12-bit input, we use interpolation
+// which means that we need to support indices of -256 and +256 as well.
+// Therefore, the table is offset so that logical index 0 corresponds to
+// error_measure_lut[256].
+const int error_measure_lut[513] = {
+ // pow 0.7
+ 16384, 16384, 16339, 16294, 16249, 16204, 16158, 16113,
+ 16068, 16022, 15977, 15932, 15886, 15840, 15795, 15749,
+ 15703, 15657, 15612, 15566, 15520, 15474, 15427, 15381,
+ 15335, 15289, 15242, 15196, 15149, 15103, 15056, 15010,
+ 14963, 14916, 14869, 14822, 14775, 14728, 14681, 14634,
+ 14587, 14539, 14492, 14445, 14397, 14350, 14302, 14254,
+ 14206, 14159, 14111, 14063, 14015, 13967, 13918, 13870,
+ 13822, 13773, 13725, 13676, 13628, 13579, 13530, 13481,
+ 13432, 13383, 13334, 13285, 13236, 13187, 13137, 13088,
+ 13038, 12988, 12939, 12889, 12839, 12789, 12739, 12689,
+ 12639, 12588, 12538, 12487, 12437, 12386, 12335, 12285,
+ 12234, 12183, 12132, 12080, 12029, 11978, 11926, 11875,
+ 11823, 11771, 11719, 11667, 11615, 11563, 11511, 11458,
+ 11406, 11353, 11301, 11248, 11195, 11142, 11089, 11036,
+ 10982, 10929, 10875, 10822, 10768, 10714, 10660, 10606,
+ 10552, 10497, 10443, 10388, 10333, 10279, 10224, 10168,
+ 10113, 10058, 10002, 9947, 9891, 9835, 9779, 9723,
+ 9666, 9610, 9553, 9497, 9440, 9383, 9326, 9268,
+ 9211, 9153, 9095, 9037, 8979, 8921, 8862, 8804,
+ 8745, 8686, 8627, 8568, 8508, 8449, 8389, 8329,
+ 8269, 8208, 8148, 8087, 8026, 7965, 7903, 7842,
+ 7780, 7718, 7656, 7593, 7531, 7468, 7405, 7341,
+ 7278, 7214, 7150, 7086, 7021, 6956, 6891, 6826,
+ 6760, 6695, 6628, 6562, 6495, 6428, 6361, 6293,
+ 6225, 6157, 6089, 6020, 5950, 5881, 5811, 5741,
+ 5670, 5599, 5527, 5456, 5383, 5311, 5237, 5164,
+ 5090, 5015, 4941, 4865, 4789, 4713, 4636, 4558,
+ 4480, 4401, 4322, 4242, 4162, 4080, 3998, 3916,
+ 3832, 3748, 3663, 3577, 3490, 3402, 3314, 3224,
+ 3133, 3041, 2948, 2854, 2758, 2661, 2562, 2461,
+ 2359, 2255, 2148, 2040, 1929, 1815, 1698, 1577,
+ 1452, 1323, 1187, 1045, 894, 731, 550, 339,
+ 0, 339, 550, 731, 894, 1045, 1187, 1323,
+ 1452, 1577, 1698, 1815, 1929, 2040, 2148, 2255,
+ 2359, 2461, 2562, 2661, 2758, 2854, 2948, 3041,
+ 3133, 3224, 3314, 3402, 3490, 3577, 3663, 3748,
+ 3832, 3916, 3998, 4080, 4162, 4242, 4322, 4401,
+ 4480, 4558, 4636, 4713, 4789, 4865, 4941, 5015,
+ 5090, 5164, 5237, 5311, 5383, 5456, 5527, 5599,
+ 5670, 5741, 5811, 5881, 5950, 6020, 6089, 6157,
+ 6225, 6293, 6361, 6428, 6495, 6562, 6628, 6695,
+ 6760, 6826, 6891, 6956, 7021, 7086, 7150, 7214,
+ 7278, 7341, 7405, 7468, 7531, 7593, 7656, 7718,
+ 7780, 7842, 7903, 7965, 8026, 8087, 8148, 8208,
+ 8269, 8329, 8389, 8449, 8508, 8568, 8627, 8686,
+ 8745, 8804, 8862, 8921, 8979, 9037, 9095, 9153,
+ 9211, 9268, 9326, 9383, 9440, 9497, 9553, 9610,
+ 9666, 9723, 9779, 9835, 9891, 9947, 10002, 10058,
+ 10113, 10168, 10224, 10279, 10333, 10388, 10443, 10497,
+ 10552, 10606, 10660, 10714, 10768, 10822, 10875, 10929,
+ 10982, 11036, 11089, 11142, 11195, 11248, 11301, 11353,
+ 11406, 11458, 11511, 11563, 11615, 11667, 11719, 11771,
+ 11823, 11875, 11926, 11978, 12029, 12080, 12132, 12183,
+ 12234, 12285, 12335, 12386, 12437, 12487, 12538, 12588,
+ 12639, 12689, 12739, 12789, 12839, 12889, 12939, 12988,
+ 13038, 13088, 13137, 13187, 13236, 13285, 13334, 13383,
+ 13432, 13481, 13530, 13579, 13628, 13676, 13725, 13773,
+ 13822, 13870, 13918, 13967, 14015, 14063, 14111, 14159,
+ 14206, 14254, 14302, 14350, 14397, 14445, 14492, 14539,
+ 14587, 14634, 14681, 14728, 14775, 14822, 14869, 14916,
+ 14963, 15010, 15056, 15103, 15149, 15196, 15242, 15289,
+ 15335, 15381, 15427, 15474, 15520, 15566, 15612, 15657,
+ 15703, 15749, 15795, 15840, 15886, 15932, 15977, 16022,
+ 16068, 16113, 16158, 16204, 16249, 16294, 16339, 16384,
+ 16384,
+};
+/* clang-format on */
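+
+// For example, with this offset convention a signed pixel error err in
+// [-256, 256] is scored as error_measure_lut[256 + err] (cf. the
+// error_measure() helper in global_motion.h): the cost is 0 for err == 0
+// and rises along the |err|^0.7 curve to 16384 at |err| == 255.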
+
+int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost) {
+ return best_erroradvantage < erroradv_tr &&
+ best_erroradvantage * params_cost < erroradv_prod_tr;
+}
+
+static void convert_to_params(const double *params, int32_t *model) {
+ int i;
+ model[0] = (int32_t)floor(params[0] * (1 << GM_TRANS_PREC_BITS) + 0.5);
+ model[1] = (int32_t)floor(params[1] * (1 << GM_TRANS_PREC_BITS) + 0.5);
+ model[0] = (int32_t)clamp(model[0], GM_TRANS_MIN, GM_TRANS_MAX) *
+ GM_TRANS_DECODE_FACTOR;
+ model[1] = (int32_t)clamp(model[1], GM_TRANS_MIN, GM_TRANS_MAX) *
+ GM_TRANS_DECODE_FACTOR;
+
+ for (i = 2; i < 6; ++i) {
+ const int diag_value = ((i == 2 || i == 5) ? (1 << GM_ALPHA_PREC_BITS) : 0);
+ model[i] = (int32_t)floor(params[i] * (1 << GM_ALPHA_PREC_BITS) + 0.5);
+ model[i] =
+ (int32_t)clamp(model[i] - diag_value, GM_ALPHA_MIN, GM_ALPHA_MAX);
+ model[i] = (model[i] + diag_value) * GM_ALPHA_DECODE_FACTOR;
+ }
+}
+
+void av1_convert_model_to_params(const double *params,
+ WarpedMotionParams *model) {
+ convert_to_params(params, model->wmmat);
+ model->wmtype = get_wmtype(model);
+ model->invalid = 0;
+}
+
+// Adds some offset to a global motion parameter and handles
+// all of the necessary precision shifts, clamping, and
+// zero-centering.
+static int32_t add_param_offset(int param_index, int32_t param_value,
+ int32_t offset) {
+ const int scale_vals[2] = { GM_TRANS_PREC_DIFF, GM_ALPHA_PREC_DIFF };
+ const int clamp_vals[2] = { GM_TRANS_MAX, GM_ALPHA_MAX };
+ // type of param: 0 - translation, 1 - affine
+ const int param_type = (param_index < 2 ? 0 : 1);
+ const int is_one_centered = (param_index == 2 || param_index == 5);
+
+ // Make parameter zero-centered and offset the shift that was done to make
+ // it compatible with the warped model
+ param_value = (param_value - (is_one_centered << WARPEDMODEL_PREC_BITS)) >>
+ scale_vals[param_type];
+ // Add desired offset to the rescaled/zero-centered parameter
+ param_value += offset;
+ // Clamp the parameter so it does not overflow the number of bits allotted
+ // to it in the bitstream
+ param_value = (int32_t)clamp(param_value, -clamp_vals[param_type],
+ clamp_vals[param_type]);
+ // Rescale the parameter to WARPEDMODEL_PRECISION_BITS so it is compatible
+ // with the warped motion library
+ param_value *= (1 << scale_vals[param_type]);
+
+ // Undo the zero-centering step if necessary
+ return param_value + (is_one_centered << WARPEDMODEL_PREC_BITS);
+}
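+
+// For example, offsetting the diagonal parameter wmmat[2] (param_index 2) by
+// +1 moves the stored value by one coding-precision step,
+// (1 << GM_ALPHA_PREC_DIFF), above its identity value of
+// (1 << WARPEDMODEL_PREC_BITS); the translation parameters step in units of
+// (1 << GM_TRANS_PREC_DIFF) instead.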
+
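+// Forces 'wm' to the given model type by overwriting the parameters that the
+// type does not code. The cases intentionally cascade from most to least
+// constrained: forcing IDENTITY zeroes the translation and then also applies
+// the TRANSLATION and ROTZOOM constraints, while forcing ROTZOOM only
+// enforces the rotation/zoom symmetry wmmat[4] == -wmmat[3] and
+// wmmat[5] == wmmat[2].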
+static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) {
+ switch (wmtype) {
+ case IDENTITY:
+ wm->wmmat[0] = 0;
+ wm->wmmat[1] = 0;
+ AOM_FALLTHROUGH_INTENDED;
+ case TRANSLATION:
+ wm->wmmat[2] = 1 << WARPEDMODEL_PREC_BITS;
+ wm->wmmat[3] = 0;
+ AOM_FALLTHROUGH_INTENDED;
+ case ROTZOOM:
+ wm->wmmat[4] = -wm->wmmat[3];
+ wm->wmmat[5] = wm->wmmat[2];
+ AOM_FALLTHROUGH_INTENDED;
+ case AFFINE: break;
+ default: assert(0);
+ }
+ wm->wmtype = wmtype;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE int generic_sad_highbd(const uint16_t *const ref, int ref_stride,
+ const uint16_t *const dst, int dst_stride,
+ int p_width, int p_height) {
+ // This function should only be called for patches smaller than
+ // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. This keeps the number of pixels
+ // small enough that we don't need a 64-bit accumulator
+ assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK);
+
+ int sad = 0;
+ for (int i = 0; i < p_height; ++i) {
+ for (int j = 0; j < p_width; ++j) {
+ sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]);
+ }
+ }
+ return sad;
+}
+
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in highbd_segmented_frame_error"
+#endif // WARP_ERROR_BLOCK != 32
+static int64_t highbd_segmented_frame_error(
+ const uint16_t *const ref, int ref_stride, const uint16_t *const dst,
+ int dst_stride, int p_width, int p_height, int bd, uint8_t *segment_map,
+ int segment_map_stride) {
+ (void)bd;
+ int patch_w, patch_h;
+ const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ int64_t sum_error = 0;
+ for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
+ int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+ // Only compute the error if this block contains inliers from the motion
+ // model
+ if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+
+ // avoid computing error into the frame padding
+ patch_w = AOMMIN(error_bsize_w, p_width - j);
+ patch_h = AOMMIN(error_bsize_h, p_height - i);
+
+ if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) {
+ sum_error += aom_highbd_sad32x32(
+ CONVERT_TO_BYTEPTR(ref + j + i * ref_stride), ref_stride,
+ CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride);
+ } else {
+ sum_error += generic_sad_highbd(ref + j + i * ref_stride, ref_stride,
+ dst + j + i * dst_stride, dst_stride,
+ patch_w, patch_h);
+ }
+ }
+ }
+ return sum_error;
+}
+
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in highbd_warp_error"
+#endif // WARP_ERROR_BLOCK != 32
+static int64_t highbd_warp_error(WarpedMotionParams *wm,
+ const uint16_t *const ref, int ref_width,
+ int ref_height, int ref_stride,
+ const uint16_t *const dst, int dst_stride,
+ int p_col, int p_row, int p_width,
+ int p_height, int subsampling_x,
+ int subsampling_y, int bd, int64_t best_error,
+ uint8_t *segment_map, int segment_map_stride) {
+ int64_t gm_sumerr = 0;
+ const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ DECLARE_ALIGNED(32, uint16_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]);
+
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
+ conv_params.use_dist_wtd_comp_avg = 0;
+ for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
+ int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+ // Only compute the error if this block contains inliers from the motion
+ // model
+ if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+ // Avoid warping extra blocks in the padded region of the frame
+ // when p_width and p_height are not multiples of WARP_ERROR_BLOCK.
+ const int warp_w = AOMMIN(error_bsize_w, p_col + ref_width - j);
+ const int warp_h = AOMMIN(error_bsize_h, p_row + ref_height - i);
+ highbd_warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i,
+ warp_w, warp_h, WARP_ERROR_BLOCK, subsampling_x,
+ subsampling_y, bd, &conv_params);
+
+ if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) {
+ gm_sumerr += aom_highbd_sad32x32(
+ CONVERT_TO_BYTEPTR(tmp), WARP_ERROR_BLOCK,
+ CONVERT_TO_BYTEPTR(dst + j + i * dst_stride), dst_stride);
+ } else {
+ gm_sumerr +=
+ generic_sad_highbd(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride,
+ dst_stride, warp_w, warp_h);
+ }
+
+ if (gm_sumerr > best_error) return INT64_MAX;
+ }
+ }
+ return gm_sumerr;
+}
+#endif
+
+static INLINE int generic_sad(const uint8_t *const ref, int ref_stride,
+ const uint8_t *const dst, int dst_stride,
+ int p_width, int p_height) {
+ // This function should only be called for patches smaller than
+ // WARP_ERROR_BLOCK x WARP_ERROR_BLOCK. This keeps the number of pixels
+ // small enough that we don't need a 64-bit accumulator
+ assert(p_width <= WARP_ERROR_BLOCK && p_height <= WARP_ERROR_BLOCK);
+
+ int sad = 0;
+ for (int i = 0; i < p_height; ++i) {
+ for (int j = 0; j < p_width; ++j) {
+ sad += abs(dst[j + i * dst_stride] - ref[j + i * ref_stride]);
+ }
+ }
+ return sad;
+}
+
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in segmented_warp_error"
+#endif // WARP_ERROR_BLOCK != 32
+static int64_t segmented_frame_error(const uint8_t *const ref, int ref_stride,
+ const uint8_t *const dst, int dst_stride,
+ int p_width, int p_height,
+ uint8_t *segment_map,
+ int segment_map_stride) {
+ int patch_w, patch_h;
+ const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ int64_t sum_error = 0;
+ for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
+ int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+ // Only compute the error if this block contains inliers from the motion
+ // model
+ if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+
+ // avoid computing error into the frame padding
+ patch_w = AOMMIN(error_bsize_w, p_width - j);
+ patch_h = AOMMIN(error_bsize_h, p_height - i);
+
+ if (patch_w == WARP_ERROR_BLOCK && patch_h == WARP_ERROR_BLOCK) {
+ sum_error += aom_sad32x32(ref + j + i * ref_stride, ref_stride,
+ dst + j + i * dst_stride, dst_stride);
+ } else {
+ sum_error +=
+ generic_sad(ref + j + i * ref_stride, ref_stride,
+ dst + j + i * dst_stride, dst_stride, patch_w, patch_h);
+ }
+ }
+ }
+ return sum_error;
+}
+
+#if WARP_ERROR_BLOCK != 32
+#error "Need to change SAD call size in warp_error"
+#endif // WARP_ERROR_BLOCK != 32
+static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
+ int ref_width, int ref_height, int ref_stride,
+ const uint8_t *const dst, int dst_stride, int p_col,
+ int p_row, int p_width, int p_height,
+ int subsampling_x, int subsampling_y,
+ int64_t best_error, uint8_t *segment_map,
+ int segment_map_stride) {
+ int64_t gm_sumerr = 0;
+ int warp_w, warp_h;
+ const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ DECLARE_ALIGNED(16, uint8_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]);
+ ConvolveParams conv_params = get_conv_params(0, 0, 8);
+ conv_params.use_dist_wtd_comp_avg = 0;
+
+ for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
+ int seg_x = j >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = i >> WARP_ERROR_BLOCK_LOG;
+ // Only compute the error if this block contains inliers from the motion
+ // model
+ if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;
+ // Avoid warping extra blocks in the padded region of the frame
+ // when p_width and p_height are not multiples of WARP_ERROR_BLOCK.
+ warp_w = AOMMIN(error_bsize_w, p_col + ref_width - j);
+ warp_h = AOMMIN(error_bsize_h, p_row + ref_height - i);
+ warp_plane(wm, ref, ref_width, ref_height, ref_stride, tmp, j, i, warp_w,
+ warp_h, WARP_ERROR_BLOCK, subsampling_x, subsampling_y,
+ &conv_params);
+
+ if (warp_w == WARP_ERROR_BLOCK && warp_h == WARP_ERROR_BLOCK) {
+ gm_sumerr += aom_sad32x32(tmp, WARP_ERROR_BLOCK,
+ dst + j + i * dst_stride, dst_stride);
+ } else {
+ gm_sumerr +=
+ generic_sad(tmp, WARP_ERROR_BLOCK, dst + j + i * dst_stride,
+ dst_stride, warp_w, warp_h);
+ }
+
+ if (gm_sumerr > best_error) return INT64_MAX;
+ }
+ }
+ return gm_sumerr;
+}
+
+int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
+ int ref_stride, uint8_t *dst, int dst_stride,
+ int p_width, int p_height,
+ uint8_t *segment_map,
+ int segment_map_stride) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_hbd) {
+ return highbd_segmented_frame_error(
+ CONVERT_TO_SHORTPTR(ref), ref_stride, CONVERT_TO_SHORTPTR(dst),
+ dst_stride, p_width, p_height, bd, segment_map, segment_map_stride);
+ }
+#endif
+ (void)use_hbd;
+ (void)bd;
+ return segmented_frame_error(ref, ref_stride, dst, dst_stride, p_width,
+ p_height, segment_map, segment_map_stride);
+}
+
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
+ const uint8_t *ref, int ref_width, int ref_height,
+ int ref_stride, uint8_t *dst, int dst_stride, int p_col,
+ int p_row, int p_width, int p_height, int subsampling_x,
+ int subsampling_y, int64_t best_error,
+ uint8_t *segment_map, int segment_map_stride) {
+ if (!av1_get_shear_params(wm)) return INT64_MAX;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_hbd)
+ return highbd_warp_error(wm, CONVERT_TO_SHORTPTR(ref), ref_width,
+ ref_height, ref_stride, CONVERT_TO_SHORTPTR(dst),
+ dst_stride, p_col, p_row, p_width, p_height,
+ subsampling_x, subsampling_y, bd, best_error,
+ segment_map, segment_map_stride);
+#endif
+ (void)use_hbd;
+ (void)bd;
+ return warp_error(wm, ref, ref_width, ref_height, ref_stride, dst, dst_stride,
+ p_col, p_row, p_width, p_height, subsampling_x,
+ subsampling_y, best_error, segment_map, segment_map_stride);
+}
+
+int64_t av1_refine_integerized_param(
+ WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd,
+ uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst,
+ int d_width, int d_height, int d_stride, int n_refinements,
+ int64_t ref_frame_error, uint8_t *segment_map, int segment_map_stride) {
+ static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
+ const int border = ERRORADV_BORDER;
+ int i = 0, p;
+ int n_params = max_trans_model_params[wmtype];
+ int32_t *param_mat = wm->wmmat;
+ int64_t step_error, best_error;
+ int32_t step;
+ int32_t *param;
+ int32_t curr_param;
+ int32_t best_param;
+
+ force_wmtype(wm, wmtype);
+ wm->wmtype = get_wmtype(wm);
+
+ if (n_refinements == 0) {
+ // Compute the maximum error value that will be accepted, so that
+ // av1_warp_error can terminate early if it proves the model will not
+ // be accepted.
+ int64_t selection_threshold = (int64_t)lrint(ref_frame_error * erroradv_tr);
+ return av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, d_stride, border,
+ border, d_width - 2 * border, d_height - 2 * border,
+ 0, 0, selection_threshold, segment_map,
+ segment_map_stride);
+ }
+
+ // When refining, use a slightly higher threshold for the initial error
+ // calculation - see comment above erroradv_early_tr for why.
+ int64_t selection_threshold =
+ (int64_t)lrint(ref_frame_error * erroradv_early_tr);
+ best_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, d_stride, border, border,
+ d_width - 2 * border, d_height - 2 * border, 0, 0,
+ selection_threshold, segment_map, segment_map_stride);
+
+ if (best_error > selection_threshold) {
+ return INT64_MAX;
+ }
+
+ step = 1 << (n_refinements - 1);
+ for (i = 0; i < n_refinements; i++, step >>= 1) {
+ for (p = 0; p < n_params; ++p) {
+ int step_dir = 0;
+ param = param_mat + p;
+ curr_param = *param;
+ best_param = curr_param;
+ // look to the left
+ // Note: We have to use force_wmtype() to keep the proper symmetry for
+ // ROTZOOM type models
+ *param = add_param_offset(p, curr_param, -step);
+ force_wmtype(wm, wmtype);
+ step_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, d_stride, border,
+ border, d_width - 2 * border, d_height - 2 * border, 0,
+ 0, best_error, segment_map, segment_map_stride);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ step_dir = -1;
+ }
+
+ // look to the right
+ *param = add_param_offset(p, curr_param, step);
+ force_wmtype(wm, wmtype);
+ step_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, d_stride, border,
+ border, d_width - 2 * border, d_height - 2 * border, 0,
+ 0, best_error, segment_map, segment_map_stride);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ step_dir = 1;
+ }
+
+ // look to the direction chosen above repeatedly until error increases
+ // for the biggest step size
+ while (step_dir) {
+ *param = add_param_offset(p, best_param, step * step_dir);
+ force_wmtype(wm, wmtype);
+ step_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, d_stride, border,
+ border, d_width - 2 * border, d_height - 2 * border,
+ 0, 0, best_error, segment_map, segment_map_stride);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ } else {
+ step_dir = 0;
+ }
+ }
+
+ // Restore best parameter value so far
+ *param = best_param;
+ force_wmtype(wm, wmtype);
+ }
+ }
+
+ wm->wmtype = get_wmtype(wm);
+ return best_error;
+}
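+
+// Note on the search above: it is a per-parameter coordinate descent whose
+// step size starts at 2^(n_refinements - 1) coding-precision units and halves
+// each round, so with n_refinements == GM_MAX_REFINEMENT_STEPS (5) each
+// parameter is probed with steps of 16, 8, 4, 2 and 1.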
+
+#define FEAT_COUNT_TR 3
+#define SEG_COUNT_TR 48
+void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
+ int height, int *inliers,
+ int num_inliers) {
+ int seg_count = 0;
+ memset(segment_map, 0, sizeof(*segment_map) * width * height);
+
+ for (int i = 0; i < num_inliers; i++) {
+ int x = inliers[i * 2];
+ int y = inliers[i * 2 + 1];
+ int seg_x = x >> WARP_ERROR_BLOCK_LOG;
+ int seg_y = y >> WARP_ERROR_BLOCK_LOG;
+ segment_map[seg_y * width + seg_x] += 1;
+ }
+
+ for (int i = 0; i < height; i++) {
+ for (int j = 0; j < width; j++) {
+ uint8_t feat_count = segment_map[i * width + j];
+ segment_map[i * width + j] = (feat_count >= FEAT_COUNT_TR);
+ seg_count += (segment_map[i * width + j]);
+ }
+ }
+
+ // If this motion does not make up a large enough portion of the frame,
+ // use the unsegmented version of the error metric
+ if (seg_count < SEG_COUNT_TR)
+ memset(segment_map, 1, width * height * sizeof(*segment_map));
+}
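+
+// For scale: at 1920x1080 the segment map spans 60x34 blocks of
+// WARP_ERROR_BLOCK (32x32) pixels, so the SEG_COUNT_TR (48) inlier blocks
+// required above correspond to roughly 2% of the frame area.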
diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h
new file mode 100644
index 0000000000..8c9c60f0f5
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+#define AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_scale/yv12config.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RANSAC_NUM_MOTIONS 1
+#define GM_MAX_REFINEMENT_STEPS 5
+#define MAX_DIRECTIONS 2
+
+// The structure holds a valid reference frame type and its temporal distance
+// from the source frame.
+typedef struct {
+ int distance;
+ MV_REFERENCE_FRAME frame;
+} FrameDistPair;
+
+typedef struct {
+ // Array of structure which holds the global motion parameters for a given
+ // motion model. motion_models[i] holds the parameters for a given motion
+ // model for the ith ransac motion.
+ MotionModel motion_models[RANSAC_NUM_MOTIONS];
+
+ // Per-block map marking the regions that contain inliers from the motion
+ // model, as computed by av1_compute_feature_segmentation_map().
+ uint8_t *segment_map;
+} GlobalMotionData;
+
+typedef struct {
+ // Holds the mapping of each thread to past/future direction.
+ // thread_id_to_dir[i] indicates the direction id (past - 0/future - 1)
+ // assigned to the ith thread.
+ int8_t thread_id_to_dir[MAX_NUM_THREADS];
+
+ // A flag which holds the early exit status based on the speed feature
+ // 'prune_ref_frame_for_gm_search'. early_exit[i] will be set if the speed
+ // feature based early exit happens in the direction 'i'.
+ int8_t early_exit[MAX_DIRECTIONS];
+
+ // Counter for the next reference frame to be processed.
+ // next_frame_to_process[i] will hold the count of next reference frame to be
+ // processed in the direction 'i'.
+ int8_t next_frame_to_process[MAX_DIRECTIONS];
+} JobInfo;
+
+typedef struct {
+ // Data related to assigning jobs for global motion multi-threading.
+ JobInfo job_info;
+
+#if CONFIG_MULTITHREAD
+ // Mutex lock used while dispatching jobs.
+ pthread_mutex_t *mutex_;
+#endif
+
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool gm_mt_exit;
+} AV1GlobalMotionSync;
+
+void av1_convert_model_to_params(const double *params,
+ WarpedMotionParams *model);
+
+// Criteria for accepting a global motion model
+static const double erroradv_tr = 0.65;
+static const double erroradv_prod_tr = 20000;
+
+// Early exit threshold for global motion refinement
+// This is set slightly higher than erroradv_tr, as a compromise between
+// two factors:
+//
+// 1) By rejecting unpromising models early, we can reduce the encode time
+// spent trying to refine them
+//
+// 2) When we refine a model, its error may decrease to below the acceptance
+// threshold even if the model is initially above the threshold
+static const double erroradv_early_tr = 0.70;
+
+int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost);
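+// For example, with the thresholds above a model whose warped error is half
+// the unwarped error (best_erroradvantage == 0.5) passes the first test
+// (0.5 < 0.65) and is then accepted as long as 0.5 * params_cost < 20000,
+// i.e. as long as params_cost < 40000.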
+
+void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
+ int height, int *inliers,
+ int num_inliers);
+
+extern const int error_measure_lut[513];
+
+static INLINE int error_measure(int err) {
+ return error_measure_lut[256 + err];
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE int highbd_error_measure(int err, int bd) {
+ const int b = bd - 8;
+ const int bmask = (1 << b) - 1;
+ const int v = (1 << b);
+
+ // Split error into two parts and do an interpolated table lookup
+ // To compute the table index and interpolation value, we want to calculate
+ // the quotient and remainder of err / 2^b. But it is very important that
+ // the division must round down, and the remainder must be positive,
+ // ie. in the range [0, 2^b).
+ //
+ // In C, the >> and & operators do what we want, but the / and % operators
+ // give the wrong results for negative inputs. So we must use >> and & here.
+ //
+ // For example, if bd == 10 and err == -5, compare the results:
+ // (-5) >> 2 = -2, (-5) & 3 = 3
+ // vs. (-5) / 4 = -1, (-5) % 4 = -1
+ const int e1 = err >> b;
+ const int e2 = err & bmask;
+ return error_measure_lut[256 + e1] * (v - e2) +
+ error_measure_lut[257 + e1] * e2;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
+ int ref_stride, uint8_t *dst, int dst_stride,
+ int p_width, int p_height,
+ uint8_t *segment_map, int segment_map_stride);
+
+// Returns the error between the result of applying motion 'wm' to the frame
+// described by 'ref' and the frame described by 'dst'.
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
+ const uint8_t *ref, int ref_width, int ref_height,
+ int ref_stride, uint8_t *dst, int dst_stride, int p_col,
+ int p_row, int p_width, int p_height, int subsampling_x,
+ int subsampling_y, int64_t best_error,
+ uint8_t *segment_map, int segment_map_stride);
+
+// Returns the av1_warp_error between "dst" and the result of applying the
+// motion params that result from fine-tuning "wm" to "ref". Note that "wm" is
+// modified in place.
+int64_t av1_refine_integerized_param(
+ WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd,
+ uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst,
+ int d_width, int d_height, int d_stride, int n_refinements,
+ int64_t ref_frame_error, uint8_t *segment_map, int segment_map_stride);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_ENCODER_GLOBAL_MOTION_H_
diff --git a/third_party/aom/av1/encoder/global_motion_facade.c b/third_party/aom/av1/encoder/global_motion_facade.c
new file mode 100644
index 0000000000..02a4e70ed3
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion_facade.c
@@ -0,0 +1,450 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/binary_codes_writer.h"
+
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_dsp/pyramid.h"
+#include "av1/common/warped_motion.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/global_motion_facade.h"
+
+// Range of model types to search
+#define FIRST_GLOBAL_TRANS_TYPE ROTZOOM
+#define LAST_GLOBAL_TRANS_TYPE ROTZOOM
+
+// Computes the cost for the warp parameters.
+static int gm_get_params_cost(const WarpedMotionParams *gm,
+ const WarpedMotionParams *ref_gm, int allow_hp) {
+ int params_cost = 0;
+ int trans_bits, trans_prec_diff;
+ switch (gm->wmtype) {
+ case AFFINE:
+ case ROTZOOM:
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS),
+ (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+ (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+ if (gm->wmtype >= AFFINE) {
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+ (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ }
+ AOM_FALLTHROUGH_INTENDED;
+ case TRANSLATION:
+ trans_bits = (gm->wmtype == TRANSLATION)
+ ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+ : GM_ABS_TRANS_BITS;
+ trans_prec_diff = (gm->wmtype == TRANSLATION)
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+ : GM_TRANS_PREC_DIFF;
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[0] >> trans_prec_diff),
+ (gm->wmmat[0] >> trans_prec_diff));
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[1] >> trans_prec_diff),
+ (gm->wmmat[1] >> trans_prec_diff));
+ AOM_FALLTHROUGH_INTENDED;
+ case IDENTITY: break;
+ default: assert(0);
+ }
+ return (params_cost << AV1_PROB_COST_SHIFT);
+}
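+
+// Note: the bit count computed above is shifted left by AV1_PROB_COST_SHIFT
+// so that it is directly comparable with the other rate costs used in the
+// encoder's mode decisions.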
+
+// For the given reference frame, computes the global motion parameters for
+// different motion models and finds the best.
+static AOM_INLINE void compute_global_motion_for_ref_frame(
+ AV1_COMP *cpi, struct aom_internal_error_info *error_info,
+ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+ MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w,
+ const int segment_map_h, const WarpedMotionParams *ref_params) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ int src_width = cpi->source->y_crop_width;
+ int src_height = cpi->source->y_crop_height;
+ int src_stride = cpi->source->y_stride;
+ assert(ref_buf[frame] != NULL);
+ int bit_depth = cpi->common.seq_params->bit_depth;
+ GlobalMotionMethod global_motion_method = default_global_motion_method;
+ int num_refinements = cpi->sf.gm_sf.num_refinement_steps;
+ bool mem_alloc_failed = false;
+
+ // Select the best model based on fractional error reduction.
+ // By initializing this to erroradv_tr, the same logic which is used to
+ // select the best model will automatically filter out any model which
+ // doesn't meet the required quality threshold
+ double best_erroradv = erroradv_tr;
+ for (TransformationType model = FIRST_GLOBAL_TRANS_TYPE;
+ model <= LAST_GLOBAL_TRANS_TYPE; ++model) {
+ if (!aom_compute_global_motion(
+ model, cpi->source, ref_buf[frame], bit_depth, global_motion_method,
+ motion_models, RANSAC_NUM_MOTIONS, &mem_alloc_failed)) {
+ if (mem_alloc_failed) {
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate global motion buffers");
+ }
+ continue;
+ }
+
+ for (int i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+ if (motion_models[i].num_inliers == 0) continue;
+
+ WarpedMotionParams tmp_wm_params;
+ av1_convert_model_to_params(motion_models[i].params, &tmp_wm_params);
+
+ // Skip models that we won't use (IDENTITY or TRANSLATION)
+ //
+ // For IDENTITY type models, we don't need to evaluate anything because
+ // all the following logic is effectively comparing the estimated model
+ // to an identity model.
+ //
+ // For TRANSLATION type global motion models, gm_get_motion_vector() gives
+ // the wrong motion vector (see comments in that function for details).
+ // As translation-type models do not give much gain, we can avoid this bug
+ // by never choosing a TRANSLATION type model
+ if (tmp_wm_params.wmtype <= TRANSLATION) continue;
+
+ av1_compute_feature_segmentation_map(
+ segment_map, segment_map_w, segment_map_h, motion_models[i].inliers,
+ motion_models[i].num_inliers);
+
+ int64_t ref_frame_error = av1_segmented_frame_error(
+ is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer,
+ ref_buf[frame]->y_stride, cpi->source->y_buffer, src_stride,
+ src_width, src_height, segment_map, segment_map_w);
+
+ if (ref_frame_error == 0) continue;
+
+ const int64_t warp_error = av1_refine_integerized_param(
+ &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), xd->bd,
+ ref_buf[frame]->y_buffer, ref_buf[frame]->y_crop_width,
+ ref_buf[frame]->y_crop_height, ref_buf[frame]->y_stride,
+ cpi->source->y_buffer, src_width, src_height, src_stride,
+ num_refinements, ref_frame_error, segment_map, segment_map_w);
+
+ // av1_refine_integerized_param() can return a simpler model type than
+ // its input, so re-check model type here
+ if (tmp_wm_params.wmtype <= TRANSLATION) continue;
+
+ double erroradvantage = (double)warp_error / ref_frame_error;
+
+ if (erroradvantage < best_erroradv) {
+ best_erroradv = erroradvantage;
+ // Save the wm_params modified by av1_refine_integerized_param()
+ // rather than the motion index, to avoid rerunning the refinement
+ // below.
+ memcpy(&(cm->global_motion[frame]), &tmp_wm_params,
+ sizeof(WarpedMotionParams));
+ }
+ }
+ }
+
+ if (!av1_get_shear_params(&cm->global_motion[frame]))
+ cm->global_motion[frame] = default_warp_params;
+
+#if 0
+ // We never choose translational models, so this code is disabled
+ if (cm->global_motion[frame].wmtype == TRANSLATION) {
+ cm->global_motion[frame].wmmat[0] =
+ convert_to_trans_prec(cm->features.allow_high_precision_mv,
+ cm->global_motion[frame].wmmat[0]) *
+ GM_TRANS_ONLY_DECODE_FACTOR;
+ cm->global_motion[frame].wmmat[1] =
+ convert_to_trans_prec(cm->features.allow_high_precision_mv,
+ cm->global_motion[frame].wmmat[1]) *
+ GM_TRANS_ONLY_DECODE_FACTOR;
+ }
+#endif
+
+ if (cm->global_motion[frame].wmtype == IDENTITY) return;
+
+ // If the best error advantage found doesn't meet the threshold for
+ // this motion type, revert to IDENTITY.
+ if (!av1_is_enough_erroradvantage(
+ best_erroradv,
+ gm_get_params_cost(&cm->global_motion[frame], ref_params,
+ cm->features.allow_high_precision_mv))) {
+ cm->global_motion[frame] = default_warp_params;
+ }
+}
+
+// Computes global motion for the given reference frame.
+void av1_compute_gm_for_valid_ref_frames(
+ AV1_COMP *cpi, struct aom_internal_error_info *error_info,
+ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+ MotionModel *motion_models, uint8_t *segment_map, int segment_map_w,
+ int segment_map_h) {
+ AV1_COMMON *const cm = &cpi->common;
+ const WarpedMotionParams *ref_params =
+ cm->prev_frame ? &cm->prev_frame->global_motion[frame]
+ : &default_warp_params;
+
+ compute_global_motion_for_ref_frame(cpi, error_info, ref_buf, frame,
+ motion_models, segment_map, segment_map_w,
+ segment_map_h, ref_params);
+}
+
+// Loops over valid reference frames and computes global motion estimation.
+static AOM_INLINE void compute_global_motion_for_references(
+ AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES],
+ FrameDistPair reference_frame[REF_FRAMES - 1], int num_ref_frames,
+ MotionModel *motion_models, uint8_t *segment_map, const int segment_map_w,
+ const int segment_map_h) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct aom_internal_error_info *const error_info =
+ cpi->td.mb.e_mbd.error_info;
+ // Compute global motion w.r.t. reference frames starting from the nearest ref
+ // frame in a given direction.
+ for (int frame = 0; frame < num_ref_frames; frame++) {
+ int ref_frame = reference_frame[frame].frame;
+ av1_compute_gm_for_valid_ref_frames(cpi, error_info, ref_buf, ref_frame,
+ motion_models, segment_map,
+ segment_map_w, segment_map_h);
+ // If global motion w.r.t. current ref frame is
+ // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t
+ // the remaining ref frames in that direction.
+ if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search &&
+ cm->global_motion[ref_frame].wmtype <= TRANSLATION)
+ break;
+ }
+}
+
+// Compares the distance in 'a' and 'b'. Returns 1 if the frame corresponding to
+// 'a' is farther, -1 if the frame corresponding to 'b' is farther, 0 otherwise.
+static int compare_distance(const void *a, const void *b) {
+ const int diff =
+ ((FrameDistPair *)a)->distance - ((FrameDistPair *)b)->distance;
+ if (diff > 0)
+ return 1;
+ else if (diff < 0)
+ return -1;
+ return 0;
+}
+
+static int disable_gm_search_based_on_stats(const AV1_COMP *const cpi) {
+ int is_gm_present = 1;
+
+ // Check the number of GM models only in GF groups with ARF frames. GM param
+ // estimation is always done for GF groups with no ARF frames (flat GOPs).
+ if (cpi->ppi->gf_group.arf_index > -1) {
+ // valid_gm_model_found is initialized to INT32_MAX at the beginning of
+ // every GF group. Therefore, GM param estimation is always done for all
+ // frames until at least 1 frame each of ARF_UPDATE, INTNL_ARF_UPDATE and
+ // LF_UPDATE is encoded in a GF group. For subsequent frames, GM param
+ // estimation is disabled if no valid models have been found in any of the
+ // three update types.
+ is_gm_present = (cpi->ppi->valid_gm_model_found[ARF_UPDATE] != 0) ||
+ (cpi->ppi->valid_gm_model_found[INTNL_ARF_UPDATE] != 0) ||
+ (cpi->ppi->valid_gm_model_found[LF_UPDATE] != 0);
+ }
+ return !is_gm_present;
+}
+
+// Prunes reference frames for global motion estimation based on the speed
+// feature 'gm_search_type'.
+static int do_gm_search_logic(SPEED_FEATURES *const sf, int frame) {
+ (void)frame;
+ switch (sf->gm_sf.gm_search_type) {
+ case GM_FULL_SEARCH: return 1;
+ case GM_REDUCED_REF_SEARCH_SKIP_L2_L3:
+ return !(frame == LAST2_FRAME || frame == LAST3_FRAME);
+ case GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2:
+ return !(frame == LAST2_FRAME || frame == LAST3_FRAME ||
+ (frame == ALTREF2_FRAME));
+ case GM_SEARCH_CLOSEST_REFS_ONLY: return 1;
+ case GM_DISABLE_SEARCH: return 0;
+ default: assert(0);
+ }
+ return 1;
+}
+
+// Populates valid reference frames in past/future directions in
+// 'reference_frames' and their count in 'num_ref_frames'.
+static AOM_INLINE void update_valid_ref_frames_for_gm(
+ AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES],
+ FrameDistPair reference_frames[MAX_DIRECTIONS][REF_FRAMES - 1],
+ int *num_ref_frames) {
+ AV1_COMMON *const cm = &cpi->common;
+ int *num_past_ref_frames = &num_ref_frames[0];
+ int *num_future_ref_frames = &num_ref_frames[1];
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ int ref_pruning_enabled = is_frame_eligible_for_ref_pruning(
+ gf_group, cpi->sf.inter_sf.selective_ref_frame, 1, cpi->gf_frame_index);
+ int cur_frame_gm_disabled = 0;
+ int pyr_lvl = cm->cur_frame->pyramid_level;
+
+ if (cpi->sf.gm_sf.disable_gm_search_based_on_stats) {
+ cur_frame_gm_disabled = disable_gm_search_based_on_stats(cpi);
+ }
+
+ for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) {
+ const MV_REFERENCE_FRAME ref_frame[2] = { frame, NONE_FRAME };
+ RefCntBuffer *buf = get_ref_frame_buf(cm, frame);
+ const int ref_disabled =
+ !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]);
+ ref_buf[frame] = NULL;
+ cm->global_motion[frame] = default_warp_params;
+ // Skip global motion estimation for invalid ref frames
+ if (buf == NULL ||
+ (ref_disabled && cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE)) {
+ continue;
+ } else {
+ ref_buf[frame] = &buf->buf;
+ }
+
+ int prune_ref_frames =
+ ref_pruning_enabled &&
+ prune_ref_by_selective_ref_frame(cpi, NULL, ref_frame,
+ cm->cur_frame->ref_display_order_hint);
+ int ref_pyr_lvl = buf->pyramid_level;
+
+ if (ref_buf[frame]->y_crop_width == cpi->source->y_crop_width &&
+ ref_buf[frame]->y_crop_height == cpi->source->y_crop_height &&
+ do_gm_search_logic(&cpi->sf, frame) && !prune_ref_frames &&
+ ref_pyr_lvl <= pyr_lvl && !cur_frame_gm_disabled) {
+ assert(ref_buf[frame] != NULL);
+ const int relative_frame_dist = av1_encoder_get_relative_dist(
+ buf->display_order_hint, cm->cur_frame->display_order_hint);
+ // Populate past and future ref frames.
+ // reference_frames[0][] indicates past direction and
+ // reference_frames[1][] indicates future direction.
+ if (relative_frame_dist == 0) {
+ // Skip global motion estimation for frames at the same nominal instant.
+ // This will generally be either a "real" frame coded against a
+ // temporal filtered version, or a higher spatial layer coded against
+ // a lower spatial layer. In either case, the optimal motion model will
+ // be IDENTITY, so we don't need to search explicitly.
+ } else if (relative_frame_dist < 0) {
+ reference_frames[0][*num_past_ref_frames].distance =
+ abs(relative_frame_dist);
+ reference_frames[0][*num_past_ref_frames].frame = frame;
+ (*num_past_ref_frames)++;
+ } else {
+ reference_frames[1][*num_future_ref_frames].distance =
+ abs(relative_frame_dist);
+ reference_frames[1][*num_future_ref_frames].frame = frame;
+ (*num_future_ref_frames)++;
+ }
+ }
+ }
+}
+
+// Initializes parameters used for computing global motion.
+static AOM_INLINE void setup_global_motion_info_params(AV1_COMP *cpi) {
+ GlobalMotionInfo *const gm_info = &cpi->gm_info;
+ YV12_BUFFER_CONFIG *source = cpi->source;
+
+ gm_info->segment_map_w =
+ (source->y_crop_width + WARP_ERROR_BLOCK - 1) >> WARP_ERROR_BLOCK_LOG;
+ gm_info->segment_map_h =
+ (source->y_crop_height + WARP_ERROR_BLOCK - 1) >> WARP_ERROR_BLOCK_LOG;
+
+ memset(gm_info->reference_frames, -1,
+ sizeof(gm_info->reference_frames[0][0]) * MAX_DIRECTIONS *
+ (REF_FRAMES - 1));
+ av1_zero(gm_info->num_ref_frames);
+
+ // Populate ref_buf for valid ref frames in global motion
+ update_valid_ref_frames_for_gm(cpi, gm_info->ref_buf,
+ gm_info->reference_frames,
+ gm_info->num_ref_frames);
+
+ // Sort the past and future ref frames in the ascending order of their
+ // distance from the current frame. reference_frames[0] => past direction
+ // and reference_frames[1] => future direction.
+ qsort(gm_info->reference_frames[0], gm_info->num_ref_frames[0],
+ sizeof(gm_info->reference_frames[0][0]), compare_distance);
+ qsort(gm_info->reference_frames[1], gm_info->num_ref_frames[1],
+ sizeof(gm_info->reference_frames[1][0]), compare_distance);
+
+ if (cpi->sf.gm_sf.gm_search_type == GM_SEARCH_CLOSEST_REFS_ONLY) {
+ // Filter down to the nearest two ref frames.
+ // Prefer one past and one future ref over two past refs, even if
+ // the second past ref is closer
+ if (gm_info->num_ref_frames[1] > 0) {
+ gm_info->num_ref_frames[0] = AOMMIN(gm_info->num_ref_frames[0], 1);
+ gm_info->num_ref_frames[1] = AOMMIN(gm_info->num_ref_frames[1], 1);
+ } else {
+ gm_info->num_ref_frames[0] = AOMMIN(gm_info->num_ref_frames[0], 2);
+ }
+ }
+}
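+
+// For example, under GM_SEARCH_CLOSEST_REFS_ONLY three past and two future
+// refs are filtered down to one past and one future ref, while three past
+// refs with no future refs are filtered down to the two closest past refs.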
+
+// Computes global motion w.r.t. valid reference frames.
+static AOM_INLINE void global_motion_estimation(AV1_COMP *cpi) {
+ GlobalMotionInfo *const gm_info = &cpi->gm_info;
+ GlobalMotionData *gm_data = &cpi->td.gm_data;
+
+ // Compute global motion w.r.t. past reference frames and future reference
+ // frames
+ for (int dir = 0; dir < MAX_DIRECTIONS; dir++) {
+ if (gm_info->num_ref_frames[dir] > 0)
+ compute_global_motion_for_references(
+ cpi, gm_info->ref_buf, gm_info->reference_frames[dir],
+ gm_info->num_ref_frames[dir], gm_data->motion_models,
+ gm_data->segment_map, gm_info->segment_map_w, gm_info->segment_map_h);
+ }
+}
+
+// Computes global motion estimation for the current frame. This computation
+// happens once per frame, and the winning motion model parameters are stored
+// in cm->cur_frame->global_motion.
+void av1_compute_global_motion_facade(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ GlobalMotionInfo *const gm_info = &cpi->gm_info;
+
+ if (cpi->oxcf.tool_cfg.enable_global_motion) {
+ if (cpi->gf_frame_index == 0) {
+ for (int i = 0; i < FRAME_UPDATE_TYPES; i++) {
+ cpi->ppi->valid_gm_model_found[i] = INT32_MAX;
+#if CONFIG_FPMT_TEST
+ if (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE)
+ cpi->ppi->temp_valid_gm_model_found[i] = INT32_MAX;
+#endif
+ }
+ }
+ }
+
+ if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source &&
+ cpi->oxcf.tool_cfg.enable_global_motion && !gm_info->search_done &&
+ cpi->sf.gm_sf.gm_search_type != GM_DISABLE_SEARCH) {
+ setup_global_motion_info_params(cpi);
+ // Terminate early if the total number of reference frames is zero.
+ if (cpi->gm_info.num_ref_frames[0] || cpi->gm_info.num_ref_frames[1]) {
+ gm_alloc_data(cpi, &cpi->td.gm_data);
+ if (cpi->mt_info.num_workers > 1)
+ av1_global_motion_estimation_mt(cpi);
+ else
+ global_motion_estimation(cpi);
+ gm_dealloc_data(&cpi->td.gm_data);
+ gm_info->search_done = 1;
+ }
+ }
+ memcpy(cm->cur_frame->global_motion, cm->global_motion,
+ sizeof(cm->cur_frame->global_motion));
+}
diff --git a/third_party/aom/av1/encoder/global_motion_facade.h b/third_party/aom/av1/encoder/global_motion_facade.h
new file mode 100644
index 0000000000..f13989aa25
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion_facade.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_
+#define AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+struct yv12_buffer_config;
+struct AV1_COMP;
+
+// Allocates memory for members of GlobalMotionData.
+static AOM_INLINE void gm_alloc_data(AV1_COMP *cpi, GlobalMotionData *gm_data) {
+ AV1_COMMON *cm = &cpi->common;
+ GlobalMotionInfo *gm_info = &cpi->gm_info;
+
+ CHECK_MEM_ERROR(cm, gm_data->segment_map,
+ aom_malloc(sizeof(*gm_data->segment_map) *
+ gm_info->segment_map_w * gm_info->segment_map_h));
+
+ av1_zero_array(gm_data->motion_models, RANSAC_NUM_MOTIONS);
+ for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+ CHECK_MEM_ERROR(cm, gm_data->motion_models[m].inliers,
+ aom_malloc(sizeof(*gm_data->motion_models[m].inliers) * 2 *
+ MAX_CORNERS));
+ }
+}
+
+// Deallocates the memory allocated for members of GlobalMotionData.
+static AOM_INLINE void gm_dealloc_data(GlobalMotionData *gm_data) {
+ aom_free(gm_data->segment_map);
+ gm_data->segment_map = NULL;
+ for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+ aom_free(gm_data->motion_models[m].inliers);
+ gm_data->motion_models[m].inliers = NULL;
+ }
+}
+
+void av1_compute_gm_for_valid_ref_frames(
+ AV1_COMP *cpi, struct aom_internal_error_info *error_info,
+ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame,
+ MotionModel *motion_models, uint8_t *segment_map, int segment_map_w,
+ int segment_map_h);
+void av1_compute_global_motion_facade(struct AV1_COMP *cpi);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_GLOBAL_MOTION_FACADE_H_
diff --git a/third_party/aom/av1/encoder/gop_structure.c b/third_party/aom/av1/encoder/gop_structure.c
new file mode 100644
index 0000000000..5078098450
--- /dev/null
+++ b/third_party/aom/av1/encoder/gop_structure.c
@@ -0,0 +1,867 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "av1/common/blockd.h"
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "av1/common/av1_common_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/pass2_strategy.h"
+
+// This function sets gf_group->frame_parallel_level for LF_UPDATE frames based
+// on the value of parallel_frame_count.
+static void set_frame_parallel_level(int *frame_parallel_level,
+ int *parallel_frame_count,
+ int max_parallel_frames) {
+ assert(*parallel_frame_count > 0);
+ // parallel_frame_count > 1 indicates subsequent frame(s) in the current
+ // parallel encode set.
+ *frame_parallel_level = 1 + (*parallel_frame_count > 1);
+ // Update the count of parallel frames.
+ (*parallel_frame_count)++;
+ if (*parallel_frame_count > max_parallel_frames) *parallel_frame_count = 1;
+}
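+
+// For example, assuming parallel_frame_count starts at 1, successive
+// LF_UPDATE frames are assigned levels 1, 2, 1, 2, ... when
+// max_parallel_frames == 2, and 1, 2, 2, 1, 2, 2, ... when
+// max_parallel_frames == 3 (level 1 marks the start of each parallel set).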
+
+// This function sets gf_group->src_offset based on frame_parallel_level.
+// Outputs are gf_group->src_offset and first_frame_index.
+static void set_src_offset(GF_GROUP *const gf_group, int *first_frame_index,
+ int cur_frame_idx, int frame_ind) {
+ if (gf_group->frame_parallel_level[frame_ind] > 0) {
+ if (gf_group->frame_parallel_level[frame_ind] == 1) {
+ *first_frame_index = cur_frame_idx;
+ }
+
+ // Obtain the offset of the frame at frame_ind in the lookahead queue by
+ // subtracting the display order hint of the first frame in the parallel
+ // encode set (at first_frame_index) from the display order hint of the
+ // current frame.
+ gf_group->src_offset[frame_ind] =
+ (cur_frame_idx + gf_group->arf_src_offset[frame_ind]) -
+ *first_frame_index;
+ }
+}
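+
+// Worked example (hypothetical indices): if a parallel encode set starts at
+// cur_frame_idx 10 (frame_parallel_level 1), that frame gets src_offset 0,
+// and a level-2 LF_UPDATE frame at cur_frame_idx 11 gets src_offset
+// (11 + 0) - 10 = 1, i.e. its source is one slot further into the lookahead
+// queue.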
+
+// Sets the GF_GROUP params for LF_UPDATE frames.
+static AOM_INLINE void set_params_for_leaf_frames(
+ const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+ const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+ GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
+ int *parallel_frame_count, int max_parallel_frames,
+ int do_frame_parallel_encode, int *first_frame_index, int *cur_disp_index,
+ int layer_depth, int start, int end) {
+ gf_group->update_type[*frame_ind] = LF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS;
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, layer_depth);
+ gf_group->display_idx[*frame_ind] = (*cur_disp_index);
+ gf_group->arf_boost[*frame_ind] =
+ av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start,
+ end - start, 0, NULL, NULL, 0);
+ ++(*cur_disp_index);
+
+ // Set the level of parallelism for the LF_UPDATE frame.
+ if (do_frame_parallel_encode) {
+ set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind],
+ parallel_frame_count, max_parallel_frames);
+ // Set LF_UPDATE frames as non-reference frames.
+ gf_group->is_frame_non_ref[*frame_ind] = true;
+ }
+ set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+
+ ++(*frame_ind);
+ ++(*cur_frame_idx);
+}
+
+// Sets the GF_GROUP params for INTNL_OVERLAY_UPDATE frames.
+static AOM_INLINE void set_params_for_intnl_overlay_frames(
+ GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
+ int *first_frame_index, int *cur_disp_index, int layer_depth) {
+ gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->layer_depth[*frame_ind] = layer_depth;
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+ gf_group->display_idx[*frame_ind] = (*cur_disp_index);
+ ++(*cur_disp_index);
+
+ set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+ ++(*frame_ind);
+ ++(*cur_frame_idx);
+}
+
+// Sets the GF_GROUP params for INTNL_ARF_UPDATE frames.
+static AOM_INLINE void set_params_for_internal_arfs(
+ const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+ const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+ GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
+ int *parallel_frame_count, int max_parallel_frames,
+ int do_frame_parallel_encode, int *first_frame_index, int depth_thr,
+ int *cur_disp_idx, int layer_depth, int arf_src_offset, int offset,
+ int f_frames, int b_frames) {
+ gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = arf_src_offset;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->layer_depth[*frame_ind] = layer_depth;
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+ gf_group->display_idx[*frame_ind] =
+ (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind];
+ gf_group->arf_boost[*frame_ind] =
+ av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, offset,
+ f_frames, b_frames, NULL, NULL, 0);
+
+ if (do_frame_parallel_encode) {
+ if (depth_thr != INT_MAX) {
+ assert(depth_thr == 3 || depth_thr == 4);
+ assert(IMPLIES(depth_thr == 3, layer_depth == 4));
+ assert(IMPLIES(depth_thr == 4, layer_depth == 5));
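+      // These reordered internal ARFs are one layer deeper than depth_thr: a
+      // depth_thr of 3 (gf-interval 16) gives layer_depth 4, and a depth_thr
+      // of 4 (gf-interval 32) gives layer_depth 5.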
+ // Set frame_parallel_level of the first frame in the given layer to 1.
+ if (gf_group->layer_depth[(*frame_ind) - 1] != layer_depth) {
+ gf_group->frame_parallel_level[*frame_ind] = 1;
+ } else {
+        // Set frame_parallel_level of a subsequent frame in the same layer
+        // to 2.
+ assert(gf_group->frame_parallel_level[(*frame_ind) - 1] == 1);
+ gf_group->frame_parallel_level[*frame_ind] = 2;
+ // Store the display order hints of the past 2 INTNL_ARF_UPDATE
+        // frames which would not have been displayed at the time of encoding
+        // the current frame.
+ gf_group->skip_frame_refresh[*frame_ind][0] =
+ gf_group->display_idx[(*frame_ind) - 1];
+ gf_group->skip_frame_refresh[*frame_ind][1] =
+ gf_group->display_idx[(*frame_ind) - 2];
+        // Set the display_idx of the frame_parallel_level 1 frame in
+ // gf_group->skip_frame_as_ref.
+ gf_group->skip_frame_as_ref[*frame_ind] =
+ gf_group->display_idx[(*frame_ind) - 1];
+ }
+ }
+ // If max_parallel_frames is not exceeded and if the frame will not be
+ // temporally filtered, encode the next internal ARF frame in parallel.
+ if (*parallel_frame_count > 1 &&
+ *parallel_frame_count <= max_parallel_frames) {
+ if (gf_group->arf_src_offset[*frame_ind] < TF_LOOKAHEAD_IDX_THR)
+ gf_group->frame_parallel_level[*frame_ind] = 2;
+ *parallel_frame_count = 1;
+ }
+ }
+ set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+ ++(*frame_ind);
+}
+
+// Set parameters for frames between 'start' and 'end' (excluding both).
+static void set_multi_layer_params_for_fp(
+ const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+ GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc,
+ RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end,
+ int *cur_frame_idx, int *frame_ind, int *parallel_frame_count,
+ int max_parallel_frames, int do_frame_parallel_encode,
+ int *first_frame_index, int depth_thr, int *cur_disp_idx, int layer_depth) {
+ const int num_frames_to_process = end - start;
+
+ // Either we are at the last level of the pyramid, or we don't have enough
+  // frames between 'start' and 'end' to create one more level.
+ if (layer_depth > gf_group->max_layer_depth_allowed ||
+ num_frames_to_process < 3) {
+ // Leaf nodes.
+ while (start < end) {
+ set_params_for_leaf_frames(twopass, twopass_frame, p_rc, frame_info,
+ gf_group, cur_frame_idx, frame_ind,
+ parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index,
+ cur_disp_idx, layer_depth, start, end);
+ ++start;
+ }
+ } else {
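+    // Pick the midpoint of (start, end); the frame at 'm' is coded as the
+    // internal ARF for this layer.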
+ const int m = (start + end - 1) / 2;
+
+ // Internal ARF.
+ int arf_src_offset = m - start;
+ set_params_for_internal_arfs(
+ twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+ frame_ind, parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index, INT_MAX, cur_disp_idx,
+ layer_depth, arf_src_offset, m, end - m, m - start);
+
+ // If encode reordering is enabled, configure the multi-layers accordingly
+    // and return. For example, the encode order for gf-interval 16 after
+ // reordering would be 0-> 16-> 8-> 4-> 2-> 6-> 1-> 3-> 5-> 7-> 12-> 10->
+ // 14-> 9-> 11-> 13-> 15.
+ if (layer_depth >= depth_thr) {
+ int m1 = (m + start - 1) / 2;
+ int m2 = (m + 1 + end) / 2;
+ int arf_src_offsets[2] = { m1 - start, m2 - start };
+ // Parameters to compute arf_boost.
+ int offset[2] = { m1, m2 };
+ int f_frames[2] = { m - m1, end - m2 };
+ int b_frames[2] = { m1 - start, m2 - (m + 1) };
+
+ // Set GF_GROUP params for INTNL_ARF_UPDATE frames which are reordered.
+ for (int i = 0; i < 2; i++) {
+ set_params_for_internal_arfs(
+ twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+ frame_ind, parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index, depth_thr,
+ cur_disp_idx, layer_depth + 1, arf_src_offsets[i], offset[i],
+ f_frames[i], b_frames[i]);
+ }
+
+ // Initialize the start and end indices to configure LF_UPDATE frames.
+ int start_idx[4] = { start, m1 + 1, m + 1, end - 1 };
+ int end_idx[4] = { m1, m, m2, end };
+ int layer_depth_for_intnl_overlay[4] = { layer_depth + 1, layer_depth,
+ layer_depth + 1, INVALID_IDX };
+
+ // Set GF_GROUP params for the rest of LF_UPDATE and INTNL_OVERLAY_UPDATE
+ // frames after reordering.
+ for (int i = 0; i < 4; i++) {
+ set_multi_layer_params_for_fp(
+ twopass, twopass_frame, gf_group, p_rc, rc, frame_info,
+ start_idx[i], end_idx[i], cur_frame_idx, frame_ind,
+ parallel_frame_count, max_parallel_frames, do_frame_parallel_encode,
+ first_frame_index, depth_thr, cur_disp_idx, layer_depth + 2);
+ if (layer_depth_for_intnl_overlay[i] != INVALID_IDX)
+ set_params_for_intnl_overlay_frames(
+ gf_group, cur_frame_idx, frame_ind, first_frame_index,
+ cur_disp_idx, layer_depth_for_intnl_overlay[i]);
+ }
+ return;
+ }
+
+ // Frames displayed before this internal ARF.
+ set_multi_layer_params_for_fp(
+ twopass, twopass_frame, gf_group, p_rc, rc, frame_info, start, m,
+ cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx,
+ layer_depth + 1);
+
+ // Overlay for internal ARF.
+ set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind,
+ first_frame_index, cur_disp_idx,
+ layer_depth);
+
+ // Frames displayed after this internal ARF.
+ set_multi_layer_params_for_fp(
+ twopass, twopass_frame, gf_group, p_rc, rc, frame_info, m + 1, end,
+ cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx,
+ layer_depth + 1);
+ }
+}
+
+// Structure for bookkeeping start, end and display indices to configure
+// INTNL_ARF_UPDATE frames.
+typedef struct {
+ int start;
+ int end;
+ int display_index;
+} FRAME_REORDER_INFO;
+
+// Updates the stats required to configure the GF_GROUP.
+static AOM_INLINE void fill_arf_frame_stats(FRAME_REORDER_INFO *arf_frame_stats,
+ int arf_frame_index,
+ int display_idx, int start,
+ int end) {
+ arf_frame_stats[arf_frame_index].start = start;
+ arf_frame_stats[arf_frame_index].end = end;
+ arf_frame_stats[arf_frame_index].display_index = display_idx;
+}
+
+// Sets GF_GROUP params for INTNL_ARF_UPDATE frames. Also populates
+// doh_gf_index_map and arf_frame_stats.
+static AOM_INLINE void set_params_for_internal_arfs_in_gf14(
+ GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats,
+ int *cur_frame_idx, int *cur_disp_idx, int *frame_ind,
+ int *count_arf_frames, int *doh_gf_index_map, int start, int end,
+ int layer_depth, int layer_with_parallel_encodes) {
+ int index = (start + end - 1) / 2;
+ gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = index - 1;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->layer_depth[*frame_ind] = layer_depth;
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+ gf_group->display_idx[*frame_ind] =
+ (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind];
+
+ // Update the display index of the current frame with its gf index.
+ doh_gf_index_map[index] = *frame_ind;
+ if (layer_with_parallel_encodes) {
+ assert(layer_depth == 4);
+ // Set frame_parallel_level of the first frame in the given layer depth
+ // to 1.
+ if (gf_group->layer_depth[(*frame_ind) - 1] != layer_depth) {
+ gf_group->frame_parallel_level[*frame_ind] = 1;
+ } else {
+      // Set frame_parallel_level of a subsequent frame in the same layer
+      // depth to 2.
+ assert(gf_group->frame_parallel_level[(*frame_ind) - 1] == 1);
+ gf_group->frame_parallel_level[*frame_ind] = 2;
+      // Set the display_idx of the frame_parallel_level 1 frame in
+ // gf_group->skip_frame_as_ref.
+ gf_group->skip_frame_as_ref[*frame_ind] =
+ gf_group->display_idx[(*frame_ind) - 1];
+ }
+ }
+ ++(*frame_ind);
+
+ // Update arf_frame_stats.
+ fill_arf_frame_stats(arf_frame_stats, *count_arf_frames, index, start, end);
+ ++(*count_arf_frames);
+}
+
+// Sets GF_GROUP params for all INTNL_ARF_UPDATE frames in the given layer
+// depth.
+static AOM_INLINE void set_params_for_cur_layer_frames(
+ GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats,
+ int *cur_frame_idx, int *cur_disp_idx, int *frame_ind,
+ int *count_arf_frames, int *doh_gf_index_map, int num_dir, int node_start,
+ int node_end, int layer_depth) {
+ assert(num_dir < 3);
+ int start, end;
+ // Iterate through the nodes in the previous layer depth.
+ for (int i = node_start; i < node_end; i++) {
+    // For each node, check if a frame can be coded as an INTNL_ARF_UPDATE
+    // frame in either direction.
+ for (int dir = 0; dir < num_dir; dir++) {
+ // Checks for a frame to the left of current node.
+ if (dir == 0) {
+ start = arf_frame_stats[i].start;
+ end = arf_frame_stats[i].display_index;
+ } else {
+        // Checks for a frame to the right of the current node.
+ start = arf_frame_stats[i].display_index + 1;
+ end = arf_frame_stats[i].end;
+ }
+ const int num_frames_to_process = end - start;
+      // Checks if a frame can be coded as an INTNL_ARF_UPDATE frame. If
+ // num_frames_to_process is less than 3, then there are not enough frames
+ // between 'start' and 'end' to create another level.
+ if (num_frames_to_process >= 3) {
+ // Flag to indicate the lower layer depths for which parallel encoding
+ // is enabled. Currently enabled for layer 4 frames.
+ int layer_with_parallel_encodes = layer_depth == 4;
+ set_params_for_internal_arfs_in_gf14(
+ gf_group, arf_frame_stats, cur_frame_idx, cur_disp_idx, frame_ind,
+ count_arf_frames, doh_gf_index_map, start, end, layer_depth,
+ layer_with_parallel_encodes);
+ }
+ }
+ }
+}
+
+// Configures multi-layers of the GF_GROUP when consecutive encoding of frames
+// in the same layer depth is enabled.
+static AOM_INLINE void set_multi_layer_params_for_gf14(
+ const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+ const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+ GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats,
+ int *cur_frame_idx, int *frame_ind, int *count_arf_frames,
+ int *doh_gf_index_map, int *parallel_frame_count, int *first_frame_index,
+ int *cur_disp_index, int gf_interval, int layer_depth,
+ int max_parallel_frames) {
+ assert(layer_depth == 2);
+ assert(gf_group->max_layer_depth_allowed >= 4);
+ int layer, node_start, node_end = 0;
+  // Maximum layer depth excluding LF_UPDATE frames is 4, since this function
+  // applies only to gf-interval 14.
+ const int max_layer_depth = 4;
+  // Iterate through each layer depth, from 2 through 'max_layer_depth'.
+ for (layer = layer_depth; layer <= max_layer_depth; layer++) {
+    // 'node_start' and 'node_end' indicate the range of nodes from the
+    // previous layer depth to be considered. They also correspond to indices
+    // into arf_frame_stats.
+ node_start = node_end;
+ node_end = (*count_arf_frames);
+ // 'num_dir' indicates the number of directions to traverse w.r.t. a given
+ // node in order to choose an INTNL_ARF_UPDATE frame. Layer depth 2 would
+ // have only one frame and hence needs to traverse only in the left
+    // direction w.r.t. the node in the previous layer.
+ int num_dir = layer == 2 ? 1 : 2;
+ set_params_for_cur_layer_frames(gf_group, arf_frame_stats, cur_frame_idx,
+ cur_disp_index, frame_ind, count_arf_frames,
+ doh_gf_index_map, num_dir, node_start,
+ node_end, layer);
+ }
+
+ for (int i = 1; i < gf_interval; i++) {
+ // Since doh_gf_index_map is already populated for all INTNL_ARF_UPDATE
+ // frames in the GF_GROUP, any frame with INVALID_IDX would correspond to an
+ // LF_UPDATE frame.
+ if (doh_gf_index_map[i] == INVALID_IDX) {
+ // LF_UPDATE frames.
+ // TODO(Remya): Correct start and end parameters passed to
+ // set_params_for_leaf_frames() once encode reordering for gf-interval 14
+      // is enabled for parallel encoding of lower layer frames.
+ set_params_for_leaf_frames(
+ twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+ frame_ind, parallel_frame_count, max_parallel_frames, 1,
+ first_frame_index, cur_disp_index, layer, 0, 0);
+ } else {
+ // In order to obtain the layer depths of INTNL_OVERLAY_UPDATE frames, get
+      // the gf index of the corresponding INTNL_ARF_UPDATE frames.
+ int intnl_arf_index = doh_gf_index_map[i];
+ int ld = gf_group->layer_depth[intnl_arf_index];
+ set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind,
+ first_frame_index, cur_disp_index,
+ ld);
+ }
+ }
+}
+
+// Set parameters for frames between 'start' and 'end' (excluding both).
+static void set_multi_layer_params(
+ const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+ GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc,
+ RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end,
+ int *cur_frame_idx, int *frame_ind, int *parallel_frame_count,
+ int max_parallel_frames, int do_frame_parallel_encode,
+ int *first_frame_index, int *cur_disp_idx, int layer_depth) {
+ const int num_frames_to_process = end - start;
+
+ // Either we are at the last level of the pyramid, or we don't have enough
+  // frames between 'start' and 'end' to create one more level.
+ if (layer_depth > gf_group->max_layer_depth_allowed ||
+ num_frames_to_process < 3) {
+ // Leaf nodes.
+ while (start < end) {
+ gf_group->update_type[*frame_ind] = LF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->display_idx[*frame_ind] = *cur_disp_idx;
+ gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS;
+ gf_group->arf_boost[*frame_ind] =
+ av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start,
+ end - start, 0, NULL, NULL, 0);
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+ gf_group->max_layer_depth =
+ AOMMAX(gf_group->max_layer_depth, layer_depth);
+ // Set the level of parallelism for the LF_UPDATE frame.
+ if (do_frame_parallel_encode) {
+ set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind],
+ parallel_frame_count, max_parallel_frames);
+ // Set LF_UPDATE frames as non-reference frames.
+ gf_group->is_frame_non_ref[*frame_ind] = true;
+ }
+ set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+ ++(*frame_ind);
+ ++(*cur_frame_idx);
+ ++(*cur_disp_idx);
+ ++start;
+ }
+ } else {
+ const int m = (start + end - 1) / 2;
+
+ // Internal ARF.
+ gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = m - start;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->display_idx[*frame_ind] =
+ *cur_disp_idx + gf_group->arf_src_offset[*frame_ind];
+ gf_group->layer_depth[*frame_ind] = layer_depth;
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+
+ if (do_frame_parallel_encode) {
+ // If max_parallel_frames is not exceeded and if the frame will not be
+ // temporally filtered, encode the next internal ARF frame in parallel.
+ if (*parallel_frame_count > 1 &&
+ *parallel_frame_count <= max_parallel_frames) {
+ if (gf_group->arf_src_offset[*frame_ind] < TF_LOOKAHEAD_IDX_THR)
+ gf_group->frame_parallel_level[*frame_ind] = 2;
+ *parallel_frame_count = 1;
+ }
+ }
+ set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+
+ // Get the boost factor for intermediate ARF frames.
+ gf_group->arf_boost[*frame_ind] =
+ av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, m, end - m,
+ m - start, NULL, NULL, 0);
+ ++(*frame_ind);
+
+ // Frames displayed before this internal ARF.
+ set_multi_layer_params(twopass, twopass_frame, gf_group, p_rc, rc,
+ frame_info, start, m, cur_frame_idx, frame_ind,
+ parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index,
+ cur_disp_idx, layer_depth + 1);
+
+ // Overlay for internal ARF.
+ gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+ gf_group->display_idx[*frame_ind] = *cur_disp_idx;
+ gf_group->arf_boost[*frame_ind] = 0;
+ gf_group->layer_depth[*frame_ind] = layer_depth;
+ gf_group->frame_type[*frame_ind] = INTER_FRAME;
+ gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+
+ set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+ ++(*frame_ind);
+ ++(*cur_frame_idx);
+ ++(*cur_disp_idx);
+
+ // Frames displayed after this internal ARF.
+ set_multi_layer_params(twopass, twopass_frame, gf_group, p_rc, rc,
+ frame_info, m + 1, end, cur_frame_idx, frame_ind,
+ parallel_frame_count, max_parallel_frames,
+ do_frame_parallel_encode, first_frame_index,
+ cur_disp_idx, layer_depth + 1);
+ }
+}
+
+static int construct_multi_layer_gf_structure(
+ AV1_COMP *cpi, TWO_PASS *twopass, GF_GROUP *const gf_group,
+ RATE_CONTROL *rc, FRAME_INFO *const frame_info, int baseline_gf_interval,
+ FRAME_UPDATE_TYPE first_frame_update_type) {
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ // TODO(angiebird): Why do we need "-1" here?
+ const int gf_interval = baseline_gf_interval - 1;
+ int frame_index = 0;
+ int cur_frame_index = 0;
+
+ // Set the display order hint for the first frame in the GF_GROUP.
+ int cur_disp_index = (first_frame_update_type == KF_UPDATE)
+ ? 0
+ : cpi->common.current_frame.frame_number;
+
+ // Initialize gf_group->frame_parallel_level, gf_group->is_frame_non_ref,
+ // gf_group->src_offset and gf_group->is_frame_dropped with 0.
+ memset(gf_group->frame_parallel_level, 0,
+ sizeof(gf_group->frame_parallel_level));
+ memset(gf_group->is_frame_non_ref, 0, sizeof(gf_group->is_frame_non_ref));
+ memset(gf_group->src_offset, 0, sizeof(gf_group->src_offset));
+ memset(gf_group->is_frame_dropped, 0, sizeof(gf_group->is_frame_dropped));
+ // Initialize gf_group->skip_frame_refresh and gf_group->skip_frame_as_ref
+ // with INVALID_IDX.
+ memset(gf_group->skip_frame_refresh, INVALID_IDX,
+ sizeof(gf_group->skip_frame_refresh));
+ memset(gf_group->skip_frame_as_ref, INVALID_IDX,
+ sizeof(gf_group->skip_frame_as_ref));
+
+ int kf_decomp = cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1;
+  // This is a patch that fixes https://crbug.com/aomedia/3163.
+  // enable_keyframe_filtering > 1 introduces an extra overlay frame at the
+  // key frame location. However, when
+  // baseline_gf_interval == MAX_STATIC_GF_GROUP_LENGTH, we can't afford the
+  // extra overlay frame; otherwise gf_group->size would become
+  // MAX_STATIC_GF_GROUP_LENGTH + 1, which causes a memory error.
+  // A cheap solution is to turn off kf_decomp here.
+ // TODO(angiebird): Find a systematic way to solve this issue.
+ if (baseline_gf_interval == MAX_STATIC_GF_GROUP_LENGTH) {
+ kf_decomp = 0;
+ }
+ if (first_frame_update_type == KF_UPDATE) {
+ gf_group->update_type[frame_index] = kf_decomp ? ARF_UPDATE : KF_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = 0;
+ gf_group->frame_type[frame_index] = KEY_FRAME;
+ gf_group->refbuf_state[frame_index] = REFBUF_RESET;
+ gf_group->max_layer_depth = 0;
+ gf_group->display_idx[frame_index] = cur_disp_index;
+ if (!kf_decomp) cur_disp_index++;
+ ++frame_index;
+
+ if (kf_decomp) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = 0;
+ gf_group->frame_type[frame_index] = INTER_FRAME;
+ gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = 0;
+ gf_group->display_idx[frame_index] = cur_disp_index;
+ cur_disp_index++;
+ ++frame_index;
+ }
+ cur_frame_index++;
+ }
+
+ if (first_frame_update_type == GF_UPDATE) {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = 0;
+ gf_group->frame_type[frame_index] = INTER_FRAME;
+ gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = 0;
+ gf_group->display_idx[frame_index] = cur_disp_index;
+ cur_disp_index++;
+ ++frame_index;
+ ++cur_frame_index;
+ }
+
+ // ALTREF.
+ const int use_altref = gf_group->max_layer_depth_allowed > 0;
+ int is_fwd_kf = rc->frames_to_fwd_kf == gf_interval;
+
+ if (use_altref) {
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->arf_src_offset[frame_index] = gf_interval - cur_frame_index;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = 1;
+ gf_group->arf_boost[frame_index] = cpi->ppi->p_rc.gfu_boost;
+ gf_group->frame_type[frame_index] = is_fwd_kf ? KEY_FRAME : INTER_FRAME;
+ gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = 1;
+ gf_group->arf_index = frame_index;
+ gf_group->display_idx[frame_index] =
+ cur_disp_index + gf_group->arf_src_offset[frame_index];
+ ++frame_index;
+ } else {
+ gf_group->arf_index = -1;
+ }
+
+ // Flag to indicate if multi-layer configuration is complete.
+ int is_multi_layer_configured = 0;
+
+  // Running count of frames that are part of a given parallel encode set in a
+  // gf_group. A value of 1 indicates no parallel encode.
+ int parallel_frame_count = 1;
+ // Enable parallel encode of frames if gf_group has a multi-layer pyramid
+ // structure with minimum 4 layers.
+ int do_frame_parallel_encode = (cpi->ppi->num_fp_contexts > 1 && use_altref &&
+ gf_group->max_layer_depth_allowed >= 4);
+
+ int first_frame_index = cur_frame_index;
+ if (do_frame_parallel_encode) {
+ // construct_multi_layer_gf_structure() takes the input parameter
+    // 'gf_interval' as p_rc->baseline_gf_interval - 1. The code below computes the
+ // actual GF_GROUP length by compensating for this offset.
+ int actual_gf_length = ((first_frame_update_type == KF_UPDATE) ||
+ (first_frame_update_type == GF_UPDATE))
+ ? gf_interval
+ : gf_interval + 1;
+
+ // In order to facilitate parallel encoding of frames in lower layer depths,
+ // encode reordering is done. Currently encode reordering is enabled only
+ // for gf-intervals 16 and 32. NOTE: Since the buffer holding the
+ // reference frames is of size 8 (ref_frame_map[REF_FRAMES]), there is a
+ // limitation on the number of hidden frames possible at any given point and
+ // hence the reordering is enabled only for gf-intervals 16 and 32.
+ // Disabling encode reordering for gf-interval 14 since some cross-frame
+    // dependencies related to temporal filtering for FPMT are currently not
+ // handled.
+ int disable_gf14_reorder = 1;
+ if (actual_gf_length == 14 && !disable_gf14_reorder) {
+ // This array holds the gf index of INTNL_ARF_UPDATE frames in the slot
+ // corresponding to their display order hint. This is used while
+ // configuring the LF_UPDATE frames and INTNL_OVERLAY_UPDATE frames.
+ int doh_gf_index_map[FIXED_GF_INTERVAL];
+ // Initialize doh_gf_index_map with INVALID_IDX.
+ memset(&doh_gf_index_map[0], INVALID_IDX,
+ (sizeof(doh_gf_index_map[0]) * FIXED_GF_INTERVAL));
+
+ FRAME_REORDER_INFO arf_frame_stats[REF_FRAMES - 1];
+ // Store the stats corresponding to layer 1 frame.
+ fill_arf_frame_stats(arf_frame_stats, 0, actual_gf_length, 1,
+ actual_gf_length);
+ int count_arf_frames = 1;
+
+ // Sets multi-layer params for gf-interval 14 to consecutively encode
+ // frames in the same layer depth, i.e., encode order would be 0-> 14->
+ // 7-> 3-> 10-> 5-> 12-> 1-> 2-> 4-> 6-> 8-> 9-> 11-> 13.
+ // TODO(Remya): Set GF_GROUP param 'arf_boost' for all frames.
+ set_multi_layer_params_for_gf14(
+ twopass, &cpi->twopass_frame, p_rc, frame_info, gf_group,
+ arf_frame_stats, &cur_frame_index, &frame_index, &count_arf_frames,
+ doh_gf_index_map, &parallel_frame_count, &first_frame_index,
+ &cur_disp_index, actual_gf_length, use_altref + 1,
+ cpi->ppi->num_fp_contexts);
+
+ // Set gf_group->skip_frame_refresh.
+ for (int i = 0; i < actual_gf_length; i++) {
+ int count = 0;
+ if (gf_group->update_type[i] == INTNL_ARF_UPDATE) {
+ for (int j = 0; j < i; j++) {
+ // Store the display order hint of the frames which would not
+ // have been displayed at the encode call of frame 'i'.
+ if ((gf_group->display_idx[j] < gf_group->display_idx[i]) &&
+ gf_group->update_type[j] == INTNL_ARF_UPDATE) {
+ gf_group->skip_frame_refresh[i][count++] =
+ gf_group->display_idx[j];
+ }
+ }
+ }
+ }
+ } else {
+ // Set layer depth threshold for reordering as per the gf length.
+ int depth_thr = (actual_gf_length == 16) ? 3
+ : (actual_gf_length == 32) ? 4
+ : INT_MAX;
+
+ set_multi_layer_params_for_fp(
+ twopass, &cpi->twopass_frame, gf_group, p_rc, rc, frame_info,
+ cur_frame_index, gf_interval, &cur_frame_index, &frame_index,
+ &parallel_frame_count, cpi->ppi->num_fp_contexts,
+ do_frame_parallel_encode, &first_frame_index, depth_thr,
+ &cur_disp_index, use_altref + 1);
+ }
+ is_multi_layer_configured = 1;
+ }
+
+ // Rest of the frames.
+ if (!is_multi_layer_configured)
+ set_multi_layer_params(twopass, &cpi->twopass_frame, gf_group, p_rc, rc,
+ frame_info, cur_frame_index, gf_interval,
+ &cur_frame_index, &frame_index,
+ &parallel_frame_count, cpi->ppi->num_fp_contexts,
+ do_frame_parallel_encode, &first_frame_index,
+ &cur_disp_index, use_altref + 1);
+
+ if (use_altref) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS;
+ gf_group->arf_boost[frame_index] = NORMAL_BOOST;
+ gf_group->frame_type[frame_index] = INTER_FRAME;
+ gf_group->refbuf_state[frame_index] =
+ is_fwd_kf ? REFBUF_RESET : REFBUF_UPDATE;
+ gf_group->display_idx[frame_index] = cur_disp_index;
+ ++frame_index;
+ } else {
+ for (; cur_frame_index <= gf_interval; ++cur_frame_index) {
+ gf_group->update_type[frame_index] = LF_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+ gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS;
+ gf_group->arf_boost[frame_index] = NORMAL_BOOST;
+ gf_group->frame_type[frame_index] = INTER_FRAME;
+ gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2);
+ set_src_offset(gf_group, &first_frame_index, cur_frame_index,
+ frame_index);
+ gf_group->display_idx[frame_index] = cur_disp_index;
+ cur_disp_index++;
+ ++frame_index;
+ }
+ }
+ if (do_frame_parallel_encode) {
+ // Iterate through the gf_group and reset frame_parallel_level to 0 in case
+ // a frame is marked as frame_parallel_level 1 with no subsequent
+ // frame_parallel_level 2 frame(s).
+ int level1_frame_idx = INT_MAX;
+ int level2_frame_count = 0;
+ for (int frame_idx = 0; frame_idx < frame_index; frame_idx++) {
+ if (gf_group->frame_parallel_level[frame_idx] == 1) {
+ // Set frame_parallel_level to 0 if only one frame is present in a
+ // parallel encode set.
+ if (level1_frame_idx != INT_MAX && !level2_frame_count)
+ gf_group->frame_parallel_level[level1_frame_idx] = 0;
+ // Book-keep frame_idx of frame_parallel_level 1 frame and reset the
+ // count of frame_parallel_level 2 frames in the corresponding parallel
+ // encode set.
+ level1_frame_idx = frame_idx;
+ level2_frame_count = 0;
+ }
+ if (gf_group->frame_parallel_level[frame_idx] == 2) level2_frame_count++;
+ }
+ // If frame_parallel_level is set to 1 for the last LF_UPDATE
+ // frame in the gf_group, reset it to zero since there are no subsequent
+ // frames in the gf_group.
+ if (gf_group->frame_parallel_level[frame_index - 2] == 1) {
+ assert(gf_group->update_type[frame_index - 2] == LF_UPDATE);
+ gf_group->frame_parallel_level[frame_index - 2] = 0;
+ }
+ }
+
+ for (int gf_idx = frame_index; gf_idx < MAX_STATIC_GF_GROUP_LENGTH;
+ ++gf_idx) {
+ gf_group->update_type[gf_idx] = LF_UPDATE;
+ gf_group->arf_src_offset[gf_idx] = 0;
+ gf_group->cur_frame_idx[gf_idx] = gf_idx;
+ gf_group->layer_depth[gf_idx] = MAX_ARF_LAYERS;
+ gf_group->arf_boost[gf_idx] = NORMAL_BOOST;
+ gf_group->frame_type[gf_idx] = INTER_FRAME;
+ gf_group->refbuf_state[gf_idx] = REFBUF_UPDATE;
+ gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2);
+ }
+
+ return frame_index;
+}
+
+static void set_ld_layer_depth(GF_GROUP *gf_group, int gop_length) {
+ int log_gop_length = 0;
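+  // Compute ceil(log2(gop_length)), i.e. the number of pyramid levels implied
+  // by the GOP length.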
+ while ((1 << log_gop_length) < gop_length) {
+ ++log_gop_length;
+ }
+
+ for (int gf_index = 0; gf_index < gf_group->size; ++gf_index) {
+ int count = 0;
+ // Find the trailing zeros
+ for (; count < MAX_ARF_LAYERS; ++count) {
+ if ((gf_index >> count) & 0x01) break;
+ }
+ gf_group->layer_depth[gf_index] = AOMMAX(log_gop_length - count, 0);
+ }
+ gf_group->max_layer_depth = AOMMIN(log_gop_length, MAX_ARF_LAYERS);
+}
+
+void av1_gop_setup_structure(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FRAME_INFO *const frame_info = &cpi->frame_info;
+ const int key_frame = rc->frames_since_key == 0;
+ FRAME_UPDATE_TYPE first_frame_update_type = ARF_UPDATE;
+
+ if (key_frame) {
+ first_frame_update_type = KF_UPDATE;
+ if (cpi->oxcf.kf_max_pyr_height != -1) {
+ gf_group->max_layer_depth_allowed = AOMMIN(
+ cpi->oxcf.kf_max_pyr_height, gf_group->max_layer_depth_allowed);
+ }
+ } else if (!cpi->ppi->gf_state.arf_gf_boost_lst) {
+ first_frame_update_type = GF_UPDATE;
+ }
+
+ gf_group->size = construct_multi_layer_gf_structure(
+ cpi, twopass, gf_group, rc, frame_info, p_rc->baseline_gf_interval,
+ first_frame_update_type);
+
+ if (gf_group->max_layer_depth_allowed == 0)
+ set_ld_layer_depth(gf_group, p_rc->baseline_gf_interval);
+}
+
+int av1_gop_check_forward_keyframe(const GF_GROUP *gf_group,
+ int gf_frame_index) {
+ return gf_group->frame_type[gf_frame_index] == KEY_FRAME &&
+ gf_group->refbuf_state[gf_frame_index] == REFBUF_UPDATE;
+}
+
+int av1_gop_is_second_arf(const GF_GROUP *gf_group, int gf_frame_index) {
+ const int arf_src_offset = gf_group->arf_src_offset[gf_frame_index];
+  // TODO(angiebird): when gf_group->size == 32, it's possible to
+  // have "two" second arfs. Check if this is acceptable.
+ if (gf_group->update_type[gf_frame_index] == INTNL_ARF_UPDATE &&
+ arf_src_offset >= TF_LOOKAHEAD_IDX_THR) {
+ return 1;
+ }
+ return 0;
+}
diff --git a/third_party/aom/av1/encoder/gop_structure.h b/third_party/aom/av1/encoder/gop_structure.h
new file mode 100644
index 0000000000..ff22f54136
--- /dev/null
+++ b/third_party/aom/av1/encoder/gop_structure.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GOP_STRUCTURE_H_
+#define AOM_AV1_ENCODER_GOP_STRUCTURE_H_
+
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!\cond */
+struct AV1_COMP;
+struct EncodeFrameParams;
+
+#define MIN_ARF_GF_BOOST 240
+#define NORMAL_BOOST 100
+
+/*!\endcond */
+
+/*!\brief Set up the Group-Of-Pictures structure for this GF_GROUP.
+ *
+ *\ingroup rate_control
+ *
+ * This function defines the Group-Of-Pictures structure for this GF_GROUP.
+ * This involves deciding where to place the various FRAME_UPDATE_TYPEs in
+ * the group. It does this primarily by updating entries in
+ * cpi->twopass.gf_group.update_type[].
+ *
+ * \param[in]    cpi          Top-level encoder instance structure
+ *
+ * \remark No return value but this function updates group data structures.
+ */
+void av1_gop_setup_structure(struct AV1_COMP *cpi);
+
+/*!\brief Distributes bits to frames in a group
+ *
+ *\ingroup rate_control
+ *
+ * This function decides on the allocation of bits between the different
+ * frames and types of frame in a GF/ARF group.
+ *
+ * \param[in]    cpi          Top-level encoder instance structure
+ * \param[in] rc Rate control data
+ * \param[in] gf_group GF/ARF group data structure
+ * \param[in] is_key_frame Indicates if the first frame in the group is
+ * also a key frame.
+ * \param[in]    use_arf       Are ARF frames enabled or is this a GF-only
+ * uni-directional group.
+ * \param[in] gf_group_bits Bits available to be allocated.
+ *
+ * \remark No return but updates the rate control and group data structures
+ * to reflect the allocation of bits.
+ */
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+ GF_GROUP *gf_group, int is_key_frame, int use_arf,
+ int64_t gf_group_bits);
+
+/*!\brief Check whether a frame in the GOP is a forward key frame
+ *
+ *\ingroup rate_control
+ *
+ * \param[in] gf_group GF/ARF group data structure
+ * \param[in] gf_frame_index GOP index
+ *
+ * \return Return 1 if it is a forward key frame, otherwise return 0
+ */
+int av1_gop_check_forward_keyframe(const GF_GROUP *gf_group,
+ int gf_frame_index);
+
+/*!\brief Check whether a frame in the GOP is the second arf
+ *
+ *\ingroup rate_control
+ *
+ * \param[in] gf_group GF/ARF group data structure
+ * \param[in] gf_frame_index GOP index
+ *
+ * \return Return 1 if it is the second arf, otherwise return 0
+ */
+int av1_gop_is_second_arf(const GF_GROUP *gf_group, int gf_frame_index);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_GOP_STRUCTURE_H_
diff --git a/third_party/aom/av1/encoder/grain_test_vectors.h b/third_party/aom/av1/encoder/grain_test_vectors.h
new file mode 100644
index 0000000000..945dc37331
--- /dev/null
+++ b/third_party/aom/av1/encoder/grain_test_vectors.h
@@ -0,0 +1,781 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
+#define AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
+
+/* Test vectors for emulation of different film grain types.
+ * Note that bit depth would be derived from the bitstream and
+ * not signaled in film grain metadata. The parameters are valid
+ * for any bit depth.
+ */
+static aom_film_grain_t film_grain_test_vectors[16] = {
+ /* Test 1 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 16, 0 },
+ { 25, 136 },
+ { 33, 144 },
+ { 41, 160 },
+ { 48, 168 },
+ { 56, 136 },
+ { 67, 128 },
+ { 82, 144 },
+ { 97, 152 },
+ { 113, 144 },
+ { 128, 176 },
+ { 143, 168 },
+ { 158, 176 },
+ { 178, 184 } },
+ 14 /* num_points_y */,
+ { { 16, 0 },
+ { 20, 64 },
+ { 28, 88 },
+ { 60, 104 },
+ { 90, 136 },
+ { 105, 160 },
+ { 134, 168 },
+ { 168, 208 } },
+ 8 /* num_cb_points */,
+ { { 16, 0 },
+ { 28, 96 },
+ { 56, 80 },
+ { 66, 96 },
+ { 80, 104 },
+ { 108, 96 },
+ { 122, 112 },
+ { 137, 112 },
+ { 169, 176 } },
+ 9 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 },
+ { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 },
+ { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 },
+ 8 /* ar_coeff_shift */,
+ 247 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 18 /* cb_offset */,
+ 229 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 54 /* cr_offset */,
+ 0 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /* chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 2 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 96 }, { 255, 96 } },
+ 2 /* num_points_y */,
+ { { 0, 64 }, { 255, 64 } },
+ 2 /* num_cb_points */,
+ { { 0, 64 }, { 255, 64 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 3 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 192 }, { 255, 192 } },
+ 2 /* num_points_y */,
+ { { 0, 128 }, { 255, 128 } },
+ 2 /* num_cb_points */,
+ { { 0, 128 }, { 255, 128 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19,
+ -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0,
+ },
+ {
+ 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19,
+ -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 1 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 4 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 16, 0 },
+ { 24, 137 },
+ { 53, 146 },
+ { 63, 155 },
+ { 78, 155 },
+ { 107, 150 },
+ { 122, 147 },
+ { 136, 147 },
+ { 166, 153 },
+ },
+ 9 /* num_points_y */,
+ {
+ { 16, 0 },
+ { 20, 72 },
+ { 27, 82 },
+ { 33, 91 },
+ { 69, 121 },
+ { 95, 143 },
+ { 108, 154 },
+ { 134, 169 },
+ { 147, 177 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 16, 0 },
+ { 24, 95 },
+ { 54, 93 },
+ { 65, 94 },
+ { 79, 98 },
+ { 109, 107 },
+ { 124, 119 },
+ { 139, 136 },
+ { 169, 170 },
+ },
+ 9 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42,
+ 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113,
+ },
+ {
+ -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5,
+ -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0,
+ },
+ {
+ 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2,
+ -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0,
+ },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 5 */
+ {
+ 1 /* apply_grain */,
+ 0 /* update_parameters */,
+ { { 0, 64 }, { 255, 64 } },
+ 2 /* num_points_y */,
+ {
+ { 0, 96 },
+ { 32, 90 },
+ { 64, 83 },
+ { 96, 76 },
+ { 128, 68 },
+ { 159, 59 },
+ { 191, 48 },
+ { 223, 34 },
+ { 255, 0 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 0, 0 },
+ { 32, 34 },
+ { 64, 48 },
+ { 96, 59 },
+ { 128, 68 },
+ { 159, 76 },
+ { 191, 83 },
+ { 223, 90 },
+ { 255, 96 },
+ },
+ 9 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ -2, 2, -5, 7, -6, 4, -2, -1, 1, -2, 0, -2, 2,
+ -3, -5, 13, -13, 6, -14, 8, -1, 18, -36, 58, 0,
+ },
+ {
+ -2, -1, -3, 14, -4, -1, -3, 0, -1, 7, -31, 7, 2,
+ 0, 1, 0, -7, 50, -8, -2, 2, 2, 2, -4, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 1063 /* random_seed */
+ },
+ /* Test 6 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 96 },
+ { 20, 92 },
+ { 39, 88 },
+ { 59, 84 },
+ { 78, 80 },
+ { 98, 75 },
+ { 118, 70 },
+ { 137, 65 },
+ { 157, 60 },
+ { 177, 53 },
+ { 196, 46 },
+ { 216, 38 },
+ { 235, 27 },
+ { 255, 0 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 } },
+ 0 /* num_cb_points */,
+ { { 0, 0 } },
+ 0 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 2754 /* random_seed */
+ },
+ /* Test 7 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 0 },
+ { 20, 27 },
+ { 39, 38 },
+ { 59, 46 },
+ { 78, 53 },
+ { 98, 60 },
+ { 118, 65 },
+ { 137, 70 },
+ { 157, 75 },
+ { 177, 80 },
+ { 196, 84 },
+ { 216, 88 },
+ { 235, 92 },
+ { 255, 96 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 }, { 255, 0 } },
+ 2 /* num_cb_points */,
+ { { 0, 0 }, { 255, 0 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 8 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 96 }, { 255, 96 } },
+ 2 /* num_points_y */,
+ { { 0, 62 }, { 255, 62 } },
+ 2 /* num_cb_points */,
+ { { 0, 62 }, { 255, 62 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6,
+ -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69,
+ },
+ {
+ 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8,
+ -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 9 */
+ {
+ 1 /* apply_grain */,
+ 0 /* update_parameters */,
+ { { 0, 48 }, { 255, 48 } },
+ 2 /* num_points_y */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cb_points */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 10 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 48 }, { 255, 48 } },
+ 2 /* num_points_y */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cb_points */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 },
+ { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 11 */
+ {
+ 1 /* apply_grain */,
+ 0 /* update_parameters */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_points_y */,
+ {
+ { 0, 48 },
+ { 32, 45 },
+ { 64, 42 },
+ { 96, 38 },
+ { 128, 34 },
+ { 159, 29 },
+ { 191, 24 },
+ { 223, 17 },
+ { 255, 0 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 0, 0 },
+ { 32, 17 },
+ { 64, 24 },
+ { 96, 29 },
+ { 128, 34 },
+ { 159, 38 },
+ { 191, 42 },
+ { 223, 45 },
+ { 255, 48 },
+ },
+ 9 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42,
+ 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113,
+ },
+ {
+ -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5,
+ -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0,
+ },
+ {
+ 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2,
+ -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0,
+ },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 1357 /* random_seed */
+ },
+ /* Test 12 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 16, 0 },
+ { 24, 49 },
+ { 39, 69 },
+ { 46, 84 },
+ { 53, 91 },
+ { 63, 100 },
+ { 78, 114 },
+ { 92, 134 },
+ { 164, 139 },
+ },
+ 9 /* num_points_y */,
+ {
+ { 16, 0 },
+ { 20, 31 },
+ { 26, 42 },
+ { 33, 54 },
+ { 40, 65 },
+ { 47, 72 },
+ { 56, 85 },
+ { 84, 123 },
+ { 152, 157 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 16, 0 },
+ { 25, 14 },
+ { 39, 33 },
+ { 47, 40 },
+ { 54, 47 },
+ { 64, 62 },
+ { 79, 76 },
+ { 94, 83 },
+ { 167, 101 },
+ },
+ 9 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 },
+ { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 },
+ { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 0 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 13 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 48 },
+ { 20, 46 },
+ { 39, 44 },
+ { 59, 42 },
+ { 78, 40 },
+ { 98, 38 },
+ { 118, 35 },
+ { 137, 33 },
+ { 157, 30 },
+ { 177, 27 },
+ { 196, 23 },
+ { 216, 19 },
+ { 235, 13 },
+ { 255, 0 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cb_points */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 14 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 0 },
+ { 20, 13 },
+ { 39, 19 },
+ { 59, 23 },
+ { 78, 27 },
+ { 98, 30 },
+ { 118, 33 },
+ { 137, 35 },
+ { 157, 38 },
+ { 177, 40 },
+ { 196, 42 },
+ { 216, 44 },
+ { 235, 46 },
+ { 255, 48 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cb_points */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 15 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 96 }, { 255, 96 } },
+ 1 /* num_points_y */,
+ { { 0, 96 }, { 255, 96 } },
+ 0 /* num_cb_points */,
+ { { 0, 96 }, { 255, 96 } },
+ 0 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 5, -15, -10, -19, 0, -12, 6, 51, 30, -5, -12, 56 },
+ { 2, 2, -24, -5, 1, 1, -18, 37, -2, 0, -15, 39, -70 },
+ { 2, 3, -24, -5, -1, 0, -18, 38, -2, 0, -15, 39, -55 },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 1 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 16 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 16, 0 },
+ { 58, 126 },
+ { 87, 120 },
+ { 97, 122 },
+ { 112, 125 },
+ { 126, 131 },
+ { 141, 139 },
+ { 199, 153 },
+ },
+ 8 /* num_points_y */,
+ {
+ { 16, 0 },
+ { 59, 68 },
+ { 66, 76 },
+ { 73, 82 },
+ { 79, 85 },
+ { 86, 86 },
+ { 151, 95 },
+ { 192, 101 },
+ },
+ 8 /* num_cb_points */,
+ {
+ { 16, 0 },
+ { 59, 64 },
+ { 89, 80 },
+ { 99, 86 },
+ { 114, 90 },
+ { 129, 93 },
+ { 144, 97 },
+ { 203, 85 },
+ },
+ 8 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6,
+ -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69,
+ },
+ {
+ 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8,
+ -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 2 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+};
+#endif // AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
diff --git a/third_party/aom/av1/encoder/hash.c b/third_party/aom/av1/encoder/hash.c
new file mode 100644
index 0000000000..8037b59bef
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/hash.h"
+#include "config/av1_rtcd.h"
+
+static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator,
+ uint8_t *pData, uint32_t dataLength) {
+ for (uint32_t i = 0; i < dataLength; i++) {
+ const uint8_t index = (uint8_t)(
+ (p_crc_calculator->remainder >> (p_crc_calculator->bits - 8)) ^
+ pData[i]);
+ p_crc_calculator->remainder <<= 8;
+ p_crc_calculator->remainder ^= p_crc_calculator->table[index];
+ }
+}
+
+static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
+ p_crc_calculator->remainder = 0;
+}
+
+static uint32_t crc_calculator_get_crc(CRC_CALCULATOR *p_crc_calculator) {
+ return p_crc_calculator->remainder & p_crc_calculator->final_result_mask;
+}
+
+static void crc_calculator_init_table(CRC_CALCULATOR *p_crc_calculator) {
+ const uint32_t high_bit = 1 << (p_crc_calculator->bits - 1);
+ const uint32_t byte_high_bit = 1 << (8 - 1);
+
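+  // For every possible input byte, run the CRC long division bit by bit: feed
+  // the byte into the top of the remainder, then shift and conditionally XOR
+  // in the truncated polynomial.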
+ for (uint32_t value = 0; value < 256; value++) {
+ uint32_t remainder = 0;
+ for (uint8_t mask = byte_high_bit; mask != 0; mask >>= 1) {
+ if (value & mask) {
+ remainder ^= high_bit;
+ }
+
+ if (remainder & high_bit) {
+ remainder <<= 1;
+ remainder ^= p_crc_calculator->trunc_poly;
+ } else {
+ remainder <<= 1;
+ }
+ }
+ p_crc_calculator->table[value] = remainder;
+ }
+}
+
+void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
+ uint32_t truncPoly) {
+ p_crc_calculator->remainder = 0;
+ p_crc_calculator->bits = bits;
+ p_crc_calculator->trunc_poly = truncPoly;
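+  // The mask keeps only the low 'bits' bits of the remainder when the CRC is
+  // read out.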
+ p_crc_calculator->final_result_mask = (1 << bits) - 1;
+ crc_calculator_init_table(p_crc_calculator);
+}
+
+uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
+ int length) {
+ crc_calculator_reset(p_crc_calculator);
+ crc_calculator_process_data(p_crc_calculator, p, length);
+ return crc_calculator_get_crc(p_crc_calculator);
+}
+
+/* CRC-32C (iSCSI) polynomial in reversed bit order. */
+#define POLY 0x82f63b78
+
+/* Construct table for software CRC-32C calculation. */
+void av1_crc32c_calculator_init(CRC32C *p_crc32c) {
+ uint32_t crc;
+
+ for (int n = 0; n < 256; n++) {
+ crc = n;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ p_crc32c->table[0][n] = crc;
+ }
+ for (int n = 0; n < 256; n++) {
+ crc = p_crc32c->table[0][n];
+ for (int k = 1; k < 8; k++) {
+ crc = p_crc32c->table[0][crc & 0xff] ^ (crc >> 8);
+ p_crc32c->table[k][n] = crc;
+ }
+ }
+}
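+
+/* Note: this is the standard "slicing" table construction: table[0][n] is
+   the CRC of the single byte n, and table[k][n] extends that CRC with k zero
+   bytes. This is what lets av1_get_crc32c_value_c() below fold eight input
+   bytes per loop iteration. */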
+
+/* Table-driven software version as a fall-back. This is about 15 times slower
+ than using the hardware instructions. This assumes little-endian integers,
+ as is the case on Intel processors that the assembler code here is for. */
+uint32_t av1_get_crc32c_value_c(void *c, uint8_t *buf, size_t len) {
+ const uint8_t *next = (const uint8_t *)(buf);
+ uint64_t crc;
+ CRC32C *p = (CRC32C *)c;
+ crc = 0 ^ 0xffffffff;
+ while (len && ((uintptr_t)next & 7) != 0) {
+ crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+ len--;
+ }
+ while (len >= 8) {
+ crc ^= *(uint64_t *)next;
+ crc = p->table[7][crc & 0xff] ^ p->table[6][(crc >> 8) & 0xff] ^
+ p->table[5][(crc >> 16) & 0xff] ^ p->table[4][(crc >> 24) & 0xff] ^
+ p->table[3][(crc >> 32) & 0xff] ^ p->table[2][(crc >> 40) & 0xff] ^
+ p->table[1][(crc >> 48) & 0xff] ^ p->table[0][crc >> 56];
+ next += 8;
+ len -= 8;
+ }
+ while (len) {
+ crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+ len--;
+ }
+ return (uint32_t)crc ^ 0xffffffff;
+}
diff --git a/third_party/aom/av1/encoder/hash.h b/third_party/aom/av1/encoder/hash.h
new file mode 100644
index 0000000000..d8e8cc3a0b
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_HASH_H_
+#define AOM_AV1_ENCODER_HASH_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _crc_calculator {
+ uint32_t remainder;
+ uint32_t trunc_poly;
+ uint32_t bits;
+ uint32_t table[256];
+ uint32_t final_result_mask;
+} CRC_CALCULATOR;
+
+// Initialize the crc calculator. It must be executed at least once before
+// calling av1_get_crc_value().
+void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
+ uint32_t truncPoly);
+uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
+ int length);
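+
+// Illustrative usage sketch (the 24-bit polynomial shown matches the one
+// used for intrabc block hashing in hash_motion.c):
+//
+//   CRC_CALCULATOR calc;
+//   av1_crc_calculator_init(&calc, 24, 0x5D6DCB);
+//   uint8_t data[4] = { 0, 1, 2, 3 };
+//   uint32_t crc = av1_get_crc_value(&calc, data, sizeof(data));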
+
+// CRC32C: POLY = 0x82f63b78;
+typedef struct _CRC32C {
+ /* Table for a quadword-at-a-time software crc. */
+ uint32_t table[8][256];
+} CRC32C;
+
+// Initialize the table for the software version of CRC-32C.
+void av1_crc32c_calculator_init(CRC32C *p_crc32c);
+
+#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096)
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_HASH_H_
diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c
new file mode 100644
index 0000000000..8b04e22d6c
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash_motion.c
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdbool.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/hash.h"
+#include "av1/encoder/hash_motion.h"
+
+#define kSrcBits 16
+#define kBlockSizeBits 3
+#define kMaxAddr (1 << (kSrcBits + kBlockSizeBits))
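+
+// Each hash_value1 packs kSrcBits of CRC plus kBlockSizeBits of block-size
+// index, so the lookup table needs 1 << (16 + 3) = 524288 bucket slots.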
+
+// TODO(youzhou@microsoft.com): Is screen content with bit depth higher than
+// 8 bits supported? If so, fix this function.
+static void get_pixels_in_1D_char_array_by_block_2x2(const uint8_t *y_src,
+ int stride,
+ uint8_t *p_pixels_in1D) {
+ const uint8_t *p_pel = y_src;
+ int index = 0;
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < 2; j++) {
+ p_pixels_in1D[index++] = p_pel[j];
+ }
+ p_pel += stride;
+ }
+}
+
+static void get_pixels_in_1D_short_array_by_block_2x2(const uint16_t *y_src,
+ int stride,
+ uint16_t *p_pixels_in1D) {
+ const uint16_t *p_pel = y_src;
+ int index = 0;
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < 2; j++) {
+ p_pixels_in1D[index++] = p_pel[j];
+ }
+ p_pel += stride;
+ }
+}
+
+static int is_block_2x2_row_same_value(const uint8_t *p) {
+ if (p[0] != p[1] || p[2] != p[3]) {
+ return 0;
+ }
+ return 1;
+}
+
+static int is_block16_2x2_row_same_value(const uint16_t *p) {
+ if (p[0] != p[1] || p[2] != p[3]) {
+ return 0;
+ }
+ return 1;
+}
+
+static int is_block_2x2_col_same_value(const uint8_t *p) {
+ if ((p[0] != p[2]) || (p[1] != p[3])) {
+ return 0;
+ }
+ return 1;
+}
+
+static int is_block16_2x2_col_same_value(const uint16_t *p) {
+ if ((p[0] != p[2]) || (p[1] != p[3])) {
+ return 0;
+ }
+ return 1;
+}
+
+// The hash value hash_value1 consists of two parts: the upper 3 bits encode
+// the block size and the lower 16 bits hold the CRC value. This function
+// returns the 3-bit block-size index.
+static int hash_block_size_to_index(int block_size) {
+ switch (block_size) {
+ case 4: return 0;
+ case 8: return 1;
+ case 16: return 2;
+ case 32: return 3;
+ case 64: return 4;
+ case 128: return 5;
+ default: return -1;
+ }
+}
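+
+// For example, a 64x64 block maps to index 4, so its hash_value1 becomes
+// (4 << kSrcBits) + (crc & 0xffff); see how add_value is formed in
+// av1_add_to_hash_map_by_row_with_precal_data() and
+// av1_get_block_hash_value().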
+
+void av1_hash_table_init(IntraBCHashInfo *intrabc_hash_info) {
+ if (!intrabc_hash_info->g_crc_initialized) {
+ av1_crc_calculator_init(&intrabc_hash_info->crc_calculator1, 24, 0x5D6DCB);
+ av1_crc_calculator_init(&intrabc_hash_info->crc_calculator2, 24, 0x864CFB);
+ intrabc_hash_info->g_crc_initialized = 1;
+ }
+ intrabc_hash_info->intrabc_hash_table.p_lookup_table = NULL;
+}
+
+void av1_hash_table_clear_all(hash_table *p_hash_table) {
+ if (p_hash_table->p_lookup_table == NULL) {
+ return;
+ }
+ for (int i = 0; i < kMaxAddr; i++) {
+ if (p_hash_table->p_lookup_table[i] != NULL) {
+ aom_vector_destroy(p_hash_table->p_lookup_table[i]);
+ aom_free(p_hash_table->p_lookup_table[i]);
+ p_hash_table->p_lookup_table[i] = NULL;
+ }
+ }
+}
+
+void av1_hash_table_destroy(hash_table *p_hash_table) {
+ av1_hash_table_clear_all(p_hash_table);
+ aom_free(p_hash_table->p_lookup_table);
+ p_hash_table->p_lookup_table = NULL;
+}
+
+bool av1_hash_table_create(hash_table *p_hash_table) {
+ if (p_hash_table->p_lookup_table != NULL) {
+ av1_hash_table_clear_all(p_hash_table);
+ return true;
+ }
+ p_hash_table->p_lookup_table =
+ (Vector **)aom_calloc(kMaxAddr, sizeof(p_hash_table->p_lookup_table[0]));
+ if (!p_hash_table->p_lookup_table) return false;
+ return true;
+}
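+
+// Typical lifecycle, as an illustrative sketch (in the encoder this is
+// driven by the intrabc search setup):
+//
+//   hash_table table = { NULL };
+//   if (!av1_hash_table_create(&table)) return;  // allocate buckets
+//   // ... av1_add_to_hash_map_by_row_with_precal_data(&table, ...) ...
+//   av1_hash_table_destroy(&table);  // frees buckets and the lookup table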
+
+static bool hash_table_add_to_table(hash_table *p_hash_table,
+ uint32_t hash_value,
+ block_hash *curr_block_hash) {
+ if (p_hash_table->p_lookup_table[hash_value] == NULL) {
+ p_hash_table->p_lookup_table[hash_value] =
+ aom_malloc(sizeof(p_hash_table->p_lookup_table[0][0]));
+ if (p_hash_table->p_lookup_table[hash_value] == NULL) {
+ return false;
+ }
+ if (aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10,
+ sizeof(curr_block_hash[0])) == VECTOR_ERROR)
+ return false;
+ if (aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
+ curr_block_hash) == VECTOR_ERROR)
+ return false;
+ } else {
+ if (aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
+ curr_block_hash) == VECTOR_ERROR)
+ return false;
+ }
+ return true;
+}
+
+int32_t av1_hash_table_count(const hash_table *p_hash_table,
+ uint32_t hash_value) {
+ if (p_hash_table->p_lookup_table[hash_value] == NULL) {
+ return 0;
+ } else {
+ return (int32_t)(p_hash_table->p_lookup_table[hash_value]->size);
+ }
+}
+
+Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
+ uint32_t hash_value) {
+ assert(av1_hash_table_count(p_hash_table, hash_value) > 0);
+ return aom_vector_begin(p_hash_table->p_lookup_table[hash_value]);
+}
+
+int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
+ uint32_t hash_value2) {
+ if (p_hash_table->p_lookup_table[hash_value1] == NULL) {
+ return 0;
+ }
+ Iterator iterator =
+ aom_vector_begin(p_hash_table->p_lookup_table[hash_value1]);
+ Iterator last = aom_vector_end(p_hash_table->p_lookup_table[hash_value1]);
+ for (; !aom_iterator_equals(&iterator, &last);
+ aom_iterator_increment(&iterator)) {
+ if ((*(block_hash *)aom_iterator_get(&iterator)).hash_value2 ==
+ hash_value2) {
+ return 1;
+ }
+ }
+ return 0;
+}
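+
+// Illustrative lookup sketch: iterate all candidate positions that share
+// hash_value1 and filter by hash_value2 (this mirrors the loop above):
+//
+//   if (av1_hash_table_count(table, hash_value1) > 0) {
+//     Iterator it = av1_hash_get_first_iterator(table, hash_value1);
+//     Iterator end = aom_vector_end(table->p_lookup_table[hash_value1]);
+//     for (; !aom_iterator_equals(&it, &end); aom_iterator_increment(&it)) {
+//       const block_hash *bh = (const block_hash *)aom_iterator_get(&it);
+//       if (bh->hash_value2 == hash_value2) {
+//         // Candidate match at (bh->x, bh->y).
+//       }
+//     }
+//   }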
+
+void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intrabc_hash_info,
+ const YV12_BUFFER_CONFIG *picture,
+ uint32_t *pic_block_hash[2],
+ int8_t *pic_block_same_info[3]) {
+ const int width = 2;
+ const int height = 2;
+ const int x_end = picture->y_crop_width - width + 1;
+ const int y_end = picture->y_crop_height - height + 1;
+ CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1;
+ CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2;
+
+ const int length = width * 2;
+ if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t p[4];
+ int pos = 0;
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ get_pixels_in_1D_short_array_by_block_2x2(
+ CONVERT_TO_SHORTPTR(picture->y_buffer) + y_pos * picture->y_stride +
+ x_pos,
+ picture->y_stride, p);
+ pic_block_same_info[0][pos] = is_block16_2x2_row_same_value(p);
+ pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p);
+
+ pic_block_hash[0][pos] =
+ av1_get_crc_value(calc_1, (uint8_t *)p, length * sizeof(p[0]));
+ pic_block_hash[1][pos] =
+ av1_get_crc_value(calc_2, (uint8_t *)p, length * sizeof(p[0]));
+ pos++;
+ }
+ pos += width - 1;
+ }
+ } else {
+ uint8_t p[4];
+ int pos = 0;
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ get_pixels_in_1D_char_array_by_block_2x2(
+ picture->y_buffer + y_pos * picture->y_stride + x_pos,
+ picture->y_stride, p);
+ pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p);
+ pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p);
+
+ pic_block_hash[0][pos] =
+ av1_get_crc_value(calc_1, p, length * sizeof(p[0]));
+ pic_block_hash[1][pos] =
+ av1_get_crc_value(calc_2, p, length * sizeof(p[0]));
+ pos++;
+ }
+ pos += width - 1;
+ }
+ }
+}
+
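+// Builds hashes for block_size x block_size blocks from the half-size hashes
+// computed in a previous pass: the four child hashes at the block's corners
+// are themselves CRC-hashed to form the parent hash, and the row/column
+// "same value" flags are propagated alongside.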
+void av1_generate_block_hash_value(IntraBCHashInfo *intrabc_hash_info,
+ const YV12_BUFFER_CONFIG *picture,
+ int block_size,
+ uint32_t *src_pic_block_hash[2],
+ uint32_t *dst_pic_block_hash[2],
+ int8_t *src_pic_block_same_info[3],
+ int8_t *dst_pic_block_same_info[3]) {
+ CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1;
+ CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2;
+
+ const int pic_width = picture->y_crop_width;
+ const int x_end = picture->y_crop_width - block_size + 1;
+ const int y_end = picture->y_crop_height - block_size + 1;
+
+ const int src_size = block_size >> 1;
+ const int quad_size = block_size >> 2;
+
+ uint32_t p[4];
+ const int length = sizeof(p);
+
+ int pos = 0;
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ p[0] = src_pic_block_hash[0][pos];
+ p[1] = src_pic_block_hash[0][pos + src_size];
+ p[2] = src_pic_block_hash[0][pos + src_size * pic_width];
+ p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size];
+ dst_pic_block_hash[0][pos] =
+ av1_get_crc_value(calc_1, (uint8_t *)p, length);
+
+ p[0] = src_pic_block_hash[1][pos];
+ p[1] = src_pic_block_hash[1][pos + src_size];
+ p[2] = src_pic_block_hash[1][pos + src_size * pic_width];
+ p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size];
+ dst_pic_block_hash[1][pos] =
+ av1_get_crc_value(calc_2, (uint8_t *)p, length);
+
+ dst_pic_block_same_info[0][pos] =
+ src_pic_block_same_info[0][pos] &&
+ src_pic_block_same_info[0][pos + quad_size] &&
+ src_pic_block_same_info[0][pos + src_size] &&
+ src_pic_block_same_info[0][pos + src_size * pic_width] &&
+ src_pic_block_same_info[0][pos + src_size * pic_width + quad_size] &&
+ src_pic_block_same_info[0][pos + src_size * pic_width + src_size];
+
+ dst_pic_block_same_info[1][pos] =
+ src_pic_block_same_info[1][pos] &&
+ src_pic_block_same_info[1][pos + src_size] &&
+ src_pic_block_same_info[1][pos + quad_size * pic_width] &&
+ src_pic_block_same_info[1][pos + quad_size * pic_width + src_size] &&
+ src_pic_block_same_info[1][pos + src_size * pic_width] &&
+ src_pic_block_same_info[1][pos + src_size * pic_width + src_size];
+ pos++;
+ }
+ pos += block_size - 1;
+ }
+
+ if (block_size >= 4) {
+ const int size_minus_1 = block_size - 1;
+ pos = 0;
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ dst_pic_block_same_info[2][pos] =
+ (!dst_pic_block_same_info[0][pos] &&
+ !dst_pic_block_same_info[1][pos]) ||
+ (((x_pos & size_minus_1) == 0) && ((y_pos & size_minus_1) == 0));
+ pos++;
+ }
+ pos += block_size - 1;
+ }
+ }
+}
+
+bool av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
+ uint32_t *pic_hash[2],
+ int8_t *pic_is_same,
+ int pic_width, int pic_height,
+ int block_size) {
+ const int x_end = pic_width - block_size + 1;
+ const int y_end = pic_height - block_size + 1;
+
+ const int8_t *src_is_added = pic_is_same;
+ const uint32_t *src_hash[2] = { pic_hash[0], pic_hash[1] };
+
+ int add_value = hash_block_size_to_index(block_size);
+ assert(add_value >= 0);
+ add_value <<= kSrcBits;
+ const int crc_mask = (1 << kSrcBits) - 1;
+
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ const int pos = y_pos * pic_width + x_pos;
+ // Only add positions flagged as valid.
+ if (src_is_added[pos]) {
+ block_hash curr_block_hash;
+ curr_block_hash.x = x_pos;
+ curr_block_hash.y = y_pos;
+
+ const uint32_t hash_value1 = (src_hash[0][pos] & crc_mask) + add_value;
+ curr_block_hash.hash_value2 = src_hash[1][pos];
+
+ if (!hash_table_add_to_table(p_hash_table, hash_value1,
+ &curr_block_hash)) {
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+}
+
+int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture,
+ int block_size, int x_start, int y_start) {
+ const int stride = picture->y_stride;
+ const uint8_t *p = picture->y_buffer + y_start * stride + x_start;
+
+ if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const uint16_t *p16 = CONVERT_TO_SHORTPTR(p);
+ for (int i = 0; i < block_size; i++) {
+ for (int j = 1; j < block_size; j++) {
+ if (p16[j] != p16[0]) {
+ return 0;
+ }
+ }
+ p16 += stride;
+ }
+ } else {
+ for (int i = 0; i < block_size; i++) {
+ for (int j = 1; j < block_size; j++) {
+ if (p[j] != p[0]) {
+ return 0;
+ }
+ }
+ p += stride;
+ }
+ }
+
+ return 1;
+}
+
+int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
+ int block_size, int x_start, int y_start) {
+ const int stride = picture->y_stride;
+ const uint8_t *p = picture->y_buffer + y_start * stride + x_start;
+
+ if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const uint16_t *p16 = CONVERT_TO_SHORTPTR(p);
+ for (int i = 0; i < block_size; i++) {
+ for (int j = 1; j < block_size; j++) {
+ if (p16[j * stride + i] != p16[i]) {
+ return 0;
+ }
+ }
+ }
+ } else {
+ for (int i = 0; i < block_size; i++) {
+ for (int j = 1; j < block_size; j++) {
+ if (p[j * stride + i] != p[i]) {
+ return 0;
+ }
+ }
+ }
+ }
+ return 1;
+}
+
+void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info,
+ const uint8_t *y_src, int stride, int block_size,
+ uint32_t *hash_value1, uint32_t *hash_value2,
+ int use_highbitdepth) {
+ int add_value = hash_block_size_to_index(block_size);
+ assert(add_value >= 0);
+ add_value <<= kSrcBits;
+ const int crc_mask = (1 << kSrcBits) - 1;
+
+ CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1;
+ CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2;
+ uint32_t **buf_1 = intrabc_hash_info->hash_value_buffer[0];
+ uint32_t **buf_2 = intrabc_hash_info->hash_value_buffer[1];
+
+ // Compute 2x2 subblock hash values within the current block.
+ int sub_block_in_width = (block_size >> 1);
+ if (use_highbitdepth) {
+ uint16_t pixel_to_hash[4];
+ uint16_t *y16_src = CONVERT_TO_SHORTPTR(y_src);
+ for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
+ for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
+ int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
+ get_pixels_in_1D_short_array_by_block_2x2(
+ y16_src + y_pos * stride + x_pos, stride, pixel_to_hash);
+ assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ buf_1[0][pos] = av1_get_crc_value(calc_1, (uint8_t *)pixel_to_hash,
+ sizeof(pixel_to_hash));
+ buf_2[0][pos] = av1_get_crc_value(calc_2, (uint8_t *)pixel_to_hash,
+ sizeof(pixel_to_hash));
+ }
+ }
+ } else {
+ uint8_t pixel_to_hash[4];
+ for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
+ for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
+ int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
+ get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos,
+ stride, pixel_to_hash);
+ assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ buf_1[0][pos] =
+ av1_get_crc_value(calc_1, pixel_to_hash, sizeof(pixel_to_hash));
+ buf_2[0][pos] =
+ av1_get_crc_value(calc_2, pixel_to_hash, sizeof(pixel_to_hash));
+ }
+ }
+ }
+
+ int src_sub_block_in_width = sub_block_in_width;
+ sub_block_in_width >>= 1;
+
+ int src_idx = 1;
+ int dst_idx = 0;
+
+ // Combine subblock hash values bottom-up into the current block's hash,
+ // doubling the subblock width each iteration.
+ uint32_t to_hash[4];
+ for (int sub_width = 4; sub_width <= block_size; sub_width *= 2) {
+ src_idx = 1 - src_idx;
+ dst_idx = 1 - dst_idx;
+
+ int dst_pos = 0;
+ for (int y_pos = 0; y_pos < sub_block_in_width; y_pos++) {
+ for (int x_pos = 0; x_pos < sub_block_in_width; x_pos++) {
+ int srcPos = (y_pos << 1) * src_sub_block_in_width + (x_pos << 1);
+
+ assert(srcPos + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ assert(srcPos + src_sub_block_in_width + 1 <
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ to_hash[0] = buf_1[src_idx][srcPos];
+ to_hash[1] = buf_1[src_idx][srcPos + 1];
+ to_hash[2] = buf_1[src_idx][srcPos + src_sub_block_in_width];
+ to_hash[3] = buf_1[src_idx][srcPos + src_sub_block_in_width + 1];
+
+ buf_1[dst_idx][dst_pos] =
+ av1_get_crc_value(calc_1, (uint8_t *)to_hash, sizeof(to_hash));
+
+ to_hash[0] = buf_2[src_idx][srcPos];
+ to_hash[1] = buf_2[src_idx][srcPos + 1];
+ to_hash[2] = buf_2[src_idx][srcPos + src_sub_block_in_width];
+ to_hash[3] = buf_2[src_idx][srcPos + src_sub_block_in_width + 1];
+ buf_2[dst_idx][dst_pos] =
+ av1_get_crc_value(calc_2, (uint8_t *)to_hash, sizeof(to_hash));
+ dst_pos++;
+ }
+ }
+
+ src_sub_block_in_width = sub_block_in_width;
+ sub_block_in_width >>= 1;
+ }
+
+ *hash_value1 = (buf_1[dst_idx][0] & crc_mask) + add_value;
+ *hash_value2 = buf_2[dst_idx][0];
+}
diff --git a/third_party/aom/av1/encoder/hash_motion.h b/third_party/aom/av1/encoder/hash_motion.h
new file mode 100644
index 0000000000..8974ba27cb
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash_motion.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_HASH_MOTION_H_
+#define AOM_AV1_ENCODER_HASH_MOTION_H_
+
+#include <stdbool.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_scale/yv12config.h"
+#include "av1/encoder/hash.h"
+#include "third_party/vector/vector.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Block size used for force_integer_mv decisions
+#define FORCE_INT_MV_DECISION_BLOCK_SIZE 8
+
+// Stores a block's hash info.
+// x and y are the block's position relative to the top-left of the picture.
+// hash_value2 stores the second hash value.
+typedef struct _block_hash {
+ int16_t x;
+ int16_t y;
+ uint32_t hash_value2;
+} block_hash;
+
+typedef struct _hash_table {
+ Vector **p_lookup_table;
+} hash_table;
+
+struct intrabc_hash_info;
+
+typedef struct intrabc_hash_info {
+ // Buffers for hash value calculation of a block, used only in
+ // av1_get_block_hash_value().
+ // Indexed as [first hash/second hash][two buffers used in ping-pong].
+ uint32_t *hash_value_buffer[2][2];
+ hash_table intrabc_hash_table;
+
+ CRC_CALCULATOR crc_calculator1;
+ CRC_CALCULATOR crc_calculator2;
+ int g_crc_initialized;
+} IntraBCHashInfo;
+
+void av1_hash_table_init(IntraBCHashInfo *intra_bc_hash_info);
+void av1_hash_table_clear_all(hash_table *p_hash_table);
+void av1_hash_table_destroy(hash_table *p_hash_table);
+bool av1_hash_table_create(hash_table *p_hash_table);
+int32_t av1_hash_table_count(const hash_table *p_hash_table,
+ uint32_t hash_value);
+Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
+ uint32_t hash_value);
+int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
+ uint32_t hash_value2);
+void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intra_bc_hash_info,
+ const YV12_BUFFER_CONFIG *picture,
+ uint32_t *pic_block_hash[2],
+ int8_t *pic_block_same_info[3]);
+void av1_generate_block_hash_value(IntraBCHashInfo *intra_bc_hash_info,
+ const YV12_BUFFER_CONFIG *picture,
+ int block_size,
+ uint32_t *src_pic_block_hash[2],
+ uint32_t *dst_pic_block_hash[2],
+ int8_t *src_pic_block_same_info[3],
+ int8_t *dst_pic_block_same_info[3]);
+bool av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
+ uint32_t *pic_hash[2],
+ int8_t *pic_is_same,
+ int pic_width, int pic_height,
+ int block_size);
+
+// Check whether the block starting at (x_start, y_start) with size
+// block_size x block_size has the same color in every row.
+int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture,
+ int block_size, int x_start, int y_start);
+// Check whether the block starting at (x_start, y_start) with size
+// block_size x block_size has the same color in every column.
+int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
+ int block_size, int x_start, int y_start);
+
+void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info,
+ const uint8_t *y_src, int stride, int block_size,
+ uint32_t *hash_value1, uint32_t *hash_value2,
+ int use_highbitdepth);
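+
+// Illustrative call, assuming intrabc_hash_info and its hash_value_buffer
+// arrays have already been set up by the encoder:
+//
+//   uint32_t hash1, hash2;
+//   av1_get_block_hash_value(intrabc_hash_info, src, stride,
+//                            /*block_size=*/8, &hash1, &hash2,
+//                            /*use_highbitdepth=*/0);
+//   if (av1_has_exact_match(&intrabc_hash_info->intrabc_hash_table, hash1,
+//                           hash2)) {
+//     // At least one previously hashed 8x8 block matches exactly.
+//   }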
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_HASH_MOTION_H_
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
new file mode 100644
index 0000000000..a108e8148c
--- /dev/null
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/idct.h"
+#include "av1/common/blockd.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+
+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
+ pixel.
+ Shared for both high and low bit depth.
+ */
+void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ int i;
+ tran_high_t a1, b1, c1, d1, e1;
+ const int16_t *ip_pass0 = input;
+ const tran_low_t *ip = NULL;
+ tran_low_t *op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip_pass0[0 * stride];
+ b1 = ip_pass0[1 * stride];
+ c1 = ip_pass0[2 * stride];
+ d1 = ip_pass0[3 * stride];
+
+ a1 += b1;
+ d1 = d1 - c1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= c1;
+ d1 += b1;
+ op[0] = (tran_low_t)a1;
+ op[1] = (tran_low_t)c1;
+ op[2] = (tran_low_t)d1;
+ op[3] = (tran_low_t)b1;
+
+ ip_pass0++;
+ op += 4;
+ }
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[4 * 0];
+ b1 = ip[4 * 1];
+ c1 = ip[4 * 2];
+ d1 = ip[4 * 3];
+
+ a1 += b1;
+ d1 -= c1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= c1;
+ d1 += b1;
+ op[4 * 0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
+ op[4 * 1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
+ op[4 * 2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
+ op[4 * 3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
+
+ ip++;
+ op++;
+ }
+}
+
+static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ if (txfm_param->lossless) {
+ assert(tx_type == DCT_DCT);
+ av1_fwht4x4(src_diff, coeff, diff_stride);
+ return;
+ }
+ av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd);
+}
+
+static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_4x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_8x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
+}
+
+static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
+}
+
+static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_16x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_32x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void highbd_fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_16x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_4x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_32x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_8x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
+}
+
+static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
+}
+
+static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd);
+}
+
+static void highbd_fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(txfm_param->tx_type == DCT_DCT);
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_32x64(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ bd);
+}
+
+static void highbd_fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(txfm_param->tx_type == DCT_DCT);
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_64x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ bd);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void highbd_fwd_txfm_16x64(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(txfm_param->tx_type == DCT_DCT);
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_16x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
+}
+
+static void highbd_fwd_txfm_64x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(txfm_param->tx_type == DCT_DCT);
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_64x16(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(txfm_param->tx_type == DCT_DCT);
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
+}
+
+void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+ TxfmParam *txfm_param) {
+ if (txfm_param->bd == 8)
+ av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+ else
+ av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+}
+
+void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+}
+
+void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ switch (tx_size) {
+ case TX_64X64:
+ highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_32X64:
+ highbd_fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_64X32:
+ highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param);
+ break;
+
+ case TX_32X32:
+ highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_16X16:
+ highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_8X8:
+ highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_4X8:
+ highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_8X4:
+ highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_8X16:
+ highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_16X8:
+ highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_16X32:
+ highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_32X16:
+ highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_4X4:
+ highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param);
+ break;
+#if !CONFIG_REALTIME_ONLY
+ case TX_4X16:
+ highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_16X4:
+ highbd_fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_8X32:
+ highbd_fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_32X8:
+ highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_16X64:
+ highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_64X16:
+ highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param);
+ break;
+#endif // !CONFIG_REALTIME_ONLY
+ default: assert(0); break;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff,
+ ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ switch (tx_size) {
+ // As the output transform coefficients of the 4x4 Hadamard transform can
+ // be represented using 15 bits (for a 12-bit clip), use the lowbd variant
+ // of hadamard_4x4.
+ case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break;
+ case TX_8X8: aom_highbd_hadamard_8x8(src_diff, src_stride, coeff); break;
+ case TX_16X16:
+ aom_highbd_hadamard_16x16(src_diff, src_stride, coeff);
+ break;
+ case TX_32X32:
+ aom_highbd_hadamard_32x32(src_diff, src_stride, coeff);
+ break;
+ default: assert(0);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE void wht_fwd_txfm(TX_SIZE tx_size, const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ switch (tx_size) {
+ case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break;
+ case TX_8X8: aom_hadamard_8x8(src_diff, src_stride, coeff); break;
+ case TX_16X16: aom_hadamard_16x16(src_diff, src_stride, coeff); break;
+ case TX_32X32: aom_hadamard_32x32(src_diff, src_stride, coeff); break;
+ default: assert(0);
+ }
+}
+
+void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info,
+ const int16_t *src_diff, int src_stride,
+ tran_low_t *coeff) {
+ if (use_hadamard) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (bd_info.use_highbitdepth_buf) {
+ highbd_wht_fwd_txfm(tx_size, src_diff, src_stride, coeff);
+ } else {
+ wht_fwd_txfm(tx_size, src_diff, src_stride, coeff);
+ }
+#else
+ wht_fwd_txfm(tx_size, src_diff, src_stride, coeff);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ } else {
+ TxfmParam txfm_param;
+ txfm_param.tx_type = DCT_DCT;
+ txfm_param.tx_size = tx_size;
+ txfm_param.lossless = 0;
+ txfm_param.bd = bd_info.bit_depth;
+ txfm_param.is_hbd = bd_info.use_highbitdepth_buf;
+ txfm_param.tx_set_type = EXT_TX_SET_ALL16;
+ av1_fwd_txfm(src_diff, coeff, src_stride, &txfm_param);
+ }
+}
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
new file mode 100644
index 0000000000..30f8a2258b
--- /dev/null
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
+#define AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+ TxfmParam *txfm_param);
+
+void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param);
+
+/*!\brief Apply Hadamard or DCT transform
+ *
+ * \callergraph
+ * DCT and Hadamard transforms are commonly used for quick RD score estimation.
+ * The coeff buffer's size should be equal to the number of pixels
+ * corresponding to tx_size.
+ */
+void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info,
+ const int16_t *src_diff, int src_stride, tran_low_t *coeff);
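+
+// Illustrative call for an 8x8 residual block using the Hadamard path;
+// bd_info and src_diff are assumed to be set up by the caller:
+//
+//   DECLARE_ALIGNED(32, tran_low_t, coeff[8 * 8]);
+//   av1_quick_txfm(/*use_hadamard=*/1, TX_8X8, bd_info, src_diff,
+//                  /*src_stride=*/8, coeff);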
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
diff --git a/third_party/aom/av1/encoder/interp_search.c b/third_party/aom/av1/encoder/interp_search.c
new file mode 100644
index 0000000000..27235303c0
--- /dev/null
+++ b/third_party/aom/av1/encoder/interp_search.c
@@ -0,0 +1,801 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/pred_common.h"
+#include "av1/encoder/interp_search.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/reconinter_enc.h"
+
+// Returns the sum of absolute MV component differences between the stored
+// stats entry and the current mode info, or INT_MAX if they do not match.
+static INLINE int is_interp_filter_good_match(
+ const INTERPOLATION_FILTER_STATS *st, MB_MODE_INFO *const mi,
+ int skip_level) {
+ const int is_comp = has_second_ref(mi);
+ int i;
+
+ for (i = 0; i < 1 + is_comp; ++i) {
+ if (st->ref_frames[i] != mi->ref_frame[i]) return INT_MAX;
+ }
+
+ if (skip_level == 1 && is_comp) {
+ if (st->comp_type != mi->interinter_comp.type) return INT_MAX;
+ if (st->compound_idx != mi->compound_idx) return INT_MAX;
+ }
+
+ int mv_diff = 0;
+ for (i = 0; i < 1 + is_comp; ++i) {
+ mv_diff += abs(st->mv[i].as_mv.row - mi->mv[i].as_mv.row) +
+ abs(st->mv[i].as_mv.col - mi->mv[i].as_mv.col);
+ }
+ return mv_diff;
+}
+
+static INLINE int save_interp_filter_search_stat(
+ MB_MODE_INFO *const mbmi, int64_t rd, unsigned int pred_sse,
+ INTERPOLATION_FILTER_STATS *interp_filter_stats,
+ int interp_filter_stats_idx) {
+ if (interp_filter_stats_idx < MAX_INTERP_FILTER_STATS) {
+ INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters,
+ { mbmi->mv[0], mbmi->mv[1] },
+ { mbmi->ref_frame[0],
+ mbmi->ref_frame[1] },
+ mbmi->interinter_comp.type,
+ mbmi->compound_idx,
+ rd,
+ pred_sse };
+ interp_filter_stats[interp_filter_stats_idx] = stat;
+ interp_filter_stats_idx++;
+ }
+ return interp_filter_stats_idx;
+}
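+
+// Illustrative pairing (assumed caller pattern): after a filter search, the
+// winner is cached via save_interp_filter_search_stat(); later blocks with
+// the same reference frames and near-identical MVs can then reuse it through
+// find_interp_filter_in_stats() below instead of re-searching.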
+
+static INLINE int find_interp_filter_in_stats(
+ MB_MODE_INFO *const mbmi, INTERPOLATION_FILTER_STATS *interp_filter_stats,
+ int interp_filter_stats_idx, int skip_level) {
+ // [skip_levels][single or comp]
+ const int thr[2][2] = { { 0, 0 }, { 3, 7 } };
+ const int is_comp = has_second_ref(mbmi);
+
+ // Find good enough match.
+ // TODO(yunqing): Separate single-ref mode and comp mode stats for fast
+ // search.
+ int best = INT_MAX;
+ int match = -1;
+ for (int j = 0; j < interp_filter_stats_idx; ++j) {
+ const INTERPOLATION_FILTER_STATS *st = &interp_filter_stats[j];
+ const int mv_diff = is_interp_filter_good_match(st, mbmi, skip_level);
+ // Exact match is found.
+ if (mv_diff == 0) {
+ match = j;
+ break;
+ } else if (mv_diff < best && mv_diff <= thr[skip_level - 1][is_comp]) {
+ best = mv_diff;
+ match = j;
+ }
+ }
+
+ if (match != -1) {
+ mbmi->interp_filters = interp_filter_stats[match].filters;
+ return match;
+ }
+ return -1; // no match result found
+}
+
+int av1_find_interp_filter_match(
+ MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi,
+ const InterpFilter assign_filter, const int need_search,
+ INTERPOLATION_FILTER_STATS *interp_filter_stats,
+ int interp_filter_stats_idx) {
+ int match_found_idx = -1;
+ if (cpi->sf.interp_sf.use_interp_filter && need_search)
+ match_found_idx = find_interp_filter_in_stats(
+ mbmi, interp_filter_stats, interp_filter_stats_idx,
+ cpi->sf.interp_sf.use_interp_filter);
+
+ if (!need_search || match_found_idx == -1)
+ set_default_interp_filters(mbmi, assign_filter);
+ return match_found_idx;
+}
+
+static INLINE int get_switchable_rate(MACROBLOCK *const x,
+ const int_interpfilters filters,
+ const int ctx[2], int dual_filter) {
+ const InterpFilter filter0 = filters.as_filters.y_filter;
+ int inter_filter_cost =
+ x->mode_costs.switchable_interp_costs[ctx[0]][filter0];
+ if (dual_filter) {
+ const InterpFilter filter1 = filters.as_filters.x_filter;
+ inter_filter_cost += x->mode_costs.switchable_interp_costs[ctx[1]][filter1];
+ }
+ return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
+}
+
+// Build the inter predictor and calculate the model rd
+// for the given range of planes.
+static INLINE void interp_model_rd_eval(
+ MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int plane_from, int plane_to,
+ RD_STATS *rd_stats, int is_skip_build_pred) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_STATS tmp_rd_stats;
+ av1_init_rd_stats(&tmp_rd_stats);
+
+ // Skip inter predictor if the predictor is already available.
+ if (!is_skip_build_pred) {
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ plane_from, plane_to);
+ }
+
+ model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model
+ ? MODELRD_LEGACY
+ : MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, plane_from, plane_to, &tmp_rd_stats.rate,
+ &tmp_rd_stats.dist, &tmp_rd_stats.skip_txfm, &tmp_rd_stats.sse, NULL,
+ NULL, NULL);
+
+ av1_merge_rd_stats(rd_stats, &tmp_rd_stats);
+}
+
+// Calculate the rdcost of the given interpolation filter.
+static INLINE int64_t interpolation_filter_rd(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t *const rd,
+ RD_STATS *rd_stats_luma, RD_STATS *rd_stats, int *const switchable_rate,
+ const BUFFER_SET *dst_bufs[2], int filter_idx, const int switchable_ctx[2],
+ const int skip_pred) {
+ const AV1_COMMON *cm = &cpi->common;
+ const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ RD_STATS this_rd_stats_luma, this_rd_stats;
+
+ // Initialize rd_stats structures to default values.
+ av1_init_rd_stats(&this_rd_stats_luma);
+ this_rd_stats = *rd_stats_luma;
+ const int_interpfilters last_best = mbmi->interp_filters;
+ mbmi->interp_filters = filter_sets[filter_idx];
+ const int tmp_rs =
+ get_switchable_rate(x, mbmi->interp_filters, switchable_ctx,
+ cm->seq_params->enable_dual_filter);
+
+ int64_t min_rd = RDCOST(x->rdmult, tmp_rs, 0);
+ if (min_rd > *rd) {
+ mbmi->interp_filters = last_best;
+ return 0;
+ }
+
+ (void)tile_data;
+
+ assert(skip_pred != 2);
+ assert((rd_stats_luma->rate >= 0) && (rd_stats->rate >= 0));
+ assert((rd_stats_luma->dist >= 0) && (rd_stats->dist >= 0));
+ assert((rd_stats_luma->sse >= 0) && (rd_stats->sse >= 0));
+ assert((rd_stats_luma->skip_txfm == 0) || (rd_stats_luma->skip_txfm == 1));
+ assert((rd_stats->skip_txfm == 0) || (rd_stats->skip_txfm == 1));
+ assert((skip_pred >= 0) &&
+ (skip_pred <= interp_search_flags->default_interp_skip_flags));
+
+ // When skip_pred is equal to default_interp_skip_flags,
+ // skip both luma and chroma MC.
+ // For monochrome images:
+ // num_planes = 1 and cpi->default_interp_skip_flags = 1,
+ // skip_pred = 1: skip both luma and chroma
+ // skip_pred = 0: evaluate luma, and since num_planes = 1,
+ // skip chroma evaluation
+ int tmp_skip_pred =
+ (skip_pred == interp_search_flags->default_interp_skip_flags)
+ ? INTERP_SKIP_LUMA_SKIP_CHROMA
+ : skip_pred;
+
+ switch (tmp_skip_pred) {
+ case INTERP_EVAL_LUMA_EVAL_CHROMA:
+ // skip_pred = 0: Evaluate both luma and chroma.
+ // Luma MC
+ interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y,
+ &this_rd_stats_luma, 0);
+ this_rd_stats = this_rd_stats_luma;
+#if CONFIG_COLLECT_RD_STATS == 3
+ RD_STATS rd_stats_y;
+ av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
+ INT64_MAX);
+ PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 3
+ AOM_FALLTHROUGH_INTENDED;
+ case INTERP_SKIP_LUMA_EVAL_CHROMA:
+ // skip_pred = 1: skip luma evaluation (retain previous best luma stats)
+ // and do chroma evaluation.
+ for (int plane = 1; plane < num_planes; ++plane) {
+ int64_t tmp_rd =
+ RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist);
+ if (tmp_rd >= *rd) {
+ mbmi->interp_filters = last_best;
+ return 0;
+ }
+ interp_model_rd_eval(x, cpi, bsize, orig_dst, plane, plane,
+ &this_rd_stats, 0);
+ }
+ break;
+ case INTERP_SKIP_LUMA_SKIP_CHROMA:
+ // Both luma and chroma evaluations are skipped.
+ this_rd_stats = *rd_stats;
+ break;
+ case INTERP_EVAL_INVALID:
+ default: assert(0); return 0;
+ }
+ int64_t tmp_rd =
+ RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist);
+
+ if (tmp_rd < *rd) {
+ *rd = tmp_rd;
+ *switchable_rate = tmp_rs;
+ if (skip_pred != interp_search_flags->default_interp_skip_flags) {
+ if (skip_pred == INTERP_EVAL_LUMA_EVAL_CHROMA) {
+ // Overwrite the data as current filter is the best one
+ *rd_stats_luma = this_rd_stats_luma;
+ *rd_stats = this_rd_stats;
+ // As luma MC data is computed, no need to recompute after the search
+ x->recalc_luma_mc_data = 0;
+ } else if (skip_pred == INTERP_SKIP_LUMA_EVAL_CHROMA) {
+ // As luma MC data is not computed, update of luma data can be skipped
+ *rd_stats = this_rd_stats;
+ // As luma MC data is not recomputed and current filter is the best,
+ // indicate the possibility of recomputing MC data
+ // If current buffer contains valid MC data, toggle to indicate that
+ // luma MC data needs to be recomputed
+ x->recalc_luma_mc_data ^= 1;
+ }
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ }
+ return 1;
+ }
+ mbmi->interp_filters = last_best;
+ return 0;
+}
+
+static INLINE INTERP_PRED_TYPE is_pred_filter_search_allowed(
+ const AV1_COMP *const cpi, MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int_interpfilters *af, int_interpfilters *lf) {
+ const AV1_COMMON *cm = &cpi->common;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int bsl = mi_size_wide_log2[bsize];
+ int is_horiz_eq = 0, is_vert_eq = 0;
+
+ if (above_mbmi && is_inter_block(above_mbmi))
+ *af = above_mbmi->interp_filters;
+
+ if (left_mbmi && is_inter_block(left_mbmi)) *lf = left_mbmi->interp_filters;
+
+ if (af->as_filters.x_filter != INTERP_INVALID)
+ is_horiz_eq = af->as_filters.x_filter == lf->as_filters.x_filter;
+ if (af->as_filters.y_filter != INTERP_INVALID)
+ is_vert_eq = af->as_filters.y_filter == lf->as_filters.y_filter;
+
+ INTERP_PRED_TYPE pred_filter_type = (is_vert_eq << 1) + is_horiz_eq;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ int pred_filter_enable =
+ cpi->sf.interp_sf.cb_pred_filter_search
+ ? (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_frame.frame_number)) &
+ 0x1
+ : 0;
+ pred_filter_enable &= is_horiz_eq || is_vert_eq;
+ // pred_filter_search = 0: pred_filter is disabled
+ // pred_filter_search = 1: pred_filter is enabled and only horz pred matching
+ // pred_filter_search = 2: pred_filter is enabled and only vert pred matching
+ // pred_filter_search = 3: pred_filter is enabled and
+ // both vert, horz pred matching
+ return pred_filter_enable * pred_filter_type;
+}
+
+static DUAL_FILTER_TYPE find_best_interp_rd_facade(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats, int *const switchable_rate,
+ const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+ const int skip_pred, uint16_t allow_interp_mask, int is_w4_or_h4) {
+ int tmp_skip_pred = skip_pred;
+ DUAL_FILTER_TYPE best_filt_type = REG_REG;
+
+ // If no filters are set to be evaluated, return early.
+ if (allow_interp_mask == 0x0) return best_filt_type;
+ // When the block width or height is 4, skip the pred evaluation of
+ // SHARP_SHARP.
+ tmp_skip_pred = is_w4_or_h4
+ ? cpi->interp_search_flags.default_interp_skip_flags
+ : skip_pred;
+
+ // Loop over all filter types and evaluate only the allowed ones.
+ for (int filt_type = SHARP_SHARP; filt_type >= REG_REG; --filt_type) {
+ const int is_filter_allowed =
+ get_interp_filter_allowed_mask(allow_interp_mask, filt_type);
+ if (is_filter_allowed)
+ if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate,
+ dst_bufs, filt_type, switchable_ctx,
+ tmp_skip_pred))
+ best_filt_type = filt_type;
+ tmp_skip_pred = skip_pred;
+ }
+ return best_filt_type;
+}
+
+static INLINE void pred_dual_interp_filter_rd(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats, int *const switchable_rate,
+ const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+ const int skip_pred, INTERP_PRED_TYPE pred_filt_type, int_interpfilters *af,
+ int_interpfilters *lf) {
+ (void)lf;
+ assert(pred_filt_type > INTERP_HORZ_NEQ_VERT_NEQ);
+ assert(pred_filt_type < INTERP_PRED_TYPE_ALL);
+ uint16_t allowed_interp_mask = 0;
+
+ if (pred_filt_type == INTERP_HORZ_EQ_VERT_NEQ) {
+ // pred_filter_search = 1: Only horizontal filter is matching
+ allowed_interp_mask =
+ av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.x_filter];
+ } else if (pred_filt_type == INTERP_HORZ_NEQ_VERT_EQ) {
+ // pred_filter_search = 2: Only vertical filter is matching
+ allowed_interp_mask =
+ av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.y_filter];
+ } else {
+ // pred_filter_search = 3: Both horizontal and vertical filter are matching
+ int filt_type =
+ af->as_filters.x_filter + af->as_filters.y_filter * SWITCHABLE_FILTERS;
+ set_interp_filter_allowed_mask(&allowed_interp_mask, filt_type);
+ }
+ // REG_REG has already been evaluated at the beginning.
+ reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG);
+ find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd, rd_stats_y,
+ rd_stats, switchable_rate, dst_bufs,
+ switchable_ctx, skip_pred, allowed_interp_mask, 0);
+}
+
+// Evaluate dual filter types:
+// a) using the above and left blocks' interp filters, or
+// b) finding the best horizontal filter and then evaluating the
+// corresponding vertical filters.
+static INLINE void fast_dual_interp_filter_rd(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats, int *const switchable_rate,
+ const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+ const int skip_hor, const int skip_ver) {
+ const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ;
+ int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID);
+ int_interpfilters lf = af;
+
+ if (!have_newmv_in_inter_mode(mbmi->mode)) {
+ pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf);
+ }
+
+ if (pred_filter_type) {
+ pred_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+ switchable_ctx, (skip_hor & skip_ver),
+ pred_filter_type, &af, &lf);
+ } else {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ int best_dual_mode = 0;
+ int skip_pred =
+ bw <= 4 ? interp_search_flags->default_interp_skip_flags : skip_hor;
+ // TODO(any): Make use of find_best_interp_rd_facade()
+ // if speed impact is negligible
+ for (int i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {
+ if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate,
+ dst_bufs, i, switchable_ctx, skip_pred)) {
+ best_dual_mode = i;
+ }
+ skip_pred = skip_hor;
+ }
+ // Starting from the best horizontal filter, evaluate the corresponding
+ // vertical filter modes.
+ skip_pred =
+ bh <= 4 ? interp_search_flags->default_interp_skip_flags : skip_ver;
+ for (int i = (best_dual_mode + (SWITCHABLE_FILTERS * 2));
+ i >= (best_dual_mode + SWITCHABLE_FILTERS); i -= SWITCHABLE_FILTERS) {
+ interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+ i, switchable_ctx, skip_pred);
+ skip_pred = skip_ver;
+ }
+ }
+}
+
+// Find the best interp filter if dual_interp_filter = 0
+static INLINE void find_best_non_dual_interp_filter(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats, int *const switchable_rate,
+ const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+ const int skip_ver, const int skip_hor) {
+ const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+ int8_t i;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ uint16_t interp_filter_search_mask =
+ interp_search_flags->interp_filter_search_mask;
+
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ const int ctx0 = av1_get_pred_context_switchable_interp(xd, 0);
+ const int ctx1 = av1_get_pred_context_switchable_interp(xd, 1);
+ int use_actual_frame_probs = 1;
+ const int *switchable_interp_p0;
+ const int *switchable_interp_p1;
+#if CONFIG_FPMT_TEST
+ use_actual_frame_probs =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+ if (!use_actual_frame_probs) {
+ switchable_interp_p0 = (int *)cpi->ppi->temp_frame_probs
+ .switchable_interp_probs[update_type][ctx0];
+ switchable_interp_p1 = (int *)cpi->ppi->temp_frame_probs
+ .switchable_interp_probs[update_type][ctx1];
+ }
+#endif // CONFIG_FPMT_TEST
+ if (use_actual_frame_probs) {
+ switchable_interp_p0 =
+ cpi->ppi->frame_probs.switchable_interp_probs[update_type][ctx0];
+ switchable_interp_p1 =
+ cpi->ppi->frame_probs.switchable_interp_probs[update_type][ctx1];
+ }
+ static const int thr[7] = { 0, 8, 8, 8, 8, 0, 8 };
+ const int thresh = thr[update_type];
+ for (i = 0; i < SWITCHABLE_FILTERS; i++) {
+ // For non-dual case, the 2 dir's prob should be identical.
+ assert(switchable_interp_p0[i] == switchable_interp_p1[i]);
+ if (switchable_interp_p0[i] < thresh &&
+ switchable_interp_p1[i] < thresh) {
+ DUAL_FILTER_TYPE filt_type = i + SWITCHABLE_FILTERS * i;
+ reset_interp_filter_allowed_mask(&interp_filter_search_mask, filt_type);
+ }
+ }
+ }
+
+ // Regular filter evaluation should already have been done, and hence it
+ // should be the current winner.
+ assert(x->e_mbd.mi[0]->interp_filters.as_int == filter_sets[0].as_int);
+ if ((skip_hor & skip_ver) != interp_search_flags->default_interp_skip_flags) {
+ INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ;
+ int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID);
+ int_interpfilters lf = af;
+
+ pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf);
+ if (pred_filter_type) {
+ assert(af.as_filters.x_filter != INTERP_INVALID);
+ int filter_idx = SWITCHABLE * af.as_filters.x_filter;
+ // This assert tells that (filter_x == filter_y) for non-dual filter case
+ assert(filter_sets[filter_idx].as_filters.x_filter ==
+ filter_sets[filter_idx].as_filters.y_filter);
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search &&
+ !(get_interp_filter_allowed_mask(interp_filter_search_mask,
+ filter_idx))) {
+ return;
+ }
+ if (filter_idx) {
+ interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+ filter_idx, switchable_ctx,
+ (skip_hor & skip_ver));
+ }
+ return;
+ }
+ }
+  // Reuse the regular filter's modeled rd data for the sharp filter in the
+  // following cases:
+  // 1) When bsize is 4x4
+  // 2) When block width is 4 (i.e. 4x8/4x16 blocks) and the MV in the
+  // vertical direction is full-pel
+  // 3) When block height is 4 (i.e. 8x4/16x4 blocks) and the MV in the
+  // horizontal direction is full-pel
+  // TODO(any): Optimize cases 2 and 3 further if the luma MV in the relevant
+  // direction alone is full-pel
+
+ if ((bsize == BLOCK_4X4) ||
+ (block_size_wide[bsize] == 4 &&
+ skip_ver == interp_search_flags->default_interp_skip_flags) ||
+ (block_size_high[bsize] == 4 &&
+ skip_hor == interp_search_flags->default_interp_skip_flags)) {
+ int skip_pred = skip_hor & skip_ver;
+ uint16_t allowed_interp_mask = 0;
+
+ // REG_REG filter type is evaluated beforehand, hence skip it
+ set_interp_filter_allowed_mask(&allowed_interp_mask, SHARP_SHARP);
+ set_interp_filter_allowed_mask(&allowed_interp_mask, SMOOTH_SMOOTH);
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search)
+ allowed_interp_mask &= interp_filter_search_mask;
+
+ find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+ switchable_ctx, skip_pred, allowed_interp_mask,
+ 1);
+ } else {
+ int skip_pred = (skip_hor & skip_ver);
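+    // Illustrative note (assuming SWITCHABLE_FILTERS == 3): this loop visits
+    // the diagonal entries i == 4 (SMOOTH_SMOOTH) and i == 8 (SHARP_SHARP);
+    // REG_REG (i == 0) was already evaluated beforehand.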
+ for (i = (SWITCHABLE_FILTERS + 1); i < DUAL_FILTER_SET_SIZE;
+ i += (SWITCHABLE_FILTERS + 1)) {
+      // This assert ensures that filter_x == filter_y in the non-dual filter
+      // case.
+ assert(filter_sets[i].as_filters.x_filter ==
+ filter_sets[i].as_filters.y_filter);
+ if (cpi->sf.interp_sf.adaptive_interp_filter_search &&
+ !(get_interp_filter_allowed_mask(interp_filter_search_mask, i))) {
+ continue;
+ }
+ interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+ i, switchable_ctx, skip_pred);
+      // In the first iteration, the smooth filter is evaluated. If the smooth
+      // filter (which is less sharp) wins among the regular and smooth
+      // filters, the sharp filter evaluation is skipped.
+ // TODO(any): Refine this gating based on modelled rd only (i.e., by not
+ // accounting switchable filter rate)
+ if (cpi->sf.interp_sf.skip_sharp_interp_filter_search &&
+ skip_pred != interp_search_flags->default_interp_skip_flags) {
+ if (mbmi->interp_filters.as_int == filter_sets[SMOOTH_SMOOTH].as_int)
+ break;
+ }
+ }
+ }
+}
+
+static INLINE void calc_interp_skip_pred_flag(MACROBLOCK *const x,
+ const AV1_COMP *const cpi,
+ int *skip_hor, int *skip_ver) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int num_planes = av1_num_planes(cm);
+ const int is_compound = has_second_ref(mbmi);
+ assert(is_intrabc_block(mbmi) == 0);
+ for (int ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, mbmi->ref_frame[ref]);
+ // TODO(any): Refine skip flag calculation considering scaling
+ if (av1_is_scaled(sf)) {
+ *skip_hor = 0;
+ *skip_ver = 0;
+ break;
+ }
+ const MV mv = mbmi->mv[ref].as_mv;
+ int skip_hor_plane = 0;
+ int skip_ver_plane = 0;
+ for (int plane_idx = 0; plane_idx < AOMMAX(1, (num_planes - 1));
+ ++plane_idx) {
+ struct macroblockd_plane *const pd = &xd->plane[plane_idx];
+ const int bw = pd->width;
+ const int bh = pd->height;
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ skip_hor_plane |= ((sub_x == 0) << plane_idx);
+ skip_ver_plane |= ((sub_y == 0) << plane_idx);
+ }
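+    // Illustrative example (assuming SUBPEL_MASK == 15, i.e. q4 MV units): an
+    // MV column of 16 is a full-pel position, so sub_x == 0 and the
+    // horizontal skip bit is set for that plane, whereas an MV column of 18
+    // is sub-pel and leaves the bit clear.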
+ *skip_hor &= skip_hor_plane;
+ *skip_ver &= skip_ver_plane;
+ // It is not valid that "luma MV is sub-pel, whereas chroma MV is not"
+ assert(*skip_hor != 2);
+ assert(*skip_ver != 2);
+ }
+  // When the compound prediction type is compound segment wedge, luma MC and
+  // chroma MC need to go hand in hand, as the mask generated during luma MC
+  // is required for chroma MC. If skip_hor = 0 and skip_ver = 1, the mask
+  // used for chroma MC during the vertical filter decision may be incorrect,
+  // as the temporary MC evaluation overwrites the mask. Set skip_ver to 0 in
+  // this case so that the mask is populated during luma MC.
+ if (is_compound && mbmi->compound_idx == 1 &&
+ mbmi->interinter_comp.type == COMPOUND_DIFFWTD) {
+ assert(mbmi->comp_group_idx == 1);
+ if (*skip_hor == 0 && *skip_ver == 1) *skip_ver = 0;
+ }
+}
+
+/*!\brief AV1 interpolation filter search
+ *
+ * \ingroup inter_mode_search
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during
+ * encoding.
+ * \param[in]     x              Pointer to struct holding all the data for
+ * the current macroblock.
+ * \param[in] bsize Current block size.
+ * \param[in] tmp_dst A temporary prediction buffer to hold a
+ * computed prediction.
+ * \param[in,out] orig_dst A prediction buffer to hold a computed
+ * prediction. This will eventually hold the
+ * final prediction, and the tmp_dst info will
+ * be copied here.
+ * \param[in,out] rd The RD cost associated with the selected
+ * interpolation filter parameters.
+ * \param[in,out] switchable_rate The rate associated with using a SWITCHABLE
+ * filter mode.
+ * \param[in,out] skip_build_pred Indicates whether or not to build the inter
+ * predictor. If this is 0, the inter predictor
+ * has already been built and thus we can avoid
+ * repeating computation.
+ * \param[in] args HandleInterModeArgs struct holding
+ * miscellaneous arguments for inter mode
+ * search. See the documentation for this
+ * struct for a description of each member.
+ * \param[in] ref_best_rd Best RD found so far for this block.
+ * It is used for early termination of this
+ * search if the RD exceeds this value.
+ *
+ * \return Returns INT64_MAX if the filter parameters are invalid and the
+ * current motion mode being tested should be skipped. It returns 0 if the
+ * parameter search is a success.
+ */
+int64_t av1_interpolation_filter_search(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst,
+ int64_t *const rd, int *const switchable_rate, int *skip_build_pred,
+ HandleInterModeArgs *args, int64_t ref_best_rd) {
+ const AV1_COMMON *cm = &cpi->common;
+ const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int need_search = av1_is_interp_needed(xd);
+ const int ref_frame = xd->mi[0]->ref_frame[0];
+ RD_STATS rd_stats_luma, rd_stats;
+
+ // Initialization of rd_stats structures with default values
+ av1_init_rd_stats(&rd_stats_luma);
+ av1_init_rd_stats(&rd_stats);
+
+ int match_found_idx = -1;
+ const InterpFilter assign_filter = cm->features.interp_filter;
+
+ match_found_idx = av1_find_interp_filter_match(
+ mbmi, cpi, assign_filter, need_search, args->interp_filter_stats,
+ args->interp_filter_stats_idx);
+
+ if (match_found_idx != -1) {
+ *rd = args->interp_filter_stats[match_found_idx].rd;
+ x->pred_sse[ref_frame] =
+ args->interp_filter_stats[match_found_idx].pred_sse;
+ *skip_build_pred = 0;
+ return 0;
+ }
+
+ int switchable_ctx[2];
+ switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0);
+ switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
+ *switchable_rate =
+ get_switchable_rate(x, mbmi->interp_filters, switchable_ctx,
+ cm->seq_params->enable_dual_filter);
+
+ // Do MC evaluation for default filter_type.
+ // Luma MC
+ interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y,
+ &rd_stats_luma, *skip_build_pred);
+
+#if CONFIG_COLLECT_RD_STATS == 3
+ RD_STATS rd_stats_y;
+ av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 3
+ // Chroma MC
+ if (num_planes > 1) {
+ interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_U, AOM_PLANE_V,
+ &rd_stats, *skip_build_pred);
+ }
+ *skip_build_pred = 1;
+
+ av1_merge_rd_stats(&rd_stats, &rd_stats_luma);
+
+ assert(rd_stats.rate >= 0);
+
+ *rd = RDCOST(x->rdmult, *switchable_rate + rd_stats.rate, rd_stats.dist);
+ x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4);
+
+ if (assign_filter != SWITCHABLE || match_found_idx != -1) {
+ return 0;
+ }
+ if (!need_search) {
+ int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ assert(mbmi->interp_filters.as_int == filters.as_int);
+ (void)filters;
+ return 0;
+ }
+ if (args->modelled_rd != NULL) {
+ if (has_second_ref(mbmi)) {
+ const int ref_mv_idx = mbmi->ref_mv_idx;
+ MV_REFERENCE_FRAME *refs = mbmi->ref_frame;
+ const int mode0 = compound_ref0_mode(mbmi->mode);
+ const int mode1 = compound_ref1_mode(mbmi->mode);
+ const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+ args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+ if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) {
+ return INT64_MAX;
+ }
+ }
+ }
+
+ x->recalc_luma_mc_data = 0;
+  // skip_flag=xx (in binary form)
+  // Setting the 0th bit corresponds to skipping luma MC and setting the 1st
+  // bit corresponds to skipping chroma MC.
+  // skip_flag=0 corresponds to "Don't skip luma and chroma MC"
+  // skip_flag=1 corresponds to "Skip luma MC only"
+  // skip_flag=2 is not a valid case
+  // skip_flag=3 corresponds to "Skip both luma and chroma MC"
+ int skip_hor = interp_search_flags->default_interp_skip_flags;
+ int skip_ver = interp_search_flags->default_interp_skip_flags;
+ calc_interp_skip_pred_flag(x, cpi, &skip_hor, &skip_ver);
+
+ // do interp_filter search
+ restore_dst_buf(xd, *tmp_dst, num_planes);
+ const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst };
+ // Evaluate dual interp filters
+ if (cm->seq_params->enable_dual_filter) {
+ if (cpi->sf.interp_sf.use_fast_interpolation_filter_search) {
+ fast_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+ &rd_stats_luma, &rd_stats, switchable_rate,
+ dst_bufs, switchable_ctx, skip_hor, skip_ver);
+ } else {
+ // Use full interpolation filter search
+ uint16_t allowed_interp_mask = ALLOW_ALL_INTERP_FILT_MASK;
+      // REG_REG filter type is evaluated beforehand, so the loop runs over
+      // REG_SMOOTH to SHARP_SHARP for the full interpolation filter search
+ reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG);
+ find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd,
+ &rd_stats_luma, &rd_stats, switchable_rate,
+ dst_bufs, switchable_ctx,
+ (skip_hor & skip_ver), allowed_interp_mask, 0);
+ }
+ } else {
+ // Evaluate non-dual interp filters
+ find_best_non_dual_interp_filter(
+ x, cpi, tile_data, bsize, orig_dst, rd, &rd_stats_luma, &rd_stats,
+ switchable_rate, dst_bufs, switchable_ctx, skip_ver, skip_hor);
+ }
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ // Recompute final MC data if required
+ if (x->recalc_luma_mc_data == 1) {
+    // Recomputing the final luma MC data is required only if it was skipped
+    // in either of the directions. The condition below is necessary, but not
+    // sufficient.
+ assert((skip_hor == 1) || (skip_ver == 1));
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ }
+ x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4);
+
+ // save search results
+ if (cpi->sf.interp_sf.use_interp_filter) {
+ assert(match_found_idx == -1);
+ args->interp_filter_stats_idx = save_interp_filter_search_stat(
+ mbmi, *rd, x->pred_sse[ref_frame], args->interp_filter_stats,
+ args->interp_filter_stats_idx);
+ }
+ return 0;
+}
diff --git a/third_party/aom/av1/encoder/interp_search.h b/third_party/aom/av1/encoder/interp_search.h
new file mode 100644
index 0000000000..9815e0bcfb
--- /dev/null
+++ b/third_party/aom/av1/encoder/interp_search.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_
+#define AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rdopt_utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+#define MAX_INTERP_FILTER_STATS 128
+#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
+
+typedef struct {
+ int_interpfilters filters;
+ int_mv mv[2];
+ int8_t ref_frames[2];
+ COMPOUND_TYPE comp_type;
+ int compound_idx;
+ int64_t rd;
+ unsigned int pred_sse;
+} INTERPOLATION_FILTER_STATS;
+/*!\endcond */
+
+/*!\brief Miscellaneous arguments for inter mode search.
+ */
+typedef struct HandleInterModeArgs {
+ /*!
+ * Buffer for the above predictor in OBMC
+ */
+ uint8_t *above_pred_buf[MAX_MB_PLANE];
+ /*!
+ * Stride for the above predictor in OBMC
+ */
+ int above_pred_stride[MAX_MB_PLANE];
+ /*!
+ * Buffer for the left predictor in OBMC
+ */
+ uint8_t *left_pred_buf[MAX_MB_PLANE];
+ /*!
+ * Stride for the left predictor in OBMC
+ */
+ int left_pred_stride[MAX_MB_PLANE];
+ /*!
+ * Pointer to the first member in a 2D array which holds
+ * single reference mode motion vectors to be used as a starting
+ * point in the mv search for compound modes. Each array is length REF_FRAMES,
+ * meaning there is a slot for a single reference motion vector for
+ * each possible reference frame. The 2D array consists of N of these arrays,
+ * where N is the length of the reference mv stack computed for the single
+ * reference case for that particular reference frame.
+ */
+ int_mv (*single_newmv)[REF_FRAMES];
+ /*!
+ * Pointer to the first array of a 2D array with the same setup as
+ * single_newmv array above. This is a 2D array to hold the rate
+ * corresponding to each of the single reference mode motion vectors
+ * held in single_newmv.
+ */
+ int (*single_newmv_rate)[REF_FRAMES];
+ /*!
+ * Pointer to the first array of a 2D array with the same setup as
+ * single_newmv array above. This is a 2D array to hold a 0 or 1
+ * validity value corresponding to each of the single reference mode motion
+ * vectors held in single_newmv.
+ */
+ int (*single_newmv_valid)[REF_FRAMES];
+ /*!
+ * Pointer to the first array in a 3D array of predicted rate-distortion.
+ * The dimensions of this structure are:
+ * (number of possible inter modes) X
+ * (number of reference MVs) X
+ * (number of reference frames).
+ */
+ int64_t (*modelled_rd)[MAX_REF_MV_SEARCH][REF_FRAMES];
+ /*!
+ * Holds an estimated entropy cost for picking the current reference frame.
+ * This is used to compute an rd estimate.
+ */
+ int ref_frame_cost;
+ /*!
+ * Holds an estimated entropy cost for picking single or compound
+ * reference. This is used to compute an rd estimate.
+ */
+ int single_comp_cost;
+ /*!
+ * Pointer to the first element in a 3D array holding rd's of
+ * SIMPLE_TRANSLATION used to prune out the motion mode search in single ref
+ * modes used to determine compound ref modes. The full structure is:
+ * (number of inter modes) X (length of refmv list) X (number of ref frames)
+ */
+ int64_t (*simple_rd)[MAX_REF_MV_SEARCH][REF_FRAMES];
+ /*!
+ * An integer value 0 or 1 which indicates whether or not to skip the motion
+ * mode search and default to SIMPLE_TRANSLATION as a speed feature.
+ */
+ int skip_motion_mode;
+ /*!
+ * Initialized to false. If true, skips interpolation filter search and uses
+ * the default EIGHTTAP_REGULAR.
+ */
+ bool skip_ifs;
+ /*!
+ * A pointer to the first element in an array of INTERINTRA_MODE types. This
+ * contains the best inter_intra mode for each reference frame.
+ */
+ INTERINTRA_MODE *inter_intra_mode;
+ /*!
+ * Array of saved interpolation filter stats collected to avoid repeating
+ * an interpolation filter search when the mv and ref_frame are the same
+ * as a previous search.
+ */
+ INTERPOLATION_FILTER_STATS interp_filter_stats[MAX_INTERP_FILTER_STATS];
+
+ /*!
+ * Stack to store full pixel search start mv of NEWMV mode.
+ */
+ FULLPEL_MV start_mv_stack[(MAX_REF_MV_SEARCH - 1) * 2];
+
+ /*!
+ * Stack to store ref_mv_idx of NEWMV mode.
+ */
+ uint8_t ref_mv_idx_stack[(MAX_REF_MV_SEARCH - 1) * 2];
+
+ /*!
+ * Count of mvs in start mv stack.
+ */
+ int start_mv_cnt;
+
+ /*!
+ * Index of the last set of saved stats in the interp_filter_stats array.
+ */
+ int interp_filter_stats_idx;
+ /*!
+ * Estimated wedge index.
+ */
+ int wedge_index;
+ /*!
+ * Estimated wedge sign.
+ */
+ int wedge_sign;
+ /*!
+ * Estimated diff wtd index.
+ */
+ int diffwtd_index;
+ /*!
+ * Estimated cmp mode.
+ */
+ int cmp_mode[MODE_CTX_REF_FRAMES];
+ /*!
+ * The best sse during single new_mv search. Note that the sse here comes from
+ * single_motion_search, and not from interpolation_filter_search. This has
+ * two implications:
+   * 1. The mv used to calculate the sse here does not have to be the one
+   *    with the best sse found in handle_inter_mode.
+   * 2. Even if the mvs agree, the sse here can differ from the sse in \ref
+   *    MACROBLOCK::pred_sse due to the different interpolation filters used.
+ */
+ unsigned int best_single_sse_in_refs[REF_FRAMES];
+ /*!
+ * Holds the sse of best mode so far in the mode evaluation process. This is
+ * used in intermediate termination of NEWMV mode evaluation.
+ */
+ unsigned int best_pred_sse;
+} HandleInterModeArgs;
+
+/*!\cond */
+static const int_interpfilters filter_sets[DUAL_FILTER_SET_SIZE] = {
+ { 0x00000000 }, { 0x00010000 }, { 0x00020000 }, // y = 0
+ { 0x00000001 }, { 0x00010001 }, { 0x00020001 }, // y = 1
+ { 0x00000002 }, { 0x00010002 }, { 0x00020002 }, // y = 2
+};
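+// Illustrative mapping (assuming int_interpfilters packs y_filter in the low
+// 16 bits and x_filter in the high 16 bits): filter_sets[i] corresponds to
+// x_filter == i % SWITCHABLE_FILTERS and y_filter == i / SWITCHABLE_FILTERS,
+// e.g. filter_sets[5] == 0x00020001 pairs a sharp x filter with a smooth y
+// filter.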
+
+int av1_find_interp_filter_match(
+ MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi,
+ const InterpFilter assign_filter, const int need_search,
+ INTERPOLATION_FILTER_STATS *interp_filter_stats,
+ int interp_filter_stats_idx);
+
+int64_t av1_interpolation_filter_search(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst,
+ int64_t *const rd, int *const switchable_rate, int *skip_build_pred,
+ HandleInterModeArgs *args, int64_t ref_best_rd);
+
+/*!\endcond */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/intra_mode_search.c b/third_party/aom/av1/encoder/intra_mode_search.c
new file mode 100644
index 0000000000..99b0af2f8e
--- /dev/null
+++ b/third_party/aom/av1/encoder/intra_mode_search.c
@@ -0,0 +1,1739 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/cfl.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/tx_search.h"
+
+// Even though there are 7 delta angles, this macro is set to 9 to facilitate
+// the rd threshold check that prunes the -3 and 3 delta angles.
+#define SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY (2 * MAX_ANGLE_DELTA + 3)
+
+// The order for evaluating delta angles while processing the luma directional
+// intra modes. Currently, this order of evaluation is applicable only when
+// speed feature prune_luma_odd_delta_angles_in_intra is enabled. In this case,
+// even angles are evaluated first in order to facilitate the pruning of odd
+// delta angles based on the rd costs of the neighboring delta angles.
+static const int8_t luma_delta_angles_order[2 * MAX_ANGLE_DELTA] = {
+ -2, 2, -3, -1, 1, 3,
+};
+
+/*!\cond */
+static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = {
+ DC_PRED, H_PRED, V_PRED, SMOOTH_PRED, PAETH_PRED,
+ SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED, D157_PRED,
+ D67_PRED, D113_PRED, D45_PRED,
+};
+
+static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
+ UV_DC_PRED, UV_CFL_PRED, UV_H_PRED, UV_V_PRED,
+ UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED,
+ UV_D135_PRED, UV_D203_PRED, UV_D157_PRED, UV_D67_PRED,
+ UV_D113_PRED, UV_D45_PRED,
+};
+
+// The bitmask corresponds to the filter intra modes as defined in enums.h
+// FILTER_INTRA_MODE enumeration type. Setting a bit to 0 in the mask means to
+// disable the evaluation of corresponding filter intra mode. The table
+// av1_derived_filter_intra_mode_used_flag is used when speed feature
+// prune_filter_intra_level is 1. The evaluated filter intra modes are union
+// of the following:
+// 1) FILTER_DC_PRED
+// 2) mode that corresponds to best mode so far of DC_PRED, V_PRED, H_PRED,
+// D157_PRED and PAETH_PRED. (Eg: FILTER_V_PRED if best mode so far is V_PRED).
+static const uint8_t av1_derived_filter_intra_mode_used_flag[INTRA_MODES] = {
+ 0x01, // DC_PRED: 0000 0001
+ 0x03, // V_PRED: 0000 0011
+ 0x05, // H_PRED: 0000 0101
+ 0x01, // D45_PRED: 0000 0001
+ 0x01, // D135_PRED: 0000 0001
+ 0x01, // D113_PRED: 0000 0001
+ 0x09, // D157_PRED: 0000 1001
+ 0x01, // D203_PRED: 0000 0001
+ 0x01, // D67_PRED: 0000 0001
+ 0x01, // SMOOTH_PRED: 0000 0001
+ 0x01, // SMOOTH_V_PRED: 0000 0001
+ 0x01, // SMOOTH_H_PRED: 0000 0001
+ 0x11 // PAETH_PRED: 0001 0001
+};
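+// Illustrative example: if the best mode so far is V_PRED, the mask 0x03
+// enables bit 0 (FILTER_DC_PRED) and bit 1 (FILTER_V_PRED), so only those two
+// filter intra modes are evaluated.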
+
+// The bitmask corresponds to the chroma intra modes as defined in enums.h
+// UV_PREDICTION_MODE enumeration type. Setting a bit to 0 in the mask means to
+// disable the evaluation of corresponding chroma intra mode. The table
+// av1_derived_chroma_intra_mode_used_flag is used when speed feature
+// prune_chroma_modes_using_luma_winner is enabled. The evaluated chroma
+// intra modes are union of the following:
+// 1) UV_DC_PRED
+// 2) UV_SMOOTH_PRED
+// 3) UV_CFL_PRED
+// 4) mode that corresponds to luma intra mode winner (Eg : UV_V_PRED if luma
+// intra mode winner is V_PRED).
+static const uint16_t av1_derived_chroma_intra_mode_used_flag[INTRA_MODES] = {
+ 0x2201, // DC_PRED: 0010 0010 0000 0001
+ 0x2203, // V_PRED: 0010 0010 0000 0011
+ 0x2205, // H_PRED: 0010 0010 0000 0101
+ 0x2209, // D45_PRED: 0010 0010 0000 1001
+ 0x2211, // D135_PRED: 0010 0010 0001 0001
+ 0x2221, // D113_PRED: 0010 0010 0010 0001
+ 0x2241, // D157_PRED: 0010 0010 0100 0001
+ 0x2281, // D203_PRED: 0010 0010 1000 0001
+ 0x2301, // D67_PRED: 0010 0011 0000 0001
+ 0x2201, // SMOOTH_PRED: 0010 0010 0000 0001
+ 0x2601, // SMOOTH_V_PRED: 0010 0110 0000 0001
+ 0x2a01, // SMOOTH_H_PRED: 0010 1010 0000 0001
+ 0x3201 // PAETH_PRED: 0011 0010 0000 0001
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, all_zeros[MAX_SB_SIZE]) = { 0 };
+DECLARE_ALIGNED(16, static const uint16_t,
+ highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+int av1_calc_normalized_variance(aom_variance_fn_t vf, const uint8_t *const buf,
+ const int stride, const int is_hbd) {
+ unsigned int sse;
+
+ if (is_hbd)
+ return vf(buf, stride, CONVERT_TO_BYTEPTR(highbd_all_zeros), 0, &sse);
+ else
+ return vf(buf, stride, all_zeros, 0, &sse);
+}
+
+// Computes average of log(1 + variance) across 4x4 sub-blocks for source and
+// reconstructed blocks.
+static void compute_avg_log_variance(const AV1_COMP *const cpi, MACROBLOCK *x,
+ const BLOCK_SIZE bs,
+ double *avg_log_src_variance,
+ double *avg_log_recon_variance) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size;
+ const int mi_row_in_sb = x->e_mbd.mi_row & (mi_size_high[sb_size] - 1);
+ const int mi_col_in_sb = x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1);
+ const int right_overflow =
+ (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+ const int bottom_overflow =
+ (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+ const int bw = (MI_SIZE * mi_size_wide[bs] - right_overflow);
+ const int bh = (MI_SIZE * mi_size_high[bs] - bottom_overflow);
+ const int is_hbd = is_cur_buf_hbd(xd);
+
+ for (int i = 0; i < bh; i += MI_SIZE) {
+ const int r = mi_row_in_sb + (i >> MI_SIZE_LOG2);
+ for (int j = 0; j < bw; j += MI_SIZE) {
+ const int c = mi_col_in_sb + (j >> MI_SIZE_LOG2);
+ const int mi_offset = r * mi_size_wide[sb_size] + c;
+ Block4x4VarInfo *block_4x4_var_info =
+ &x->src_var_info_of_4x4_sub_blocks[mi_offset];
+ int src_var = block_4x4_var_info->var;
+ double log_src_var = block_4x4_var_info->log_var;
+      // Compute the average of log(1 + variance) for the source block from
+      // 4x4 sub-block variance values. Calculate and store the 4x4 sub-block
+      // variance and log(1 + variance) if the values present in
+      // src_var_info_of_4x4_sub_blocks are invalid; otherwise reuse the
+      // readily available valid values.
+ if (src_var < 0) {
+ src_var = av1_calc_normalized_variance(
+ cpi->ppi->fn_ptr[BLOCK_4X4].vf,
+ x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+ x->plane[0].src.stride, is_hbd);
+ block_4x4_var_info->var = src_var;
+ log_src_var = log1p(src_var / 16.0);
+ block_4x4_var_info->log_var = log_src_var;
+ } else {
+ // When source variance is already calculated and available for
+ // retrieval, check if log(1 + variance) is also available. If it is
+ // available, then retrieve from buffer. Else, calculate the same and
+ // store to the buffer.
+ if (log_src_var < 0) {
+ log_src_var = log1p(src_var / 16.0);
+ block_4x4_var_info->log_var = log_src_var;
+ }
+ }
+ *avg_log_src_variance += log_src_var;
+
+ const int recon_var = av1_calc_normalized_variance(
+ cpi->ppi->fn_ptr[BLOCK_4X4].vf,
+ xd->plane[0].dst.buf + i * xd->plane[0].dst.stride + j,
+ xd->plane[0].dst.stride, is_hbd);
+ *avg_log_recon_variance += log1p(recon_var / 16.0);
+ }
+ }
+
+ const int blocks = (bw * bh) / 16;
+ *avg_log_src_variance /= (double)blocks;
+ *avg_log_recon_variance /= (double)blocks;
+}
+
+// Returns a factor to be applied to the RD value based on how well the
+// reconstructed block variance matches the source variance.
+static double intra_rd_variance_factor(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bs) {
+ double threshold = INTRA_RD_VAR_THRESH(cpi->oxcf.speed);
+ // For non-positive threshold values, the comparison of source and
+ // reconstructed variances with threshold evaluates to false
+  // (src_var < threshold / rec_var < threshold) as these metrics are greater
+  // than 0. Hence further calculations are skipped.
+ if (threshold <= 0) return 1.0;
+
+ double variance_rd_factor = 1.0;
+ double avg_log_src_variance = 0.0;
+ double avg_log_recon_variance = 0.0;
+ double var_diff = 0.0;
+
+ compute_avg_log_variance(cpi, x, bs, &avg_log_src_variance,
+ &avg_log_recon_variance);
+
+  // Don't allow 0, to prevent division by 0 below.
+ avg_log_src_variance += 0.000001;
+ avg_log_recon_variance += 0.000001;
+
+ if (avg_log_src_variance >= avg_log_recon_variance) {
+ var_diff = (avg_log_src_variance - avg_log_recon_variance);
+ if ((var_diff > 0.5) && (avg_log_recon_variance < threshold)) {
+ variance_rd_factor = 1.0 + ((var_diff * 2) / avg_log_src_variance);
+ }
+ } else {
+ var_diff = (avg_log_recon_variance - avg_log_src_variance);
+ if ((var_diff > 0.5) && (avg_log_src_variance < threshold)) {
+ variance_rd_factor = 1.0 + (var_diff / (2 * avg_log_src_variance));
+ }
+ }
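+  // Worked example (hypothetical numbers): with avg_log_src_variance == 2.0,
+  // avg_log_recon_variance == 1.0 and threshold == 2.5, var_diff == 1.0
+  // exceeds 0.5 and the recon variance is below the threshold, so
+  // variance_rd_factor == 1.0 + (1.0 * 2) / 2.0 == 2.0.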
+
+  // Limit the adjustment.
+ variance_rd_factor = AOMMIN(3.0, variance_rd_factor);
+
+ return variance_rd_factor;
+}
+/*!\endcond */
+
+/*!\brief Search for the best filter_intra mode when coding intra frame.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function loops through all filter_intra modes to find the best one.
+ *
+ * \return Returns 1 if a new filter_intra mode is selected; 0 otherwise.
+ */
+static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, uint8_t *skippable,
+ BLOCK_SIZE bsize, int mode_cost,
+ PREDICTION_MODE best_mode_so_far,
+ int64_t *best_rd, int64_t *best_model_rd,
+ PICK_MODE_CONTEXT *ctx) {
+ // Skip the evaluation of filter intra modes.
+ if (cpi->sf.intra_sf.prune_filter_intra_level == 2) return 0;
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ int filter_intra_selected_flag = 0;
+ FILTER_INTRA_MODE mode;
+ TX_SIZE best_tx_size = TX_8X8;
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ av1_zero(filter_intra_mode_info);
+ mbmi->filter_intra_mode_info.use_filter_intra = 1;
+ mbmi->mode = DC_PRED;
+ mbmi->palette_mode_info.palette_size[0] = 0;
+
+ // Skip the evaluation of filter-intra if cached MB_MODE_INFO does not have
+ // filter-intra as winner.
+ if (x->use_mb_mode_cache &&
+ !x->mb_mode_cache->filter_intra_mode_info.use_filter_intra)
+ return 0;
+
+ for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+ int64_t this_rd;
+ RD_STATS tokenonly_rd_stats;
+ mbmi->filter_intra_mode_info.filter_intra_mode = mode;
+
+ if ((cpi->sf.intra_sf.prune_filter_intra_level == 1) &&
+ !(av1_derived_filter_intra_mode_used_flag[best_mode_so_far] &
+ (1 << mode)))
+ continue;
+
+ // Skip the evaluation of modes that do not match with the winner mode in
+ // x->mb_mode_cache.
+ if (x->use_mb_mode_cache &&
+ mode != x->mb_mode_cache->filter_intra_mode_info.filter_intra_mode)
+ continue;
+
+ if (model_intra_yrd_and_prune(cpi, x, bsize, best_model_rd)) {
+ continue;
+ }
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+ *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) continue;
+ const int this_rate =
+ tokenonly_rd_stats.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0);
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+ // Visual quality adjustment based on recon vs source variance.
+ if ((cpi->oxcf.mode == ALLINTRA) && (this_rd != INT64_MAX)) {
+ this_rd = (int64_t)(this_rd * intra_rd_variance_factor(cpi, x, bsize));
+ }
+
+ // Collect mode stats for multiwinner mode processing
+ const int txfm_search_done = 1;
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ best_tx_size = mbmi->tx_size;
+ filter_intra_mode_info = mbmi->filter_intra_mode_info;
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ *rate = this_rate;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *skippable = tokenonly_rd_stats.skip_txfm;
+ filter_intra_selected_flag = 1;
+ }
+ }
+
+ if (filter_intra_selected_flag) {
+ mbmi->mode = DC_PRED;
+ mbmi->tx_size = best_tx_size;
+ mbmi->filter_intra_mode_info = filter_intra_mode_info;
+ av1_copy_array(ctx->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+void av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+ int *val_count, int *num_colors) {
+ const int max_pix_val = 1 << 8;
+ memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ const int this_val = src[r * stride + c];
+ assert(this_val < max_pix_val);
+ ++val_count[this_val];
+ }
+ }
+ int n = 0;
+ for (int i = 0; i < max_pix_val; ++i) {
+ if (val_count[i]) ++n;
+ }
+ *num_colors = n;
+}
+
+void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+ int cols, int bit_depth, int *val_count,
+ int *bin_val_count, int *num_color_bins,
+ int *num_colors) {
+ assert(bit_depth <= 12);
+ const int max_bin_val = 1 << 8;
+ const int max_pix_val = 1 << bit_depth;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ memset(bin_val_count, 0, max_bin_val * sizeof(val_count[0]));
+ if (val_count != NULL)
+ memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+      /*
+       * Down-convert the pixels to the 8-bit domain before counting.
+       * This provides consistency of behavior for palette search
+       * between lbd and hbd encodes. These down-converted pixels
+       * are only used for calculating the threshold (n).
+       */
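+      // For example (hypothetical values): with bit_depth == 10, pixels 612
+      // and 615 both fall in bin 153 (612 >> 2 == 615 >> 2), counting as one
+      // color bin but as two distinct high bit-depth colors.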
+ const int this_val = ((src[r * stride + c]) >> (bit_depth - 8));
+ assert(this_val < max_bin_val);
+ if (this_val >= max_bin_val) continue;
+ ++bin_val_count[this_val];
+ if (val_count != NULL) ++val_count[(src[r * stride + c])];
+ }
+ }
+ int n = 0;
+ // Count the colors based on 8-bit domain used to gate the palette path
+ for (int i = 0; i < max_bin_val; ++i) {
+ if (bin_val_count[i]) ++n;
+ }
+ *num_color_bins = n;
+
+ // Count the actual hbd colors used to create top_colors
+ n = 0;
+ if (val_count != NULL) {
+ for (int i = 0; i < max_pix_val; ++i) {
+ if (val_count[i]) ++n;
+ }
+ *num_colors = n;
+ }
+}
+
+void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi,
+ int reorder_delta_angle_eval) {
+ if (mode_idx < INTRA_MODE_END) {
+ mbmi->mode = intra_rd_search_mode_order[mode_idx];
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ } else {
+ mbmi->mode = (mode_idx - INTRA_MODE_END) / (MAX_ANGLE_DELTA * 2) + V_PRED;
+ int delta_angle_eval_idx =
+ (mode_idx - INTRA_MODE_END) % (MAX_ANGLE_DELTA * 2);
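+    // Illustrative mapping (assuming INTRA_MODE_END == 13 and
+    // MAX_ANGLE_DELTA == 3): mode_idx == 14 gives mbmi->mode == V_PRED with
+    // delta_angle_eval_idx == 1, which maps to angle delta -2 in the default
+    // order below.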
+ if (reorder_delta_angle_eval) {
+ mbmi->angle_delta[PLANE_TYPE_Y] =
+ luma_delta_angles_order[delta_angle_eval_idx];
+ } else {
+ mbmi->angle_delta[PLANE_TYPE_Y] =
+ (delta_angle_eval_idx < 3 ? (delta_angle_eval_idx - 3)
+ : (delta_angle_eval_idx - 2));
+ }
+ }
+}
+
+static AOM_INLINE int get_model_rd_index_for_pruning(
+ const MACROBLOCK *const x,
+ const INTRA_MODE_SPEED_FEATURES *const intra_sf) {
+ const int top_intra_model_count_allowed =
+ intra_sf->top_intra_model_count_allowed;
+ if (!intra_sf->adapt_top_model_rd_count_using_neighbors)
+ return top_intra_model_count_allowed - 1;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const PREDICTION_MODE mode = xd->mi[0]->mode;
+ int model_rd_index_for_pruning = top_intra_model_count_allowed - 1;
+ int is_left_mode_neq_cur_mode = 0, is_above_mode_neq_cur_mode = 0;
+ if (xd->left_available)
+ is_left_mode_neq_cur_mode = xd->left_mbmi->mode != mode;
+ if (xd->up_available)
+ is_above_mode_neq_cur_mode = xd->above_mbmi->mode != mode;
+ // The pruning of luma intra modes is made more aggressive at lower quantizers
+ // and vice versa. The value for model_rd_index_for_pruning is derived as
+ // follows.
+ // qidx 0 to 127: Reduce the index of a candidate used for comparison only if
+ // the current mode does not match either of the available neighboring modes.
+ // qidx 128 to 255: Reduce the index of a candidate used for comparison only
+ // if the current mode does not match both the available neighboring modes.
+ if (x->qindex <= 127) {
+ if (is_left_mode_neq_cur_mode || is_above_mode_neq_cur_mode)
+ model_rd_index_for_pruning = AOMMAX(model_rd_index_for_pruning - 1, 0);
+ } else {
+ if (is_left_mode_neq_cur_mode && is_above_mode_neq_cur_mode)
+ model_rd_index_for_pruning = AOMMAX(model_rd_index_for_pruning - 1, 0);
+ }
+ return model_rd_index_for_pruning;
+}
+
+int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd,
+ int64_t top_intra_model_rd[], int max_model_cnt_allowed,
+ int model_rd_index_for_pruning) {
+ const double thresh_best = 1.50;
+ const double thresh_top = 1.00;
+ for (int i = 0; i < max_model_cnt_allowed; i++) {
+ if (this_model_rd < top_intra_model_rd[i]) {
+ for (int j = max_model_cnt_allowed - 1; j > i; j--) {
+ top_intra_model_rd[j] = top_intra_model_rd[j - 1];
+ }
+ top_intra_model_rd[i] = this_model_rd;
+ break;
+ }
+ }
+ if (top_intra_model_rd[model_rd_index_for_pruning] != INT64_MAX &&
+ this_model_rd >
+ thresh_top * top_intra_model_rd[model_rd_index_for_pruning])
+ return 1;
+
+ if (this_model_rd != INT64_MAX &&
+ this_model_rd > thresh_best * (*best_model_rd))
+ return 1;
+ if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
+ return 0;
+}
+
+// Run the RD calculation with the given chroma intra prediction angle, and
+// return the RD cost. Update the best mode info if the RD cost is the best
+// so far.
+static int64_t pick_intra_angle_routine_sbuv(
+ const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats,
+ int *best_angle_delta, int64_t *best_rd) {
+ MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
+ assert(!is_inter_block(mbmi));
+ int this_rate;
+ int64_t this_rd;
+ RD_STATS tokenonly_rd_stats;
+
+ if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in))
+ return INT64_MAX;
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, rate_overhead);
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+ *rate = this_rate;
+ rd_stats->rate = tokenonly_rd_stats.rate;
+ rd_stats->dist = tokenonly_rd_stats.dist;
+ rd_stats->skip_txfm = tokenonly_rd_stats.skip_txfm;
+ }
+ return this_rd;
+}
+
+/*!\brief Search for the best angle delta for chroma prediction
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * Given a chroma directional intra prediction mode, this function will try to
+ * estimate the best delta_angle.
+ *
+ * \returns Return if there is a new mode with smaller rdcost than best_rd.
+ */
+static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int rate_overhead,
+ int64_t best_rd, int *rate,
+ RD_STATS *rd_stats) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ int i, angle_delta, best_angle_delta = 0;
+ int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+
+ rd_stats->rate = INT_MAX;
+ rd_stats->skip_txfm = 0;
+ rd_stats->dist = INT64_MAX;
+ for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
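+  // rd_cost[2 * d + i] holds the cost for angle_delta == (1 - 2 * i) * d,
+  // i.e. i == 0 stores +d and i == 1 stores -d; e.g. rd_cost[5] is the cost
+  // of angle delta -2.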
+
+ for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ for (i = 0; i < 2; ++i) {
+ best_rd_in = (best_rd == INT64_MAX)
+ ? INT64_MAX
+ : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));
+ mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+ this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead,
+ best_rd_in, rate, rd_stats,
+ &best_angle_delta, &best_rd);
+ rd_cost[2 * angle_delta + i] = this_rd;
+ if (angle_delta == 0) {
+ if (this_rd == INT64_MAX) return 0;
+ rd_cost[1] = this_rd;
+ break;
+ }
+ }
+ }
+
+ assert(best_rd != INT64_MAX);
+ for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ int64_t rd_thresh;
+ for (i = 0; i < 2; ++i) {
+ int skip_search = 0;
+ rd_thresh = best_rd + (best_rd >> 5);
+ if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
+ rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
+ skip_search = 1;
+ if (!skip_search) {
+ mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+ pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+ rate, rd_stats, &best_angle_delta,
+ &best_rd);
+ }
+ }
+ }
+
+ mbmi->angle_delta[PLANE_TYPE_UV] = best_angle_delta;
+ return rd_stats->rate != INT_MAX;
+}
+
+#define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \
+ (plane == CFL_PRED_U ? a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1)
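+// Illustrative example (assuming CFL_SIGN_ZERO == 0, CFL_SIGN_NEG == 1,
+// CFL_SIGN_POS == 2 and CFL_SIGNS == 3): for the U plane with sign a and a
+// dummy V sign b, the joint sign is a * 3 + b - 1; the (ZERO, ZERO)
+// combination maps to -1 and is therefore never a valid joint sign.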
+
+static void cfl_idx_to_sign_and_alpha(int cfl_idx, CFL_SIGN_TYPE *cfl_sign,
+ int *cfl_alpha) {
+ int cfl_linear_idx = cfl_idx - CFL_INDEX_ZERO;
+ if (cfl_linear_idx == 0) {
+ *cfl_sign = CFL_SIGN_ZERO;
+ *cfl_alpha = 0;
+ } else {
+ *cfl_sign = cfl_linear_idx > 0 ? CFL_SIGN_POS : CFL_SIGN_NEG;
+ *cfl_alpha = abs(cfl_linear_idx) - 1;
+ }
+}
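+// Illustrative examples (assuming CFL_INDEX_ZERO == 16): cfl_idx == 16 maps
+// to (CFL_SIGN_ZERO, alpha 0), cfl_idx == 20 to (CFL_SIGN_POS, alpha 3) and
+// cfl_idx == 10 to (CFL_SIGN_NEG, alpha 5).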
+
+static int64_t cfl_compute_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int plane, TX_SIZE tx_size,
+ BLOCK_SIZE plane_bsize, int cfl_idx,
+ int fast_mode, RD_STATS *rd_stats) {
+ assert(IMPLIES(fast_mode, rd_stats == NULL));
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int cfl_plane = get_cfl_pred_type(plane);
+ CFL_SIGN_TYPE cfl_sign;
+ int cfl_alpha;
+ cfl_idx_to_sign_and_alpha(cfl_idx, &cfl_sign, &cfl_alpha);
+  // We only build CFL for a given plane; the other plane's sign is a dummy.
+ int dummy_sign = CFL_SIGN_NEG;
+ const int8_t orig_cfl_alpha_signs = mbmi->cfl_alpha_signs;
+ const uint8_t orig_cfl_alpha_idx = mbmi->cfl_alpha_idx;
+ mbmi->cfl_alpha_signs =
+ PLANE_SIGN_TO_JOINT_SIGN(cfl_plane, cfl_sign, dummy_sign);
+ mbmi->cfl_alpha_idx = (cfl_alpha << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha;
+ int64_t cfl_cost;
+ if (fast_mode) {
+ cfl_cost =
+ intra_model_rd(cm, x, plane, plane_bsize, tx_size, /*use_hadamard=*/0);
+ } else {
+ av1_init_rd_stats(rd_stats);
+ av1_txfm_rd_in_plane(x, cpi, rd_stats, INT64_MAX, 0, plane, plane_bsize,
+ tx_size, FTXS_NONE, 0);
+ av1_rd_cost_update(x->rdmult, rd_stats);
+ cfl_cost = rd_stats->rdcost;
+ }
+ mbmi->cfl_alpha_signs = orig_cfl_alpha_signs;
+ mbmi->cfl_alpha_idx = orig_cfl_alpha_idx;
+ return cfl_cost;
+}
+
+static const int cfl_dir_ls[2] = { 1, -1 };
+
+// If cfl_search_range is CFL_MAGS_SIZE, return CFL_INDEX_ZERO. Otherwise
+// return the index of the best alpha found using intra_model_rd().
+static int cfl_pick_plane_parameter(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int plane, TX_SIZE tx_size,
+ int cfl_search_range) {
+ assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
+
+ if (cfl_search_range == CFL_MAGS_SIZE) return CFL_INDEX_ZERO;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(mbmi->uv_mode == UV_CFL_PRED);
+ const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+
+ int est_best_cfl_idx = CFL_INDEX_ZERO;
+ int fast_mode = 1;
+ int start_cfl_idx = CFL_INDEX_ZERO;
+ int64_t best_cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize,
+ start_cfl_idx, fast_mode, NULL);
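+  // Greedy search sketch: starting from alpha index 0, walk outward in each
+  // direction and stop at the first index whose estimated (fast-mode) cost
+  // no longer improves on the best seen so far.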
+ for (int si = 0; si < 2; ++si) {
+ const int dir = cfl_dir_ls[si];
+ for (int i = 1; i < CFL_MAGS_SIZE; ++i) {
+ int cfl_idx = start_cfl_idx + dir * i;
+ if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break;
+ int64_t cfl_cost = cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize,
+ cfl_idx, fast_mode, NULL);
+ if (cfl_cost < best_cfl_cost) {
+ best_cfl_cost = cfl_cost;
+ est_best_cfl_idx = cfl_idx;
+ } else {
+ break;
+ }
+ }
+ }
+ return est_best_cfl_idx;
+}
+
+static AOM_INLINE void set_invalid_cfl_parameters(
+ uint8_t *best_cfl_alpha_idx, int8_t *best_cfl_alpha_signs) {
+ *best_cfl_alpha_idx = 0;
+ *best_cfl_alpha_signs = 0;
+}
+
+static void cfl_pick_plane_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int plane, TX_SIZE tx_size, int cfl_search_range,
+ RD_STATS cfl_rd_arr[CFL_MAGS_SIZE],
+ int est_best_cfl_idx) {
+ assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(mbmi->uv_mode == UV_CFL_PRED);
+ const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+
+ for (int cfl_idx = 0; cfl_idx < CFL_MAGS_SIZE; ++cfl_idx) {
+ av1_invalid_rd_stats(&cfl_rd_arr[cfl_idx]);
+ }
+
+ int fast_mode = 0;
+ int start_cfl_idx = est_best_cfl_idx;
+ cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, start_cfl_idx, fast_mode,
+ &cfl_rd_arr[start_cfl_idx]);
+
+ if (cfl_search_range == 1) return;
+
+ for (int si = 0; si < 2; ++si) {
+ const int dir = cfl_dir_ls[si];
+ for (int i = 1; i < cfl_search_range; ++i) {
+ int cfl_idx = start_cfl_idx + dir * i;
+ if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break;
+ cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, cfl_idx, fast_mode,
+ &cfl_rd_arr[cfl_idx]);
+ }
+ }
+}
+
+/*!\brief Pick the optimal parameters for Chroma to Luma (CFL) component
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ *
+ * This function will use DCT_DCT followed by computing SATD (sum of absolute
+ * transformed differences) to estimate the RD score and find the best possible
+ * CFL parameter.
+ *
+ * Then the function will apply a full RD search near the best possible CFL
+ * parameter to find the best actual CFL parameter.
+ *
+ * Side effect:
+ * We use the buffers in x->plane[] and xd->plane[] as throw-away buffers for
+ * RD
+ * search.
+ *
+ * \param[in] x Encoder prediction block structure.
+ * \param[in] cpi Top-level encoder instance structure.
+ * \param[in] tx_size Transform size.
+ * \param[in] ref_best_rd Reference best RD.
+ * \param[in] cfl_search_range The search range of full RD search near the
+ * estimated best CFL parameter.
+ *
+ * \param[out] best_rd_stats RD stats of the best CFL parameter
+ * \param[out] best_cfl_alpha_idx Best CFL alpha index
+ * \param[out] best_cfl_alpha_signs Best CFL joint signs
+ *
+ */
+static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
+ TX_SIZE tx_size, int64_t ref_best_rd,
+ int cfl_search_range, RD_STATS *best_rd_stats,
+ uint8_t *best_cfl_alpha_idx,
+ int8_t *best_cfl_alpha_signs) {
+ assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
+ const ModeCosts *mode_costs = &x->mode_costs;
+ RD_STATS cfl_rd_arr_u[CFL_MAGS_SIZE];
+ RD_STATS cfl_rd_arr_v[CFL_MAGS_SIZE];
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int est_best_cfl_idx_u, est_best_cfl_idx_v;
+
+ av1_invalid_rd_stats(best_rd_stats);
+
+  // As the dc pred data is the same for different values of alpha, enable the
+ // caching of dc pred data. Call clear_cfl_dc_pred_cache_flags() before
+ // returning to avoid the unintentional usage of cached dc pred data.
+ xd->cfl.use_dc_pred_cache = true;
+ // Evaluate alpha parameter of each chroma plane.
+ est_best_cfl_idx_u =
+ cfl_pick_plane_parameter(cpi, x, 1, tx_size, cfl_search_range);
+ est_best_cfl_idx_v =
+ cfl_pick_plane_parameter(cpi, x, 2, tx_size, cfl_search_range);
+
+ if (cfl_search_range == 1) {
+    // For cfl_search_range=1, further refinement of alpha is not enabled.
+    // Hence CfL index=0 for both the chroma planes implies an invalid CfL
+    // mode.
+ if (est_best_cfl_idx_u == CFL_INDEX_ZERO &&
+ est_best_cfl_idx_v == CFL_INDEX_ZERO) {
+ set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs);
+ clear_cfl_dc_pred_cache_flags(&xd->cfl);
+ return 0;
+ }
+
+ int cfl_alpha_u, cfl_alpha_v;
+ CFL_SIGN_TYPE cfl_sign_u, cfl_sign_v;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ cfl_idx_to_sign_and_alpha(est_best_cfl_idx_u, &cfl_sign_u, &cfl_alpha_u);
+ cfl_idx_to_sign_and_alpha(est_best_cfl_idx_v, &cfl_sign_v, &cfl_alpha_v);
+ const int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1;
+ // Compute alpha and mode signaling rate.
+ const int rate_overhead =
+ mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u] +
+ mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v] +
+ mode_costs
+ ->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_CFL_PRED];
+ // Skip the CfL mode evaluation if the RD cost derived using the rate needed
+ // to signal the CfL mode and alpha parameter exceeds the ref_best_rd.
+ if (RDCOST(x->rdmult, rate_overhead, 0) > ref_best_rd) {
+ set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs);
+ clear_cfl_dc_pred_cache_flags(&xd->cfl);
+ return 0;
+ }
+ }
+
+ // Compute the rd cost of each chroma plane using the alpha parameters which
+ // were already evaluated.
+ cfl_pick_plane_rd(cpi, x, 1, tx_size, cfl_search_range, cfl_rd_arr_u,
+ est_best_cfl_idx_u);
+ cfl_pick_plane_rd(cpi, x, 2, tx_size, cfl_search_range, cfl_rd_arr_v,
+ est_best_cfl_idx_v);
+
+ clear_cfl_dc_pred_cache_flags(&xd->cfl);
+
+ for (int ui = 0; ui < CFL_MAGS_SIZE; ++ui) {
+ if (cfl_rd_arr_u[ui].rate == INT_MAX) continue;
+ int cfl_alpha_u;
+ CFL_SIGN_TYPE cfl_sign_u;
+ cfl_idx_to_sign_and_alpha(ui, &cfl_sign_u, &cfl_alpha_u);
+ for (int vi = 0; vi < CFL_MAGS_SIZE; ++vi) {
+ if (cfl_rd_arr_v[vi].rate == INT_MAX) continue;
+ int cfl_alpha_v;
+ CFL_SIGN_TYPE cfl_sign_v;
+ cfl_idx_to_sign_and_alpha(vi, &cfl_sign_v, &cfl_alpha_v);
+ // cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO is not a
+ // valid parameter for CFL
+ if (cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO) continue;
+ int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1;
+ RD_STATS rd_stats = cfl_rd_arr_u[ui];
+ av1_merge_rd_stats(&rd_stats, &cfl_rd_arr_v[vi]);
+ if (rd_stats.rate != INT_MAX) {
+ rd_stats.rate +=
+ mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u];
+ rd_stats.rate +=
+ mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v];
+ }
+ av1_rd_cost_update(x->rdmult, &rd_stats);
+ if (rd_stats.rdcost < best_rd_stats->rdcost) {
+ *best_rd_stats = rd_stats;
+ *best_cfl_alpha_idx =
+ (cfl_alpha_u << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha_v;
+ *best_cfl_alpha_signs = joint_sign;
+ }
+ }
+ }
+ if (best_rd_stats->rdcost >= ref_best_rd) {
+ av1_invalid_rd_stats(best_rd_stats);
+ // Set invalid CFL parameters here since the rdcost is not better than
+ // ref_best_rd.
+ set_invalid_cfl_parameters(best_cfl_alpha_idx, best_cfl_alpha_signs);
+ return 0;
+ }
+ return 1;
+}
+
+static bool should_prune_chroma_smooth_pred_based_on_source_variance(
+ const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize) {
+ if (!cpi->sf.intra_sf.prune_smooth_intra_mode_for_chroma) return false;
+
+ // If the source variance of both chroma planes is less than 20 (empirically
+ // derived), prune UV_SMOOTH_PRED.
+ for (int i = AOM_PLANE_U; i < av1_num_planes(&cpi->common); i++) {
+ const unsigned int variance = av1_get_perpixel_variance_facade(
+ cpi, &x->e_mbd, &x->plane[i].src, bsize, i);
+ if (variance >= 20) return false;
+ }
+ return true;
+}
+
+int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, uint8_t *skippable,
+ BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ MB_MODE_INFO best_mbmi = *mbmi;
+ int64_t best_rd = INT64_MAX, this_rd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg;
+
+ init_sbuv_mode(mbmi);
+
+ // Return if the current block does not correspond to a chroma block.
+ if (!xd->is_chroma_ref) {
+ *rate = 0;
+ *rate_tokenonly = 0;
+ *distortion = 0;
+ *skippable = 1;
+ return INT64_MAX;
+ }
+
+ // Only store reconstructed luma when there's chroma RDO. When there's no
+ // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+ xd->cfl.store_y = store_cfl_required_rdo(cm, x);
+ if (xd->cfl.store_y) {
+ // Restore reconstructed luma values.
+ // TODO(chiyotsai@google.com): right now we are re-computing the txfm in
+    // this function every time we search through uv modes. There is some
+ // potential speed up here if we cache the result to avoid redundant
+ // computation.
+ av1_encode_intra_block_plane(cpi, x, mbmi->bsize, AOM_PLANE_Y,
+ DRY_RUN_NORMAL,
+ cpi->optimize_seg_arr[mbmi->segment_id]);
+ xd->cfl.store_y = 0;
+ }
+ IntraModeSearchState intra_search_state;
+ init_intra_mode_search_state(&intra_search_state);
+ const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd);
+
+ // Search through all non-palette modes.
+ for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) {
+ int this_rate;
+ RD_STATS tokenonly_rd_stats;
+ UV_PREDICTION_MODE uv_mode = uv_rd_search_mode_order[mode_idx];
+
+ // Skip the current mode evaluation if the RD cost derived using the mode
+ // signaling rate exceeds the best_rd so far.
+ const int mode_rate =
+ mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode];
+ if (RDCOST(x->rdmult, mode_rate, 0) > best_rd) continue;
+
+ PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+ const int is_diagonal_mode = av1_is_diagonal_mode(intra_mode);
+ const int is_directional_mode = av1_is_directional_mode(intra_mode);
+
+ if (is_diagonal_mode && !cpi->oxcf.intra_mode_cfg.enable_diagonal_intra)
+ continue;
+ if (is_directional_mode &&
+ !cpi->oxcf.intra_mode_cfg.enable_directional_intra)
+ continue;
+
+ if (!(cpi->sf.intra_sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
+ (1 << uv_mode)))
+ continue;
+ if (!intra_mode_cfg->enable_smooth_intra && uv_mode >= UV_SMOOTH_PRED &&
+ uv_mode <= UV_SMOOTH_H_PRED)
+ continue;
+
+ if (!intra_mode_cfg->enable_paeth_intra && uv_mode == UV_PAETH_PRED)
+ continue;
+
+ assert(mbmi->mode < INTRA_MODES);
+ if (cpi->sf.intra_sf.prune_chroma_modes_using_luma_winner &&
+ !(av1_derived_chroma_intra_mode_used_flag[mbmi->mode] & (1 << uv_mode)))
+ continue;
+
+ mbmi->uv_mode = uv_mode;
+
+ // Init variables for cfl and angle delta
+ const SPEED_FEATURES *sf = &cpi->sf;
+ mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+ if (uv_mode == UV_CFL_PRED) {
+ if (!cfl_allowed || !intra_mode_cfg->enable_cfl_intra) continue;
+ assert(!is_directional_mode);
+ const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+ if (!cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd,
+ sf->intra_sf.cfl_search_range, &tokenonly_rd_stats,
+ &mbmi->cfl_alpha_idx, &mbmi->cfl_alpha_signs)) {
+ continue;
+ }
+ } else if (is_directional_mode && av1_use_angle_delta(mbmi->bsize) &&
+ intra_mode_cfg->enable_angle_delta) {
+ if (sf->intra_sf.chroma_intra_pruning_with_hog &&
+ !intra_search_state.dir_mode_skip_mask_ready) {
+ static const float thresh[2][4] = {
+ { -1.2f, 0.0f, 0.0f, 1.2f }, // Interframe
+ { -1.2f, -1.2f, -0.6f, 0.4f }, // Intraframe
+ };
+ const int is_chroma = 1;
+ const int is_intra_frame = frame_is_intra_only(cm);
+ prune_intra_mode_with_hog(
+ x, bsize, cm->seq_params->sb_size,
+ thresh[is_intra_frame]
+ [sf->intra_sf.chroma_intra_pruning_with_hog - 1],
+ intra_search_state.directional_mode_skip_mask, is_chroma);
+ intra_search_state.dir_mode_skip_mask_ready = 1;
+ }
+ if (intra_search_state.directional_mode_skip_mask[uv_mode]) {
+ continue;
+ }
+
+ // Search through angle delta
+ const int rate_overhead =
+ mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode];
+ if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+ &this_rate, &tokenonly_rd_stats))
+ continue;
+ } else {
+ if (uv_mode == UV_SMOOTH_PRED &&
+ should_prune_chroma_smooth_pred_based_on_source_variance(cpi, x,
+ bsize))
+ continue;
+
+ // Predict directly if we don't need to search for angle delta.
+ if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) {
+ continue;
+ }
+ }
+ const int mode_cost =
+ mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][uv_mode];
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost);
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+ if (this_rd < best_rd) {
+ best_mbmi = *mbmi;
+ best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *skippable = tokenonly_rd_stats.skip_txfm;
+ }
+ }
+
+ // Search palette mode
+ const int try_palette =
+ cpi->oxcf.tool_cfg.enable_palette &&
+ av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+ mbmi->bsize);
+ if (try_palette) {
+ uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map;
+ av1_rd_pick_palette_intra_sbuv(
+ cpi, x,
+ mode_costs->intra_uv_mode_cost[cfl_allowed][mbmi->mode][UV_DC_PRED],
+ best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly,
+ distortion, skippable);
+ }
+
+ *mbmi = best_mbmi;
+ // Make sure we actually chose a mode
+ assert(best_rd < INT64_MAX);
+ return best_rd;
+}
+
+// Searches palette mode for luma channel in inter frame.
+int av1_search_palette_mode(IntraModeSearchState *intra_search_state,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost,
+ int64_t best_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int rate2 = 0;
+ int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd;
+ int skippable = 0;
+ uint8_t *const best_palette_color_map =
+ x->palette_buffer->best_palette_color_map;
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ MB_MODE_INFO best_mbmi_palette = *mbmi;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *const intra_mode_cost =
+ mode_costs->mbmode_cost[size_group_lookup[bsize]];
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ av1_zero(pmi->palette_size);
+
+ RD_STATS rd_stats_y;
+ av1_invalid_rd_stats(&rd_stats_y);
+ av1_rd_pick_palette_intra_sby(cpi, x, bsize, intra_mode_cost[DC_PRED],
+ &best_mbmi_palette, best_palette_color_map,
+ &best_rd_palette, &rd_stats_y.rate, NULL,
+ &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL,
+ ctx, best_blk_skip, best_tx_type_map);
+ if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) {
+ this_rd_cost->rdcost = INT64_MAX;
+ return skippable;
+ }
+
+ memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+ memcpy(color_map, best_palette_color_map,
+ rows * cols * sizeof(best_palette_color_map[0]));
+
+ skippable = rd_stats_y.skip_txfm;
+ distortion2 = rd_stats_y.dist;
+ rate2 = rd_stats_y.rate + ref_frame_cost;
+ if (num_planes > 1) {
+ if (intra_search_state->rate_uv_intra == INT_MAX) {
+ // We have not found any good uv mode yet, so we need to search for it.
+ TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+ av1_rd_pick_intra_sbuv_mode(cpi, x, &intra_search_state->rate_uv_intra,
+ &intra_search_state->rate_uv_tokenonly,
+ &intra_search_state->dist_uvs,
+ &intra_search_state->skip_uvs, bsize, uv_tx);
+ intra_search_state->mode_uv = mbmi->uv_mode;
+ intra_search_state->pmi_uv = *pmi;
+ intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+ }
+
+    // At this point at least one good uv mode has been found, so copy its
+    // cached statistics over.
+ mbmi->uv_mode = intra_search_state->mode_uv;
+ pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1];
+ if (pmi->palette_size[1] > 0) {
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+ mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta;
+ skippable = skippable && intra_search_state->skip_uvs;
+ distortion2 += intra_search_state->dist_uvs;
+ rate2 += intra_search_state->rate_uv_intra;
+ }
+
+ if (skippable) {
+ rate2 -= rd_stats_y.rate;
+ if (num_planes > 1) rate2 -= intra_search_state->rate_uv_tokenonly;
+ rate2 += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][1];
+ } else {
+ rate2 += mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][0];
+ }
+ this_rd = RDCOST(x->rdmult, rate2, distortion2);
+ this_rd_cost->rate = rate2;
+ this_rd_cost->dist = distortion2;
+ this_rd_cost->rdcost = this_rd;
+ return skippable;
+}
+
+void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ PICK_MODE_CONTEXT *ctx,
+ RD_STATS *this_rd_cost, int64_t best_rd) {
+ MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int64_t best_rd_palette = best_rd, this_rd;
+ uint8_t *const best_palette_color_map =
+ x->palette_buffer->best_palette_color_map;
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ MB_MODE_INFO best_mbmi_palette = *mbmi;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *const intra_mode_cost =
+ mode_costs->mbmode_cost[size_group_lookup[bsize]];
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ av1_zero(pmi->palette_size);
+
+ RD_STATS rd_stats_y;
+ av1_invalid_rd_stats(&rd_stats_y);
+ av1_rd_pick_palette_intra_sby(cpi, x, bsize, intra_mode_cost[DC_PRED],
+ &best_mbmi_palette, best_palette_color_map,
+ &best_rd_palette, &rd_stats_y.rate, NULL,
+ &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL,
+ ctx, best_blk_skip, best_tx_type_map);
+ if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) {
+ this_rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+ memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+ memcpy(color_map, best_palette_color_map,
+ rows * cols * sizeof(best_palette_color_map[0]));
+
+ rd_stats_y.rate += ref_frame_cost;
+
+ if (rd_stats_y.skip_txfm) {
+ rd_stats_y.rate =
+ ref_frame_cost +
+ mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][1];
+ } else {
+ rd_stats_y.rate +=
+ mode_costs->skip_txfm_cost[av1_get_skip_txfm_context(xd)][0];
+ }
+ this_rd = RDCOST(x->rdmult, rd_stats_y.rate, rd_stats_y.dist);
+ this_rd_cost->rate = rd_stats_y.rate;
+ this_rd_cost->dist = rd_stats_y.dist;
+ this_rd_cost->rdcost = this_rd;
+ this_rd_cost->skip_txfm = rd_stats_y.skip_txfm;
+}
+
+/*!\brief Get the intra prediction by searching through tx_type and tx_size.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * Currently this function is only used in the intra frame code path for
+ * winner-mode processing.
+ *
+ * \return Returns whether the current mode is an improvement over best_rd.
+ */
+static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, const int *bmode_costs,
+ int64_t *best_rd, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable,
+ MB_MODE_INFO *best_mbmi,
+ PICK_MODE_CONTEXT *ctx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ RD_STATS rd_stats;
+  // To improve the txfm search, avoid rd based breakouts during winner mode
+  // evaluation. Hence ref_best_rd is passed as INT64_MAX by default when the
+  // speed feature use_rd_based_breakout_for_intra_tx_search is disabled.
+ int64_t ref_best_rd = cpi->sf.tx_sf.use_rd_based_breakout_for_intra_tx_search
+ ? *best_rd
+ : INT64_MAX;
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats, bsize, ref_best_rd);
+ if (rd_stats.rate == INT_MAX) return 0;
+ int this_rate_tokenonly = rd_stats.rate;
+ if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
+ // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size
+ // in the tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size);
+ }
+ const int this_rate =
+ rd_stats.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode], 0);
+ const int64_t this_rd = RDCOST(x->rdmult, this_rate, rd_stats.dist);
+ if (this_rd < *best_rd) {
+ *best_mbmi = *mbmi;
+ *best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = rd_stats.dist;
+ *skippable = rd_stats.skip_txfm;
+ av1_copy_array(ctx->blk_skip, x->txfm_search_info.blk_skip,
+ ctx->num_4x4_blk);
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ return 1;
+ }
+ return 0;
+}
+
+/*!\brief Search for the best filter_intra mode when coding inter frame.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function loops through all filter_intra modes to find the best one.
+ *
+ * \remark Returns nothing, but updates the mbmi and rd_stats.
+ */
+static INLINE void handle_filter_intra_mode(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize,
+ const PICK_MODE_CONTEXT *ctx,
+ RD_STATS *rd_stats_y, int mode_cost,
+ int64_t best_rd,
+ int64_t best_rd_so_far) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(mbmi->mode == DC_PRED &&
+ av1_filter_intra_allowed_bsize(&cpi->common, bsize));
+
+ RD_STATS rd_stats_y_fi;
+ int filter_intra_selected_flag = 0;
+ TX_SIZE best_tx_size = mbmi->tx_size;
+ FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ mbmi->filter_intra_mode_info.use_filter_intra = 1;
+ for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED; fi_mode < FILTER_INTRA_MODES;
+ ++fi_mode) {
+ mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode;
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y_fi, bsize, best_rd);
+ if (rd_stats_y_fi.rate == INT_MAX) continue;
+ const int this_rate_tmp =
+ rd_stats_y_fi.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0);
+ const int64_t this_rd_tmp =
+ RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist);
+
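+    // Stop the filter intra mode search early when the current rd cost is
+    // already more than twice best_rd; the remaining filter intra modes are
+    // unlikely to close that gap.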
+ if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > best_rd) {
+ break;
+ }
+ if (this_rd_tmp < best_rd_so_far) {
+ best_tx_size = mbmi->tx_size;
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+ best_fi_mode = fi_mode;
+ *rd_stats_y = rd_stats_y_fi;
+ filter_intra_selected_flag = 1;
+ best_rd_so_far = this_rd_tmp;
+ }
+ }
+
+ mbmi->tx_size = best_tx_size;
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+ memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+
+ if (filter_intra_selected_flag) {
+ mbmi->filter_intra_mode_info.use_filter_intra = 1;
+ mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode;
+ } else {
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ }
+}
+
+// Evaluate a given luma intra-mode in inter frames.
+int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y,
+ int64_t best_rd, int *mode_cost_y, int64_t *rd_y,
+ int64_t *best_model_rd,
+ int64_t top_intra_model_rd[]) {
+ const AV1_COMMON *cm = &cpi->common;
+ const INTRA_MODE_SPEED_FEATURES *const intra_sf = &cpi->sf.intra_sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(mbmi->ref_frame[0] == INTRA_FRAME);
+ const PREDICTION_MODE mode = mbmi->mode;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int mode_cost =
+ mode_costs->mbmode_cost[size_group_lookup[bsize]][mode] + ref_frame_cost;
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+
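+  // Compute a lower bound on the rate of this mode from the terms that are
+  // already known: the mode cost, the intra cost penalty (added for all modes
+  // except DC_PRED and PAETH_PRED) and the cheaper of the two skip_txfm
+  // signalling costs. With zero distortion, RDCOST() of this rate bounds the
+  // final rd cost from below, so the intra search can be terminated early if
+  // the bound already exceeds best_rd.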
+ int known_rate = mode_cost;
+ const int intra_cost_penalty = av1_get_intra_cost_penalty(
+ cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q,
+ cm->seq_params->bit_depth);
+
+ if (mode != DC_PRED && mode != PAETH_PRED) known_rate += intra_cost_penalty;
+ known_rate += AOMMIN(mode_costs->skip_txfm_cost[skip_ctx][0],
+ mode_costs->skip_txfm_cost[skip_ctx][1]);
+ const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0);
+ if (known_rd > best_rd) {
+ intra_search_state->skip_intra_modes = 1;
+ return 0;
+ }
+
+ const int is_directional_mode = av1_is_directional_mode(mode);
+ if (is_directional_mode && av1_use_angle_delta(bsize) &&
+ cpi->oxcf.intra_mode_cfg.enable_angle_delta) {
+ if (intra_sf->intra_pruning_with_hog &&
+ !intra_search_state->dir_mode_skip_mask_ready) {
+ const float thresh[4] = { -1.2f, 0.0f, 0.0f, 1.2f };
+ const int is_chroma = 0;
+ prune_intra_mode_with_hog(x, bsize, cm->seq_params->sb_size,
+ thresh[intra_sf->intra_pruning_with_hog - 1],
+ intra_search_state->directional_mode_skip_mask,
+ is_chroma);
+ intra_search_state->dir_mode_skip_mask_ready = 1;
+ }
+ if (intra_search_state->directional_mode_skip_mask[mode]) return 0;
+ }
+ const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+ const int64_t this_model_rd =
+ intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1);
+
+ const int model_rd_index_for_pruning =
+ get_model_rd_index_for_pruning(x, intra_sf);
+
+ if (prune_intra_y_mode(this_model_rd, best_model_rd, top_intra_model_rd,
+ intra_sf->top_intra_model_count_allowed,
+ model_rd_index_for_pruning))
+ return 0;
+ av1_init_rd_stats(rd_stats_y);
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd);
+
+ // Pick filter intra modes.
+ if (mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
+ int try_filter_intra = 1;
+ int64_t best_rd_so_far = INT64_MAX;
+ if (rd_stats_y->rate != INT_MAX) {
+ // best_rd_so_far is the rdcost of DC_PRED without using filter_intra.
+ // Later, in filter intra search, best_rd_so_far is used for comparison.
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ const int tmp_rate =
+ rd_stats_y->rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0);
+ best_rd_so_far = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist);
+ try_filter_intra = (best_rd_so_far / 2) <= best_rd;
+ } else if (intra_sf->skip_filter_intra_in_inter_frames >= 1) {
+      // As the rd cost of the luma intra DC mode exceeds best_rd (i.e.,
+      // rd_stats_y->rate == INT_MAX), skip the evaluation of filter intra
+      // modes.
+ try_filter_intra = 0;
+ }
+
+ if (try_filter_intra) {
+ handle_filter_intra_mode(cpi, x, bsize, ctx, rd_stats_y, mode_cost,
+ best_rd, best_rd_so_far);
+ }
+ }
+
+ if (rd_stats_y->rate == INT_MAX) return 0;
+
+ *mode_cost_y = intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost, 0);
+ const int rate_y = rd_stats_y->skip_txfm
+ ? mode_costs->skip_txfm_cost[skip_ctx][1]
+ : rd_stats_y->rate;
+ *rd_y = RDCOST(x->rdmult, rate_y + *mode_cost_y, rd_stats_y->dist);
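+  // Skip the remaining intra modes when the luma rd cost alone already
+  // exceeds best_rd by more than 25%. The INT64_MAX / 2 check guards the
+  // threshold computation against overflow.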
+ if (best_rd < (INT64_MAX / 2) && *rd_y > (best_rd + (best_rd >> 2))) {
+ intra_search_state->skip_intra_modes = 1;
+ return 0;
+ }
+
+ return 1;
+}
+
+int av1_search_intra_uv_modes_in_interframe(
+ IntraModeSearchState *intra_search_state, const AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats,
+ const RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int64_t best_rd) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(mbmi->ref_frame[0] == INTRA_FRAME);
+
+ // TODO(chiyotsai@google.com): Consolidate the chroma search code here with
+ // the one in av1_search_palette_mode.
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int try_palette =
+ cpi->oxcf.tool_cfg.enable_palette &&
+ av1_allow_palette(cm->features.allow_screen_content_tools, mbmi->bsize);
+
+ assert(intra_search_state->rate_uv_intra == INT_MAX);
+ if (intra_search_state->rate_uv_intra == INT_MAX) {
+ // If no good uv-predictor had been found, search for it.
+ const TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+ av1_rd_pick_intra_sbuv_mode(cpi, x, &intra_search_state->rate_uv_intra,
+ &intra_search_state->rate_uv_tokenonly,
+ &intra_search_state->dist_uvs,
+ &intra_search_state->skip_uvs, bsize, uv_tx);
+ intra_search_state->mode_uv = mbmi->uv_mode;
+ if (try_palette) intra_search_state->pmi_uv = *pmi;
+ intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+
+ const int uv_rate = intra_search_state->rate_uv_tokenonly;
+ const int64_t uv_dist = intra_search_state->dist_uvs;
+ const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist);
+ if (uv_rd > best_rd) {
+ // If there is no good intra uv-mode available, we can skip all intra
+ // modes.
+ intra_search_state->skip_intra_modes = 1;
+ return 0;
+ }
+ }
+
+ // If we are here, then the encoder has found at least one good intra uv
+ // predictor, so we can directly copy its statistics over.
+  // TODO(any): the stats here are not correct if the best uv mode is CFL but
+  // the best y mode is palette.
+ rd_stats_uv->rate = intra_search_state->rate_uv_tokenonly;
+ rd_stats_uv->dist = intra_search_state->dist_uvs;
+ rd_stats_uv->skip_txfm = intra_search_state->skip_uvs;
+ rd_stats->skip_txfm = rd_stats_y->skip_txfm && rd_stats_uv->skip_txfm;
+ mbmi->uv_mode = intra_search_state->mode_uv;
+ if (try_palette) {
+ pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1];
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+ mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta;
+
+ return 1;
+}
+
+// Checks if odd delta angles can be pruned based on rdcosts of even delta
+// angles of the corresponding directional mode.
+static AOM_INLINE int prune_luma_odd_delta_angles_using_rd_cost(
+ const MB_MODE_INFO *const mbmi, const int64_t *const intra_modes_rd_cost,
+ int64_t best_rd, int prune_luma_odd_delta_angles_in_intra) {
+ const int luma_delta_angle = mbmi->angle_delta[PLANE_TYPE_Y];
+ if (!prune_luma_odd_delta_angles_in_intra ||
+ !av1_is_directional_mode(mbmi->mode) || !(abs(luma_delta_angle) & 1) ||
+ best_rd == INT64_MAX)
+ return 0;
+
+ const int64_t rd_thresh = best_rd + (best_rd >> 3);
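+  // That is, rd_thresh = 1.125 * best_rd.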
+
+ // Neighbour rdcosts are considered for pruning of odd delta angles as
+ // mentioned below:
+ // Delta angle Delta angle rdcost
+ // to be pruned to be considered
+ // -3 -2
+ // -1 -2, 0
+ // 1 0, 2
+ // 3 2
+ return intra_modes_rd_cost[luma_delta_angle + MAX_ANGLE_DELTA] > rd_thresh &&
+ intra_modes_rd_cost[luma_delta_angle + MAX_ANGLE_DELTA + 2] >
+ rd_thresh;
+}
+
+// Finds the best non-intrabc mode on an intra frame.
+int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, uint8_t *skippable,
+ BLOCK_SIZE bsize, int64_t best_rd,
+ PICK_MODE_CONTEXT *ctx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ int64_t best_model_rd = INT64_MAX;
+ int is_directional_mode;
+ uint8_t directional_mode_skip_mask[INTRA_MODES] = { 0 };
+  // Flag indicating whether the rd of any intra mode evaluated here beats the
+  // best_rd passed to this function.
+ int beat_best_rd = 0;
+ const int *bmode_costs;
+ const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg;
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int try_palette =
+ cpi->oxcf.tool_cfg.enable_palette &&
+ av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+ mbmi->bsize);
+ uint8_t *best_palette_color_map =
+ try_palette ? x->palette_buffer->best_palette_color_map : NULL;
+ const MB_MODE_INFO *above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *left_mi = xd->left_mbmi;
+ const PREDICTION_MODE A = av1_above_block_mode(above_mi);
+ const PREDICTION_MODE L = av1_left_block_mode(left_mi);
+ const int above_ctx = intra_mode_context[A];
+ const int left_ctx = intra_mode_context[L];
+ bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx];
+
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ const INTRA_MODE_SPEED_FEATURES *const intra_sf = &cpi->sf.intra_sf;
+ if (intra_sf->intra_pruning_with_hog) {
+ // Less aggressive thresholds are used here than those used in inter frame
+ // encoding in av1_handle_intra_y_mode() because we want key frames/intra
+ // frames to have higher quality.
+ const float thresh[4] = { -1.2f, -1.2f, -0.6f, 0.4f };
+ const int is_chroma = 0;
+ prune_intra_mode_with_hog(x, bsize, cpi->common.seq_params->sb_size,
+ thresh[intra_sf->intra_pruning_with_hog - 1],
+ directional_mode_skip_mask, is_chroma);
+ }
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ pmi->palette_size[0] = 0;
+
+ // Set params for mode evaluation
+ set_mode_eval_params(cpi, x, MODE_EVAL);
+
+ MB_MODE_INFO best_mbmi = *mbmi;
+ const int max_winner_mode_count =
+ winner_mode_count_allowed[cpi->sf.winner_mode_sf.multi_winner_mode_type];
+ zero_winner_mode_stats(bsize, max_winner_mode_count, x->winner_mode_stats);
+ x->winner_mode_count = 0;
+
+ // Searches the intra-modes except for intrabc, palette, and filter_intra.
+ int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT];
+ for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) {
+ top_intra_model_rd[i] = INT64_MAX;
+ }
+
+ // Initialize the rdcost corresponding to all the directional and
+ // non-directional intra modes.
+ // 1. For directional modes, it stores the rdcost values for delta angles -4,
+ // -3, ..., 3, 4.
+ // 2. The rdcost value for luma_delta_angle is stored at index
+ // luma_delta_angle + MAX_ANGLE_DELTA + 1.
+ // 3. The rdcost values for fictitious/nonexistent luma_delta_angle -4 and 4
+ // (array indices 0 and 8) are always set to INT64_MAX (the initial value).
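+  // For example, with MAX_ANGLE_DELTA == 3, a delta angle of -3 is stored at
+  // index 1 and a delta angle of +3 at index 7.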
+ int64_t intra_modes_rd_cost[INTRA_MODE_END]
+ [SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY];
+ for (int i = 0; i < INTRA_MODE_END; i++) {
+ for (int j = 0; j < SIZE_OF_ANGLE_DELTA_RD_COST_ARRAY; j++) {
+ intra_modes_rd_cost[i][j] = INT64_MAX;
+ }
+ }
+
+ for (int mode_idx = INTRA_MODE_START; mode_idx < LUMA_MODE_COUNT;
+ ++mode_idx) {
+ set_y_mode_and_delta_angle(mode_idx, mbmi,
+ intra_sf->prune_luma_odd_delta_angles_in_intra);
+ RD_STATS this_rd_stats;
+ int this_rate, this_rate_tokenonly, s;
+ int is_diagonal_mode;
+ int64_t this_distortion, this_rd;
+ const int luma_delta_angle = mbmi->angle_delta[PLANE_TYPE_Y];
+
+ is_diagonal_mode = av1_is_diagonal_mode(mbmi->mode);
+ if (is_diagonal_mode && !intra_mode_cfg->enable_diagonal_intra) continue;
+ if (av1_is_directional_mode(mbmi->mode) &&
+ !intra_mode_cfg->enable_directional_intra)
+ continue;
+
+ // The smooth prediction mode appears to be more frequently picked
+ // than horizontal / vertical smooth prediction modes. Hence treat
+ // them differently in speed features.
+ if ((!intra_mode_cfg->enable_smooth_intra ||
+ intra_sf->disable_smooth_intra) &&
+ (mbmi->mode == SMOOTH_H_PRED || mbmi->mode == SMOOTH_V_PRED))
+ continue;
+ if (!intra_mode_cfg->enable_smooth_intra && mbmi->mode == SMOOTH_PRED)
+ continue;
+
+    // The functionalities of filter intra modes and smooth prediction
+    // overlap. Hence smooth prediction is pruned only if all the
+ // filter intra modes are enabled.
+ if (intra_sf->disable_smooth_intra &&
+ intra_sf->prune_filter_intra_level == 0 && mbmi->mode == SMOOTH_PRED)
+ continue;
+ if (!intra_mode_cfg->enable_paeth_intra && mbmi->mode == PAETH_PRED)
+ continue;
+
+ // Skip the evaluation of modes that do not match with the winner mode in
+ // x->mb_mode_cache.
+ if (x->use_mb_mode_cache && mbmi->mode != x->mb_mode_cache->mode) continue;
+
+ is_directional_mode = av1_is_directional_mode(mbmi->mode);
+ if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
+ if (is_directional_mode &&
+ !(av1_use_angle_delta(bsize) && intra_mode_cfg->enable_angle_delta) &&
+ luma_delta_angle != 0)
+ continue;
+
+ // Use intra_y_mode_mask speed feature to skip intra mode evaluation.
+ if (!(intra_sf->intra_y_mode_mask[max_txsize_lookup[bsize]] &
+ (1 << mbmi->mode)))
+ continue;
+
+ if (prune_luma_odd_delta_angles_using_rd_cost(
+ mbmi, intra_modes_rd_cost[mbmi->mode], best_rd,
+ intra_sf->prune_luma_odd_delta_angles_in_intra))
+ continue;
+
+ const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+ const int64_t this_model_rd =
+ intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1);
+
+ const int model_rd_index_for_pruning =
+ get_model_rd_index_for_pruning(x, intra_sf);
+
+ if (prune_intra_y_mode(this_model_rd, &best_model_rd, top_intra_model_rd,
+ intra_sf->top_intra_model_count_allowed,
+ model_rd_index_for_pruning))
+ continue;
+
+    // Builds the actual prediction. The model rd computed by intra_model_rd()
+    // above was just an estimate that did not take the effect of the txfm
+    // pipeline into account, so the prediction needs to be redone for real
+    // here.
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
+ this_rate_tokenonly = this_rd_stats.rate;
+ this_distortion = this_rd_stats.dist;
+ s = this_rd_stats.skip_txfm;
+
+ if (this_rate_tokenonly == INT_MAX) continue;
+
+ if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
+ // av1_pick_uniform_tx_size_type_yrd above includes the cost of the
+ // tx_size in the tokenonly rate, but for intra blocks, tx_size is always
+ // coded (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size);
+ }
+ this_rate =
+ this_rd_stats.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode], 0);
+ this_rd = RDCOST(x->rdmult, this_rate, this_distortion);
+
+ // Visual quality adjustment based on recon vs source variance.
+ if ((cpi->oxcf.mode == ALLINTRA) && (this_rd != INT64_MAX)) {
+ this_rd = (int64_t)(this_rd * intra_rd_variance_factor(cpi, x, bsize));
+ }
+
+ intra_modes_rd_cost[mbmi->mode][luma_delta_angle + MAX_ANGLE_DELTA + 1] =
+ this_rd;
+
+ // Collect mode stats for multiwinner mode processing
+ const int txfm_search_done = 1;
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+ if (this_rd < best_rd) {
+ best_mbmi = *mbmi;
+ best_rd = this_rd;
+      // Set the beat_best_rd flag because the current mode rd is better than
+      // the best_rd passed to this function.
+ beat_best_rd = 1;
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = this_distortion;
+ *skippable = s;
+ memcpy(ctx->blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ }
+ }
+
+ // Searches palette
+ if (try_palette) {
+ av1_rd_pick_palette_intra_sby(
+ cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map,
+ &best_rd, rate, rate_tokenonly, distortion, skippable, &beat_best_rd,
+ ctx, ctx->blk_skip, ctx->tx_type_map);
+ }
+
+ // Searches filter_intra
+ if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
+ if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
+ skippable, bsize, bmode_costs[DC_PRED],
+ best_mbmi.mode, &best_rd, &best_model_rd,
+ ctx)) {
+ best_mbmi = *mbmi;
+ }
+ }
+
+  // No mode was identified with a smaller rd value than the best_rd passed to
+  // this function. In this case winner mode processing is not necessary, so
+  // return INT64_MAX to indicate that no best mode was identified.
+ if (!beat_best_rd) return INT64_MAX;
+
+  // In multi-winner mode processing, perform the tx search for the few best
+  // modes identified during mode evaluation. Winner mode processing uses the
+  // best tx configuration for the tx search.
+ if (cpi->sf.winner_mode_sf.multi_winner_mode_type) {
+ int best_mode_idx = 0;
+ int block_width, block_height;
+ uint8_t *color_map_dst = xd->plane[PLANE_TYPE_Y].color_index_map;
+ av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width,
+ &block_height, NULL, NULL);
+
+ for (int mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++) {
+ *mbmi = x->winner_mode_stats[mode_idx].mbmi;
+ if (is_winner_mode_processing_enabled(cpi, x, mbmi, 0)) {
+ // Restore color_map of palette mode before winner mode processing
+ if (mbmi->palette_mode_info.palette_size[0] > 0) {
+ uint8_t *color_map_src =
+ x->winner_mode_stats[mode_idx].color_index_map;
+ memcpy(color_map_dst, color_map_src,
+ block_width * block_height * sizeof(*color_map_src));
+ }
+ // Set params for winner mode evaluation
+ set_mode_eval_params(cpi, x, WINNER_MODE_EVAL);
+
+ // Winner mode processing
+ // If previous searches use only the default tx type/no R-D optimization
+ // of quantized coeffs, do an extra search for the best tx type/better
+ // R-D optimization of quantized coeffs
+ if (intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate,
+ rate_tokenonly, distortion, skippable, &best_mbmi,
+ ctx))
+ best_mode_idx = mode_idx;
+ }
+ }
+ // Copy color_map of palette mode for final winner mode
+ if (best_mbmi.palette_mode_info.palette_size[0] > 0) {
+ uint8_t *color_map_src =
+ x->winner_mode_stats[best_mode_idx].color_index_map;
+ memcpy(color_map_dst, color_map_src,
+ block_width * block_height * sizeof(*color_map_src));
+ }
+ } else {
+ // If previous searches use only the default tx type/no R-D optimization of
+ // quantized coeffs, do an extra search for the best tx type/better R-D
+ // optimization of quantized coeffs
+ if (is_winner_mode_processing_enabled(cpi, x, mbmi, 0)) {
+ // Set params for winner mode evaluation
+ set_mode_eval_params(cpi, x, WINNER_MODE_EVAL);
+ *mbmi = best_mbmi;
+ intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate,
+ rate_tokenonly, distortion, skippable, &best_mbmi, ctx);
+ }
+ }
+ *mbmi = best_mbmi;
+ av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk);
+ return best_rd;
+}
diff --git a/third_party/aom/av1/encoder/intra_mode_search.h b/third_party/aom/av1/encoder/intra_mode_search.h
new file mode 100644
index 0000000000..75289c4e3c
--- /dev/null
+++ b/third_party/aom/av1/encoder/intra_mode_search.h
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Declares high level functions to search through intra modes.
+ */
+#ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
+#define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! \brief Variables related to intra-mode search during inter frame coding.
+ *
+ * \ingroup intra_mode_search
+ * This is a set of variables used during intra-mode search for inter frames.
+ * This includes a histogram of gradients used by the pruning speed features
+ * and a cache of the uv prediction to avoid repeatedly searching for the
+ * chroma prediction.
+ */
+typedef struct IntraModeSearchState {
+ /*!
+ * \brief The best luma intra-mode found so far
+ */
+ PREDICTION_MODE best_intra_mode;
+
+ /** \name Speed feature variables
+ * Variables to help with pruning some luma intra-modes during inter frame
+ * coding process.
+ */
+ /**@{*/
+ /*!
+ * \brief Whether to terminate all intra mode search.
+ */
+ int skip_intra_modes;
+ /*!
+ * \brief Whether a directional mode is pruned.
+ */
+ uint8_t directional_mode_skip_mask[INTRA_MODES];
+ /*!
+ * \brief Whether \ref directional_mode_skip_mask is valid for pruning.
+ */
+ int dir_mode_skip_mask_ready;
+ /**@}*/
+
+ /** \name Chroma mode search cache
+ * A cache of the best chroma prediction mode to avoid having to search for
+ * chroma predictions repeatedly in \ref
+ * av1_search_intra_uv_modes_in_interframe()
+ */
+ /**@{*/
+ int rate_uv_intra; /*!< \brief Total rate to transmit uv_mode */
+  int rate_uv_tokenonly; /*!< \brief Rate to transmit txfm tokens */
+ int64_t dist_uvs; /*!< \brief Distortion of the uv_mode's recon */
+ uint8_t skip_uvs; /*!< \brief Whether the uv txfm is skippable */
+ UV_PREDICTION_MODE mode_uv; /*!< \brief The best uv mode */
+ PALETTE_MODE_INFO pmi_uv; /*!< \brief Color map if mode_uv is palette */
+  int8_t uv_angle_delta; /*!< \brief Angle delta if mode_uv is directional */
+ /**@}*/
+} IntraModeSearchState;
+
+/*!\brief Evaluate a given luma intra-mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function handles an intra-mode luma prediction when the current frame
+ * is an inter frame. This is the intra-mode counterpart of handle_inter_mode.
+ * This function performs an intra luma prediction using the mode specified by
+ * x->e_mbd.mi[0]->mode. This function does *not* support palette mode
+ * prediction in the luma channel.
+ *
+ * \param[in,out] intra_search_state Structure holding the intra search state.
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in,out] x Pointer to structure holding all the
+ * data for the current macroblock.
+ * \param[in] bsize Current partition block size.
+ * \param[in] ref_frame_cost The entropy cost for signaling that the
+ * current ref frame is an intra frame.
+ * \param[in] ctx Structure to hold the number of 4x4 blks
+ * to copy tx_type and txfm_skip arrays.
+ * \param[out] rd_stats_y Struct to keep track of the current
+ * intra-mode's rd_stats (luma only).
+ * \param[in] best_rd Best RD seen for this block so far.
+ * \param[out] mode_cost_y The cost needed to signal the current
+ * intra mode.
+ * \param[out] rd_y The rdcost of the chosen mode.
+ * \param[in]    best_model_rd      Best model RD seen for this block so far.
+ * \param[in] top_intra_model_rd Top intra model RD seen for this
+ * block so far.
+ *
+ * \return Returns 1 if a valid intra mode is found, 0 otherwise.
+ * The corresponding values in x->e_mbd.mi[0], rd_stats_y, mode_cost_y, and
+ * rd_y are also updated. Moreover, in the first evaluation with a directional
+ * mode, a prune mask computed from the histogram of gradients is also stored
+ * in intra_search_state.
+ */
+int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y,
+ int64_t best_rd, int *mode_cost_y, int64_t *rd_y,
+ int64_t *best_model_rd,
+ int64_t top_intra_model_rd[]);
+
+/*!\brief Search through all chroma intra-modes for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function handles intra-mode chroma prediction when the current frame
+ * is an inter frame. This is done by calling \ref av1_rd_pick_intra_sbuv_mode
+ * with some additional book-keeping.
+ *
+ * \param[in,out] intra_search_state Structure holding the intra search state.
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in,out] x Pointer to structure holding all the
+ * data for the current macroblock.
+ * \param[in] bsize Current partition block size.
+ * \param[out] rd_stats Struct to keep track of the current
+ * intra-mode's rd_stats (all planes).
+ * \param[out] rd_stats_y Struct to keep track of the current
+ * intra-mode's rd_stats (luma only).
+ * \param[out] rd_stats_uv Struct to keep track of the current
+ * intra-mode's rd_stats (chroma only).
+ * \param[in] best_rd Best RD seen for this block so far.
+ *
+ * \return Returns 1 if a valid intra mode is found, 0 otherwise.
+ * The corresponding values in x->e_mbd.mi[0], rd_stats(_y|_uv) are also
+ * updated. Moreover, in the first invocation of the function, the chroma intra
+ * mode result is cached in intra_search_state to be used in subsequent calls.
+ */
+int av1_search_intra_uv_modes_in_interframe(
+ IntraModeSearchState *intra_search_state, const AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize, RD_STATS *rd_stats,
+ const RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, int64_t best_rd);
+
+/*!\brief Evaluate luma palette mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function handles luma palette mode when the current frame is an
+ * inter frame.
+ *
+ * \param[in] intra_search_state Structure to hold the best luma intra mode
+ * and cache chroma prediction for speed up.
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] x Pointer to structure holding all the data
+ * for the current macroblock.
+ * \param[in] bsize Current partition block size.
+ * \param[in] ref_frame_cost The entropy cost for signaling that the
+ * current ref frame is an intra frame.
+ * \param[in] ctx Structure to hold the number of 4x4 blks to
+ * copy the tx_type and txfm_skip arrays.
+ * \param[in] this_rd_cost Struct to keep track of palette mode's
+ * rd_stats.
+ * \param[in] best_rd Best RD seen for this block so far.
+ *
+ * \return Returns whether luma palette mode can skip the txfm. The
+ * corresponding mbmi, this_rd_cost, intra_search_state, and tx_type arrays in
+ * ctx are also updated.
+ */
+int av1_search_palette_mode(IntraModeSearchState *intra_search_state,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ PICK_MODE_CONTEXT *ctx, RD_STATS *this_rd_cost,
+ int64_t best_rd);
+
+/*!\brief Evaluate luma palette mode for inter frames.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function handles luma palette mode when the current frame is an
+ * inter frame.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] x Pointer to structure holding all the data
+ * for the current macroblock.
+ * \param[in] bsize Current partition block size.
+ * \param[in] ref_frame_cost The entropy cost for signaling that the
+ * current ref frame is an intra frame.
+ * \param[in] ctx Structure to hold the number of 4x4 blks to
+ * copy the tx_type and txfm_skip arrays.
+ * \param[in] this_rd_cost Struct to keep track of palette mode's
+ * rd_stats.
+ * \param[in] best_rd Best RD seen for this block so far.
+ */
+void av1_search_palette_mode_luma(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int ref_frame_cost,
+ PICK_MODE_CONTEXT *ctx,
+ RD_STATS *this_rd_cost, int64_t best_rd);
+
+/*!\brief Perform intra-mode search on luma channels for intra frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This function performs intra-mode search on the luma channel when the
+ * current frame is intra-only. This function does not search intrabc mode,
+ * but it does search palette and filter_intra.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] x Pointer to structure holding all the data
+ * for the current macroblock.
+ * \param[out]   rate               The total rate needed to predict the current
+ *                                  luma block.
+ * \param[out]   rate_tokenonly     The rate without the cost of sending the
+ *                                  prediction modes.
+ * \param[out]   distortion         The distortion of the best luma prediction
+ *                                  after the reconstruction.
+ * \param[out]   skippable          Whether we can skip the txfm process.
+ * \param[in] bsize Current partition block size.
+ * \param[in] best_rd Best RD seen for this block so far.
+ * \param[in] ctx Structure to hold the number of 4x4 blks to
+ * copy the tx_type and txfm_skip arrays.
+ *
+ * \return Returns the rd_cost if this function finds a mode better than
+ * best_rd, otherwise returns INT64_MAX. This also updates the mbmi, the rate
+ * and distortion, and the tx_type arrays in ctx.
+ */
+int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, uint8_t *skippable,
+ BLOCK_SIZE bsize, int64_t best_rd,
+ PICK_MODE_CONTEXT *ctx);
+
+/*!\brief Perform intra-mode search on chroma channels.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * \callgraph
+ * This function performs intra-mode search on the chroma channels. Just like
+ * \ref av1_rd_pick_intra_sby_mode(), this function searches over palette mode
+ * (filter_intra is not available on chroma planes). Unlike \ref
+ * av1_rd_pick_intra_sby_mode() this function is used by both inter and intra
+ * frames.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] x Pointer to structure holding all the data
+ * for the current macroblock.
+ * \param[out]   rate               The total rate needed to predict the current
+ *                                  chroma block.
+ * \param[out]   rate_tokenonly     The rate without the cost of sending the
+ *                                  prediction modes.
+ * \param[out]   distortion         The chroma distortion of the best prediction
+ *                                  after the reconstruction.
+ * \param[out]   skippable          Whether we can skip the txfm process.
+ * \param[in] bsize Current partition block size.
+ * \param[in] max_tx_size The maximum tx_size available
+ *
+ * \return Returns the rd_cost of the best uv mode found. This also updates the
+ * mbmi, the rate, and the distortion.
+ */
+int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, uint8_t *skippable,
+ BLOCK_SIZE bsize, TX_SIZE max_tx_size);
+
+/*! \brief Return the number of colors in src. Used by palette mode.
+ */
+void av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+ int *val_count, int *num_colors);
+
+/*! \brief See \ref av1_count_colors(), but for highbd.
+ */
+void av1_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+ int cols, int bit_depth, int *val_count,
+ int *val_count_8bit, int *num_color_bins,
+ int *num_colors);
+
+/*! \brief Initializes the \ref IntraModeSearchState struct.
+ */
+static AOM_INLINE void init_intra_mode_search_state(
+ IntraModeSearchState *intra_search_state) {
+ memset(intra_search_state, 0, sizeof(*intra_search_state));
+ intra_search_state->rate_uv_intra = INT_MAX;
+}
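+
+/* A minimal usage sketch (illustrative only; the surrounding caller code is
+ * hypothetical):
+ *
+ *   IntraModeSearchState intra_search_state;
+ *   init_intra_mode_search_state(&intra_search_state);
+ *   // rate_uv_intra is now INT_MAX, so the first chroma search will run and
+ *   // populate the uv cache; later calls reuse the cached result.
+ */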
+
+/*! \brief Sets the luma intra mode and delta angles for a given mode index.
+ * The total number of luma intra modes is LUMA_MODE_COUNT = 61.
+ * The first 13 modes are from DC_PRED to PAETH_PRED, followed by directional
+ * modes. Each of the 8 main directional modes has 6 = MAX_ANGLE_DELTA * 2
+ * nonzero delta angles.
+ * \param[in] mode_idx mode index in intra mode decision
+ * process.
+ * \param[in] mbmi Pointer to structure holding the mode
+ * info for the current macroblock.
+ * \param[in] reorder_delta_angle_eval Indicates whether to reorder the
+ * evaluation of delta angle modes.
+ */
+void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi,
+ int reorder_delta_angle_eval);
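+// Illustrative mapping (a sketch; the exact delta angle order depends on
+// reorder_delta_angle_eval): mode_idx 0..12 cover DC_PRED through PAETH_PRED
+// with a zero delta angle, and each subsequent group of 6 indices covers the
+// nonzero delta angles of one directional mode.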
+
+/*! \brief Prunes the given luma intra mode based on the model rd.
+ * \param[in] this_model_rd model rd for current mode.
+ * \param[in] best_model_rd Best model RD seen for this block so
+ * far.
+ * \param[in] top_intra_model_rd Top intra model RD seen for this
+ * block so far.
+ * \param[in] max_model_cnt_allowed The maximum number of top intra
+ * model RD allowed.
+ * \param[in] model_rd_index_for_pruning Index of the candidate used for
+ * pruning based on model rd.
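+ *
+ * \return Returns 1 if the given luma intra mode can be pruned based on the
+ *         model rd, 0 otherwise.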
+ */
+int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd,
+ int64_t top_intra_model_rd[], int max_model_cnt_allowed,
+ int model_rd_index_for_pruning);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/intra_mode_search_utils.h b/third_party/aom/av1/encoder/intra_mode_search_utils.h
new file mode 100644
index 0000000000..107c2236f8
--- /dev/null
+++ b/third_party/aom/av1/encoder/intra_mode_search_utils.h
@@ -0,0 +1,690 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Defines utility functions used in intra mode search.
+ *
+ * This includes rdcost estimations, histogram based pruning, etc.
+ */
+#ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
+#define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
+
+#include "av1/common/enums.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+// Macro for computing the speed-preset dependent threshold which is used for
+// deciding whether to enable/disable variance calculations in
+// intra_rd_variance_factor().
+#define INTRA_RD_VAR_THRESH(X) (1.0 - (0.25 * (X)))
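+// For example, INTRA_RD_VAR_THRESH(0) evaluates to 1.0 and
+// INTRA_RD_VAR_THRESH(2) to 0.5, so higher speed presets use a lower
+// threshold.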
+
+#define BINS 32
+static const float av1_intra_hog_model_bias[DIRECTIONAL_MODES] = {
+ 0.450578f, 0.695518f, -0.717944f, -0.639894f,
+ -0.602019f, -0.453454f, 0.055857f, -0.465480f,
+};
+
+static const float av1_intra_hog_model_weights[BINS * DIRECTIONAL_MODES] = {
+ -3.076402f, -3.757063f, -3.275266f, -3.180665f, -3.452105f, -3.216593f,
+ -2.871212f, -3.134296f, -1.822324f, -2.401411f, -1.541016f, -1.195322f,
+ -0.434156f, 0.322868f, 2.260546f, 3.368715f, 3.989290f, 3.308487f,
+ 2.277893f, 0.923793f, 0.026412f, -0.385174f, -0.718622f, -1.408867f,
+ -1.050558f, -2.323941f, -2.225827f, -2.585453f, -3.054283f, -2.875087f,
+ -2.985709f, -3.447155f, 3.758139f, 3.204353f, 2.170998f, 0.826587f,
+ -0.269665f, -0.702068f, -1.085776f, -2.175249f, -1.623180f, -2.975142f,
+ -2.779629f, -3.190799f, -3.521900f, -3.375480f, -3.319355f, -3.897389f,
+ -3.172334f, -3.594528f, -2.879132f, -2.547777f, -2.921023f, -2.281844f,
+ -1.818988f, -2.041771f, -0.618268f, -1.396458f, -0.567153f, -0.285868f,
+ -0.088058f, 0.753494f, 2.092413f, 3.215266f, -3.300277f, -2.748658f,
+ -2.315784f, -2.423671f, -2.257283f, -2.269583f, -2.196660f, -2.301076f,
+ -2.646516f, -2.271319f, -2.254366f, -2.300102f, -2.217960f, -2.473300f,
+ -2.116866f, -2.528246f, -3.314712f, -1.701010f, -0.589040f, -0.088077f,
+ 0.813112f, 1.702213f, 2.653045f, 3.351749f, 3.243554f, 3.199409f,
+ 2.437856f, 1.468854f, 0.533039f, -0.099065f, -0.622643f, -2.200732f,
+ -4.228861f, -2.875263f, -1.273956f, -0.433280f, 0.803771f, 1.975043f,
+ 3.179528f, 3.939064f, 3.454379f, 3.689386f, 3.116411f, 1.970991f,
+ 0.798406f, -0.628514f, -1.252546f, -2.825176f, -4.090178f, -3.777448f,
+ -3.227314f, -3.479403f, -3.320569f, -3.159372f, -2.729202f, -2.722341f,
+ -3.054913f, -2.742923f, -2.612703f, -2.662632f, -2.907314f, -3.117794f,
+ -3.102660f, -3.970972f, -4.891357f, -3.935582f, -3.347758f, -2.721924f,
+ -2.219011f, -1.702391f, -0.866529f, -0.153743f, 0.107733f, 1.416882f,
+ 2.572884f, 3.607755f, 3.974820f, 3.997783f, 2.970459f, 0.791687f,
+ -1.478921f, -1.228154f, -1.216955f, -1.765932f, -1.951003f, -1.985301f,
+ -1.975881f, -1.985593f, -2.422371f, -2.419978f, -2.531288f, -2.951853f,
+ -3.071380f, -3.277027f, -3.373539f, -4.462010f, -0.967888f, 0.805524f,
+ 2.794130f, 3.685984f, 3.745195f, 3.252444f, 2.316108f, 1.399146f,
+ -0.136519f, -0.162811f, -1.004357f, -1.667911f, -1.964662f, -2.937579f,
+ -3.019533f, -3.942766f, -5.102767f, -3.882073f, -3.532027f, -3.451956f,
+ -2.944015f, -2.643064f, -2.529872f, -2.077290f, -2.809965f, -1.803734f,
+ -1.783593f, -1.662585f, -1.415484f, -1.392673f, -0.788794f, -1.204819f,
+ -1.998864f, -1.182102f, -0.892110f, -1.317415f, -1.359112f, -1.522867f,
+ -1.468552f, -1.779072f, -2.332959f, -2.160346f, -2.329387f, -2.631259f,
+ -2.744936f, -3.052494f, -2.787363f, -3.442548f, -4.245075f, -3.032172f,
+ -2.061609f, -1.768116f, -1.286072f, -0.706587f, -0.192413f, 0.386938f,
+ 0.716997f, 1.481393f, 2.216702f, 2.737986f, 3.109809f, 3.226084f,
+ 2.490098f, -0.095827f, -3.864816f, -3.507248f, -3.128925f, -2.908251f,
+ -2.883836f, -2.881411f, -2.524377f, -2.624478f, -2.399573f, -2.367718f,
+ -1.918255f, -1.926277f, -1.694584f, -1.723790f, -0.966491f, -1.183115f,
+ -1.430687f, 0.872896f, 2.766550f, 3.610080f, 3.578041f, 3.334928f,
+ 2.586680f, 1.895721f, 1.122195f, 0.488519f, -0.140689f, -0.799076f,
+ -1.222860f, -1.502437f, -1.900969f, -3.206816f,
+};
+
+static const NN_CONFIG av1_intra_hog_model_nnconfig = {
+ BINS, // num_inputs
+ DIRECTIONAL_MODES, // num_outputs
+ 0, // num_hidden_layers
+ { 0 },
+ {
+ av1_intra_hog_model_weights,
+ },
+ {
+ av1_intra_hog_model_bias,
+ },
+};
+
+#define FIX_PREC_BITS (16)
+static AOM_INLINE int get_hist_bin_idx(int dx, int dy) {
+ const int32_t ratio = (dy * (1 << FIX_PREC_BITS)) / dx;
+
+ // Find index by bisection
+ static const int thresholds[BINS] = {
+ -1334015, -441798, -261605, -183158, -138560, -109331, -88359, -72303,
+ -59392, -48579, -39272, -30982, -23445, -16400, -9715, -3194,
+ 3227, 9748, 16433, 23478, 31015, 39305, 48611, 59425,
+ 72336, 88392, 109364, 138593, 183191, 261638, 441831, INT32_MAX
+ };
+
+ int lo_idx = 0, hi_idx = BINS - 1;
+  // Dividing into segments of size 8 gives better performance than a binary
+  // search here.
+ if (ratio <= thresholds[7]) {
+ lo_idx = 0;
+ hi_idx = 7;
+ } else if (ratio <= thresholds[15]) {
+ lo_idx = 8;
+ hi_idx = 15;
+ } else if (ratio <= thresholds[23]) {
+ lo_idx = 16;
+ hi_idx = 23;
+ } else {
+ lo_idx = 24;
+ hi_idx = 31;
+ }
+
+ for (int idx = lo_idx; idx <= hi_idx; idx++) {
+ if (ratio <= thresholds[idx]) {
+ return idx;
+ }
+ }
+ assert(0 && "No valid histogram bin found!");
+ return BINS - 1;
+}
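+
+// Worked example: a 45 degree gradient with dx == 1 and dy == 1 gives
+// ratio = 1 << FIX_PREC_BITS == 65536, which lies between thresholds[23]
+// (59425) and thresholds[24] (72336), so the sample falls into bin 24.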
+#undef FIX_PREC_BITS
+
+// Normalizes the hog data.
+static AOM_INLINE void normalize_hog(float total, float *hist) {
+ for (int i = 0; i < BINS; ++i) hist[i] /= total;
+}
+
+static AOM_INLINE void lowbd_generate_hog(const uint8_t *src, int stride,
+ int rows, int cols, float *hist) {
+ float total = 0.1f;
+ src += stride;
+ for (int r = 1; r < rows - 1; ++r) {
+ for (int c = 1; c < cols - 1; ++c) {
+ const uint8_t *above = &src[c - stride];
+ const uint8_t *below = &src[c + stride];
+ const uint8_t *left = &src[c - 1];
+ const uint8_t *right = &src[c + 1];
+ // Calculate gradient using Sobel filters.
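+      // The 3x3 kernels are [-1 0 +1; -2 0 +2; -1 0 +1] for dx and
+      // [-1 -2 -1; 0 0 0; +1 +2 +1] for dy.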
+ const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+ (left[-stride] + 2 * left[0] + left[stride]);
+ const int dy = (below[-1] + 2 * below[0] + below[1]) -
+ (above[-1] + 2 * above[0] + above[1]);
+ if (dx == 0 && dy == 0) continue;
+ const int temp = abs(dx) + abs(dy);
+ if (!temp) continue;
+ total += temp;
+ if (dx == 0) {
+ hist[0] += temp / 2;
+ hist[BINS - 1] += temp / 2;
+ } else {
+ const int idx = get_hist_bin_idx(dx, dy);
+ assert(idx >= 0 && idx < BINS);
+ hist[idx] += temp;
+ }
+ }
+ src += stride;
+ }
+
+ normalize_hog(total, hist);
+}
+
+// Computes and stores pixel level gradient information of a given superblock
+// for LBD encode.
+static AOM_INLINE void lowbd_compute_gradient_info_sb(MACROBLOCK *const x,
+ BLOCK_SIZE sb_size,
+ PLANE_TYPE plane) {
+ PixelLevelGradientInfo *const grad_info_sb =
+ x->pixel_gradient_info + plane * MAX_SB_SQUARE;
+ const uint8_t *src = x->plane[plane].src.buf;
+ const int stride = x->plane[plane].src.stride;
+ const int ss_x = x->e_mbd.plane[plane].subsampling_x;
+ const int ss_y = x->e_mbd.plane[plane].subsampling_y;
+ const int sb_height = block_size_high[sb_size] >> ss_y;
+ const int sb_width = block_size_wide[sb_size] >> ss_x;
+ src += stride;
+ for (int r = 1; r < sb_height - 1; ++r) {
+ for (int c = 1; c < sb_width - 1; ++c) {
+ const uint8_t *above = &src[c - stride];
+ const uint8_t *below = &src[c + stride];
+ const uint8_t *left = &src[c - 1];
+ const uint8_t *right = &src[c + 1];
+ // Calculate gradient using Sobel filters.
+ const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+ (left[-stride] + 2 * left[0] + left[stride]);
+ const int dy = (below[-1] + 2 * below[0] + below[1]) -
+ (above[-1] + 2 * above[0] + above[1]);
+ grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0);
+ grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum =
+ (uint16_t)(abs(dx) + abs(dy));
+ grad_info_sb[r * sb_width + c].hist_bin_idx =
+ (dx != 0) ? get_hist_bin_idx(dx, dy) : -1;
+ }
+ src += stride;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static AOM_INLINE void highbd_generate_hog(const uint8_t *src8, int stride,
+ int rows, int cols, float *hist) {
+ float total = 0.1f;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ src += stride;
+ for (int r = 1; r < rows - 1; ++r) {
+ for (int c = 1; c < cols - 1; ++c) {
+ const uint16_t *above = &src[c - stride];
+ const uint16_t *below = &src[c + stride];
+ const uint16_t *left = &src[c - 1];
+ const uint16_t *right = &src[c + 1];
+ // Calculate gradient using Sobel filters.
+ const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+ (left[-stride] + 2 * left[0] + left[stride]);
+ const int dy = (below[-1] + 2 * below[0] + below[1]) -
+ (above[-1] + 2 * above[0] + above[1]);
+ if (dx == 0 && dy == 0) continue;
+ const int temp = abs(dx) + abs(dy);
+ if (!temp) continue;
+ total += temp;
+ if (dx == 0) {
+ hist[0] += temp / 2;
+ hist[BINS - 1] += temp / 2;
+ } else {
+ const int idx = get_hist_bin_idx(dx, dy);
+ assert(idx >= 0 && idx < BINS);
+ hist[idx] += temp;
+ }
+ }
+ src += stride;
+ }
+
+ normalize_hog(total, hist);
+}
+
+// Computes and stores pixel level gradient information of a given superblock
+// for HBD encode.
+static AOM_INLINE void highbd_compute_gradient_info_sb(MACROBLOCK *const x,
+ BLOCK_SIZE sb_size,
+ PLANE_TYPE plane) {
+ PixelLevelGradientInfo *const grad_info_sb =
+ x->pixel_gradient_info + plane * MAX_SB_SQUARE;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[plane].src.buf);
+ const int stride = x->plane[plane].src.stride;
+ const int ss_x = x->e_mbd.plane[plane].subsampling_x;
+ const int ss_y = x->e_mbd.plane[plane].subsampling_y;
+ const int sb_height = block_size_high[sb_size] >> ss_y;
+ const int sb_width = block_size_wide[sb_size] >> ss_x;
+ src += stride;
+ for (int r = 1; r < sb_height - 1; ++r) {
+ for (int c = 1; c < sb_width - 1; ++c) {
+ const uint16_t *above = &src[c - stride];
+ const uint16_t *below = &src[c + stride];
+ const uint16_t *left = &src[c - 1];
+ const uint16_t *right = &src[c + 1];
+ // Calculate gradient using Sobel filters.
+ const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+ (left[-stride] + 2 * left[0] + left[stride]);
+ const int dy = (below[-1] + 2 * below[0] + below[1]) -
+ (above[-1] + 2 * above[0] + above[1]);
+ grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0);
+ grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum =
+ (uint16_t)(abs(dx) + abs(dy));
+ grad_info_sb[r * sb_width + c].hist_bin_idx =
+ (dx != 0) ? get_hist_bin_idx(dx, dy) : -1;
+ }
+ src += stride;
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static AOM_INLINE void generate_hog(const uint8_t *src8, int stride, int rows,
+ int cols, float *hist, int highbd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (highbd) {
+ highbd_generate_hog(src8, stride, rows, cols, hist);
+ return;
+ }
+#else
+ (void)highbd;
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ lowbd_generate_hog(src8, stride, rows, cols, hist);
+}
+
+static AOM_INLINE void compute_gradient_info_sb(MACROBLOCK *const x,
+ BLOCK_SIZE sb_size,
+ PLANE_TYPE plane) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(&x->e_mbd)) {
+ highbd_compute_gradient_info_sb(x, sb_size, plane);
+ return;
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ lowbd_compute_gradient_info_sb(x, sb_size, plane);
+}
+
+// Gradient caching at superblock level is allowed only if all of the following
+// conditions are satisfied:
+// (1) The current frame is an intra only frame
+// (2) Non-RD mode decisions are not enabled
+// (3) The sf partition_search_type is set to SEARCH_PARTITION
+// (4) Either intra_pruning_with_hog or chroma_intra_pruning_with_hog is enabled
+//
+// SB level caching of gradient data may not help in speedup for the following
+// cases:
+// (1) Inter frames (due to early intra gating)
+// (2) When partition_search_type is not SEARCH_PARTITION
+// Hence, gradient data is computed at block level in such cases.
+static AOM_INLINE bool is_gradient_caching_for_hog_enabled(
+ const AV1_COMP *const cpi) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ return frame_is_intra_only(&cpi->common) && !sf->rt_sf.use_nonrd_pick_mode &&
+ (sf->part_sf.partition_search_type == SEARCH_PARTITION) &&
+ (sf->intra_sf.intra_pruning_with_hog ||
+ sf->intra_sf.chroma_intra_pruning_with_hog);
+}
+
+// Generates pixel level gradient information for a given superblock. Sets the
+// 'is_sb_gradient_cached' flag for a plane type when its gradient info has
+// been generated.
+static AOM_INLINE void produce_gradients_for_sb(AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE sb_size, int mi_row,
+ int mi_col) {
+ // Initialise flags related to hog data caching.
+ x->is_sb_gradient_cached[PLANE_TYPE_Y] = false;
+ x->is_sb_gradient_cached[PLANE_TYPE_UV] = false;
+ if (!is_gradient_caching_for_hog_enabled(cpi)) return;
+
+ const SPEED_FEATURES *sf = &cpi->sf;
+ const int num_planes = av1_num_planes(&cpi->common);
+
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size);
+
+ if (sf->intra_sf.intra_pruning_with_hog) {
+ compute_gradient_info_sb(x, sb_size, PLANE_TYPE_Y);
+ x->is_sb_gradient_cached[PLANE_TYPE_Y] = true;
+ }
+ if (sf->intra_sf.chroma_intra_pruning_with_hog && num_planes > 1) {
+ compute_gradient_info_sb(x, sb_size, PLANE_TYPE_UV);
+ x->is_sb_gradient_cached[PLANE_TYPE_UV] = true;
+ }
+}
+
+// Reuses the pixel level gradient data generated at superblock level for block
+// level histogram computation.
+static AOM_INLINE void generate_hog_using_gradient_cache(const MACROBLOCK *x,
+ int rows, int cols,
+ BLOCK_SIZE sb_size,
+ PLANE_TYPE plane,
+ float *hist) {
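+  // Seed the total with a small non-zero value so that the normalization at
+  // the end of this function never divides by zero when the block has no
+  // gradient energy.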
+ float total = 0.1f;
+ const int ss_x = x->e_mbd.plane[plane].subsampling_x;
+ const int ss_y = x->e_mbd.plane[plane].subsampling_y;
+ const int sb_width = block_size_wide[sb_size] >> ss_x;
+
+  // Derive the offset from the start of the superblock in order to locate
+  // the block level gradient data in the cache.
+ const int mi_row_in_sb = x->e_mbd.mi_row & (mi_size_high[sb_size] - 1);
+ const int mi_col_in_sb = x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1);
+ const int block_offset_in_grad_cache =
+ sb_width * (mi_row_in_sb << (MI_SIZE_LOG2 - ss_y)) +
+ (mi_col_in_sb << (MI_SIZE_LOG2 - ss_x));
+ const PixelLevelGradientInfo *grad_info_blk = x->pixel_gradient_info +
+ plane * MAX_SB_SQUARE +
+ block_offset_in_grad_cache;
+
+ // Retrieve the cached gradient information and generate the histogram.
+ for (int r = 1; r < rows - 1; ++r) {
+ for (int c = 1; c < cols - 1; ++c) {
+ const uint16_t abs_dx_abs_dy_sum =
+ grad_info_blk[r * sb_width + c].abs_dx_abs_dy_sum;
+ if (!abs_dx_abs_dy_sum) continue;
+ total += abs_dx_abs_dy_sum;
+ const bool is_dx_zero = grad_info_blk[r * sb_width + c].is_dx_zero;
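+      // A zero dx means the gradient angle is +/-90 degrees, the two ends of
+      // the orientation range, so the weight is split evenly between the
+      // first and last bins.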
+ if (is_dx_zero) {
+ hist[0] += abs_dx_abs_dy_sum >> 1;
+ hist[BINS - 1] += abs_dx_abs_dy_sum >> 1;
+ } else {
+ const int8_t idx = grad_info_blk[r * sb_width + c].hist_bin_idx;
+ assert(idx >= 0 && idx < BINS);
+ hist[idx] += abs_dx_abs_dy_sum;
+ }
+ }
+ }
+ normalize_hog(total, hist);
+}
+
+static INLINE void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize,
+ BLOCK_SIZE sb_size, int plane, float *hog) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const int bh = block_size_high[bsize];
+ const int bw = block_size_wide[bsize];
+ const int rows =
+ ((xd->mb_to_bottom_edge >= 0) ? bh : (xd->mb_to_bottom_edge >> 3) + bh) >>
+ ss_y;
+ const int cols =
+ ((xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw) >>
+ ss_x;
+
+ // If gradient data is already generated at SB level, reuse the cached data.
+ // Otherwise, compute the data.
+ if (x->is_sb_gradient_cached[plane]) {
+ generate_hog_using_gradient_cache(x, rows, cols, sb_size, plane, hog);
+ } else {
+ const uint8_t *src = x->plane[plane].src.buf;
+ const int src_stride = x->plane[plane].src.stride;
+ generate_hog(src, src_stride, rows, cols, hog, is_cur_buf_hbd(xd));
+ }
+
+  // Scale the hog so that luma and chroma are on the same scale.
+ for (int b = 0; b < BINS; ++b) {
+ hog[b] *= (1 + ss_x) * (1 + ss_y);
+ }
+}
+
+static AOM_INLINE void prune_intra_mode_with_hog(
+ const MACROBLOCK *x, BLOCK_SIZE bsize, BLOCK_SIZE sb_size, float th,
+ uint8_t *directional_mode_skip_mask, int is_chroma) {
+ const int plane = is_chroma ? AOM_PLANE_U : AOM_PLANE_Y;
+ float hist[BINS] = { 0.0f };
+ collect_hog_data(x, bsize, sb_size, plane, hist);
+
+  // Make a prediction for each directional mode.
+ float scores[DIRECTIONAL_MODES] = { 0.0f };
+ av1_nn_predict(hist, &av1_intra_hog_model_nnconfig, 1, scores);
+ for (UV_PREDICTION_MODE uv_mode = UV_V_PRED; uv_mode <= UV_D67_PRED;
+ uv_mode++) {
+ if (scores[uv_mode - UV_V_PRED] <= th) {
+ directional_mode_skip_mask[uv_mode] = 1;
+ }
+ }
+}
+#undef BINS
+
+int av1_calc_normalized_variance(aom_variance_fn_t vf, const uint8_t *const buf,
+ const int stride, const int is_hbd);
+
+// Returns whether caching of source variance for 4x4 sub-blocks is allowed.
+static AOM_INLINE bool is_src_var_for_4x4_sub_blocks_caching_enabled(
+ const AV1_COMP *const cpi) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ if (cpi->oxcf.mode != ALLINTRA) return false;
+
+ if (sf->part_sf.partition_search_type == SEARCH_PARTITION) return true;
+
+ if (INTRA_RD_VAR_THRESH(cpi->oxcf.speed) <= 0 ||
+ (sf->rt_sf.use_nonrd_pick_mode && !sf->rt_sf.hybrid_intra_pickmode))
+ return false;
+
+ return true;
+}
+
+// Initialize the members of the Block4x4VarInfo structure to -1 at the start
+// of every superblock.
+static AOM_INLINE void init_src_var_info_of_4x4_sub_blocks(
+ const AV1_COMP *const cpi, Block4x4VarInfo *src_var_info_of_4x4_sub_blocks,
+ const BLOCK_SIZE sb_size) {
+ if (!is_src_var_for_4x4_sub_blocks_caching_enabled(cpi)) return;
+
+ const int mi_count_in_sb = mi_size_wide[sb_size] * mi_size_high[sb_size];
+ for (int i = 0; i < mi_count_in_sb; i++) {
+ src_var_info_of_4x4_sub_blocks[i].var = -1;
+ src_var_info_of_4x4_sub_blocks[i].log_var = -1.0;
+ }
+}
+
+// Returns the cost needed to send a uniformly distributed r.v.
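+// This is truncated binary coding: with l = get_unsigned_bits(n) and
+// m = (1 << l) - n, the first m symbols cost l - 1 bits and the remaining
+// n - m symbols cost l bits. For example, n = 5 gives l = 3 and m = 3, so
+// v in {0, 1, 2} costs 2 bits while v in {3, 4} costs 3 bits.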
+static AOM_INLINE int write_uniform_cost(int n, int v) {
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;
+ if (l == 0) return 0;
+ if (v < m)
+ return av1_cost_literal(l - 1);
+ else
+ return av1_cost_literal(l);
+}
+/*!\endcond */
+
+/*!\brief Returns the rate cost for luma prediction mode info of intra blocks.
+ *
+ * \callergraph
+ */
+static AOM_INLINE int intra_mode_info_cost_y(const AV1_COMP *cpi,
+ const MACROBLOCK *x,
+ const MB_MODE_INFO *mbmi,
+ BLOCK_SIZE bsize, int mode_cost,
+ int discount_color_cost) {
+ int total_rate = mode_cost;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0;
+ const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra;
+ const int use_intrabc = mbmi->use_intrabc;
+ // Can only activate one mode.
+ assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc +
+ use_filter_intra) <= 1);
+ const int try_palette = av1_allow_palette(
+ cpi->common.features.allow_screen_content_tools, mbmi->bsize);
+ if (try_palette && mbmi->mode == DC_PRED) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+ const int mode_ctx = av1_get_palette_mode_ctx(xd);
+ total_rate +=
+ mode_costs->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette];
+ if (use_palette) {
+ const uint8_t *const color_map = xd->plane[0].color_index_map;
+ int block_width, block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+ &cols);
+ const int plt_size = mbmi->palette_mode_info.palette_size[0];
+ int palette_mode_cost =
+ mode_costs
+ ->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+ write_uniform_cost(plt_size, color_map[0]);
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+ palette_mode_cost +=
+ av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache,
+ n_cache, cpi->common.seq_params->bit_depth);
+ if (!discount_color_cost)
+ palette_mode_cost +=
+ av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
+
+ total_rate += palette_mode_cost;
+ }
+ }
+ if (av1_filter_intra_allowed(&cpi->common, mbmi)) {
+ total_rate += mode_costs->filter_intra_cost[mbmi->bsize][use_filter_intra];
+ if (use_filter_intra) {
+ total_rate +=
+ mode_costs->filter_intra_mode_cost[mbmi->filter_intra_mode_info
+ .filter_intra_mode];
+ }
+ }
+ if (av1_is_directional_mode(mbmi->mode)) {
+ if (av1_use_angle_delta(bsize)) {
+ total_rate +=
+ mode_costs->angle_delta_cost[mbmi->mode - V_PRED]
+ [MAX_ANGLE_DELTA +
+ mbmi->angle_delta[PLANE_TYPE_Y]];
+ }
+ }
+ if (av1_allow_intrabc(&cpi->common))
+ total_rate += mode_costs->intrabc_cost[use_intrabc];
+ return total_rate;
+}
+
+/*!\brief Return the rate cost for chroma prediction mode info of intra blocks.
+ *
+ * \callergraph
+ */
+static AOM_INLINE int intra_mode_info_cost_uv(const AV1_COMP *cpi,
+ const MACROBLOCK *x,
+ const MB_MODE_INFO *mbmi,
+ BLOCK_SIZE bsize, int mode_cost) {
+ int total_rate = mode_cost;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0;
+ const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+ // Can only activate one mode.
+ assert(((uv_mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
+
+ const int try_palette = av1_allow_palette(
+ cpi->common.features.allow_screen_content_tools, mbmi->bsize);
+ if (try_palette && uv_mode == UV_DC_PRED) {
+ const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
+ total_rate +=
+ mode_costs->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette];
+ if (use_palette) {
+ const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+ const int plt_size = pmi->palette_size[1];
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const uint8_t *const color_map = xd->plane[1].color_index_map;
+ int palette_mode_cost =
+ mode_costs
+ ->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+ write_uniform_cost(plt_size, color_map[0]);
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+ palette_mode_cost += av1_palette_color_cost_uv(
+ pmi, color_cache, n_cache, cpi->common.seq_params->bit_depth);
+ palette_mode_cost +=
+ av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
+ total_rate += palette_mode_cost;
+ }
+ }
+ const PREDICTION_MODE intra_mode = get_uv_mode(uv_mode);
+ if (av1_is_directional_mode(intra_mode)) {
+ if (av1_use_angle_delta(bsize)) {
+ total_rate +=
+ mode_costs->angle_delta_cost[intra_mode - V_PRED]
+ [mbmi->angle_delta[PLANE_TYPE_UV] +
+ MAX_ANGLE_DELTA];
+ }
+ }
+ return total_rate;
+}
+
+/*!\cond */
+// Makes a quick intra prediction and estimates the rdcost with a model,
+// without going through the whole txfm/quantize/itxfm process.
+static int64_t intra_model_rd(const AV1_COMMON *cm, MACROBLOCK *const x,
+ int plane, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, int use_hadamard) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ int row, col;
+ assert(!is_inter_block(xd->mi[0]));
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+ const int txbw = tx_size_wide[tx_size];
+ const int txbh = tx_size_high[tx_size];
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ int64_t satd_cost = 0;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ // Prediction.
+ for (row = 0; row < max_blocks_high; row += stepr) {
+ for (col = 0; col < max_blocks_wide; col += stepc) {
+ av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
+      // Here we use p->src_diff and p->coeff as temporary buffers for the
+      // prediction residue and transform coefficients. The buffers are only
+      // used within this loop, so we do not need to add the per-block offsets
+      // to them.
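+      // Note: 'row' and 'col' are in units of 4x4 blocks, so the << 2 below
+      // converts them to pixel offsets.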
+ av1_subtract_block(
+ bd_info, txbh, txbw, p->src_diff, block_size_wide[plane_bsize],
+ p->src.buf + (((row * p->src.stride) + col) << 2), p->src.stride,
+ pd->dst.buf + (((row * pd->dst.stride) + col) << 2), pd->dst.stride);
+ av1_quick_txfm(use_hadamard, tx_size, bd_info, p->src_diff,
+ block_size_wide[plane_bsize], p->coeff);
+ satd_cost += aom_satd(p->coeff, tx_size_2d[tx_size]);
+ }
+ }
+ return satd_cost;
+}
+/*!\endcond */
+
+/*!\brief Estimate the luma rdcost of a given intra mode and try to prune it.
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function first makes a quick luma prediction and estimates the rdcost
+ * with a model without going through the txfm, then tries to prune the
+ * current mode if the new estimate y_rd > 1.25 * best_model_rd.
+ *
+ * \return Returns 1 if the given mode is pruned; 0 otherwise.
+ */
+static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize,
+ int64_t *best_model_rd) {
+ const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+ const int plane = 0;
+ const AV1_COMMON *cm = &cpi->common;
+ const int64_t this_model_rd =
+ intra_model_rd(cm, x, plane, bsize, tx_size, /*use_hadamard=*/1);
+ if (*best_model_rd != INT64_MAX &&
+ this_model_rd > *best_model_rd + (*best_model_rd >> 2)) {
+ return 1;
+ } else if (this_model_rd < *best_model_rd) {
+ *best_model_rd = this_model_rd;
+ }
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_UTILS_H_
diff --git a/third_party/aom/av1/encoder/k_means_template.h b/third_party/aom/av1/encoder/k_means_template.h
new file mode 100644
index 0000000000..4be2038a6f
--- /dev/null
+++ b/third_party/aom/av1/encoder/k_means_template.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+
+#ifndef AV1_K_MEANS_DIM
+#error "This template requires AV1_K_MEANS_DIM to be defined"
+#endif
+
+#define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y)
+#define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM)
+
+// Though we want to compute the smallest L2 norm, in 1 dimension it is
+// equivalent to finding the smallest L1 norm and then squaring it.
+// This is preferable for speed, especially on the SIMD side.
+static int RENAME(calc_dist)(const int16_t *p1, const int16_t *p2) {
+#if AV1_K_MEANS_DIM == 1
+ return abs(p1[0] - p2[0]);
+#else
+ int dist = 0;
+ for (int i = 0; i < AV1_K_MEANS_DIM; ++i) {
+ const int diff = p1[i] - p2[i];
+ dist += diff * diff;
+ }
+ return dist;
+#endif
+}
+
+void RENAME(av1_calc_indices)(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *dist, int n, int k) {
+ if (dist) {
+ *dist = 0;
+ }
+ for (int i = 0; i < n; ++i) {
+ int min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids);
+ indices[i] = 0;
+ for (int j = 1; j < k; ++j) {
+ const int this_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM,
+ centroids + j * AV1_K_MEANS_DIM);
+ if (this_dist < min_dist) {
+ min_dist = this_dist;
+ indices[i] = j;
+ }
+ }
+ if (dist) {
+#if AV1_K_MEANS_DIM == 1
+ *dist += min_dist * min_dist;
+#else
+ *dist += min_dist;
+#endif
+ }
+ }
+}
+
+static void RENAME(calc_centroids)(const int16_t *data, int16_t *centroids,
+ const uint8_t *indices, int n, int k) {
+ int i, j;
+ int count[PALETTE_MAX_SIZE] = { 0 };
+ int centroids_sum[AV1_K_MEANS_DIM * PALETTE_MAX_SIZE];
+ unsigned int rand_state = (unsigned int)data[0];
+ assert(n <= 32768);
+ memset(centroids_sum, 0, sizeof(centroids_sum[0]) * k * AV1_K_MEANS_DIM);
+
+ for (i = 0; i < n; ++i) {
+ const int index = indices[i];
+ assert(index < k);
+ ++count[index];
+ for (j = 0; j < AV1_K_MEANS_DIM; ++j) {
+ centroids_sum[index * AV1_K_MEANS_DIM + j] +=
+ data[i * AV1_K_MEANS_DIM + j];
+ }
+ }
+
+ for (i = 0; i < k; ++i) {
+ if (count[i] == 0) {
+ memcpy(centroids + i * AV1_K_MEANS_DIM,
+ data + (lcg_rand16(&rand_state) % n) * AV1_K_MEANS_DIM,
+ sizeof(centroids[0]) * AV1_K_MEANS_DIM);
+ } else {
+ for (j = 0; j < AV1_K_MEANS_DIM; ++j) {
+ centroids[i * AV1_K_MEANS_DIM + j] =
+ DIVIDE_AND_ROUND(centroids_sum[i * AV1_K_MEANS_DIM + j], count[i]);
+ }
+ }
+ }
+}
+
+void RENAME(av1_k_means)(const int16_t *data, int16_t *centroids,
+ uint8_t *indices, int n, int k, int max_itr) {
+ int16_t centroids_tmp[AV1_K_MEANS_DIM * PALETTE_MAX_SIZE];
+ uint8_t indices_tmp[MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT];
+ int16_t *meta_centroids[2] = { centroids, centroids_tmp };
+ uint8_t *meta_indices[2] = { indices, indices_tmp };
+ int i, l = 0, prev_l, best_l = 0;
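+  // Iterations ping-pong between the caller's buffers (index 0) and the
+  // temporary buffers (index 1); 'best_l' tracks which pair holds the best
+  // clustering found so far.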
+ int64_t this_dist;
+
+ assert(n <= MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT);
+
+#if AV1_K_MEANS_DIM == 1
+ av1_calc_indices_dim1(data, centroids, indices, &this_dist, n, k);
+#else
+ av1_calc_indices_dim2(data, centroids, indices, &this_dist, n, k);
+#endif
+
+ for (i = 0; i < max_itr; ++i) {
+ const int64_t prev_dist = this_dist;
+ prev_l = l;
+ l = (l == 1) ? 0 : 1;
+
+ RENAME(calc_centroids)(data, meta_centroids[l], meta_indices[prev_l], n, k);
+ if (!memcmp(meta_centroids[l], meta_centroids[prev_l],
+ sizeof(centroids[0]) * k * AV1_K_MEANS_DIM)) {
+ break;
+ }
+#if AV1_K_MEANS_DIM == 1
+ av1_calc_indices_dim1(data, meta_centroids[l], meta_indices[l], &this_dist,
+ n, k);
+#else
+ av1_calc_indices_dim2(data, meta_centroids[l], meta_indices[l], &this_dist,
+ n, k);
+#endif
+
+ if (this_dist > prev_dist) {
+ best_l = prev_l;
+ break;
+ }
+ }
+ if (i == max_itr) best_l = l;
+ if (best_l != 0) {
+ memcpy(centroids, meta_centroids[1],
+ sizeof(centroids[0]) * k * AV1_K_MEANS_DIM);
+ memcpy(indices, meta_indices[1], sizeof(indices[0]) * n);
+ }
+}
+#undef RENAME_
+#undef RENAME
diff --git a/third_party/aom/av1/encoder/level.c b/third_party/aom/av1/encoder/level.c
new file mode 100644
index 0000000000..5d5fe9ce96
--- /dev/null
+++ b/third_party/aom/av1/encoder/level.c
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/level.h"
+
+#define UNDEFINED_LEVEL \
+ { \
+ .level = SEQ_LEVEL_MAX, .max_picture_size = 0, .max_h_size = 0, \
+ .max_v_size = 0, .max_display_rate = 0, .max_decode_rate = 0, \
+ .max_header_rate = 0, .main_mbps = 0, .high_mbps = 0, .main_cr = 0, \
+ .high_cr = 0, .max_tiles = 0, .max_tile_cols = 0 \
+ }
+
+static const AV1LevelSpec av1_level_defs[SEQ_LEVELS] = {
+ { .level = SEQ_LEVEL_2_0,
+ .max_picture_size = 147456,
+ .max_h_size = 2048,
+ .max_v_size = 1152,
+ .max_display_rate = 4423680L,
+ .max_decode_rate = 5529600L,
+ .max_header_rate = 150,
+ .main_mbps = 1.5,
+ .high_mbps = 0,
+ .main_cr = 2.0,
+ .high_cr = 0,
+ .max_tiles = 8,
+ .max_tile_cols = 4 },
+ { .level = SEQ_LEVEL_2_1,
+ .max_picture_size = 278784,
+ .max_h_size = 2816,
+ .max_v_size = 1584,
+ .max_display_rate = 8363520L,
+ .max_decode_rate = 10454400L,
+ .max_header_rate = 150,
+ .main_mbps = 3.0,
+ .high_mbps = 0,
+ .main_cr = 2.0,
+ .high_cr = 0,
+ .max_tiles = 8,
+ .max_tile_cols = 4 },
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ { .level = SEQ_LEVEL_3_0,
+ .max_picture_size = 665856,
+ .max_h_size = 4352,
+ .max_v_size = 2448,
+ .max_display_rate = 19975680L,
+ .max_decode_rate = 24969600L,
+ .max_header_rate = 150,
+ .main_mbps = 6.0,
+ .high_mbps = 0,
+ .main_cr = 2.0,
+ .high_cr = 0,
+ .max_tiles = 16,
+ .max_tile_cols = 6 },
+ { .level = SEQ_LEVEL_3_1,
+ .max_picture_size = 1065024,
+ .max_h_size = 5504,
+ .max_v_size = 3096,
+ .max_display_rate = 31950720L,
+ .max_decode_rate = 39938400L,
+ .max_header_rate = 150,
+ .main_mbps = 10.0,
+ .high_mbps = 0,
+ .main_cr = 2.0,
+ .high_cr = 0,
+ .max_tiles = 16,
+ .max_tile_cols = 6 },
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ { .level = SEQ_LEVEL_4_0,
+ .max_picture_size = 2359296,
+ .max_h_size = 6144,
+ .max_v_size = 3456,
+ .max_display_rate = 70778880L,
+ .max_decode_rate = 77856768L,
+ .max_header_rate = 300,
+ .main_mbps = 12.0,
+ .high_mbps = 30.0,
+ .main_cr = 4.0,
+ .high_cr = 4.0,
+ .max_tiles = 32,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_4_1,
+ .max_picture_size = 2359296,
+ .max_h_size = 6144,
+ .max_v_size = 3456,
+ .max_display_rate = 141557760L,
+ .max_decode_rate = 155713536L,
+ .max_header_rate = 300,
+ .main_mbps = 20.0,
+ .high_mbps = 50.0,
+ .main_cr = 4.0,
+ .high_cr = 4.0,
+ .max_tiles = 32,
+ .max_tile_cols = 8 },
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ { .level = SEQ_LEVEL_5_0,
+ .max_picture_size = 8912896,
+ .max_h_size = 8192,
+ .max_v_size = 4352,
+ .max_display_rate = 267386880L,
+ .max_decode_rate = 273715200L,
+ .max_header_rate = 300,
+ .main_mbps = 30.0,
+ .high_mbps = 100.0,
+ .main_cr = 6.0,
+ .high_cr = 4.0,
+ .max_tiles = 64,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_5_1,
+ .max_picture_size = 8912896,
+ .max_h_size = 8192,
+ .max_v_size = 4352,
+ .max_display_rate = 534773760L,
+ .max_decode_rate = 547430400L,
+ .max_header_rate = 300,
+ .main_mbps = 40.0,
+ .high_mbps = 160.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 64,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_5_2,
+ .max_picture_size = 8912896,
+ .max_h_size = 8192,
+ .max_v_size = 4352,
+ .max_display_rate = 1069547520L,
+ .max_decode_rate = 1094860800L,
+ .max_header_rate = 300,
+ .main_mbps = 60.0,
+ .high_mbps = 240.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 64,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_5_3,
+ .max_picture_size = 8912896,
+ .max_h_size = 8192,
+ .max_v_size = 4352,
+ .max_display_rate = 1069547520L,
+ .max_decode_rate = 1176502272L,
+ .max_header_rate = 300,
+ .main_mbps = 60.0,
+ .high_mbps = 240.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 64,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_6_0,
+ .max_picture_size = 35651584,
+ .max_h_size = 16384,
+ .max_v_size = 8704,
+ .max_display_rate = 1069547520L,
+ .max_decode_rate = 1176502272L,
+ .max_header_rate = 300,
+ .main_mbps = 60.0,
+ .high_mbps = 240.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 128,
+ .max_tile_cols = 16 },
+ { .level = SEQ_LEVEL_6_1,
+ .max_picture_size = 35651584,
+ .max_h_size = 16384,
+ .max_v_size = 8704,
+ .max_display_rate = 2139095040L,
+ .max_decode_rate = 2189721600L,
+ .max_header_rate = 300,
+ .main_mbps = 100.0,
+ .high_mbps = 480.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 128,
+ .max_tile_cols = 16 },
+ { .level = SEQ_LEVEL_6_2,
+ .max_picture_size = 35651584,
+ .max_h_size = 16384,
+ .max_v_size = 8704,
+ .max_display_rate = 4278190080L,
+ .max_decode_rate = 4379443200L,
+ .max_header_rate = 300,
+ .main_mbps = 160.0,
+ .high_mbps = 800.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 128,
+ .max_tile_cols = 16 },
+ { .level = SEQ_LEVEL_6_3,
+ .max_picture_size = 35651584,
+ .max_h_size = 16384,
+ .max_v_size = 8704,
+ .max_display_rate = 4278190080L,
+ .max_decode_rate = 4706009088L,
+ .max_header_rate = 300,
+ .main_mbps = 160.0,
+ .high_mbps = 800.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 128,
+ .max_tile_cols = 16 },
+#if CONFIG_CWG_C013
+ { .level = SEQ_LEVEL_7_0,
+ .max_picture_size = 142606336,
+ .max_h_size = 32768,
+ .max_v_size = 17408,
+ .max_display_rate = 4278190080L,
+ .max_decode_rate = 4706009088L,
+ .max_header_rate = 300,
+ .main_mbps = 160.0,
+ .high_mbps = 800.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 256,
+ .max_tile_cols = 32 },
+ { .level = SEQ_LEVEL_7_1,
+ .max_picture_size = 142606336,
+ .max_h_size = 32768,
+ .max_v_size = 17408,
+ .max_display_rate = 8556380160L,
+ .max_decode_rate = 8758886400L,
+ .max_header_rate = 300,
+ .main_mbps = 200.0,
+ .high_mbps = 960.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 256,
+ .max_tile_cols = 32 },
+ { .level = SEQ_LEVEL_7_2,
+ .max_picture_size = 142606336,
+ .max_h_size = 32768,
+ .max_v_size = 17408,
+ .max_display_rate = 17112760320L,
+ .max_decode_rate = 17517772800L,
+ .max_header_rate = 300,
+ .main_mbps = 320.0,
+ .high_mbps = 1600.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 256,
+ .max_tile_cols = 32 },
+ { .level = SEQ_LEVEL_7_3,
+ .max_picture_size = 142606336,
+ .max_h_size = 32768,
+ .max_v_size = 17408,
+ .max_display_rate = 17112760320L,
+ .max_decode_rate = 18824036352L,
+ .max_header_rate = 300,
+ .main_mbps = 320.0,
+ .high_mbps = 1600.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 256,
+ .max_tile_cols = 32 },
+ { .level = SEQ_LEVEL_8_0,
+ .max_picture_size = 530841600,
+ .max_h_size = 65536,
+ .max_v_size = 34816,
+ .max_display_rate = 17112760320L,
+ .max_decode_rate = 18824036352L,
+ .max_header_rate = 300,
+ .main_mbps = 320.0,
+ .high_mbps = 1600.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 512,
+ .max_tile_cols = 64 },
+ { .level = SEQ_LEVEL_8_1,
+ .max_picture_size = 530841600,
+ .max_h_size = 65536,
+ .max_v_size = 34816,
+ .max_display_rate = 34225520640L,
+ .max_decode_rate = 34910031052L,
+ .max_header_rate = 300,
+ .main_mbps = 400.0,
+ .high_mbps = 1920.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 512,
+ .max_tile_cols = 64 },
+ { .level = SEQ_LEVEL_8_2,
+ .max_picture_size = 530841600,
+ .max_h_size = 65536,
+ .max_v_size = 34816,
+ .max_display_rate = 68451041280L,
+ .max_decode_rate = 69820062105L,
+ .max_header_rate = 300,
+ .main_mbps = 640.0,
+ .high_mbps = 3200.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 512,
+ .max_tile_cols = 64 },
+ { .level = SEQ_LEVEL_8_3,
+ .max_picture_size = 530841600,
+ .max_h_size = 65536,
+ .max_v_size = 34816,
+ .max_display_rate = 68451041280L,
+ .max_decode_rate = 75296145408L,
+ .max_header_rate = 300,
+ .main_mbps = 640.0,
+ .high_mbps = 3200.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 512,
+ .max_tile_cols = 64 },
+#else // !CONFIG_CWG_C013
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+#endif // CONFIG_CWG_C013
+};
+
+typedef enum {
+ LUMA_PIC_SIZE_TOO_LARGE,
+ LUMA_PIC_H_SIZE_TOO_LARGE,
+ LUMA_PIC_V_SIZE_TOO_LARGE,
+ LUMA_PIC_H_SIZE_TOO_SMALL,
+ LUMA_PIC_V_SIZE_TOO_SMALL,
+ TOO_MANY_TILE_COLUMNS,
+ TOO_MANY_TILES,
+ TILE_RATE_TOO_HIGH,
+ TILE_TOO_LARGE,
+ SUPERRES_TILE_WIDTH_TOO_LARGE,
+ CROPPED_TILE_WIDTH_TOO_SMALL,
+ CROPPED_TILE_HEIGHT_TOO_SMALL,
+ TILE_WIDTH_INVALID,
+ FRAME_HEADER_RATE_TOO_HIGH,
+ DISPLAY_RATE_TOO_HIGH,
+ DECODE_RATE_TOO_HIGH,
+ CR_TOO_SMALL,
+ TILE_SIZE_HEADER_RATE_TOO_HIGH,
+ BITRATE_TOO_HIGH,
+ DECODER_MODEL_FAIL,
+
+ TARGET_LEVEL_FAIL_IDS,
+ TARGET_LEVEL_OK,
+} TARGET_LEVEL_FAIL_ID;
+
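+// Keep this table in sync with the TARGET_LEVEL_FAIL_ID enum above.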
+static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = {
+ "The picture size is too large.",
+ "The picture width is too large.",
+ "The picture height is too large.",
+ "The picture width is too small.",
+ "The picture height is too small.",
+ "Too many tile columns are used.",
+ "Too many tiles are used.",
+ "The tile rate is too high.",
+ "The tile size is too large.",
+ "The superres tile width is too large.",
+ "The cropped tile width is less than 8.",
+ "The cropped tile height is less than 8.",
+ "The tile width is invalid.",
+ "The frame header rate is too high.",
+ "The display luma sample rate is too high.",
+ "The decoded luma sample rate is too high.",
+ "The compression ratio is too small.",
+ "The product of max tile size and header rate is too high.",
+ "The bitrate is too high.",
+ "The decoder model fails.",
+};
+
+static double get_max_bitrate(const AV1LevelSpec *const level_spec, int tier,
+ BITSTREAM_PROFILE profile) {
+ if (level_spec->level < SEQ_LEVEL_4_0) tier = 0;
+ const double bitrate_basis =
+ (tier ? level_spec->high_mbps : level_spec->main_mbps) * 1e6;
+ const double bitrate_profile_factor =
+ profile == PROFILE_0 ? 1.0 : (profile == PROFILE_1 ? 2.0 : 3.0);
+ return bitrate_basis * bitrate_profile_factor;
+}
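+// For example, SEQ_LEVEL_4_0 at Main tier in PROFILE_0 allows
+// 12.0 * 1e6 * 1.0 = 12 Mbps, and PROFILE_2 scales that limit by 3x.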
+
+double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier,
+ BITSTREAM_PROFILE profile) {
+ assert(is_valid_seq_level_idx(level_index));
+ return get_max_bitrate(&av1_level_defs[level_index], tier, profile);
+}
+
+void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles,
+ int *const max_tile_cols) {
+ assert(is_valid_seq_level_idx(level_index));
+ const AV1LevelSpec *const level_spec = &av1_level_defs[level_index];
+ *max_tiles = level_spec->max_tiles;
+ *max_tile_cols = level_spec->max_tile_cols;
+}
+
+// We assume time t to be valid if and only if t >= 0.0.
+// So INVALID_TIME can be defined as anything less than 0.
+#define INVALID_TIME (-1.0)
+
+// This corresponds to "free_buffer" in the spec.
+static void release_buffer(DECODER_MODEL *const decoder_model, int idx) {
+ assert(idx >= 0 && idx < BUFFER_POOL_MAX_SIZE);
+ FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[idx];
+ this_buffer->decoder_ref_count = 0;
+ this_buffer->player_ref_count = 0;
+ this_buffer->display_index = -1;
+ this_buffer->presentation_time = INVALID_TIME;
+}
+
+static void initialize_buffer_pool(DECODER_MODEL *const decoder_model) {
+ for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+ release_buffer(decoder_model, i);
+ }
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ decoder_model->vbi[i] = -1;
+ }
+}
+
+static int get_free_buffer(DECODER_MODEL *const decoder_model) {
+ for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+ const FRAME_BUFFER *const this_buffer =
+ &decoder_model->frame_buffer_pool[i];
+ if (this_buffer->decoder_ref_count == 0 &&
+ this_buffer->player_ref_count == 0)
+ return i;
+ }
+ return -1;
+}
+
+static void update_ref_buffers(DECODER_MODEL *const decoder_model, int idx,
+ int refresh_frame_flags) {
+ FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[idx];
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ if (refresh_frame_flags & (1 << i)) {
+ const int pre_idx = decoder_model->vbi[i];
+ if (pre_idx != -1) {
+ --decoder_model->frame_buffer_pool[pre_idx].decoder_ref_count;
+ }
+ decoder_model->vbi[i] = idx;
+ ++this_buffer->decoder_ref_count;
+ }
+ }
+}
+
+// The time (in seconds) required to decode a frame.
+static double time_to_decode_frame(const AV1_COMMON *const cm,
+ int64_t max_decode_rate) {
+ if (cm->show_existing_frame) return 0.0;
+
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+ int luma_samples = 0;
+ if (frame_type == KEY_FRAME || frame_type == INTRA_ONLY_FRAME) {
+ luma_samples = cm->superres_upscaled_width * cm->height;
+ } else {
+ const int spatial_layer_dimensions_present_flag = 0;
+ if (spatial_layer_dimensions_present_flag) {
+ assert(0 && "Spatial layer dimensions not supported yet.");
+ } else {
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int max_frame_width = seq_params->max_frame_width;
+ const int max_frame_height = seq_params->max_frame_height;
+ luma_samples = max_frame_width * max_frame_height;
+ }
+ }
+
+ return luma_samples / (double)max_decode_rate;
+}
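+// For example, a 3840x2160 keyframe decoded at the SEQ_LEVEL_5_1 max decode
+// rate of 547430400 samples/s takes 8294400 / 547430400 ~= 15.2 ms.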
+
+// Release frame buffers that are no longer needed for decode or display.
+// It corresponds to "start_decode_at_removal_time" in the spec.
+static void release_processed_frames(DECODER_MODEL *const decoder_model,
+ double removal_time) {
+ for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+ FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[i];
+ if (this_buffer->player_ref_count > 0) {
+ if (this_buffer->presentation_time >= 0.0 &&
+ this_buffer->presentation_time <= removal_time) {
+ this_buffer->player_ref_count = 0;
+ if (this_buffer->decoder_ref_count == 0) {
+ release_buffer(decoder_model, i);
+ }
+ }
+ }
+ }
+}
+
+static int frames_in_buffer_pool(const DECODER_MODEL *const decoder_model) {
+ int frames_in_pool = 0;
+ for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+ const FRAME_BUFFER *const this_buffer =
+ &decoder_model->frame_buffer_pool[i];
+ if (this_buffer->decoder_ref_count > 0 ||
+ this_buffer->player_ref_count > 0) {
+ ++frames_in_pool;
+ }
+ }
+ return frames_in_pool;
+}
+
+static double get_presentation_time(const DECODER_MODEL *const decoder_model,
+ int display_index) {
+ if (decoder_model->mode == SCHEDULE_MODE) {
+ assert(0 && "SCHEDULE_MODE NOT SUPPORTED");
+ return INVALID_TIME;
+ } else {
+ const double initial_presentation_delay =
+ decoder_model->initial_presentation_delay;
+ // Can't decide presentation time until the initial presentation delay is
+ // known.
+ if (initial_presentation_delay < 0.0) return INVALID_TIME;
+
+ return initial_presentation_delay +
+ display_index * decoder_model->num_ticks_per_picture *
+ decoder_model->display_clock_tick;
+ }
+}
+
+#define MAX_TIME 1e16
+static double time_next_buffer_is_free(int num_decoded_frame,
+ int decoder_buffer_delay,
+ const FRAME_BUFFER *frame_buffer_pool,
+ double current_time) {
+ if (num_decoded_frame == 0) {
+ return (double)decoder_buffer_delay / 90000.0;
+ }
+
+ double buf_free_time = MAX_TIME;
+ for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+ const FRAME_BUFFER *const this_buffer = &frame_buffer_pool[i];
+ if (this_buffer->decoder_ref_count == 0) {
+ if (this_buffer->player_ref_count == 0) {
+ return current_time;
+ }
+ const double presentation_time = this_buffer->presentation_time;
+ if (presentation_time >= 0.0 && presentation_time < buf_free_time) {
+ buf_free_time = presentation_time;
+ }
+ }
+ }
+ return buf_free_time < MAX_TIME ? buf_free_time : INVALID_TIME;
+}
+#undef MAX_TIME
+
+static double get_removal_time(int mode, int num_decoded_frame,
+ int decoder_buffer_delay,
+ const FRAME_BUFFER *frame_buffer_pool,
+ double current_time) {
+ if (mode == SCHEDULE_MODE) {
+ assert(0 && "SCHEDULE_MODE IS NOT SUPPORTED YET");
+ return INVALID_TIME;
+ } else {
+ return time_next_buffer_is_free(num_decoded_frame, decoder_buffer_delay,
+ frame_buffer_pool, current_time);
+ }
+}
+
+void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model) {
+ printf(
+ "\n status %d, num_frame %3d, num_decoded_frame %3d, "
+ "num_shown_frame %3d, current time %6.2f, frames in buffer %2d, "
+ "presentation delay %6.2f, total interval %6.2f\n",
+ decoder_model->status, decoder_model->num_frame,
+ decoder_model->num_decoded_frame, decoder_model->num_shown_frame,
+ decoder_model->current_time, frames_in_buffer_pool(decoder_model),
+ decoder_model->initial_presentation_delay,
+ decoder_model->dfg_interval_queue.total_interval);
+ for (int i = 0; i < 10; ++i) {
+ const FRAME_BUFFER *const this_buffer =
+ &decoder_model->frame_buffer_pool[i];
+ printf("buffer %d, decode count %d, display count %d, present time %6.4f\n",
+ i, this_buffer->decoder_ref_count, this_buffer->player_ref_count,
+ this_buffer->presentation_time);
+ }
+}
+
+// op_index is the operating point index.
+void av1_decoder_model_init(const AV1_COMP *const cpi, AV1_LEVEL level,
+ int op_index, DECODER_MODEL *const decoder_model) {
+ decoder_model->status = DECODER_MODEL_OK;
+ decoder_model->level = level;
+
+ const AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ decoder_model->bit_rate = get_max_bitrate(
+ av1_level_defs + level, seq_params->tier[op_index], seq_params->profile);
+
+ // TODO(huisu or anyone): implement SCHEDULE_MODE.
+ decoder_model->mode = RESOURCE_MODE;
+ decoder_model->encoder_buffer_delay = 20000;
+ decoder_model->decoder_buffer_delay = 70000;
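+  // Buffer delays are expressed in units of a 90 kHz clock, so 20000 ticks
+  // ~= 222 ms of encoder buffering and 70000 ticks ~= 778 ms of decoder
+  // buffering.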
+ decoder_model->is_low_delay_mode = false;
+
+ decoder_model->first_bit_arrival_time = 0.0;
+ decoder_model->last_bit_arrival_time = 0.0;
+ decoder_model->coded_bits = 0;
+
+ decoder_model->removal_time = INVALID_TIME;
+ decoder_model->presentation_time = INVALID_TIME;
+ decoder_model->decode_samples = 0;
+ decoder_model->display_samples = 0;
+ decoder_model->max_decode_rate = 0.0;
+ decoder_model->max_display_rate = 0.0;
+
+ decoder_model->num_frame = -1;
+ decoder_model->num_decoded_frame = -1;
+ decoder_model->num_shown_frame = -1;
+ decoder_model->current_time = 0.0;
+
+ initialize_buffer_pool(decoder_model);
+
+ DFG_INTERVAL_QUEUE *const dfg_interval_queue =
+ &decoder_model->dfg_interval_queue;
+ dfg_interval_queue->total_interval = 0.0;
+ dfg_interval_queue->head = 0;
+ dfg_interval_queue->size = 0;
+
+ if (seq_params->timing_info_present) {
+ decoder_model->num_ticks_per_picture =
+ seq_params->timing_info.num_ticks_per_picture;
+ decoder_model->display_clock_tick =
+ seq_params->timing_info.num_units_in_display_tick /
+ seq_params->timing_info.time_scale;
+ } else {
+ decoder_model->num_ticks_per_picture = 1;
+ decoder_model->display_clock_tick = 1.0 / cpi->framerate;
+ }
+
+ decoder_model->initial_display_delay =
+ seq_params->op_params[op_index].initial_display_delay;
+ decoder_model->initial_presentation_delay = INVALID_TIME;
+ decoder_model->decode_rate = av1_level_defs[level].max_decode_rate;
+}
+
+DECODER_MODEL_STATUS av1_decoder_model_try_smooth_buf(
+ const AV1_COMP *const cpi, size_t coded_bits,
+ const DECODER_MODEL *const decoder_model) {
+ DECODER_MODEL_STATUS status = DECODER_MODEL_OK;
+
+ if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) {
+ return status;
+ }
+
+ const AV1_COMMON *const cm = &cpi->common;
+ const int show_existing_frame = cm->show_existing_frame;
+
+ size_t cur_coded_bits = decoder_model->coded_bits + coded_bits;
+ int num_decoded_frame = decoder_model->num_decoded_frame;
+ if (!show_existing_frame) ++num_decoded_frame;
+
+ if (show_existing_frame) {
+ return status;
+ } else {
+ const double removal_time = get_removal_time(
+ decoder_model->mode, num_decoded_frame,
+ decoder_model->decoder_buffer_delay, decoder_model->frame_buffer_pool,
+ decoder_model->current_time);
+ if (removal_time < 0.0) {
+ status = DECODE_FRAME_BUF_UNAVAILABLE;
+ return status;
+ }
+
+    // A frame with show_existing_frame being false indicates the end of a DFG
+    // (decodable frame group). Update the bits arrival time of this DFG.
+ const double buffer_delay = (decoder_model->encoder_buffer_delay +
+ decoder_model->decoder_buffer_delay) /
+ 90000.0;
+ const double latest_arrival_time = removal_time - buffer_delay;
+ const double first_bit_arrival_time =
+ AOMMAX(decoder_model->last_bit_arrival_time, latest_arrival_time);
+ const double last_bit_arrival_time =
+ first_bit_arrival_time +
+ (double)cur_coded_bits / decoder_model->bit_rate;
+ // Smoothing buffer underflows if the last bit arrives after the removal
+ // time.
+ if (last_bit_arrival_time > removal_time &&
+ !decoder_model->is_low_delay_mode) {
+ status = SMOOTHING_BUFFER_UNDERFLOW;
+ return status;
+ }
+
+ // Check if the smoothing buffer overflows.
+ const DFG_INTERVAL_QUEUE *const queue = &decoder_model->dfg_interval_queue;
+ if (queue->size >= DFG_INTERVAL_QUEUE_SIZE) {
+ assert(0);
+ }
+
+ double total_interval = queue->total_interval;
+ int qhead = queue->head;
+ int qsize = queue->size;
+ // Remove the DFGs with removal time earlier than last_bit_arrival_time.
+ while (queue->buf[qhead].removal_time <= last_bit_arrival_time &&
+ qsize > 0) {
+ if (queue->buf[qhead].removal_time - first_bit_arrival_time +
+ total_interval >
+ 1.0) {
+ status = SMOOTHING_BUFFER_OVERFLOW;
+ return status;
+ }
+ total_interval -= queue->buf[qhead].last_bit_arrival_time -
+ queue->buf[qhead].first_bit_arrival_time;
+ qhead = (qhead + 1) % DFG_INTERVAL_QUEUE_SIZE;
+ --qsize;
+ }
+ total_interval += last_bit_arrival_time - first_bit_arrival_time;
+ // The smoothing buffer can hold at most "bit_rate" bits, which is
+ // equivalent to 1 second of total interval.
+ if (total_interval > 1.0) {
+ status = SMOOTHING_BUFFER_OVERFLOW;
+ return status;
+ }
+
+ return status;
+ }
+}
+
+void av1_decoder_model_process_frame(const AV1_COMP *const cpi,
+ size_t coded_bits,
+ DECODER_MODEL *const decoder_model) {
+ if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) return;
+
+ const AV1_COMMON *const cm = &cpi->common;
+ const int luma_pic_size = cm->superres_upscaled_width * cm->height;
+ const int show_existing_frame = cm->show_existing_frame;
+ const int show_frame = cm->show_frame || show_existing_frame;
+ ++decoder_model->num_frame;
+ if (!show_existing_frame) ++decoder_model->num_decoded_frame;
+ if (show_frame) ++decoder_model->num_shown_frame;
+ decoder_model->coded_bits += coded_bits;
+
+ int display_idx = -1;
+ if (show_existing_frame) {
+ display_idx = decoder_model->vbi[cpi->existing_fb_idx_to_show];
+ if (display_idx < 0) {
+ decoder_model->status = DECODE_EXISTING_FRAME_BUF_EMPTY;
+ return;
+ }
+ if (decoder_model->frame_buffer_pool[display_idx].frame_type == KEY_FRAME) {
+ update_ref_buffers(decoder_model, display_idx, 0xFF);
+ }
+ } else {
+ const double removal_time = get_removal_time(
+ decoder_model->mode, decoder_model->num_decoded_frame,
+ decoder_model->decoder_buffer_delay, decoder_model->frame_buffer_pool,
+ decoder_model->current_time);
+ if (removal_time < 0.0) {
+ decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE;
+ return;
+ }
+
+ const int previous_decode_samples = decoder_model->decode_samples;
+ const double previous_removal_time = decoder_model->removal_time;
+ assert(previous_removal_time < removal_time);
+ decoder_model->removal_time = removal_time;
+ decoder_model->decode_samples = luma_pic_size;
+ const double this_decode_rate =
+ previous_decode_samples / (removal_time - previous_removal_time);
+ decoder_model->max_decode_rate =
+ AOMMAX(decoder_model->max_decode_rate, this_decode_rate);
+
+ // A frame with show_existing_frame being false indicates the end of a DFG.
+ // Update the bits arrival time of this DFG.
+ const double buffer_delay = (decoder_model->encoder_buffer_delay +
+ decoder_model->decoder_buffer_delay) /
+ 90000.0;
+ const double latest_arrival_time = removal_time - buffer_delay;
+ decoder_model->first_bit_arrival_time =
+ AOMMAX(decoder_model->last_bit_arrival_time, latest_arrival_time);
+ decoder_model->last_bit_arrival_time =
+ decoder_model->first_bit_arrival_time +
+ (double)decoder_model->coded_bits / decoder_model->bit_rate;
+ // Smoothing buffer underflows if the last bit arrives after the removal
+ // time.
+ if (decoder_model->last_bit_arrival_time > removal_time &&
+ !decoder_model->is_low_delay_mode) {
+ decoder_model->status = SMOOTHING_BUFFER_UNDERFLOW;
+ return;
+ }
+ // Reset the coded bits for the next DFG.
+ decoder_model->coded_bits = 0;
+
+ // Check if the smoothing buffer overflows.
+ DFG_INTERVAL_QUEUE *const queue = &decoder_model->dfg_interval_queue;
+ if (queue->size >= DFG_INTERVAL_QUEUE_SIZE) {
+ assert(0);
+ }
+ const double first_bit_arrival_time = decoder_model->first_bit_arrival_time;
+ const double last_bit_arrival_time = decoder_model->last_bit_arrival_time;
+ // Remove the DFGs with removal time earlier than last_bit_arrival_time.
+ while (queue->buf[queue->head].removal_time <= last_bit_arrival_time &&
+ queue->size > 0) {
+ if (queue->buf[queue->head].removal_time - first_bit_arrival_time +
+ queue->total_interval >
+ 1.0) {
+ decoder_model->status = SMOOTHING_BUFFER_OVERFLOW;
+ return;
+ }
+ queue->total_interval -= queue->buf[queue->head].last_bit_arrival_time -
+ queue->buf[queue->head].first_bit_arrival_time;
+ queue->head = (queue->head + 1) % DFG_INTERVAL_QUEUE_SIZE;
+ --queue->size;
+ }
+ // Push current DFG into the queue.
+ const int queue_index =
+ (queue->head + queue->size++) % DFG_INTERVAL_QUEUE_SIZE;
+ queue->buf[queue_index].first_bit_arrival_time = first_bit_arrival_time;
+ queue->buf[queue_index].last_bit_arrival_time = last_bit_arrival_time;
+ queue->buf[queue_index].removal_time = removal_time;
+ queue->total_interval += last_bit_arrival_time - first_bit_arrival_time;
+ // The smoothing buffer can hold at most "bit_rate" bits, which is
+ // equivalent to 1 second of total interval.
+ if (queue->total_interval > 1.0) {
+ decoder_model->status = SMOOTHING_BUFFER_OVERFLOW;
+ return;
+ }
+
+ release_processed_frames(decoder_model, removal_time);
+ decoder_model->current_time =
+ removal_time + time_to_decode_frame(cm, decoder_model->decode_rate);
+
+ const int cfbi = get_free_buffer(decoder_model);
+ if (cfbi < 0) {
+ decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE;
+ return;
+ }
+    const CurrentFrame *const current_frame = &cm->current_frame;
+    decoder_model->frame_buffer_pool[cfbi].frame_type =
+        current_frame->frame_type;
+ display_idx = cfbi;
+ update_ref_buffers(decoder_model, cfbi, current_frame->refresh_frame_flags);
+
+ if (decoder_model->initial_presentation_delay < 0.0) {
+      // Display can begin after the required number of frames have been
+      // buffered.
+ if (frames_in_buffer_pool(decoder_model) >=
+ decoder_model->initial_display_delay - 1) {
+ decoder_model->initial_presentation_delay = decoder_model->current_time;
+ // Update presentation time for each shown frame in the frame buffer.
+ for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+ FRAME_BUFFER *const this_buffer =
+ &decoder_model->frame_buffer_pool[i];
+ if (this_buffer->player_ref_count == 0) continue;
+ assert(this_buffer->display_index >= 0);
+ this_buffer->presentation_time =
+ get_presentation_time(decoder_model, this_buffer->display_index);
+ }
+ }
+ }
+ }
+
+ // Display.
+ if (show_frame) {
+ assert(display_idx >= 0 && display_idx < BUFFER_POOL_MAX_SIZE);
+ FRAME_BUFFER *const this_buffer =
+ &decoder_model->frame_buffer_pool[display_idx];
+ ++this_buffer->player_ref_count;
+ this_buffer->display_index = decoder_model->num_shown_frame;
+ const double presentation_time =
+ get_presentation_time(decoder_model, this_buffer->display_index);
+ this_buffer->presentation_time = presentation_time;
+ if (presentation_time >= 0.0 &&
+ decoder_model->current_time > presentation_time) {
+ decoder_model->status = DISPLAY_FRAME_LATE;
+ return;
+ }
+
+ const int previous_display_samples = decoder_model->display_samples;
+ const double previous_presentation_time = decoder_model->presentation_time;
+ decoder_model->display_samples = luma_pic_size;
+ decoder_model->presentation_time = presentation_time;
+ if (presentation_time >= 0.0 && previous_presentation_time >= 0.0) {
+ assert(previous_presentation_time < presentation_time);
+ const double this_display_rate =
+ previous_display_samples /
+ (presentation_time - previous_presentation_time);
+ decoder_model->max_display_rate =
+ AOMMAX(decoder_model->max_display_rate, this_display_rate);
+ }
+ }
+}
+
+void av1_init_level_info(AV1_COMP *cpi) {
+ for (int op_index = 0; op_index < MAX_NUM_OPERATING_POINTS; ++op_index) {
+ AV1LevelInfo *const this_level_info =
+ cpi->ppi->level_params.level_info[op_index];
+ if (!this_level_info) continue;
+ memset(this_level_info, 0, sizeof(*this_level_info));
+ AV1LevelSpec *const level_spec = &this_level_info->level_spec;
+ level_spec->level = SEQ_LEVEL_MAX;
+ AV1LevelStats *const level_stats = &this_level_info->level_stats;
+ level_stats->min_cropped_tile_width = INT_MAX;
+ level_stats->min_cropped_tile_height = INT_MAX;
+ level_stats->min_frame_width = INT_MAX;
+ level_stats->min_frame_height = INT_MAX;
+ level_stats->tile_width_is_valid = 1;
+ level_stats->min_cr = 1e8;
+
+ FrameWindowBuffer *const frame_window_buffer =
+ &this_level_info->frame_window_buffer;
+ frame_window_buffer->num = 0;
+ frame_window_buffer->start = 0;
+
+ const AV1_COMMON *const cm = &cpi->common;
+ const int upscaled_width = cm->superres_upscaled_width;
+ const int height = cm->height;
+ const int pic_size = upscaled_width * height;
+ for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) {
+ DECODER_MODEL *const this_model = &this_level_info->decoder_models[level];
+ const AV1LevelSpec *const spec = &av1_level_defs[level];
+ if (upscaled_width > spec->max_h_size || height > spec->max_v_size ||
+ pic_size > spec->max_picture_size) {
+ // Turn off decoder model for this level as the frame size already
+ // exceeds level constraints.
+ this_model->status = DECODER_MODEL_DISABLED;
+ } else {
+ av1_decoder_model_init(cpi, level, op_index, this_model);
+ }
+ }
+ }
+}
+
+static double get_min_cr(const AV1LevelSpec *const level_spec, int tier,
+ int is_still_picture, int64_t decoded_sample_rate) {
+ if (is_still_picture) return 0.8;
+ if (level_spec->level < SEQ_LEVEL_4_0) tier = 0;
+ const double min_cr_basis = tier ? level_spec->high_cr : level_spec->main_cr;
+ const double speed_adj =
+ (double)decoded_sample_rate / level_spec->max_display_rate;
+ return AOMMAX(min_cr_basis * speed_adj, 0.8);
+}
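+// For example, at SEQ_LEVEL_5_1 Main tier (main_cr = 8.0), a decoded sample
+// rate of half the level's max display rate yields a minimum compression
+// ratio of AOMMAX(8.0 * 0.5, 0.8) = 4.0.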
+
+double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier,
+ int is_still_picture) {
+ assert(is_valid_seq_level_idx(level_index));
+ const AV1LevelSpec *const level_spec = &av1_level_defs[level_index];
+ return get_min_cr(level_spec, tier, is_still_picture,
+ level_spec->max_decode_rate);
+}
+
+static void get_temporal_parallel_params(int scalability_mode_idc,
+ int *temporal_parallel_num,
+ int *temporal_parallel_denom) {
+ if (scalability_mode_idc < 0) {
+ *temporal_parallel_num = 1;
+ *temporal_parallel_denom = 1;
+ return;
+ }
+
+ // TODO(huisu@): handle scalability cases.
+ if (scalability_mode_idc == SCALABILITY_SS) {
+ (void)scalability_mode_idc;
+ } else {
+ (void)scalability_mode_idc;
+ }
+}
+
+#define MIN_CROPPED_TILE_WIDTH 8
+#define MIN_CROPPED_TILE_HEIGHT 8
+#define MIN_FRAME_WIDTH 16
+#define MIN_FRAME_HEIGHT 16
+#define MAX_TILE_SIZE_HEADER_RATE_PRODUCT 588251136
+
+static TARGET_LEVEL_FAIL_ID check_level_constraints(
+ const AV1LevelInfo *const level_info, AV1_LEVEL level, int tier,
+ int is_still_picture, BITSTREAM_PROFILE profile, int check_bitrate) {
+ const DECODER_MODEL *const decoder_model = &level_info->decoder_models[level];
+ const DECODER_MODEL_STATUS decoder_model_status = decoder_model->status;
+ if (decoder_model_status != DECODER_MODEL_OK &&
+ decoder_model_status != DECODER_MODEL_DISABLED) {
+ return DECODER_MODEL_FAIL;
+ }
+
+ const AV1LevelSpec *const level_spec = &level_info->level_spec;
+ const AV1LevelSpec *const target_level_spec = &av1_level_defs[level];
+ const AV1LevelStats *const level_stats = &level_info->level_stats;
+ TARGET_LEVEL_FAIL_ID fail_id = TARGET_LEVEL_OK;
+ do {
+ if (level_spec->max_picture_size > target_level_spec->max_picture_size) {
+ fail_id = LUMA_PIC_SIZE_TOO_LARGE;
+ break;
+ }
+
+ if (level_spec->max_h_size > target_level_spec->max_h_size) {
+ fail_id = LUMA_PIC_H_SIZE_TOO_LARGE;
+ break;
+ }
+
+ if (level_spec->max_v_size > target_level_spec->max_v_size) {
+ fail_id = LUMA_PIC_V_SIZE_TOO_LARGE;
+ break;
+ }
+
+ if (level_spec->max_tile_cols > target_level_spec->max_tile_cols) {
+ fail_id = TOO_MANY_TILE_COLUMNS;
+ break;
+ }
+
+ if (level_spec->max_tiles > target_level_spec->max_tiles) {
+ fail_id = TOO_MANY_TILES;
+ break;
+ }
+
+ if (level_spec->max_header_rate > target_level_spec->max_header_rate) {
+ fail_id = FRAME_HEADER_RATE_TOO_HIGH;
+ break;
+ }
+
+ if (decoder_model->max_display_rate >
+ (double)target_level_spec->max_display_rate) {
+ fail_id = DISPLAY_RATE_TOO_HIGH;
+ break;
+ }
+
+    // TODO(huisu): we are not using the max decode rate calculated by the
+    // decoder model because the model in resource availability mode always
+    // returns MaxDecodeRate (as in the level definitions) as the max decode
+    // rate.
+ if (level_spec->max_decode_rate > target_level_spec->max_decode_rate) {
+ fail_id = DECODE_RATE_TOO_HIGH;
+ break;
+ }
+
+ if (level_spec->max_tile_rate > target_level_spec->max_tiles * 120) {
+ fail_id = TILE_RATE_TOO_HIGH;
+ break;
+ }
+
+#if CONFIG_CWG_C013
+ const int max_tile_size = (level >= SEQ_LEVEL_7_0 && level <= SEQ_LEVEL_8_3)
+ ? MAX_TILE_AREA_LEVEL_7_AND_ABOVE
+ : MAX_TILE_AREA;
+#else
+ const int max_tile_size = MAX_TILE_AREA;
+#endif
+ if (level_stats->max_tile_size > max_tile_size) {
+ fail_id = TILE_TOO_LARGE;
+ break;
+ }
+
+ if (level_stats->max_superres_tile_width > MAX_TILE_WIDTH) {
+ fail_id = SUPERRES_TILE_WIDTH_TOO_LARGE;
+ break;
+ }
+
+ if (level_stats->min_cropped_tile_width < MIN_CROPPED_TILE_WIDTH) {
+ fail_id = CROPPED_TILE_WIDTH_TOO_SMALL;
+ break;
+ }
+
+ if (level_stats->min_cropped_tile_height < MIN_CROPPED_TILE_HEIGHT) {
+ fail_id = CROPPED_TILE_HEIGHT_TOO_SMALL;
+ break;
+ }
+
+ if (level_stats->min_frame_width < MIN_FRAME_WIDTH) {
+ fail_id = LUMA_PIC_H_SIZE_TOO_SMALL;
+ break;
+ }
+
+ if (level_stats->min_frame_height < MIN_FRAME_HEIGHT) {
+ fail_id = LUMA_PIC_V_SIZE_TOO_SMALL;
+ break;
+ }
+
+ if (!level_stats->tile_width_is_valid) {
+ fail_id = TILE_WIDTH_INVALID;
+ break;
+ }
+
+ const double min_cr = get_min_cr(target_level_spec, tier, is_still_picture,
+ level_spec->max_decode_rate);
+ if (level_stats->min_cr < min_cr) {
+ fail_id = CR_TOO_SMALL;
+ break;
+ }
+
+ if (check_bitrate) {
+ // Check average bitrate instead of max_bitrate.
+ const double bitrate_limit =
+ get_max_bitrate(target_level_spec, tier, profile);
+ const double avg_bitrate = level_stats->total_compressed_size * 8.0 /
+ level_stats->total_time_encoded;
+ if (avg_bitrate > bitrate_limit) {
+ fail_id = BITRATE_TOO_HIGH;
+ break;
+ }
+ }
+
+ if (target_level_spec->level > SEQ_LEVEL_5_1) {
+ int temporal_parallel_num;
+ int temporal_parallel_denom;
+ const int scalability_mode_idc = -1;
+ get_temporal_parallel_params(scalability_mode_idc, &temporal_parallel_num,
+ &temporal_parallel_denom);
+ const int val = level_stats->max_tile_size * level_spec->max_header_rate *
+ temporal_parallel_denom / temporal_parallel_num;
+ if (val > MAX_TILE_SIZE_HEADER_RATE_PRODUCT) {
+ fail_id = TILE_SIZE_HEADER_RATE_TOO_HIGH;
+ break;
+ }
+ }
+ } while (0);
+
+ return fail_id;
+}
+
+static void get_tile_stats(const AV1_COMMON *const cm,
+ const TileDataEnc *const tile_data,
+ int *max_tile_size, int *max_superres_tile_width,
+ int *min_cropped_tile_width,
+ int *min_cropped_tile_height,
+ int *tile_width_valid) {
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ const int superres_scale_denominator = cm->superres_scale_denominator;
+
+ *max_tile_size = 0;
+ *max_superres_tile_width = 0;
+ *min_cropped_tile_width = INT_MAX;
+ *min_cropped_tile_height = INT_MAX;
+ *tile_width_valid = 1;
+
+ for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ const TileInfo *const tile_info =
+ &tile_data[tile_row * cm->tiles.cols + tile_col].tile_info;
+ const int tile_width =
+ (tile_info->mi_col_end - tile_info->mi_col_start) * MI_SIZE;
+ const int tile_height =
+ (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE;
+ const int tile_size = tile_width * tile_height;
+ *max_tile_size = AOMMAX(*max_tile_size, tile_size);
+
+      const int superres_tile_width =
+          tile_width * superres_scale_denominator / SCALE_NUMERATOR;
+      *max_superres_tile_width =
+          AOMMAX(*max_superres_tile_width, superres_tile_width);
+
+ const int cropped_tile_width =
+ cm->width - tile_info->mi_col_start * MI_SIZE;
+ const int cropped_tile_height =
+ cm->height - tile_info->mi_row_start * MI_SIZE;
+ *min_cropped_tile_width =
+ AOMMIN(*min_cropped_tile_width, cropped_tile_width);
+ *min_cropped_tile_height =
+ AOMMIN(*min_cropped_tile_height, cropped_tile_height);
+
+ const int is_right_most_tile =
+ tile_info->mi_col_end == cm->mi_params.mi_cols;
+ if (!is_right_most_tile) {
+ if (av1_superres_scaled(cm))
+ *tile_width_valid &= tile_width >= 128;
+ else
+ *tile_width_valid &= tile_width >= 64;
+ }
+ }
+ }
+}
+
+static int store_frame_record(int64_t ts_start, int64_t ts_end,
+ size_t encoded_size, int pic_size,
+ int frame_header_count, int tiles, int show_frame,
+ int show_existing_frame,
+ FrameWindowBuffer *const buffer) {
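+  // The window is a ring buffer: once it is full, the oldest record is
+  // overwritten by advancing 'start'.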
+ if (buffer->num < FRAME_WINDOW_SIZE) {
+ ++buffer->num;
+ } else {
+ buffer->start = (buffer->start + 1) % FRAME_WINDOW_SIZE;
+ }
+ const int new_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE;
+ FrameRecord *const record = &buffer->buf[new_idx];
+ record->ts_start = ts_start;
+ record->ts_end = ts_end;
+ record->encoded_size_in_bytes = encoded_size;
+ record->pic_size = pic_size;
+ record->frame_header_count = frame_header_count;
+ record->tiles = tiles;
+ record->show_frame = show_frame;
+ record->show_existing_frame = show_existing_frame;
+
+ return new_idx;
+}
+
+// Count the number of frames encoded in the last "duration" ticks, in display
+// time.
+static int count_frames(const FrameWindowBuffer *const buffer,
+ int64_t duration) {
+ const int current_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE;
+ // Assume current frame is shown frame.
+ assert(buffer->buf[current_idx].show_frame);
+
+ const int64_t current_time = buffer->buf[current_idx].ts_end;
+ const int64_t time_limit = AOMMAX(current_time - duration, 0);
+ int num_frames = 1;
+ int index = current_idx - 1;
+ for (int i = buffer->num - 2; i >= 0; --i, --index, ++num_frames) {
+ if (index < 0) index = FRAME_WINDOW_SIZE - 1;
+ const FrameRecord *const record = &buffer->buf[index];
+ if (!record->show_frame) continue;
+ const int64_t ts_start = record->ts_start;
+ if (ts_start < time_limit) break;
+ }
+
+ return num_frames;
+}
+
+// Scan previously encoded frames and update level metrics accordingly.
+static void scan_past_frames(const FrameWindowBuffer *const buffer,
+ int num_frames_to_scan,
+ AV1LevelSpec *const level_spec,
+ AV1LevelStats *const level_stats) {
+ const int num_frames_in_buffer = buffer->num;
+ int index = (buffer->start + num_frames_in_buffer - 1) % FRAME_WINDOW_SIZE;
+ int frame_headers = 0;
+ int tiles = 0;
+ int64_t display_samples = 0;
+ int64_t decoded_samples = 0;
+ size_t encoded_size_in_bytes = 0;
+ for (int i = 0; i < AOMMIN(num_frames_in_buffer, num_frames_to_scan); ++i) {
+ const FrameRecord *const record = &buffer->buf[index];
+ if (!record->show_existing_frame) {
+ frame_headers += record->frame_header_count;
+ decoded_samples += record->pic_size;
+ }
+ if (record->show_frame) {
+ display_samples += record->pic_size;
+ }
+ tiles += record->tiles;
+ encoded_size_in_bytes += record->encoded_size_in_bytes;
+ --index;
+ if (index < 0) index = FRAME_WINDOW_SIZE - 1;
+ }
+ level_spec->max_header_rate =
+ AOMMAX(level_spec->max_header_rate, frame_headers);
+  // TODO(huisu): we can now compute the max display rate with the decoder
+  // model, so these lines can be removed. Keep them here for a while for
+  // debugging purposes.
+ level_spec->max_display_rate =
+ AOMMAX(level_spec->max_display_rate, display_samples);
+ level_spec->max_decode_rate =
+ AOMMAX(level_spec->max_decode_rate, decoded_samples);
+ level_spec->max_tile_rate = AOMMAX(level_spec->max_tile_rate, tiles);
+ level_stats->max_bitrate =
+ AOMMAX(level_stats->max_bitrate,
+ (int)AOMMIN(encoded_size_in_bytes * 8, (size_t)INT_MAX));
+}
+
+void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start,
+ int64_t ts_end) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AV1LevelParams *const level_params = &cpi->ppi->level_params;
+
+ const int upscaled_width = cm->superres_upscaled_width;
+ const int width = cm->width;
+ const int height = cm->height;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+ const int tiles = tile_cols * tile_rows;
+ const int luma_pic_size = upscaled_width * height;
+ const int frame_header_count = cpi->frame_header_count;
+ const int show_frame = cm->show_frame;
+ const int show_existing_frame = cm->show_existing_frame;
+
+ int max_tile_size;
+ int min_cropped_tile_width;
+ int min_cropped_tile_height;
+ int max_superres_tile_width;
+ int tile_width_is_valid;
+ get_tile_stats(cm, cpi->tile_data, &max_tile_size, &max_superres_tile_width,
+ &min_cropped_tile_width, &min_cropped_tile_height,
+ &tile_width_is_valid);
+
+ const double compression_ratio = av1_get_compression_ratio(cm, size);
+
+ const int temporal_layer_id = cm->temporal_layer_id;
+ const int spatial_layer_id = cm->spatial_layer_id;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const BITSTREAM_PROFILE profile = seq_params->profile;
+ const int is_still_picture = seq_params->still_picture;
+ // update level_stats
+  // TODO(kyslov@) fix the implementation according to the buffer model
+ for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; ++i) {
+ if (!is_in_operating_point(seq_params->operating_point_idc[i],
+ temporal_layer_id, spatial_layer_id) ||
+ !((level_params->keep_level_stats >> i) & 1)) {
+ continue;
+ }
+
+ AV1LevelInfo *const level_info = level_params->level_info[i];
+ assert(level_info != NULL);
+ AV1LevelStats *const level_stats = &level_info->level_stats;
+
+ level_stats->max_tile_size =
+ AOMMAX(level_stats->max_tile_size, max_tile_size);
+ level_stats->max_superres_tile_width =
+ AOMMAX(level_stats->max_superres_tile_width, max_superres_tile_width);
+ level_stats->min_cropped_tile_width =
+ AOMMIN(level_stats->min_cropped_tile_width, min_cropped_tile_width);
+ level_stats->min_cropped_tile_height =
+ AOMMIN(level_stats->min_cropped_tile_height, min_cropped_tile_height);
+ level_stats->tile_width_is_valid &= tile_width_is_valid;
+ level_stats->min_frame_width = AOMMIN(level_stats->min_frame_width, width);
+ level_stats->min_frame_height =
+ AOMMIN(level_stats->min_frame_height, height);
+ level_stats->min_cr = AOMMIN(level_stats->min_cr, compression_ratio);
+ level_stats->total_compressed_size += (double)size;
+
+ // update level_spec
+ // TODO(kyslov@) update all spec fields
+ AV1LevelSpec *const level_spec = &level_info->level_spec;
+ level_spec->max_picture_size =
+ AOMMAX(level_spec->max_picture_size, luma_pic_size);
+ level_spec->max_h_size =
+ AOMMAX(level_spec->max_h_size, cm->superres_upscaled_width);
+ level_spec->max_v_size = AOMMAX(level_spec->max_v_size, height);
+ level_spec->max_tile_cols = AOMMAX(level_spec->max_tile_cols, tile_cols);
+ level_spec->max_tiles = AOMMAX(level_spec->max_tiles, tiles);
+
+    // Store the current frame's info in the FrameWindowBuffer.
+ FrameWindowBuffer *const buffer = &level_info->frame_window_buffer;
+ store_frame_record(ts_start, ts_end, size, luma_pic_size,
+ frame_header_count, tiles, show_frame,
+ show_existing_frame, buffer);
+ if (show_frame) {
+ // Count the number of frames encoded in the past 1 second.
+      const int encoded_frames_in_last_second =
+          count_frames(buffer, TICKS_PER_SEC);
+ scan_past_frames(buffer, encoded_frames_in_last_second, level_spec,
+ level_stats);
+ level_stats->total_time_encoded +=
+ (cpi->time_stamps.prev_ts_end - cpi->time_stamps.prev_ts_start) /
+ (double)TICKS_PER_SEC;
+ }
+
+ DECODER_MODEL *const decoder_models = level_info->decoder_models;
+ for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) {
+ av1_decoder_model_process_frame(cpi, size << 3, &decoder_models[level]);
+ }
+
+ // Check whether target level is met.
+ const AV1_LEVEL target_level = level_params->target_seq_level_idx[i];
+ if (target_level < SEQ_LEVELS && cpi->oxcf.strict_level_conformance) {
+ assert(is_valid_seq_level_idx(target_level));
+ const int tier = seq_params->tier[i];
+ const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints(
+ level_info, target_level, tier, is_still_picture, profile, 0);
+ if (fail_id != TARGET_LEVEL_OK) {
+ const int target_level_major = 2 + (target_level >> 2);
+ const int target_level_minor = target_level & 3;
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "Failed to encode to the target level %d_%d. %s",
+ target_level_major, target_level_minor,
+ level_fail_messages[fail_id]);
+ }
+ }
+ }
+}
+
+aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params,
+ const AV1LevelParams *level_params,
+ int *seq_level_idx) {
+ const int is_still_picture = seq_params->still_picture;
+ const BITSTREAM_PROFILE profile = seq_params->profile;
+ for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) {
+ seq_level_idx[op] = (int)SEQ_LEVEL_MAX;
+ if (!((level_params->keep_level_stats >> op) & 1)) continue;
+ const int tier = seq_params->tier[op];
+ const AV1LevelInfo *const level_info = level_params->level_info[op];
+ assert(level_info != NULL);
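+    // Scan levels from the lowest upward and report the first one whose
+    // constraints are all satisfied.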
+ for (int level = 0; level < SEQ_LEVELS; ++level) {
+ if (!is_valid_seq_level_idx(level)) continue;
+ const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints(
+ level_info, level, tier, is_still_picture, profile, 1);
+ if (fail_id == TARGET_LEVEL_OK) {
+ seq_level_idx[op] = level;
+ break;
+ }
+ }
+ }
+
+ return AOM_CODEC_OK;
+}
+
+aom_codec_err_t av1_get_target_seq_level_idx(const SequenceHeader *seq_params,
+ const AV1LevelParams *level_params,
+ int *target_seq_level_idx) {
+ for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) {
+ target_seq_level_idx[op] = (int)SEQ_LEVEL_MAX;
+ if (!((level_params->keep_level_stats >> op) & 1)) continue;
+ target_seq_level_idx[op] = level_params->target_seq_level_idx[op];
+ }
+
+ return AOM_CODEC_OK;
+}
diff --git a/third_party/aom/av1/encoder/level.h b/third_party/aom/av1/encoder/level.h
new file mode 100644
index 0000000000..ebf2a1c19d
--- /dev/null
+++ b/third_party/aom/av1/encoder/level.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_LEVEL_H_
+#define AOM_AV1_ENCODER_LEVEL_H_
+
+#include "av1/common/enums.h"
+
+struct AV1_COMP;
+
+// AV1 Level Specifications
+typedef struct {
+ AV1_LEVEL level;
+ int max_picture_size;
+ int max_h_size;
+ int max_v_size;
+ int max_header_rate;
+ int max_tile_rate;
+ int max_tiles;
+ int max_tile_cols;
+ int64_t max_display_rate;
+ int64_t max_decode_rate;
+ double main_mbps;
+ double high_mbps;
+ double main_cr;
+ double high_cr;
+} AV1LevelSpec;
+
+typedef struct {
+ int64_t ts_start;
+ int64_t ts_end;
+ size_t encoded_size_in_bytes;
+ int pic_size;
+ int frame_header_count;
+ int tiles;
+ int show_frame;
+ int show_existing_frame;
+} FrameRecord;
+
+// Record frame info in a rolling window.
+#define FRAME_WINDOW_SIZE 256
+typedef struct {
+ FrameRecord buf[FRAME_WINDOW_SIZE];
+ int num; // Number of FrameRecord stored in the buffer.
+ int start; // Buffer index of the first FrameRecord.
+} FrameWindowBuffer;
+
+typedef struct {
+ int max_bitrate; // Max bitrate in any 1-second window, in bps.
+ int max_tile_size;
+ int max_superres_tile_width;
+ int min_cropped_tile_width;
+ int min_cropped_tile_height;
+ int tile_width_is_valid;
+ int min_frame_width;
+ int min_frame_height;
+ double total_compressed_size; // In bytes.
+ double total_time_encoded; // In seconds.
+ double min_cr;
+} AV1LevelStats;
+
+// The following data structures are for the decoder model.
+typedef struct {
+ int decoder_ref_count;
+ int player_ref_count;
+ int display_index;
+ FRAME_TYPE frame_type;
+ double presentation_time;
+} FRAME_BUFFER;
+
+// Interval of bit transmission for a DFG (Decodable Frame Group).
+typedef struct {
+ double first_bit_arrival_time; // Time when the first bit arrives.
+ double last_bit_arrival_time; // Time when the last bit arrives.
+ // Removal time means the time when the bits to be decoded are removed from
+ // the smoothing buffer. Removal time is essentially the time when the
+ // decoding of the frame starts.
+ double removal_time;
+} DFG_INTERVAL;
+
+#define DFG_INTERVAL_QUEUE_SIZE 64
+typedef struct {
+ int head;
+ int size;
+ double total_interval;
+ DFG_INTERVAL buf[DFG_INTERVAL_QUEUE_SIZE];
+} DFG_INTERVAL_QUEUE;
+
+enum {
+ RESOURCE_MODE = 0, // Resource availability mode.
+ SCHEDULE_MODE // Decoding schedule mode.
+} UENUM1BYTE(DECODER_MODEL_MODE);
+
+enum {
+ DECODER_MODEL_OK = 0,
+ DECODE_BUFFER_AVAILABLE_LATE,
+ DECODE_FRAME_BUF_UNAVAILABLE,
+ DECODE_EXISTING_FRAME_BUF_EMPTY,
+ DISPLAY_FRAME_LATE,
+ SMOOTHING_BUFFER_UNDERFLOW,
+ SMOOTHING_BUFFER_OVERFLOW,
+ DECODER_MODEL_DISABLED
+} UENUM1BYTE(DECODER_MODEL_STATUS);
+
+#define BUFFER_POOL_MAX_SIZE 10
+typedef struct {
+ DECODER_MODEL_STATUS status;
+ DECODER_MODEL_MODE mode;
+ bool is_low_delay_mode;
+ AV1_LEVEL level;
+ int encoder_buffer_delay; // In units of 1/90000 seconds.
+ int decoder_buffer_delay; // In units of 1/90000 seconds.
+ int num_ticks_per_picture;
+ int initial_display_delay; // In units of frames.
+ int64_t decode_rate;
+ double display_clock_tick; // In units of seconds.
+ double current_time; // In units of seconds.
+ double initial_presentation_delay; // In units of seconds.
+ double bit_rate; // Bits per second.
+
+ int num_frame;
+ int num_decoded_frame;
+ int num_shown_frame;
+ int vbi[REF_FRAMES]; // Virtual buffer index.
+ FRAME_BUFFER frame_buffer_pool[BUFFER_POOL_MAX_SIZE];
+ DFG_INTERVAL_QUEUE dfg_interval_queue;
+
+  // Information for the DFG (Decodable Frame Group) being processed.
+ double first_bit_arrival_time;
+ double last_bit_arrival_time;
+ size_t coded_bits;
+
+ // Information for the frame being processed.
+ double removal_time;
+ double presentation_time;
+ int decode_samples;
+ int display_samples;
+
+ double max_display_rate;
+ double max_decode_rate;
+} DECODER_MODEL;
+
+typedef struct {
+ AV1LevelStats level_stats;
+ AV1LevelSpec level_spec;
+ FrameWindowBuffer frame_window_buffer;
+ DECODER_MODEL decoder_models[SEQ_LEVELS];
+} AV1LevelInfo;
+
+typedef struct AV1LevelParams {
+ // Specifies the level that the coded video sequence conforms to for each
+ // operating point.
+ AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+ // Bit mask to indicate whether to keep level stats for corresponding
+ // operating points.
+ uint32_t keep_level_stats;
+ // Level information for each operating point.
+ AV1LevelInfo *level_info[MAX_NUM_OPERATING_POINTS];
+} AV1LevelParams;
+
+static INLINE int is_in_operating_point(int operating_point,
+ int temporal_layer_id,
+ int spatial_layer_id) {
+ if (!operating_point) return 1;
+
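+  // For example, operating_point = 0x103 selects temporal layers 0-1
+  // (bits 0-1) and spatial layer 0 (bit 8), so (0x103, tid = 1, sid = 0)
+  // returns 1 while (0x103, tid = 2, sid = 0) returns 0.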
+ return ((operating_point >> temporal_layer_id) & 1) &&
+ ((operating_point >> (spatial_layer_id + 8)) & 1);
+}
+
+void av1_init_level_info(struct AV1_COMP *cpi);
+
+void av1_update_level_info(struct AV1_COMP *cpi, size_t size, int64_t ts_start,
+ int64_t ts_end);
+
+// Return sequence level indices in seq_level_idx[MAX_NUM_OPERATING_POINTS].
+aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params,
+ const AV1LevelParams *level_params,
+ int *seq_level_idx);
+
+aom_codec_err_t av1_get_target_seq_level_idx(const SequenceHeader *seq_params,
+ const AV1LevelParams *level_params,
+ int *target_seq_level_idx);
+
+// Print the status of the decoder model (for debugging).
+void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model);
+
+void av1_decoder_model_init(const struct AV1_COMP *const cpi, AV1_LEVEL level,
+ int op_index, DECODER_MODEL *const decoder_model);
+
+void av1_decoder_model_process_frame(const struct AV1_COMP *const cpi,
+ size_t coded_bits,
+ DECODER_MODEL *const decoder_model);
+
+// This function uses the decoder model to check whether there could be
+// SMOOTHING_BUFFER_UNDERFLOW or SMOOTHING_BUFFER_OVERFLOW. It does not
+// update the content of decoder_model, and can be used to target certain
+// encoding level in the recode loop.
+DECODER_MODEL_STATUS av1_decoder_model_try_smooth_buf(
+ const struct AV1_COMP *const cpi, size_t coded_bits,
+ const DECODER_MODEL *const decoder_model);
+
+// Return the max bitrate (in bps) for the given level.
+double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier,
+ BITSTREAM_PROFILE profile);
+
+// Get the max number of tiles and tile columns for the given level.
+void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles,
+ int *const max_tile_cols);
+
+// Return the minimum compression ratio for the given level.
+double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier,
+ int is_still_picture);
+#endif // AOM_AV1_ENCODER_LEVEL_H_
diff --git a/third_party/aom/av1/encoder/lookahead.c b/third_party/aom/av1/encoder/lookahead.c
new file mode 100644
index 0000000000..9ef9b88675
--- /dev/null
+++ b/third_party/aom/av1/encoder/lookahead.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+#include "av1/common/common.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/lookahead.h"
+
+/* Return the buffer at the given absolute index and increment the index */
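+/* For example, with max_sz = 5 and *idx = 4, this returns ctx->buf + 4 and
+ * wraps *idx back to 0. */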
+static struct lookahead_entry *pop(struct lookahead_ctx *ctx, int *idx) {
+ int index = *idx;
+ struct lookahead_entry *buf = ctx->buf + index;
+
+ assert(index < ctx->max_sz);
+ if (++index >= ctx->max_sz) index -= ctx->max_sz;
+ *idx = index;
+ return buf;
+}
+
+void av1_lookahead_destroy(struct lookahead_ctx *ctx) {
+ if (ctx) {
+ if (ctx->buf) {
+ int i;
+
+ for (i = 0; i < ctx->max_sz; i++) aom_free_frame_buffer(&ctx->buf[i].img);
+ free(ctx->buf);
+ }
+ free(ctx);
+ }
+}
+
+struct lookahead_ctx *av1_lookahead_init(
+ unsigned int width, unsigned int height, unsigned int subsampling_x,
+ unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
+ const int border_in_pixels, int byte_alignment, int num_lap_buffers,
+ bool is_all_intra, int num_pyramid_levels) {
+ int lag_in_frames = AOMMAX(1, depth);
+
+ // For all-intra frame encoding, previous source frames are not required.
+ // Hence max_pre_frames is set to 0 in this case. As previous source frames
+ // are accessed using a negative index to av1_lookahead_peek(), setting
+ // max_pre_frames to 0 will cause av1_lookahead_peek() to return NULL for a
+ // negative index.
+ const uint8_t max_pre_frames = is_all_intra ? 0 : MAX_PRE_FRAMES;
+
+ // Add the lags to depth and clamp
+ depth += num_lap_buffers;
+ depth = clamp(depth, 1, MAX_TOTAL_BUFFERS);
+
+ // Allocate memory to keep previous source frames available.
+ depth += max_pre_frames;
+
+ // Allocate the lookahead structures
+ struct lookahead_ctx *ctx = calloc(1, sizeof(*ctx));
+ if (ctx) {
+ unsigned int i;
+ ctx->max_sz = depth;
+ ctx->push_frame_count = 0;
+ ctx->max_pre_frames = max_pre_frames;
+ ctx->read_ctxs[ENCODE_STAGE].pop_sz = ctx->max_sz - ctx->max_pre_frames;
+ ctx->read_ctxs[ENCODE_STAGE].valid = 1;
+ if (num_lap_buffers) {
+ ctx->read_ctxs[LAP_STAGE].pop_sz = lag_in_frames;
+ ctx->read_ctxs[LAP_STAGE].valid = 1;
+ }
+ ctx->buf = calloc(depth, sizeof(*ctx->buf));
+ if (!ctx->buf) goto fail;
+ for (i = 0; i < depth; i++) {
+ if (aom_realloc_frame_buffer(
+ &ctx->buf[i].img, width, height, subsampling_x, subsampling_y,
+ use_highbitdepth, border_in_pixels, byte_alignment, NULL, NULL,
+ NULL, num_pyramid_levels, 0)) {
+ goto fail;
+ }
+ }
+ }
+ return ctx;
+fail:
+ av1_lookahead_destroy(ctx);
+ return NULL;
+}
+
+int av1_lookahead_full(const struct lookahead_ctx *ctx) {
+ // TODO(angiebird): Test this function.
+ return ctx->read_ctxs[ENCODE_STAGE].sz >= ctx->read_ctxs[ENCODE_STAGE].pop_sz;
+}
+
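+// Returns 0 on success, or 1 if the queue is full or a frame-buffer
+// (re)allocation or metadata copy fails.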
+int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src,
+ int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+ int num_pyramid_levels, aom_enc_frame_flags_t flags) {
+ int width = src->y_crop_width;
+ int height = src->y_crop_height;
+ int uv_width = src->uv_crop_width;
+ int uv_height = src->uv_crop_height;
+ int subsampling_x = src->subsampling_x;
+ int subsampling_y = src->subsampling_y;
+ int larger_dimensions, new_dimensions;
+
+ assert(ctx->read_ctxs[ENCODE_STAGE].valid == 1);
+ if (ctx->read_ctxs[ENCODE_STAGE].sz + ctx->max_pre_frames > ctx->max_sz)
+ return 1;
+
+ ctx->read_ctxs[ENCODE_STAGE].sz++;
+ if (ctx->read_ctxs[LAP_STAGE].valid) {
+ ctx->read_ctxs[LAP_STAGE].sz++;
+ }
+
+ struct lookahead_entry *buf = pop(ctx, &ctx->write_idx);
+
+ new_dimensions = width != buf->img.y_crop_width ||
+ height != buf->img.y_crop_height ||
+ uv_width != buf->img.uv_crop_width ||
+ uv_height != buf->img.uv_crop_height;
+ larger_dimensions = width > buf->img.y_width || height > buf->img.y_height ||
+ uv_width > buf->img.uv_width ||
+ uv_height > buf->img.uv_height;
+ assert(!larger_dimensions || new_dimensions);
+
+ if (larger_dimensions) {
+ YV12_BUFFER_CONFIG new_img;
+ memset(&new_img, 0, sizeof(new_img));
+ if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x,
+ subsampling_y, use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, 0, num_pyramid_levels, 0))
+ return 1;
+ aom_free_frame_buffer(&buf->img);
+ buf->img = new_img;
+ } else if (new_dimensions) {
+ buf->img.y_crop_width = src->y_crop_width;
+ buf->img.y_crop_height = src->y_crop_height;
+ buf->img.uv_crop_width = src->uv_crop_width;
+ buf->img.uv_crop_height = src->uv_crop_height;
+ buf->img.subsampling_x = src->subsampling_x;
+ buf->img.subsampling_y = src->subsampling_y;
+ }
+ // Partial copy not implemented yet
+ av1_copy_and_extend_frame(src, &buf->img);
+
+ buf->ts_start = ts_start;
+ buf->ts_end = ts_end;
+ buf->display_idx = ctx->push_frame_count;
+ buf->flags = flags;
+ ++ctx->push_frame_count;
+ aom_remove_metadata_from_frame_buffer(&buf->img);
+ if (src->metadata &&
+ aom_copy_metadata_to_frame_buffer(&buf->img, src->metadata)) {
+ return 1;
+ }
+ return 0;
+}
+
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain,
+ COMPRESSOR_STAGE stage) {
+ struct lookahead_entry *buf = NULL;
+ if (ctx) {
+ struct read_ctx *read_ctx = &ctx->read_ctxs[stage];
+ assert(read_ctx->valid == 1);
+ if (read_ctx->sz && (drain || read_ctx->sz == read_ctx->pop_sz)) {
+ buf = pop(ctx, &read_ctx->read_idx);
+ read_ctx->sz--;
+ }
+ }
+ return buf;
+}
+
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
+ COMPRESSOR_STAGE stage) {
+ struct lookahead_entry *buf = NULL;
+ if (ctx == NULL) {
+ return buf;
+ }
+
+ struct read_ctx *read_ctx = &ctx->read_ctxs[stage];
+ assert(read_ctx->valid == 1);
+ if (index >= 0) {
+ // Forward peek
+ if (index < read_ctx->sz) {
+ index += read_ctx->read_idx;
+ if (index >= ctx->max_sz) index -= ctx->max_sz;
+ buf = ctx->buf + index;
+ }
+ } else if (index < 0) {
+ // Backward peek
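+    // For example, with read_idx = 0, max_sz = 5 and max_pre_frames >= 1,
+    // index = -1 returns ctx->buf + 4.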
+ if (-index <= ctx->max_pre_frames) {
+ index += (int)(read_ctx->read_idx);
+ if (index < 0) index += (int)(ctx->max_sz);
+ buf = ctx->buf + index;
+ }
+ }
+
+ return buf;
+}
+
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx,
+ COMPRESSOR_STAGE stage) {
+ assert(ctx != NULL);
+
+ struct read_ctx *read_ctx = &ctx->read_ctxs[stage];
+ assert(read_ctx->valid == 1);
+ return read_ctx->sz;
+}
+
+int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage) {
+ assert(ctx != NULL);
+
+ struct read_ctx *read_ctx = &ctx->read_ctxs[stage];
+ assert(read_ctx->valid == 1);
+ return read_ctx->pop_sz;
+}
diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h
new file mode 100644
index 0000000000..c0e6d222f5
--- /dev/null
+++ b/third_party/aom/av1/encoder/lookahead.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes look ahead buffer operations.
+ */
+#ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_
+#define AOM_AV1_ENCODER_LOOKAHEAD_H_
+
+#include <stdbool.h>
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+#define MAX_LAG_BUFFERS 48
+#define MAX_LAP_BUFFERS 48
+#define MAX_TOTAL_BUFFERS (MAX_LAG_BUFFERS + MAX_LAP_BUFFERS)
+#define LAP_LAG_IN_FRAMES 17
+
+struct lookahead_entry {
+ YV12_BUFFER_CONFIG img;
+ int64_t ts_start;
+ int64_t ts_end;
+ int display_idx;
+ aom_enc_frame_flags_t flags;
+};
+
+// The maximum number of past frames we want to keep in the queue.
+#define MAX_PRE_FRAMES 1
+
+enum { ENCODE_STAGE, LAP_STAGE, MAX_STAGES } UENUM1BYTE(COMPRESSOR_STAGE);
+
+struct read_ctx {
+ int sz; /* Number of buffers currently in the queue */
+ int read_idx; /* Read index */
+ int pop_sz; /* Size to check for pop condition */
+ int valid; /* Is this ctx valid? */
+};
+
+struct lookahead_ctx {
+ int max_sz; /* Absolute size of the queue */
+ int write_idx; /* Write index */
+ struct read_ctx read_ctxs[MAX_STAGES]; /* Read context */
+ struct lookahead_entry *buf; /* Buffer list */
+  int push_frame_count; /* Number of frames that have been pushed into the
+                           queue */
+ uint8_t
+ max_pre_frames; /* Maximum number of past frames allowed in the queue */
+};
+/*!\endcond */
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ */
+struct lookahead_ctx *av1_lookahead_init(
+ unsigned int width, unsigned int height, unsigned int subsampling_x,
+ unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
+ const int border_in_pixels, int byte_alignment, int num_lap_buffers,
+ bool is_all_intra, int num_pyramid_levels);
+
+/**\brief Destroys the lookahead stage
+ */
+void av1_lookahead_destroy(struct lookahead_ctx *ctx);
+
+/**\brief Check if lookahead buffer is full
+ */
+int av1_lookahead_full(const struct lookahead_ctx *ctx);
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] src Pointer to the image to enqueue
+ * \param[in] ts_start Timestamp for the start of this frame
+ * \param[in] ts_end Timestamp for the end of this frame
+ * \param[in] use_highbitdepth Whether high bitdepth is used
+ * \param[in] num_pyramid_levels Number of pyramid levels to allocate
+ *                               for each frame buffer
+ * \param[in] flags Flags set on this frame
+ */
+int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src,
+ int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+ int num_pyramid_levels, aom_enc_frame_flags_t flags);
+
+/**\brief Get the next source buffer to encode
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] drain Flag indicating the buffer should be drained
+ * (return a buffer regardless of the current queue depth)
+ * \param[in] stage Encoder stage
+ *
+ * \retval NULL if drain is set and the queue is empty, or if drain is not set
+ * and the queue has not reached the configured depth.
+ */
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain,
+ COMPRESSOR_STAGE stage);
+
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] index Index of the frame to be returned, 0 == next frame
+ * \param[in] stage Encoder stage
+ *
+ * \retval NULL if no buffer exists at the specified index
+ */
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
+ COMPRESSOR_STAGE stage);
+
+/**\brief Get the number of frames currently in the lookahead queue
+ */
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx,
+ COMPRESSOR_STAGE stage);
+
+/**\brief Get pop_sz value
+ */
+int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_LOOKAHEAD_H_
diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c
new file mode 100644
index 0000000000..4e53447379
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp.c
@@ -0,0 +1,3998 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/common.h"
+#include "av1/common/filter.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+
+static INLINE void init_mv_cost_params(MV_COST_PARAMS *mv_cost_params,
+ const MvCosts *mv_costs,
+ const MV *ref_mv, int errorperbit,
+ int sadperbit) {
+ mv_cost_params->ref_mv = ref_mv;
+ mv_cost_params->full_ref_mv = get_fullmv_from_mv(ref_mv);
+ mv_cost_params->mv_cost_type = MV_COST_ENTROPY;
+ mv_cost_params->error_per_bit = errorperbit;
+ mv_cost_params->sad_per_bit = sadperbit;
+ // For allintra encoding mode, 'mv_costs' is not allocated. Hence, the
+ // population of mvjcost and mvcost are avoided. In case of IntraBC, these
+ // values are populated from 'dv_costs' in av1_set_ms_to_intra_mode().
+ if (mv_costs != NULL) {
+ mv_cost_params->mvjcost = mv_costs->nmv_joint_cost;
+ mv_cost_params->mvcost[0] = mv_costs->mv_cost_stack[0];
+ mv_cost_params->mvcost[1] = mv_costs->mv_cost_stack[1];
+ }
+}
+
+static INLINE void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) {
+ ms_buffers->ref = &x->e_mbd.plane[0].pre[0];
+ ms_buffers->src = &x->plane[0].src;
+
+ av1_set_ms_compound_refs(ms_buffers, NULL, NULL, 0, 0);
+
+ ms_buffers->wsrc = x->obmc_buffer.wsrc;
+ ms_buffers->obmc_mask = x->obmc_buffer.mask;
+}
+
+void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer) {
+ obmc_buffer->wsrc = NULL;
+ obmc_buffer->mask = NULL;
+ obmc_buffer->above_pred = NULL;
+ obmc_buffer->left_pred = NULL;
+}
+
+void av1_make_default_fullpel_ms_params(
+ FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv,
+ const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
+ SEARCH_METHODS search_method, int fine_search_interval) {
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+ const int is_key_frame =
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE;
+
+ // High level params
+ ms_params->bsize = bsize;
+ ms_params->vfp = &cpi->ppi->fn_ptr[bsize];
+
+ init_ms_buffers(&ms_params->ms_buffers, x);
+
+ av1_set_mv_search_method(ms_params, search_sites, search_method);
+
+ ms_params->mesh_patterns[0] = mv_sf->mesh_patterns;
+ ms_params->mesh_patterns[1] = mv_sf->intrabc_mesh_patterns;
+ ms_params->force_mesh_thresh = mv_sf->exhaustive_searches_thresh;
+ ms_params->prune_mesh_search =
+ (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_2) ? 1 : 0;
+ ms_params->mesh_search_mv_diff_threshold = 4;
+ ms_params->run_mesh_search = 0;
+ ms_params->fine_search_interval = fine_search_interval;
+
+ ms_params->is_intra_mode = 0;
+
+ ms_params->fast_obmc_search = mv_sf->obmc_full_pixel_search_level;
+
+ ms_params->mv_limits = x->mv_limits;
+ av1_set_mv_search_range(&ms_params->mv_limits, ref_mv);
+
+ // Mvcost params
+ init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv,
+ x->errorperbit, x->sadperbit);
+
+ ms_params->sdf = ms_params->vfp->sdf;
+ ms_params->sdx4df = ms_params->vfp->sdx4df;
+ ms_params->sdx3df = ms_params->vfp->sdx3df;
+
+ if (mv_sf->use_downsampled_sad == 2 && block_size_high[bsize] >= 16) {
+ ms_params->sdf = ms_params->vfp->sdsf;
+ ms_params->sdx4df = ms_params->vfp->sdsx4df;
+    // The skip version of sadx3 is not available yet; reuse sdsx4df instead.
+ ms_params->sdx3df = ms_params->vfp->sdsx4df;
+ } else if (mv_sf->use_downsampled_sad == 1 && block_size_high[bsize] >= 16 &&
+ !is_key_frame) {
+ FULLPEL_MV start_mv_clamped = start_mv;
+ // adjust start_mv to make sure it is within MV range
+ clamp_fullmv(&start_mv_clamped, &ms_params->mv_limits);
+
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const int ref_stride = ref->stride;
+ const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv_clamped);
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+
+ unsigned int start_mv_sad_even_rows, start_mv_sad_odd_rows;
+ start_mv_sad_even_rows =
+ ms_params->vfp->sdsf(src_buf, src_stride, best_address, ref_stride);
+ start_mv_sad_odd_rows =
+ ms_params->vfp->sdsf(src_buf + src_stride, src_stride,
+ best_address + ref_stride, ref_stride);
+
+ // If the absolute SAD difference computed between the pred-to-src of even
+ // and odd rows is small, skip every other row in sad computation.
+ const int odd_to_even_diff_sad =
+ abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows);
+ const int mult_thresh = 4;
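+    // That is, rows are skipped only when the even/odd mismatch is below a
+    // quarter of the even-row SAD.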
+ if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) {
+ ms_params->sdf = ms_params->vfp->sdsf;
+ ms_params->sdx4df = ms_params->vfp->sdsx4df;
+ ms_params->sdx3df = ms_params->vfp->sdsx4df;
+ }
+ }
+}
+
+void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const IntraBCMVCosts *dv_costs) {
+ ms_params->is_intra_mode = 1;
+
+ MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+
+ mv_cost_params->mvjcost = dv_costs->joint_mv;
+ mv_cost_params->mvcost[0] = dv_costs->dv_costs[0];
+ mv_cost_params->mvcost[1] = dv_costs->dv_costs[1];
+}
+
+void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const struct AV1_COMP *cpi,
+ const MACROBLOCK *x, BLOCK_SIZE bsize,
+ const MV *ref_mv, const int *cost_list) {
+ const AV1_COMMON *cm = &cpi->common;
+ // High level params
+ ms_params->allow_hp = cm->features.allow_high_precision_mv;
+ ms_params->forced_stop = cpi->sf.mv_sf.subpel_force_stop;
+ ms_params->iters_per_step = cpi->sf.mv_sf.subpel_iters_per_step;
+ ms_params->cost_list = cond_cost_list_const(cpi, cost_list);
+
+ av1_set_subpel_mv_search_range(&ms_params->mv_limits, &x->mv_limits, ref_mv);
+
+ // Mvcost params
+ init_mv_cost_params(&ms_params->mv_cost_params, x->mv_costs, ref_mv,
+ x->errorperbit, x->sadperbit);
+
+ // Subpel variance params
+ ms_params->var_params.vfp = &cpi->ppi->fn_ptr[bsize];
+ ms_params->var_params.subpel_search_type =
+ cpi->sf.mv_sf.use_accurate_subpel_search;
+ ms_params->var_params.w = block_size_wide[bsize];
+ ms_params->var_params.h = block_size_high[bsize];
+
+ // Ref and src buffers
+ MSBuffers *ms_buffers = &ms_params->var_params.ms_buffers;
+ init_ms_buffers(ms_buffers, x);
+}
+
+void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv) {
+ // Calculate the outermost full-pixel MVs which are inside the limits set by
+ // av1_set_subpel_mv_search_range().
+ //
+ // The subpel limits are simply mv->col +/- 8*MAX_FULL_PEL_VAL, and similar
+ // for mv->row. We can then divide by 8 to find the fullpel MV limits. But
+ // we have to be careful about the rounding. We want these bounds to be
+ // at least as tight as the subpel limits, which means that we must round
+ // the minimum values up and the maximum values down when dividing.
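+  // For example, with mv->col = 5 and M = MAX_FULL_PEL_VAL, the subpel window
+  // is [5 - 8 * M, 5 + 8 * M], so the fullpel window below is [1 - M, M]:
+  // ((5 + 7) >> 3) - M = 1 - M rounds the lower bound up, and
+  // (5 >> 3) + M = M rounds the upper bound down.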
+ int col_min = ((mv->col + 7) >> 3) - MAX_FULL_PEL_VAL;
+ int row_min = ((mv->row + 7) >> 3) - MAX_FULL_PEL_VAL;
+ int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL;
+ int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL;
+
+ col_min = AOMMAX(col_min, (MV_LOW >> 3) + 1);
+ row_min = AOMMAX(row_min, (MV_LOW >> 3) + 1);
+ col_max = AOMMIN(col_max, (MV_UPP >> 3) - 1);
+ row_max = AOMMIN(row_max, (MV_UPP >> 3) - 1);
+
+ // Get intersection of UMV window and valid MV window to reduce # of checks
+ // in diamond search.
+ if (mv_limits->col_min < col_min) mv_limits->col_min = col_min;
+ if (mv_limits->col_max > col_max) mv_limits->col_max = col_max;
+ if (mv_limits->row_min < row_min) mv_limits->row_min = row_min;
+ if (mv_limits->row_max > row_max) mv_limits->row_max = row_max;
+
+ mv_limits->col_max = AOMMAX(mv_limits->col_min, mv_limits->col_max);
+ mv_limits->row_max = AOMMAX(mv_limits->row_min, mv_limits->row_max);
+}
+
+int av1_init_search_range(int size) {
+ int sr = 0;
+  // Minimum search size no matter what the passed-in value is.
+ size = AOMMAX(16, size);
+
+ while ((size << sr) < MAX_FULL_PEL_VAL) sr++;
+
+ sr = AOMMIN(sr, MAX_MVSEARCH_STEPS - 2);
+ return sr;
+}
+
+// ============================================================================
+// Cost of motion vectors
+// ============================================================================
+// TODO(any): Adaptively adjust the regularization strength based on image size
+// and motion activity instead of using hard-coded values. It seems like we
+// roughly halve the lambda for each increase in resolution.
+// These are the multipliers used to perform regularization in motion
+// compensation when x->mv_cost_type is set to MV_COST_L1.
+// LOWRES
+#define SSE_LAMBDA_LOWRES 2 // Used by mv_cost_err_fn
+#define SAD_LAMBDA_LOWRES 32 // Used by mvsad_err_cost during full pixel search
+// MIDRES
+#define SSE_LAMBDA_MIDRES 0 // Used by mv_cost_err_fn
+#define SAD_LAMBDA_MIDRES 15 // Used by mvsad_err_cost during full pixel search
+// HDRES
+#define SSE_LAMBDA_HDRES 1 // Used by mv_cost_err_fn
+#define SAD_LAMBDA_HDRES 8 // Used by mvsad_err_cost during full pixel search
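+// In both cases the resulting L1 cost is lambda * (|diff.row| + |diff.col|)
+// >> 3, i.e. lambda / 8 per unit of absolute MV difference.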
+
+// Returns the rate of encoding the current motion vector based on the
+// joint_cost and comp_cost. joint_cost covers the cost of transmitting
+// JOINT_MV, and comp_cost covers the cost of transmitting the actual motion
+// vector.
+static INLINE int mv_cost(const MV *mv, const int *joint_cost,
+ const int *const comp_cost[2]) {
+ return joint_cost[av1_get_mv_joint(mv)] + comp_cost[0][mv->row] +
+ comp_cost[1][mv->col];
+}
+
+#define CONVERT_TO_CONST_MVCOST(ptr) ((const int *const *)(ptr))
+// Returns the cost of encoding the motion vector diff := *mv - *ref. The cost
+// is the rate required to encode diff, multiplied by weight and then divided
+// by 2 ** 7 with rounding.
+// This is NOT used during motion compensation.
+int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost,
+ int *const mvcost[2], int weight) {
+ const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col };
+ return ROUND_POWER_OF_TWO(
+ mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * weight, 7);
+}
+
+// Returns the cost of using the current mv during the motion search. This is
+// used when var is used as the error metric.
+#define PIXEL_TRANSFORM_ERROR_SCALE 4
+static INLINE int mv_err_cost(const MV *mv, const MV *ref_mv,
+ const int *mvjcost, const int *const mvcost[2],
+ int error_per_bit, MV_COST_TYPE mv_cost_type) {
+ const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col };
+ const MV abs_diff = { abs(diff.row), abs(diff.col) };
+
+ switch (mv_cost_type) {
+ case MV_COST_ENTROPY:
+ if (mvcost) {
+ return (int)ROUND_POWER_OF_TWO_64(
+ (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit,
+ RDDIV_BITS + AV1_PROB_COST_SHIFT - RD_EPB_SHIFT +
+ PIXEL_TRANSFORM_ERROR_SCALE);
+ }
+ return 0;
+ case MV_COST_L1_LOWRES:
+ return (SSE_LAMBDA_LOWRES * (abs_diff.row + abs_diff.col)) >> 3;
+ case MV_COST_L1_MIDRES:
+ return (SSE_LAMBDA_MIDRES * (abs_diff.row + abs_diff.col)) >> 3;
+ case MV_COST_L1_HDRES:
+ return (SSE_LAMBDA_HDRES * (abs_diff.row + abs_diff.col)) >> 3;
+ case MV_COST_NONE: return 0;
+    default: assert(0 && "Invalid mv_cost_type"); return 0;
+ }
+}
+
+static INLINE int mv_err_cost_(const MV *mv,
+ const MV_COST_PARAMS *mv_cost_params) {
+ if (mv_cost_params->mv_cost_type == MV_COST_NONE) {
+ return 0;
+ }
+ return mv_err_cost(mv, mv_cost_params->ref_mv, mv_cost_params->mvjcost,
+ mv_cost_params->mvcost, mv_cost_params->error_per_bit,
+ mv_cost_params->mv_cost_type);
+}
+
+// Returns the cost of using the current mv during the motion search. This is
+// only used during full pixel motion search when sad is used as the error
+// metric
+static INLINE int mvsad_err_cost(const FULLPEL_MV *mv, const FULLPEL_MV *ref_mv,
+ const int *mvjcost, const int *const mvcost[2],
+ int sad_per_bit, MV_COST_TYPE mv_cost_type) {
+ const MV diff = { GET_MV_SUBPEL(mv->row - ref_mv->row),
+ GET_MV_SUBPEL(mv->col - ref_mv->col) };
+
+ switch (mv_cost_type) {
+ case MV_COST_ENTROPY:
+ return ROUND_POWER_OF_TWO(
+ (unsigned)mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) *
+ sad_per_bit,
+ AV1_PROB_COST_SHIFT);
+ case MV_COST_L1_LOWRES:
+ return (SAD_LAMBDA_LOWRES * (abs(diff.row) + abs(diff.col))) >> 3;
+ case MV_COST_L1_MIDRES:
+ return (SAD_LAMBDA_MIDRES * (abs(diff.row) + abs(diff.col))) >> 3;
+ case MV_COST_L1_HDRES:
+ return (SAD_LAMBDA_HDRES * (abs(diff.row) + abs(diff.col))) >> 3;
+ case MV_COST_NONE: return 0;
+    default: assert(0 && "Invalid mv_cost_type"); return 0;
+ }
+}
+
+static INLINE int mvsad_err_cost_(const FULLPEL_MV *mv,
+ const MV_COST_PARAMS *mv_cost_params) {
+ return mvsad_err_cost(mv, &mv_cost_params->full_ref_mv,
+ mv_cost_params->mvjcost, mv_cost_params->mvcost,
+ mv_cost_params->sad_per_bit,
+ mv_cost_params->mv_cost_type);
+}
+
+// =============================================================================
+// Fullpixel Motion Search: Translational
+// =============================================================================
+#define MAX_PATTERN_SCALES 11
+#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale
+#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates
+
+// Search site initialization for DIAMOND / CLAMPED_DIAMOND search methods.
+// level = 0: DIAMOND, level = 1: CLAMPED_DIAMOND.
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride,
+ int level) {
+ int num_search_steps = 0;
+ int stage_index = MAX_MVSEARCH_STEPS - 1;
+
+ cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0;
+ cfg->site[stage_index][0].offset = 0;
+ cfg->stride = stride;
+
+ // Choose the initial step size depending on level.
+ const int first_step = (level > 0) ? (MAX_FIRST_STEP / 4) : MAX_FIRST_STEP;
+
+ for (int radius = first_step; radius > 0;) {
+ int num_search_pts = 8;
+
+ const FULLPEL_MV search_site_mvs[13] = {
+ { 0, 0 }, { -radius, 0 }, { radius, 0 },
+ { 0, -radius }, { 0, radius }, { -radius, -radius },
+ { radius, radius }, { -radius, radius }, { radius, -radius },
+ };
+
+ int i;
+ for (i = 0; i <= num_search_pts; ++i) {
+ search_site *const site = &cfg->site[stage_index][i];
+ site->mv = search_site_mvs[i];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ cfg->searches_per_step[stage_index] = num_search_pts;
+ cfg->radius[stage_index] = radius;
+ // Update the search radius based on level.
+    if (!level || stage_index < 9) radius /= 2;
+ --stage_index;
+ ++num_search_steps;
+ }
+ cfg->num_search_steps = num_search_steps;
+}
+
+void av1_init_motion_fpf(search_site_config *cfg, int stride) {
+ int num_search_steps = 0;
+ int stage_index = MAX_MVSEARCH_STEPS - 1;
+
+ cfg->site[stage_index][0].mv.col = cfg->site[stage_index][0].mv.row = 0;
+ cfg->site[stage_index][0].offset = 0;
+ cfg->stride = stride;
+
+ for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) {
+    // Generate offsets for 12 search sites per step (8 when the radius is 1),
+    // plus the center point.
+ int tan_radius = AOMMAX((int)(0.41 * radius), 1);
+ int num_search_pts = 12;
+ if (radius == 1) num_search_pts = 8;
+
+ const FULLPEL_MV search_site_mvs[13] = {
+ { 0, 0 },
+ { -radius, 0 },
+ { radius, 0 },
+ { 0, -radius },
+ { 0, radius },
+ { -radius, -tan_radius },
+ { radius, tan_radius },
+ { -tan_radius, radius },
+ { tan_radius, -radius },
+ { -radius, tan_radius },
+ { radius, -tan_radius },
+ { tan_radius, radius },
+ { -tan_radius, -radius },
+ };
+
+ int i;
+ for (i = 0; i <= num_search_pts; ++i) {
+ search_site *const site = &cfg->site[stage_index][i];
+ site->mv = search_site_mvs[i];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ cfg->searches_per_step[stage_index] = num_search_pts;
+ cfg->radius[stage_index] = radius;
+ --stage_index;
+ ++num_search_steps;
+ }
+ cfg->num_search_steps = num_search_steps;
+}
+
+// Search site initialization for NSTEP / NSTEP_8PT search methods.
+// level = 0: NSTEP, level = 1: NSTEP_8PT.
+void av1_init_motion_compensation_nstep(search_site_config *cfg, int stride,
+ int level) {
+ int num_search_steps = 0;
+ int stage_index = 0;
+ cfg->stride = stride;
+ int radius = 1;
+ const int num_stages = (level > 0) ? 16 : 15;
+ for (stage_index = 0; stage_index < num_stages; ++stage_index) {
+ int tan_radius = AOMMAX((int)(0.41 * radius), 1);
+ int num_search_pts = 12;
+ if ((radius <= 5) || (level > 0)) {
+ tan_radius = radius;
+ num_search_pts = 8;
+ }
+ const FULLPEL_MV search_site_mvs[13] = {
+ { 0, 0 },
+ { -radius, 0 },
+ { radius, 0 },
+ { 0, -radius },
+ { 0, radius },
+ { -radius, -tan_radius },
+ { radius, tan_radius },
+ { -tan_radius, radius },
+ { tan_radius, -radius },
+ { -radius, tan_radius },
+ { radius, -tan_radius },
+ { tan_radius, radius },
+ { -tan_radius, -radius },
+ };
+
+ for (int i = 0; i <= num_search_pts; ++i) {
+ search_site *const site = &cfg->site[stage_index][i];
+ site->mv = search_site_mvs[i];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ cfg->searches_per_step[stage_index] = num_search_pts;
+ cfg->radius[stage_index] = radius;
+ ++num_search_steps;
+ if (stage_index < 12)
+ radius = (int)AOMMAX((radius * 1.5 + 0.5), radius + 1);
+ }
+ cfg->num_search_steps = num_search_steps;
+}
+
+// Search site initialization for BIGDIA / FAST_BIGDIA / FAST_DIAMOND
+// search methods.
+void av1_init_motion_compensation_bigdia(search_site_config *cfg, int stride,
+ int level) {
+ (void)level;
+ cfg->stride = stride;
+  // The first scale has the 4 closest points; the rest have 8 points in a
+  // diamond shape at increasing scales.
+ static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
+ 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+
+ // BIGDIA search method candidates.
+  // Note that the largest candidate step at each scale is 2^scale.
+ /* clang-format off */
+ static const FULLPEL_MV
+ site_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, { 0, 0 }, { 0, 0 },
+ { 0, 0 }, { 0, 0 } },
+ { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 },
+ { -1, 1 }, { -2, 0 } },
+ { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 },
+ { -2, 2 }, { -4, 0 } },
+ { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 },
+ { -4, 4 }, { -8, 0 } },
+ { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 },
+ { -8, 8 }, { -16, 0 } },
+ { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 },
+ { 0, 32 }, { -16, 16 }, { -32, 0 } },
+ { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 },
+ { 0, 64 }, { -32, 32 }, { -64, 0 } },
+ { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 },
+ { 0, 128 }, { -64, 64 }, { -128, 0 } },
+ { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 },
+ { 128, 128 }, { 0, 256 }, { -128, 128 }, { -256, 0 } },
+ { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 },
+ { 256, 256 }, { 0, 512 }, { -256, 256 }, { -512, 0 } },
+ { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 },
+ { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } },
+ };
+
+ /* clang-format on */
+ int radius = 1;
+ for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
+ cfg->searches_per_step[i] = bigdia_num_candidates[i];
+ cfg->radius[i] = radius;
+ for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) {
+ search_site *const site = &cfg->site[i][j];
+ site->mv = site_candidates[i][j];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ radius *= 2;
+ }
+ cfg->num_search_steps = MAX_PATTERN_SCALES;
+}
+
+// Search site initialization for SQUARE search method.
+void av1_init_motion_compensation_square(search_site_config *cfg, int stride,
+ int level) {
+ (void)level;
+ cfg->stride = stride;
+  // All scales have the 8 closest points in a square shape.
+ static const int square_num_candidates[MAX_PATTERN_SCALES] = {
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+
+ // Square search method candidates.
+ // Note that the largest candidate step at each scale is 2^scale.
+ /* clang-format off */
+ static const FULLPEL_MV
+ square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
+ { -1, 1 }, { -1, 0 } },
+ { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 },
+ { -2, 2 }, { -2, 0 } },
+ { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 },
+ { -4, 4 }, { -4, 0 } },
+ { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 },
+ { -8, 8 }, { -8, 0 } },
+ { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 },
+ { 0, 16 }, { -16, 16 }, { -16, 0 } },
+ { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 },
+ { 0, 32 }, { -32, 32 }, { -32, 0 } },
+ { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 },
+ { 0, 64 }, { -64, 64 }, { -64, 0 } },
+ { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 },
+ { 128, 128 }, { 0, 128 }, { -128, 128 }, { -128, 0 } },
+ { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 },
+ { 256, 256 }, { 0, 256 }, { -256, 256 }, { -256, 0 } },
+ { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 },
+ { 512, 512 }, { 0, 512 }, { -512, 512 }, { -512, 0 } },
+ { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 },
+ { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } },
+ };
+
+ /* clang-format on */
+ int radius = 1;
+ for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
+ cfg->searches_per_step[i] = square_num_candidates[i];
+ cfg->radius[i] = radius;
+ for (int j = 0; j < MAX_PATTERN_CANDIDATES; ++j) {
+ search_site *const site = &cfg->site[i][j];
+ site->mv = square_candidates[i][j];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ radius *= 2;
+ }
+ cfg->num_search_steps = MAX_PATTERN_SCALES;
+}
+
+// Search site initialization for HEX / FAST_HEX search methods.
+void av1_init_motion_compensation_hex(search_site_config *cfg, int stride,
+ int level) {
+ (void)level;
+ cfg->stride = stride;
+  // The first scale has the 8 closest points; the rest have 6 points in a
+  // hex shape at increasing scales.
+ static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6 };
+ // Note that the largest candidate step at each scale is 2^scale.
+ /* clang-format off */
+ static const FULLPEL_MV
+ hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
+ { -1, 1 }, { -1, 0 } },
+ { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } },
+ { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } },
+ { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } },
+ { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 },
+ { -8, 16 }, { -16, 0 } },
+ { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 },
+ { -32, 0 } },
+ { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 },
+ { -64, 0 } },
+ { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 },
+ { -64, 128 }, { -128, 0 } },
+ { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 },
+ { -128, 256 }, { -256, 0 } },
+ { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 },
+ { -256, 512 }, { -512, 0 } },
+ { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 },
+ { -512, 1024 }, { -1024, 0 } },
+ };
+
+ /* clang-format on */
+ int radius = 1;
+ for (int i = 0; i < MAX_PATTERN_SCALES; ++i) {
+ cfg->searches_per_step[i] = hex_num_candidates[i];
+ cfg->radius[i] = radius;
+ for (int j = 0; j < hex_num_candidates[i]; ++j) {
+ search_site *const site = &cfg->site[i][j];
+ site->mv = hex_candidates[i][j];
+ site->offset = get_offset_from_fullmv(&site->mv, stride);
+ }
+ radius *= 2;
+ }
+ cfg->num_search_steps = MAX_PATTERN_SCALES;
+}
+
+const av1_init_search_site_config
+ av1_init_motion_compensation[NUM_DISTINCT_SEARCH_METHODS] = {
+ av1_init_dsmotion_compensation, av1_init_motion_compensation_nstep,
+ av1_init_motion_compensation_nstep, av1_init_dsmotion_compensation,
+ av1_init_motion_compensation_hex, av1_init_motion_compensation_bigdia,
+ av1_init_motion_compensation_square
+ };
+
+// Checks whether the mv is within range of the mv_limits
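+// The 0/1 comparison results are combined with bitwise '&' to keep the check
+// branch-free.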
+static INLINE int check_bounds(const FullMvLimits *mv_limits, int row, int col,
+ int range) {
+ return ((row - range) >= mv_limits->row_min) &
+ ((row + range) <= mv_limits->row_max) &
+ ((col - range) >= mv_limits->col_min) &
+ ((col + range) <= mv_limits->col_max);
+}
+
+static INLINE int get_mvpred_var_cost(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv,
+ FULLPEL_MV_STATS *mv_stats) {
+ const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+ const MV sub_this_mv = get_mv_from_fullmv(this_mv);
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+ const int ref_stride = ref->stride;
+
+ int bestsme;
+
+ bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv),
+ ref_stride, &mv_stats->sse);
+ mv_stats->distortion = bestsme;
+
+ mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
+ bestsme += mv_stats->err_cost;
+
+ return bestsme;
+}
+
+static INLINE int get_mvpred_sad(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const struct buf_2d *const src,
+ const uint8_t *const ref_address,
+ const int ref_stride) {
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+
+ return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride);
+}
+
+static INLINE int get_mvpred_compound_var_cost(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv,
+ FULLPEL_MV_STATS *mv_stats) {
+ const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+ const int ref_stride = ref->stride;
+
+ const uint8_t *mask = ms_params->ms_buffers.mask;
+ const uint8_t *second_pred = ms_params->ms_buffers.second_pred;
+ const int mask_stride = ms_params->ms_buffers.mask_stride;
+ const int invert_mask = ms_params->ms_buffers.inv_mask;
+ int bestsme;
+
+ if (mask) {
+ bestsme = vfp->msvf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
+ src_buf, src_stride, second_pred, mask, mask_stride,
+ invert_mask, &mv_stats->sse);
+ } else if (second_pred) {
+ bestsme = vfp->svaf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0,
+ src_buf, src_stride, &mv_stats->sse, second_pred);
+ } else {
+ bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv),
+ ref_stride, &mv_stats->sse);
+ }
+ mv_stats->distortion = bestsme;
+
+ const MV sub_this_mv = get_mv_from_fullmv(this_mv);
+ mv_stats->err_cost = mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params);
+ bestsme += mv_stats->err_cost;
+
+ return bestsme;
+}
+
+static INLINE int get_mvpred_compound_sad(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const struct buf_2d *const src, const uint8_t *const ref_address,
+ const int ref_stride) {
+ const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+
+ const uint8_t *mask = ms_params->ms_buffers.mask;
+ const uint8_t *second_pred = ms_params->ms_buffers.second_pred;
+ const int mask_stride = ms_params->ms_buffers.mask_stride;
+ const int invert_mask = ms_params->ms_buffers.inv_mask;
+
+ if (mask) {
+ return vfp->msdf(src_buf, src_stride, ref_address, ref_stride, second_pred,
+ mask, mask_stride, invert_mask);
+ } else if (second_pred) {
+ return vfp->sdaf(src_buf, src_stride, ref_address, ref_stride, second_pred);
+ } else {
+ return ms_params->sdf(src_buf, src_stride, ref_address, ref_stride);
+ }
+}
+
+// Calculates and returns a sad+mvcost list around an integer best pel during
+// fullpixel motion search. The resulting list can be used to speed up subpel
+// motion search later.
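+// The layout is: cost_list[0] holds the center (best) MV cost, and
+// cost_list[1..4] hold the left, below, right and above neighbors, in that
+// order.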
+#define USE_SAD_COSTLIST 1
+
+// calc_int_cost_list uses var to populate the costlist, which is more accurate
+// than sad but slightly slower.
+static AOM_FORCE_INLINE void calc_int_cost_list(
+ const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ int *cost_list) {
+ static const FULLPEL_MV neighbors[4] = {
+ { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
+ };
+ const int br = best_mv.row;
+ const int bc = best_mv.col;
+
+ FULLPEL_MV_STATS mv_stats;
+ cost_list[0] = get_mvpred_var_cost(ms_params, &best_mv, &mv_stats);
+
+ if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
+ for (int i = 0; i < 4; i++) {
+ const FULLPEL_MV neighbor_mv = { br + neighbors[i].row,
+ bc + neighbors[i].col };
+ cost_list[i + 1] =
+ get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats);
+ }
+ } else {
+ for (int i = 0; i < 4; i++) {
+ const FULLPEL_MV neighbor_mv = { br + neighbors[i].row,
+ bc + neighbors[i].col };
+ if (!av1_is_fullmv_in_range(&ms_params->mv_limits, neighbor_mv)) {
+ cost_list[i + 1] = INT_MAX;
+ } else {
+ cost_list[i + 1] =
+ get_mvpred_var_cost(ms_params, &neighbor_mv, &mv_stats);
+ }
+ }
+ }
+}
+
+// calc_int_sad_list uses sad to populate the costlist, which is less accurate
+// than var but faster.
+static AOM_FORCE_INLINE void calc_int_sad_list(
+ const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ int *cost_list, int costlist_has_sad) {
+ static const FULLPEL_MV neighbors[4] = {
+ { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
+ };
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const int ref_stride = ref->stride;
+ const int br = best_mv.row;
+ const int bc = best_mv.col;
+
+ assert(av1_is_fullmv_in_range(&ms_params->mv_limits, best_mv));
+
+ // Refresh the costlist if it does not contain valid sad values.
+ if (!costlist_has_sad) {
+ cost_list[0] = get_mvpred_sad(
+ ms_params, src, get_buf_from_fullmv(ref, &best_mv), ref_stride);
+
+ if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
+ for (int i = 0; i < 4; i++) {
+ const FULLPEL_MV this_mv = { br + neighbors[i].row,
+ bc + neighbors[i].col };
+ cost_list[i + 1] = get_mvpred_sad(
+ ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
+ }
+ } else {
+ for (int i = 0; i < 4; i++) {
+ const FULLPEL_MV this_mv = { br + neighbors[i].row,
+ bc + neighbors[i].col };
+ if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+ cost_list[i + 1] = INT_MAX;
+ } else {
+ cost_list[i + 1] = get_mvpred_sad(
+ ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
+ }
+ }
+ }
+ }
+
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ cost_list[0] += mvsad_err_cost_(&best_mv, mv_cost_params);
+
+ for (int idx = 0; idx < 4; idx++) {
+ if (cost_list[idx + 1] != INT_MAX) {
+ const FULLPEL_MV this_mv = { br + neighbors[idx].row,
+ bc + neighbors[idx].col };
+ cost_list[idx + 1] += mvsad_err_cost_(&this_mv, mv_cost_params);
+ }
+ }
+}
+
+// Computes motion vector cost and adds to the sad cost.
+// Then updates the best sad and motion vectors.
+// Inputs:
+// this_sad: the sad to be evaluated.
+// mv: the current motion vector.
+// mv_cost_params: a structure containing information to compute mv cost.
+// best_sad: the current best sad.
+// raw_best_sad (optional): the current best sad without calculating mv cost.
+// best_mv: the current best motion vector.
+// second_best_mv (optional): the second best motion vector up to now.
+// Modifies:
+ // best_sad, raw_best_sad, best_mv, second_best_mv,
+ // if the current sad plus its mv cost is lower than the current best sad.
+// Returns:
+// Whether the input sad (mv) is better than the current best.
+static AOM_INLINE int update_mvs_and_sad(const unsigned int this_sad,
+ const FULLPEL_MV *mv,
+ const MV_COST_PARAMS *mv_cost_params,
+ unsigned int *best_sad,
+ unsigned int *raw_best_sad,
+ FULLPEL_MV *best_mv,
+ FULLPEL_MV *second_best_mv) {
+ if (this_sad >= *best_sad) return 0;
+
+ // Add the motion vector cost.
+ const unsigned int sad = this_sad + mvsad_err_cost_(mv, mv_cost_params);
+ if (sad < *best_sad) {
+ if (raw_best_sad) *raw_best_sad = this_sad;
+ *best_sad = sad;
+ if (second_best_mv) *second_best_mv = *best_mv;
+ *best_mv = *mv;
+ return 1;
+ }
+ return 0;
+}
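+
+// Worked example of the two-stage test above (hypothetical numbers): with
+// *best_sad == 1000, a candidate with raw sad 1100 is rejected by the first
+// comparison without ever paying for mvsad_err_cost_(); a candidate with raw
+// sad 900 and mv cost 150 passes the first test but fails the second
+// (900 + 150 >= 1000), so nothing is updated; a candidate with raw sad 900
+// and mv cost 50 wins and updates all four outputs.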
+
+ // Calculate 4 sads in one call and update the best mv information
+ // in the FAST_DIAMOND search method.
+static AOM_INLINE void calc_sad4_update_bestmv(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+ const FULLPEL_MV center_mv, const uint8_t *center_address,
+ unsigned int *bestsad, unsigned int *raw_bestsad, int search_step,
+ int *best_site, int cand_start, int *cost_list) {
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const search_site *site = ms_params->search_sites->site[search_step];
+
+ unsigned char const *block_offset[4];
+ unsigned int sads_buf[4];
+ unsigned int *sads;
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+ if (cost_list) {
+ sads = (unsigned int *)(cost_list + 1);
+ } else {
+ sads = sads_buf;
+ }
+ // Loop over number of candidates.
+ for (int j = 0; j < 4; j++)
+ block_offset[j] = site[cand_start + j].offset + center_address;
+
+ // 4-point sad calculation.
+ ms_params->sdx4df(src_buf, src_stride, block_offset, ref->stride, sads);
+
+ for (int j = 0; j < 4; j++) {
+ const FULLPEL_MV this_mv = { center_mv.row + site[cand_start + j].mv.row,
+ center_mv.col + site[cand_start + j].mv.col };
+ const int found_better_mv = update_mvs_and_sad(
+ sads[j], &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
+ /*second_best_mv=*/NULL);
+ if (found_better_mv) *best_site = cand_start + j;
+ }
+}
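+
+// Note on the batching above: sdx4df computes the sads of four reference
+// blocks against the same source block in one (typically SIMD) call, so
+// callers walk the candidate list in groups of four via cand_start; any
+// leftover candidates (num_candidates % 4) go through the scalar path in
+// calc_sad_update_bestmv below.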
+
+static AOM_INLINE void calc_sad3_update_bestmv(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+ FULLPEL_MV center_mv, const uint8_t *center_address, unsigned int *bestsad,
+ unsigned int *raw_bestsad, int search_step, int *best_site,
+ const int *chkpts_indices, int *cost_list) {
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const search_site *site = ms_params->search_sites->site[search_step];
+ unsigned char const *block_offset[4] = {
+ center_address + site[chkpts_indices[0]].offset,
+ center_address + site[chkpts_indices[1]].offset,
+ center_address + site[chkpts_indices[2]].offset,
+ center_address,
+ };
+ unsigned int sads[4];
+ ms_params->sdx3df(src->buf, src->stride, block_offset, ref->stride, sads);
+ for (int j = 0; j < 3; j++) {
+ const int index = chkpts_indices[j];
+ const FULLPEL_MV this_mv = { center_mv.row + site[index].mv.row,
+ center_mv.col + site[index].mv.col };
+ const int found_better_mv = update_mvs_and_sad(
+ sads[j], &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
+ /*second_best_mv=*/NULL);
+ if (found_better_mv) *best_site = j;
+ }
+ if (cost_list) {
+ for (int j = 0; j < 3; j++) {
+ int index = chkpts_indices[j];
+ cost_list[index + 1] = sads[j];
+ }
+ }
+}
+
+ // Calculate sad and update the best mv information
+ // in the FAST_DIAMOND search method.
+static AOM_INLINE void calc_sad_update_bestmv(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+ const FULLPEL_MV center_mv, const uint8_t *center_address,
+ unsigned int *bestsad, unsigned int *raw_bestsad, int search_step,
+ int *best_site, const int num_candidates, int cand_start, int *cost_list) {
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const search_site *site = ms_params->search_sites->site[search_step];
+ // Loop over number of candidates.
+ for (int i = cand_start; i < num_candidates; i++) {
+ const FULLPEL_MV this_mv = { center_mv.row + site[i].mv.row,
+ center_mv.col + site[i].mv.col };
+ if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) continue;
+ int thissad = get_mvpred_sad(ms_params, src,
+ center_address + site[i].offset, ref->stride);
+ if (cost_list) {
+ cost_list[i + 1] = thissad;
+ }
+ const int found_better_mv = update_mvs_and_sad(
+ thissad, &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
+ /*second_best_mv=*/NULL);
+ if (found_better_mv) *best_site = i;
+ }
+}
+
+static AOM_INLINE void calc_sad_update_bestmv_with_indices(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const MV_COST_PARAMS *mv_cost_params, FULLPEL_MV *best_mv,
+ const FULLPEL_MV center_mv, const uint8_t *center_address,
+ unsigned int *bestsad, unsigned int *raw_bestsad, int search_step,
+ int *best_site, const int num_candidates, const int *chkpts_indices,
+ int *cost_list) {
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const search_site *site = ms_params->search_sites->site[search_step];
+ // Loop over number of candidates.
+ for (int i = 0; i < num_candidates; i++) {
+ int index = chkpts_indices[i];
+ const FULLPEL_MV this_mv = { center_mv.row + site[index].mv.row,
+ center_mv.col + site[index].mv.col };
+ if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+ if (cost_list) {
+ cost_list[index + 1] = INT_MAX;
+ }
+ continue;
+ }
+ const int thissad = get_mvpred_sad(
+ ms_params, src, center_address + site[index].offset, ref->stride);
+ if (cost_list) {
+ cost_list[index + 1] = thissad;
+ }
+ const int found_better_mv = update_mvs_and_sad(
+ thissad, &this_mv, mv_cost_params, bestsad, raw_bestsad, best_mv,
+ /*second_best_mv=*/NULL);
+ if (found_better_mv) *best_site = i;
+ }
+}
+
+// Generic pattern search function that searches over multiple scales.
+// Each scale can have a different number of candidates and shape of
+// candidates, as indicated in the num_candidates and candidates arrays
+// passed into this function.
+static int pattern_search(FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
+ static const int search_steps[MAX_MVSEARCH_STEPS] = {
+ 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ };
+ int i, s, t;
+
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const search_site_config *search_sites = ms_params->search_sites;
+ const int *num_candidates = search_sites->searches_per_step;
+ const int ref_stride = ref->stride;
+ const int last_is_4 = num_candidates[0] == 4;
+ int br, bc;
+ unsigned int bestsad = UINT_MAX, raw_bestsad = UINT_MAX;
+ int k = -1;
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ search_step = AOMMIN(search_step, MAX_MVSEARCH_STEPS - 1);
+ assert(search_step >= 0);
+ int best_init_s = search_steps[search_step];
+ // Clamp start_mv to make sure it is within the MV range.
+ clamp_fullmv(&start_mv, &ms_params->mv_limits);
+ br = start_mv.row;
+ bc = start_mv.col;
+ if (cost_list != NULL) {
+ cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] =
+ INT_MAX;
+ }
+ int costlist_has_sad = 0;
+
+ // Work out the start point for the search
+ raw_bestsad = get_mvpred_sad(ms_params, src,
+ get_buf_from_fullmv(ref, &start_mv), ref_stride);
+ bestsad = raw_bestsad + mvsad_err_cost_(&start_mv, mv_cost_params);
+
+ // Search all possible scales up to the search param around the center
+ // point, and pick the scale of the best point as the starting scale for
+ // further steps around it.
+ const uint8_t *center_address = get_buf_from_fullmv(ref, &start_mv);
+ if (do_init_search) {
+ s = best_init_s;
+ best_init_s = -1;
+ for (t = 0; t <= s; ++t) {
+ int best_site = -1;
+ FULLPEL_MV center_mv = { br, bc };
+ if (check_bounds(&ms_params->mv_limits, br, bc, 1 << t)) {
+ // Call 4-point sad for multiples of 4 candidates.
+ const int no_of_4_cand_loops = num_candidates[t] >> 2;
+ for (i = 0; i < no_of_4_cand_loops; i++) {
+ calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, t,
+ &best_site, i * 4, /*cost_list=*/NULL);
+ }
+ // Rest of the candidates
+ const int remaining_cand = num_candidates[t] % 4;
+ calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, t,
+ &best_site, remaining_cand,
+ no_of_4_cand_loops * 4, NULL);
+ } else {
+ calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, t,
+ &best_site, num_candidates[t], 0, NULL);
+ }
+ if (best_site == -1) {
+ continue;
+ } else {
+ best_init_s = t;
+ k = best_site;
+ }
+ }
+ if (best_init_s != -1) {
+ br += search_sites->site[best_init_s][k].mv.row;
+ bc += search_sites->site[best_init_s][k].mv.col;
+ center_address += search_sites->site[best_init_s][k].offset;
+ }
+ }
+
+ // If the center point is still the best, just skip this and move to
+ // the refinement step.
+ if (best_init_s != -1) {
+ const int last_s = (last_is_4 && cost_list != NULL);
+ int best_site = -1;
+ s = best_init_s;
+
+ for (; s >= last_s; s--) {
+ // No need to search all points the 1st time if initial search was used
+ if (!do_init_search || s != best_init_s) {
+ FULLPEL_MV center_mv = { br, bc };
+ if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
+ // Call 4-point sad for multiples of 4 candidates.
+ const int no_of_4_cand_loops = num_candidates[s] >> 2;
+ for (i = 0; i < no_of_4_cand_loops; i++) {
+ calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv,
+ center_mv, center_address, &bestsad,
+ &raw_bestsad, s, &best_site, i * 4,
+ /*cost_list=*/NULL);
+ }
+ // Rest of the candidates
+ const int remaining_cand = num_candidates[s] % 4;
+ calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, remaining_cand,
+ no_of_4_cand_loops * 4, NULL);
+ } else {
+ calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, num_candidates[s], 0, NULL);
+ }
+
+ if (best_site == -1) {
+ continue;
+ } else {
+ br += search_sites->site[s][best_site].mv.row;
+ bc += search_sites->site[s][best_site].mv.col;
+ center_address += search_sites->site[s][best_site].offset;
+ k = best_site;
+ }
+ }
+
+ do {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+
+ FULLPEL_MV center_mv = { br, bc };
+ if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
+ calc_sad3_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, next_chkpts_indices, NULL);
+ } else {
+ calc_sad_update_bestmv_with_indices(
+ ms_params, mv_cost_params, best_mv, center_mv, center_address,
+ &bestsad, &raw_bestsad, s, &best_site, PATTERN_CANDIDATES_REF,
+ next_chkpts_indices, NULL);
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += search_sites->site[s][k].mv.row;
+ bc += search_sites->site[s][k].mv.col;
+ center_address += search_sites->site[s][k].offset;
+ }
+ } while (best_site != -1);
+ }
+ // Note: If we enter the if below, then cost_list must be non-NULL.
+ if (s == 0) {
+ cost_list[0] = raw_bestsad;
+ costlist_has_sad = 1;
+ assert(num_candidates[s] == 4);
+ if (!do_init_search || s != best_init_s) {
+ FULLPEL_MV center_mv = { br, bc };
+ if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
+ calc_sad4_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, 0, cost_list);
+ } else {
+ calc_sad_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, /*num_candidates=*/4,
+ /*cand_start=*/0, cost_list);
+ }
+
+ if (best_site != -1) {
+ br += search_sites->site[s][best_site].mv.row;
+ bc += search_sites->site[s][best_site].mv.col;
+ center_address += search_sites->site[s][best_site].offset;
+ k = best_site;
+ }
+ }
+ while (best_site != -1) {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+ cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
+ cost_list[((k + 2) % 4) + 1] = cost_list[0];
+ cost_list[0] = raw_bestsad;
+
+ FULLPEL_MV center_mv = { br, bc };
+ if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
+ assert(PATTERN_CANDIDATES_REF == 3);
+ calc_sad3_update_bestmv(ms_params, mv_cost_params, best_mv, center_mv,
+ center_address, &bestsad, &raw_bestsad, s,
+ &best_site, next_chkpts_indices, cost_list);
+ } else {
+ calc_sad_update_bestmv_with_indices(
+ ms_params, mv_cost_params, best_mv, center_mv, center_address,
+ &bestsad, &raw_bestsad, s, &best_site, PATTERN_CANDIDATES_REF,
+ next_chkpts_indices, cost_list);
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += search_sites->site[s][k].mv.row;
+ bc += search_sites->site[s][k].mv.col;
+ center_address += search_sites->site[s][k].offset;
+ }
+ }
+ }
+ }
+ best_mv->row = br;
+ best_mv->col = bc;
+
+ assert(center_address == get_buf_from_fullmv(ref, best_mv) &&
+ "center address is out of sync with best_mv!\n");
+
+ // Returns the one-away integer pel cost/sad around the best as follows:
+ // cost_list[0]: cost/sad at the best integer pel
+ // cost_list[1]: cost/sad at delta {0, -1} (left) from the best integer pel
+ // cost_list[2]: cost/sad at delta { 1, 0} (bottom) from the best integer pel
+ // cost_list[3]: cost/sad at delta { 0, 1} (right) from the best integer pel
+ // cost_list[4]: cost/sad at delta {-1, 0} (top) from the best integer pel
+ if (cost_list) {
+ if (USE_SAD_COSTLIST) {
+ calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
+ } else {
+ calc_int_cost_list(*best_mv, ms_params, cost_list);
+ }
+ }
+
+ const int var_cost = get_mvpred_var_cost(ms_params, best_mv, best_mv_stats);
+ return var_cost;
+}
+
+// For the following foo_search, the input arguments are:
+// start_mv: where we are starting our motion search
+// ms_params: a collection of motion search parameters
+// search_step: how many steps to skip in our motion search. For example,
+// a value 3 suggests that 3 search steps have already taken place prior to
+// this function call, so we jump directly to step 4 of the search process
+// do_init_search: if on, do an initial search of all possible scales around the
+// start_mv, and then pick the best scale.
+// cost_list: used to hold the cost around the best full mv so we can use it
+// to speed up subpel search later.
+// best_mv: the best mv found in the motion search
+static int hex_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
+ return pattern_search(start_mv, ms_params, search_step, do_init_search,
+ cost_list, best_mv, best_mv_stats);
+}
+
+static int bigdia_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
+ return pattern_search(start_mv, ms_params, search_step, do_init_search,
+ cost_list, best_mv, best_mv_stats);
+}
+
+static int square_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
+ return pattern_search(start_mv, ms_params, search_step, do_init_search,
+ cost_list, best_mv, best_mv_stats);
+}
+
+static int fast_hex_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
+ return hex_search(start_mv, ms_params,
+ AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step), do_init_search,
+ cost_list, best_mv, best_mv_stats);
+}
+
+static int vfast_dia_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
+ return bigdia_search(start_mv, ms_params,
+ AOMMAX(MAX_MVSEARCH_STEPS - 1, search_step),
+ do_init_search, cost_list, best_mv, best_mv_stats);
+}
+
+static int fast_dia_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
+ return bigdia_search(start_mv, ms_params,
+ AOMMAX(MAX_MVSEARCH_STEPS - 2, search_step),
+ do_init_search, cost_list, best_mv, best_mv_stats);
+}
+
+static int fast_bigdia_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, const int do_init_search,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats) {
+ return bigdia_search(start_mv, ms_params,
+ AOMMAX(MAX_MVSEARCH_STEPS - 3, search_step),
+ do_init_search, cost_list, best_mv, best_mv_stats);
+}
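+
+// How the fast wrappers above trade quality for speed: pattern_search maps
+// search_step through its search_steps[] table, so forcing a larger value
+// searches fewer scales. hex/square/bigdia start from the caller's
+// search_step; fast_hex and fast_dia force at least MAX_MVSEARCH_STEPS - 2
+// (roughly the two finest scales), fast_bigdia at least
+// MAX_MVSEARCH_STEPS - 3, and vfast_dia at least MAX_MVSEARCH_STEPS - 1
+// (the finest scale only).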
+
+static int diamond_search_sad(FULLPEL_MV start_mv, unsigned int start_mv_sad,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int search_step, int *num00,
+ FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) {
+#define UPDATE_SEARCH_STEP \
+ do { \
+ if (best_site != 0) { \
+ tmp_second_best_mv = *best_mv; \
+ best_mv->row += site[best_site].mv.row; \
+ best_mv->col += site[best_site].mv.col; \
+ best_address += site[best_site].offset; \
+ is_off_center = 1; \
+ } \
+ \
+ if (is_off_center == 0) num_center_steps++; \
+ \
+ if (best_site == 0 && step > 2) { \
+ int next_step_size = cfg->radius[step - 1]; \
+ while (next_step_size == cfg->radius[step] && step > 2) { \
+ num_center_steps++; \
+ --step; \
+ next_step_size = cfg->radius[step - 1]; \
+ } \
+ } \
+ } while (0)
+
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+
+ const uint8_t *src_buf = src->buf;
+ const int src_stride = src->stride;
+ const int ref_stride = ref->stride;
+
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+
+ const search_site_config *cfg = ms_params->search_sites;
+
+ int is_off_center = 0;
+ // Number of times that we have stayed in the middle. This is used to skip
+ // search steps in the future if diamond_search_sad is called again.
+ int num_center_steps = 0;
+
+ // search_step determines the length of the initial step and hence the number
+ // of iterations.
+ const int tot_steps = cfg->num_search_steps - search_step;
+ FULLPEL_MV tmp_second_best_mv;
+ if (second_best_mv) {
+ tmp_second_best_mv = *second_best_mv;
+ }
+
+ *best_mv = start_mv;
+
+ // Check the starting position
+ const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv);
+ unsigned int bestsad = start_mv_sad;
+
+ // TODO(chiyotsai@google.com): Implement 4-point search for msdf & sdaf.
+ if (ms_params->ms_buffers.second_pred) {
+ for (int step = tot_steps - 1; step >= 0; --step) {
+ const search_site *site = cfg->site[step];
+ const int num_searches = cfg->searches_per_step[step];
+ int best_site = 0;
+
+ for (int idx = 1; idx <= num_searches; idx++) {
+ const FULLPEL_MV this_mv = { best_mv->row + site[idx].mv.row,
+ best_mv->col + site[idx].mv.col };
+
+ if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+ const uint8_t *const check_here = site[idx].offset + best_address;
+ unsigned int thissad =
+ get_mvpred_compound_sad(ms_params, src, check_here, ref_stride);
+
+ if (thissad < bestsad) {
+ thissad += mvsad_err_cost_(&this_mv, mv_cost_params);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = idx;
+ }
+ }
+ }
+ }
+ UPDATE_SEARCH_STEP;
+ }
+ } else {
+ for (int step = tot_steps - 1; step >= 0; --step) {
+ const search_site *site = cfg->site[step];
+ const int num_searches = cfg->searches_per_step[step];
+ int best_site = 0;
+
+ int all_in = 1;
+ // Trap illegal vectors
+ all_in &= best_mv->row + site[1].mv.row >= ms_params->mv_limits.row_min;
+ all_in &= best_mv->row + site[2].mv.row <= ms_params->mv_limits.row_max;
+ all_in &= best_mv->col + site[3].mv.col >= ms_params->mv_limits.col_min;
+ all_in &= best_mv->col + site[4].mv.col <= ms_params->mv_limits.col_max;
+
+ if (all_in) {
+ for (int idx = 1; idx <= num_searches; idx += 4) {
+ unsigned char const *block_offset[4];
+ unsigned int sads[4];
+
+ for (int j = 0; j < 4; j++)
+ block_offset[j] = site[idx + j].offset + best_address;
+
+ ms_params->sdx4df(src_buf, src_stride, block_offset, ref_stride,
+ sads);
+ for (int j = 0; j < 4; j++) {
+ if (sads[j] < bestsad) {
+ const FULLPEL_MV this_mv = { best_mv->row + site[idx + j].mv.row,
+ best_mv->col +
+ site[idx + j].mv.col };
+ unsigned int thissad =
+ sads[j] + mvsad_err_cost_(&this_mv, mv_cost_params);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = idx + j;
+ }
+ }
+ }
+ }
+ } else {
+ for (int idx = 1; idx <= num_searches; idx++) {
+ const FULLPEL_MV this_mv = { best_mv->row + site[idx].mv.row,
+ best_mv->col + site[idx].mv.col };
+
+ if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+ const uint8_t *const check_here = site[idx].offset + best_address;
+ unsigned int thissad =
+ get_mvpred_sad(ms_params, src, check_here, ref_stride);
+
+ if (thissad < bestsad) {
+ thissad += mvsad_err_cost_(&this_mv, mv_cost_params);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = idx;
+ }
+ }
+ }
+ }
+ }
+ UPDATE_SEARCH_STEP;
+ }
+ }
+
+ *num00 = num_center_steps;
+ if (second_best_mv) {
+ *second_best_mv = tmp_second_best_mv;
+ }
+
+ return bestsad;
+
+#undef UPDATE_SEARCH_STEP
+}
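+
+// Example of the num00 bookkeeping (see UPDATE_SEARCH_STEP above): if the
+// center stays the best site for the first three steps, num_center_steps
+// reaches 3 and *num00 == 3 on return. full_pixel_diamond below then
+// advances its loop counter by num00, skipping repeat searches that would
+// start over from the same point with the same effective radius.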
+
+static INLINE unsigned int get_start_mvpred_sad_cost(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv) {
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const uint8_t *best_address = get_buf_from_fullmv(ref, &start_mv);
+
+ unsigned int start_mv_sad =
+ mvsad_err_cost_(&start_mv, &ms_params->mv_cost_params);
+
+ if (ms_params->ms_buffers.second_pred)
+ start_mv_sad +=
+ get_mvpred_compound_sad(ms_params, src, best_address, ref->stride);
+ else
+ start_mv_sad += get_mvpred_sad(ms_params, src, best_address, ref->stride);
+
+ return start_mv_sad;
+}
+
+static int full_pixel_diamond(FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int step_param, int *cost_list,
+ FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *best_mv_stats,
+ FULLPEL_MV *second_best_mv) {
+ const search_site_config *cfg = ms_params->search_sites;
+ int thissme, n, num00 = 0;
+
+ // Clamp start mv and calculate the cost
+ clamp_fullmv(&start_mv, &ms_params->mv_limits);
+ unsigned int start_mv_sad = get_start_mvpred_sad_cost(ms_params, start_mv);
+
+ diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param, &n, best_mv,
+ second_best_mv);
+
+ int bestsme = get_mvpred_compound_var_cost(ms_params, best_mv, best_mv_stats);
+
+ // If there won't be more n-step search, check to see if refining search is
+ // needed.
+ const int further_steps = cfg->num_search_steps - 1 - step_param;
+ while (n < further_steps) {
+ ++n;
+
+ // TODO(chiyotsai@google.com): There is another bug here where the second
+ // best mv gets incorrectly overwritten. Fix it later.
+ FULLPEL_MV tmp_best_mv;
+ FULLPEL_MV_STATS tmp_best_mv_stats;
+ diamond_search_sad(start_mv, start_mv_sad, ms_params, step_param + n,
+ &num00, &tmp_best_mv, second_best_mv);
+
+ thissme = get_mvpred_compound_var_cost(ms_params, &tmp_best_mv,
+ &tmp_best_mv_stats);
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *best_mv = tmp_best_mv;
+ *best_mv_stats = tmp_best_mv_stats;
+ }
+
+ if (num00) {
+ // Advance the loop by num00 steps
+ n += num00;
+ num00 = 0;
+ }
+ }
+
+ // Return cost list.
+ if (cost_list) {
+ if (USE_SAD_COSTLIST) {
+ const int costlist_has_sad = 0;
+ calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
+ } else {
+ calc_int_cost_list(*best_mv, ms_params, cost_list);
+ }
+ }
+ return bestsme;
+}
+
+// Exhaustive motion search around a given centre position with a given
+// step size.
+static int exhaustive_mesh_search(FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int range, const int step,
+ FULLPEL_MV *best_mv,
+ FULLPEL_MV *second_best_mv) {
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const struct buf_2d *const src = ms_params->ms_buffers.src;
+ const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+ const int ref_stride = ref->stride;
+ unsigned int best_sad = INT_MAX;
+ int r, c, i;
+ int start_col, end_col, start_row, end_row;
+ const int col_step = (step > 1) ? step : 4;
+
+ assert(step >= 1);
+
+ clamp_fullmv(&start_mv, &ms_params->mv_limits);
+ *best_mv = start_mv;
+ best_sad = get_mvpred_sad(ms_params, src, get_buf_from_fullmv(ref, &start_mv),
+ ref_stride);
+ best_sad += mvsad_err_cost_(&start_mv, mv_cost_params);
+ start_row = AOMMAX(-range, ms_params->mv_limits.row_min - start_mv.row);
+ start_col = AOMMAX(-range, ms_params->mv_limits.col_min - start_mv.col);
+ end_row = AOMMIN(range, ms_params->mv_limits.row_max - start_mv.row);
+ end_col = AOMMIN(range, ms_params->mv_limits.col_max - start_mv.col);
+
+ for (r = start_row; r <= end_row; r += step) {
+ for (c = start_col; c <= end_col; c += col_step) {
+ // Step > 1 means we are not checking every location in this pass.
+ if (step > 1) {
+ const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c };
+ unsigned int sad = get_mvpred_sad(
+ ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
+ update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad,
+ /*raw_best_sad=*/NULL, best_mv, second_best_mv);
+ } else {
+ // 4 sads in a single call if we are checking every location
+ if (c + 3 <= end_col) {
+ unsigned int sads[4];
+ const uint8_t *addrs[4];
+ for (i = 0; i < 4; ++i) {
+ const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+ addrs[i] = get_buf_from_fullmv(ref, &mv);
+ }
+
+ ms_params->sdx4df(src->buf, src->stride, addrs, ref_stride, sads);
+
+ for (i = 0; i < 4; ++i) {
+ if (sads[i] < best_sad) {
+ const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+ update_mvs_and_sad(sads[i], &mv, mv_cost_params, &best_sad,
+ /*raw_best_sad=*/NULL, best_mv,
+ second_best_mv);
+ }
+ }
+ } else {
+ for (i = 0; i < end_col - c; ++i) {
+ const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+ unsigned int sad = get_mvpred_sad(
+ ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
+ update_mvs_and_sad(sad, &mv, mv_cost_params, &best_sad,
+ /*raw_best_sad=*/NULL, best_mv, second_best_mv);
+ }
+ }
+ }
+ }
+ }
+
+ return best_sad;
+}
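+
+// Illustrative cost of the mesh above (hypothetical numbers): with
+// range = 16 and step = 4 the double loop visits a 9x9 grid of offsets
+// {-16, -12, ..., +16} in each direction, i.e. 81 sad evaluations; with
+// step = 1 every position in the window is checked, but the columns are
+// batched four at a time through sdx4df, cutting the number of sad calls
+// roughly by a factor of four.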
+
+// Runs a limited-range exhaustive mesh search using a pattern set chosen
+// according to the encoder speed profile.
+static int full_pixel_exhaustive(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const struct MESH_PATTERN *const mesh_patterns,
+ int *cost_list, FULLPEL_MV *best_mv,
+ FULLPEL_MV_STATS *mv_stats,
+ FULLPEL_MV *second_best_mv) {
+ const int kMinRange = 7;
+ const int kMaxRange = 256;
+ const int kMinInterval = 1;
+
+ int bestsme;
+ int i;
+ int interval = mesh_patterns[0].interval;
+ int range = mesh_patterns[0].range;
+ int baseline_interval_divisor;
+
+ // TODO(chiyotsai@google.com): Currently exhaustive search calls single ref
+ // version of sad and variance function. We still need to check the
+ // performance when compound ref exhaustive search is enabled.
+ assert(!ms_params->ms_buffers.second_pred &&
+ "Mesh search does not support compound mode!");
+
+ *best_mv = start_mv;
+
+ // Trap illegal values for interval and range for this function.
+ if ((range < kMinRange) || (range > kMaxRange) || (interval < kMinInterval) ||
+ (interval > range))
+ return INT_MAX;
+
+ baseline_interval_divisor = range / interval;
+
+ // Check size of proposed first range against magnitude of the centre
+ // value used as a starting point.
+ range = AOMMAX(range, (5 * AOMMAX(abs(best_mv->row), abs(best_mv->col))) / 4);
+ range = AOMMIN(range, kMaxRange);
+ interval = AOMMAX(interval, range / baseline_interval_divisor);
+ // Use a small search step/interval for certain kinds of clips, e.g.
+ // screen content clips with a lot of text. A large interval could lead to
+ // a false matching position, and then the search cannot recover the best
+ // global candidate in the following iterations due to the reduced search
+ // range. The solution here is to use a small search interval in the
+ // beginning and thus reduce the chance of missing the best candidate.
+ if (ms_params->fine_search_interval) {
+ interval = AOMMIN(interval, 4);
+ }
+
+ // initial search
+ bestsme = exhaustive_mesh_search(*best_mv, ms_params, range, interval,
+ best_mv, second_best_mv);
+
+ if ((interval > kMinInterval) && (range > kMinRange)) {
+ // Progressive searches with range and step size decreasing each time
+ // till we reach a step size of 1. Then break out.
+ for (i = 1; i < MAX_MESH_STEP; ++i) {
+ // First pass with coarser step and longer range
+ bestsme = exhaustive_mesh_search(
+ *best_mv, ms_params, mesh_patterns[i].range,
+ mesh_patterns[i].interval, best_mv, second_best_mv);
+
+ if (mesh_patterns[i].interval == 1) break;
+ }
+ }
+
+ if (bestsme < INT_MAX) {
+ bestsme = get_mvpred_var_cost(ms_params, best_mv, mv_stats);
+ }
+
+ // Return cost list.
+ if (cost_list) {
+ if (USE_SAD_COSTLIST) {
+ const int costlist_has_sad = 0;
+ calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
+ } else {
+ calc_int_cost_list(*best_mv, ms_params, cost_list);
+ }
+ }
+ return bestsme;
+}
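+
+// Illustrative progression (hypothetical mesh_patterns): with patterns
+// { {64, 4}, {16, 1} } the first pass scans +/-64 around the start at
+// interval 4, the second pass scans +/-16 around the refined best at
+// interval 1, and the loop stops as soon as a pattern with interval == 1
+// has run.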
+
+// This function is called when we do joint motion search in comp_inter_inter
+// mode, or when searching for one component of an ext-inter compound mode.
+int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const FULLPEL_MV start_mv, FULLPEL_MV *best_mv) {
+ static const search_neighbors neighbors[8] = {
+ { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 },
+ { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 },
+ { { 0, 1 }, 0 * SEARCH_GRID_STRIDE_8P + 1 },
+ { { 1, 0 }, 1 * SEARCH_GRID_STRIDE_8P + 0 },
+ { { -1, -1 }, -1 * SEARCH_GRID_STRIDE_8P - 1 },
+ { { 1, -1 }, 1 * SEARCH_GRID_STRIDE_8P - 1 },
+ { { -1, 1 }, -1 * SEARCH_GRID_STRIDE_8P + 1 },
+ { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 }
+ };
+
+ uint8_t do_refine_search_grid[SEARCH_GRID_STRIDE_8P *
+ SEARCH_GRID_STRIDE_8P] = { 0 };
+ int grid_center = SEARCH_GRID_CENTER_8P;
+ int grid_coord = grid_center;
+
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const FullMvLimits *mv_limits = &ms_params->mv_limits;
+ const MSBuffers *ms_buffers = &ms_params->ms_buffers;
+ const struct buf_2d *src = ms_buffers->src;
+ const struct buf_2d *ref = ms_buffers->ref;
+ const int ref_stride = ref->stride;
+
+ *best_mv = start_mv;
+ clamp_fullmv(best_mv, mv_limits);
+
+ unsigned int best_sad = get_mvpred_compound_sad(
+ ms_params, src, get_buf_from_fullmv(ref, best_mv), ref_stride);
+ best_sad += mvsad_err_cost_(best_mv, mv_cost_params);
+
+ do_refine_search_grid[grid_coord] = 1;
+
+ for (int i = 0; i < SEARCH_RANGE_8P; ++i) {
+ int best_site = -1;
+
+ for (int j = 0; j < 8; ++j) {
+ grid_coord = grid_center + neighbors[j].coord_offset;
+ if (do_refine_search_grid[grid_coord] == 1) {
+ continue;
+ }
+ const FULLPEL_MV mv = { best_mv->row + neighbors[j].coord.row,
+ best_mv->col + neighbors[j].coord.col };
+
+ do_refine_search_grid[grid_coord] = 1;
+ if (av1_is_fullmv_in_range(mv_limits, mv)) {
+ unsigned int sad;
+ sad = get_mvpred_compound_sad(
+ ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost_(&mv, mv_cost_params);
+
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ best_mv->row += neighbors[best_site].coord.row;
+ best_mv->col += neighbors[best_site].coord.col;
+ grid_center += neighbors[best_site].coord_offset;
+ }
+ }
+ return best_sad;
+}
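+
+// Note on do_refine_search_grid above: the byte grid records every visited
+// offset relative to the start, so across the SEARCH_RANGE_8P iterations
+// each point is evaluated at most once. For example, after moving one step
+// to the right, five of the eight neighbors of the new center were already
+// checked in the previous iteration and are skipped.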
+
+int av1_full_pixel_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int step_param, int *cost_list,
+ FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats,
+ FULLPEL_MV *second_best_mv) {
+ const BLOCK_SIZE bsize = ms_params->bsize;
+ const SEARCH_METHODS search_method = ms_params->search_method;
+
+ const int is_intra_mode = ms_params->is_intra_mode;
+ int run_mesh_search = ms_params->run_mesh_search;
+
+ int var = 0;
+ MARK_MV_INVALID(best_mv);
+ if (second_best_mv) {
+ MARK_MV_INVALID(second_best_mv);
+ }
+
+ if (cost_list) {
+ cost_list[0] = INT_MAX;
+ cost_list[1] = INT_MAX;
+ cost_list[2] = INT_MAX;
+ cost_list[3] = INT_MAX;
+ cost_list[4] = INT_MAX;
+ }
+
+ assert(ms_params->ms_buffers.ref->stride == ms_params->search_sites->stride);
+ assert(ms_params->ms_buffers.ref->width == ms_params->ms_buffers.src->width);
+
+ switch (search_method) {
+ case FAST_BIGDIA:
+ var = fast_bigdia_search(start_mv, ms_params, step_param, 0, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case VFAST_DIAMOND:
+ var = vfast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case FAST_DIAMOND:
+ var = fast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case FAST_HEX:
+ var = fast_hex_search(start_mv, ms_params, step_param, 0, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case HEX:
+ var = hex_search(start_mv, ms_params, step_param, 1, cost_list, best_mv,
+ best_mv_stats);
+ break;
+ case SQUARE:
+ var = square_search(start_mv, ms_params, step_param, 1, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case BIGDIA:
+ var = bigdia_search(start_mv, ms_params, step_param, 1, cost_list,
+ best_mv, best_mv_stats);
+ break;
+ case NSTEP:
+ case NSTEP_8PT:
+ case DIAMOND:
+ case CLAMPED_DIAMOND:
+ var = full_pixel_diamond(start_mv, ms_params, step_param, cost_list,
+ best_mv, best_mv_stats, second_best_mv);
+ break;
+ default: assert(0 && "Invalid search method.");
+ }
+
+ // Should we allow a follow-on exhaustive search?
+ if (!run_mesh_search &&
+ ((search_method == NSTEP) || (search_method == NSTEP_8PT)) &&
+ !ms_params->ms_buffers.second_pred) {
+ int exhaustive_thr = ms_params->force_mesh_thresh;
+ exhaustive_thr >>=
+ 10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+ // Threshold variance for an exhaustive full search.
+ if (var > exhaustive_thr) run_mesh_search = 1;
+ }
+
+ // TODO(yunqing): the following is used to reduce mesh search in temporal
+ // filtering. Can extend it to intrabc.
+ if (!is_intra_mode && ms_params->prune_mesh_search) {
+ const int full_pel_mv_diff = AOMMAX(abs(start_mv.row - best_mv->row),
+ abs(start_mv.col - best_mv->col));
+ if (full_pel_mv_diff <= ms_params->mesh_search_mv_diff_threshold) {
+ run_mesh_search = 0;
+ }
+ }
+
+ if (ms_params->sdf != ms_params->vfp->sdf) {
+ // If we are skipping rows when we perform the motion search, we need to
+ // check the quality of skipping. If it's bad, then we run mesh search with
+ // skip row features off.
+ // TODO(chiyotsai@google.com): Handle the case where we have a vertical
+ // offset of 1 before we hit this statement to avoid having to redo
+ // motion search.
+ const struct buf_2d *src = ms_params->ms_buffers.src;
+ const struct buf_2d *ref = ms_params->ms_buffers.ref;
+ const int src_stride = src->stride;
+ const int ref_stride = ref->stride;
+
+ const uint8_t *src_address = src->buf;
+ const uint8_t *best_address = get_buf_from_fullmv(ref, best_mv);
+ const int sad =
+ ms_params->vfp->sdf(src_address, src_stride, best_address, ref_stride);
+ const int skip_sad =
+ ms_params->vfp->sdsf(src_address, src_stride, best_address, ref_stride);
+ // We will keep the result of skipping rows if it's good enough. Here, good
+ // enough means the error is less than 1 per pixel.
+ const int kSADThresh =
+ 1 << (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+ if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= AOMMAX(sad, 1) * 9) {
+ // There is a large discrepancy between skipping and not skipping, so we
+ // need to redo the motion search.
+ FULLPEL_MOTION_SEARCH_PARAMS new_ms_params = *ms_params;
+ new_ms_params.sdf = new_ms_params.vfp->sdf;
+ new_ms_params.sdx4df = new_ms_params.vfp->sdx4df;
+ new_ms_params.sdx3df = new_ms_params.vfp->sdx3df;
+
+ return av1_full_pixel_search(start_mv, &new_ms_params, step_param,
+ cost_list, best_mv, best_mv_stats,
+ second_best_mv);
+ }
+ }
+
+ if (run_mesh_search) {
+ int var_ex;
+ FULLPEL_MV tmp_mv_ex;
+ FULLPEL_MV_STATS tmp_mv_stats;
+ // Pick the mesh pattern for exhaustive search based on the toolset
+ // (intraBC or non-intraBC).
+ // TODO(chiyotsai@google.com): the second best mv is not set correctly by
+ // full_pixel_exhaustive, which can incorrectly override it without
+ // actually comparing the rdcost.
+ const MESH_PATTERN *const mesh_patterns =
+ ms_params->mesh_patterns[is_intra_mode];
+ var_ex =
+ full_pixel_exhaustive(*best_mv, ms_params, mesh_patterns, cost_list,
+ &tmp_mv_ex, &tmp_mv_stats, second_best_mv);
+ if (var_ex < var) {
+ var = var_ex;
+ *best_mv_stats = tmp_mv_stats;
+ *best_mv = tmp_mv_ex;
+ }
+ }
+
+ return var;
+}
+
+int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ IntraBCHashInfo *intrabc_hash_info,
+ FULLPEL_MV *best_mv) {
+ if (!av1_use_hash_me(cpi)) return INT_MAX;
+
+ const BLOCK_SIZE bsize = ms_params->bsize;
+ const int block_width = block_size_wide[bsize];
+ const int block_height = block_size_high[bsize];
+
+ if (block_width != block_height) return INT_MAX;
+
+ const FullMvLimits *mv_limits = &ms_params->mv_limits;
+ const MSBuffers *ms_buffer = &ms_params->ms_buffers;
+
+ const uint8_t *src = ms_buffer->src->buf;
+ const int src_stride = ms_buffer->src->stride;
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const int x_pos = mi_col * MI_SIZE;
+ const int y_pos = mi_row * MI_SIZE;
+
+ uint32_t hash_value1, hash_value2;
+ int best_hash_cost = INT_MAX;
+
+ // The hash table holding block hashes for intraBC matching.
+ hash_table *ref_frame_hash = &intrabc_hash_info->intrabc_hash_table;
+
+ av1_get_block_hash_value(intrabc_hash_info, src, src_stride, block_width,
+ &hash_value1, &hash_value2, is_cur_buf_hbd(xd));
+
+ const int count = av1_hash_table_count(ref_frame_hash, hash_value1);
+ if (count <= 1) {
+ return INT_MAX;
+ }
+
+ Iterator iterator = av1_hash_get_first_iterator(ref_frame_hash, hash_value1);
+ for (int i = 0; i < count; i++, aom_iterator_increment(&iterator)) {
+ block_hash ref_block_hash = *(block_hash *)(aom_iterator_get(&iterator));
+ if (hash_value2 == ref_block_hash.hash_value2) {
+ // Make sure the prediction is from valid area.
+ const MV dv = { GET_MV_SUBPEL(ref_block_hash.y - y_pos),
+ GET_MV_SUBPEL(ref_block_hash.x - x_pos) };
+ if (!av1_is_dv_valid(dv, &cpi->common, xd, mi_row, mi_col, bsize,
+ cpi->common.seq_params->mib_size_log2))
+ continue;
+
+ FULLPEL_MV hash_mv;
+ hash_mv.col = ref_block_hash.x - x_pos;
+ hash_mv.row = ref_block_hash.y - y_pos;
+ if (!av1_is_fullmv_in_range(mv_limits, hash_mv)) continue;
+ FULLPEL_MV_STATS mv_stats;
+ const int refCost = get_mvpred_var_cost(ms_params, &hash_mv, &mv_stats);
+ if (refCost < best_hash_cost) {
+ best_hash_cost = refCost;
+ *best_mv = hash_mv;
+ }
+ }
+ }
+
+ return best_hash_cost;
+}
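+
+// Note on the two-level hashing above: hash_value1 selects the bucket in
+// the frame-level hash table (all blocks whose first hash collides), while
+// hash_value2 acts as a verifier -- only entries whose second hash also
+// matches are treated as candidate duplicates and scored with
+// get_mvpred_var_cost(), so false bucket collisions are filtered cheaply.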
+
+static int vector_match(int16_t *ref, int16_t *src, int bwl, int search_size,
+ int full_search, int *sad) {
+ int best_sad = INT_MAX;
+ int this_sad;
+ int d;
+ int center, offset = 0;
+ int bw = search_size << 1;
+
+ if (full_search) {
+ for (d = 0; d <= bw; d++) {
+ this_sad = aom_vector_var(&ref[d], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ offset = d;
+ }
+ }
+ center = offset;
+ *sad = best_sad;
+ return (center - (bw >> 1));
+ }
+
+ for (d = 0; d <= bw; d += 16) {
+ this_sad = aom_vector_var(&ref[d], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ offset = d;
+ }
+ }
+ center = offset;
+
+ for (d = -8; d <= 8; d += 16) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -4; d <= 4; d += 8) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -2; d <= 2; d += 4) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -1; d <= 1; d += 2) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ *sad = best_sad;
+ return (center - (bw >> 1));
+}
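+
+// Illustrative trace of vector_match (hypothetical values): with
+// search_size = 32 (so bw = 64) and full_search == 0, the coarse scan tests
+// offsets 0, 16, 32, 48 and 64; say 32 wins. The refinement stages then
+// probe +/-8, +/-4, +/-2 and +/-1 around the running best, locating the
+// minimum with at most 5 + 8 = 13 aom_vector_var() calls instead of the 65
+// a full scan would need. The return value is re-centered on the window:
+// center - (bw >> 1).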
+
+// A special fast version of motion search used in rt mode.
+// The search window along columns and rows is given by:
+// +/- me_search_size_col/row.
+unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col, const MV *ref_mv,
+ unsigned int *y_sad_zero,
+ int me_search_size_col,
+ int me_search_size_row) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mi = xd->mi[0];
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+ int idx;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+ const int full_search = is_screen;
+ const bool screen_scroll_superblock =
+ is_screen && bsize == cm->seq_params->sb_size;
+ // Keep border a multiple of 16.
+ const int border = (cpi->oxcf.border_in_pixels >> 4) << 4;
+ int search_size_width = me_search_size_col;
+ int search_size_height = me_search_size_row;
+ // Adjust based on boundary.
+ if (((mi_col << 2) - search_size_width < -border) ||
+ ((mi_col << 2) + search_size_width > cm->width + border))
+ search_size_width = border;
+ if (((mi_row << 2) - search_size_height < -border) ||
+ ((mi_row << 2) + search_size_height > cm->height + border))
+ search_size_height = border;
+ const int src_stride = x->plane[0].src.stride;
+ const int ref_stride = xd->plane[0].pre[0].stride;
+ uint8_t const *ref_buf, *src_buf;
+ int_mv *best_int_mv = &xd->mi[0]->mv[0];
+ unsigned int best_sad, tmp_sad, this_sad[4];
+ int best_sad_col, best_sad_row;
+ const int row_norm_factor = mi_size_high_log2[bsize] + 1;
+ const int col_norm_factor = 3 + (bw >> 5);
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
+ static const MV search_pos[4] = {
+ { -1, 0 },
+ { 0, -1 },
+ { 0, 1 },
+ { 1, 0 },
+ };
+
+ if (scaled_ref_frame) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
+ av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
+ MAX_MB_PLANE);
+ }
+
+ if (xd->bd != 8) {
+ best_int_mv->as_fullmv = kZeroFullMv;
+ best_sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+ xd->plane[0].pre[0].buf, ref_stride);
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+ }
+ return best_sad;
+ }
+ const int width_ref_buf = (search_size_width << 1) + bw;
+ const int height_ref_buf = (search_size_height << 1) + bh;
+ int16_t *hbuf = (int16_t *)aom_malloc(width_ref_buf * sizeof(*hbuf));
+ int16_t *vbuf = (int16_t *)aom_malloc(height_ref_buf * sizeof(*vbuf));
+ int16_t *src_hbuf = (int16_t *)aom_malloc(bw * sizeof(*src_hbuf));
+ int16_t *src_vbuf = (int16_t *)aom_malloc(bh * sizeof(*src_vbuf));
+ if (!hbuf || !vbuf || !src_hbuf || !src_vbuf) {
+ aom_free(hbuf);
+ aom_free(vbuf);
+ aom_free(src_hbuf);
+ aom_free(src_vbuf);
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate hbuf, vbuf, src_hbuf, or src_vbuf");
+ }
+
+ // Set up prediction 1-D reference set for rows.
+ ref_buf = xd->plane[0].pre[0].buf - search_size_width;
+ aom_int_pro_row(hbuf, ref_buf, ref_stride, width_ref_buf, bh,
+ row_norm_factor);
+
+ // Set up prediction 1-D reference set for cols
+ ref_buf = xd->plane[0].pre[0].buf - search_size_height * ref_stride;
+ aom_int_pro_col(vbuf, ref_buf, ref_stride, bw, height_ref_buf,
+ col_norm_factor);
+
+ // Set up src 1-D reference set
+ src_buf = x->plane[0].src.buf;
+ aom_int_pro_row(src_hbuf, src_buf, src_stride, bw, bh, row_norm_factor);
+ aom_int_pro_col(src_vbuf, src_buf, src_stride, bw, bh, col_norm_factor);
+
+ // Find the best match per 1-D search
+ best_int_mv->as_fullmv.col =
+ vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize], search_size_width,
+ full_search, &best_sad_col);
+ best_int_mv->as_fullmv.row =
+ vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize], search_size_height,
+ full_search, &best_sad_row);
+
+ // For screen: select between horiz or vert motion.
+ if (is_screen) {
+ if (best_sad_col < best_sad_row)
+ best_int_mv->as_fullmv.row = 0;
+ else
+ best_int_mv->as_fullmv.col = 0;
+ }
+
+ FULLPEL_MV this_mv = best_int_mv->as_fullmv;
+ src_buf = x->plane[0].src.buf;
+ ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
+ best_sad =
+ cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+
+ // Evaluate zero MV if found MV is non-zero.
+ if (best_int_mv->as_int != 0) {
+ tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+ xd->plane[0].pre[0].buf, ref_stride);
+ *y_sad_zero = tmp_sad;
+ if (tmp_sad < best_sad) {
+ best_int_mv->as_fullmv = kZeroFullMv;
+ this_mv = best_int_mv->as_fullmv;
+ ref_buf = xd->plane[0].pre[0].buf;
+ best_sad = tmp_sad;
+ }
+ } else {
+ *y_sad_zero = best_sad;
+ }
+
+ if (!screen_scroll_superblock) {
+ const uint8_t *const pos[4] = {
+ ref_buf - ref_stride,
+ ref_buf - 1,
+ ref_buf + 1,
+ ref_buf + ref_stride,
+ };
+
+ cpi->ppi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride,
+ this_sad);
+
+ for (idx = 0; idx < 4; ++idx) {
+ if (this_sad[idx] < best_sad) {
+ best_sad = this_sad[idx];
+ best_int_mv->as_fullmv.row = search_pos[idx].row + this_mv.row;
+ best_int_mv->as_fullmv.col = search_pos[idx].col + this_mv.col;
+ }
+ }
+
+ if (this_sad[0] < this_sad[3])
+ this_mv.row -= 1;
+ else
+ this_mv.row += 1;
+
+ if (this_sad[1] < this_sad[2])
+ this_mv.col -= 1;
+ else
+ this_mv.col += 1;
+
+ ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
+
+ tmp_sad =
+ cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+ if (best_sad > tmp_sad) {
+ best_int_mv->as_fullmv = this_mv;
+ best_sad = tmp_sad;
+ }
+ }
+
+ FullMvLimits mv_limits = x->mv_limits;
+ av1_set_mv_search_range(&mv_limits, ref_mv);
+ clamp_fullmv(&best_int_mv->as_fullmv, &mv_limits);
+
+ convert_fullmv_to_mv(best_int_mv);
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+ }
+
+ aom_free(hbuf);
+ aom_free(vbuf);
+ aom_free(src_hbuf);
+ aom_free(src_vbuf);
+ return best_sad;
+}
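+
+// The projection trick used above in one line: aom_int_pro_row/col collapse
+// the 2-D pixel data into normalized 1-D sums (one value per column,
+// respectively per row), so the search becomes two independent 1-D matches
+// over horizontal and vertical shifts -- O(W + H) work per candidate instead
+// of O(W * H) -- followed by a handful of exact 2-D sad checks around the
+// combined winner.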
+
+// =============================================================================
+// Fullpixel Motion Search: OBMC
+// =============================================================================
+static INLINE int get_obmc_mvpred_var(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) {
+ const aom_variance_fn_ptr_t *vfp = ms_params->vfp;
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const MSBuffers *ms_buffers = &ms_params->ms_buffers;
+ const int32_t *wsrc = ms_buffers->wsrc;
+ const int32_t *mask = ms_buffers->obmc_mask;
+ const struct buf_2d *ref_buf = ms_buffers->ref;
+
+ const MV mv = get_mv_from_fullmv(this_mv);
+ unsigned int unused;
+
+ return vfp->ovf(get_buf_from_fullmv(ref_buf, this_mv), ref_buf->stride, wsrc,
+ mask, &unused) +
+ mv_err_cost_(&mv, mv_cost_params);
+}
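+
+// Note on the OBMC buffers above (roughly speaking; see the callers that
+// fill wsrc and obmc_mask): wsrc holds the overlap-weighted source and mask
+// the per-pixel weights, both as 32-bit arrays, and ovf/osdf score a
+// reference block against them directly, so no explicit second predictor is
+// blended here.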
+
+static int obmc_refining_search_sad(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV *best_mv) {
+ const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp;
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const MSBuffers *ms_buffers = &ms_params->ms_buffers;
+ const int32_t *wsrc = ms_buffers->wsrc;
+ const int32_t *mask = ms_buffers->obmc_mask;
+ const struct buf_2d *ref_buf = ms_buffers->ref;
+ const FULLPEL_MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+ const int kSearchRange = 8;
+
+ unsigned int best_sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, best_mv),
+ ref_buf->stride, wsrc, mask) +
+ mvsad_err_cost_(best_mv, mv_cost_params);
+
+ for (int i = 0; i < kSearchRange; i++) {
+ int best_site = -1;
+
+ for (int j = 0; j < 4; j++) {
+ const FULLPEL_MV mv = { best_mv->row + neighbors[j].row,
+ best_mv->col + neighbors[j].col };
+ if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) {
+ unsigned int sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, &mv),
+ ref_buf->stride, wsrc, mask);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost_(&mv, mv_cost_params);
+
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ best_mv->row += neighbors[best_site].row;
+ best_mv->col += neighbors[best_site].col;
+ }
+ }
+ return best_sad;
+}
+
+static int obmc_diamond_search_sad(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv,
+ FULLPEL_MV *best_mv, int search_step, int *num00) {
+ const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp;
+ const search_site_config *cfg = ms_params->search_sites;
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const MSBuffers *ms_buffers = &ms_params->ms_buffers;
+ const int32_t *wsrc = ms_buffers->wsrc;
+ const int32_t *mask = ms_buffers->obmc_mask;
+ const struct buf_2d *const ref_buf = ms_buffers->ref;
+
+ // search_step determines the length of the initial step and hence the number
+ // of iterations.
+ const int tot_steps = cfg->num_search_steps - search_step;
+ const uint8_t *best_address, *init_ref;
+ int best_sad = INT_MAX;
+ int best_site = 0;
+
+ clamp_fullmv(&start_mv, &ms_params->mv_limits);
+ best_address = init_ref = get_buf_from_fullmv(ref_buf, &start_mv);
+ *num00 = 0;
+ *best_mv = start_mv;
+
+ // Check the starting position
+ best_sad = fn_ptr->osdf(best_address, ref_buf->stride, wsrc, mask) +
+ mvsad_err_cost_(best_mv, mv_cost_params);
+
+ for (int step = tot_steps - 1; step >= 0; --step) {
+ const search_site *const site = cfg->site[step];
+ best_site = 0;
+ for (int idx = 1; idx <= cfg->searches_per_step[step]; ++idx) {
+ const FULLPEL_MV mv = { best_mv->row + site[idx].mv.row,
+ best_mv->col + site[idx].mv.col };
+ if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) {
+ int sad = fn_ptr->osdf(best_address + site[idx].offset, ref_buf->stride,
+ wsrc, mask);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost_(&mv, mv_cost_params);
+
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = idx;
+ }
+ }
+ }
+ }
+
+ if (best_site != 0) {
+ best_mv->row += site[best_site].mv.row;
+ best_mv->col += site[best_site].mv.col;
+ best_address += site[best_site].offset;
+ } else if (best_address == init_ref) {
+ (*num00)++;
+ }
+ }
+ return best_sad;
+}
+
+static int obmc_full_pixel_diamond(
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV start_mv,
+ int step_param, FULLPEL_MV *best_mv) {
+ const search_site_config *cfg = ms_params->search_sites;
+ FULLPEL_MV tmp_mv;
+ int thissme, n, num00 = 0;
+ int bestsme =
+ obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv, step_param, &n);
+ if (bestsme < INT_MAX) bestsme = get_obmc_mvpred_var(ms_params, &tmp_mv);
+ *best_mv = tmp_mv;
+
+ // If there won't be more n-step search, check to see if refining search is
+ // needed.
+ const int further_steps = cfg->num_search_steps - 1 - step_param;
+
+ while (n < further_steps) {
+ ++n;
+
+ if (num00) {
+ num00--;
+ } else {
+ thissme = obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv,
+ step_param + n, &num00);
+ if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, &tmp_mv);
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *best_mv = tmp_mv;
+ }
+ }
+ }
+
+ return bestsme;
+}
+
+int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int step_param, FULLPEL_MV *best_mv) {
+ if (!ms_params->fast_obmc_search) {
+ const int bestsme =
+ obmc_full_pixel_diamond(ms_params, start_mv, step_param, best_mv);
+ return bestsme;
+ } else {
+ *best_mv = start_mv;
+ clamp_fullmv(best_mv, &ms_params->mv_limits);
+ int thissme = obmc_refining_search_sad(ms_params, best_mv);
+ if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, best_mv);
+ return thissme;
+ }
+}
+
+// =============================================================================
+// Subpixel Motion Search: Translational
+// =============================================================================
+#define INIT_SUBPEL_STEP_SIZE (4)
+/*
+ * To avoid the penalty of a cache-line-crossing read, preload the reference
+ * area into a small buffer that is aligned so reads from it never cross a
+ * cache line. This reduces the CPU cycles spent on reading ref data in the
+ * sub-pixel filter functions.
+ * TODO: Currently, since the sub-pixel search range here is -3 ~ 3, copy a
+ * 22-row x 32-col area, which is enough for a 16x16 macroblock. Later, for
+ * SPLITMV, we could reduce the area.
+ */
+
+// Returns the subpel offset used by various subpel variance functions [m]sv[a]f
+static INLINE int get_subpel_part(int x) { return x & 7; }
+
+// Gets the address of the ref buffer at subpel location (r, c), rounded
+// toward -infinity to the nearest fullpel position.
+static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
+ const MV mv) {
+ const int offset = (mv.row >> 3) * buf->stride + (mv.col >> 3);
+ return &buf->buf[offset];
+}
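+// Example (illustrative): mv components are in 1/8-pel units, so mv.col = 21
+// splits into fullpel 21 >> 3 = 2 and subpel 21 & 7 = 5, i.e. 2 + 5/8 pel.
+// For negative components the arithmetic shift floors, e.g. -3 >> 3 = -1,
+// which is the rounding toward -infinity mentioned above.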
+
+// Estimates the variance of prediction residue using a bilinear filter for
+// fast search.
+static INLINE int estimated_pref_error(
+ const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ unsigned int *sse) {
+ const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+
+ const MSBuffers *ms_buffers = &var_params->ms_buffers;
+ const uint8_t *src = ms_buffers->src->buf;
+ const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+ const int src_stride = ms_buffers->src->stride;
+ const int ref_stride = ms_buffers->ref->stride;
+ const uint8_t *second_pred = ms_buffers->second_pred;
+ const uint8_t *mask = ms_buffers->mask;
+ const int mask_stride = ms_buffers->mask_stride;
+ const int invert_mask = ms_buffers->inv_mask;
+
+ const int subpel_x_q3 = get_subpel_part(this_mv->col);
+ const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+ if (second_pred == NULL) {
+ return vfp->svf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+ sse);
+ } else if (mask) {
+ return vfp->msvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+ second_pred, mask, mask_stride, invert_mask, sse);
+ } else {
+ return vfp->svaf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+ sse, second_pred);
+ }
+}
+
+// Calculates the variance of prediction residue.
+static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
+ const MV *this_mv,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ unsigned int *sse) {
+ const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+ const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type;
+
+ const MSBuffers *ms_buffers = &var_params->ms_buffers;
+ const uint8_t *src = ms_buffers->src->buf;
+ const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+ const int src_stride = ms_buffers->src->stride;
+ const int ref_stride = ms_buffers->ref->stride;
+ const uint8_t *second_pred = ms_buffers->second_pred;
+ const uint8_t *mask = ms_buffers->mask;
+ const int mask_stride = ms_buffers->mask_stride;
+ const int invert_mask = ms_buffers->inv_mask;
+ const int w = var_params->w;
+ const int h = var_params->h;
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const int subpel_x_q3 = get_subpel_part(this_mv->col);
+ const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+ unsigned int besterr;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+ uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
+ if (second_pred != NULL) {
+ if (mask) {
+ aom_highbd_comp_mask_upsampled_pred(
+ xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
+ invert_mask, xd->bd, subpel_search_type);
+ } else {
+ aom_highbd_comp_avg_upsampled_pred(
+ xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
+ subpel_search_type);
+ }
+ } else {
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ xd->bd, subpel_search_type);
+ }
+ besterr = vfp->vf(pred8, w, src, src_stride, sse);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+ if (second_pred != NULL) {
+ if (mask) {
+ aom_comp_mask_upsampled_pred(
+ xd, cm, mi_row, mi_col, this_mv, pred, second_pred, w, h,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride,
+ invert_mask, subpel_search_type);
+ } else {
+ aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+ second_pred, w, h, subpel_x_q3, subpel_y_q3,
+ ref, ref_stride, subpel_search_type);
+ }
+ } else {
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search_type);
+ }
+
+ besterr = vfp->vf(pred, w, src, src_stride, sse);
+ }
+#else
+ DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+ if (second_pred != NULL) {
+ if (mask) {
+ aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+ second_pred, w, h, subpel_x_q3, subpel_y_q3,
+ ref, ref_stride, mask, mask_stride,
+ invert_mask, subpel_search_type);
+ } else {
+ aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred,
+ second_pred, w, h, subpel_x_q3, subpel_y_q3,
+ ref, ref_stride, subpel_search_type);
+ }
+ } else {
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
+ subpel_y_q3, ref, ref_stride, subpel_search_type);
+ }
+
+ besterr = vfp->vf(pred, w, src, src_stride, sse);
+#endif
+ return besterr;
+}
+
+// Estimates whether this_mv is better than best_mv. This function takes both
+// the prediction error and the mv cost into account. It is suffixed "fast"
+// because it uses a bilinear filter to estimate the prediction.
+static INLINE unsigned int check_better_fast(
+ MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+ const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int *has_better_mv, int is_scaled) {
+ unsigned int cost;
+ if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+ unsigned int sse;
+ int thismse;
+ if (is_scaled) {
+ thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse);
+ } else {
+ thismse = estimated_pref_error(this_mv, var_params, &sse);
+ }
+ cost = mv_err_cost_(this_mv, mv_cost_params);
+ cost += thismse;
+
+ if (cost < *besterr) {
+ *besterr = cost;
+ *best_mv = *this_mv;
+ *distortion = thismse;
+ *sse1 = sse;
+ *has_better_mv |= 1;
+ }
+ } else {
+ cost = INT_MAX;
+ }
+ return cost;
+}
+
+// Checks whether this_mv is better than best_mv. This function takes both
+// the prediction error and the mv cost into account.
+static AOM_FORCE_INLINE unsigned int check_better(
+ MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+ const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int *is_better) {
+ unsigned int cost;
+ if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+ unsigned int sse;
+ int thismse;
+ thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse);
+ cost = mv_err_cost_(this_mv, mv_cost_params);
+ cost += thismse;
+ if (cost < *besterr) {
+ *besterr = cost;
+ *best_mv = *this_mv;
+ *distortion = thismse;
+ *sse1 = sse;
+ *is_better |= 1;
+ }
+ } else {
+ cost = INT_MAX;
+ }
+ return cost;
+}
+
+static INLINE MV get_best_diag_step(int step_size, unsigned int left_cost,
+ unsigned int right_cost,
+ unsigned int up_cost,
+ unsigned int down_cost) {
+ const MV diag_step = { up_cost <= down_cost ? -step_size : step_size,
+ left_cost <= right_cost ? -step_size : step_size };
+
+ return diag_step;
+}
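+// E.g. (illustrative): if right_cost < left_cost and down_cost < up_cost,
+// the returned step is { step_size, step_size }, pointing into the
+// bottom-right quadrant; ties are broken toward the negative direction.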
+
+// Searches the four cardinal directions for a better mv, then follows up with
+// a search in the best quadrant. This uses a bilinear filter to speed up the
+// calculation.
+static AOM_FORCE_INLINE MV first_level_check_fast(
+ MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv,
+ int hstep, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int is_scaled) {
+ // Check the four cardinal directions
+ const MV left_mv = { this_mv.row, this_mv.col - hstep };
+ int dummy = 0;
+ const unsigned int left = check_better_fast(
+ xd, cm, &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+ sse1, distortion, &dummy, is_scaled);
+
+ const MV right_mv = { this_mv.row, this_mv.col + hstep };
+ const unsigned int right = check_better_fast(
+ xd, cm, &right_mv, best_mv, mv_limits, var_params, mv_cost_params,
+ besterr, sse1, distortion, &dummy, is_scaled);
+
+ const MV top_mv = { this_mv.row - hstep, this_mv.col };
+ const unsigned int up = check_better_fast(
+ xd, cm, &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+ sse1, distortion, &dummy, is_scaled);
+
+ const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
+ const unsigned int down = check_better_fast(
+ xd, cm, &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params,
+ besterr, sse1, distortion, &dummy, is_scaled);
+
+ const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
+ const MV diag_mv = { this_mv.row + diag_step.row,
+ this_mv.col + diag_step.col };
+
+ // Check the diagonal direction with the best mv
+ check_better_fast(xd, cm, &diag_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+
+ return diag_step;
+}
+
+// Performs a follow-up search after first_level_check_fast is called. This
+// performs two extra chess-pattern searches in the best quadrant.
+static AOM_FORCE_INLINE void second_level_check_fast(
+ MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, const MV diag_step,
+ MV *best_mv, int hstep, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int is_scaled) {
+ assert(diag_step.row == hstep || diag_step.row == -hstep);
+ assert(diag_step.col == hstep || diag_step.col == -hstep);
+ const int tr = this_mv.row;
+ const int tc = this_mv.col;
+ const int br = best_mv->row;
+ const int bc = best_mv->col;
+ int dummy = 0;
+ if (tr != br && tc != bc) {
+ assert(diag_step.col == bc - tc);
+ assert(diag_step.row == br - tr);
+ const MV chess_mv_1 = { br, bc + diag_step.col };
+ const MV chess_mv_2 = { br + diag_step.row, bc };
+ check_better_fast(xd, cm, &chess_mv_1, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+
+ check_better_fast(xd, cm, &chess_mv_2, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+ } else if (tr == br && tc != bc) {
+ assert(diag_step.col == bc - tc);
+ // Continue searching in the best direction
+ const MV bottom_long_mv = { br + hstep, bc + diag_step.col };
+ const MV top_long_mv = { br - hstep, bc + diag_step.col };
+ check_better_fast(xd, cm, &bottom_long_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &top_long_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+
+ // Search in the direction opposite of the best quadrant
+ const MV rev_mv = { br - diag_step.row, bc };
+ check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+ } else if (tr != br && tc == bc) {
+ assert(diag_step.row == br - tr);
+ // Continue searching in the best direction
+ const MV right_long_mv = { br + diag_step.row, bc + hstep };
+ const MV left_long_mv = { br + diag_step.row, bc - hstep };
+ check_better_fast(xd, cm, &right_long_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &left_long_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+
+ // Search in the direction opposite of the best quadrant
+ const MV rev_mv = { br, bc - diag_step.col };
+ check_better_fast(xd, cm, &rev_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy,
+ is_scaled);
+ }
+}
+
+// Combines the first level check and the second level check when applicable.
+// This first searches the four cardinal directions, then performs several
+// diagonal/chess-pattern searches in the best quadrant.
+static AOM_FORCE_INLINE void two_level_checks_fast(
+ MACROBLOCKD *xd, const AV1_COMMON *cm, const MV this_mv, MV *best_mv,
+ int hstep, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int iters, int is_scaled) {
+ const MV diag_step = first_level_check_fast(
+ xd, cm, this_mv, best_mv, hstep, mv_limits, var_params, mv_cost_params,
+ besterr, sse1, distortion, is_scaled);
+ if (iters > 1) {
+ second_level_check_fast(xd, cm, this_mv, diag_step, best_mv, hstep,
+ mv_limits, var_params, mv_cost_params, besterr,
+ sse1, distortion, is_scaled);
+ }
+}
+
+static AOM_FORCE_INLINE MV
+first_level_check(MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv,
+ MV *best_mv, const int hstep, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion) {
+ int dummy = 0;
+ const MV left_mv = { this_mv.row, this_mv.col - hstep };
+ const MV right_mv = { this_mv.row, this_mv.col + hstep };
+ const MV top_mv = { this_mv.row - hstep, this_mv.col };
+ const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
+
+ const unsigned int left =
+ check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+ const unsigned int right =
+ check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+ const unsigned int up =
+ check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+ const unsigned int down =
+ check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+
+ const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
+ const MV diag_mv = { this_mv.row + diag_step.row,
+ this_mv.col + diag_step.col };
+
+ // Check the diagonal direction with the best mv
+ check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params, mv_cost_params,
+ besterr, sse1, distortion, &dummy);
+
+ return diag_step;
+}
+
+// A newer version of second level check that gives better quality.
+// TODO(chiyotsai@google.com): evaluate this on subpel_search_types different
+// from av1_find_best_sub_pixel_tree
+static AOM_FORCE_INLINE void second_level_check_v2(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step,
+ MV *best_mv, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int is_scaled) {
+ assert(best_mv->row == this_mv.row + diag_step.row ||
+ best_mv->col == this_mv.col + diag_step.col);
+ if (CHECK_MV_EQUAL(this_mv, *best_mv)) {
+ return;
+ } else if (this_mv.row == best_mv->row) {
+ // Search away from diagonal step since diagonal search did not provide any
+ // improvement
+ diag_step.row *= -1;
+ } else if (this_mv.col == best_mv->col) {
+ diag_step.col *= -1;
+ }
+
+ const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col };
+ const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col };
+ const MV diag_bias_mv = { best_mv->row + diag_step.row,
+ best_mv->col + diag_step.col };
+ int has_better_mv = 0;
+
+ if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
+ check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &has_better_mv);
+ check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &has_better_mv);
+
+ // Do an additional search if the second iteration gives a better mv
+ if (has_better_mv) {
+ check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &has_better_mv);
+ }
+ } else {
+ check_better_fast(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &has_better_mv,
+ is_scaled);
+ check_better_fast(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &has_better_mv,
+ is_scaled);
+
+ // Do an additional search if the second iteration gives a better mv
+ if (has_better_mv) {
+ check_better_fast(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion,
+ &has_better_mv, is_scaled);
+ }
+ }
+}
+
+// Gets the error at the beginning when the mv has fullpel precision
+static unsigned int setup_center_error(
+ const MACROBLOCKD *xd, const MV *bestmv,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+ const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+ const int w = var_params->w;
+ const int h = var_params->h;
+
+ const MSBuffers *ms_buffers = &var_params->ms_buffers;
+ const uint8_t *src = ms_buffers->src->buf;
+ const uint8_t *y = get_buf_from_mv(ms_buffers->ref, *bestmv);
+ const int src_stride = ms_buffers->src->stride;
+ const int y_stride = ms_buffers->ref->stride;
+ const uint8_t *second_pred = ms_buffers->second_pred;
+ const uint8_t *mask = ms_buffers->mask;
+ const int mask_stride = ms_buffers->mask_stride;
+ const int invert_mask = ms_buffers->inv_mask;
+
+ unsigned int besterr;
+
+ if (second_pred != NULL) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
+ uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
+ if (mask) {
+ aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride,
+ mask, mask_stride, invert_mask);
+ } else {
+ aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+ }
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
+ if (mask) {
+ aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask,
+ mask_stride, invert_mask);
+ } else {
+ aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+ }
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ }
+#else
+ (void)xd;
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
+ if (mask) {
+ aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask,
+ mask_stride, invert_mask);
+ } else {
+ aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+ }
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+#endif
+ } else {
+ besterr = vfp->vf(y, y_stride, src, src_stride, sse1);
+ }
+ *distortion = besterr;
+ besterr += mv_err_cost_(bestmv, mv_cost_params);
+ return besterr;
+}
+
+// Gets the error at the beginning when the mv has fullpel precision
+static unsigned int upsampled_setup_center_error(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *bestmv,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+ unsigned int besterr = upsampled_pref_error(xd, cm, bestmv, var_params, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost_(bestmv, mv_cost_params);
+ return besterr;
+}
+
+static INLINE int divide_and_round(int n, int d) {
+ return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
+}
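+// E.g. divide_and_round(7, 2) == 4 and divide_and_round(-7, 2) == -4: the
+// quotient is rounded to the nearest integer, with halves away from zero.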
+
+static INLINE int is_cost_list_wellbehaved(const int *cost_list) {
+ return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] &&
+ cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4];
+}
+
+// Returns an estimate of the cost-surface minimum at a precision of 1/2^bits.
+// Assumes a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C
+// For a given set of costs S0, S1, S2, S3, S4 at points
+// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
+// the solution for the location of the minima (x0, y0) is given by:
+// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
+// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
+// The code below is an integerized version of that.
+static AOM_INLINE void get_cost_surf_min(const int *cost_list, int *ir, int *ic,
+ int bits) {
+ *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)),
+ (cost_list[1] - 2 * cost_list[0] + cost_list[3]));
+ *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)),
+ (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
+}
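+// Worked example (illustrative): with cost_list = { S0, S1, S2, S3, S4 } =
+// { 100, 120, 106, 104, 118 } and bits = 1 (half-pel units), the surface is
+// well-behaved (S0 is the smallest) and
+//   ic = divide_and_round((120 - 104) * 1, 120 - 200 + 104)
+//      = divide_and_round(16, 24) = 1
+//   ir = divide_and_round((118 - 106) * 1, 118 - 200 + 106)
+//      = divide_and_round(12, 24) = 1
+// so the estimated minimum lies one half-pel step below and to the right of
+// the center (S3 < S1 pulls x right, S2 < S4 pulls y down).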
+
+// Checks the list of mvs searched in the last iteration and sees if we are
+// repeating it. If so, returns 1. Otherwise, updates the last_mv_search_list
+// with current_mv and returns 0.
+static INLINE int check_repeated_mv_and_update(int_mv *last_mv_search_list,
+ const MV current_mv, int iter) {
+ if (last_mv_search_list) {
+ if (CHECK_MV_EQUAL(last_mv_search_list[iter].as_mv, current_mv)) {
+ return 1;
+ }
+
+ last_mv_search_list[iter].as_mv = current_mv;
+ }
+ return 0;
+}
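+// E.g. (illustrative): if a previous invocation already visited mv { 2, -5 }
+// at iter == 1 and the current invocation arrives at the same mv at the same
+// iter, the search has entered a cycle and the caller bails out (the callers
+// below return INT_MAX in that case).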
+
+static AOM_INLINE int setup_center_error_facade(
+ MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *bestmv,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion,
+ int is_scaled) {
+ if (is_scaled) {
+ return upsampled_setup_center_error(xd, cm, bestmv, var_params,
+ mv_cost_params, sse1, distortion);
+ } else {
+ return setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+ distortion);
+ }
+}
+
+int av1_find_best_sub_pixel_tree_pruned_more(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+ unsigned int *sse1, int_mv *last_mv_search_list) {
+ (void)cm;
+ const int allow_hp = ms_params->allow_hp;
+ const int forced_stop = ms_params->forced_stop;
+ const int iters_per_step = ms_params->iters_per_step;
+ const int *cost_list = ms_params->cost_list;
+ const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are currently searching. Iter 0 corresponds to the
+  // fullpel mv, iter 1 to half-pel, and so on
+ int iter = 0;
+ int hstep = INIT_SUBPEL_STEP_SIZE; // Step size, initialized to 4/8=1/2 pel
+ unsigned int besterr = INT_MAX;
+ *bestmv = start_mv;
+
+ const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+ ? &cm->sf_identity
+ : xd->block_ref_scale_factors[0];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (start_mv_stats != NULL && !is_scaled) {
+ besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+ *distortion = start_mv_stats->distortion;
+ *sse1 = start_mv_stats->sse;
+ } else {
+ besterr =
+ setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params,
+ sse1, distortion, is_scaled);
+ }
+
+ // If forced_stop is FULL_PEL, return.
+ if (forced_stop == FULL_PEL) return besterr;
+
+ if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+ return INT_MAX;
+ }
+ iter++;
+
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+ int ir, ic;
+ get_cost_surf_min(cost_list, &ir, &ic, 1);
+ if (ir != 0 || ic != 0) {
+ const MV this_mv = { start_mv.row + ir * hstep,
+ start_mv.col + ic * hstep };
+ int dummy = 0;
+ check_better_fast(xd, cm, &this_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ }
+ } else {
+ two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+ var_params, mv_cost_params, &besterr, sse1,
+ distortion, iters_per_step, is_scaled);
+ }
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (two if the diagonal was selected). Continue to 1/4-pel.
+ if (forced_stop < HALF_PEL) {
+ if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+ return INT_MAX;
+ }
+ iter++;
+
+ hstep >>= 1;
+ start_mv = *bestmv;
+ two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+ var_params, mv_cost_params, &besterr, sse1,
+ distortion, iters_per_step, is_scaled);
+ }
+
+ if (allow_hp && forced_stop == EIGHTH_PEL) {
+ if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+ return INT_MAX;
+ }
+ iter++;
+
+ hstep >>= 1;
+ start_mv = *bestmv;
+ two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+ var_params, mv_cost_params, &besterr, sse1,
+ distortion, iters_per_step, is_scaled);
+ }
+
+ return besterr;
+}
+
+int av1_find_best_sub_pixel_tree_pruned(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+ unsigned int *sse1, int_mv *last_mv_search_list) {
+ (void)cm;
+ (void)start_mv_stats;
+ const int allow_hp = ms_params->allow_hp;
+ const int forced_stop = ms_params->forced_stop;
+ const int iters_per_step = ms_params->iters_per_step;
+ const int *cost_list = ms_params->cost_list;
+ const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are currently searching. Iter 0 corresponds to the
+  // fullpel mv, iter 1 to half-pel, and so on
+ int iter = 0;
+ int hstep = INIT_SUBPEL_STEP_SIZE; // Step size, initialized to 4/8=1/2 pel
+ unsigned int besterr = INT_MAX;
+ *bestmv = start_mv;
+
+ const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+ ? &cm->sf_identity
+ : xd->block_ref_scale_factors[0];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (start_mv_stats != NULL && !is_scaled) {
+ besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+ *distortion = start_mv_stats->distortion;
+ *sse1 = start_mv_stats->sse;
+ } else {
+ besterr =
+ setup_center_error_facade(xd, cm, bestmv, var_params, mv_cost_params,
+ sse1, distortion, is_scaled);
+ }
+
+ // If forced_stop is FULL_PEL, return.
+ if (forced_stop == FULL_PEL) return besterr;
+
+ if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+ return INT_MAX;
+ }
+ iter++;
+
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX) {
+ const unsigned int whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
+ (cost_list[2] < cost_list[4] ? 0 : 2);
+
+ const MV left_mv = { start_mv.row, start_mv.col - hstep };
+ const MV right_mv = { start_mv.row, start_mv.col + hstep };
+ const MV bottom_mv = { start_mv.row + hstep, start_mv.col };
+ const MV top_mv = { start_mv.row - hstep, start_mv.col };
+
+ const MV bottom_left_mv = { start_mv.row + hstep, start_mv.col - hstep };
+ const MV bottom_right_mv = { start_mv.row + hstep, start_mv.col + hstep };
+ const MV top_left_mv = { start_mv.row - hstep, start_mv.col - hstep };
+ const MV top_right_mv = { start_mv.row - hstep, start_mv.col + hstep };
+
+ int dummy = 0;
+
+ switch (whichdir) {
+ case 0: // bottom left quadrant
+ check_better_fast(xd, cm, &left_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &bottom_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &bottom_left_mv, bestmv, mv_limits,
+ var_params, mv_cost_params, &besterr, sse1,
+ distortion, &dummy, is_scaled);
+ break;
+ case 1: // bottom right quadrant
+ check_better_fast(xd, cm, &right_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &bottom_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &bottom_right_mv, bestmv, mv_limits,
+ var_params, mv_cost_params, &besterr, sse1,
+ distortion, &dummy, is_scaled);
+ break;
+ case 2: // top left quadrant
+ check_better_fast(xd, cm, &left_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &top_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &top_left_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ break;
+ case 3: // top right quadrant
+ check_better_fast(xd, cm, &right_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &top_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ check_better_fast(xd, cm, &top_right_mv, bestmv, mv_limits, var_params,
+ mv_cost_params, &besterr, sse1, distortion, &dummy,
+ is_scaled);
+ break;
+ }
+ } else {
+ two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+ var_params, mv_cost_params, &besterr, sse1,
+ distortion, iters_per_step, is_scaled);
+ }
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (two if the diagonal was selected). Continue to 1/4-pel.
+ if (forced_stop < HALF_PEL) {
+ if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+ return INT_MAX;
+ }
+ iter++;
+
+ hstep >>= 1;
+ start_mv = *bestmv;
+ two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+ var_params, mv_cost_params, &besterr, sse1,
+ distortion, iters_per_step, is_scaled);
+ }
+
+ if (allow_hp && forced_stop == EIGHTH_PEL) {
+ if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+ return INT_MAX;
+ }
+ iter++;
+
+ hstep >>= 1;
+ start_mv = *bestmv;
+ two_level_checks_fast(xd, cm, start_mv, bestmv, hstep, mv_limits,
+ var_params, mv_cost_params, &besterr, sse1,
+ distortion, iters_per_step, is_scaled);
+ }
+
+ return besterr;
+}
+
+int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats,
+ MV *bestmv, int *distortion,
+ unsigned int *sse1,
+ int_mv *last_mv_search_list) {
+ (void)start_mv_stats;
+ const int allow_hp = ms_params->allow_hp;
+ const int forced_stop = ms_params->forced_stop;
+ const int iters_per_step = ms_params->iters_per_step;
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+ const SUBPEL_SEARCH_TYPE subpel_search_type =
+ ms_params->var_params.subpel_search_type;
+ const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+
+ // How many steps to take. A round of 0 means fullpel search only, 1 means
+ // half-pel, and so on.
+ const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp);
+ int hstep = INIT_SUBPEL_STEP_SIZE; // Step size, initialized to 4/8=1/2 pel
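+  // E.g. (illustrative): forced_stop == EIGHTH_PEL with allow_hp set gives
+  // round == 3, so hstep takes the values 4, 2, 1 and the loop below searches
+  // at 1/2-, 1/4- and then 1/8-pel precision.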
+
+ unsigned int besterr = INT_MAX;
+
+ *bestmv = start_mv;
+
+ const struct scale_factors *const sf = is_intrabc_block(xd->mi[0])
+ ? &cm->sf_identity
+ : xd->block_ref_scale_factors[0];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (start_mv_stats != NULL && !is_scaled) {
+ besterr = start_mv_stats->distortion + start_mv_stats->err_cost;
+ *distortion = start_mv_stats->distortion;
+ *sse1 = start_mv_stats->sse;
+ } else {
+ if (subpel_search_type != USE_2_TAPS_ORIG) {
+ besterr = upsampled_setup_center_error(xd, cm, bestmv, var_params,
+ mv_cost_params, sse1, distortion);
+ } else {
+ besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+ distortion);
+ }
+ }
+
+ // If forced_stop is FULL_PEL, return.
+ if (!round) return besterr;
+
+ for (int iter = 0; iter < round; ++iter) {
+ MV iter_center_mv = *bestmv;
+ if (check_repeated_mv_and_update(last_mv_search_list, iter_center_mv,
+ iter)) {
+ return INT_MAX;
+ }
+
+ MV diag_step;
+ if (subpel_search_type != USE_2_TAPS_ORIG) {
+ diag_step = first_level_check(xd, cm, iter_center_mv, bestmv, hstep,
+ mv_limits, var_params, mv_cost_params,
+ &besterr, sse1, distortion);
+ } else {
+ diag_step = first_level_check_fast(xd, cm, iter_center_mv, bestmv, hstep,
+ mv_limits, var_params, mv_cost_params,
+ &besterr, sse1, distortion, is_scaled);
+ }
+
+ // Check diagonal sub-pixel position
+ if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) {
+ second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv,
+ mv_limits, var_params, mv_cost_params, &besterr,
+ sse1, distortion, is_scaled);
+ }
+
+ hstep >>= 1;
+ }
+
+ return besterr;
+}
+
+// Note(yunqingwang): The following 2 functions are only used in the motion
+// vector unit test; they return the extreme motion vectors allowed by the MV
+// limits.
+// Returns the maximum MV.
+int av1_return_max_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats,
+ MV *bestmv, int *distortion, unsigned int *sse1,
+ int_mv *last_mv_search_list) {
+ (void)xd;
+ (void)cm;
+ (void)start_mv;
+ (void)start_mv_stats;
+ (void)sse1;
+ (void)distortion;
+ (void)last_mv_search_list;
+
+ const int allow_hp = ms_params->allow_hp;
+ const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+
+ bestmv->row = mv_limits->row_max;
+ bestmv->col = mv_limits->col_max;
+
+ unsigned int besterr = 0;
+
+ // In the sub-pel motion search, if hp is not used, then the last bit of mv
+ // has to be 0.
+ lower_mv_precision(bestmv, allow_hp, 0);
+ return besterr;
+}
+
+// Returns the minimum MV.
+int av1_return_min_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats,
+ MV *bestmv, int *distortion, unsigned int *sse1,
+ int_mv *last_mv_search_list) {
+ (void)xd;
+ (void)cm;
+ (void)start_mv;
+ (void)start_mv_stats;
+ (void)sse1;
+ (void)distortion;
+ (void)last_mv_search_list;
+
+ const int allow_hp = ms_params->allow_hp;
+ const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+
+ bestmv->row = mv_limits->row_min;
+ bestmv->col = mv_limits->col_min;
+
+ unsigned int besterr = 0;
+ // In the sub-pel motion search, if hp is not used, then the last bit of mv
+ // has to be 0.
+ lower_mv_precision(bestmv, allow_hp, 0);
+ return besterr;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Computes the cost of the current predictor by going through the whole
+// av1_enc_build_inter_predictor pipeline. This is mainly used by warped mv
+// during motion_mode_rd. We go through the whole av1_enc_build_inter_predictor
+// pipeline because we might have changed the interpolation filter, etc.,
+// before motion_mode_rd is called.
+static INLINE unsigned int compute_motion_cost(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, BLOCK_SIZE bsize,
+ const MV *this_mv) {
+ unsigned int mse;
+ unsigned int sse;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+ const MSBuffers *ms_buffers = &var_params->ms_buffers;
+
+ const uint8_t *const src = ms_buffers->src->buf;
+ const int src_stride = ms_buffers->src->stride;
+ const uint8_t *const dst = xd->plane[0].dst.buf;
+ const int dst_stride = xd->plane[0].dst.stride;
+ const aom_variance_fn_ptr_t *vfp = ms_params->var_params.vfp;
+
+ mse = vfp->vf(dst, dst_stride, src, src_stride, &sse);
+ mse += mv_err_cost_(this_mv, &ms_params->mv_cost_params);
+ return mse;
+}
+
+// Refines MV in a small range
+
+// Macros to build bitmasks which help us avoid redundant computations
+//
+// To explain the idea here, imagine that on the first iteration of the
+// loop below, we step rightwards. Then, on the second iteration, the neighbors
+// to consider are:
+// . . .
+// 0 1 .
+// . . .
+// Where 0 is the initial search point, 1 is the best candidate found in the
+// first iteration, and the dots are the other neighbors of point 1.
+//
+// Naively, we would now need to scan all 8 neighbors of point 1 (point 0 and
+// the seven points marked with dots), and compare them to see where to move
+// next. However, we already evaluated 5 of those 8 neighbors in the last
+// iteration, and decided that they are worse than point 1. So we don't need
+// to re-consider these points. We only really need to consider the three
+// points which are adjacent to point 1 but *not* to point 0.
+//
+// As the algorithm goes on, there are other ways that redundant evaluations
+// can happen, if the search path curls back around on itself.
+//
+// To avoid all possible redundancies, we'd have to build a set containing
+// every point we have already checked, and this would be quite expensive.
+//
+// So instead, we apply a 95%-effective solution with a much lower overhead:
+// we prune out the points which were considered during the previous
+// iteration, but we don't worry about any prior iteration. This can be done
+// as follows:
+//
+// We build a static table, called neighbor_mask, which answers the question
+// "if we moved in direction X last time, which neighbors are new, and which
+// were scanned last iteration?"
+// Then we can query this table to quickly determine which points we need to
+// evaluate, and which we can skip.
+//
+// To query the table, the logic is simply:
+// neighbor_mask[i] & (1 << j) == "if we moved in direction i last iteration,
+// do we need to scan neighbor j this iteration?"
+#define NEIGHBOR_MASK_DIA(left, down, right, up) \
+ (left | (down << 1) | (right << 2) | (up << 3))
+
+#define NEIGHBOR_MASK_SQR(left, down, right, up, down_left, down_right, \
+ up_left, up_right) \
+ (left | (down << 1) | (right << 2) | (up << 3) | (down_left << 4) | \
+ (down_right << 5) | (up_left << 6) | (up_right << 7))
+
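+// Example (illustrative): for WARP_SEARCH_DIAMOND below, the neighbors are
+// ordered { left, down, right, up }. If the last accepted step was "right"
+// (index 2), then neighbor_mask[2] == NEIGHBOR_MASK_DIA(0, 1, 1, 1), so
+// (neighbor_mask[2] & (1 << 0)) == 0 and the "left" neighbor -- the point we
+// just came from -- is skipped on the next iteration.
+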
+static const warp_search_config warp_search_info[WARP_SEARCH_METHODS] = {
+ // WARP_SEARCH_DIAMOND
+ {
+ .num_neighbors = 4,
+ .neighbors = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } },
+ .neighbor_mask = {
+ // If we stepped left last time, consider all points except right
+ NEIGHBOR_MASK_DIA(1, 1, 0, 1),
+ // If we stepped down last time, consider all points except up
+ NEIGHBOR_MASK_DIA(1, 1, 1, 0),
+ // Stepped right last time
+ NEIGHBOR_MASK_DIA(0, 1, 1, 1),
+ // Stepped up last time
+ NEIGHBOR_MASK_DIA(1, 0, 1, 1),
+ },
+ },
+ // WARP_SEARCH_SQUARE
+ {
+ .num_neighbors = 8,
+ .neighbors = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 },
+ { 1, -1 }, { 1, 1 }, { -1, -1 }, { -1, 1 } },
+ .neighbor_mask = {
+ // If we stepped left last time, then we only need to consider 3 points:
+ // left, down+left, up+left
+ NEIGHBOR_MASK_SQR(1, 0, 0, 0, 1, 0, 1, 0),
+ // If we stepped down last time, then we only need to consider 3 points:
+ // down, down+left, down+right
+ NEIGHBOR_MASK_SQR(0, 1, 0, 0, 1, 1, 0, 0),
+ // Stepped right last time
+ NEIGHBOR_MASK_SQR(0, 0, 1, 0, 0, 1, 0, 1),
+ // Stepped up last time
+ NEIGHBOR_MASK_SQR(0, 0, 0, 1, 0, 0, 1, 1),
+
+ // If we stepped down+left last time, then we need to consider 5 points:
+ // left, down, down+left, down+right, up+left
+ NEIGHBOR_MASK_SQR(1, 1, 0, 0, 1, 1, 1, 0),
+ // Stepped down+right last time
+ NEIGHBOR_MASK_SQR(0, 1, 1, 0, 1, 1, 0, 1),
+ // Stepped up+left last time
+ NEIGHBOR_MASK_SQR(1, 0, 0, 1, 1, 0, 1, 1),
+ // Stepped up+right last time
+ NEIGHBOR_MASK_SQR(0, 0, 1, 1, 0, 1, 1, 1),
+ },
+ },
+};
+
+unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ BLOCK_SIZE bsize, const int *pts0,
+ const int *pts_inref0, int total_samples,
+ WARP_SEARCH_METHOD search_method,
+ int num_iterations) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+
+ const MV *neighbors = warp_search_info[search_method].neighbors;
+ const int num_neighbors = warp_search_info[search_method].num_neighbors;
+ const uint8_t *neighbor_mask = warp_search_info[search_method].neighbor_mask;
+
+ MV *best_mv = &mbmi->mv[0].as_mv;
+
+ WarpedMotionParams best_wm_params = mbmi->wm_params;
+ int best_num_proj_ref = mbmi->num_proj_ref;
+ unsigned int bestmse;
+ const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+
+ const int mv_shift = ms_params->allow_hp ? 0 : 1;
+
+ // Calculate the center position's error
+ assert(av1_is_subpelmv_in_range(mv_limits, *best_mv));
+ bestmse = compute_motion_cost(xd, cm, ms_params, bsize, best_mv);
+
+ // MV search
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // First step always scans all neighbors
+ uint8_t valid_neighbors = UINT8_MAX;
+
+ for (int ite = 0; ite < num_iterations; ++ite) {
+ int best_idx = -1;
+
+ for (int idx = 0; idx < num_neighbors; ++idx) {
+ if ((valid_neighbors & (1 << idx)) == 0) {
+ continue;
+ }
+
+ unsigned int thismse;
+
+ MV this_mv = { best_mv->row + neighbors[idx].row * (1 << mv_shift),
+ best_mv->col + neighbors[idx].col * (1 << mv_shift) };
+ if (av1_is_subpelmv_in_range(mv_limits, this_mv)) {
+ memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
+ memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+ if (total_samples > 1) {
+ mbmi->num_proj_ref =
+ av1_selectSamples(&this_mv, pts, pts_inref, total_samples, bsize);
+ }
+
+ if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
+ this_mv.row, this_mv.col, &mbmi->wm_params,
+ mi_row, mi_col)) {
+ thismse = compute_motion_cost(xd, cm, ms_params, bsize, &this_mv);
+
+ if (thismse < bestmse) {
+ best_idx = idx;
+ best_wm_params = mbmi->wm_params;
+ best_num_proj_ref = mbmi->num_proj_ref;
+ bestmse = thismse;
+ }
+ }
+ }
+ }
+
+ if (best_idx == -1) break;
+
+ if (best_idx >= 0) {
+ best_mv->row += neighbors[best_idx].row * (1 << mv_shift);
+ best_mv->col += neighbors[best_idx].col * (1 << mv_shift);
+ valid_neighbors = neighbor_mask[best_idx];
+ }
+ }
+
+ mbmi->wm_params = best_wm_params;
+ mbmi->num_proj_ref = best_num_proj_ref;
+ return bestmse;
+}
+
+#endif // !CONFIG_REALTIME_ONLY
+// =============================================================================
+// Subpixel Motion Search: OBMC
+// =============================================================================
+// Estimates the variance of prediction residue
+static INLINE int estimate_obmc_pref_error(
+ const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ unsigned int *sse) {
+ const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+
+ const MSBuffers *ms_buffers = &var_params->ms_buffers;
+ const int32_t *src = ms_buffers->wsrc;
+ const int32_t *mask = ms_buffers->obmc_mask;
+ const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+ const int ref_stride = ms_buffers->ref->stride;
+
+ const int subpel_x_q3 = get_subpel_part(this_mv->col);
+ const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+ return vfp->osvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, mask, sse);
+}
+
+// Calculates the variance of prediction residue
+static int upsampled_obmc_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm,
+ const MV *this_mv,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ unsigned int *sse) {
+ const aom_variance_fn_ptr_t *vfp = var_params->vfp;
+ const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type;
+ const int w = var_params->w;
+ const int h = var_params->h;
+
+ const MSBuffers *ms_buffers = &var_params->ms_buffers;
+ const int32_t *wsrc = ms_buffers->wsrc;
+ const int32_t *mask = ms_buffers->obmc_mask;
+ const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv);
+ const int ref_stride = ms_buffers->ref->stride;
+
+ const int subpel_x_q3 = get_subpel_part(this_mv->col);
+ const int subpel_y_q3 = get_subpel_part(this_mv->row);
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ unsigned int besterr;
+ DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd,
+ subpel_search_type);
+ besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
+ } else {
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
+ subpel_y_q3, ref, ref_stride, subpel_search_type);
+
+ besterr = vfp->ovf(pred, w, wsrc, mask, sse);
+ }
+#else
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3,
+ subpel_y_q3, ref, ref_stride, subpel_search_type);
+
+ besterr = vfp->ovf(pred, w, wsrc, mask, sse);
+#endif
+ return besterr;
+}
+
+static unsigned int setup_obmc_center_error(
+ const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+ // TODO(chiyotsai@google.com): There might be a bug here where we didn't use
+ // get_buf_from_mv(ref, *this_mv).
+ const MSBuffers *ms_buffers = &var_params->ms_buffers;
+ const int32_t *wsrc = ms_buffers->wsrc;
+ const int32_t *mask = ms_buffers->obmc_mask;
+ const uint8_t *ref = ms_buffers->ref->buf;
+ const int ref_stride = ms_buffers->ref->stride;
+ unsigned int besterr =
+ var_params->vfp->ovf(ref, ref_stride, wsrc, mask, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost_(this_mv, mv_cost_params);
+ return besterr;
+}
+
+static unsigned int upsampled_setup_obmc_center_error(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *this_mv,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) {
+ unsigned int besterr =
+ upsampled_obmc_pref_error(xd, cm, this_mv, var_params, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost_(this_mv, mv_cost_params);
+ return besterr;
+}
+
+// Estimates the mv rate cost for the obmc search.
+// TODO(chiyotsai@google.com): the cost does not match the cost in
+// mv_cost_. Investigate this later.
+static INLINE int estimate_obmc_mvcost(const MV *this_mv,
+ const MV_COST_PARAMS *mv_cost_params) {
+ const MV *ref_mv = mv_cost_params->ref_mv;
+ const int *mvjcost = mv_cost_params->mvjcost;
+ const int *const *mvcost = mv_cost_params->mvcost;
+ const int error_per_bit = mv_cost_params->error_per_bit;
+ const MV_COST_TYPE mv_cost_type = mv_cost_params->mv_cost_type;
+ const MV diff_mv = { GET_MV_SUBPEL(this_mv->row - ref_mv->row),
+ GET_MV_SUBPEL(this_mv->col - ref_mv->col) };
+
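+  // Note: "(... * error_per_bit + 4096) >> 13" below rounds
+  // rate * error_per_bit to the nearest integer with 13 fractional bits,
+  // i.e. ROUND_POWER_OF_TWO(mv_cost(...) * error_per_bit, 13).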
+ switch (mv_cost_type) {
+ case MV_COST_ENTROPY:
+ return (unsigned)((mv_cost(&diff_mv, mvjcost,
+ CONVERT_TO_CONST_MVCOST(mvcost)) *
+ error_per_bit +
+ 4096) >>
+ 13);
+ case MV_COST_NONE: return 0;
+ default:
+ assert(0 && "L1 norm is not tuned for estimated obmc mvcost");
+ return 0;
+ }
+}
+
+// Estimates whether this_mv is better than best_mv. This function takes both
+// the prediction error and the mv cost into account.
+static INLINE unsigned int obmc_check_better_fast(
+ const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int *has_better_mv) {
+ unsigned int cost;
+ if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+ unsigned int sse;
+ const int thismse = estimate_obmc_pref_error(this_mv, var_params, &sse);
+
+ cost = estimate_obmc_mvcost(this_mv, mv_cost_params);
+ cost += thismse;
+
+ if (cost < *besterr) {
+ *besterr = cost;
+ *best_mv = *this_mv;
+ *distortion = thismse;
+ *sse1 = sse;
+ *has_better_mv |= 1;
+ }
+ } else {
+ cost = INT_MAX;
+ }
+ return cost;
+}
+
+// Checks whether this_mv is better than best_mv. This function takes both
+// the prediction error and the mv cost into account.
+static INLINE unsigned int obmc_check_better(
+ MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv,
+ const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion, int *has_better_mv) {
+ unsigned int cost;
+ if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) {
+ unsigned int sse;
+ const int thismse =
+ upsampled_obmc_pref_error(xd, cm, this_mv, var_params, &sse);
+ cost = mv_err_cost_(this_mv, mv_cost_params);
+
+ cost += thismse;
+
+ if (cost < *besterr) {
+ *besterr = cost;
+ *best_mv = *this_mv;
+ *distortion = thismse;
+ *sse1 = sse;
+ *has_better_mv |= 1;
+ }
+ } else {
+ cost = INT_MAX;
+ }
+ return cost;
+}
+
+static AOM_FORCE_INLINE MV obmc_first_level_check(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV *best_mv,
+ const int hstep, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion) {
+ int dummy = 0;
+ const MV left_mv = { this_mv.row, this_mv.col - hstep };
+ const MV right_mv = { this_mv.row, this_mv.col + hstep };
+ const MV top_mv = { this_mv.row - hstep, this_mv.col };
+ const MV bottom_mv = { this_mv.row + hstep, this_mv.col };
+
+ if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
+ const unsigned int left =
+ obmc_check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+ const unsigned int right =
+ obmc_check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+ const unsigned int up =
+ obmc_check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+ const unsigned int down =
+ obmc_check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+
+ const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
+ const MV diag_mv = { this_mv.row + diag_step.row,
+ this_mv.col + diag_step.col };
+
+ // Check the diagonal direction with the best mv
+ obmc_check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+
+ return diag_step;
+ } else {
+ const unsigned int left = obmc_check_better_fast(
+ &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1,
+ distortion, &dummy);
+ const unsigned int right = obmc_check_better_fast(
+ &right_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+ sse1, distortion, &dummy);
+
+ const unsigned int up = obmc_check_better_fast(
+ &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1,
+ distortion, &dummy);
+
+ const unsigned int down = obmc_check_better_fast(
+ &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr,
+ sse1, distortion, &dummy);
+
+ const MV diag_step = get_best_diag_step(hstep, left, right, up, down);
+ const MV diag_mv = { this_mv.row + diag_step.row,
+ this_mv.col + diag_step.col };
+
+ // Check the diagonal direction with the best mv
+ obmc_check_better_fast(&diag_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion, &dummy);
+
+ return diag_step;
+ }
+}
+
+// A newer version of second level check for obmc that gives better quality.
+static AOM_FORCE_INLINE void obmc_second_level_check_v2(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step,
+ MV *best_mv, const SubpelMvLimits *mv_limits,
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params,
+ const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr,
+ unsigned int *sse1, int *distortion) {
+ assert(best_mv->row == this_mv.row + diag_step.row ||
+ best_mv->col == this_mv.col + diag_step.col);
+ if (CHECK_MV_EQUAL(this_mv, *best_mv)) {
+ return;
+ } else if (this_mv.row == best_mv->row) {
+ // Search away from diagonal step since diagonal search did not provide any
+ // improvement
+ diag_step.row *= -1;
+ } else if (this_mv.col == best_mv->col) {
+ diag_step.col *= -1;
+ }
+
+ const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col };
+ const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col };
+ const MV diag_bias_mv = { best_mv->row + diag_step.row,
+ best_mv->col + diag_step.col };
+ int has_better_mv = 0;
+
+ if (var_params->subpel_search_type != USE_2_TAPS_ORIG) {
+ obmc_check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion,
+ &has_better_mv);
+ obmc_check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion,
+ &has_better_mv);
+
+ // Do an additional search if the second iteration gives a better mv
+ if (has_better_mv) {
+ obmc_check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion,
+ &has_better_mv);
+ }
+ } else {
+ obmc_check_better_fast(&row_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion,
+ &has_better_mv);
+ obmc_check_better_fast(&col_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion,
+ &has_better_mv);
+
+ // Do an additional search if the second iteration gives a better mv
+ if (has_better_mv) {
+ obmc_check_better_fast(&diag_bias_mv, best_mv, mv_limits, var_params,
+ mv_cost_params, besterr, sse1, distortion,
+ &has_better_mv);
+ }
+ }
+}
+
+int av1_find_best_obmc_sub_pixel_tree_up(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats, MV *bestmv, int *distortion,
+ unsigned int *sse1, int_mv *last_mv_search_list) {
+ (void)last_mv_search_list;
+ (void)start_mv_stats;
+ const int allow_hp = ms_params->allow_hp;
+ const int forced_stop = ms_params->forced_stop;
+ const int iters_per_step = ms_params->iters_per_step;
+ const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+ const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+ const SUBPEL_SEARCH_TYPE subpel_search_type =
+ ms_params->var_params.subpel_search_type;
+ const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+
+ int hstep = INIT_SUBPEL_STEP_SIZE;
+ const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp);
+
+ unsigned int besterr = INT_MAX;
+ *bestmv = start_mv;
+
+ if (subpel_search_type != USE_2_TAPS_ORIG)
+ besterr = upsampled_setup_obmc_center_error(
+ xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion);
+ else
+ besterr = setup_obmc_center_error(bestmv, var_params, mv_cost_params, sse1,
+ distortion);
+
+ for (int iter = 0; iter < round; ++iter) {
+ MV iter_center_mv = *bestmv;
+ MV diag_step = obmc_first_level_check(xd, cm, iter_center_mv, bestmv, hstep,
+ mv_limits, var_params, mv_cost_params,
+ &besterr, sse1, distortion);
+
+ if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) {
+ obmc_second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv,
+ mv_limits, var_params, mv_cost_params,
+ &besterr, sse1, distortion);
+ }
+ hstep >>= 1;
+ }
+
+ return besterr;
+}
+
+// =============================================================================
+// Public cost function: mv_cost + pred error
+// =============================================================================
+int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params,
+ const FULLPEL_MV best_mv,
+ const aom_variance_fn_ptr_t *vfp,
+ const struct buf_2d *src, const struct buf_2d *pre) {
+ const MV mv = get_mv_from_fullmv(&best_mv);
+ unsigned int sse, var;
+
+ var = vfp->vf(src->buf, src->stride, get_buf_from_fullmv(pre, &best_mv),
+ pre->stride, &sse);
+ (void)var;
+
+ return sse + mv_err_cost_(&mv, mv_cost_params);
+}
+
+static INLINE int get_mvpred_av_var(const MV_COST_PARAMS *mv_cost_params,
+ const FULLPEL_MV best_mv,
+ const uint8_t *second_pred,
+ const aom_variance_fn_ptr_t *vfp,
+ const struct buf_2d *src,
+ const struct buf_2d *pre) {
+ const MV mv = get_mv_from_fullmv(&best_mv);
+ unsigned int unused;
+
+ return vfp->svaf(get_buf_from_fullmv(pre, &best_mv), pre->stride, 0, 0,
+ src->buf, src->stride, &unused, second_pred) +
+ mv_err_cost_(&mv, mv_cost_params);
+}
+
+static INLINE int get_mvpred_mask_var(
+ const MV_COST_PARAMS *mv_cost_params, const FULLPEL_MV best_mv,
+ const uint8_t *second_pred, const uint8_t *mask, int mask_stride,
+ int invert_mask, const aom_variance_fn_ptr_t *vfp, const struct buf_2d *src,
+ const struct buf_2d *pre) {
+ const MV mv = get_mv_from_fullmv(&best_mv);
+ unsigned int unused;
+
+ return vfp->msvf(get_buf_from_fullmv(pre, &best_mv), pre->stride, 0, 0,
+ src->buf, src->stride, second_pred, mask, mask_stride,
+ invert_mask, &unused) +
+ mv_err_cost_(&mv, mv_cost_params);
+}
+
+int av1_get_mvpred_compound_var(const MV_COST_PARAMS *mv_cost_params,
+ const FULLPEL_MV best_mv,
+ const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask,
+ const aom_variance_fn_ptr_t *vfp,
+ const struct buf_2d *src,
+ const struct buf_2d *pre) {
+ if (mask) {
+ return get_mvpred_mask_var(mv_cost_params, best_mv, second_pred, mask,
+ mask_stride, invert_mask, vfp, src, pre);
+ } else {
+ return get_mvpred_av_var(mv_cost_params, best_mv, second_pred, vfp, src,
+ pre);
+ }
+}
diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h
new file mode 100644
index 0000000000..87b9309b61
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp.h
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MCOMP_H_
+#define AOM_AV1_ENCODER_MCOMP_H_
+
+#include "av1/common/mv.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/rd.h"
+
+#include "aom_dsp/variance.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct SPEED_FEATURES;
+
+// =============================================================================
+// Cost functions
+// =============================================================================
+
+enum {
+ MV_COST_ENTROPY, // Use the entropy rate of the mv as the cost
+ MV_COST_L1_LOWRES, // Use the l1 norm of the mv as the cost (<480p)
+ MV_COST_L1_MIDRES, // Use the l1 norm of the mv as the cost (>=480p)
+ MV_COST_L1_HDRES, // Use the l1 norm of the mv as the cost (>=720p)
+  MV_COST_NONE        // Use 0 as the cost irrespective of the current mv
+} UENUM1BYTE(MV_COST_TYPE);
+
+typedef struct {
+ // The reference mv used to compute the mv cost
+ const MV *ref_mv;
+ FULLPEL_MV full_ref_mv;
+ MV_COST_TYPE mv_cost_type;
+ const int *mvjcost;
+ const int *mvcost[2];
+ int error_per_bit;
+ // A multiplier used to convert rate to sad cost
+ int sad_per_bit;
+} MV_COST_PARAMS;
+
+int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost,
+ int *const mvcost[2], int weight);
+
+int av1_get_mvpred_sse(const MV_COST_PARAMS *mv_cost_params,
+ const FULLPEL_MV best_mv,
+ const aom_variance_fn_ptr_t *vfp,
+ const struct buf_2d *src, const struct buf_2d *pre);
+int av1_get_mvpred_compound_var(const MV_COST_PARAMS *mv_cost_params,
+ const FULLPEL_MV best_mv,
+ const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask,
+ const aom_variance_fn_ptr_t *vfp,
+ const struct buf_2d *src,
+ const struct buf_2d *pre);
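+
+// Illustrative usage (hypothetical caller code, not from this file): after a
+// full-pel search has produced best_mv, the cost used for comparisons
+// combines the prediction error with the mv rate, e.g.
+//   const int cost = av1_get_mvpred_sse(&ms_params->mv_cost_params, best_mv,
+//                                       ms_params->vfp,
+//                                       ms_params->ms_buffers.src,
+//                                       ms_params->ms_buffers.ref);
+// where ms_params is a FULLPEL_MOTION_SEARCH_PARAMS (declared below).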
+
+// =============================================================================
+// Motion Search
+// =============================================================================
+typedef struct {
+ // The reference buffer
+ const struct buf_2d *ref;
+
+ // The source and predictors/mask used by translational search
+ const struct buf_2d *src;
+ const uint8_t *second_pred;
+ const uint8_t *mask;
+ int mask_stride;
+ int inv_mask;
+
+ // The weighted source and mask used by OBMC
+ const int32_t *wsrc;
+ const int32_t *obmc_mask;
+} MSBuffers;
+
+static INLINE void av1_set_ms_compound_refs(MSBuffers *ms_buffers,
+ const uint8_t *second_pred,
+ const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ ms_buffers->second_pred = second_pred;
+ ms_buffers->mask = mask;
+ ms_buffers->mask_stride = mask_stride;
+ ms_buffers->inv_mask = invert_mask;
+}
+
+// =============================================================================
+// Fullpixel Motion Search
+// =============================================================================
+// This struct holds the fullpixel motion search parameters that should remain
+// constant during the search.
+typedef struct {
+ BLOCK_SIZE bsize;
+ // A function pointer to the simd function for fast computation
+ const aom_variance_fn_ptr_t *vfp;
+
+ MSBuffers ms_buffers;
+
+  // WARNING: search_method should be regarded as a private variable and
+  // should not be modified directly, so that it stays in sync with
+  // search_sites. To modify it, use av1_set_mv_search_method.
+ SEARCH_METHODS search_method;
+ const search_site_config *search_sites;
+ FullMvLimits mv_limits;
+
+  int run_mesh_search;    // Enables mesh search unless it is pruned by
+                          // prune_mesh_search.
+  int prune_mesh_search;  // Disables mesh search if the best_mv after a normal
+                          // search is close to the start_mv.
+  int mesh_search_mv_diff_threshold;  // mv diff threshold to enable
+                                      // prune_mesh_search
+ int force_mesh_thresh; // Forces mesh search if the residue variance is
+ // higher than the threshold.
+ const struct MESH_PATTERN *mesh_patterns[2];
+
+ // Use maximum search interval of 4 if true. This helps motion search to find
+ // the best motion vector for screen content types.
+ int fine_search_interval;
+
+ int is_intra_mode;
+
+ int fast_obmc_search;
+
+ // For calculating mv cost
+ MV_COST_PARAMS mv_cost_params;
+
+  // Stores the function used to compute the sad. This can be different from
+  // the sdf in vfp (e.g. downsampled sad instead of sad) to allow a speed-up.
+ aom_sad_fn_t sdf;
+ aom_sad_multi_d_fn_t sdx4df;
+ aom_sad_multi_d_fn_t sdx3df;
+} FULLPEL_MOTION_SEARCH_PARAMS;
+
+typedef struct {
+ int err_cost;
+ unsigned int distortion;
+ unsigned int sse;
+} FULLPEL_MV_STATS;
+
+void av1_init_obmc_buffer(OBMCBuffer *obmc_buffer);
+
+void av1_make_default_fullpel_ms_params(
+ FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv, FULLPEL_MV start_mv,
+ const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
+ SEARCH_METHODS search_method, int fine_search_interval);
+
+/*! Sets the \ref FULLPEL_MOTION_SEARCH_PARAMS to intra mode. */
+void av1_set_ms_to_intra_mode(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const IntraBCMVCosts *dv_costs);
+
+// Sets up configs for fullpixel DIAMOND / CLAMPED_DIAMOND search method.
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride,
+ int level);
+// Sets up configs for firstpass motion search.
+void av1_init_motion_fpf(search_site_config *cfg, int stride);
+// Sets up configs for NSTEP / NSTEP_8PT motion search method.
+void av1_init_motion_compensation_nstep(search_site_config *cfg, int stride,
+ int level);
+// Sets up configs for BIGDIA / FAST_DIAMOND / FAST_BIGDIA
+// motion search method.
+void av1_init_motion_compensation_bigdia(search_site_config *cfg, int stride,
+ int level);
+// Sets up configs for HEX or FAST_HEX motion search method.
+void av1_init_motion_compensation_hex(search_site_config *cfg, int stride,
+ int level);
+// Sets up configs for SQUARE motion search method.
+void av1_init_motion_compensation_square(search_site_config *cfg, int stride,
+ int level);
+
+/*! Function pointer type for the search site config initialization functions
+ * of the different search methods. */
+typedef void (*av1_init_search_site_config)(search_site_config *cfg, int stride,
+ int level);
+
+/*! Array of function pointers used to set the motion search config. */
+extern const av1_init_search_site_config
+ av1_init_motion_compensation[NUM_DISTINCT_SEARCH_METHODS];
+
+// Lookup table mapping each search method to the method whose search-site
+// candidates it shares; the mapped methods differ only in the number of
+// search steps.
+static const SEARCH_METHODS search_method_lookup[NUM_SEARCH_METHODS] = {
+ DIAMOND, // DIAMOND
+ NSTEP, // NSTEP
+ NSTEP_8PT, // NSTEP_8PT
+ CLAMPED_DIAMOND, // CLAMPED_DIAMOND
+ HEX, // HEX
+ BIGDIA, // BIGDIA
+ SQUARE, // SQUARE
+ HEX, // FAST_HEX
+ BIGDIA, // FAST_DIAMOND
+ BIGDIA, // FAST_BIGDIA
+ BIGDIA // VFAST_DIAMOND
+};
+
+// Reinitialize the search site config.
+static AOM_INLINE void av1_refresh_search_site_config(
+ search_site_config *ss_cfg_buf, SEARCH_METHODS search_method,
+ const int ref_stride) {
+ const int level =
+ search_method == NSTEP_8PT || search_method == CLAMPED_DIAMOND;
+ search_method = search_method_lookup[search_method];
+ av1_init_motion_compensation[search_method](&ss_cfg_buf[search_method],
+ ref_stride, level);
+}
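+
+// For example (illustrative): NSTEP_8PT reuses the NSTEP site layout, so
+// av1_refresh_search_site_config(cfg, NSTEP_8PT, stride) initializes
+// cfg[NSTEP] via av1_init_motion_compensation[NSTEP] with level = 1.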
+
+// Sets the mv search method and keeps search_sites consistent with it.
+static INLINE void av1_set_mv_search_method(
+ FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const search_site_config search_sites[NUM_DISTINCT_SEARCH_METHODS],
+ SEARCH_METHODS search_method) {
+ ms_params->search_method = search_method;
+ ms_params->search_sites =
+ &search_sites[search_method_lookup[ms_params->search_method]];
+}
+
+// Set up limit values for MV components.
+// Mv beyond the range do not produce new/different prediction block.
+static INLINE void av1_set_mv_row_limits(
+ const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+ int mi_row, int mi_height, int border) {
+ const int min1 = -(mi_row * MI_SIZE + border - 2 * AOM_INTERP_EXTEND);
+ const int min2 = -(((mi_row + mi_height) * MI_SIZE) + 2 * AOM_INTERP_EXTEND);
+ mv_limits->row_min = AOMMAX(min1, min2);
+ const int max1 = (mi_params->mi_rows - mi_row - mi_height) * MI_SIZE +
+ border - 2 * AOM_INTERP_EXTEND;
+ const int max2 =
+ (mi_params->mi_rows - mi_row) * MI_SIZE + 2 * AOM_INTERP_EXTEND;
+ mv_limits->row_max = AOMMIN(max1, max2);
+}
+
+static INLINE void av1_set_mv_col_limits(
+ const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+ int mi_col, int mi_width, int border) {
+ const int min1 = -(mi_col * MI_SIZE + border - 2 * AOM_INTERP_EXTEND);
+ const int min2 = -(((mi_col + mi_width) * MI_SIZE) + 2 * AOM_INTERP_EXTEND);
+ mv_limits->col_min = AOMMAX(min1, min2);
+ const int max1 = (mi_params->mi_cols - mi_col - mi_width) * MI_SIZE + border -
+ 2 * AOM_INTERP_EXTEND;
+ const int max2 =
+ (mi_params->mi_cols - mi_col) * MI_SIZE + 2 * AOM_INTERP_EXTEND;
+ mv_limits->col_max = AOMMIN(max1, max2);
+}
+
+static INLINE void av1_set_mv_limits(
+ const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+ int mi_row, int mi_col, int mi_height, int mi_width, int border) {
+ av1_set_mv_row_limits(mi_params, mv_limits, mi_row, mi_height, border);
+ av1_set_mv_col_limits(mi_params, mv_limits, mi_col, mi_width, border);
+}
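+
+// Worked example (illustrative, assuming MI_SIZE == 4 and
+// AOM_INTERP_EXTEND == 4): for mi_row = 0, mi_height = 4 (a 16x16 block),
+// border = 128 and mi_params->mi_rows = 135, av1_set_mv_row_limits() yields
+//   row_min = AOMMAX(-(0 * 4 + 128 - 8), -(4 * 4 + 8))         = -24
+//   row_max = AOMMIN((135 - 0 - 4) * 4 + 128 - 8, 135 * 4 + 8) = 548
+// Here min2 and max2 bind, so the mv range keeps the interpolation-extended
+// block overlapping the visible frame.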
+
+void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv);
+
+int av1_init_search_range(int size);
+
+unsigned int av1_int_pro_motion_estimation(
+ const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, const MV *ref_mv, unsigned int *y_sad_zero,
+ int me_search_size_col, int me_search_size_row);
+
+int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const FULLPEL_MV start_mv, FULLPEL_MV *best_mv);
+
+int av1_full_pixel_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int step_param, int *cost_list,
+ FULLPEL_MV *best_mv, FULLPEL_MV_STATS *best_mv_stats,
+ FULLPEL_MV *second_best_mv);
+
+int av1_intrabc_hash_search(const struct AV1_COMP *cpi, const MACROBLOCKD *xd,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ IntraBCHashInfo *intrabc_hash_info,
+ FULLPEL_MV *best_mv);
+
+int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv,
+ const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const int step_param, FULLPEL_MV *best_mv);
+
+static INLINE int av1_is_fullmv_in_range(const FullMvLimits *mv_limits,
+ FULLPEL_MV mv) {
+ return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) &&
+ (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max);
+}
+// =============================================================================
+// Subpixel Motion Search
+// =============================================================================
+enum {
+ EIGHTH_PEL,
+ QUARTER_PEL,
+ HALF_PEL,
+ FULL_PEL
+} UENUM1BYTE(SUBPEL_FORCE_STOP);
+
+typedef struct {
+ const aom_variance_fn_ptr_t *vfp;
+ SUBPEL_SEARCH_TYPE subpel_search_type;
+ // Source and reference buffers
+ MSBuffers ms_buffers;
+ int w, h;
+} SUBPEL_SEARCH_VAR_PARAMS;
+
+// This struct holds the subpixel motion search parameters that should remain
+// constant during the search.
+typedef struct {
+ // High level motion search settings
+ int allow_hp;
+ const int *cost_list;
+ SUBPEL_FORCE_STOP forced_stop;
+ int iters_per_step;
+ SubpelMvLimits mv_limits;
+
+ // For calculating mv cost
+ MV_COST_PARAMS mv_cost_params;
+
+ // Distortion calculation params
+ SUBPEL_SEARCH_VAR_PARAMS var_params;
+} SUBPEL_MOTION_SEARCH_PARAMS;
+
+void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ const struct AV1_COMP *cpi,
+ const MACROBLOCK *x, BLOCK_SIZE bsize,
+ const MV *ref_mv, const int *cost_list);
+
+typedef int(fractional_mv_step_fp)(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ MV start_mv,
+ const FULLPEL_MV_STATS *start_mv_stats,
+ MV *bestmv, int *distortion,
+ unsigned int *sse1,
+ int_mv *last_mv_search_list);
+
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_more;
+extern fractional_mv_step_fp av1_return_max_sub_pixel_mv;
+extern fractional_mv_step_fp av1_return_min_sub_pixel_mv;
+extern fractional_mv_step_fp av1_find_best_obmc_sub_pixel_tree_up;
+
+unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+ BLOCK_SIZE bsize, const int *pts0,
+ const int *pts_inref0, int total_samples,
+ WARP_SEARCH_METHOD search_method,
+ int num_iterations);
+
+static INLINE void av1_set_fractional_mv(int_mv *fractional_best_mv) {
+ for (int z = 0; z < 3; z++) {
+ fractional_best_mv[z].as_int = INVALID_MV;
+ }
+}
+
+static INLINE void av1_set_subpel_mv_search_range(SubpelMvLimits *subpel_limits,
+ const FullMvLimits *mv_limits,
+ const MV *ref_mv) {
+ const int max_mv = GET_MV_SUBPEL(MAX_FULL_PEL_VAL);
+ int minc = AOMMAX(GET_MV_SUBPEL(mv_limits->col_min), ref_mv->col - max_mv);
+ int maxc = AOMMIN(GET_MV_SUBPEL(mv_limits->col_max), ref_mv->col + max_mv);
+ int minr = AOMMAX(GET_MV_SUBPEL(mv_limits->row_min), ref_mv->row - max_mv);
+ int maxr = AOMMIN(GET_MV_SUBPEL(mv_limits->row_max), ref_mv->row + max_mv);
+
+ maxc = AOMMAX(minc, maxc);
+ maxr = AOMMAX(minr, maxr);
+
+ subpel_limits->col_min = AOMMAX(MV_LOW + 1, minc);
+ subpel_limits->col_max = AOMMIN(MV_UPP - 1, maxc);
+ subpel_limits->row_min = AOMMAX(MV_LOW + 1, minr);
+ subpel_limits->row_max = AOMMIN(MV_UPP - 1, maxr);
+}
+
+static INLINE int av1_is_subpelmv_in_range(const SubpelMvLimits *mv_limits,
+ MV mv) {
+ return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) &&
+ (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max);
+}
+
+static INLINE int get_offset_from_fullmv(const FULLPEL_MV *mv, int stride) {
+ return mv->row * stride + mv->col;
+}
+
+static INLINE const uint8_t *get_buf_from_fullmv(const struct buf_2d *buf,
+ const FULLPEL_MV *mv) {
+ return &buf->buf[get_offset_from_fullmv(mv, buf->stride)];
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_MCOMP_H_
diff --git a/third_party/aom/av1/encoder/mcomp_structs.h b/third_party/aom/av1/encoder/mcomp_structs.h
new file mode 100644
index 0000000000..06660cf4a6
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp_structs.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MCOMP_STRUCTS_H_
+#define AOM_AV1_ENCODER_MCOMP_STRUCTS_H_
+
+#include "av1/common/mv.h"
+
+// The maximum number of steps in a step search given the largest
+// allowed initial step
+#define MAX_MVSEARCH_STEPS 11
+// Max full-pel mv, specified in units of full pixels.
+// Enables the use of motion vectors in the range [-1023, 1023].
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1)
+// Maximum size of the first step in full pel units
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))
+// Maximum number of neighbors to scan per iteration during
+// WARPED_CAUSAL refinement
+// Note: The elements of warp_search_config.neighbor_mask must be at least
+// MAX_WARP_SEARCH_NEIGHBORS bits wide, so the type may need to be widened if
+// this value is increased.
+#define MAX_WARP_SEARCH_NEIGHBORS 8
+
+#define SEARCH_RANGE_8P 3
+#define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1)
+#define SEARCH_GRID_CENTER_8P \
+ (SEARCH_RANGE_8P * SEARCH_GRID_STRIDE_8P + SEARCH_RANGE_8P)
+
+typedef struct {
+ FULLPEL_MV coord;
+ int coord_offset;
+} search_neighbors;
+// Motion search site.
+typedef struct search_site {
+ FULLPEL_MV mv;
+ int offset;
+} search_site;
+
+typedef struct search_site_config {
+ search_site site[MAX_MVSEARCH_STEPS * 2][16 + 1];
+ // Number of search steps.
+ int num_search_steps;
+ int searches_per_step[MAX_MVSEARCH_STEPS * 2];
+ int radius[MAX_MVSEARCH_STEPS * 2];
+ int stride;
+} search_site_config;
+
+enum {
+  // Search 8 points in the radius grid around center, up to 11 search stages.
+  DIAMOND = 0,
+  // Search 12 points in the radius/tan_radius grid around center,
+  // up to 15 search stages.
+  NSTEP = 1,
+  // Search 8 points in the radius grid around center, up to 16 search stages.
+  NSTEP_8PT = 2,
+  // Search 8 points in the radius grid around center, up to 11 search stages,
+  // with clamping of the search radius.
+  CLAMPED_DIAMOND = 3,
+  // Search up to 8 points in the radius grid around center, up to 11 search
+  // stages. The first stage consists of 8 search points and the rest use 6
+  // search points each, in a hex shape.
+  HEX = 4,
+  // Search up to 8 points in the radius grid around center, up to 11 search
+  // stages. The first stage consists of 4 search points and the rest use 8
+  // search points each.
+  BIGDIA = 5,
+  // Search 8 points in the square grid around center, up to 11 search stages.
+  SQUARE = 6,
+  // HEX search with up to 2 stages.
+  FAST_HEX = 7,
+  // BIGDIA search with up to 2 stages.
+  FAST_DIAMOND = 8,
+  // BIGDIA search with up to 3 stages.
+  FAST_BIGDIA = 9,
+  // BIGDIA search with up to 1 stage.
+  VFAST_DIAMOND = 10,
+  // Total number of search methods.
+  NUM_SEARCH_METHODS,
+  // Number of distinct search methods.
+  NUM_DISTINCT_SEARCH_METHODS = SQUARE + 1,
+} UENUM1BYTE(SEARCH_METHODS);
+
+typedef struct warp_search_config {
+ int num_neighbors;
+ MV neighbors[MAX_WARP_SEARCH_NEIGHBORS];
+ // Bitmask which is used to prune the search neighbors at one iteration
+ // based on which direction we chose in the previous iteration.
+ // See comments in av1_refine_warped_mv for details.
+ uint8_t neighbor_mask[MAX_WARP_SEARCH_NEIGHBORS];
+} warp_search_config;
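+
+// Illustrative reading (see av1_refine_warped_mv in mcomp.c for the
+// authoritative semantics): if neighbor i was chosen in the previous
+// iteration, only neighbors j with bit ((neighbor_mask[i] >> j) & 1) set are
+// evaluated in the next iteration, skipping directions that point back toward
+// positions that have already been searched.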
+
+// Methods for refining WARPED_CAUSAL motion vectors
+enum {
+ // Search 4 adjacent points in a diamond shape at each iteration
+ WARP_SEARCH_DIAMOND,
+ // Search 8 adjacent points in a square at each iteration
+ WARP_SEARCH_SQUARE,
+ WARP_SEARCH_METHODS
+} UENUM1BYTE(WARP_SEARCH_METHOD);
+
+#endif // AOM_AV1_ENCODER_MCOMP_STRUCTS_H_
diff --git a/third_party/aom/av1/encoder/misc_model_weights.h b/third_party/aom/av1/encoder/misc_model_weights.h
new file mode 100644
index 0000000000..f00aeabcf6
--- /dev/null
+++ b/third_party/aom/av1/encoder/misc_model_weights.h
@@ -0,0 +1,696 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+#define MV_PREC_FEATURE_SIZE 18
+
+#define NUM_DNN_LAYERS 1
+#define NUM_DNN_FEATURES MV_PREC_FEATURE_SIZE
+#define MV_PREC_LAYER_SIZE_0 32
+#define NUM_LOGITS 1
+
+const float av1_mv_prec_mean[MV_PREC_FEATURE_SIZE] = { 143.67358891063745f,
+ 141.6251917346238f,
+ 0.36313633945679064f,
+ 0.0028162791958822085f,
+ 0.000484820537626698f,
+ 0.002769969388939025f,
+ 0.0f,
+ 0.00031274626720947577f,
+ 0.00020578555375160075f,
+ 0.0007075246732697733f,
+ 0.000539641029909925f,
+ 0.0013939401375906984f,
+ 4.985394760423499f,
+ 4.985394760423499f,
+ 4.9992148717283085f,
+ 5.143739822380163f,
+ 5.518483124004564f,
+ 87.63597847427077f };
+
+const float av1_mv_prec_std[MV_PREC_FEATURE_SIZE] = { 66.86256140247244f,
+ 68.04472572607503f,
+ 13.23247674430399f,
+ 0.0029123438396921955f,
+ 0.0015331406169374737f,
+ 0.0029149813096313775f,
+ 1.0f,
+ 0.00047501102871357813f,
+ 0.00030025962993117947f,
+ 0.0009861163580391207f,
+ 0.0012157593528004055f,
+ 0.002004954948490521f,
+ 6.539447500484038f,
+ 6.539447500484038f,
+ 6.396589058279465f,
+ 3.4870155874262516f,
+ 3.8911353973740535f,
+ 112.07985259573601f };
+
+const float av1_mv_prec_nn_weights_layer_0[] = { -0.13008492159557145f,
+ -0.1483527373474774f,
+ 0.08112076098858864f,
+ -0.9582568679627453f,
+ -0.34794757171071206f,
+ 0.6465225723304947f,
+ 0.0f,
+ 0.06754171885839604f,
+ 0.27156803620541214f,
+ 0.10635231245664407f,
+ -0.031183926995968583f,
+ 0.048122572260291f,
+ -0.19498534230045128f,
+ -0.2614116319273316f,
+ -0.3223762845136331f,
+ -1.2063368350609205f,
+ -0.523333556911706f,
+ 1.075632260890728f,
+ 0.48989726814387946f,
+ -0.34816466111070477f,
+ 0.41668357610256473f,
+ -1.0973562848791671f,
+ 0.04183921854389494f,
+ -0.9123815389260476f,
+ 0.0f,
+ 0.859965047744027f,
+ 0.1962095804679813f,
+ 0.2606564339077058f,
+ 0.26695868715184895f,
+ 0.5319308568326692f,
+ -0.23717505799723165f,
+ -0.43127224481782567f,
+ -0.3214545776203726f,
+ 0.5850852241402176f,
+ -0.26705531612587813f,
+ -0.5786016766610093f,
+ 0.9360519909983003f,
+ 0.20771329289016555f,
+ -0.027614159544811823f,
+ -1.175022807046164f,
+ -0.07578967497693835f,
+ 0.6890172485324256f,
+ 0.0f,
+ -0.008008338164988263f,
+ -0.08064800010158935f,
+ -0.22606910981666667f,
+ 0.4541586669210879f,
+ 0.07731527661370792f,
+ -0.6744475941247964f,
+ -0.2625842448396184f,
+ 1.7018613444303785f,
+ -0.08622229073162656f,
+ 0.041858142814941275f,
+ -0.24575964090386415f,
+ -0.046626044730994964f,
+ 0.7608713064175202f,
+ -0.23330119070907146f,
+ -0.10115510984500826f,
+ 0.9722537349192069f,
+ 0.11718554254290829f,
+ 0.0f,
+ 0.2075123446014759f,
+ 0.09465167310768637f,
+ 0.7609896851963016f,
+ 0.4441038581385328f,
+ 0.26064144727430955f,
+ -0.14678625366485035f,
+ -0.03597014452200524f,
+ 0.3128680867196166f,
+ 1.102496797385966f,
+ 0.06642253233084111f,
+ -1.2665494483407629f,
+ 0.09049412632000911f,
+ -1.1160621999565095f,
+ 0.043420275255913035f,
+ -0.8811412259978966f,
+ 0.21076234632287777f,
+ 0.16571534463543866f,
+ 0.0f,
+ -0.7324075176473275f,
+ -0.3677622514459495f,
+ 0.3273532243056415f,
+ 0.22922161936797775f,
+ 0.8204766691058087f,
+ 0.02982161033720488f,
+ 0.5266419954188112f,
+ -1.0032154963302191f,
+ 0.7007602969763729f,
+ 0.37196355167990885f,
+ -0.7608579453228548f,
+ 0.08568111584781847f,
+ 0.07011061059123677f,
+ 0.3233263598082507f,
+ -0.08249928295410253f,
+ 0.08220165761319252f,
+ 0.22148722752246794f,
+ 0.0f,
+ 0.6122392701743506f,
+ -0.26429838296378333f,
+ 0.31958081620005463f,
+ -0.006027177397853826f,
+ -0.3088310785887994f,
+ -0.5436192046707807f,
+ -0.011080356757423306f,
+ 0.12632650770008413f,
+ -0.45097913215234525f,
+ 1.8008072867127298f,
+ -0.7630029654575501f,
+ -0.4054774329826579f,
+ 0.40386074452544535f,
+ -0.18541426257453025f,
+ 0.2444879765079863f,
+ -0.6216724756115081f,
+ 0.27030299321302f,
+ 0.0f,
+ -0.6835848952967989f,
+ -0.7914184320964815f,
+ -0.6761595019582928f,
+ -1.009565565604081f,
+ -0.1904242439353305f,
+ 0.4463417126318631f,
+ 0.6025503823452971f,
+ 0.5149990860115566f,
+ 1.0242970663937634f,
+ 0.037947306826401385f,
+ 0.07039339786212848f,
+ 0.14273796789711987f,
+ 0.168103961425691f,
+ 1.6596066376811978f,
+ 0.19321092229384657f,
+ -0.3710750388148514f,
+ -0.01717015559410288f,
+ 0.0f,
+ 0.3005688477942597f,
+ 0.23877080653829577f,
+ 0.2718594552971173f,
+ 0.3885402571589898f,
+ 0.32999531945669247f,
+ -0.6134460954213243f,
+ -0.13972265462799183f,
+ -0.07180089575716991f,
+ -1.014572598188105f,
+ 0.0717207322809836f,
+ 0.34896157745155615f,
+ -0.27127687591403f,
+ -0.5058651212773623f,
+ -1.5442435628306925f,
+ -0.6399784724734707f,
+ 0.6274301429074947f,
+ -0.4645750072767051f,
+ 0.0f,
+ -0.2406726815244178f,
+ -0.06321214115916597f,
+ 0.312856714253404f,
+ 0.16459514124116134f,
+ 0.3993579604809623f,
+ -0.15232044351561913f,
+ -0.5613743948568469f,
+ 0.7219801372223262f,
+ 0.2936857469624009f,
+ 0.7823466656034087f,
+ -0.12416947814098349f,
+ -0.36413756654028345f,
+ -0.07992098796866462f,
+ -0.7395722879842416f,
+ 0.8639913543220514f,
+ -0.311931773757945f,
+ -1.7308240470400613f,
+ 0.0f,
+ 0.394499716712104f,
+ 0.6511462819539963f,
+ -0.0722425275974144f,
+ 0.13490818194661386f,
+ 0.055319135836378035f,
+ 0.15389577508097013f,
+ 0.28958598328870605f,
+ -0.14608429470539772f,
+ 0.09488817462478298f,
+ -0.17231294096622088f,
+ 0.6721115415911466f,
+ -0.05664621150536103f,
+ 0.03291799673669331f,
+ 0.02845382711057482f,
+ -0.9953563446999164f,
+ -0.17994298220605923f,
+ 0.6560824519337476f,
+ 0.0f,
+ -0.30990646375917935f,
+ 0.17215517202874f,
+ 0.2026816225170481f,
+ 0.22011958747715601f,
+ 0.3562520768889686f,
+ -0.18436559057189175f,
+ 0.1733377147302066f,
+ 0.02818276995640877f,
+ -0.29703005574859076f,
+ -0.3310652639215064f,
+ -1.6091173258529277f,
+ 0.45461585790028003f,
+ -0.5078643334592593f,
+ -0.338997374732338f,
+ 0.4688619590359733f,
+ 0.627099126828289f,
+ -0.5249801376494249f,
+ 0.0f,
+ 0.34465498218272883f,
+ 0.009891680630908135f,
+ -0.27244020967349f,
+ 0.05404589867626979f,
+ -0.06220329325739666f,
+ -0.13365376464759104f,
+ -0.13098573553512366f,
+ 0.11434198976289106f,
+ 0.6740951247574676f,
+ 1.3381727185724581f,
+ -1.4865773213251936f,
+ 0.05809898701966341f,
+ 0.25380780261023456f,
+ 1.2716367496512722f,
+ 0.1768290070780598f,
+ -0.07554828135356352f,
+ 0.8180570085344856f,
+ 0.0f,
+ 1.0788448980077463f,
+ 0.0651938742459459f,
+ 0.3807672030015587f,
+ 0.6144792680268445f,
+ 0.011660612214908059f,
+ -0.018306023765580288f,
+ 0.44140813809926516f,
+ -0.13411994195502386f,
+ 0.15920368955127778f,
+ -0.19382358417849888f,
+ -0.08802147969690055f,
+ -0.019731052733814477f,
+ 0.1104744229169665f,
+ -0.195834419735958f,
+ -0.5005295046454347f,
+ -0.17041241868229032f,
+ -0.471942117351489f,
+ 0.0f,
+ -0.3599073304761372f,
+ -0.2745532782968519f,
+ -0.8323064841106417f,
+ -0.88355885384943f,
+ -0.02826466859020679f,
+ 0.06977870308805256f,
+ 0.11926112095374196f,
+ 1.367382707959643f,
+ -0.06119843162964051f,
+ -0.5331395268889569f,
+ -1.2155531584240624f,
+ -0.01896651779524327f,
+ 0.10591845408571081f,
+ -0.010632842156504733f,
+ 0.6150787968629282f,
+ -0.4191690185896091f,
+ -0.9961718918346271f,
+ 0.0f,
+ 0.23370364516013867f,
+ 0.4156033072362998f,
+ 0.1261005546633433f,
+ 0.0812413884532226f,
+ -0.008894337353937203f,
+ 0.07984447025056046f,
+ -0.1258098052766725f,
+ -0.40245475467767916f,
+ 1.78188906675019f,
+ -1.1544387954232302f,
+ -0.41768781481273387f,
+ 0.6791211165341995f,
+ -0.4175127856183446f,
+ -0.07353219159767788f,
+ -0.2888813577574072f,
+ -0.7107767892597061f,
+ -1.0450031091195449f,
+ 0.0f,
+ -0.9221599545079143f,
+ -0.6747876356740621f,
+ 0.30241454354872105f,
+ 0.4924965303373908f,
+ -0.14042722740054084f,
+ 0.27744210409350445f,
+ -0.14788270997426836f,
+ -0.9081467469237995f,
+ -0.04513115674995093f,
+ -0.5254168669125793f,
+ -0.6999012037974789f,
+ 0.434661246306547f,
+ -0.7193303957246092f,
+ -0.9117952623409744f,
+ -1.5097267865916142f,
+ -0.20779888103770922f,
+ 0.4935562480901218f,
+ 0.0f,
+ 0.18303393908923593f,
+ 0.34753722677570037f,
+ 0.29291001533177663f,
+ 0.3832351878354224f,
+ 0.3295194956120599f,
+ -0.32398033003617527f,
+ -0.31570906736433746f,
+ 0.23657779050372962f,
+ 0.9510794465234161f,
+ -0.5122243902568278f,
+ 0.08652112725315658f,
+ 0.2246634353717998f,
+ -0.9032595595582497f,
+ -0.8936484034533545f,
+ 0.6012969720865752f,
+ -0.6454216646117924f,
+ -1.1753786049658332f,
+ 0.0f,
+ -0.4360545677728656f,
+ -0.6586237455328507f,
+ -0.34347301697886656f,
+ -0.8909724651992144f,
+ -0.24378721818350263f,
+ 0.6179733359297576f,
+ 0.0661661181742234f,
+ -0.14120142044993794f,
+ -0.07732699885498932f,
+ 1.0221355882357506f,
+ 0.44514798994115284f,
+ -0.7371569579959046f,
+ -0.7212499572378936f,
+ 0.7453626921081045f,
+ 0.5478757761345768f,
+ -0.39411232789985384f,
+ 0.7200542656743857f,
+ 0.0f,
+ -0.11790869453118827f,
+ -0.12317030713581928f,
+ -0.4207902738133338f,
+ 0.15895105878327986f,
+ 0.304261777102111f,
+ 0.11450744587017621f,
+ -0.11470709991317944f,
+ 0.5949222371739038f,
+ 0.6549518619412444f,
+ -0.24390606570422838f,
+ -0.4212796009440803f,
+ -0.6269666206320964f,
+ -0.5421193969807078f,
+ -0.12297772128652287f,
+ 0.021517257619930424f,
+ 0.25462855095544523f,
+ -0.22107798187348246f,
+ 0.0f,
+ 0.5204516300095662f,
+ 0.2837402841862462f,
+ 0.11310823283285916f,
+ 0.8944351685018025f,
+ 0.17487203235834015f,
+ -0.5271221928634433f,
+ -0.19516594503423199f,
+ 0.452456617580365f,
+ 1.2456272242706414f,
+ 0.24166615894862817f,
+ 0.09411429305204502f,
+ -0.2730072283327243f,
+ -0.8129383770918172f,
+ -0.24093254193486136f,
+ 0.5696499174142177f,
+ -0.11110805836073044f,
+ -0.3968204166235694f,
+ 0.0f,
+ -0.04388165369378549f,
+ -0.005631266017272595f,
+ -0.02574211858479705f,
+ 0.06230399626660669f,
+ 0.17677671232932785f,
+ 0.5172871274400965f,
+ 0.4919150085620063f,
+ -1.597656637582941f,
+ 0.02415185715719143f,
+ -0.17945446376668306f,
+ -0.39340600199798886f,
+ 0.25013205256886845f,
+ 0.05972330340308685f,
+ 0.1359911505596489f,
+ -0.02341033271820833f,
+ 0.15726074644063684f,
+ 0.47512625913020357f,
+ 0.0f,
+ 0.7327341664835779f,
+ -0.3689092312320013f,
+ 0.4571824787436036f,
+ 0.6215465537945456f,
+ 0.0944111296842023f,
+ -0.12571956176607574f,
+ -0.2507235674395462f,
+ -0.09579602654351593f,
+ 1.4463357293728496f,
+ 0.749153535856049f,
+ -0.5553955120807588f,
+ -0.09622771929369946f,
+ -0.2598697420394813f,
+ -0.964691815299676f,
+ -0.8289963178173902f,
+ 0.7112949291983329f,
+ -0.8667009730492162f,
+ 0.0f,
+ -0.48698304169042794f,
+ -0.18786095669893707f,
+ -0.11425249263203247f,
+ -0.3693391011684809f,
+ 0.09933145842585253f,
+ 0.2568559685298844f,
+ 0.7048512233651738f,
+ 0.6056238412407038f,
+ -0.4355558119826642f,
+ 0.17318931883915484f,
+ 0.6481333496429564f,
+ -0.45728823054344486f,
+ -0.006325004538589701f,
+ 0.45609864075494927f,
+ -0.6199385981116988f,
+ 0.035105808783046165f,
+ 0.1203147963894839f,
+ 0.0f,
+ 0.383402190836527f,
+ 0.048429009055370106f,
+ 0.5887186439275204f,
+ -0.20538767641607814f,
+ -0.031237879611002117f,
+ 0.3140759860883231f,
+ 0.24447070584999556f,
+ 0.7271263905705878f,
+ 0.8432799162434237f,
+ -0.11530577554199217f,
+ -0.7781023892314718f,
+ 0.05359488822710336f,
+ 0.5624870388700809f,
+ 0.5134656523208906f,
+ 0.18304041423438375f,
+ -0.04237421156328257f,
+ -0.20759809886942207f,
+ 0.0f,
+ -0.06249337454975615f,
+ 0.10081284533873777f,
+ 0.3894374350259183f,
+ 1.518217777528342f,
+ -0.9100037950171563f,
+ 0.17796906121831477f,
+ -0.2892167255357892f,
+ 0.6117902467884032f,
+ 0.13332120964959573f,
+ -0.3487155932849374f,
+ -0.32920583745734694f,
+ 0.08242631209809854f,
+ -0.24920225708110588f,
+ 0.8401757259392635f,
+ 0.11729108681358365f,
+ 0.11222925752499184f,
+ -0.027078490721459958f,
+ 0.0f,
+ 0.726132375517389f,
+ 0.72220359881096f,
+ 0.5721582611845177f,
+ 0.15139162075524315f,
+ 0.6676549461551197f,
+ -0.321449586554697f,
+ -0.10141104515219895f,
+ -0.09711123988777906f,
+ 0.9623356184776928f,
+ -0.7941822373167173f,
+ -0.9373923554119346f,
+ 0.4573241832354059f,
+ -0.42029139056126147f,
+ 0.2675223459380999f,
+ -0.5487300191551386f,
+ 0.2236621891916084f,
+ 0.11692039230044018f,
+ 0.0f,
+ 0.1758399202780961f,
+ 0.676447587678781f,
+ 0.5945412815881029f,
+ 0.5669863357359594f,
+ 0.8433565415303922f,
+ -0.30300550790708036f,
+ -0.43332881999693673f,
+ -0.4996522695731392f,
+ -0.2084930815451962f,
+ 0.27765278702463786f,
+ 1.0886848763946915f,
+ -0.0739433655813831f,
+ -0.4762801579229192f,
+ -0.2490825339320731f,
+ -1.8820479350439439f,
+ -0.4251592225775914f,
+ -0.3992922365484464f,
+ 0.0f,
+ 0.19598917760218867f,
+ 0.4860238022746914f,
+ 0.3364528828641281f,
+ 0.3350950865226741f,
+ 0.2773654548632006f,
+ -0.30547262140782566f,
+ 0.028649620490728344f,
+ -0.11763407628280315f,
+ 0.6237318502627169f,
+ -0.3958952632477945f,
+ 0.14797171297835243f,
+ 0.45821729624747465f,
+ -0.8687137170773626f,
+ 0.06989667196937126f,
+ -0.5752606929478727f,
+ 0.16986945686358412f,
+ 0.6925071596817824f,
+ 0.0f,
+ 0.4991250796183003f,
+ 0.03424654896322111f,
+ 0.6153698611882319f,
+ 0.5070872444849457f,
+ 0.43615747516328135f,
+ -0.7870352838659244f,
+ -0.6424101231965247f,
+ -0.7005774876651399f,
+ 0.79983115431488f,
+ 0.15720357955596242f,
+ -1.408372612176309f,
+ -0.039294695217213765f,
+ 0.6979415372962309f,
+ 0.27403316751965656f,
+ 1.2844596102619275f,
+ -0.2781534150257364f,
+ 0.3248437714908865f,
+ 0.0f,
+ 0.4364362371752831f,
+ -0.2548580911485434f,
+ -0.19578001373349452f,
+ -0.04597194387828005f,
+ -0.010035156855533233f,
+ 0.0415941475251266f,
+ 0.07929549739797387f,
+ -0.060629652912508866f,
+ 0.5977303008711333f,
+ -1.4404008068066554f,
+ 0.8555694790197376f,
+ -0.03693438534401856f,
+ 0.17761411164512408f,
+ -0.11858304304109235f,
+ -1.4241324353471327f,
+ 0.1533849765389186f,
+ 0.7650643783126995f,
+ 0.0f,
+ -0.0639949379280401f,
+ 0.4288617817939563f,
+ 0.4235508646885404f,
+ 0.3419843254383798f,
+ -0.015992360660098768f,
+ -0.773247697505441f,
+ -0.4908452922015917f,
+ 0.9868134897291486f,
+ -0.5078689994742608f,
+ 1.05632043744864f,
+ -0.38867419409275117f,
+ -0.0065547696858664194f,
+ -0.3056003173415037f,
+ -0.333762331930102f,
+ 0.4459671174011671f,
+ 0.08219092584580244f,
+ -0.08099158579518179f,
+ 0.0f,
+ -0.1568180656346373f,
+ -0.061962372393910135f,
+ 0.14065868174859464f,
+ -0.055925712798972765f,
+ 0.05136117465820622f,
+ 0.0907831030477633f,
+ 0.19518110495319604f,
+ -0.7470794578145956f,
+ 1.5945999734733545f,
+ -0.4351697502345834f,
+ -0.33253649399571805f };
+
+const float av1_mv_prec_nn_bias_layer_0[] = {
+ -0.651213833993862f, -1.1243309933417809f, -0.2123880023097051f,
+ 0.23095477452877616f, -0.6668057665893545f, 0.3082268148379634f,
+ -0.3344916753975844f, -0.20920185606857844f, 0.6057933917964854f,
+ 0.5031857662559803f, -1.5380096313468152f, -0.4457245344804041f,
+ 1.82368055812373f, 0.7973912064077963f, 0.25706500555622913f,
+ 0.1394695119825382f, 0.4508811973450553f, -0.5408959545111782f,
+ 1.064829233697863f, 0.3733268644246235f, 1.1173169029905483f,
+ -0.2012817466400134f, -0.16628447748302294f, 1.3086000088940826f,
+ 0.7267092979664235f, -0.9097857006590555f, -0.7564259343863077f,
+ -0.49844128036716173f, -0.4675729246975423f, -0.03626154526362181f,
+ -0.41957330902404616f, -0.9658160514319954f
+};
+
+const float av1_mv_prec_nn_weights_layer_1[] = {
+ 1.5017296484510276f, 1.044216918060133f, -1.066541411740906f,
+ -0.7762965171172661f, -0.9814396609661653f, 0.9334065847340715f,
+ 0.7117244268817873f, -0.7695942296628597f, 0.7892157680137047f,
+ -0.5786309358654476f, -2.4444494892027264f, 1.1666759262637185f,
+ -0.9699580532370483f, 0.5849682956422552f, -1.0372272986941953f,
+ -0.5005014627824439f, 1.1816204711740521f, -1.2204867615892114f,
+ 0.4510263977504913f, 0.35567865078585165f, -0.7811389330738839f,
+ -0.6643977800301099f, -0.6283287371705794f, 0.790873821018048f,
+ 0.8861643352684585f, 0.6438840651522237f, 0.6677191546466089f,
+ 0.9703715021995785f, 1.250893534236489f, 0.7733742028067933f,
+ -1.249673977776904f, -1.2890127265725608f
+};
+
+const float av1_mv_prec_nn_bias_layer_1[] = { -0.341771735378258f };
+
+static const NN_CONFIG av1_mv_prec_dnn_config = {
+ NUM_DNN_FEATURES,
+ NUM_LOGITS,
+ NUM_DNN_LAYERS,
+ { MV_PREC_LAYER_SIZE_0 },
+ {
+ av1_mv_prec_nn_weights_layer_0,
+ av1_mv_prec_nn_weights_layer_1,
+ },
+ {
+ av1_mv_prec_nn_bias_layer_0,
+ av1_mv_prec_nn_bias_layer_1,
+ },
+};
+#undef NUM_DNN_LAYERS
+#undef NUM_DNN_FEATURES
+#undef MV_PREC_LAYER_SIZE_0
+#undef NUM_LOGITS
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/ml.c b/third_party/aom/av1/encoder/ml.c
new file mode 100644
index 0000000000..94cd56c5d1
--- /dev/null
+++ b/third_party/aom/av1/encoder/ml.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/mathutils.h"
+#include "av1/encoder/ml.h"
+
+void av1_nn_output_prec_reduce(float *const output, int num_output) {
+ const int prec_bits = 9;
+ const int prec = 1 << prec_bits;
+ const float inv_prec = (float)(1.0 / prec);
+ for (int i = 0; i < num_output; i++) {
+ output[i] = ((int)(output[i] * prec + 0.5)) * inv_prec;
+ }
+}
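+
+// For example, with prec_bits = 9 an output value of 0.30f becomes
+// (int)(0.30f * 512 + 0.5) / 512.0f = 154 / 512.0f = 0.30078125f, so the C
+// and SIMD implementations agree on the low-order bits.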
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict_c(const float *input_nodes,
+ const NN_CONFIG *const nn_config, int reduce_prec,
+ float *const output) {
+ int num_input_nodes = nn_config->num_inputs;
+ int buf_index = 0;
+ float buf[2][NN_MAX_NODES_PER_LAYER];
+
+ // Propagate hidden layers.
+ const int num_layers = nn_config->num_hidden_layers;
+ assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+ for (int layer = 0; layer < num_layers; ++layer) {
+ const float *layer_weights = nn_config->weights[layer];
+ const float *layer_bias = nn_config->bias[layer];
+ float *output_nodes = buf[buf_index];
+ const int num_output_nodes = nn_config->num_hidden_nodes[layer];
+ assert(num_output_nodes < NN_MAX_NODES_PER_LAYER);
+ for (int node = 0; node < num_output_nodes; ++node) {
+ float val = layer_bias[node];
+ for (int i = 0; i < num_input_nodes; ++i)
+ val += layer_weights[node * num_input_nodes + i] * input_nodes[i];
+ // ReLU as activation function.
+ val = val > 0.0f ? val : 0.0f; // Could use AOMMAX().
+ output_nodes[node] = val;
+ }
+ num_input_nodes = num_output_nodes;
+ input_nodes = output_nodes;
+ buf_index = 1 - buf_index;
+ }
+
+ // Final output layer.
+ const float *layer_weights = nn_config->weights[num_layers];
+ const float *layer_bias = nn_config->bias[num_layers];
+ for (int node = 0; node < nn_config->num_outputs; ++node) {
+ float val = layer_bias[node];
+ for (int i = 0; i < num_input_nodes; ++i)
+ val += layer_weights[node * num_input_nodes + i] * input_nodes[i];
+ output[node] = val;
+ }
+ if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
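+
+// Illustrative sketch (not part of libaom, hence guarded out of compilation):
+// how a caller might evaluate a tiny 2-input, 3-hidden-node, 1-output network
+// with av1_nn_predict_c(). All weight values below are made up.
+#if 0
+static const float demo_weights_hidden[] = {
+  0.5f,  -0.5f,  // hidden node 0: one weight per input
+  1.0f,  1.0f,   // hidden node 1
+  -1.0f, 0.0f,   // hidden node 2
+};
+static const float demo_bias_hidden[] = { 0.0f, -0.5f, 0.25f };
+static const float demo_weights_out[] = { 1.0f, -1.0f, 0.5f };
+static const float demo_bias_out[] = { 0.1f };
+static const NN_CONFIG demo_nn_config = {
+  2,  // num_inputs
+  1,  // num_outputs
+  1,  // num_hidden_layers
+  { 3 },
+  { demo_weights_hidden, demo_weights_out },
+  { demo_bias_hidden, demo_bias_out },
+};
+
+static float demo_predict(float a, float b) {
+  const float inputs[2] = { a, b };
+  float output[1];
+  av1_nn_predict_c(inputs, &demo_nn_config, /*reduce_prec=*/0, output);
+  return output[0];
+}
+#endif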
+
+#if CONFIG_NN_V2
+// Applies the ReLU activation to one fc layer:
+// output[i] = max(input[i], 0.0f)
+static float *nn_relu(const float *input, FC_LAYER *layer) {
+ for (int i = 0; i < layer->num_outputs; ++i) {
+ layer->output[i] = AOMMAX(input[i], 0.0f);
+ }
+
+ return layer->output;
+}
+
+// Applies the sigmoid activation to one fc layer:
+// output[i] = 1 / (1 + exp(-input[i]))
+static float *nn_sigmoid(const float *input, FC_LAYER *layer) {
+ for (int i = 0; i < layer->num_outputs; ++i) {
+ const float tmp = AOMMIN(AOMMAX(input[i], -10.0f), 10.0f);
+ layer->output[i] = 1.0f / (1.0f + expf(-tmp));
+ }
+
+ return layer->output;
+}
+
+// Forward prediction in one fc layer, used in function av1_nn_predict_V2
+static float *nn_fc_forward(const float *input, FC_LAYER *layer) {
+ const float *weights = layer->weights;
+ const float *bias = layer->bias;
+ assert(layer->num_outputs < NN_MAX_NODES_PER_LAYER);
+ // fc
+ for (int node = 0; node < layer->num_outputs; ++node) {
+ float val = bias[node];
+ for (int i = 0; i < layer->num_inputs; ++i) val += weights[i] * input[i];
+ layer->output[node] = val;
+ weights += layer->num_inputs;
+ }
+
+ // activation
+ switch (layer->activation) {
+ case NONE: return layer->output;
+ case RELU: return nn_relu(layer->output, layer);
+ case SIGMOID: return nn_sigmoid(layer->output, layer);
+    case SOFTSIGN:
+      assert(0 && "Softsign is not yet supported in NN.");  // TODO
+ return NULL;
+ default:
+ assert(0 && "Unknown activation"); // Unknown activation
+ return NULL;
+ }
+}
+
+void av1_nn_predict_v2(const float *feature, NN_CONFIG_V2 *nn_config,
+ int reduce_prec, float *output) {
+ const float *input_nodes = feature;
+
+ // Propagate the layers.
+ const int num_layers = nn_config->num_hidden_layers;
+ assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+ for (int i = 0; i < num_layers; ++i) {
+ input_nodes = nn_fc_forward(input_nodes, nn_config->layer + i);
+ assert(nn_config->layer[i + 1].num_inputs ==
+ nn_config->layer[i].num_outputs);
+ }
+
+ // Final layer
+ input_nodes = nn_fc_forward(input_nodes, nn_config->layer + num_layers);
+ assert(nn_config->layer[num_layers].num_outputs == nn_config->num_logits);
+ // Copy the final layer output
+ memcpy(output, input_nodes, sizeof(*input_nodes) * nn_config->num_logits);
+ if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_logits);
+}
+#endif // CONFIG_NN_V2
+
+void av1_nn_softmax(const float *input, float *output, int n) {
+ // Softmax function is invariant to adding the same constant
+ // to all input values, so we subtract the maximum input to avoid
+ // possible overflow.
+ float max_input = input[0];
+ for (int i = 1; i < n; i++) max_input = AOMMAX(max_input, input[i]);
+ float sum_out = 0.0f;
+ for (int i = 0; i < n; i++) {
+ // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors.
+ const float normalized_input = AOMMAX(input[i] - max_input, -10.0f);
+ output[i] = expf(normalized_input);
+ sum_out += output[i];
+ }
+ for (int i = 0; i < n; i++) output[i] /= sum_out;
+}
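+
+// For example, av1_nn_softmax applied to { 0.0f, 1.0f, 2.0f } subtracts the
+// maximum (2.0), computes exp(-2), exp(-1), exp(0) = 0.135, 0.368, 1.0, and
+// divides by their sum (1.503), giving approximately { 0.090, 0.245, 0.665 }.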
+
+void av1_nn_fast_softmax_16_c(const float *input, float *output) {
+ const int kNumClasses = 16;
+ float max_input = input[0];
+ for (int i = 1; i < kNumClasses; i++) max_input = AOMMAX(max_input, input[i]);
+ float sum_out = 0.0f;
+ for (int i = 0; i < kNumClasses; i++) {
+ // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors.
+ const float normalized_input = AOMMAX(input[i] - max_input, -10.0f);
+ output[i] = approx_exp(normalized_input);
+ sum_out += output[i];
+ }
+ for (int i = 0; i < kNumClasses; i++) output[i] /= sum_out;
+}
diff --git a/third_party/aom/av1/encoder/ml.h b/third_party/aom/av1/encoder/ml.h
new file mode 100644
index 0000000000..566f9271dd
--- /dev/null
+++ b/third_party/aom/av1/encoder/ml.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ML_H_
+#define AOM_AV1_ENCODER_ML_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/av1_rtcd.h"
+
+#define NN_MAX_HIDDEN_LAYERS 10
+#define NN_MAX_NODES_PER_LAYER 128
+
+struct NN_CONFIG {
+ int num_inputs; // Number of input nodes, i.e. features.
+ int num_outputs; // Number of output nodes.
+ int num_hidden_layers; // Number of hidden layers, maximum 10.
+ // Number of nodes for each hidden layer.
+ int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS];
+ // Weight parameters, indexed by layer.
+ const float *weights[NN_MAX_HIDDEN_LAYERS + 1];
+ // Bias parameters, indexed by layer.
+ const float *bias[NN_MAX_HIDDEN_LAYERS + 1];
+};
+// Typedef from struct NN_CONFIG to NN_CONFIG is in rtcd_defs
+
+#if CONFIG_NN_V2
+// Fully-connected layer configuration
+struct FC_LAYER {
+ const int num_inputs; // Number of input nodes, i.e. features.
+ const int num_outputs; // Number of output nodes.
+
+ float *weights; // Weight parameters.
+ float *bias; // Bias parameters.
+ const ACTIVATION activation; // Activation function.
+
+ float *output; // The output array.
+  float *dY;      // Gradient of outputs.
+  float *dW;      // Gradient of weights.
+  float *db;      // Gradient of bias.
+};
+
+// NN configure structure V2
+struct NN_CONFIG_V2 {
+ const int num_hidden_layers; // Number of hidden layers, max = 10.
+ FC_LAYER layer[NN_MAX_HIDDEN_LAYERS + 1]; // The layer array
+ const int num_logits; // Number of output nodes.
+ float *logits; // Raw prediction (same as output of final layer)
+ const LOSS loss; // Loss function
+};
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict_v2(const float *features, NN_CONFIG_V2 *nn_config,
+ int reduce_prec, float *output);
+#endif // CONFIG_NN_V2
+
+// Applies the softmax normalization function to the input
+// to get a valid probability distribution in the output:
+// output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k]))
+void av1_nn_softmax(const float *input, float *output, int n);
+
+// A faster but less accurate version of av1_nn_softmax(input, output, 16)
+void av1_nn_fast_softmax_16_c(const float *input, float *output);
+
+// Applies a precision reduction to the output of av1_nn_predict to prevent
+// mismatches between the C and SIMD implementations.
+void av1_nn_output_prec_reduce(float *const output, int num_output);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ML_H_
diff --git a/third_party/aom/av1/encoder/mode_prune_model_weights.h b/third_party/aom/av1/encoder/mode_prune_model_weights.h
new file mode 100644
index 0000000000..98ec36808a
--- /dev/null
+++ b/third_party/aom/av1/encoder/mode_prune_model_weights.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NUM_HIDDEN_LAYERS_12 1
+#define NUM_FEATURES_12 6
+#define NUM_LAYER_0_UNITS_12 24
+#define NUM_LOGITS_12 2
+
+static const float av1_intrap_hiddenlayer_0_kernel_12[] = {
+ 7.28372f, -1.3333898f, -1.3180022f, -0.007156151f, -0.40799126f,
+ -0.57538104f, -31.81647f, 6.7057495f, 6.351472f, -0.029544508f,
+ 0.026801195f, 1.12863f, -0.70769817f, -0.24183524f, 0.0649113f,
+ -0.7189517f, 0.21791299f, 0.12840256f, -0.56424767f, 0.16924907f,
+ 0.4605501f, -0.170895f, -0.60358995f, -0.15383226f, -4.0523643f,
+ 0.6961917f, 1.3100256f, -0.4189354f, 0.37264112f, -0.14555685f,
+ 10.628014f, 8.184437f, 8.941916f, -0.011731001f, -0.45127156f,
+ 0.42704004f, 36.84277f, 8.988796f, 8.844238f, 0.00030091056f,
+ -0.022038324f, 1.3566176f, -8.863219f, -0.84811693f, -1.0908632f,
+ 0.00023130262f, -1.0698471f, -6.755927f, 7.1711984f, 4.7216063f,
+ 3.5099216f, -0.6650184f, 0.5935173f, -0.6696286f, 11.8595295f,
+ 0.3001874f, 0.29822728f, 0.04319222f, -1.203178f, 1.1210147f,
+ 0.035045594f, -0.20559944f, -0.015388541f, -0.7857941f, -0.94100875f,
+ -0.1278549f, -19.22603f, 7.9466896f, 6.5048656f, -0.22195444f,
+ 0.19061874f, 1.3927288f, -8.896529f, -0.48146892f, -1.6098932f,
+ -0.0030235797f, -0.6533787f, -2.1333003f, -22.256454f, -4.934058f,
+ -4.4707212f, -0.015831878f, -0.4243649f, -2.776269f, -0.23762038f,
+ 0.1820098f, -0.51865315f, -1.1893421f, 0.34969202f, 0.10636194f,
+ 14.545696f, 1.3849198f, 2.6815193f, -0.5145498f, 0.45948258f,
+ -0.8842355f, -0.9111363f, -0.39652422f, 0.077266276f, -0.68084997f,
+ 0.4593515f, -0.28872707f, -6.936231f, 1.12253f, 1.7616503f,
+ -0.014069137f, -0.0052156276f, -4.5095444f, 6.2076726f, -0.058755957f,
+ -0.4675936f, -0.13039507f, 0.12094394f, -0.07285393f, 68.26125f,
+ 7.4893136f, 8.770954f, 0.020274093f, -0.027877754f, 1.6579602f,
+ -0.1825479f, 0.34832543f, 0.07472531f, -0.44812247f, -1.0941806f,
+ -0.16749863f, 1.1394324f, 0.47983396f, -0.99983627f, -0.00064249727f,
+ -1.3345739f, -0.057157427f, -18.14875f, 16.506035f, 15.539248f,
+ 0.013191509f, -0.021674965f, -25.006235f, 0.51220596f, 0.7334426f,
+ 0.81836903f, -1.0443225f, 0.4459505f, -1.2045046f
+};
+
+static const float av1_intrap_hiddenlayer_0_bias_12[] = {
+ -4.154915f, 14.33833f, 0.0f, 0.0f, 2.0440118f, 12.40922f,
+ -16.77514f, 0.5879813f, 3.2305415f, 0.8303539f, 0.0f, 14.488708f,
+ 2.94393f, 1.874383f, 0.0f, -0.53140444f, 0.0f, 1.8456234f,
+ -0.55427986f, -19.856262f, 0.0f, 0.17281002f, 48.31631f, 0.0f
+};
+
+static const float av1_intrap_logits_kernel_12[] = {
+ 0.26843873f, -0.09576241f, 0.34427166f, 0.09914787f, -0.10275399f,
+ 0.02999484f, -0.1467772f, 0.11594324f, 0.29200763f, 0.0067976206f,
+ 0.050393578f, -0.018694371f, 0.3333476f, 0.2127221f, 0.35128218f,
+ 0.19968672f, 0.08099991f, 0.084850654f, -0.16045967f, 0.30286232f,
+ 0.6164765f, -0.27140254f, 0.08210814f, 0.34852806f, 0.25028184f,
+ -0.12188078f, 0.16310331f, 0.31253803f, -0.10792341f, 0.065858394f,
+ -0.1349708f, 0.08948815f, 0.31905392f, 0.03680656f, -0.05040944f,
+ -0.051539157f, 0.3211852f, 0.2137136f, 0.45037416f, 0.22748767f,
+ -0.10978614f, 0.06475646f, -0.16954158f, 0.32831904f, 0.16479677f,
+ -0.30020145f, 0.066221856f, 0.37213042f
+};
+
+static const float av1_intrap_logits_bias_12[] = { 0.95783f, -0.95823103f };
+
+static const NN_CONFIG av1_intrap_nn_config = {
+ NUM_FEATURES_12,
+ NUM_LOGITS_12,
+ NUM_HIDDEN_LAYERS_12,
+ {
+ NUM_LAYER_0_UNITS_12,
+ },
+ {
+ av1_intrap_hiddenlayer_0_kernel_12,
+ av1_intrap_logits_kernel_12,
+ },
+ {
+ av1_intrap_hiddenlayer_0_bias_12,
+ av1_intrap_logits_bias_12,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_12
+#undef NUM_FEATURES_12
+#undef NUM_LAYER_0_UNITS_12
+#undef NUM_LOGITS_12
+
+#define NUM_HIDDEN_LAYERS_15 1
+#define NUM_FEATURES_15 6
+#define NUM_LAYER_0_UNITS_15 24
+#define NUM_LOGITS_15 2
+
+static const float av1_intraph_hiddenlayer_0_kernel_15[] = {
+ -0.77480125f, 0.3219551f, -0.015702145f, -0.5310235f, 0.5254026f,
+ -1.1522819f, 2.682016f, 0.08001052f, -0.2539285f, 0.04711023f,
+ -0.81296307f, 0.2675382f, 0.1952474f, -0.0664705f, 1.2989824f,
+ -0.3150117f, -0.8022715f, 0.045423955f, -27.584324f, -2.5608704f,
+ -3.2280366f, 0.05272543f, -0.47141576f, -0.07644298f, -53.77942f,
+ -22.393923f, -23.027853f, -0.00015186476f, -0.010696465f, 2.7064638f,
+ -22.776028f, 11.514891f, 11.138167f, -0.001243723f, -0.4802433f,
+ -8.758646f, 0.26398206f, -0.23485385f, 0.27586034f, -0.004954741f,
+ -0.4935232f, -0.017607696f, 69.56049f, -1.1756641f, -0.052366666f,
+ -0.38052833f, 0.32474658f, 0.04634263f, 0.8583235f, -0.528438f,
+ -0.7868907f, -0.4757781f, 0.4620985f, -0.70621157f, 231.40195f,
+ 6.805205f, 9.420295f, 0.02585775f, -0.03480937f, 1.3577378f,
+ 0.1758226f, 15.056758f, 14.437874f, -0.1305005f, 0.115103304f,
+ 0.21297209f, 55.821743f, -6.611156f, -6.8552365f, -0.011928095f,
+ -0.2042175f, 1.2557873f, -1.0722278f, -0.2683614f, 0.48318478f,
+ -0.73739994f, 0.54055226f, -0.03224738f, -0.06767959f, -0.21015017f,
+ 0.29171246f, -0.6937296f, -1.2342545f, -0.41278538f, -37.9365f,
+ 17.68424f, 16.263042f, -0.074828684f, 0.06607806f, -0.16763286f,
+ 13.594707f, 0.6152676f, -0.4371223f, -0.8365592f, 0.8273623f,
+ -1.2126317f, 0.1216157f, -1.3002136f, -0.18856938f, -0.2589358f,
+ -0.76897144f, 0.21777137f, -122.25033f, -0.23490006f, -3.1238277f,
+ -0.13916978f, 0.08576391f, -1.7391548f, -116.24812f, 14.906071f,
+ 13.468357f, 0.02332889f, -0.034617376f, -18.506111f, 0.7500542f,
+ -1.1882535f, 0.40848416f, -0.28434393f, -0.71471655f, -0.29188696f,
+ -0.46588746f, -0.17324813f, -0.62460244f, -1.1801276f, 0.28993344f,
+ -0.22072886f, 129.2688f, -0.33782578f, -0.34836572f, -0.034112718f,
+ -0.023666814f, -0.5865087f, -33.484146f, 1.1431375f, 0.56056374f,
+ -0.0049730353f, -0.24347587f, -1.3003352f, 0.88973033f, 0.8499571f,
+ -0.5678484f, -0.39009875f, -0.062105156f, -0.13965102f
+};
+
+static const float av1_intraph_hiddenlayer_0_bias_15[] = {
+ 0.0f, -0.2926711f, 0.0f, -1.0303509f, -27.459345f, 12.412848f,
+ 0.0f, -2.5971522f, -0.02733541f, -19.881912f, 14.391992f, -8.249469f,
+ 0.0f, 0.0f, 13.676118f, -0.6472994f, -0.07189449f, 1.1986839f,
+ 52.479107f, 0.0f, 0.0f, -3.0187025f, 1.4435643f, 0.0f
+};
+
+static const float av1_intraph_logits_kernel_15[] = {
+ 0.05390722f, -0.06859513f, 0.036842898f, 0.190772f, 0.13623567f,
+ 0.09321194f, 0.2314745f, -0.13958375f, -0.3058229f, -0.0104543045f,
+ 0.11336068f, -0.276115f, 0.00470723f, -0.49123898f, -0.15988174f,
+ 0.087681435f, 0.022517204f, 0.073877744f, 0.2968856f, -0.1401399f,
+ -0.38788354f, -0.26005393f, -0.39564916f, -0.16195515f, 0.2680102f,
+ -0.032179773f, -0.35758728f, 0.25819537f, 0.11468631f, 0.13573235f,
+ -0.2672175f, 0.016490124f, 0.048118807f, 0.020319486f, 0.07892215f,
+ -0.21821865f, 0.08434734f, 0.3129456f, -0.18215221f, 0.08884877f,
+ -0.35621428f, 0.11405768f, 0.27370325f, 0.14956686f, 0.01604587f,
+ -0.18334487f, -0.42385718f, -0.08033409f
+};
+
+static const float av1_intraph_logits_bias_15[] = { 0.83619016f, -0.8340626f };
+
+static const NN_CONFIG av1_intrap_hd_nn_config = {
+ NUM_FEATURES_15,
+ NUM_LOGITS_15,
+ NUM_HIDDEN_LAYERS_15,
+ {
+ NUM_LAYER_0_UNITS_15,
+ },
+ {
+ av1_intraph_hiddenlayer_0_kernel_15,
+ av1_intraph_logits_kernel_15,
+ },
+ {
+ av1_intraph_hiddenlayer_0_bias_15,
+ av1_intraph_logits_bias_15,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_15
+#undef NUM_FEATURES_15
+#undef NUM_LAYER_0_UNITS_15
+#undef NUM_LOGITS_15
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/model_rd.h b/third_party/aom/av1/encoder/model_rd.h
new file mode 100644
index 0000000000..f7e8b96b5b
--- /dev/null
+++ b/third_party/aom/av1/encoder/model_rd.h
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MODEL_RD_H_
+#define AOM_AV1_ENCODER_MODEL_RD_H_
+
+#include "aom/aom_integer.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/pustats.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "config/aom_dsp_rtcd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// 0: Legacy model
+// 1: Curve fit model
+// 2: Surface fit model
+// 3: DNN regression model
+// 4: Full rd model
+#define MODELRD_TYPE_INTERP_FILTER 1
+#define MODELRD_TYPE_TX_SEARCH_PRUNE 1
+#define MODELRD_TYPE_MASKED_COMPOUND 1
+#define MODELRD_TYPE_INTERINTRA 1
+#define MODELRD_TYPE_INTRA 1
+#define MODELRD_TYPE_MOTION_MODE_RD 1
+
+typedef void (*model_rd_for_sb_type)(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum,
+ uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
+ int64_t *plane_sse, int64_t *plane_dist);
+typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist);
+
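+// Computes the sum of squared error between the source and the prediction
+// for a bw x bh block. For high bitdepth input the result is renormalized to
+// the 8-bit scale: e.g. for 10-bit content, shift = 2, so the raw SSE is
+// divided by 2^(2 * 2) = 16.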
+static int64_t calculate_sse(MACROBLOCKD *const xd,
+ const struct macroblock_plane *p,
+ struct macroblockd_plane *pd, const int bw,
+ const int bh) {
+ int64_t sse = 0;
+ const int shift = xd->bd - 8;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ bw, bh);
+ } else {
+ sse =
+ aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh);
+ }
+#else
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh);
+#endif
+ sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+ return sse;
+}
+
+static AOM_INLINE int64_t compute_sse_plane(MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane, const BLOCK_SIZE bsize) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ int bw, bh;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
+ &bh);
+
+ int64_t sse = calculate_sse(xd, p, pd, bw, bh);
+
+ return sse;
+}
+
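+// Estimates rate and distortion for a plane directly from its SSE, using
+// either a cheap linear approximation (when the simple_model_rd_from_var
+// speed feature is on) or the Laplacian-PDF based model in
+// av1_model_rd_from_var_lapndz().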
+static AOM_INLINE void model_rd_from_sse(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples,
+ int *rate, int64_t *dist) {
+ (void)num_samples;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+
+ // Fast approximation of the modelling function.
+ if (cpi->sf.rd_sf.simple_model_rd_from_var) {
+ const int64_t square_error = sse;
+ int quantizer = p->dequant_QTX[1] >> dequant_shift;
+ if (quantizer < 120)
+ *rate = (int)AOMMIN(
+ (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT),
+ INT_MAX);
+ else
+ *rate = 0;
+ assert(*rate >= 0);
+ *dist = (square_error * quantizer) >> 8;
+ } else {
+ av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize],
+ p->dequant_QTX[1] >> dequant_shift, rate,
+ dist);
+ }
+ *dist <<= 4;
+}
+
+// Fits curves for rate and distortion using log2(sse_norm / qstep^2) as the
+// feature.
+static AOM_INLINE void model_rd_with_curvfit(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples,
+ int *rate, int64_t *dist) {
+ (void)cpi;
+ (void)plane_bsize;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+ const int qstep = AOMMAX(p->dequant_QTX[1] >> dequant_shift, 1);
+
+ if (sse == 0) {
+ if (rate) *rate = 0;
+ if (dist) *dist = 0;
+ return;
+ }
+ const double sse_norm = (double)sse / num_samples;
+ const double qstepsqr = (double)qstep * qstep;
+ const double xqr = log2(sse_norm / qstepsqr);
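+ // xqr == 0 when the mean per-sample error equals qstep^2; positive values
+ // mean the residual dominates the quantizer step, negative the reverse.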
+ double rate_f, dist_by_sse_norm_f;
+ av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, &rate_f,
+ &dist_by_sse_norm_f);
+
+ const double dist_f = dist_by_sse_norm_f * sse_norm;
+ int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+ int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+
+ // Check if skip is better
+ if (rate_i == 0) {
+ dist_i = sse << 4;
+ } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+ RDCOST(x->rdmult, 0, sse << 4)) {
+ rate_i = 0;
+ dist_i = sse << 4;
+ }
+
+ if (rate) *rate = rate_i;
+ if (dist) *dist = dist_i;
+}
+
+static AOM_INLINE void model_rd_for_sb(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum,
+ uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
+ int64_t *plane_sse, int64_t *plane_dist) {
+ // Note our transform coeffs are 8 times an orthogonal transform. Hence the
+ // quantizer step is also 8 times. To get the effective quantizer we need
+ // to divide by 8 before sending it to the modeling function.
+ int plane;
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
+
+ assert(bsize < BLOCK_SIZES_ALL);
+
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ int64_t sse;
+ int rate;
+ int64_t dist;
+
+ sse = calculate_sse(xd, p, pd, bw, bh);
+
+ model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist);
+
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ total_sse += sse;
+ rate_sum += rate;
+ dist_sum += dist;
+ if (plane_rate) plane_rate[plane] = rate;
+ if (plane_sse) plane_sse[plane] = sse;
+ if (plane_dist) plane_dist[plane] = dist;
+ assert(rate_sum >= 0);
+ }
+
+ if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+ if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ rate_sum = AOMMIN(rate_sum, INT_MAX);
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
+}
+
+static AOM_INLINE void model_rd_for_sb_with_curvfit(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum,
+ uint8_t *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
+ int64_t *plane_sse, int64_t *plane_dist) {
+ // Note our transform coeffs are 8 times an orthogonal transform. Hence the
+ // quantizer step is also 8 times. To get the effective quantizer we need
+ // to divide by 8 before sending it to the modeling function.
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
+
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ int64_t dist, sse;
+ int rate;
+ int bw, bh;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+ &bw, &bh);
+
+ sse = calculate_sse(xd, p, pd, bw, bh);
+ model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+ &dist);
+
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ total_sse += sse;
+ rate_sum += rate;
+ dist_sum += dist;
+
+ if (plane_rate) plane_rate[plane] = rate;
+ if (plane_sse) plane_sse[plane] = sse;
+ if (plane_dist) plane_dist[plane] = dist;
+ }
+
+ if (skip_txfm_sb) *skip_txfm_sb = rate_sum == 0;
+ if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
+}
+
+enum { MODELRD_LEGACY, MODELRD_CURVFIT, MODELRD_TYPES } UENUM1BYTE(ModelRdType);
+
+static const model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = {
+ model_rd_for_sb, model_rd_for_sb_with_curvfit
+};
+
+static const model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = {
+ model_rd_from_sse, model_rd_with_curvfit
+};
+
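+// Illustrative usage (a sketch, not part of this header): callers pick a
+// model via the MODELRD_TYPE_* macros and dispatch through the tables, e.g.
+//   model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](cpi, bsize, x, xd, 0,
+//       num_planes - 1, &rate_sum, &dist_sum, &skip_txfm_sb, &skip_sse_sb,
+//       NULL, NULL, NULL);
+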
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_ENCODER_MODEL_RD_H_
diff --git a/third_party/aom/av1/encoder/motion_search_facade.c b/third_party/aom/av1/encoder/motion_search_facade.c
new file mode 100644
index 0000000000..e7eec29dc3
--- /dev/null
+++ b/third_party/aom/av1/encoder/motion_search_facade.c
@@ -0,0 +1,1071 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/interp_search.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/tx_search.h"
+
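+// Divides an mv component by 8, rounding to nearest (ties away from zero).
+// Used below to treat full-pel candidate mvs that fall into the same 8-pel
+// bucket as duplicates.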
+#define RIGHT_SHIFT_MV(x) (((x) + 3 + ((x) >= 0)) >> 3)
+
+typedef struct {
+ int_mv fmv;
+ int weight;
+} cand_mv_t;
+
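+// qsort() comparator: orders tpl mv candidates by decreasing weight.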
+static int compare_weight(const void *a, const void *b) {
+ const int diff = ((cand_mv_t *)a)->weight - ((cand_mv_t *)b)->weight;
+ if (diff < 0)
+ return 1;
+ else if (diff > 0)
+ return -1;
+ return 0;
+}
+
+// Allow more mesh searches for screen content type on the ARF.
+static int use_fine_search_interval(const AV1_COMP *const cpi) {
+ return cpi->is_screen_content_type &&
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == ARF_UPDATE &&
+ cpi->oxcf.speed <= 2;
+}
+
+// Iterate through the tpl data and collect the mvs to be used as candidates.
+static INLINE void get_mv_candidate_from_tpl(const AV1_COMP *const cpi,
+ const MACROBLOCK *x,
+ BLOCK_SIZE bsize, int ref,
+ cand_mv_t *cand, int *cand_count,
+ int *total_cand_weight) {
+ const SuperBlockEnc *sb_enc = &x->sb_enc;
+ if (!sb_enc->tpl_data_count) {
+ return;
+ }
+
+ const AV1_COMMON *cm = &cpi->common;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ const BLOCK_SIZE tpl_bsize =
+ convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+ const int tplw = mi_size_wide[tpl_bsize];
+ const int tplh = mi_size_high[tpl_bsize];
+ const int nw = mi_size_wide[bsize] / tplw;
+ const int nh = mi_size_high[bsize] / tplh;
+
+ if (nw >= 1 && nh >= 1) {
+ const int of_h = mi_row % mi_size_high[cm->seq_params->sb_size];
+ const int of_w = mi_col % mi_size_wide[cm->seq_params->sb_size];
+ const int start = of_h / tplh * sb_enc->tpl_stride + of_w / tplw;
+ int valid = 1;
+
+ // Assign large weight to start_mv, so it is always tested.
+ cand[0].weight = nw * nh;
+
+ for (int k = 0; k < nh; k++) {
+ for (int l = 0; l < nw; l++) {
+ const int_mv mv =
+ sb_enc
+ ->tpl_mv[start + k * sb_enc->tpl_stride + l][ref - LAST_FRAME];
+ if (mv.as_int == INVALID_MV) {
+ valid = 0;
+ break;
+ }
+
+ const FULLPEL_MV fmv = { GET_MV_RAWPEL(mv.as_mv.row),
+ GET_MV_RAWPEL(mv.as_mv.col) };
+ int unique = 1;
+ for (int m = 0; m < *cand_count; m++) {
+ if (RIGHT_SHIFT_MV(fmv.row) ==
+ RIGHT_SHIFT_MV(cand[m].fmv.as_fullmv.row) &&
+ RIGHT_SHIFT_MV(fmv.col) ==
+ RIGHT_SHIFT_MV(cand[m].fmv.as_fullmv.col)) {
+ unique = 0;
+ cand[m].weight++;
+ break;
+ }
+ }
+
+ if (unique) {
+ cand[*cand_count].fmv.as_fullmv = fmv;
+ cand[*cand_count].weight = 1;
+ (*cand_count)++;
+ }
+ }
+ if (!valid) break;
+ }
+
+ if (valid) {
+ *total_cand_weight = 2 * nh * nw;
+ if (*cand_count > 2)
+ qsort(cand, *cand_count, sizeof(cand[0]), &compare_weight);
+ }
+ }
+}
+
+void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int ref_idx, int *rate_mv,
+ int search_range, inter_mode_info *mode_info,
+ int_mv *best_mv,
+ struct HandleInterModeArgs *const args) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const AV1_COMMON *cm = &cpi->common;
+ const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params;
+ const int num_planes = av1_num_planes(cm);
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+ int bestsme = INT_MAX;
+ const int ref = mbmi->ref_frame[ref_idx];
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const MvCosts *mv_costs = x->mv_costs;
+
+ if (scaled_ref_frame) {
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // full-pixel motion search code to be used without additional
+ // modifications.
+ for (int i = 0; i < num_planes; i++) {
+ backup_yv12[i] = xd->plane[i].pre[ref_idx];
+ }
+ av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+ num_planes);
+ }
+
+ // Work out the size of the first step in the mv step search.
+ // 0 here is the maximum-length first step; 1 halves that, and so on.
+ int step_param;
+ if (cpi->sf.mv_sf.auto_mv_step_size && cm->show_frame) {
+ // Take the weighted average of the step_params based on the last frame's
+ // max mv magnitude and that based on the best ref mvs of the current
+ // block for the given reference.
+ step_param = (av1_init_search_range(x->max_mv_context[ref]) +
+ mv_search_params->mv_step_param) /
+ 2;
+ } else {
+ step_param = mv_search_params->mv_step_param;
+ }
+
+ const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;
+ FULLPEL_MV start_mv;
+ if (mbmi->motion_mode != SIMPLE_TRANSLATION)
+ start_mv = get_fullmv_from_mv(&mbmi->mv[0].as_mv);
+ else
+ start_mv = get_fullmv_from_mv(&ref_mv);
+
+ // cand stores start_mv and all possible MVs in a SB.
+ cand_mv_t cand[MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB + 1];
+ av1_zero(cand);
+ cand[0].fmv.as_fullmv = start_mv;
+ int cnt = 1;
+ int total_weight = 0;
+
+ if (!cpi->sf.mv_sf.full_pixel_search_level &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION) {
+ get_mv_candidate_from_tpl(cpi, x, bsize, ref, cand, &cnt, &total_weight);
+ }
+
+ const int cand_cnt = AOMMIN(2, cnt);
+ // TODO(any): Test the speed feature for OBMC_CAUSAL mode.
+ if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION) {
+ const int stack_size = args->start_mv_cnt;
+ for (int cand_idx = 0; cand_idx < cand_cnt; cand_idx++) {
+ int_mv *fmv_cand = &cand[cand_idx].fmv;
+ int skip_cand_mv = 0;
+
+ // Check difference between mvs in the stack and candidate mv.
+ for (int stack_idx = 0; stack_idx < stack_size; stack_idx++) {
+ const uint8_t this_ref_mv_idx = args->ref_mv_idx_stack[stack_idx];
+ const FULLPEL_MV *fmv_stack = &args->start_mv_stack[stack_idx];
+ const int this_newmv_valid =
+ args->single_newmv_valid[this_ref_mv_idx][ref];
+ const int row_diff = abs(fmv_stack->row - fmv_cand->as_fullmv.row);
+ const int col_diff = abs(fmv_stack->col - fmv_cand->as_fullmv.col);
+
+ if (!this_newmv_valid) continue;
+
+ if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv >= 2) {
+ // Prunes the current start_mv candidate, if the absolute mv
+ // difference of both row and column are <= 1.
+ if (row_diff <= 1 && col_diff <= 1) {
+ skip_cand_mv = 1;
+ break;
+ }
+ } else if (cpi->sf.mv_sf.skip_fullpel_search_using_startmv >= 1) {
+ // Prunes the current start_mv candidate, if the sum of the absolute
+ // mv difference of row and column is <= 1.
+ if (row_diff + col_diff <= 1) {
+ skip_cand_mv = 1;
+ break;
+ }
+ }
+ }
+ if (skip_cand_mv) {
+ // Ensure at least one full-pel motion search is not pruned.
+ assert(mbmi->ref_mv_idx != 0);
+ // Mark the candidate mv as invalid so that motion search gets skipped.
+ cand[cand_idx].fmv.as_int = INVALID_MV;
+ } else {
+ // Store start_mv candidate and corresponding ref_mv_idx of full-pel
+ // search in the mv stack (except last ref_mv_idx).
+ if (mbmi->ref_mv_idx != MAX_REF_MV_SEARCH - 1) {
+ assert(args->start_mv_cnt < (MAX_REF_MV_SEARCH - 1) * 2);
+ args->start_mv_stack[args->start_mv_cnt] = fmv_cand->as_fullmv;
+ args->ref_mv_idx_stack[args->start_mv_cnt] = mbmi->ref_mv_idx;
+ args->start_mv_cnt++;
+ }
+ }
+ }
+ }
+
+ // Hot fix for asan complaints when resize mode is on. When resize mode is
+ // on, the stride of the reference frame can differ from that indicated by
+ // MotionVectorSearchParams::search_site_cfg. When this happens, the stride
+ // needs to be readjusted.
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, mv_sf, bsize);
+ const search_site_config *src_search_site_cfg =
+ av1_get_search_site_config(cpi, x, search_method);
+
+ // Further reduce the search range.
+ if (search_range < INT_MAX) {
+ const search_site_config *search_site_cfg =
+ &src_search_site_cfg[search_method_lookup[search_method]];
+ // Max step_param is search_site_cfg->num_search_steps.
+ if (search_range < 1) {
+ step_param = search_site_cfg->num_search_steps;
+ } else {
+ while (search_site_cfg->radius[search_site_cfg->num_search_steps -
+ step_param - 1] > (search_range << 1) &&
+ search_site_cfg->num_search_steps - step_param - 1 > 0)
+ step_param++;
+ }
+ }
+
+ int cost_list[5];
+ FULLPEL_MV_STATS best_mv_stats;
+ int_mv second_best_mv;
+ best_mv->as_int = second_best_mv.as_int = INVALID_MV;
+
+ // Allow more mesh searches for screen content type on the ARF.
+ const int fine_search_interval = use_fine_search_interval(cpi);
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+
+ switch (mbmi->motion_mode) {
+ case SIMPLE_TRANSLATION: {
+ // Perform a search with the top 2 candidates
+ int sum_weight = 0;
+ for (int m = 0; m < cand_cnt; m++) {
+ int_mv smv = cand[m].fmv;
+ FULLPEL_MV this_best_mv, this_second_best_mv;
+ FULLPEL_MV_STATS this_mv_stats;
+
+ if (smv.as_int == INVALID_MV) continue;
+
+ av1_make_default_fullpel_ms_params(
+ &full_ms_params, cpi, x, bsize, &ref_mv, smv.as_fullmv,
+ src_search_site_cfg, search_method, fine_search_interval);
+
+ const int thissme =
+ av1_full_pixel_search(smv.as_fullmv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list), &this_best_mv,
+ &this_mv_stats, &this_second_best_mv);
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ best_mv->as_fullmv = this_best_mv;
+ best_mv_stats = this_mv_stats;
+ second_best_mv.as_fullmv = this_second_best_mv;
+ }
+
+ sum_weight += cand[m].weight;
+ if (4 * sum_weight > 3 * total_weight) break;
+ }
+ } break;
+ case OBMC_CAUSAL:
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
+ &ref_mv, start_mv, src_search_site_cfg,
+ search_method, fine_search_interval);
+
+ bestsme = av1_obmc_full_pixel_search(start_mv, &full_ms_params,
+ step_param, &best_mv->as_fullmv);
+ break;
+ default: assert(0 && "Invalid motion mode!\n");
+ }
+ if (best_mv->as_int == INVALID_MV) return;
+
+ if (scaled_ref_frame) {
+ // Swap back the original buffers for subpel motion search.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[ref_idx] = backup_yv12[i];
+ }
+ }
+
+ // Terminate search with the current ref_idx based on fullpel mv, rate cost,
+ // and other known costs.
+ if (cpi->sf.inter_sf.skip_newmv_in_drl >= 2 &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION &&
+ best_mv->as_int != INVALID_MV) {
+ int_mv this_mv;
+ this_mv.as_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+ const int ref_mv_idx = mbmi->ref_mv_idx;
+ const int this_mv_rate =
+ av1_mv_bit_cost(&this_mv.as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ mode_info[ref_mv_idx].full_search_mv.as_int = this_mv.as_int;
+ mode_info[ref_mv_idx].full_mv_rate = this_mv_rate;
+ mode_info[ref_mv_idx].full_mv_bestsme = bestsme;
+
+ for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) {
+ // Check if the motion search result is the same as previous results.
+ if (this_mv.as_int == mode_info[prev_ref_idx].full_search_mv.as_int) {
+ // Compare the rate cost
+ const int prev_rate_cost = mode_info[prev_ref_idx].full_mv_rate +
+ mode_info[prev_ref_idx].drl_cost;
+ const int this_rate_cost =
+ this_mv_rate + mode_info[ref_mv_idx].drl_cost;
+
+ if (prev_rate_cost <= this_rate_cost) {
+ // If the current rate_cost is worse than the previous rate_cost, then
+ // we terminate the search. Since av1_single_motion_search is only
+ // called by handle_new_mv in SIMPLE_TRANSLATION mode, we set the
+ // best_mv to INVALID mv to signal that we wish to terminate search
+ // for the current mode.
+ best_mv->as_int = INVALID_MV;
+ return;
+ }
+ }
+
+ // Terminate the evaluation of the current ref_mv_idx based on bestsme and
+ // drl_cost.
+ const int psme = mode_info[prev_ref_idx].full_mv_bestsme;
+ if (psme == INT_MAX) continue;
+ const int thr =
+ cpi->sf.inter_sf.skip_newmv_in_drl == 3 ? (psme + (psme >> 2)) : psme;
+ if (cpi->sf.inter_sf.skip_newmv_in_drl >= 3 &&
+ mode_info[ref_mv_idx].full_mv_bestsme > thr &&
+ mode_info[prev_ref_idx].drl_cost < mode_info[ref_mv_idx].drl_cost) {
+ best_mv->as_int = INVALID_MV;
+ return;
+ }
+ }
+ }
+
+ if (cpi->common.features.cur_frame_force_integer_mv) {
+ convert_fullmv_to_mv(best_mv);
+ }
+
+ const int use_fractional_mv =
+ bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0;
+ int best_mv_rate = 0;
+ int mv_rate_calculated = 0;
+ if (use_fractional_mv) {
+ int_mv fractional_ms_list[3];
+ av1_set_fractional_mv(fractional_ms_list);
+ int dis; /* TODO: use dis in distortion calculation later. */
+
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
+ cost_list);
+ MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+
+ switch (mbmi->motion_mode) {
+ case SIMPLE_TRANSLATION:
+ if (mv_sf->use_accurate_subpel_search) {
+ const int try_second = second_best_mv.as_int != INVALID_MV &&
+ second_best_mv.as_int != best_mv->as_int &&
+ (mv_sf->disable_second_mv <= 1);
+ const int best_mv_var = mv_search_params->find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats,
+ &best_mv->as_mv, &dis, &x->pred_sse[ref], fractional_ms_list);
+
+ if (try_second) {
+ struct macroblockd_plane *p = xd->plane;
+ const BUFFER_SET orig_dst = {
+ { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+ { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+ };
+ int64_t rd = INT64_MAX;
+ if (!mv_sf->disable_second_mv) {
+ // Calculate actual rd cost.
+ mbmi->mv[0].as_mv = best_mv->as_mv;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
+ bsize, 0, 0);
+ av1_subtract_plane(x, bsize, 0);
+ RD_STATS this_rd_stats;
+ av1_init_rd_stats(&this_rd_stats);
+ av1_estimate_txfm_yrd(cpi, x, &this_rd_stats, INT64_MAX, bsize,
+ max_txsize_rect_lookup[bsize]);
+ int this_mv_rate = av1_mv_bit_cost(
+ &best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ rd = RDCOST(x->rdmult, this_mv_rate + this_rd_stats.rate,
+ this_rd_stats.dist);
+ }
+
+ MV this_best_mv;
+ subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv);
+ if (av1_is_subpelmv_in_range(&ms_params.mv_limits,
+ subpel_start_mv)) {
+ unsigned int sse;
+ const int this_var = mv_search_params->find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, NULL, &this_best_mv,
+ &dis, &sse, fractional_ms_list);
+
+ if (!mv_sf->disable_second_mv) {
+ // If cpi->sf.mv_sf.disable_second_mv is 0, use actual rd cost
+ // to choose the better MV.
+ mbmi->mv[0].as_mv = this_best_mv;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
+ bsize, 0, 0);
+ av1_subtract_plane(x, bsize, 0);
+ RD_STATS tmp_rd_stats;
+ av1_init_rd_stats(&tmp_rd_stats);
+ av1_estimate_txfm_yrd(cpi, x, &tmp_rd_stats, INT64_MAX, bsize,
+ max_txsize_rect_lookup[bsize]);
+ int tmp_mv_rate = av1_mv_bit_cost(
+ &this_best_mv, &ref_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ int64_t tmp_rd =
+ RDCOST(x->rdmult, tmp_rd_stats.rate + tmp_mv_rate,
+ tmp_rd_stats.dist);
+ if (tmp_rd < rd) {
+ best_mv->as_mv = this_best_mv;
+ x->pred_sse[ref] = sse;
+ }
+ } else {
+ // If cpi->sf.mv_sf.disable_second_mv = 1, use var to decide the
+ // best MV.
+ if (this_var < best_mv_var) {
+ best_mv->as_mv = this_best_mv;
+ x->pred_sse[ref] = sse;
+ }
+ }
+ }
+ }
+ } else {
+ mv_search_params->find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats,
+ &best_mv->as_mv, &dis, &x->pred_sse[ref], NULL);
+ }
+ break;
+ case OBMC_CAUSAL:
+ av1_find_best_obmc_sub_pixel_tree_up(
+ xd, cm, &ms_params, subpel_start_mv, NULL, &best_mv->as_mv, &dis,
+ &x->pred_sse[ref], NULL);
+ break;
+ default: assert(0 && "Invalid motion mode!\n");
+ }
+
+ // Terminate search with the current ref_idx based on subpel mv and rate
+ // cost.
+ if (cpi->sf.inter_sf.skip_newmv_in_drl >= 1 && args != NULL &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION &&
+ best_mv->as_int != INVALID_MV) {
+ const int ref_mv_idx = mbmi->ref_mv_idx;
+ best_mv_rate =
+ av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ mv_rate_calculated = 1;
+
+ for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) {
+ if (!args->single_newmv_valid[prev_ref_idx][ref]) continue;
+ // Check if the motion vectors are the same.
+ if (best_mv->as_int == args->single_newmv[prev_ref_idx][ref].as_int) {
+ // Skip this evaluation if the previous one is skipped.
+ if (mode_info[prev_ref_idx].skip) {
+ mode_info[ref_mv_idx].skip = 1;
+ break;
+ }
+ // Compare the rate cost that we currently know.
+ const int prev_rate_cost =
+ args->single_newmv_rate[prev_ref_idx][ref] +
+ mode_info[prev_ref_idx].drl_cost;
+ const int this_rate_cost =
+ best_mv_rate + mode_info[ref_mv_idx].drl_cost;
+
+ if (prev_rate_cost <= this_rate_cost) {
+ // If the current rate_cost is worse than the previous rate_cost,
+ // then we terminate the search for this ref_mv_idx.
+ mode_info[ref_mv_idx].skip = 1;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ if (mv_rate_calculated) {
+ *rate_mv = best_mv_rate;
+ } else {
+ *rate_mv =
+ av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ }
+}
+
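+// Jointly refines the mv pair of a compound prediction: each iteration fixes
+// one reference's mv, builds its predictor, and re-searches the other; the
+// loop stops once a search fails to reduce the error.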
+int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int_mv *cur_mv,
+ const uint8_t *mask, int mask_stride, int *rate_mv,
+ int allow_second_mv, int joint_me_num_refine_iter) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+ const int plane = 0;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ // This function should only ever be called for compound modes
+ assert(has_second_ref(mbmi));
+ const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] };
+ const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
+ const MvCosts *mv_costs = x->mv_costs;
+ int_mv ref_mv[2];
+ int ite, ref;
+
+ // Get the prediction block from the 'other' reference frame.
+ const int_interpfilters interp_filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ InterPredParams inter_pred_params;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // Do joint motion search in compound mode to get more accurate mv.
+ struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+ int last_besterr[2] = { INT_MAX, INT_MAX };
+ const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+ av1_get_scaled_ref_frame(cpi, refs[0]),
+ av1_get_scaled_ref_frame(cpi, refs[1])
+ };
+
+ // Prediction buffer from the second frame, sized in bytes so it can hold
+ // either 8-bit or 16-bit samples.
+ DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]);
+ uint8_t *second_pred = get_buf_by_bd(xd, second_pred16);
+
+ int_mv best_mv, second_best_mv;
+
+ // Allow joint search multiple times iteratively for each reference frame,
+ // and break out of the search loop if no better mv can be found.
+ for (ite = 0; ite < (2 * joint_me_num_refine_iter); ite++) {
+ struct buf_2d ref_yv12[2];
+ int bestsme = INT_MAX;
+ int id = ite % 2; // Even iterations search in the first reference frame,
+ // odd iterations search in the second. The predictor
+ // found for the 'other' reference frame is factored in.
+ if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) {
+ if (cur_mv[id].as_int == init_mv[id].as_int) {
+ break;
+ } else {
+ int_mv cur_int_mv, init_int_mv;
+ cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3;
+ cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3;
+ init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3;
+ init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3;
+ if (cur_int_mv.as_int == init_int_mv.as_int) {
+ break;
+ }
+ }
+ }
+ for (ref = 0; ref < 2; ++ref) {
+ ref_mv[ref] = av1_get_ref_mv(x, ref);
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ if (scaled_ref_frame[ref]) {
+ int i;
+ for (i = 0; i < num_planes; i++)
+ backup_yv12[ref][i] = xd->plane[i].pre[ref];
+ av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+ NULL, num_planes);
+ }
+ }
+
+ assert(IMPLIES(scaled_ref_frame[0] != NULL,
+ cm->width == scaled_ref_frame[0]->y_crop_width &&
+ cm->height == scaled_ref_frame[0]->y_crop_height));
+ assert(IMPLIES(scaled_ref_frame[1] != NULL,
+ cm->width == scaled_ref_frame[1]->y_crop_width &&
+ cm->height == scaled_ref_frame[1]->y_crop_height));
+
+ // Initialize based on (possibly scaled) prediction buffers.
+ ref_yv12[0] = xd->plane[plane].pre[0];
+ ref_yv12[1] = xd->plane[plane].pre[1];
+
+ av1_init_inter_params(&inter_pred_params, pw, ph, mi_row * MI_SIZE,
+ mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
+ &cm->sf_identity, &ref_yv12[!id], interp_filters);
+ inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
+
+ // Since we have scaled the reference frames to match the size of the
+ // current frame we must use a unit scaling factor during mode selection.
+ av1_enc_build_one_inter_predictor(second_pred, pw, &cur_mv[!id].as_mv,
+ &inter_pred_params);
+
+ // Do full-pixel compound motion search on the current reference frame.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[id];
+
+ // Make motion search params
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ FULLPEL_MV_STATS best_mv_stats;
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, mv_sf, bsize);
+ const search_site_config *src_search_sites =
+ av1_get_search_site_config(cpi, x, search_method);
+ // Use the mv result from the single mode as mv predictor.
+ const FULLPEL_MV start_fullmv = get_fullmv_from_mv(&cur_mv[id].as_mv);
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
+ &ref_mv[id].as_mv, start_fullmv,
+ src_search_sites, search_method,
+ /*fine_search_interval=*/0);
+
+ av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
+ mask_stride, id);
+
+ // Small-range full-pixel motion search.
+ if (!mv_sf->disable_extensive_joint_motion_search &&
+ mbmi->interinter_comp.type != COMPOUND_WEDGE) {
+ bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
+ &best_mv.as_fullmv, &best_mv_stats,
+ &second_best_mv.as_fullmv);
+ } else {
+ bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv,
+ &best_mv.as_fullmv);
+ second_best_mv = best_mv;
+ }
+
+ const int try_second = second_best_mv.as_int != INVALID_MV &&
+ second_best_mv.as_int != best_mv.as_int &&
+ allow_second_mv;
+
+ // Restore the pointer to the first (possibly scaled) prediction buffer.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[0];
+
+ for (ref = 0; ref < 2; ++ref) {
+ if (scaled_ref_frame[ref]) {
+ // Swap back the original buffers for subpel motion search.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[ref] = backup_yv12[ref][i];
+ }
+ // Re-initialize based on unscaled prediction buffers.
+ ref_yv12[ref] = xd->plane[plane].pre[ref];
+ }
+ }
+
+ // Do sub-pixel compound motion search on the current reference frame.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[id];
+
+ if (cpi->common.features.cur_frame_force_integer_mv) {
+ convert_fullmv_to_mv(&best_mv);
+ }
+ if (bestsme < INT_MAX &&
+ cpi->common.features.cur_frame_force_integer_mv == 0) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize,
+ &ref_mv[id].as_mv, NULL);
+ av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred,
+ mask, mask_stride, id);
+ ms_params.forced_stop = EIGHTH_PEL;
+ MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv));
+ bestsme = cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, start_mv, NULL, &best_mv.as_mv, &dis, &sse, NULL);
+
+ if (try_second) {
+ MV this_best_mv;
+ MV subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv);
+ if (av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv)) {
+ const int thissme = cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, NULL, &this_best_mv, &dis,
+ &sse, NULL);
+ if (thissme < bestsme) {
+ best_mv.as_mv = this_best_mv;
+ bestsme = thissme;
+ }
+ }
+ }
+ }
+
+ // Restore the pointer to the first prediction buffer.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[0];
+ if (bestsme < last_besterr[id]) {
+ cur_mv[id] = best_mv;
+ last_besterr[id] = bestsme;
+ } else {
+ break;
+ }
+ }
+
+ *rate_mv = 0;
+
+ for (ref = 0; ref < 2; ++ref) {
+ const int_mv curr_ref_mv = av1_get_ref_mv(x, ref);
+ *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv,
+ mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ }
+
+ return AOMMIN(last_besterr[0], last_besterr[1]);
+}
+
+// Search for the best mv for one component of a compound,
+// given that the other component is fixed.
+int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, MV *this_mv,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ int *rate_mv, int ref_idx) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int ref = mbmi->ref_frame[ref_idx];
+ const int_mv ref_mv = av1_get_ref_mv(x, ref_idx);
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const MvCosts *mv_costs = x->mv_costs;
+
+ struct buf_2d backup_yv12[MAX_MB_PLANE];
+ const YV12_BUFFER_CONFIG *const scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref);
+
+ // Check that this is either an interinter or an interintra block
+ assert(has_second_ref(mbmi) || (ref_idx == 0 && is_interintra_mode(mbmi)));
+
+ // Store the first prediction buffer.
+ struct buf_2d orig_yv12;
+ if (ref_idx) {
+ orig_yv12 = pd->pre[0];
+ pd->pre[0] = pd->pre[ref_idx];
+ }
+
+ if (scaled_ref_frame) {
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // full-pixel motion search code to be used without additional
+ // modifications.
+ for (int i = 0; i < num_planes; i++) {
+ backup_yv12[i] = xd->plane[i].pre[ref_idx];
+ }
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ // The index below needs to be 0 instead of ref_idx since the 0th slot is
+ // used for subsequent searches. Note that the ref_idx reference buffer has
+ // been copied to the 0th slot in the code above. Now we need to swap in the
+ // scaled reference frame for the 0th slot.
+ av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
+ num_planes);
+ }
+
+ int bestsme = INT_MAX;
+ int_mv best_mv;
+
+ // Make motion search params
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ FULLPEL_MV_STATS best_mv_stats;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize);
+ const search_site_config *src_search_sites =
+ av1_get_search_site_config(cpi, x, search_method);
+ // Use the mv result from the single mode as mv predictor.
+ const FULLPEL_MV start_fullmv = get_fullmv_from_mv(this_mv);
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
+ &ref_mv.as_mv, start_fullmv,
+ src_search_sites, search_method,
+ /*fine_search_interval=*/0);
+
+ av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask,
+ mask_stride, ref_idx);
+
+ // Small-range full-pixel motion search.
+ bestsme = av1_full_pixel_search(start_fullmv, &full_ms_params, 5, NULL,
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
+
+ if (scaled_ref_frame) {
+ // Swap back the original buffers for subpel motion search for the 0th slot.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = backup_yv12[i];
+ }
+ }
+
+ if (cpi->common.features.cur_frame_force_integer_mv) {
+ convert_fullmv_to_mv(&best_mv);
+ }
+ const int use_fractional_mv =
+ bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0;
+ if (use_fractional_mv) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv.as_mv,
+ NULL);
+ av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred,
+ mask, mask_stride, ref_idx);
+ ms_params.forced_stop = EIGHTH_PEL;
+ MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv));
+ bestsme = cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, start_mv, &best_mv_stats, &best_mv.as_mv, &dis,
+ &sse, NULL);
+ }
+
+ // Restore the pointer to the first unscaled prediction buffer.
+ if (ref_idx) pd->pre[0] = orig_yv12;
+
+ if (bestsme < INT_MAX) *this_mv = best_mv.as_mv;
+
+ *rate_mv = 0;
+
+ *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, mv_costs->nmv_joint_cost,
+ mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ return bestsme;
+}
+
+static AOM_INLINE void build_second_inter_pred(const AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize,
+ const MV *other_mv, int ref_idx,
+ uint8_t *second_pred) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x);
+ const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y);
+
+ // This function should only ever be called for compound modes
+ assert(has_second_ref(mbmi));
+
+ const int plane = 0;
+ struct buf_2d ref_yv12 = xd->plane[plane].pre[!ref_idx];
+
+ struct scale_factors sf;
+ av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height,
+ cm->width, cm->height);
+
+ InterPredParams inter_pred_params;
+
+ av1_init_inter_params(&inter_pred_params, pw, ph, p_row, p_col,
+ pd->subsampling_x, pd->subsampling_y, xd->bd,
+ is_cur_buf_hbd(xd), 0, &sf, &ref_yv12,
+ mbmi->interp_filters);
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+
+ // Get the prediction block from the 'other' reference frame.
+ av1_enc_build_one_inter_predictor(second_pred, pw, other_mv,
+ &inter_pred_params);
+}
+
+// Wrapper for av1_compound_single_motion_search, for the common case
+// where the second prediction is also an inter mode.
+int av1_compound_single_motion_search_interinter(
+ const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
+ const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ // This function should only ever be called for compound modes
+ assert(has_second_ref(xd->mi[0]));
+
+ // Prediction buffer from the second frame.
+ DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
+ uint8_t *second_pred;
+ if (is_cur_buf_hbd(xd))
+ second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
+ else
+ second_pred = (uint8_t *)second_pred_alloc_16;
+
+ MV *this_mv = &cur_mv[ref_idx].as_mv;
+ const MV *other_mv = &cur_mv[!ref_idx].as_mv;
+ build_second_inter_pred(cpi, x, bsize, other_mv, ref_idx, second_pred);
+ return av1_compound_single_motion_search(cpi, x, bsize, this_mv, second_pred,
+ mask, mask_stride, rate_mv, ref_idx);
+}
+
+static AOM_INLINE void do_masked_motion_search_indexed(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize,
+ int_mv *tmp_mv, int *rate_mv, int which) {
+ // NOTE: 'which' selects the search: 0 - refine mv 0 only, 1 - refine mv 1
+ // only, 2 - joint search of both.
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ BLOCK_SIZE sb_type = mbmi->bsize;
+ const uint8_t *mask;
+ const int mask_stride = block_size_wide[bsize];
+
+ mask = av1_get_compound_type_mask(comp_data, sb_type);
+
+ tmp_mv[0].as_int = cur_mv[0].as_int;
+ tmp_mv[1].as_int = cur_mv[1].as_int;
+ if (which == 0 || which == 1) {
+ av1_compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mask,
+ mask_stride, rate_mv, which);
+ } else if (which == 2) {
+ const int joint_me_num_refine_iter =
+ cpi->sf.inter_sf.enable_fast_compound_mode_search == 2
+ ? REDUCED_JOINT_ME_REFINE_ITER
+ : NUM_JOINT_ME_REFINE_ITER;
+ av1_joint_motion_search(cpi, x, bsize, tmp_mv, mask, mask_stride, rate_mv,
+ !cpi->sf.mv_sf.disable_second_mv,
+ joint_me_num_refine_iter);
+ }
+}
+
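+// Refines the mv(s) of a compound mode: both mvs jointly for NEW_NEWMV, or
+// only the NEWMV component for the mixed NEAREST/NEAR + NEW modes.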
+int av1_interinter_compound_motion_search(const AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ const int_mv *const cur_mv,
+ const BLOCK_SIZE bsize,
+ const PREDICTION_MODE this_mode) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int_mv tmp_mv[2];
+ int tmp_rate_mv = 0;
+ // TODO(jingning): The average compound mode has proper SAD and variance
+ // functions implemented, and is triggered by setting the mask pointer to
+ // NULL. Those still need to be implemented for the frame distance weighted
+ // mode.
+ mbmi->interinter_comp.seg_mask =
+ mbmi->interinter_comp.type == COMPOUND_AVERAGE ? NULL : xd->seg_mask;
+ const INTERINTER_COMPOUND_DATA *compound_data = &mbmi->interinter_comp;
+
+ if (this_mode == NEW_NEWMV) {
+ do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
+ tmp_mv, &tmp_rate_mv, 2);
+ mbmi->mv[0].as_int = tmp_mv[0].as_int;
+ mbmi->mv[1].as_int = tmp_mv[1].as_int;
+ } else if (this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV) {
+ // which = 1 if this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV
+ // which = 0 if this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV
+ int which = (NEWMV == compound_ref1_mode(this_mode));
+ do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
+ tmp_mv, &tmp_rate_mv, which);
+ mbmi->mv[which].as_int = tmp_mv[which].as_int;
+ }
+ return tmp_rate_mv;
+}
+
+int_mv av1_simple_motion_search_sse_var(AV1_COMP *const cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int ref,
+ FULLPEL_MV start_mv, int num_planes,
+ int use_subpixel, unsigned int *sse,
+ unsigned int *var) {
+ assert(num_planes == 1 &&
+ "Currently simple_motion_search only supports luma plane");
+ assert(!frame_is_intra_only(&cpi->common) &&
+ "Simple motion search only enabled for non-key frames");
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ mbmi->bsize = bsize;
+ mbmi->ref_frame[0] = ref;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref);
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref);
+ struct buf_2d backup_yv12;
+ // ref_mv is used to calculate the cost of the motion vector
+ const MV ref_mv = kZeroMv;
+ const int step_param =
+ AOMMIN(cpi->mv_search_params.mv_step_param +
+ cpi->sf.part_sf.simple_motion_search_reduce_search_steps,
+ MAX_MVSEARCH_STEPS - 2);
+ int cost_list[5];
+ const int ref_idx = 0;
+ int bestsme;
+ int_mv best_mv;
+ FULLPEL_MV_STATS best_mv_stats;
+
+ av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col,
+ get_ref_scale_factors(cm, ref), num_planes);
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ if (scaled_ref_frame) {
+ backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx];
+ av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+ num_planes);
+ }
+
+ // Allow more mesh searches for screen content type on the ARF.
+ const int fine_search_interval = use_fine_search_interval(cpi);
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ const MV_SPEED_FEATURES *mv_sf = &cpi->sf.mv_sf;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, mv_sf, bsize);
+ const search_site_config *src_search_sites =
+ av1_get_search_site_config(cpi, x, search_method);
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
+ start_mv, src_search_sites, search_method,
+ fine_search_interval);
+
+ bestsme = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list),
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
+
+ const int use_subpel_search =
+ bestsme < INT_MAX && !cpi->common.features.cur_frame_force_integer_mv &&
+ use_subpixel &&
+ (cpi->sf.mv_sf.simple_motion_subpel_force_stop != FULL_PEL);
+ if (scaled_ref_frame) {
+ xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
+ }
+ if (use_subpel_search) {
+ int not_used = 0;
+
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
+ cost_list);
+ // TODO(yunqing): integrate this into av1_make_default_subpel_ms_params().
+ ms_params.forced_stop = mv_sf->simple_motion_subpel_force_stop;
+
+ MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+
+ cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv.as_mv,
+ &not_used, &x->pred_sse[ref], NULL);
+
+ mbmi->mv[0] = best_mv;
+
+ // Get a copy of the prediction output
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ *var = cpi->ppi->fn_ptr[bsize].vf(
+ x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, sse);
+ } else {
+ // Manually convert from full-pel units to 1/8-pel units if we are not
+ // doing a subpel search.
+ convert_fullmv_to_mv(&best_mv);
+ *var = best_mv_stats.distortion;
+ *sse = best_mv_stats.sse;
+ }
+
+ return best_mv;
+}
diff --git a/third_party/aom/av1/encoder/motion_search_facade.h b/third_party/aom/av1/encoder/motion_search_facade.h
new file mode 100644
index 0000000000..d1fa915bca
--- /dev/null
+++ b/third_party/aom/av1/encoder/motion_search_facade.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MOTION_SEARCH_H_
+#define AOM_AV1_ENCODER_MOTION_SEARCH_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NUM_JOINT_ME_REFINE_ITER 2
+#define REDUCED_JOINT_ME_REFINE_ITER 1
+// TODO(any): rename this struct to something else. There is already another
+// struct called inter_modes_info, which makes this terribly confusing.
+typedef struct {
+ int drl_cost;
+ int_mv full_search_mv;
+ int full_mv_rate;
+ int full_mv_bestsme;
+ int skip;
+} inter_mode_info;
+
+struct HandleInterModeArgs;
+void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int ref_idx, int *rate_mv,
+ int search_range, inter_mode_info *mode_info,
+ int_mv *best_mv,
+ struct HandleInterModeArgs *const args);
+
+int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int_mv *cur_mv,
+ const uint8_t *mask, int mask_stride, int *rate_mv,
+ int allow_second_mv, int joint_me_num_refine_iter);
+
+int av1_interinter_compound_motion_search(const AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ const int_mv *const cur_mv,
+ const BLOCK_SIZE bsize,
+ const PREDICTION_MODE this_mode);
+
+int av1_compound_single_motion_search_interinter(
+ const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
+ const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx);
+
+int av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, MV *this_mv,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ int *rate_mv, int ref_idx);
+
+// Performs a motion search in SIMPLE_TRANSLATION mode using reference frame
+// ref and calculates the sse and var of the residue. Note that this sets the
+// offset of mbmi, so we will need to reset it after calling this function.
+int_mv av1_simple_motion_search_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int ref,
+ const FULLPEL_MV start_mv,
+ int num_planes, int use_subpixel,
+ unsigned int *sse, unsigned int *var);
+
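+// Returns a search site config whose stride matches that of the (possibly
+// scaled) reference buffer: the compressor-level caches are tried first,
+// with a fallback to (and, if needed, a refresh of) the per-thread config.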
+static AOM_INLINE const search_site_config *av1_get_search_site_config(
+ const AV1_COMP *cpi, MACROBLOCK *x, SEARCH_METHODS search_method) {
+ const int ref_stride = x->e_mbd.plane[0].pre[0].stride;
+
+ // AV1_COMP::mv_search_params.search_site_config is a compressor level cache
+ // that's shared by multiple threads. In most cases where all frames have the
+ // same resolution, the cache contains the search site config that we need.
+ const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params;
+ if (ref_stride == mv_search_params->search_site_cfg[SS_CFG_SRC]->stride) {
+ return mv_search_params->search_site_cfg[SS_CFG_SRC];
+ } else if (ref_stride ==
+ mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD]->stride) {
+ return mv_search_params->search_site_cfg[SS_CFG_LOOKAHEAD];
+ }
+
+ // If the cache does not contain the correct stride, then we will need to rely
+ // on the thread level config MACROBLOCK::search_site_cfg_buf. If even the
+ // thread level config doesn't match, then we need to update it.
+ search_method = search_method_lookup[search_method];
+ assert(search_method_lookup[search_method] == search_method &&
+ "The search_method_lookup table should be idempotent.");
+ if (ref_stride != x->search_site_cfg_buf[search_method].stride) {
+ av1_refresh_search_site_config(x->search_site_cfg_buf, search_method,
+ ref_stride);
+ }
+
+ return x->search_site_cfg_buf;
+}
+
+static AOM_INLINE SEARCH_METHODS
+av1_get_faster_search_method(SEARCH_METHODS search_method) {
+ // Note on search methods' accuracy (most to least accurate):
+ // 1. NSTEP
+ // 2. DIAMOND
+ // 3. BIGDIA \approx SQUARE
+ // 4. HEX.
+ // 5. FAST_HEX \approx FAST_DIAMOND
+ switch (search_method) {
+ case NSTEP: return DIAMOND;
+ case NSTEP_8PT: return DIAMOND;
+ case DIAMOND: return BIGDIA;
+ case CLAMPED_DIAMOND: return BIGDIA;
+ case BIGDIA: return HEX;
+ case SQUARE: return HEX;
+ case HEX: return FAST_HEX;
+ case FAST_HEX: return FAST_HEX;
+ case FAST_DIAMOND: return VFAST_DIAMOND;
+ case FAST_BIGDIA: return FAST_BIGDIA;
+ case VFAST_DIAMOND: return VFAST_DIAMOND;
+ default: assert(0 && "Invalid search method!"); return DIAMOND;
+ }
+}
+
+static AOM_INLINE SEARCH_METHODS av1_get_default_mv_search_method(
+ const MACROBLOCK *x, const MV_SPEED_FEATURES *mv_sf, BLOCK_SIZE bsize) {
+ SEARCH_METHODS search_method = mv_sf->search_method;
+ const int sf_blk_search_method = mv_sf->use_bsize_dependent_search_method;
+ const int min_dim = AOMMIN(block_size_wide[bsize], block_size_high[bsize]);
+ const int qband = x->qindex >> (QINDEX_BITS - 2);
+ const bool use_faster_search_method =
+ (sf_blk_search_method == 1 && min_dim >= 32) ||
+ (sf_blk_search_method >= 2 && min_dim >= 16 &&
+ x->content_state_sb.source_sad_nonrd <= kMedSad && qband < 3);
+
+ if (use_faster_search_method) {
+ search_method = av1_get_faster_search_method(search_method);
+ }
+ return search_method;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_MOTION_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/mv_prec.c b/third_party/aom/av1/encoder/mv_prec.c
new file mode 100644
index 0000000000..b64f4dcd0e
--- /dev/null
+++ b/third_party/aom/av1/encoder/mv_prec.c
@@ -0,0 +1,429 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "av1/encoder/encodemv.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/misc_model_weights.h"
+#endif // !CONFIG_REALTIME_ONLY
+#include "av1/encoder/mv_prec.h"
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE int_mv get_ref_mv_for_mv_stats(
+ const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame,
+ int ref_idx) {
+ int ref_mv_idx = mbmi->ref_mv_idx;
+ if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) {
+ assert(has_second_ref(mbmi));
+ ref_mv_idx += 1;
+ }
+
+ const MV_REFERENCE_FRAME *ref_frames = mbmi->ref_frame;
+ const int8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+ const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext_frame->ref_mv_stack;
+
+ if (ref_frames[1] > INTRA_FRAME) {
+ assert(ref_idx == 0 || ref_idx == 1);
+ return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv
+ : curr_ref_mv_stack[ref_mv_idx].this_mv;
+ }
+
+ assert(ref_idx == 0);
+ return ref_mv_idx < mbmi_ext_frame->ref_mv_count
+ ? curr_ref_mv_stack[ref_mv_idx].this_mv
+ : mbmi_ext_frame->global_mvs[ref_frame_type];
+}
+
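+// Returns the cost of coding 'symbol' under 'cdf'. p15 is the symbol's
+// 15-bit probability, floored at EC_MIN_PROB, which av1_cost_symbol()
+// converts into a rate in units of 1/2^AV1_PROB_COST_SHIFT bits.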
+static AOM_INLINE int get_symbol_cost(const aom_cdf_prob *cdf, int symbol) {
+ const aom_cdf_prob cur_cdf = AOM_ICDF(cdf[symbol]);
+ const aom_cdf_prob prev_cdf = symbol ? AOM_ICDF(cdf[symbol - 1]) : 0;
+ const aom_cdf_prob p15 = AOMMAX(cur_cdf - prev_cdf, EC_MIN_PROB);
+
+ return av1_cost_symbol(p15);
+}
+
+static AOM_INLINE int keep_one_comp_stat(MV_STATS *mv_stats, int comp_val,
+ int comp_idx, const AV1_COMP *cpi,
+ int *rates) {
+ assert(comp_val != 0 && "mv component should not have zero value!");
+ const int sign = comp_val < 0;
+ const int mag = sign ? -comp_val : comp_val;
+ const int mag_minus_1 = mag - 1;
+ int offset;
+ const int mv_class = av1_get_mv_class(mag_minus_1, &offset);
+ const int int_part = offset >> 3; // int mv data
+ const int frac_part = (offset >> 1) & 3; // fractional mv data
+ const int high_part = offset & 1; // high precision mv data
+ const int use_hp = cpi->common.features.allow_high_precision_mv;
+ int r_idx = 0;
+
+ const MACROBLOCK *const x = &cpi->td.mb;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ nmv_component *mvcomp_ctx = nmvc->comps;
+ nmv_component *cur_mvcomp_ctx = &mvcomp_ctx[comp_idx];
+ aom_cdf_prob *sign_cdf = cur_mvcomp_ctx->sign_cdf;
+ aom_cdf_prob *class_cdf = cur_mvcomp_ctx->classes_cdf;
+ aom_cdf_prob *class0_cdf = cur_mvcomp_ctx->class0_cdf;
+ aom_cdf_prob(*bits_cdf)[3] = cur_mvcomp_ctx->bits_cdf;
+ aom_cdf_prob *frac_part_cdf = mv_class
+ ? (cur_mvcomp_ctx->fp_cdf)
+ : (cur_mvcomp_ctx->class0_fp_cdf[int_part]);
+ aom_cdf_prob *high_part_cdf =
+ mv_class ? (cur_mvcomp_ctx->hp_cdf) : (cur_mvcomp_ctx->class0_hp_cdf);
+
+ const int sign_rate = get_symbol_cost(sign_cdf, sign);
+ rates[r_idx++] = sign_rate;
+ update_cdf(sign_cdf, sign, 2);
+
+ const int class_rate = get_symbol_cost(class_cdf, mv_class);
+ rates[r_idx++] = class_rate;
+ update_cdf(class_cdf, mv_class, MV_CLASSES);
+
+ int int_bit_rate = 0;
+ if (mv_class == MV_CLASS_0) {
+ int_bit_rate = get_symbol_cost(class0_cdf, int_part);
+ update_cdf(class0_cdf, int_part, CLASS0_SIZE);
+ } else {
+ const int n = mv_class + CLASS0_BITS - 1; // number of bits
+ for (int i = 0; i < n; ++i) {
+ int_bit_rate += get_symbol_cost(bits_cdf[i], (int_part >> i) & 1);
+ update_cdf(bits_cdf[i], (int_part >> i) & 1, 2);
+ }
+ }
+ rates[r_idx++] = int_bit_rate;
+ const int frac_part_rate = get_symbol_cost(frac_part_cdf, frac_part);
+ rates[r_idx++] = frac_part_rate;
+ update_cdf(frac_part_cdf, frac_part, MV_FP_SIZE);
+ const int high_part_rate =
+ use_hp ? get_symbol_cost(high_part_cdf, high_part) : 0;
+ if (use_hp) {
+ update_cdf(high_part_cdf, high_part, 2);
+ }
+ rates[r_idx++] = high_part_rate;
+
+ mv_stats->last_bit_zero += !high_part;
+ mv_stats->last_bit_nonzero += high_part;
+ const int total_rate =
+ (sign_rate + class_rate + int_bit_rate + frac_part_rate + high_part_rate);
+ return total_rate;
+}
+
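+// Accumulates rate statistics for one motion vector: the joint symbol plus
+// both components are costed as actually coded, as if high precision were
+// used, and as if only low precision were used.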
+static AOM_INLINE void keep_one_mv_stat(MV_STATS *mv_stats, const MV *ref_mv,
+ const MV *cur_mv, const AV1_COMP *cpi) {
+ const MACROBLOCK *const x = &cpi->td.mb;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ aom_cdf_prob *joint_cdf = nmvc->joints_cdf;
+ const int use_hp = cpi->common.features.allow_high_precision_mv;
+
+ const MV diff = { cur_mv->row - ref_mv->row, cur_mv->col - ref_mv->col };
+ const int mv_joint = av1_get_mv_joint(&diff);
+ // TODO(chiyotsai@google.com): Estimate hp_diff when we are using lp
+ const MV hp_diff = diff;
+ const int hp_mv_joint = av1_get_mv_joint(&hp_diff);
+ const MV truncated_diff = { (diff.row / 2) * 2, (diff.col / 2) * 2 };
+ const MV lp_diff = use_hp ? truncated_diff : diff;
+ const int lp_mv_joint = av1_get_mv_joint(&lp_diff);
+
+ const int mv_joint_rate = get_symbol_cost(joint_cdf, mv_joint);
+ const int hp_mv_joint_rate = get_symbol_cost(joint_cdf, hp_mv_joint);
+ const int lp_mv_joint_rate = get_symbol_cost(joint_cdf, lp_mv_joint);
+
+ update_cdf(joint_cdf, mv_joint, MV_JOINTS);
+
+ mv_stats->total_mv_rate += mv_joint_rate;
+ mv_stats->hp_total_mv_rate += hp_mv_joint_rate;
+ mv_stats->lp_total_mv_rate += lp_mv_joint_rate;
+ mv_stats->mv_joint_count[mv_joint]++;
+
+ for (int comp_idx = 0; comp_idx < 2; comp_idx++) {
+ const int comp_val = comp_idx ? diff.col : diff.row;
+ const int hp_comp_val = comp_idx ? hp_diff.col : hp_diff.row;
+ const int lp_comp_val = comp_idx ? lp_diff.col : lp_diff.row;
+ int rates[5];
+ av1_zero_array(rates, 5);
+
+ const int comp_rate =
+ comp_val ? keep_one_comp_stat(mv_stats, comp_val, comp_idx, cpi, rates)
+ : 0;
+ // TODO(chiyotsai@google.com): Properly get hp rate when use_hp is false
+ const int hp_rate =
+ hp_comp_val ? rates[0] + rates[1] + rates[2] + rates[3] + rates[4] : 0;
+ const int lp_rate =
+ lp_comp_val ? rates[0] + rates[1] + rates[2] + rates[3] : 0;
+
+ mv_stats->total_mv_rate += comp_rate;
+ mv_stats->hp_total_mv_rate += hp_rate;
+ mv_stats->lp_total_mv_rate += lp_rate;
+ }
+}
+
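+// Collects MV statistics (rates, joint distribution, texture measures) for a
+// single coding block.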
+static AOM_INLINE void collect_mv_stats_b(MV_STATS *mv_stats,
+ const AV1_COMP *cpi, int mi_row,
+ int mi_col) {
+ const AV1_COMMON *cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) {
+ return;
+ }
+
+ const MB_MODE_INFO *mbmi =
+ mi_params->mi_grid_base[mi_row * mi_params->mi_stride + mi_col];
+ const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame =
+ cpi->mbmi_ext_info.frame_base +
+ get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize,
+ cpi->mbmi_ext_info.stride);
+
+ if (!is_inter_block(mbmi)) {
+ mv_stats->intra_count++;
+ return;
+ }
+ mv_stats->inter_count++;
+
+ const PREDICTION_MODE mode = mbmi->mode;
+ const int is_compound = has_second_ref(mbmi);
+
+ if (mode == NEWMV || mode == NEW_NEWMV) {
+ // All mvs are new
+ for (int ref_idx = 0; ref_idx < 1 + is_compound; ++ref_idx) {
+ const MV ref_mv =
+ get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx).as_mv;
+ const MV cur_mv = mbmi->mv[ref_idx].as_mv;
+ keep_one_mv_stat(mv_stats, &ref_mv, &cur_mv, cpi);
+ }
+ } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV ||
+ mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+ // has exactly one new_mv
+ mv_stats->default_mvs += 1;
+
+ const int ref_idx = (mode == NEAREST_NEWMV || mode == NEAR_NEWMV);
+ const MV ref_mv =
+ get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx).as_mv;
+ const MV cur_mv = mbmi->mv[ref_idx].as_mv;
+
+ keep_one_mv_stat(mv_stats, &ref_mv, &cur_mv, cpi);
+ } else {
+ // No new_mv
+ mv_stats->default_mvs += 1 + is_compound;
+ }
+
+ // Add texture information
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const int num_rows = block_size_high[bsize];
+ const int num_cols = block_size_wide[bsize];
+ const int y_stride = cpi->source->y_stride;
+ const int px_row = 4 * mi_row, px_col = 4 * mi_col;
+ const int buf_is_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int bd = cm->seq_params->bit_depth;
+ if (buf_is_hbd) {
+ uint16_t *source_buf =
+ CONVERT_TO_SHORTPTR(cpi->source->y_buffer) + px_row * y_stride + px_col;
+ for (int row = 0; row < num_rows - 1; row++) {
+ for (int col = 0; col < num_cols - 1; col++) {
+ const int offset = row * y_stride + col;
+ const int horz_diff =
+ abs(source_buf[offset + 1] - source_buf[offset]) >> (bd - 8);
+ const int vert_diff =
+ abs(source_buf[offset + y_stride] - source_buf[offset]) >> (bd - 8);
+ mv_stats->horz_text += horz_diff;
+ mv_stats->vert_text += vert_diff;
+ mv_stats->diag_text += horz_diff * vert_diff;
+ }
+ }
+ } else {
+ uint8_t *source_buf = cpi->source->y_buffer + px_row * y_stride + px_col;
+ for (int row = 0; row < num_rows - 1; row++) {
+ for (int col = 0; col < num_cols - 1; col++) {
+ const int offset = row * y_stride + col;
+ const int horz_diff = abs(source_buf[offset + 1] - source_buf[offset]);
+ const int vert_diff =
+ abs(source_buf[offset + y_stride] - source_buf[offset]);
+ mv_stats->horz_text += horz_diff;
+ mv_stats->vert_text += vert_diff;
+ mv_stats->diag_text += horz_diff * vert_diff;
+ }
+ }
+ }
+}
+
+// Recursively collects MV stats for every block in a superblock's partition
+// tree.
+static AOM_INLINE void collect_mv_stats_sb(MV_STATS *mv_stats,
+ const AV1_COMP *cpi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const AV1_COMMON *cm = &cpi->common;
+
+ if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
+ return;
+
+ const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+ const int hbs = mi_size_wide[bsize] / 2;
+ const int qbs = mi_size_wide[bsize] / 4;
+ switch (partition) {
+ case PARTITION_NONE:
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+ break;
+ case PARTITION_HORZ:
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+ collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col);
+ break;
+ case PARTITION_VERT:
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs);
+ break;
+ case PARTITION_SPLIT:
+ collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, subsize);
+ collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col + hbs, subsize);
+ collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col, subsize);
+ collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col + hbs, subsize);
+ break;
+ case PARTITION_HORZ_A:
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs);
+ collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col);
+ break;
+ case PARTITION_HORZ_B:
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+ collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col);
+ collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_VERT_A:
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+ collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col);
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs);
+ break;
+ case PARTITION_VERT_B:
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col);
+ collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs);
+ collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_HORZ_4:
+ for (int i = 0; i < 4; ++i) {
+ const int this_mi_row = mi_row + i * qbs;
+ collect_mv_stats_b(mv_stats, cpi, this_mi_row, mi_col);
+ }
+ break;
+ case PARTITION_VERT_4:
+ for (int i = 0; i < 4; ++i) {
+ const int this_mi_col = mi_col + i * qbs;
+ collect_mv_stats_b(mv_stats, cpi, mi_row, this_mi_col);
+ }
+ break;
+ default: assert(0);
+ }
+}
+
+static AOM_INLINE void collect_mv_stats_tile(MV_STATS *mv_stats,
+ const AV1_COMP *cpi,
+ const TileInfo *tile_info) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int mi_row_start = tile_info->mi_row_start;
+ const int mi_row_end = tile_info->mi_row_end;
+ const int mi_col_start = tile_info->mi_col_start;
+ const int mi_col_end = tile_info->mi_col_end;
+ const int sb_size_mi = cm->seq_params->mib_size;
+ BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ for (int mi_row = mi_row_start; mi_row < mi_row_end; mi_row += sb_size_mi) {
+ for (int mi_col = mi_col_start; mi_col < mi_col_end; mi_col += sb_size_mi) {
+ collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, sb_size);
+ }
+ }
+}
+
+void av1_collect_mv_stats(AV1_COMP *cpi, int current_q) {
+ MV_STATS *mv_stats = &cpi->mv_stats;
+ const AV1_COMMON *cm = &cpi->common;
+ const int tile_cols = cm->tiles.cols;
+ const int tile_rows = cm->tiles.rows;
+
+ for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
+ TileInfo tile_info;
+ av1_tile_set_row(&tile_info, cm, tile_row);
+ for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
+ const int tile_idx = tile_row * tile_cols + tile_col;
+ av1_tile_set_col(&tile_info, cm, tile_col);
+ cpi->tile_data[tile_idx].tctx = *cm->fc;
+ cpi->td.mb.e_mbd.tile_ctx = &cpi->tile_data[tile_idx].tctx;
+ collect_mv_stats_tile(mv_stats, cpi, &tile_info);
+ }
+ }
+
+ mv_stats->q = current_q;
+ mv_stats->order = cpi->common.current_frame.order_hint;
+ mv_stats->valid = 1;
+}
+
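+// Decides whether to use high-precision MVs for the current frame by feeding
+// normalized statistics collected from the last coded frame into a small
+// neural network; a non-negative score selects high precision.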
+static AOM_INLINE int get_smart_mv_prec(AV1_COMP *cpi, const MV_STATS *mv_stats,
+ int current_q) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int order_hint = cpi->common.current_frame.order_hint;
+ const int order_diff = order_hint - mv_stats->order;
+ const float area = (float)(cm->width * cm->height);
+ float features[MV_PREC_FEATURE_SIZE] = {
+ (float)current_q,
+ (float)mv_stats->q,
+ (float)order_diff,
+ mv_stats->inter_count / area,
+ mv_stats->intra_count / area,
+ mv_stats->default_mvs / area,
+ mv_stats->mv_joint_count[0] / area,
+ mv_stats->mv_joint_count[1] / area,
+ mv_stats->mv_joint_count[2] / area,
+ mv_stats->mv_joint_count[3] / area,
+ mv_stats->last_bit_zero / area,
+ mv_stats->last_bit_nonzero / area,
+ mv_stats->total_mv_rate / area,
+ mv_stats->hp_total_mv_rate / area,
+ mv_stats->lp_total_mv_rate / area,
+ mv_stats->horz_text / area,
+ mv_stats->vert_text / area,
+ mv_stats->diag_text / area,
+ };
+
+ for (int f_idx = 0; f_idx < MV_PREC_FEATURE_SIZE; f_idx++) {
+ features[f_idx] =
+ (features[f_idx] - av1_mv_prec_mean[f_idx]) / av1_mv_prec_std[f_idx];
+ }
+ float score = 0.0f;
+
+ av1_nn_predict(features, &av1_mv_prec_dnn_config, 1, &score);
+
+ const int use_high_hp = score >= 0.0f;
+ return use_high_hp;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex) {
+ int use_hp = qindex < HIGH_PRECISION_MV_QTHRESH;
+#if !CONFIG_REALTIME_ONLY
+ MV_STATS *mv_stats = &cpi->mv_stats;
+#endif // !CONFIG_REALTIME_ONLY
+
+ if (cpi->sf.hl_sf.high_precision_mv_usage == QTR_ONLY) {
+ use_hp = 0;
+ }
+#if !CONFIG_REALTIME_ONLY
+ else if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA &&
+ av1_frame_allows_smart_mv(cpi) && mv_stats->valid) {
+ use_hp = get_smart_mv_prec(cpi, mv_stats, qindex);
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ av1_set_high_precision_mv(cpi, use_hp,
+ cpi->common.features.cur_frame_force_integer_mv);
+}
diff --git a/third_party/aom/av1/encoder/mv_prec.h b/third_party/aom/av1/encoder/mv_prec.h
new file mode 100644
index 0000000000..55108b6cdb
--- /dev/null
+++ b/third_party/aom/av1/encoder/mv_prec.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MV_PREC_H_
+#define AOM_AV1_ENCODER_MV_PREC_H_
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/speed_features.h"
+
+// Q threshold for high precision mv.
+#define HIGH_PRECISION_MV_QTHRESH 128
+#if !CONFIG_REALTIME_ONLY
+void av1_collect_mv_stats(AV1_COMP *cpi, int current_q);
+
+static AOM_INLINE int av1_frame_allows_smart_mv(const AV1_COMP *cpi) {
+ const int gf_group_index = cpi->gf_frame_index;
+ const int gf_update_type = cpi->ppi->gf_group.update_type[gf_group_index];
+ return !frame_is_intra_only(&cpi->common) &&
+ !(gf_update_type == INTNL_OVERLAY_UPDATE ||
+ gf_update_type == OVERLAY_UPDATE);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static AOM_INLINE void av1_set_high_precision_mv(
+ AV1_COMP *cpi, int allow_high_precision_mv,
+ int cur_frame_force_integer_mv) {
+ MvCosts *const mv_costs = cpi->td.mb.mv_costs;
+ // Avoid accessing 'mv_costs' when it is not allocated.
+ if (mv_costs == NULL) return;
+
+ const int copy_hp = cpi->common.features.allow_high_precision_mv =
+ allow_high_precision_mv && !cur_frame_force_integer_mv;
+
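+  // Point the cost tables at the center of their allocations so they can be
+  // indexed by signed mv components in [-MV_MAX, MV_MAX].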
+ mv_costs->nmv_cost[0] = &mv_costs->nmv_cost_alloc[0][MV_MAX];
+ mv_costs->nmv_cost[1] = &mv_costs->nmv_cost_alloc[1][MV_MAX];
+ mv_costs->nmv_cost_hp[0] = &mv_costs->nmv_cost_hp_alloc[0][MV_MAX];
+ mv_costs->nmv_cost_hp[1] = &mv_costs->nmv_cost_hp_alloc[1][MV_MAX];
+ mv_costs->mv_cost_stack =
+ copy_hp ? mv_costs->nmv_cost_hp : mv_costs->nmv_cost;
+}
+
+void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex);
+
+#endif // AOM_AV1_ENCODER_MV_PREC_H_
diff --git a/third_party/aom/av1/encoder/nonrd_opt.c b/third_party/aom/av1/encoder/nonrd_opt.c
new file mode 100644
index 0000000000..651ca43a2e
--- /dev/null
+++ b/third_party/aom/av1/encoder/nonrd_opt.c
@@ -0,0 +1,933 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/nonrd_opt.h"
+#include "av1/encoder/rdopt.h"
+
+static const SCAN_ORDER av1_fast_idtx_scan_order_16x16 = {
+ av1_fast_idtx_scan_16x16, av1_fast_idtx_iscan_16x16
+};
+
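+// Scratch buffers sized for the largest transform block handled here (16x16).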
+#define DECLARE_BLOCK_YRD_BUFFERS() \
+ DECLARE_ALIGNED(64, tran_low_t, dqcoeff_buf[16 * 16]); \
+ DECLARE_ALIGNED(64, tran_low_t, qcoeff_buf[16 * 16]); \
+ DECLARE_ALIGNED(64, tran_low_t, coeff_buf[16 * 16]); \
+ uint16_t eob[1];
+
+#define DECLARE_BLOCK_YRD_VARS() \
+ /* When is_tx_8x8_dual_applicable is true, we compute the txfm for the \
+ * entire bsize and write macroblock_plane::coeff. So low_coeff is kept \
+ * as a non-const so we can reassign it to macroblock_plane::coeff. */ \
+ int16_t *low_coeff = (int16_t *)coeff_buf; \
+ int16_t *const low_qcoeff = (int16_t *)qcoeff_buf; \
+ int16_t *const low_dqcoeff = (int16_t *)dqcoeff_buf; \
+ const int diff_stride = bw;
+
+#define DECLARE_LOOP_VARS_BLOCK_YRD() \
+ const int16_t *src_diff = &p->src_diff[(r * diff_stride + c) << 2];
+
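+// Updates the per-transform-block skip flag and accumulates an estimated rate
+// (SATD of the quantized coefficients) and distortion (scaled block error)
+// for one transform block.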
+static AOM_FORCE_INLINE void update_yrd_loop_vars(
+ MACROBLOCK *x, int *skippable, int step, int ncoeffs,
+ int16_t *const low_coeff, int16_t *const low_qcoeff,
+ int16_t *const low_dqcoeff, RD_STATS *this_rdc, int *eob_cost,
+ int tx_blk_id) {
+ const int is_txfm_skip = (ncoeffs == 0);
+ *skippable &= is_txfm_skip;
+ x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip;
+ *eob_cost += get_msb(ncoeffs + 1);
+ if (ncoeffs == 1)
+ this_rdc->rate += (int)abs(low_qcoeff[0]);
+ else if (ncoeffs > 1)
+ this_rdc->rate += aom_satd_lp(low_qcoeff, step << 4);
+
+ this_rdc->dist += av1_block_error_lp(low_coeff, low_dqcoeff, step << 4) >> 2;
+}
+
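+// Computes the low-precision Hadamard transform for the whole block, two
+// horizontally adjacent 8x8 sub-blocks per call, storing the coefficients in
+// macroblock_plane::coeff for later reuse.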
+static INLINE void aom_process_hadamard_lp_8x16(MACROBLOCK *x,
+ int max_blocks_high,
+ int max_blocks_wide,
+ int num_4x4_w, int step,
+ int block_step) {
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ const int bw = 4 * num_4x4_w;
+ const int num_4x4 = AOMMIN(num_4x4_w, max_blocks_wide);
+ int block = 0;
+
+ for (int r = 0; r < max_blocks_high; r += block_step) {
+ for (int c = 0; c < num_4x4; c += 2 * block_step) {
+ const int16_t *src_diff = &p->src_diff[(r * bw + c) << 2];
+ int16_t *low_coeff = (int16_t *)p->coeff + BLOCK_OFFSET(block);
+ aom_hadamard_lp_8x8_dual(src_diff, (ptrdiff_t)bw, low_coeff);
+ block += 2 * step;
+ }
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#define DECLARE_BLOCK_YRD_HBD_VARS() \
+ tran_low_t *const coeff = coeff_buf; \
+ tran_low_t *const qcoeff = qcoeff_buf; \
+ tran_low_t *const dqcoeff = dqcoeff_buf;
+
+static AOM_FORCE_INLINE void update_yrd_loop_vars_hbd(
+ MACROBLOCK *x, int *skippable, int step, int ncoeffs,
+ tran_low_t *const coeff, tran_low_t *const qcoeff,
+ tran_low_t *const dqcoeff, RD_STATS *this_rdc, int *eob_cost,
+ int tx_blk_id) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int is_txfm_skip = (ncoeffs == 0);
+ *skippable &= is_txfm_skip;
+ x->txfm_search_info.blk_skip[tx_blk_id] = is_txfm_skip;
+ *eob_cost += get_msb(ncoeffs + 1);
+
+ int64_t dummy;
+ if (ncoeffs == 1)
+ this_rdc->rate += (int)abs(qcoeff[0]);
+ else if (ncoeffs > 1)
+ this_rdc->rate += aom_satd(qcoeff, step << 4);
+ this_rdc->dist +=
+ av1_highbd_block_error(coeff, dqcoeff, step << 4, &dummy, xd->bd) >> 2;
+}
+#endif
+
+/*!\brief Calculates RD Cost using Hadamard transform.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Calculates RD cost using the Hadamard transform. For low bit depth this
+ * function uses the low-precision (16-bit) set of functions, and 32-bit
+ * functions for high bit depth.
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] this_rdc Pointer to calculated RD Cost
+ * \param[in] skippable Pointer to a flag indicating possible tx skip
+ * \param[in] bsize Current block size
+ * \param[in] tx_size Transform size
+ *
+ * \remark Nothing is returned. Instead, the calculated RD cost is placed in
+ * \c this_rdc. The \c skippable flag is set if there are no non-zero
+ * quantized coefficients for the Hadamard transform.
+ */
+void av1_block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblockd_plane *pd = &xd->plane[AOM_PLANE_Y];
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int num_4x4_w = mi_size_wide[bsize];
+ const int num_4x4_h = mi_size_high[bsize];
+ const int step = 1 << (tx_size << 1);
+ const int block_step = (1 << tx_size);
+ const int row_step = step * num_4x4_w >> tx_size;
+ int block = 0;
+ const int max_blocks_wide =
+ num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
+ const int max_blocks_high =
+ num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5);
+ int eob_cost = 0;
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+ const int use_hbd = is_cur_buf_hbd(xd);
+ int num_blk_skip_w = num_4x4_w;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_hbd) {
+ aom_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
+ p->src.stride, pd->dst.buf, pd->dst.stride);
+ } else {
+ aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride);
+ }
+#else
+ aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride);
+#endif
+
+ // Keep the intermediate value on the stack here. Writing directly to
+ // skippable causes speed regression due to load-and-store issues in
+ // update_yrd_loop_vars.
+ int temp_skippable = 1;
+ this_rdc->dist = 0;
+ this_rdc->rate = 0;
+ // For block sizes 8x16 or above, Hadamard txfm of two adjacent 8x8 blocks
+ // can be done per function call. Hence the call of Hadamard txfm is
+ // abstracted here for the specified cases.
+ int is_tx_8x8_dual_applicable =
+ (tx_size == TX_8X8 && block_size_wide[bsize] >= 16 &&
+ block_size_high[bsize] >= 8);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  // As of now, the dual implementation of the Hadamard txfm is only available
+  // for low bitdepth.
+ if (use_hbd) is_tx_8x8_dual_applicable = 0;
+#endif
+
+ if (is_tx_8x8_dual_applicable) {
+ aom_process_hadamard_lp_8x16(x, max_blocks_high, max_blocks_wide, num_4x4_w,
+ step, block_step);
+ }
+
+ const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+ DECLARE_BLOCK_YRD_BUFFERS()
+ DECLARE_BLOCK_YRD_VARS()
+#if CONFIG_AV1_HIGHBITDEPTH
+ DECLARE_BLOCK_YRD_HBD_VARS()
+#else
+ (void)use_hbd;
+#endif
+
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (int r = 0; r < max_blocks_high; r += block_step) {
+ for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
+ DECLARE_LOOP_VARS_BLOCK_YRD()
+
+ switch (tx_size) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ case TX_16X16:
+ if (use_hbd) {
+ aom_hadamard_16x16(src_diff, diff_stride, coeff);
+ av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+ dqcoeff, p->dequant_QTX, eob,
+ // default_scan_fp_16x16_transpose and
+ // av1_default_iscan_fp_16x16_transpose have to be
+ // used together.
+ default_scan_fp_16x16_transpose,
+ av1_default_iscan_fp_16x16_transpose);
+ } else {
+ aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
+ av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX,
+ p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
+ p->dequant_QTX, eob,
+ // default_scan_lp_16x16_transpose and
+ // av1_default_iscan_lp_16x16_transpose have to be
+ // used together.
+ default_scan_lp_16x16_transpose,
+ av1_default_iscan_lp_16x16_transpose);
+ }
+ break;
+ case TX_8X8:
+ if (use_hbd) {
+ aom_hadamard_8x8(src_diff, diff_stride, coeff);
+ av1_quantize_fp(
+ coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
+ p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob,
+ default_scan_8x8_transpose, av1_default_iscan_8x8_transpose);
+ } else {
+ if (is_tx_8x8_dual_applicable) {
+ // The coeffs are pre-computed for the whole block, so re-assign
+ // low_coeff to the appropriate location.
+ const int block_offset = BLOCK_OFFSET(block + s);
+ low_coeff = (int16_t *)p->coeff + block_offset;
+ } else {
+ aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+ }
+ av1_quantize_lp(
+ low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff,
+ low_dqcoeff, p->dequant_QTX, eob,
+ // default_scan_8x8_transpose and
+ // av1_default_iscan_8x8_transpose have to be used together.
+ default_scan_8x8_transpose, av1_default_iscan_8x8_transpose);
+ }
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ // In tx_size=4x4 case, aom_fdct4x4 and aom_fdct4x4_lp generate
+ // normal coefficients order, so we don't need to change the scan
+ // order here.
+ if (use_hbd) {
+ aom_fdct4x4(src_diff, coeff, diff_stride);
+ av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+ dqcoeff, p->dequant_QTX, eob, scan_order->scan,
+ scan_order->iscan);
+ } else {
+ aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
+ av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
+ low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+ scan_order->scan, scan_order->iscan);
+ }
+ break;
+#else
+ case TX_16X16:
+ aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
+ av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX, p->quant_fp_QTX,
+ low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+ default_scan_lp_16x16_transpose,
+ av1_default_iscan_lp_16x16_transpose);
+ break;
+ case TX_8X8:
+ if (is_tx_8x8_dual_applicable) {
+ // The coeffs are pre-computed for the whole block, so re-assign
+ // low_coeff to the appropriate location.
+ const int block_offset = BLOCK_OFFSET(block + s);
+ low_coeff = (int16_t *)p->coeff + block_offset;
+ } else {
+ aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+ }
+ av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX,
+ low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+ default_scan_8x8_transpose,
+ av1_default_iscan_8x8_transpose);
+ break;
+ default:
+ aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
+ av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
+ low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+#endif
+ }
+ assert(*eob <= 1024);
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_hbd)
+ update_yrd_loop_vars_hbd(x, &temp_skippable, step, *eob, coeff, qcoeff,
+ dqcoeff, this_rdc, &eob_cost,
+ r * num_blk_skip_w + c);
+ else
+#endif
+ update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff,
+ low_qcoeff, low_dqcoeff, this_rdc, &eob_cost,
+ r * num_blk_skip_w + c);
+ }
+ block += row_step;
+ }
+
+ this_rdc->skip_txfm = *skippable = temp_skippable;
+ if (this_rdc->sse < INT64_MAX) {
+ this_rdc->sse = (this_rdc->sse << 6) >> 2;
+ if (temp_skippable) {
+      this_rdc->dist = this_rdc->sse;
+ return;
+ }
+ }
+
+ // If skippable is set, rate gets clobbered later.
+ this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT);
+ this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
+}
+
+// Explicitly enumerate the cases so the compiler can generate SIMD for the
+// function. According to the disassembler, gcc generates SSE codes for each of
+// the possible block sizes. The hottest case is tx_width 16, which takes up
+// about 8% of the self cycle of av1_nonrd_pick_inter_mode_sb. Since
+// av1_nonrd_pick_inter_mode_sb takes up about 3% of total encoding time, the
+// potential room of improvement for writing AVX2 optimization is only 3% * 8% =
+// 0.24% of total encoding time.
+static AOM_INLINE void scale_square_buf_vals(int16_t *dst, int tx_width,
+ const int16_t *src,
+ int src_stride) {
+#define DO_SCALING \
+ do { \
+ for (int idy = 0; idy < tx_width; ++idy) { \
+ for (int idx = 0; idx < tx_width; ++idx) { \
+ dst[idy * tx_width + idx] = src[idy * src_stride + idx] * 8; \
+ } \
+ } \
+ } while (0)
+
+ if (tx_width == 4) {
+ DO_SCALING;
+ } else if (tx_width == 8) {
+ DO_SCALING;
+ } else if (tx_width == 16) {
+ DO_SCALING;
+ } else {
+ assert(0);
+ }
+
+#undef DO_SCALING
+}
+
+/*!\brief Calculates RD Cost when the block uses Identity transform.
+ * Note that this function is only for low bit depth encoding, since it is
+ * currently only called in real-time mode, which is built with high bit depth
+ * disabled (-DCONFIG_AV1_HIGHBITDEPTH=0).
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Calculates RD cost using the low-precision (16-bit) set of functions.
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] pred_buf Pointer to the prediction buffer
+ * \param[in] pred_stride Stride for the prediction buffer
+ * \param[in] this_rdc Pointer to calculated RD Cost
+ * \param[in] skippable Pointer to a flag indicating possible tx skip
+ * \param[in] bsize Current block size
+ * \param[in] tx_size Transform size
+ *
+ * \remark Nothing is returned. Instead, the calculated RD cost is placed in
+ * \c this_rdc. The \c skippable flag is set if all coefficients are zero.
+ */
+void av1_block_yrd_idtx(MACROBLOCK *x, const uint8_t *const pred_buf,
+ int pred_stride, RD_STATS *this_rdc, int *skippable,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int num_4x4_w = mi_size_wide[bsize];
+ const int num_4x4_h = mi_size_high[bsize];
+ const int step = 1 << (tx_size << 1);
+ const int block_step = (1 << tx_size);
+ const int max_blocks_wide =
+ num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
+ const int max_blocks_high =
+ num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5);
+ int eob_cost = 0;
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+ const int num_blk_skip_w = num_4x4_w;
+ // Keep the intermediate value on the stack here. Writing directly to
+ // skippable causes speed regression due to load-and-store issues in
+ // update_yrd_loop_vars.
+ int temp_skippable = 1;
+ int tx_wd = 0;
+ const SCAN_ORDER *scan_order = NULL;
+ switch (tx_size) {
+ case TX_64X64:
+ assert(0); // Not implemented
+ break;
+ case TX_32X32:
+ assert(0); // Not used
+ break;
+ case TX_16X16:
+ scan_order = &av1_fast_idtx_scan_order_16x16;
+ tx_wd = 16;
+ break;
+ case TX_8X8:
+ scan_order = &av1_fast_idtx_scan_order_8x8;
+ tx_wd = 8;
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ scan_order = &av1_fast_idtx_scan_order_4x4;
+ tx_wd = 4;
+ break;
+ }
+ assert(scan_order != NULL);
+
+ this_rdc->dist = 0;
+ this_rdc->rate = 0;
+ aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+ pred_buf, pred_stride);
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ DECLARE_BLOCK_YRD_BUFFERS()
+ DECLARE_BLOCK_YRD_VARS()
+ for (int r = 0; r < max_blocks_high; r += block_step) {
+ for (int c = 0, s = 0; c < max_blocks_wide; c += block_step, s += step) {
+ DECLARE_LOOP_VARS_BLOCK_YRD()
+ scale_square_buf_vals(low_coeff, tx_wd, src_diff, diff_stride);
+ av1_quantize_lp(low_coeff, tx_wd * tx_wd, p->round_fp_QTX,
+ p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX,
+ eob, scan_order->scan, scan_order->iscan);
+ assert(*eob <= 1024);
+ update_yrd_loop_vars(x, &temp_skippable, step, *eob, low_coeff,
+ low_qcoeff, low_dqcoeff, this_rdc, &eob_cost,
+ r * num_blk_skip_w + c);
+ }
+ }
+ this_rdc->skip_txfm = *skippable = temp_skippable;
+ if (this_rdc->sse < INT64_MAX) {
+ this_rdc->sse = (this_rdc->sse << 6) >> 2;
+ if (temp_skippable) {
+      this_rdc->dist = this_rdc->sse;
+ return;
+ }
+ }
+ // If skippable is set, rate gets clobbered later.
+ this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT);
+ this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT);
+}
+
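+// Models the rate and distortion of the chroma planes from their variance and
+// SSE, without a transform search; returns the total SSE over the evaluated
+// planes.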
+int64_t av1_model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ RD_STATS *this_rdc, int start_plane,
+ int stop_plane) {
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+ int plane;
+ int64_t tot_sse = 0;
+
+ this_rdc->rate = 0;
+ this_rdc->dist = 0;
+ this_rdc->skip_txfm = 0;
+
+ for (plane = start_plane; plane <= stop_plane; ++plane) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const uint32_t dc_quant = p->dequant_QTX[0];
+ const uint32_t ac_quant = p->dequant_QTX[1];
+ const BLOCK_SIZE bs = plane_bsize;
+ unsigned int var;
+ if (!x->color_sensitivity[COLOR_SENS_IDX(plane)]) continue;
+
+ var = cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, &sse);
+ assert(sse >= var);
+ tot_sse += sse;
+
+ av1_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+ dc_quant >> 3, &rate, &dist);
+
+ this_rdc->rate += rate >> 1;
+ this_rdc->dist += dist << 3;
+
+ av1_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], ac_quant >> 3,
+ &rate, &dist);
+
+ this_rdc->rate += rate;
+ this_rdc->dist += dist << 4;
+ }
+
+ if (this_rdc->rate == 0) {
+ this_rdc->skip_txfm = 1;
+ }
+
+ if (RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist) >=
+ RDCOST(x->rdmult, 0, tot_sse << 4)) {
+ this_rdc->rate = 0;
+ this_rdc->dist = tot_sse << 4;
+ this_rdc->skip_txfm = 1;
+ }
+
+ return tot_sse;
+}
+
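+// Generates the intra luma prediction for the whole block, one max-sized
+// transform block at a time.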
+static void compute_intra_yprediction(const AV1_COMMON *cm,
+ PREDICTION_MODE mode, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd) {
+ const SequenceHeader *seq_params = cm->seq_params;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ uint8_t *const src_buf_base = p->src.buf;
+ uint8_t *const dst_buf_base = pd->dst.buf;
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ int plane = 0;
+ int row, col;
+  // Block and transform sizes, in number of 4x4 blocks (log 2):
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8.
+  // The transform size varies per plane; look it up in a common way.
+ const TX_SIZE tx_size = max_txsize_lookup[bsize];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ // If mb_to_right_edge is < 0 we are in a situation in which
+ // the current block size extends into the UMV and we won't
+ // visit the sub blocks that are wholly within the UMV.
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (row = 0; row < max_blocks_high; row += (1 << tx_size)) {
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) {
+ p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)];
+ pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)];
+ av1_predict_intra_block(
+ xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+ block_size_wide[bsize], block_size_high[bsize], tx_size, mode, 0, 0,
+ FILTER_INTRA_MODES, pd->dst.buf, dst_stride, pd->dst.buf, dst_stride,
+ 0, 0, plane);
+ }
+ }
+ p->src.buf = src_buf_base;
+ pd->dst.buf = dst_buf_base;
+}
+
+// Checks whether the intra mode needs to be pruned based on the
+// 'intra_y_mode_bsize_mask_nrd' and 'prune_hv_pred_modes_using_src_sad'
+// speed features.
+static INLINE bool is_prune_intra_mode(
+ AV1_COMP *cpi, int mode_index, int force_intra_check, BLOCK_SIZE bsize,
+ uint8_t segment_id, SOURCE_SAD source_sad_nonrd,
+ uint8_t color_sensitivity[MAX_MB_PLANE - 1]) {
+ const PREDICTION_MODE this_mode = intra_mode_list[mode_index];
+ if (mode_index > 2 || force_intra_check == 0) {
+ if (!((1 << this_mode) & cpi->sf.rt_sf.intra_y_mode_bsize_mask_nrd[bsize]))
+ return true;
+
+ if (this_mode == DC_PRED) return false;
+
+ if (!cpi->sf.rt_sf.prune_hv_pred_modes_using_src_sad) return false;
+
+ const bool has_color_sensitivity =
+ color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] &&
+ color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)];
+ if (has_color_sensitivity &&
+ (cpi->rc.frame_source_sad > 1.1 * cpi->rc.avg_source_sad ||
+ cyclic_refresh_segment_id_boosted(segment_id) ||
+ source_sad_nonrd > kMedSad))
+ return false;
+
+ return true;
+ }
+ return false;
+}
+
+/*!\brief Estimation of RD cost of an intra mode for Non-RD optimized case.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Calculates RD Cost for an intra mode for a single TX block using Hadamard
+ * transform.
+ * \param[in] plane Color plane
+ * \param[in] block Index of a TX block in a prediction block
+ * \param[in] row Row of a current TX block
+ * \param[in] col Column of a current TX block
+ * \param[in] plane_bsize Block size of a current prediction block
+ * \param[in] tx_size Transform size
+ * \param[in] arg Pointer to a structure that holds parameters
+ * for intra mode search
+ *
+ * \remark Nothing is returned. Instead, the rate and distortion of this
+ * transform block are accumulated in \c args->rdc.
+ */
+void av1_estimate_block_intra(int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct estimate_block_intra_args *const args = arg;
+ AV1_COMP *const cpi = args->cpi;
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size];
+ uint8_t *const src_buf_base = p->src.buf;
+ uint8_t *const dst_buf_base = pd->dst.buf;
+ const int64_t src_stride = p->src.stride;
+ const int64_t dst_stride = pd->dst.stride;
+
+ (void)block;
+
+ av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
+
+ if (args->prune_mode_based_on_sad) {
+ unsigned int this_sad = cpi->ppi->fn_ptr[plane_bsize].sdf(
+ p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride);
+ const unsigned int sad_threshold =
+ args->best_sad != UINT_MAX ? args->best_sad + (args->best_sad >> 4)
+ : UINT_MAX;
+ // Skip the evaluation of current mode if its SAD is more than a threshold.
+ if (this_sad > sad_threshold) {
+ // For the current mode, set rate and distortion to maximum possible
+ // values and return.
+ // Note: args->rdc->rate is checked in av1_nonrd_pick_intra_mode() to skip
+ // the evaluation of the current mode.
+ args->rdc->rate = INT_MAX;
+ args->rdc->dist = INT64_MAX;
+ return;
+ }
+ if (this_sad < args->best_sad) {
+ args->best_sad = this_sad;
+ }
+ }
+
+ RD_STATS this_rdc;
+ av1_invalid_rd_stats(&this_rdc);
+
+ p->src.buf = &src_buf_base[4 * (row * src_stride + col)];
+ pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];
+
+ if (plane == 0) {
+ av1_block_yrd(x, &this_rdc, &args->skippable, bsize_tx,
+ AOMMIN(tx_size, TX_16X16));
+ } else {
+ av1_model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &this_rdc, plane, plane);
+ }
+
+ p->src.buf = src_buf_base;
+ pd->dst.buf = dst_buf_base;
+ assert(args->rdc->rate != INT_MAX && args->rdc->dist != INT64_MAX);
+ args->rdc->rate += this_rdc.rate;
+ args->rdc->dist += this_rdc.dist;
+}
+
+/*!\brief Estimates best intra mode for inter mode search
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ *
+ * Using heuristics based on the best inter mode, block size, and other
+ * factors, decides whether to check intra modes. If so, estimates and selects
+ * the best intra mode from a reduced set of intra modes (at most 4 intra
+ * modes are checked).
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the
+ * data for the current macroblock
+ * \param[in] bsize Current block size
+ * \param[in] best_early_term Flag, indicating that TX for the
+ * best inter mode was skipped
+ * \param[in] ref_cost_intra Cost of signalling intra mode
+ * \param[in] reuse_prediction Flag, indicating prediction re-use
+ * \param[in] orig_dst Original destination buffer
+ * \param[in] tmp_buffers Pointer to a temporary buffers for
+ * prediction re-use
+ * \param[out] this_mode_pred Pointer to store prediction buffer
+ * for prediction re-use
+ * \param[in] best_rdc Pointer to RD cost for the best
+ * selected intra mode
+ * \param[in] best_pickmode Pointer to a structure containing
+ * best mode picked so far
+ * \param[in] ctx Pointer to structure holding coding
+ * contexts and modes for the block
+ *
+ * \remark Nothing is returned. Instead, calculated RD cost is placed to
+ * \c best_rdc and best selected mode is placed to \c best_pickmode
+ *
+ */
+void av1_estimate_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int best_early_term, unsigned int ref_cost_intra,
+ int reuse_prediction, struct buf_2d *orig_dst,
+ PRED_BUFFER *tmp_buffers,
+ PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc,
+ BEST_PICKMODE *best_pickmode,
+ PICK_MODE_CONTEXT *ctx) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const unsigned char segment_id = mi->segment_id;
+ const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
+ const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
+ const bool is_screen_content =
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+
+ const CommonQuantParams *quant_params = &cm->quant_params;
+
+ RD_STATS this_rdc;
+
+ int intra_cost_penalty = av1_get_intra_cost_penalty(
+ quant_params->base_qindex, quant_params->y_dc_delta_q,
+ cm->seq_params->bit_depth);
+ int64_t inter_mode_thresh =
+ RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0);
+ int perform_intra_pred = rt_sf->check_intra_pred_nonrd;
+ int force_intra_check = 0;
+  // For a spatial enhancement layer: turn off intra prediction if the
+  // previous spatial layer (used as the golden reference) is not chosen as
+  // the best reference. Only do this for temporal enhancement layers and on
+  // non-key frames.
+ if (cpi->svc.spatial_layer_id > 0 &&
+ best_pickmode->best_ref_frame != GOLDEN_FRAME &&
+ cpi->svc.temporal_layer_id > 0 &&
+ !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)
+ perform_intra_pred = 0;
+
+ int do_early_exit_rdthresh = 1;
+
+ uint32_t spatial_var_thresh = 50;
+ int motion_thresh = 32;
+ // Adjust thresholds to make intra mode likely tested if the other
+ // references (golden, alt) are skipped/not checked. For now always
+ // adjust for svc mode.
+ if (cpi->ppi->use_svc || (rt_sf->use_nonrd_altref_frame == 0 &&
+ rt_sf->nonrd_prune_ref_frame_search > 0)) {
+ spatial_var_thresh = 150;
+ motion_thresh = 0;
+ }
+
+ // Some adjustments to checking intra mode based on source variance.
+ if (x->source_variance < spatial_var_thresh) {
+ // If the best inter mode is large motion or non-LAST ref reduce intra cost
+ // penalty, so intra mode is more likely tested.
+ if (best_rdc->rdcost != INT64_MAX &&
+ (best_pickmode->best_ref_frame != LAST_FRAME ||
+ abs(mi->mv[0].as_mv.row) >= motion_thresh ||
+ abs(mi->mv[0].as_mv.col) >= motion_thresh)) {
+ intra_cost_penalty = intra_cost_penalty >> 2;
+ inter_mode_thresh =
+ RDCOST(x->rdmult, ref_cost_intra + intra_cost_penalty, 0);
+ do_early_exit_rdthresh = 0;
+ }
+ if ((x->source_variance < AOMMAX(50, (spatial_var_thresh >> 1)) &&
+ x->content_state_sb.source_sad_nonrd >= kHighSad) ||
+ (is_screen_content && x->source_variance < 50 &&
+ ((bsize >= BLOCK_32X32 &&
+ x->content_state_sb.source_sad_nonrd != kZeroSad) ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)))
+ force_intra_check = 1;
+    // For big blocks, intra is worth checking (since only DC will be
+    // checked), even if best_early_term is set.
+ if (bsize >= BLOCK_32X32) best_early_term = 0;
+ } else if (rt_sf->source_metrics_sb_nonrd &&
+ x->content_state_sb.source_sad_nonrd <= kLowSad) {
+ perform_intra_pred = 0;
+ }
+
+ if (best_rdc->skip_txfm && best_pickmode->best_mode_initial_skip_flag) {
+ if (rt_sf->skip_intra_pred == 1 && best_pickmode->best_mode != NEWMV)
+ perform_intra_pred = 0;
+ else if (rt_sf->skip_intra_pred == 2)
+ perform_intra_pred = 0;
+ }
+
+ if (!(best_rdc->rdcost == INT64_MAX || force_intra_check ||
+ (perform_intra_pred && !best_early_term &&
+ bsize <= cpi->sf.part_sf.max_intra_bsize))) {
+ return;
+ }
+
+  // Early exit based on the RD cost calculated using the known rate. When
+  // is_screen_content is true, more bias is given to intra modes, so a more
+  // conservative threshold is used for this early exit.
+ const int64_t known_rd = is_screen_content
+ ? CALC_BIASED_RDCOST(inter_mode_thresh)
+ : inter_mode_thresh;
+ if (known_rd > best_rdc->rdcost) return;
+
+ struct estimate_block_intra_args args;
+ init_estimate_block_intra_args(&args, cpi, x);
+ TX_SIZE intra_tx_size = AOMMIN(
+ AOMMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
+ TX_16X16);
+ if (is_screen_content && cpi->rc.high_source_sad &&
+ x->source_variance > spatial_var_thresh && bsize <= BLOCK_16X16)
+ intra_tx_size = TX_4X4;
+
+ PRED_BUFFER *const best_pred = best_pickmode->best_pred;
+ if (reuse_prediction && best_pred != NULL) {
+ const int bh = block_size_high[bsize];
+ const int bw = block_size_wide[bsize];
+ if (best_pred->data == orig_dst->buf) {
+ *this_mode_pred = &tmp_buffers[get_pred_buffer(tmp_buffers, 3)];
+ aom_convolve_copy(best_pred->data, best_pred->stride,
+ (*this_mode_pred)->data, (*this_mode_pred)->stride, bw,
+ bh);
+ best_pickmode->best_pred = *this_mode_pred;
+ }
+ }
+ pd->dst = *orig_dst;
+
+ for (int midx = 0; midx < RTC_INTRA_MODES; ++midx) {
+ const PREDICTION_MODE this_mode = intra_mode_list[midx];
+ const THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
+ const int64_t mode_rd_thresh = rd_threshes[mode_index];
+
+ if (is_prune_intra_mode(cpi, midx, force_intra_check, bsize, segment_id,
+ x->content_state_sb.source_sad_nonrd,
+ x->color_sensitivity))
+ continue;
+
+ if (is_screen_content && rt_sf->source_metrics_sb_nonrd) {
+      // For spatially flat blocks with zero motion, only check DC mode.
+ if (x->content_state_sb.source_sad_nonrd == kZeroSad &&
+ x->source_variance == 0 && this_mode != DC_PRED)
+ continue;
+ // Only test Intra for big blocks if spatial_variance is small.
+ else if (bsize > BLOCK_32X32 && x->source_variance > 50)
+ continue;
+ }
+
+ if (rd_less_than_thresh(best_rdc->rdcost, mode_rd_thresh,
+ rd_thresh_freq_fact[mode_index]) &&
+ (do_early_exit_rdthresh || this_mode == SMOOTH_PRED)) {
+ continue;
+ }
+ const BLOCK_SIZE uv_bsize =
+ get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+
+ mi->mode = this_mode;
+ mi->ref_frame[0] = INTRA_FRAME;
+ mi->ref_frame[1] = NONE_FRAME;
+
+ av1_invalid_rd_stats(&this_rdc);
+ args.mode = this_mode;
+ args.skippable = 1;
+ args.rdc = &this_rdc;
+ mi->tx_size = intra_tx_size;
+ compute_intra_yprediction(cm, this_mode, bsize, x, xd);
+ // Look into selecting tx_size here, based on prediction residual.
+ av1_block_yrd(x, &this_rdc, &args.skippable, bsize, mi->tx_size);
+ // TODO(kyslov@) Need to account for skippable
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) {
+ av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_U,
+ av1_estimate_block_intra, &args);
+ }
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) {
+ av1_foreach_transformed_block_in_plane(xd, uv_bsize, AOM_PLANE_V,
+ av1_estimate_block_intra, &args);
+ }
+
+ int mode_cost = 0;
+ if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) {
+ mode_cost +=
+ x->mode_costs.angle_delta_cost[this_mode - V_PRED]
+ [MAX_ANGLE_DELTA +
+ mi->angle_delta[PLANE_TYPE_Y]];
+ }
+ if (this_mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
+ mode_cost += x->mode_costs.filter_intra_cost[bsize][0];
+ }
+ this_rdc.rate += ref_cost_intra;
+ this_rdc.rate += intra_cost_penalty;
+ this_rdc.rate += mode_cost;
+ this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+
+ if (is_screen_content && rt_sf->source_metrics_sb_nonrd) {
+ // For blocks with low spatial variance and color sad,
+ // favor the intra-modes, only on scene/slide change.
+ if (cpi->rc.high_source_sad && x->source_variance < 800 &&
+ (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]))
+ this_rdc.rdcost = CALC_BIASED_RDCOST(this_rdc.rdcost);
+ // Otherwise bias against intra for blocks with zero
+ // motion and no color, on non-scene/slide changes.
+ else if (!cpi->rc.high_source_sad && x->source_variance > 0 &&
+ x->content_state_sb.source_sad_nonrd == kZeroSad &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0)
+ this_rdc.rdcost = (3 * this_rdc.rdcost) >> 1;
+ }
+
+ if (this_rdc.rdcost < best_rdc->rdcost) {
+ *best_rdc = this_rdc;
+ best_pickmode->best_mode = this_mode;
+ best_pickmode->best_tx_size = mi->tx_size;
+ best_pickmode->best_ref_frame = INTRA_FRAME;
+ best_pickmode->best_second_ref_frame = NONE;
+ best_pickmode->best_mode_skip_txfm = this_rdc.skip_txfm;
+ mi->uv_mode = this_mode;
+ mi->mv[0].as_int = INVALID_MV;
+ mi->mv[1].as_int = INVALID_MV;
+ if (!this_rdc.skip_txfm)
+ memset(ctx->blk_skip, 0,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ }
+ if (best_pickmode->best_ref_frame == INTRA_FRAME)
+ memset(ctx->blk_skip, 0,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ mi->tx_size = best_pickmode->best_tx_size;
+}
diff --git a/third_party/aom/av1/encoder/nonrd_opt.h b/third_party/aom/av1/encoder/nonrd_opt.h
new file mode 100644
index 0000000000..a53578ebad
--- /dev/null
+++ b/third_party/aom/av1/encoder/nonrd_opt.h
@@ -0,0 +1,575 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_NONRD_OPT_H_
+#define AOM_AV1_ENCODER_NONRD_OPT_H_
+
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/rdopt.h"
+
+#define RTC_INTER_MODES (4)
+#define RTC_INTRA_MODES (4)
+#define RTC_MODES (AOMMAX(RTC_INTER_MODES, RTC_INTRA_MODES))
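+// Scales an RD cost by 7/8 to bias the decision toward the associated mode
+// (e.g., toward intra modes for screen content).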
+#define CALC_BIASED_RDCOST(rdcost) (7 * (rdcost) >> 3)
+#define NUM_COMP_INTER_MODES_RT (6)
+#define NUM_INTER_MODES 12
+#define CAP_TX_SIZE_FOR_BSIZE_GT32(tx_mode_search_type, bsize) \
+ (((tx_mode_search_type) != ONLY_4X4 && (bsize) > BLOCK_32X32) ? true : false)
+#define TX_SIZE_FOR_BSIZE_GT32 (TX_16X16)
+#define FILTER_SEARCH_SIZE 2
+#if !CONFIG_REALTIME_ONLY
+#define MOTION_MODE_SEARCH_SIZE 2
+#endif
+
+extern int g_pick_inter_mode_cnt;
+/*!\cond */
+typedef struct {
+ uint8_t *data;
+ int stride;
+ int in_use;
+} PRED_BUFFER;
+
+typedef struct {
+ PRED_BUFFER *best_pred;
+ PREDICTION_MODE best_mode;
+ TX_SIZE best_tx_size;
+ TX_TYPE tx_type;
+ MV_REFERENCE_FRAME best_ref_frame;
+ MV_REFERENCE_FRAME best_second_ref_frame;
+ uint8_t best_mode_skip_txfm;
+ uint8_t best_mode_initial_skip_flag;
+ int_interpfilters best_pred_filter;
+ MOTION_MODE best_motion_mode;
+ WarpedMotionParams wm_params;
+ int num_proj_ref;
+ PALETTE_MODE_INFO pmi;
+ int64_t best_sse;
+} BEST_PICKMODE;
+
+typedef struct {
+ MV_REFERENCE_FRAME ref_frame;
+ PREDICTION_MODE pred_mode;
+} REF_MODE;
+
+typedef struct {
+ MV_REFERENCE_FRAME ref_frame[2];
+ PREDICTION_MODE pred_mode;
+} COMP_REF_MODE;
+
+struct estimate_block_intra_args {
+ AV1_COMP *cpi;
+ MACROBLOCK *x;
+ PREDICTION_MODE mode;
+ int skippable;
+ RD_STATS *rdc;
+ unsigned int best_sad;
+ bool prune_mode_based_on_sad;
+};
+/*!\endcond */
+
+/*!\brief Structure to store parameters and statistics used in non-rd inter mode
+ * evaluation.
+ */
+typedef struct {
+ //! Structure to hold best inter mode data
+ BEST_PICKMODE best_pickmode;
+  //! Structure to hold the RD cost of the current mode
+ RD_STATS this_rdc;
+ //! Pointer to the RD Cost for the best mode found so far
+ RD_STATS best_rdc;
+ //! Distortion of chroma planes for all modes and reference frames
+ int64_t uv_dist[RTC_INTER_MODES][REF_FRAMES];
+ //! Buffer to hold predicted block for all reference frames and planes
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+ //! Array to hold variance of all modes and reference frames
+ unsigned int vars[RTC_INTER_MODES][REF_FRAMES];
+ //! Array to hold ref cost of single reference mode for all ref frames
+ unsigned int ref_costs_single[REF_FRAMES];
+ //! Array to hold motion vector for all modes and reference frames
+ int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];
+ //! Array to hold best mv for all modes and reference frames
+ int_mv frame_mv_best[MB_MODE_COUNT][REF_FRAMES];
+ //! Array to hold inter mode cost of single ref mode for all ref frames
+ int single_inter_mode_costs[RTC_INTER_MODES][REF_FRAMES];
+ //! Array to hold use reference frame mask for each reference frame
+ int use_ref_frame_mask[REF_FRAMES];
+ //! Array to hold flags of evaluated modes for each reference frame
+ uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES];
+ //! Array to hold flag indicating if scaled reference frame is used.
+ bool use_scaled_ref_frame[REF_FRAMES];
+} InterModeSearchStateNonrd;
+
+static const uint8_t b_width_log2_lookup[BLOCK_SIZES] = { 0, 0, 1, 1, 1, 2,
+ 2, 2, 3, 3, 3, 4,
+ 4, 4, 5, 5 };
+static const uint8_t b_height_log2_lookup[BLOCK_SIZES] = { 0, 1, 0, 1, 2, 1,
+ 2, 3, 2, 3, 4, 3,
+ 4, 5, 4, 5 };
+
+static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED,
+ SMOOTH_PRED };
+
+static const PREDICTION_MODE inter_mode_list[] = { NEARESTMV, NEARMV, GLOBALMV,
+ NEWMV };
+
+static const THR_MODES mode_idx[REF_FRAMES][RTC_MODES] = {
+ { THR_DC, THR_V_PRED, THR_H_PRED, THR_SMOOTH },
+ { THR_NEARESTMV, THR_NEARMV, THR_GLOBALMV, THR_NEWMV },
+ { THR_NEARESTL2, THR_NEARL2, THR_GLOBALL2, THR_NEWL2 },
+ { THR_NEARESTL3, THR_NEARL3, THR_GLOBALL3, THR_NEWL3 },
+ { THR_NEARESTG, THR_NEARG, THR_GLOBALG, THR_NEWG },
+ { THR_NEARESTB, THR_NEARB, THR_GLOBALB, THR_NEWB },
+ { THR_NEARESTA2, THR_NEARA2, THR_GLOBALA2, THR_NEWA2 },
+ { THR_NEARESTA, THR_NEARA, THR_GLOBALA, THR_NEWA },
+};
+
+// GLOBALMV in the set below is in fact ZEROMV as we don't do global ME in RT
+// mode
+static const REF_MODE ref_mode_set[NUM_INTER_MODES] = {
+ { LAST_FRAME, NEARESTMV }, { LAST_FRAME, NEARMV },
+ { LAST_FRAME, GLOBALMV }, { LAST_FRAME, NEWMV },
+ { GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV },
+ { GOLDEN_FRAME, GLOBALMV }, { GOLDEN_FRAME, NEWMV },
+ { ALTREF_FRAME, NEARESTMV }, { ALTREF_FRAME, NEARMV },
+ { ALTREF_FRAME, GLOBALMV }, { ALTREF_FRAME, NEWMV },
+};
+
+static const COMP_REF_MODE comp_ref_mode_set[NUM_COMP_INTER_MODES_RT] = {
+ { { LAST_FRAME, GOLDEN_FRAME }, GLOBAL_GLOBALMV },
+ { { LAST_FRAME, GOLDEN_FRAME }, NEAREST_NEARESTMV },
+ { { LAST_FRAME, LAST2_FRAME }, GLOBAL_GLOBALMV },
+ { { LAST_FRAME, LAST2_FRAME }, NEAREST_NEARESTMV },
+ { { LAST_FRAME, ALTREF_FRAME }, GLOBAL_GLOBALMV },
+ { { LAST_FRAME, ALTREF_FRAME }, NEAREST_NEARESTMV },
+};
+
+static const int_interpfilters filters_ref_set[9] = {
+ [0].as_filters = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR },
+ [1].as_filters = { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH },
+ [2].as_filters = { EIGHTTAP_REGULAR, EIGHTTAP_SMOOTH },
+ [3].as_filters = { EIGHTTAP_SMOOTH, EIGHTTAP_REGULAR },
+ [4].as_filters = { MULTITAP_SHARP, MULTITAP_SHARP },
+ [5].as_filters = { EIGHTTAP_REGULAR, MULTITAP_SHARP },
+ [6].as_filters = { MULTITAP_SHARP, EIGHTTAP_REGULAR },
+ [7].as_filters = { EIGHTTAP_SMOOTH, MULTITAP_SHARP },
+ [8].as_filters = { MULTITAP_SHARP, EIGHTTAP_SMOOTH }
+};
+
+enum {
+ // INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
+ INTER_NEAREST = (1 << NEARESTMV),
+ INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV),
+ INTER_NEAREST_NEAR = (1 << NEARESTMV) | (1 << NEARMV),
+ INTER_NEAR_NEW = (1 << NEARMV) | (1 << NEWMV),
+};
+
+// The original scan order (default_scan_8x8) is modified according to the extra
+// transpose in hadamard c implementation, i.e., aom_hadamard_lp_8x8_c and
+// aom_hadamard_8x8_c.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8_transpose[64]) = {
+ 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40,
+ 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35,
+ 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30,
+ 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63
+};
+
+// The original scan order (av1_default_iscan_8x8) is modified to match the
+// hadamard AVX2 implementations, i.e., aom_hadamard_lp_8x8_avx2 and
+// aom_hadamard_8x8_avx2. Since the hadamard AVX2 implementation modifies the
+// order of the coefficients, the normal scan order is no longer guaranteed to
+// scan low coefficients first, so we modify the scan order accordingly.
+// Note that this one has to be used together with default_scan_8x8_transpose.
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_default_iscan_8x8_transpose[64]) = {
+ 0, 2, 3, 9, 10, 20, 21, 35, 1, 4, 8, 11, 19, 22, 34, 36,
+ 5, 7, 12, 18, 23, 33, 37, 48, 6, 13, 17, 24, 32, 38, 47, 49,
+ 14, 16, 25, 31, 39, 46, 50, 57, 15, 26, 30, 40, 45, 51, 56, 58,
+ 27, 29, 41, 44, 52, 55, 59, 62, 28, 42, 43, 53, 54, 60, 61, 63
+};
+
+// The original scan order (default_scan_16x16) is modified according to the
+// extra transpose in hadamard c implementation in lp case, i.e.,
+// aom_hadamard_lp_16x16_c.
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_lp_16x16_transpose[256]) = {
+ 0, 8, 2, 4, 10, 16, 24, 18, 12, 6, 64, 14, 20, 26, 32,
+ 40, 34, 28, 22, 72, 66, 68, 74, 80, 30, 36, 42, 48, 56, 50,
+ 44, 38, 88, 82, 76, 70, 128, 78, 84, 90, 96, 46, 52, 58, 1,
+ 9, 3, 60, 54, 104, 98, 92, 86, 136, 130, 132, 138, 144, 94, 100,
+ 106, 112, 62, 5, 11, 17, 25, 19, 13, 7, 120, 114, 108, 102, 152,
+ 146, 140, 134, 192, 142, 148, 154, 160, 110, 116, 122, 65, 15, 21, 27,
+ 33, 41, 35, 29, 23, 73, 67, 124, 118, 168, 162, 156, 150, 200, 194,
+ 196, 202, 208, 158, 164, 170, 176, 126, 69, 75, 81, 31, 37, 43, 49,
+ 57, 51, 45, 39, 89, 83, 77, 71, 184, 178, 172, 166, 216, 210, 204,
+ 198, 206, 212, 218, 224, 174, 180, 186, 129, 79, 85, 91, 97, 47, 53,
+ 59, 61, 55, 105, 99, 93, 87, 137, 131, 188, 182, 232, 226, 220, 214,
+ 222, 228, 234, 240, 190, 133, 139, 145, 95, 101, 107, 113, 63, 121, 115,
+ 109, 103, 153, 147, 141, 135, 248, 242, 236, 230, 238, 244, 250, 193, 143,
+ 149, 155, 161, 111, 117, 123, 125, 119, 169, 163, 157, 151, 201, 195, 252,
+ 246, 254, 197, 203, 209, 159, 165, 171, 177, 127, 185, 179, 173, 167, 217,
+ 211, 205, 199, 207, 213, 219, 225, 175, 181, 187, 189, 183, 233, 227, 221,
+ 215, 223, 229, 235, 241, 191, 249, 243, 237, 231, 239, 245, 251, 253, 247,
+ 255
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// The original scan order (default_scan_16x16) is modified according to the
+// extra shift in the full-precision (fp) Hadamard C implementation, i.e.,
+// aom_hadamard_16x16_c. Note that the 16x16 lp and fp Hadamard transforms
+// generate different outputs, so they are handled separately.
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_fp_16x16_transpose[256]) = {
+ 0, 4, 2, 8, 6, 16, 20, 18, 12, 10, 64, 14, 24, 22, 32,
+ 36, 34, 28, 26, 68, 66, 72, 70, 80, 30, 40, 38, 48, 52, 50,
+ 44, 42, 84, 82, 76, 74, 128, 78, 88, 86, 96, 46, 56, 54, 1,
+ 5, 3, 60, 58, 100, 98, 92, 90, 132, 130, 136, 134, 144, 94, 104,
+ 102, 112, 62, 9, 7, 17, 21, 19, 13, 11, 116, 114, 108, 106, 148,
+ 146, 140, 138, 192, 142, 152, 150, 160, 110, 120, 118, 65, 15, 25, 23,
+ 33, 37, 35, 29, 27, 69, 67, 124, 122, 164, 162, 156, 154, 196, 194,
+ 200, 198, 208, 158, 168, 166, 176, 126, 73, 71, 81, 31, 41, 39, 49,
+ 53, 51, 45, 43, 85, 83, 77, 75, 180, 178, 172, 170, 212, 210, 204,
+ 202, 206, 216, 214, 224, 174, 184, 182, 129, 79, 89, 87, 97, 47, 57,
+ 55, 61, 59, 101, 99, 93, 91, 133, 131, 188, 186, 228, 226, 220, 218,
+ 222, 232, 230, 240, 190, 137, 135, 145, 95, 105, 103, 113, 63, 117, 115,
+ 109, 107, 149, 147, 141, 139, 244, 242, 236, 234, 238, 248, 246, 193, 143,
+ 153, 151, 161, 111, 121, 119, 125, 123, 165, 163, 157, 155, 197, 195, 252,
+ 250, 254, 201, 199, 209, 159, 169, 167, 177, 127, 181, 179, 173, 171, 213,
+ 211, 205, 203, 207, 217, 215, 225, 175, 185, 183, 189, 187, 229, 227, 221,
+ 219, 223, 233, 231, 241, 191, 245, 243, 237, 235, 239, 249, 247, 253, 251,
+ 255
+};
+#endif
+
+// The original scan order (av1_default_iscan_16x16) is modified to match the
+// Hadamard AVX2 implementation, i.e., aom_hadamard_lp_16x16_avx2. The AVX2
+// implementation reorders the coefficients, so the normal scan order is no
+// longer guaranteed to scan low-frequency coefficients first; the scan order
+// is therefore modified to compensate. Note that this table has to be used
+// together with default_scan_lp_16x16_transpose.
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_default_iscan_lp_16x16_transpose[256]) = {
+ 0, 44, 2, 46, 3, 63, 9, 69, 1, 45, 4, 64, 8, 68, 11,
+ 87, 5, 65, 7, 67, 12, 88, 18, 94, 6, 66, 13, 89, 17, 93,
+ 24, 116, 14, 90, 16, 92, 25, 117, 31, 123, 15, 91, 26, 118, 30,
+ 122, 41, 148, 27, 119, 29, 121, 42, 149, 48, 152, 28, 120, 43, 150,
+ 47, 151, 62, 177, 10, 86, 20, 96, 21, 113, 35, 127, 19, 95, 22,
+ 114, 34, 126, 37, 144, 23, 115, 33, 125, 38, 145, 52, 156, 32, 124,
+ 39, 146, 51, 155, 58, 173, 40, 147, 50, 154, 59, 174, 73, 181, 49,
+ 153, 60, 175, 72, 180, 83, 198, 61, 176, 71, 179, 84, 199, 98, 202,
+ 70, 178, 85, 200, 97, 201, 112, 219, 36, 143, 54, 158, 55, 170, 77,
+ 185, 53, 157, 56, 171, 76, 184, 79, 194, 57, 172, 75, 183, 80, 195,
+ 102, 206, 74, 182, 81, 196, 101, 205, 108, 215, 82, 197, 100, 204, 109,
+ 216, 131, 223, 99, 203, 110, 217, 130, 222, 140, 232, 111, 218, 129, 221,
+ 141, 233, 160, 236, 128, 220, 142, 234, 159, 235, 169, 245, 78, 193, 104,
+ 208, 105, 212, 135, 227, 103, 207, 106, 213, 134, 226, 136, 228, 107, 214,
+ 133, 225, 137, 229, 164, 240, 132, 224, 138, 230, 163, 239, 165, 241, 139,
+ 231, 162, 238, 166, 242, 189, 249, 161, 237, 167, 243, 188, 248, 190, 250,
+ 168, 244, 187, 247, 191, 251, 210, 254, 186, 246, 192, 252, 209, 253, 211,
+ 255
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// The original scan order (av1_default_iscan_16x16) is modified to match the
+// Hadamard AVX2 implementation, i.e., aom_hadamard_16x16_avx2. The AVX2
+// implementation reorders the coefficients, so the normal scan order is no
+// longer guaranteed to scan low-frequency coefficients first; the scan order
+// is therefore modified to compensate. Note that this table has to be used
+// together with default_scan_fp_16x16_transpose.
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_default_iscan_fp_16x16_transpose[256]) = {
+ 0, 44, 2, 46, 1, 45, 4, 64, 3, 63, 9, 69, 8, 68, 11,
+ 87, 5, 65, 7, 67, 6, 66, 13, 89, 12, 88, 18, 94, 17, 93,
+ 24, 116, 14, 90, 16, 92, 15, 91, 26, 118, 25, 117, 31, 123, 30,
+ 122, 41, 148, 27, 119, 29, 121, 28, 120, 43, 150, 42, 149, 48, 152,
+ 47, 151, 62, 177, 10, 86, 20, 96, 19, 95, 22, 114, 21, 113, 35,
+ 127, 34, 126, 37, 144, 23, 115, 33, 125, 32, 124, 39, 146, 38, 145,
+ 52, 156, 51, 155, 58, 173, 40, 147, 50, 154, 49, 153, 60, 175, 59,
+ 174, 73, 181, 72, 180, 83, 198, 61, 176, 71, 179, 70, 178, 85, 200,
+ 84, 199, 98, 202, 97, 201, 112, 219, 36, 143, 54, 158, 53, 157, 56,
+ 171, 55, 170, 77, 185, 76, 184, 79, 194, 57, 172, 75, 183, 74, 182,
+ 81, 196, 80, 195, 102, 206, 101, 205, 108, 215, 82, 197, 100, 204, 99,
+ 203, 110, 217, 109, 216, 131, 223, 130, 222, 140, 232, 111, 218, 129, 221,
+ 128, 220, 142, 234, 141, 233, 160, 236, 159, 235, 169, 245, 78, 193, 104,
+ 208, 103, 207, 106, 213, 105, 212, 135, 227, 134, 226, 136, 228, 107, 214,
+ 133, 225, 132, 224, 138, 230, 137, 229, 164, 240, 163, 239, 165, 241, 139,
+ 231, 162, 238, 161, 237, 167, 243, 166, 242, 189, 249, 188, 248, 190, 250,
+ 168, 244, 187, 247, 186, 246, 192, 252, 191, 251, 210, 254, 209, 253, 211,
+ 255
+};
+#endif
+
+// For entropy coding, IDTX shares the scan orders of the other 2D transforms,
+// but the fastest way to calculate the IDTX transform (i.e., no transposes)
+// results in coefficients that are a transposition of the entropy coding
+// versions. These tables are used as substitutes for the scan order in the
+// faster version of IDTX.
+
+// Must be used together with av1_fast_idtx_iscan_4x4
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_fast_idtx_scan_4x4[16]) = { 0, 1, 4, 8, 5, 2, 3, 6,
+ 9, 12, 13, 10, 7, 11, 14, 15 };
+
+// Must be used together with av1_fast_idtx_scan_4x4
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_fast_idtx_iscan_4x4[16]) = { 0, 1, 5, 6, 2, 4, 7, 12,
+ 3, 8, 11, 13, 9, 10, 14, 15 };
+
+static const SCAN_ORDER av1_fast_idtx_scan_order_4x4 = {
+ av1_fast_idtx_scan_4x4, av1_fast_idtx_iscan_4x4
+};
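+
+// The same inverse-permutation relation holds for each scan/iscan pair in
+// this file; for the 4x4 pair above it can be checked as follows (a sketch,
+// illustrative only):
+#if 0
+static void check_fast_idtx_pair_4x4(void) {
+  for (int k = 0; k < 16; ++k)
+    assert(av1_fast_idtx_iscan_4x4[av1_fast_idtx_scan_4x4[k]] == k);
+}
+#endif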
+
+// Must be used together with av1_fast_idtx_iscan_8x8
+DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_scan_8x8[64]) = {
+ 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
+ 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28,
+ 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+ 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
+};
+
+// Must be used together with av1_fast_idtx_scan_8x8
+DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_iscan_8x8[64]) = {
+ 0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42,
+ 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53,
+ 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60,
+ 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63
+};
+
+static const SCAN_ORDER av1_fast_idtx_scan_order_8x8 = {
+ av1_fast_idtx_scan_8x8, av1_fast_idtx_iscan_8x8
+};
+
+// Must be used together with av1_fast_idtx_iscan_16x16
+DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_scan_16x16[256]) = {
+ 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4,
+ 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22,
+ 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8,
+ 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100,
+ 85, 70, 55, 40, 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131,
+ 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27,
+ 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208,
+ 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14,
+ 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
+ 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46,
+ 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242,
+ 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 79, 94,
+ 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185,
+ 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+ 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203,
+ 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
+ 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254,
+ 255
+};
+
+// Must be used together with av1_fast_idtx_scan_16x16
+DECLARE_ALIGNED(16, static const int16_t, av1_fast_idtx_iscan_16x16[256]) = {
+ 0, 1, 5, 6, 14, 15, 27, 28, 44, 45, 65, 66, 90, 91, 119,
+ 120, 2, 4, 7, 13, 16, 26, 29, 43, 46, 64, 67, 89, 92, 118,
+ 121, 150, 3, 8, 12, 17, 25, 30, 42, 47, 63, 68, 88, 93, 117,
+ 122, 149, 151, 9, 11, 18, 24, 31, 41, 48, 62, 69, 87, 94, 116,
+ 123, 148, 152, 177, 10, 19, 23, 32, 40, 49, 61, 70, 86, 95, 115,
+ 124, 147, 153, 176, 178, 20, 22, 33, 39, 50, 60, 71, 85, 96, 114,
+ 125, 146, 154, 175, 179, 200, 21, 34, 38, 51, 59, 72, 84, 97, 113,
+ 126, 145, 155, 174, 180, 199, 201, 35, 37, 52, 58, 73, 83, 98, 112,
+ 127, 144, 156, 173, 181, 198, 202, 219, 36, 53, 57, 74, 82, 99, 111,
+ 128, 143, 157, 172, 182, 197, 203, 218, 220, 54, 56, 75, 81, 100, 110,
+ 129, 142, 158, 171, 183, 196, 204, 217, 221, 234, 55, 76, 80, 101, 109,
+ 130, 141, 159, 170, 184, 195, 205, 216, 222, 233, 235, 77, 79, 102, 108,
+ 131, 140, 160, 169, 185, 194, 206, 215, 223, 232, 236, 245, 78, 103, 107,
+ 132, 139, 161, 168, 186, 193, 207, 214, 224, 231, 237, 244, 246, 104, 106,
+ 133, 138, 162, 167, 187, 192, 208, 213, 225, 230, 238, 243, 247, 252, 105,
+ 134, 137, 163, 166, 188, 191, 209, 212, 226, 229, 239, 242, 248, 251, 253,
+ 135, 136, 164, 165, 189, 190, 210, 211, 227, 228, 240, 241, 249, 250, 254,
+ 255
+};
+
+// Indicates whether the RD model for the block should be based on special
+// logic.
+static INLINE int get_model_rd_flag(const AV1_COMP *cpi, const MACROBLOCKD *xd,
+ BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int large_block = bsize >= BLOCK_32X32;
+ // Only enable for low bitdepth to mitigate issue: b/303023614.
+ return cpi->oxcf.rc_cfg.mode == AOM_CBR && large_block &&
+ !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
+ cm->quant_params.base_qindex && !cpi->oxcf.use_highbitdepth;
+}
+/*!\brief Finds predicted motion vectors for a block.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Finds predicted motion vectors for a block from a certain reference frame.
+ * First, it fills the reference MV stack, then picks the best candidate from
+ * the stack and predicts the final MV for the block for each mode.
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the
+ * data for the current macroblock
+ * \param[in] ref_frame Reference frame for which to find
+ * ref MVs
+ * \param[out] frame_mv Predicted MVs for a block
+ * \param[in] yv12_mb Buffer to hold predicted block
+ * \param[in] bsize Current block size
+ * \param[in] force_skip_low_temp_var Flag indicating possible mode search
+ * prune for low temporal variance block
+ * \param[in] skip_pred_mv Flag indicating to skip av1_mv_pred
+ * \param[out] use_scaled_ref_frame Flag to indicate if scaled reference
+ * frame is used.
+ *
+ * \remark Nothing is returned. Instead, predicted MVs are placed into
+ * \c frame_mv array, and use_scaled_ref_frame is set.
+ */
+static INLINE void find_predictors(
+ AV1_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+ int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
+ struct buf_2d yv12_mb[8][MAX_MB_PLANE], BLOCK_SIZE bsize,
+ int force_skip_low_temp_var, int skip_pred_mv, bool *use_scaled_ref_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const YV12_BUFFER_CONFIG *ref = get_ref_frame_yv12_buf(cm, ref_frame);
+ const bool ref_is_scaled =
+ ref->y_crop_height != cm->height || ref->y_crop_width != cm->width;
+ const YV12_BUFFER_CONFIG *scaled_ref =
+ av1_get_scaled_ref_frame(cpi, ref_frame);
+ const YV12_BUFFER_CONFIG *yv12 =
+ ref_is_scaled && scaled_ref ? scaled_ref : ref;
+ const int num_planes = av1_num_planes(cm);
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+ x->pred_mv0_sad[ref_frame] = INT_MAX;
+ x->pred_mv1_sad[ref_frame] = INT_MAX;
+ frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+  // TODO(kyslov): This needs various further optimizations; to be continued.
+ assert(yv12 != NULL);
+ if (yv12 != NULL) {
+ struct scale_factors *const sf =
+ scaled_ref ? NULL : get_ref_scale_factors(cm, ref_frame);
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes);
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+ mbmi_ext->mode_context);
+ // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+ // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
+ av1_find_best_ref_mvs_from_stack(
+ cm->features.allow_high_precision_mv, mbmi_ext, ref_frame,
+ &frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame], 0);
+ frame_mv[GLOBALMV][ref_frame] = mbmi_ext->global_mvs[ref_frame];
+    // Skip the av1_mv_pred call below for non-LAST frames when
+    // force_skip_low_temp_var is set.
+ if (!ref_is_scaled && bsize >= BLOCK_8X8 && !skip_pred_mv &&
+ !(force_skip_low_temp_var && ref_frame != LAST_FRAME)) {
+ av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
+ bsize);
+ }
+ }
+ if (cm->features.switchable_motion_mode) {
+ av1_count_overlappable_neighbors(cm, xd);
+ }
+ mbmi->num_proj_ref = 1;
+ *use_scaled_ref_frame = ref_is_scaled && scaled_ref;
+}
+
+static INLINE void init_mbmi_nonrd(MB_MODE_INFO *mbmi,
+ PREDICTION_MODE pred_mode,
+ MV_REFERENCE_FRAME ref_frame0,
+ MV_REFERENCE_FRAME ref_frame1,
+ const AV1_COMMON *cm) {
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ mbmi->ref_mv_idx = 0;
+ mbmi->mode = pred_mode;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->ref_frame[0] = ref_frame0;
+ mbmi->ref_frame[1] = ref_frame1;
+ pmi->palette_size[PLANE_TYPE_Y] = 0;
+ pmi->palette_size[PLANE_TYPE_UV] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->num_proj_ref = 1;
+ mbmi->interintra_mode = 0;
+ set_default_interp_filters(mbmi, cm->features.interp_filter);
+}
+
+static INLINE void init_estimate_block_intra_args(
+ struct estimate_block_intra_args *args, AV1_COMP *cpi, MACROBLOCK *x) {
+ args->cpi = cpi;
+ args->x = x;
+ args->mode = DC_PRED;
+ args->skippable = 1;
+ args->rdc = 0;
+ args->best_sad = UINT_MAX;
+ args->prune_mode_based_on_sad = false;
+}
+
+static INLINE int get_pred_buffer(PRED_BUFFER *p, int len) {
+ for (int buf_idx = 0; buf_idx < len; buf_idx++) {
+ if (!p[buf_idx].in_use) {
+ p[buf_idx].in_use = 1;
+ return buf_idx;
+ }
+ }
+ return -1;
+}
+
+static INLINE void free_pred_buffer(PRED_BUFFER *p) {
+ if (p != NULL) p->in_use = 0;
+}
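+
+// Typical usage pattern for the small prediction-buffer pool above (a
+// sketch; tmp_buffers and its length are hypothetical here):
+//   const int buf_idx = get_pred_buffer(tmp_buffers, 3);  // acquire
+//   if (buf_idx != -1) {
+//     /* ... use tmp_buffers[buf_idx] for a trial prediction ... */
+//     free_pred_buffer(&tmp_buffers[buf_idx]);  // release when done
+//   }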
+
+#if CONFIG_INTERNAL_STATS
+static INLINE void store_coding_context_nonrd(MACROBLOCK *x,
+ PICK_MODE_CONTEXT *ctx,
+ int mode_index) {
+#else
+static INLINE void store_coding_context_nonrd(MACROBLOCK *x,
+ PICK_MODE_CONTEXT *ctx) {
+#endif // CONFIG_INTERNAL_STATS
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+ // Take a snapshot of the coding context so it can be
+ // restored if we decide to encode this way
+ ctx->rd_stats.skip_txfm = txfm_info->skip_txfm;
+
+ ctx->skippable = txfm_info->skip_txfm;
+#if CONFIG_INTERNAL_STATS
+ ctx->best_mode_index = mode_index;
+#endif // CONFIG_INTERNAL_STATS
+ ctx->mic = *xd->mi[0];
+ av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext,
+ av1_ref_frame_type(xd->mi[0]->ref_frame));
+}
+
+void av1_block_yrd(MACROBLOCK *x, RD_STATS *this_rdc, int *skippable,
+ BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+void av1_block_yrd_idtx(MACROBLOCK *x, const uint8_t *const pred_buf,
+ int pred_stride, RD_STATS *this_rdc, int *skippable,
+ BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+int64_t av1_model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ RD_STATS *this_rdc, int start_plane,
+ int stop_plane);
+
+void av1_estimate_block_intra(int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg);
+
+void av1_estimate_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int best_early_term, unsigned int ref_cost_intra,
+ int reuse_prediction, struct buf_2d *orig_dst,
+ PRED_BUFFER *tmp_buffers,
+ PRED_BUFFER **this_mode_pred, RD_STATS *best_rdc,
+ BEST_PICKMODE *best_pickmode,
+ PICK_MODE_CONTEXT *ctx);
+
+#endif // AOM_AV1_ENCODER_NONRD_OPT_H_
diff --git a/third_party/aom/av1/encoder/nonrd_pickmode.c b/third_party/aom/av1/encoder/nonrd_pickmode.c
new file mode 100644
index 0000000000..f939b6d1fa
--- /dev/null
+++ b/third_party/aom/av1/encoder/nonrd_pickmode.c
@@ -0,0 +1,3537 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/nonrd_opt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/var_based_part.h"
+
+static INLINE int early_term_inter_search_with_sse(int early_term_idx,
+ BLOCK_SIZE bsize,
+ int64_t this_sse,
+ int64_t best_sse,
+ PREDICTION_MODE this_mode) {
+ // Aggressiveness to terminate inter mode search early is adjusted based on
+ // speed and block size.
+ static const double early_term_thresh[4][4] = { { 0.65, 0.65, 0.65, 0.7 },
+ { 0.6, 0.65, 0.85, 0.9 },
+ { 0.5, 0.5, 0.55, 0.6 },
+ { 0.6, 0.75, 0.85, 0.85 } };
+ static const double early_term_thresh_newmv_nearestmv[4] = { 0.3, 0.3, 0.3,
+ 0.3 };
+
+ const int size_group = size_group_lookup[bsize];
+ assert(size_group < 4);
+ assert((early_term_idx > 0) && (early_term_idx < EARLY_TERM_INDICES));
+ const double threshold =
+ ((early_term_idx == EARLY_TERM_IDX_4) &&
+ (this_mode == NEWMV || this_mode == NEARESTMV))
+ ? early_term_thresh_newmv_nearestmv[size_group]
+ : early_term_thresh[early_term_idx - 1][size_group];
+
+ // Terminate inter mode search early based on best sse so far.
+ if ((early_term_idx > 0) && (threshold * this_sse > best_sse)) {
+ return 1;
+ }
+ return 0;
+}
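+
+// Worked example of the thresholds above: with early_term_idx 1 and
+// size_group 0 the threshold is 0.65, so the search terminates once
+// 0.65 * this_sse > best_sse, i.e. once this mode's SSE exceeds roughly
+// 1.5x the best SSE seen so far.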
+
+static INLINE void init_best_pickmode(BEST_PICKMODE *bp) {
+ bp->best_sse = INT64_MAX;
+ bp->best_mode = NEARESTMV;
+ bp->best_ref_frame = LAST_FRAME;
+ bp->best_second_ref_frame = NONE_FRAME;
+ bp->best_tx_size = TX_8X8;
+ bp->tx_type = DCT_DCT;
+ bp->best_pred_filter = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ bp->best_mode_skip_txfm = 0;
+ bp->best_mode_initial_skip_flag = 0;
+ bp->best_pred = NULL;
+ bp->best_motion_mode = SIMPLE_TRANSLATION;
+ bp->num_proj_ref = 0;
+ av1_zero(bp->wm_params);
+ av1_zero(bp->pmi);
+}
+
+// Copy best inter mode parameters to best_pickmode
+static INLINE void update_search_state_nonrd(
+ InterModeSearchStateNonrd *search_state, MB_MODE_INFO *const mi,
+ TxfmSearchInfo *txfm_info, RD_STATS *nonskip_rdc, PICK_MODE_CONTEXT *ctx,
+ PREDICTION_MODE this_best_mode, const int64_t sse_y) {
+ BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode;
+
+ best_pickmode->best_sse = sse_y;
+ best_pickmode->best_mode = this_best_mode;
+ best_pickmode->best_motion_mode = mi->motion_mode;
+ best_pickmode->wm_params = mi->wm_params;
+ best_pickmode->num_proj_ref = mi->num_proj_ref;
+ best_pickmode->best_pred_filter = mi->interp_filters;
+ best_pickmode->best_tx_size = mi->tx_size;
+ best_pickmode->best_ref_frame = mi->ref_frame[0];
+ best_pickmode->best_second_ref_frame = mi->ref_frame[1];
+ best_pickmode->best_mode_skip_txfm = search_state->this_rdc.skip_txfm;
+ best_pickmode->best_mode_initial_skip_flag =
+ (nonskip_rdc->rate == INT_MAX && search_state->this_rdc.skip_txfm);
+ if (!best_pickmode->best_mode_skip_txfm) {
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+}
+
+static INLINE int subpel_select(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int_mv *mv, MV ref_mv, FULLPEL_MV start_mv,
+ bool fullpel_performed_well) {
+ const int frame_lowmotion = cpi->rc.avg_frame_low_motion;
+ const int reduce_mv_pel_precision_highmotion =
+ cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion;
+
+ // Reduce MV precision for higher int MV value & frame-level motion
+ if (reduce_mv_pel_precision_highmotion >= 3) {
+    int mv_thresh;
+ const int is_low_resoln =
+ (cpi->common.width * cpi->common.height <= 320 * 240);
+ mv_thresh = (bsize > BLOCK_32X32) ? 2 : (bsize > BLOCK_16X16) ? 4 : 6;
+ if (frame_lowmotion > 0 && frame_lowmotion < 40) mv_thresh = 12;
+ mv_thresh = (is_low_resoln) ? mv_thresh >> 1 : mv_thresh;
+ if (abs(mv->as_fullmv.row) >= mv_thresh ||
+ abs(mv->as_fullmv.col) >= mv_thresh)
+ return HALF_PEL;
+ } else if (reduce_mv_pel_precision_highmotion >= 1) {
+ int mv_thresh;
+ const int th_vals[2][3] = { { 4, 8, 10 }, { 4, 6, 8 } };
+ const int th_idx = reduce_mv_pel_precision_highmotion - 1;
+ assert(th_idx >= 0 && th_idx < 2);
+ if (frame_lowmotion > 0 && frame_lowmotion < 40)
+ mv_thresh = 12;
+ else
+ mv_thresh = (bsize >= BLOCK_32X32) ? th_vals[th_idx][0]
+ : (bsize >= BLOCK_16X16) ? th_vals[th_idx][1]
+ : th_vals[th_idx][2];
+ if (abs(mv->as_fullmv.row) >= (mv_thresh << 1) ||
+ abs(mv->as_fullmv.col) >= (mv_thresh << 1))
+ return FULL_PEL;
+ else if (abs(mv->as_fullmv.row) >= mv_thresh ||
+ abs(mv->as_fullmv.col) >= mv_thresh)
+ return HALF_PEL;
+ }
+  // Reduce MV precision for relatively static (e.g., background),
+  // low-complexity large areas.
+ if (cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex >= 2) {
+ const int qband = x->qindex >> (QINDEX_BITS - 2);
+ assert(qband < 4);
+ if (x->content_state_sb.source_sad_nonrd <= kVeryLowSad &&
+ bsize > BLOCK_16X16 && qband != 0) {
+ if (x->source_variance < 500)
+ return FULL_PEL;
+ else if (x->source_variance < 5000)
+ return HALF_PEL;
+ }
+ } else if (cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex >= 1) {
+ if (fullpel_performed_well && ref_mv.row == 0 && ref_mv.col == 0 &&
+ start_mv.row == 0 && start_mv.col == 0)
+ return HALF_PEL;
+ }
+ return cpi->sf.mv_sf.subpel_force_stop;
+}
+
+static bool use_aggressive_subpel_search_method(MACROBLOCK *x,
+ bool use_adaptive_subpel_search,
+ bool fullpel_performed_well) {
+ if (!use_adaptive_subpel_search) return false;
+ const int qband = x->qindex >> (QINDEX_BITS - 2);
+ assert(qband < 4);
+ if ((qband > 0) && (fullpel_performed_well ||
+ (x->content_state_sb.source_sad_nonrd <= kLowSad) ||
+ (x->source_variance < 100)))
+ return true;
+ return false;
+}
+
+/*!\brief Runs Motion Estimation for a specific block and specific ref frame.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Finds the best Motion Vector by running Motion Estimation for a specific
+ * block and a specific reference frame. Exits early if the RD cost of the
+ * full-pel part exceeds the best RD cost found so far.
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the
+ * data for the current macroblock
+ * \param[in] bsize Current block size
+ * \param[out] tmp_mv Pointer to best found New MV
+ * \param[out] rate_mv Pointer to Rate of the best new MV
+ * \param[in] best_rd_sofar RD Cost of the best mode found so far
+ * \param[in] use_base_mv Flag, indicating that tmp_mv holds
+ * specific MV to start the search with
+ *
+ * \return Returns 0 if ME was terminated after the full-pel search because
+ * the RD cost was too high; otherwise returns 1. The best new MV is placed
+ * into \c tmp_mv, and the rate estimate for this vector is placed into
+ * \c rate_mv.
+ */
+static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int_mv *tmp_mv,
+ int *rate_mv, int64_t best_rd_sofar,
+ int use_base_mv) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const AV1_COMMON *cm = &cpi->common;
+ const SPEED_FEATURES *sf = &cpi->sf;
+ MB_MODE_INFO *mi = xd->mi[0];
+ int step_param = (sf->rt_sf.fullpel_search_step_param)
+ ? sf->rt_sf.fullpel_search_step_param
+ : cpi->mv_search_params.mv_step_param;
+ FULLPEL_MV start_mv;
+ const int ref = mi->ref_frame[0];
+ const MV ref_mv = av1_get_ref_mv(x, mi->ref_mv_idx).as_mv;
+ MV center_mv;
+ int dis;
+ int rv = 0;
+ int cost_list[5];
+ int search_subpel = 1;
+
+ start_mv = get_fullmv_from_mv(&ref_mv);
+
+ if (!use_base_mv)
+ center_mv = ref_mv;
+ else
+ center_mv = tmp_mv->as_mv;
+
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize);
+ const search_site_config *src_search_sites =
+ av1_get_search_site_config(cpi, x, search_method);
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ FULLPEL_MV_STATS best_mv_stats;
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
+ start_mv, src_search_sites, search_method,
+ /*fine_search_interval=*/0);
+
+ const unsigned int full_var_rd = av1_full_pixel_search(
+ start_mv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list),
+ &tmp_mv->as_fullmv, &best_mv_stats, NULL);
+
+  // Calculate the bit cost of the motion vector.
+ MV mvp_full = get_mv_from_fullmv(&tmp_mv->as_fullmv);
+
+ *rate_mv = av1_mv_bit_cost(&mvp_full, &ref_mv, x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+
+ // TODO(kyslov) Account for Rate Mode!
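+  // Proceed to subpel refinement only if the MV rate alone does not already
+  // push the RD cost above the best RD cost found so far.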
+ rv = !(RDCOST(x->rdmult, (*rate_mv), 0) > best_rd_sofar);
+
+ if (rv && search_subpel) {
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv,
+ cost_list);
+ const bool fullpel_performed_well =
+ (bsize == BLOCK_64X64 && full_var_rd * 40 < 62267 * 7) ||
+ (bsize == BLOCK_32X32 && full_var_rd * 8 < 42380) ||
+ (bsize == BLOCK_16X16 && full_var_rd * 8 < 10127);
+ if (sf->rt_sf.reduce_mv_pel_precision_highmotion ||
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex)
+ ms_params.forced_stop = subpel_select(cpi, x, bsize, tmp_mv, ref_mv,
+ start_mv, fullpel_performed_well);
+
+ MV subpel_start_mv = get_mv_from_fullmv(&tmp_mv->as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+    // Adaptively downgrade the subpel search method based on block
+    // properties.
+ if (use_aggressive_subpel_search_method(
+ x, sf->rt_sf.use_adaptive_subpel_search, fullpel_performed_well))
+ av1_find_best_sub_pixel_tree_pruned_more(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &tmp_mv->as_mv,
+ &dis, &x->pred_sse[ref], NULL);
+ else
+ cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &tmp_mv->as_mv,
+ &dis, &x->pred_sse[ref], NULL);
+ *rate_mv =
+ av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ }
+  // The final MV cannot be equal to the reference MV, as this would trigger
+  // an assert later. This can happen if both NEAREST and NEAR modes were
+  // skipped.
+ rv = (tmp_mv->as_mv.col != ref_mv.col || tmp_mv->as_mv.row != ref_mv.row);
+ return rv;
+}
+
+/*!\brief Searches for the best New Motion Vector.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Finds the best Motion Vector by doing Motion Estimation. Uses reduced
+ * complexity ME for non-LAST frames or calls \c combined_motion_search
+ * for LAST reference frame
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the
+ * data for the current macroblock
+ * \param[in] frame_mv Array that holds MVs for all modes
+ * and ref frames
+ * \param[in] ref_frame Reference frame for which to find
+ * the best New MVs
+ * \param[in] gf_temporal_ref Flag, indicating temporal reference
+ * for GOLDEN frame
+ * \param[in] bsize Current block size
+ * \param[in] mi_row Row index in 4x4 units
+ * \param[in] mi_col Column index in 4x4 units
+ * \param[out] rate_mv Pointer to Rate of the best new MV
+ * \param[in] best_rdc Pointer to the RD Cost for the best
+ * mode found so far
+ *
+ * \return Returns -1 if the search was not done, otherwise returns 0.
+ * The best new MV is placed into the \c frame_mv array, and the rate
+ * estimate for this vector is placed into \c rate_mv.
+ */
+static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x,
+ int_mv frame_mv[][REF_FRAMES],
+ MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref,
+ BLOCK_SIZE bsize, int mi_row, int mi_col, int *rate_mv,
+ RD_STATS *best_rdc) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ AV1_COMMON *cm = &cpi->common;
+ int_mv *this_ref_frm_newmv = &frame_mv[NEWMV][ref_frame];
+ unsigned int y_sad_zero;
+ if (ref_frame > LAST_FRAME && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+ gf_temporal_ref) {
+ int tmp_sad;
+ int dis;
+
+ if (bsize < BLOCK_16X16) return -1;
+
+ int me_search_size_col = block_size_wide[bsize] >> 1;
+ int me_search_size_row = block_size_high[bsize] >> 1;
+ tmp_sad = av1_int_pro_motion_estimation(
+ cpi, x, bsize, mi_row, mi_col,
+ &x->mbmi_ext.ref_mv_stack[ref_frame][0].this_mv.as_mv, &y_sad_zero,
+ me_search_size_col, me_search_size_row);
+
+ if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1;
+
+ this_ref_frm_newmv->as_int = mi->mv[0].as_int;
+ int_mv best_mv = mi->mv[0];
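+    // MVs are stored in 1/8-pel units; shift down to full-pel precision to
+    // form the subpel search starting point.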
+ best_mv.as_mv.row >>= 3;
+ best_mv.as_mv.col >>= 3;
+ MV ref_mv = av1_get_ref_mv(x, 0).as_mv;
+ this_ref_frm_newmv->as_mv.row >>= 3;
+ this_ref_frm_newmv->as_mv.col >>= 3;
+
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, NULL);
+ if (cpi->sf.rt_sf.reduce_mv_pel_precision_highmotion ||
+ cpi->sf.rt_sf.reduce_mv_pel_precision_lowcomplex) {
+ FULLPEL_MV start_mv = { .row = 0, .col = 0 };
+ ms_params.forced_stop =
+ subpel_select(cpi, x, bsize, &best_mv, ref_mv, start_mv, false);
+ }
+ MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, start_mv));
+ cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, start_mv, NULL, &best_mv.as_mv, &dis,
+ &x->pred_sse[ref_frame], NULL);
+ this_ref_frm_newmv->as_int = best_mv.as_int;
+
+    // When NEWMV is the same as the ref_mv from the DRL, it is preferable to
+    // code the MV as NEARESTMV or NEARMV. In this case, NEWMV needs to be
+    // skipped to avoid an assert failure at a later stage. The scenario can
+    // occur if NEARESTMV was not evaluated for ALTREF.
+ if (this_ref_frm_newmv->as_mv.col == ref_mv.col &&
+ this_ref_frm_newmv->as_mv.row == ref_mv.row)
+ return -1;
+
+ *rate_mv = av1_mv_bit_cost(&this_ref_frm_newmv->as_mv, &ref_mv,
+ x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ } else if (!combined_motion_search(cpi, x, bsize, &frame_mv[NEWMV][ref_frame],
+ rate_mv, best_rdc->rdcost, 0)) {
+ return -1;
+ }
+
+ return 0;
+}
+
+static void estimate_single_ref_frame_costs(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd,
+ const ModeCosts *mode_costs,
+ int segment_id, BLOCK_SIZE bsize,
+ unsigned int *ref_costs_single) {
+ int seg_ref_active =
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+ if (seg_ref_active) {
+ memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single));
+ } else {
+ int intra_inter_ctx = av1_get_intra_inter_context(xd);
+ ref_costs_single[INTRA_FRAME] =
+ mode_costs->intra_inter_cost[intra_inter_ctx][0];
+ unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1];
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+ is_comp_ref_allowed(bsize)) {
+ const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd);
+ base_cost += mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1];
+ }
+ ref_costs_single[LAST_FRAME] = base_cost;
+ ref_costs_single[GOLDEN_FRAME] = base_cost;
+ ref_costs_single[ALTREF_FRAME] = base_cost;
+ // add cost for last, golden, altref
+ ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[0][0][0];
+ ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[0][0][1];
+ ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[0][1][0];
+ ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[0][0][1];
+ ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[0][2][0];
+ }
+}
+
+static INLINE void set_force_skip_flag(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, unsigned int sse,
+ int *force_skip) {
+  if (x->txfm_search_params.tx_mode_search_type == TX_MODE_SELECT &&
+      cpi->sf.rt_sf.tx_size_level_based_on_qstep >= 2) {
+ const int qstep = x->plane[AOM_PLANE_Y].dequant_QTX[1] >> (x->e_mbd.bd - 5);
+ const unsigned int qstep_sq = qstep * qstep;
+ // If the sse is low for low source variance blocks, mark those as
+ // transform skip.
+ // Note: Though qstep_sq is based on ac qstep, the threshold is kept
+ // low so that reliable early estimate of tx skip can be obtained
+ // through its comparison with sse.
+ if (sse < qstep_sq && x->source_variance < qstep_sq &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0)
+ *force_skip = 1;
+ }
+}
+
+#define CAP_TX_SIZE_FOR_BSIZE_GT32(tx_mode_search_type, bsize) \
+  ((tx_mode_search_type) != ONLY_4X4 && (bsize) > BLOCK_32X32)
+#define TX_SIZE_FOR_BSIZE_GT32 (TX_16X16)
+
+static TX_SIZE calculate_tx_size(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *const x, unsigned int var,
+ unsigned int sse, int *force_skip) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TX_SIZE tx_size;
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ if (txfm_params->tx_mode_search_type == TX_MODE_SELECT) {
+ int multiplier = 8;
+ unsigned int var_thresh = 0;
+ unsigned int is_high_var = 1;
+ // Use quantizer based thresholds to determine transform size.
+ if (cpi->sf.rt_sf.tx_size_level_based_on_qstep) {
+ const int qband = x->qindex >> (QINDEX_BITS - 2);
+ const int mult[4] = { 8, 7, 6, 5 };
+ assert(qband < 4);
+ multiplier = mult[qband];
+ const int qstep = x->plane[AOM_PLANE_Y].dequant_QTX[1] >> (xd->bd - 5);
+ const unsigned int qstep_sq = qstep * qstep;
+ var_thresh = qstep_sq * 2;
+ if (cpi->sf.rt_sf.tx_size_level_based_on_qstep >= 2) {
+ // If the sse is low for low source variance blocks, mark those as
+ // transform skip.
+ // Note: Though qstep_sq is based on ac qstep, the threshold is kept
+ // low so that reliable early estimate of tx skip can be obtained
+ // through its comparison with sse.
+ if (sse < qstep_sq && x->source_variance < qstep_sq &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 0)
+ *force_skip = 1;
+ // Further lower transform size based on aq mode only if residual
+ // variance is high.
+ is_high_var = (var >= var_thresh);
+ }
+ }
+ // Choose larger transform size for blocks where dc component is dominant or
+ // the ac component is low.
+ if (sse > ((var * multiplier) >> 2) || (var < var_thresh))
+ tx_size =
+ AOMMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
+ else
+ tx_size = TX_8X8;
+
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && is_high_var)
+ tx_size = TX_8X8;
+ else if (tx_size > TX_16X16)
+ tx_size = TX_16X16;
+ } else {
+ tx_size =
+ AOMMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
+ }
+
+ if (CAP_TX_SIZE_FOR_BSIZE_GT32(txfm_params->tx_mode_search_type, bsize))
+ tx_size = TX_SIZE_FOR_BSIZE_GT32;
+
+ return AOMMIN(tx_size, TX_16X16);
+}
+
+static void block_variance(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int w, int h,
+ unsigned int *sse, int *sum, int block_size,
+ uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) {
+ int k = 0;
+ *sse = 0;
+ *sum = 0;
+
+  // This function is called for block sizes >= BLOCK_32x32. As per the
+  // design, aom_get_var_sse_sum_8x8_quad() processes four 8x8 blocks (an
+  // 8x32 region, i.e. 8 rows by 32 columns) per call. Hence the width and
+  // height of the block need to be at least 32 and 8 samples respectively.
+ assert(w >= 32);
+ assert(h >= 8);
+ for (int row = 0; row < h; row += block_size) {
+ for (int col = 0; col < w; col += 32) {
+ aom_get_var_sse_sum_8x8_quad(src + src_stride * row + col, src_stride,
+ ref + ref_stride * row + col, ref_stride,
+ &sse8x8[k], &sum8x8[k], sse, sum,
+ &var8x8[k]);
+ k += 4;
+ }
+ }
+}
+
+static void block_variance_16x16_dual(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int w,
+ int h, unsigned int *sse, int *sum,
+ int block_size, uint32_t *sse16x16,
+ uint32_t *var16x16) {
+ int k = 0;
+ *sse = 0;
+ *sum = 0;
+  // This function is called for block sizes >= BLOCK_32x32. As per the
+  // design, aom_get_var_sse_sum_16x16_dual() processes two 16x16 blocks (a
+  // 16x32 region, i.e. 16 rows by 32 columns) per call. Hence the width and
+  // height of the block need to be at least 32 and 16 samples respectively.
+ assert(w >= 32);
+ assert(h >= 16);
+ for (int row = 0; row < h; row += block_size) {
+ for (int col = 0; col < w; col += 32) {
+ aom_get_var_sse_sum_16x16_dual(src + src_stride * row + col, src_stride,
+ ref + ref_stride * row + col, ref_stride,
+ &sse16x16[k], sse, sum, &var16x16[k]);
+ k += 2;
+ }
+ }
+}
+
+static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
+ unsigned int *sse_i, int *sum_i,
+ unsigned int *var_o, unsigned int *sse_o,
+ int *sum_o) {
+ const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size];
+ const int nw = 1 << (bw - b_width_log2_lookup[unit_size]);
+ const int nh = 1 << (bh - b_height_log2_lookup[unit_size]);
+ int row, col, k = 0;
+
+ for (row = 0; row < nh; row += 2) {
+ for (col = 0; col < nw; col += 2) {
+ sse_o[k] = sse_i[row * nw + col] + sse_i[row * nw + col + 1] +
+ sse_i[(row + 1) * nw + col] + sse_i[(row + 1) * nw + col + 1];
+ sum_o[k] = sum_i[row * nw + col] + sum_i[row * nw + col + 1] +
+ sum_i[(row + 1) * nw + col] + sum_i[(row + 1) * nw + col + 1];
+ var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >>
+ (b_width_log2_lookup[unit_size] +
+ b_height_log2_lookup[unit_size] + 6));
+ k++;
+ }
+ }
+}
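+
+// The var_o computation above is the unnormalized form of the identity
+// var = E[x^2] - E[x]^2: the merged unit covers
+// n = 2^(b_width_log2 + b_height_log2 + 6) pixels (a 2x2 group of unit_size
+// blocks of (4 << log2) samples per side), so sum_o^2 >> log2(n) is
+// sum_o^2 / n. For TX_8X8 units, for example, the merged block is 16x16 and
+// the shift is 1 + 1 + 6 = 8.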
+
+// Returns a factor used to scale ac_thr based on speed, width, height and
+// the normalized sum.
+static int ac_thr_factor(int speed, int width, int height, int norm_sum) {
+ if (speed >= 8 && norm_sum < 5) {
+ if (width <= 640 && height <= 480)
+ return 4;
+ else
+ return 2;
+ }
+ return 1;
+}
+
+// Sets early_term flag based on chroma planes prediction
+static INLINE void set_early_term_based_on_uv_plane(
+ AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MACROBLOCKD *xd, int mi_row,
+ int mi_col, int *early_term, int num_blk, const unsigned int *sse_tx,
+ const unsigned int *var_tx, int sum, unsigned int var, unsigned int sse) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ const uint32_t dc_quant = p->dequant_QTX[0];
+ const uint32_t ac_quant = p->dequant_QTX[1];
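+  // Transform coefficients in this pipeline are scaled up by 8 relative to
+  // an orthogonal transform (see the note in the model_skip functions below),
+  // so the effective quantizer step is dequant / 8 and its square is
+  // dequant^2 / 64, hence the >> 6 in the thresholds below.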
+ int64_t dc_thr = dc_quant * dc_quant >> 6;
+ int64_t ac_thr = ac_quant * ac_quant >> 6;
+ const int bw = b_width_log2_lookup[bsize];
+ const int bh = b_height_log2_lookup[bsize];
+ int ac_test = 1;
+ int dc_test = 1;
+ const int norm_sum = abs(sum) >> (bw + bh);
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+ cpi->oxcf.speed > 5)
+ ac_thr = av1_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level,
+ norm_sum, cpi->svc.temporal_layer_id);
+ else
+ ac_thr *= ac_thr_factor(cpi->oxcf.speed, cm->width, cm->height, norm_sum);
+#else
+ ac_thr *= ac_thr_factor(cpi->oxcf.speed, cm->width, cm->height, norm_sum);
+#endif
+
+ if (cpi->sf.rt_sf.increase_source_sad_thresh) {
+ dc_thr = dc_thr << 1;
+ ac_thr = ac_thr << 2;
+ }
+
+ for (int k = 0; k < num_blk; k++) {
+ // Check if all ac coefficients can be quantized to zero.
+ if (!(var_tx[k] < ac_thr || var == 0)) {
+ ac_test = 0;
+ break;
+ }
+ // Check if dc coefficient can be quantized to zero.
+ if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) {
+ dc_test = 0;
+ break;
+ }
+ }
+
+ // Check if chroma can be skipped based on ac and dc test flags.
+ if (ac_test && dc_test) {
+ int skip_uv[2] = { 0 };
+ unsigned int var_uv[2];
+ unsigned int sse_uv[2];
+ // Transform skipping test in UV planes.
+ for (int plane = AOM_PLANE_U; plane <= AOM_PLANE_V; plane++) {
+ int j = plane - 1;
+ skip_uv[j] = 1;
+ if (x->color_sensitivity[COLOR_SENS_IDX(plane)]) {
+ skip_uv[j] = 0;
+ struct macroblock_plane *const puv = &x->plane[plane];
+ struct macroblockd_plane *const puvd = &xd->plane[plane];
+ const BLOCK_SIZE uv_bsize = get_plane_block_size(
+ bsize, puvd->subsampling_x, puvd->subsampling_y);
+ // Adjust these thresholds for UV.
+ const int shift_ac = cpi->sf.rt_sf.increase_source_sad_thresh ? 5 : 3;
+ const int shift_dc = cpi->sf.rt_sf.increase_source_sad_thresh ? 4 : 3;
+ const int64_t uv_dc_thr =
+ (puv->dequant_QTX[0] * puv->dequant_QTX[0]) >> shift_dc;
+ const int64_t uv_ac_thr =
+ (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> shift_ac;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ plane, plane);
+ var_uv[j] = cpi->ppi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride,
+ puvd->dst.buf,
+ puvd->dst.stride, &sse_uv[j]);
+ if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
+ (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
+ skip_uv[j] = 1;
+ else
+ break;
+ }
+ }
+ if (skip_uv[0] & skip_uv[1]) {
+ *early_term = 1;
+ }
+ }
+}
+
+static INLINE void calc_rate_dist_block_param(AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats,
+ int calculate_rd, int *early_term,
+ BLOCK_SIZE bsize,
+ unsigned int sse) {
+ if (calculate_rd) {
+ if (!*early_term) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+
+ model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, rd_stats->sse, bw * bh,
+ &rd_stats->rate, &rd_stats->dist);
+ }
+
+ if (*early_term) {
+ rd_stats->rate = 0;
+ rd_stats->dist = sse << 4;
+ }
+ }
+}
+
+static void model_skip_for_sb_y_large_64(AV1_COMP *cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, MACROBLOCK *x,
+ MACROBLOCKD *xd, RD_STATS *rd_stats,
+ int *early_term, int calculate_rd,
+ int64_t best_sse,
+ unsigned int *var_output,
+ unsigned int var_prune_threshold) {
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ unsigned int sse;
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ int test_skip = 1;
+ unsigned int var;
+ int sum;
+ const int bw = b_width_log2_lookup[bsize];
+ const int bh = b_height_log2_lookup[bsize];
+ unsigned int sse16x16[64] = { 0 };
+ unsigned int var16x16[64] = { 0 };
+ assert(xd->mi[0]->tx_size == TX_16X16);
+ assert(bsize > BLOCK_32X32);
+
+ // Calculate variance for whole partition, and also save 16x16 blocks'
+ // variance to be used in following transform skipping test.
+ block_variance_16x16_dual(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, 4 << bw, 4 << bh, &sse, &sum, 16,
+ sse16x16, var16x16);
+
+ var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
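+  // In the line above, (bw + bh + 4) is log2 of the pixel count: the
+  // partition is (4 << bw) x (4 << bh) samples, so the subtracted term is
+  // sum^2 / n.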
+ if (var_output) {
+ *var_output = var;
+ if (*var_output > var_prune_threshold) {
+ return;
+ }
+ }
+
+ rd_stats->sse = sse;
+ // Skipping test
+ *early_term = 0;
+ set_force_skip_flag(cpi, x, sse, early_term);
+  // The skip-flag logic below assumes a transform size of at least 8x8; that
+  // holds here because the transform size is fixed at 16x16 (asserted above).
+ MB_MODE_INFO *const mi = xd->mi[0];
+ if (!calculate_rd && cpi->sf.rt_sf.sse_early_term_inter_search &&
+ early_term_inter_search_with_sse(
+ cpi->sf.rt_sf.sse_early_term_inter_search, bsize, sse, best_sse,
+ mi->mode))
+ test_skip = 0;
+
+ if (*early_term) test_skip = 0;
+
+ // Evaluate if the partition block is a skippable block in Y plane.
+ if (test_skip) {
+ const unsigned int *sse_tx = sse16x16;
+ const unsigned int *var_tx = var16x16;
+ const unsigned int num_block = (1 << (bw + bh - 2)) >> 2;
+ set_early_term_based_on_uv_plane(cpi, x, bsize, xd, mi_row, mi_col,
+ early_term, num_block, sse_tx, var_tx, sum,
+ var, sse);
+ }
+ calc_rate_dist_block_param(cpi, x, rd_stats, calculate_rd, early_term, bsize,
+ sse);
+}
+
+static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, MACROBLOCK *x,
+ MACROBLOCKD *xd, RD_STATS *rd_stats,
+ int *early_term, int calculate_rd,
+ int64_t best_sse,
+ unsigned int *var_output,
+ unsigned int var_prune_threshold) {
+ if (x->force_zeromv_skip_for_blk) {
+ *early_term = 1;
+ rd_stats->rate = 0;
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ return;
+ }
+
+ // For block sizes greater than 32x32, the transform size is always 16x16.
+ // This function avoids calling calculate_variance() for tx_size 16x16 cases
+ // by directly populating variance at tx_size level from
+ // block_variance_16x16_dual() function.
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ if (CAP_TX_SIZE_FOR_BSIZE_GT32(txfm_params->tx_mode_search_type, bsize)) {
+ xd->mi[0]->tx_size = TX_SIZE_FOR_BSIZE_GT32;
+ model_skip_for_sb_y_large_64(cpi, bsize, mi_row, mi_col, x, xd, rd_stats,
+ early_term, calculate_rd, best_sse, var_output,
+ var_prune_threshold);
+ return;
+ }
+
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ unsigned int sse;
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ int test_skip = 1;
+ unsigned int var;
+ int sum;
+
+ const int bw = b_width_log2_lookup[bsize];
+ const int bh = b_height_log2_lookup[bsize];
+ unsigned int sse8x8[256] = { 0 };
+ int sum8x8[256] = { 0 };
+ unsigned int var8x8[256] = { 0 };
+ TX_SIZE tx_size;
+
+ // Calculate variance for whole partition, and also save 8x8 blocks' variance
+ // to be used in following transform skipping test.
+ block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
+ var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
+ if (var_output) {
+ *var_output = var;
+ if (*var_output > var_prune_threshold) {
+ return;
+ }
+ }
+
+ rd_stats->sse = sse;
+ // Skipping test
+ *early_term = 0;
+ tx_size = calculate_tx_size(cpi, bsize, x, var, sse, early_term);
+ assert(tx_size <= TX_16X16);
+ // The code below for setting skip flag assumes transform size of at least
+ // 8x8, so force this lower limit on transform.
+ if (tx_size < TX_8X8) tx_size = TX_8X8;
+ xd->mi[0]->tx_size = tx_size;
+
+ MB_MODE_INFO *const mi = xd->mi[0];
+ if (!calculate_rd && cpi->sf.rt_sf.sse_early_term_inter_search &&
+ early_term_inter_search_with_sse(
+ cpi->sf.rt_sf.sse_early_term_inter_search, bsize, sse, best_sse,
+ mi->mode))
+ test_skip = 0;
+
+ if (*early_term) test_skip = 0;
+
+ // Evaluate if the partition block is a skippable block in Y plane.
+ if (test_skip) {
+ unsigned int sse16x16[64] = { 0 };
+ int sum16x16[64] = { 0 };
+ unsigned int var16x16[64] = { 0 };
+ const unsigned int *sse_tx = sse8x8;
+ const unsigned int *var_tx = var8x8;
+ unsigned int num_blks = 1 << (bw + bh - 2);
+
+ if (tx_size >= TX_16X16) {
+ calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16,
+ sum16x16);
+ sse_tx = sse16x16;
+ var_tx = var16x16;
+ num_blks = num_blks >> 2;
+ }
+ set_early_term_based_on_uv_plane(cpi, x, bsize, xd, mi_row, mi_col,
+ early_term, num_blks, sse_tx, var_tx, sum,
+ var, sse);
+ }
+ calc_rate_dist_block_param(cpi, x, rd_stats, calculate_rd, early_term, bsize,
+ sse);
+}
+
+static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ RD_STATS *rd_stats, unsigned int *var_out,
+ int calculate_rd, int *early_term) {
+ if (x->force_zeromv_skip_for_blk && early_term != NULL) {
+ *early_term = 1;
+ rd_stats->rate = 0;
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ }
+
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ assert(bsize < BLOCK_SIZES_ALL);
+
+ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+
+ unsigned int var = cpi->ppi->fn_ptr[bsize].vf(
+ p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse);
+ int force_skip = 0;
+ xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse, &force_skip);
+ if (var_out) {
+ *var_out = var;
+ }
+
+ if (calculate_rd && (!force_skip || ref == INTRA_FRAME)) {
+ const int bwide = block_size_wide[bsize];
+ const int bhigh = block_size_high[bsize];
+ model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh, &rate,
+ &dist);
+ } else {
+ rate = INT_MAX; // this will be overwritten later with av1_block_yrd
+ dist = INT_MAX;
+ }
+ rd_stats->sse = sse;
+ x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ if (force_skip && ref > INTRA_FRAME) {
+ rate = 0;
+ dist = (int64_t)sse << 4;
+ }
+
+ assert(rate >= 0);
+
+ rd_stats->skip_txfm = (rate == 0);
+ rate = AOMMIN(rate, INT_MAX);
+ rd_stats->rate = rate;
+ rd_stats->dist = dist;
+}
+
+static INLINE int get_drl_cost(PREDICTION_MODE this_mode, int ref_mv_idx,
+ const MB_MODE_INFO_EXT *mbmi_ext,
+ const int (*const drl_mode_cost0)[2],
+ int8_t ref_frame_type) {
+ int cost = 0;
+ if (this_mode == NEWMV || this_mode == NEW_NEWMV) {
+ for (int idx = 0; idx < 2; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+ cost += drl_mode_cost0[drl_ctx][ref_mv_idx != idx];
+ if (ref_mv_idx == idx) return cost;
+ }
+ }
+ return cost;
+ }
+
+ if (have_nearmv_in_inter_mode(this_mode)) {
+ for (int idx = 1; idx < 3; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+ cost += drl_mode_cost0[drl_ctx][ref_mv_idx != (idx - 1)];
+ if (ref_mv_idx == (idx - 1)) return cost;
+ }
+ }
+ return cost;
+ }
+ return cost;
+}
+
+static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode,
+ int16_t mode_context) {
+ if (is_inter_compound_mode(mode)) {
+ return mode_costs
+ ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
+ }
+
+ int mode_cost = 0;
+ int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+
+ assert(is_inter_mode(mode));
+
+ if (mode == NEWMV) {
+ mode_cost = mode_costs->newmv_mode_cost[mode_ctx][0];
+ return mode_cost;
+ } else {
+ mode_cost = mode_costs->newmv_mode_cost[mode_ctx][1];
+ mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+
+ if (mode == GLOBALMV) {
+ mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][0];
+ return mode_cost;
+ } else {
+ mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][1];
+ mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ mode_cost += mode_costs->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
+ return mode_cost;
+ }
+ }
+}
+
+static void newmv_diff_bias(MACROBLOCKD *xd, PREDICTION_MODE this_mode,
+ RD_STATS *this_rdc, BLOCK_SIZE bsize, int mv_row,
+ int mv_col, int speed, uint32_t spatial_variance,
+ CONTENT_STATE_SB content_state_sb) {
+ // Bias against MVs associated with NEWMV mode that are very different from
+ // top/left neighbors.
+ if (this_mode == NEWMV) {
+ int al_mv_average_row;
+ int al_mv_average_col;
+ int row_diff, col_diff;
+ int above_mv_valid = 0;
+ int left_mv_valid = 0;
+ int above_row = INVALID_MV_ROW_COL, above_col = INVALID_MV_ROW_COL;
+ int left_row = INVALID_MV_ROW_COL, left_col = INVALID_MV_ROW_COL;
+ if (bsize >= BLOCK_64X64 && content_state_sb.source_sad_nonrd != kHighSad &&
+ spatial_variance < 300 &&
+ (mv_row > 16 || mv_row < -16 || mv_col > 16 || mv_col < -16)) {
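+      // Quadruple the RD cost (<< 2) to strongly bias against large NEWMVs
+      // on big, low-variance blocks.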
+ this_rdc->rdcost = this_rdc->rdcost << 2;
+ return;
+ }
+ if (xd->above_mbmi) {
+ above_mv_valid = xd->above_mbmi->mv[0].as_int != INVALID_MV;
+ above_row = xd->above_mbmi->mv[0].as_mv.row;
+ above_col = xd->above_mbmi->mv[0].as_mv.col;
+ }
+ if (xd->left_mbmi) {
+ left_mv_valid = xd->left_mbmi->mv[0].as_int != INVALID_MV;
+ left_row = xd->left_mbmi->mv[0].as_mv.row;
+ left_col = xd->left_mbmi->mv[0].as_mv.col;
+ }
+ if (above_mv_valid && left_mv_valid) {
+ al_mv_average_row = (above_row + left_row + 1) >> 1;
+ al_mv_average_col = (above_col + left_col + 1) >> 1;
+ } else if (above_mv_valid) {
+ al_mv_average_row = above_row;
+ al_mv_average_col = above_col;
+ } else if (left_mv_valid) {
+ al_mv_average_row = left_row;
+ al_mv_average_col = left_col;
+ } else {
+ al_mv_average_row = al_mv_average_col = 0;
+ }
+ row_diff = al_mv_average_row - mv_row;
+ col_diff = al_mv_average_col - mv_col;
+ if (row_diff > 80 || row_diff < -80 || col_diff > 80 || col_diff < -80) {
+ if (bsize >= BLOCK_32X32)
+ this_rdc->rdcost = this_rdc->rdcost << 1;
+ else
+ this_rdc->rdcost = 5 * this_rdc->rdcost >> 2;
+ }
+ } else {
+    // For speed >= 8, bias against large MVs when spatial variance is low.
+ if (speed >= 8 && spatial_variance < 150 &&
+ (mv_row > 64 || mv_row < -64 || mv_col > 64 || mv_col < -64))
+ this_rdc->rdcost = 5 * this_rdc->rdcost >> 2;
+ }
+}
+
+static INLINE void update_thresh_freq_fact(AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize,
+ MV_REFERENCE_FRAME ref_frame,
+ THR_MODES best_mode_idx,
+ PREDICTION_MODE mode) {
+ const THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
+ const BLOCK_SIZE min_size = AOMMAX(bsize - 3, BLOCK_4X4);
+ const BLOCK_SIZE max_size = AOMMIN(bsize + 6, BLOCK_128X128);
+ for (BLOCK_SIZE bs = min_size; bs <= max_size; bs += 3) {
+ int *freq_fact = &x->thresh_freq_fact[bs][thr_mode_idx];
+ if (thr_mode_idx == best_mode_idx) {
+ *freq_fact -= (*freq_fact >> 4);
+ } else {
+ *freq_fact =
+ AOMMIN(*freq_fact + RD_THRESH_INC,
+ cpi->sf.inter_sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+ }
+ }
+}
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+static void av1_pickmode_ctx_den_update(
+ AV1_PICKMODE_CTX_DEN *ctx_den, int64_t zero_last_cost_orig,
+ unsigned int ref_frame_cost[REF_FRAMES],
+ int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], int reuse_inter_pred,
+ BEST_PICKMODE *bp) {
+ ctx_den->zero_last_cost_orig = zero_last_cost_orig;
+ ctx_den->ref_frame_cost = ref_frame_cost;
+ ctx_den->frame_mv = frame_mv;
+ ctx_den->reuse_inter_pred = reuse_inter_pred;
+ ctx_den->best_tx_size = bp->best_tx_size;
+ ctx_den->best_mode = bp->best_mode;
+ ctx_den->best_ref_frame = bp->best_ref_frame;
+ ctx_den->best_pred_filter = bp->best_pred_filter;
+ ctx_den->best_mode_skip_txfm = bp->best_mode_skip_txfm;
+}
+
+static void recheck_zeromv_after_denoising(
+ AV1_COMP *cpi, MB_MODE_INFO *const mi, MACROBLOCK *x, MACROBLOCKD *const xd,
+ AV1_DENOISER_DECISION decision, AV1_PICKMODE_CTX_DEN *ctx_den,
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE], RD_STATS *best_rdc,
+ BEST_PICKMODE *best_pickmode, BLOCK_SIZE bsize, int mi_row, int mi_col) {
+ // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on
+ // denoised result. Only do this under noise conditions, and if rdcost of
+ // ZEROMV on original source is not significantly higher than rdcost of best
+ // mode.
+ if (cpi->noise_estimate.enabled && cpi->noise_estimate.level > kLow &&
+ ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) &&
+ ((ctx_den->best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) ||
+ (ctx_den->best_ref_frame == GOLDEN_FRAME &&
+ cpi->svc.number_spatial_layers == 1 &&
+ decision == FILTER_ZEROMV_BLOCK))) {
+ // Check if we should pick ZEROMV on denoised signal.
+ AV1_COMMON *const cm = &cpi->common;
+ RD_STATS this_rdc;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+
+ mi->mode = GLOBALMV;
+ mi->ref_frame[0] = LAST_FRAME;
+ mi->ref_frame[1] = NONE_FRAME;
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE_FRAME);
+ mi->mv[0].as_int = 0;
+ mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ xd->plane[AOM_PLANE_Y].pre[0] = yv12_mb[LAST_FRAME][AOM_PLANE_Y];
+ av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+ unsigned int var;
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc, &var, 1, NULL);
+
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
+ this_rdc.rate += cost_mv_ref(mode_costs, GLOBALMV, mode_ctx);
+
+ this_rdc.rate += ctx_den->ref_frame_cost[LAST_FRAME];
+ this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+ txfm_info->skip_txfm = this_rdc.skip_txfm;
+ // Don't switch to ZEROMV if the rdcost for ZEROMV on denoised source
+ // is higher than best_ref mode (on original source).
+ if (this_rdc.rdcost > best_rdc->rdcost) {
+ this_rdc = *best_rdc;
+ mi->mode = best_pickmode->best_mode;
+ mi->ref_frame[0] = best_pickmode->best_ref_frame;
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE_FRAME);
+ mi->interp_filters = best_pickmode->best_pred_filter;
+ if (best_pickmode->best_ref_frame == INTRA_FRAME) {
+ mi->mv[0].as_int = INVALID_MV;
+ } else {
+ mi->mv[0].as_int = ctx_den
+ ->frame_mv[best_pickmode->best_mode]
+ [best_pickmode->best_ref_frame]
+ .as_int;
+ if (ctx_den->reuse_inter_pred) {
+ xd->plane[AOM_PLANE_Y].pre[0] = yv12_mb[GOLDEN_FRAME][AOM_PLANE_Y];
+ av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+ }
+ }
+ mi->tx_size = best_pickmode->best_tx_size;
+ txfm_info->skip_txfm = best_pickmode->best_mode_skip_txfm;
+ } else {
+ ctx_den->best_ref_frame = LAST_FRAME;
+ *best_rdc = this_rdc;
+ }
+ }
+}
+#endif // CONFIG_AV1_TEMPORAL_DENOISING
+
+/*!\brief Searches for the best interpolation filter
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Iterates through a subset of the possible interpolation filters
+ * (EIGHTTAP_REGULAR, EIGHTTAP_SMOOTH, MULTITAP_SHARP, depending on
+ * FILTER_SEARCH_SIZE) and selects the one that gives the lowest RD cost.
+ * The RD cost is calculated using the curvfit model. Dual filters (different
+ * filters in the x & y directions) are supported if
+ * sf.interp_sf.disable_dual_filter = 0.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the
+ * data for the current macroblock
+ * \param[in] this_rdc Pointer to calculated RD Cost
+ * \param[in] inter_pred_params_sr Pointer to structure holding parameters of
+ inter prediction for single reference
+ * \param[in] mi_row Row index in 4x4 units
+ * \param[in] mi_col Column index in 4x4 units
+ * \param[in] tmp_buffer Pointer to a temporary buffer for
+ * prediction re-use
+ * \param[in] bsize Current block size
+ * \param[in] reuse_inter_pred Flag, indicating prediction re-use
+ * \param[out] this_mode_pred Pointer to store prediction buffer
+ * for prediction re-use
+ * \param[out] this_early_term Flag, indicating that transform can be
+ * skipped
+ * \param[out] var The residue variance of the current
+ * predictor.
+ * \param[in] use_model_yrd_large Flag, indicating special logic to handle
+ * large blocks
+ * \param[in] best_sse Best sse so far.
+ * \param[in] is_single_pred Flag, indicating single mode.
+ *
+ * \remark Nothing is returned. Instead, the calculated RD cost is placed in
+ * \c this_rdc and the best filter is placed in \c mi->interp_filters. In case
+ * the \c reuse_inter_pred flag is set, this function also outputs
+ * \c this_mode_pred. Also \c this_early_term is set if the transform can be
+ * skipped.
+ */
+static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
+ InterPredParams *inter_pred_params_sr, int mi_row,
+ int mi_col, PRED_BUFFER *tmp_buffer,
+ BLOCK_SIZE bsize, int reuse_inter_pred,
+ PRED_BUFFER **this_mode_pred,
+ int *this_early_term, unsigned int *var,
+ int use_model_yrd_large, int64_t best_sse,
+ int is_single_pred) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const int bw = block_size_wide[bsize];
+ int dim_factor =
+ (cpi->sf.interp_sf.disable_dual_filter == 0) ? FILTER_SEARCH_SIZE : 1;
+ RD_STATS pf_rd_stats[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 };
+ TX_SIZE pf_tx_size[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 };
+ PRED_BUFFER *current_pred = *this_mode_pred;
+ int best_skip = 0;
+ int best_early_term = 0;
+ int64_t best_cost = INT64_MAX;
+ int best_filter_index = -1;
+
+ SubpelParams subpel_params;
+ // Initialize inter prediction params at mode level for single reference
+ // mode.
+ if (is_single_pred)
+ init_inter_mode_params(&mi->mv[0].as_mv, inter_pred_params_sr,
+ &subpel_params, xd->block_ref_scale_factors[0],
+ pd->pre->width, pd->pre->height);
+ for (int filter_idx = 0; filter_idx < FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE;
+ ++filter_idx) {
+ int64_t cost;
+ if (cpi->sf.interp_sf.disable_dual_filter &&
+ filters_ref_set[filter_idx].as_filters.x_filter !=
+ filters_ref_set[filter_idx].as_filters.y_filter)
+ continue;
+
+ mi->interp_filters.as_int = filters_ref_set[filter_idx].as_int;
+ if (is_single_pred)
+ av1_enc_build_inter_predictor_y_nonrd(xd, inter_pred_params_sr,
+ &subpel_params);
+ else
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ unsigned int curr_var = UINT_MAX;
+ if (use_model_yrd_large)
+ model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+ &pf_rd_stats[filter_idx], this_early_term, 1,
+ best_sse, &curr_var, UINT_MAX);
+ else
+ model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[filter_idx], &curr_var,
+ 1, NULL);
+ pf_rd_stats[filter_idx].rate += av1_get_switchable_rate(
+ x, xd, cm->features.interp_filter, cm->seq_params->enable_dual_filter);
+ cost = RDCOST(x->rdmult, pf_rd_stats[filter_idx].rate,
+ pf_rd_stats[filter_idx].dist);
+ pf_tx_size[filter_idx] = mi->tx_size;
+ if (cost < best_cost) {
+ *var = curr_var;
+ best_filter_index = filter_idx;
+ best_cost = cost;
+ best_skip = pf_rd_stats[filter_idx].skip_txfm;
+ best_early_term = *this_early_term;
+ if (reuse_inter_pred) {
+ if (*this_mode_pred != current_pred) {
+ free_pred_buffer(*this_mode_pred);
+ *this_mode_pred = current_pred;
+ }
+ current_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
+ pd->dst.buf = current_pred->data;
+ pd->dst.stride = bw;
+ }
+ }
+ }
+ assert(best_filter_index >= 0 &&
+ best_filter_index < dim_factor * FILTER_SEARCH_SIZE);
+ if (reuse_inter_pred && *this_mode_pred != current_pred)
+ free_pred_buffer(current_pred);
+
+ mi->interp_filters.as_int = filters_ref_set[best_filter_index].as_int;
+ mi->tx_size = pf_tx_size[best_filter_index];
+ this_rdc->rate = pf_rd_stats[best_filter_index].rate;
+ this_rdc->dist = pf_rd_stats[best_filter_index].dist;
+ this_rdc->sse = pf_rd_stats[best_filter_index].sse;
+ this_rdc->skip_txfm = (best_skip || best_early_term);
+ *this_early_term = best_early_term;
+ if (reuse_inter_pred) {
+ pd->dst.buf = (*this_mode_pred)->data;
+ pd->dst.stride = (*this_mode_pred)->stride;
+ } else if (best_filter_index < dim_factor * FILTER_SEARCH_SIZE - 1) {
+ if (is_single_pred)
+ av1_enc_build_inter_predictor_y_nonrd(xd, inter_pred_params_sr,
+ &subpel_params);
+ else
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ }
+}
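+
+// search_filter_ref() evaluates candidate (x_filter, y_filter) pairs from
+// filters_ref_set. When sf.interp_sf.disable_dual_filter is set, pairs with
+// differing x/y filters are skipped via the `continue` above, so only the
+// "equal" combinations are costed; the winner's rate includes the
+// switchable-filter signaling cost from av1_get_switchable_rate().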
+#if !CONFIG_REALTIME_ONLY
+
+static AOM_INLINE int is_warped_mode_allowed(const AV1_COMP *cpi,
+ MACROBLOCK *const x,
+ const MB_MODE_INFO *mbmi) {
+ const FeatureFlags *const features = &cpi->common.features;
+ const MACROBLOCKD *xd = &x->e_mbd;
+
+ if (cpi->sf.inter_sf.extra_prune_warped) return 0;
+ if (has_second_ref(mbmi)) return 0;
+ MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+
+ if (features->switchable_motion_mode) {
+ // Determine which motion modes to search if more than SIMPLE_TRANSLATION
+ // is allowed.
+ last_motion_mode_allowed = motion_mode_allowed(
+ xd->global_motion, xd, mbmi, features->allow_warped_motion);
+ }
+
+ if (last_motion_mode_allowed == WARPED_CAUSAL) {
+ return 1;
+ }
+
+ return 0;
+}
+
+static void calc_num_proj_ref(AV1_COMP *cpi, MACROBLOCK *x, MB_MODE_INFO *mi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const FeatureFlags *const features = &cm->features;
+
+ mi->num_proj_ref = 1;
+ WARP_SAMPLE_INFO *const warp_sample_info =
+ &x->warp_sample_info[mi->ref_frame[0]];
+ int *pts0 = warp_sample_info->pts;
+ int *pts_inref0 = warp_sample_info->pts_inref;
+ MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+
+ if (features->switchable_motion_mode) {
+ // Determine which motion modes to search if more than SIMPLE_TRANSLATION
+ // is allowed.
+ last_motion_mode_allowed = motion_mode_allowed(
+ xd->global_motion, xd, mi, features->allow_warped_motion);
+ }
+
+ if (last_motion_mode_allowed == WARPED_CAUSAL) {
+ if (warp_sample_info->num < 0) {
+ warp_sample_info->num = av1_findSamples(cm, xd, pts0, pts_inref0);
+ }
+ mi->num_proj_ref = warp_sample_info->num;
+ }
+}
+
+static void search_motion_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int *this_early_term, int use_model_yrd_large,
+ int *rate_mv, int64_t best_sse) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const FeatureFlags *const features = &cm->features;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ RD_STATS pf_rd_stats[MOTION_MODE_SEARCH_SIZE] = { 0 };
+ int best_skip = 0;
+ int best_early_term = 0;
+ int64_t best_cost = INT64_MAX;
+ int best_mode_index = -1;
+ const int interp_filter = features->interp_filter;
+
+ const MOTION_MODE motion_modes[MOTION_MODE_SEARCH_SIZE] = {
+ SIMPLE_TRANSLATION, WARPED_CAUSAL
+ };
+ int mode_search_size = is_warped_mode_allowed(cpi, x, mi) ? 2 : 1;
+
+ WARP_SAMPLE_INFO *const warp_sample_info =
+ &x->warp_sample_info[mi->ref_frame[0]];
+ int *pts0 = warp_sample_info->pts;
+ int *pts_inref0 = warp_sample_info->pts_inref;
+
+ const int total_samples = mi->num_proj_ref;
+ if (total_samples == 0) {
+ // Do not search WARPED_CAUSAL if there are no samples to use to determine
+ // warped parameters.
+ mode_search_size = 1;
+ }
+
+ const MB_MODE_INFO base_mbmi = *mi;
+ MB_MODE_INFO best_mbmi;
+
+ for (int mode_index = 0; mode_index < mode_search_size; ++mode_index) {
+ int64_t cost = INT64_MAX;
+ MOTION_MODE motion_mode = motion_modes[mode_index];
+ *mi = base_mbmi;
+ mi->motion_mode = motion_mode;
+ if (motion_mode == SIMPLE_TRANSLATION) {
+ mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ if (use_model_yrd_large)
+ model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+ &pf_rd_stats[mode_index], this_early_term, 1,
+ best_sse, NULL, UINT_MAX);
+ else
+ model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[mode_index], NULL, 1,
+ NULL);
+ pf_rd_stats[mode_index].rate +=
+ av1_get_switchable_rate(x, xd, cm->features.interp_filter,
+ cm->seq_params->enable_dual_filter);
+ cost = RDCOST(x->rdmult, pf_rd_stats[mode_index].rate,
+ pf_rd_stats[mode_index].dist);
+ } else if (motion_mode == WARPED_CAUSAL) {
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ const ModeCosts *mode_costs = &x->mode_costs;
+ mi->wm_params.wmtype = DEFAULT_WMTYPE;
+ mi->interp_filters =
+ av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
+
+ memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
+ memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+ // Select the samples according to motion vector difference
+ if (mi->num_proj_ref > 1) {
+ mi->num_proj_ref = av1_selectSamples(&mi->mv[0].as_mv, pts, pts_inref,
+ mi->num_proj_ref, bsize);
+ }
+
+ // Compute the warped motion parameters with a least squares fit
+ // using the collected samples
+ if (!av1_find_projection(mi->num_proj_ref, pts, pts_inref, bsize,
+ mi->mv[0].as_mv.row, mi->mv[0].as_mv.col,
+ &mi->wm_params, mi_row, mi_col)) {
+ if (mi->mode == NEWMV) {
+ const int_mv mv0 = mi->mv[0];
+ const WarpedMotionParams wm_params0 = mi->wm_params;
+ const int num_proj_ref0 = mi->num_proj_ref;
+
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize,
+ &ref_mv.as_mv, NULL);
+
+ // Refine MV in a small range.
+ av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0,
+ total_samples, cpi->sf.mv_sf.warp_search_method,
+ cpi->sf.mv_sf.warp_search_iters);
+ if (mi->mv[0].as_int == ref_mv.as_int) {
+ continue;
+ }
+
+ if (mv0.as_int != mi->mv[0].as_int) {
+ // Keep the refined MV and WM parameters.
+ int tmp_rate_mv = av1_mv_bit_cost(
+ &mi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ *rate_mv = tmp_rate_mv;
+ } else {
+ // Restore the old MV and WM parameters.
+ mi->mv[0] = mv0;
+ mi->wm_params = wm_params0;
+ mi->num_proj_ref = num_proj_ref0;
+ }
+ }
+ // Build the warped predictor
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, av1_num_planes(cm) - 1);
+ if (use_model_yrd_large)
+ model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+ &pf_rd_stats[mode_index], this_early_term,
+ 1, best_sse, NULL, UINT_MAX);
+ else
+ model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[mode_index], NULL,
+ 1, NULL);
+
+ pf_rd_stats[mode_index].rate +=
+ mode_costs->motion_mode_cost[bsize][mi->motion_mode];
+ cost = RDCOST(x->rdmult, pf_rd_stats[mode_index].rate,
+ pf_rd_stats[mode_index].dist);
+ } else {
+ cost = INT64_MAX;
+ }
+ }
+ if (cost < best_cost) {
+ best_mode_index = mode_index;
+ best_cost = cost;
+ best_skip = pf_rd_stats[mode_index].skip_txfm;
+ best_early_term = *this_early_term;
+ best_mbmi = *mi;
+ }
+ }
+  assert(best_mode_index >= 0 && best_mode_index < MOTION_MODE_SEARCH_SIZE);
+
+ *mi = best_mbmi;
+ this_rdc->rate = pf_rd_stats[best_mode_index].rate;
+ this_rdc->dist = pf_rd_stats[best_mode_index].dist;
+ this_rdc->sse = pf_rd_stats[best_mode_index].sse;
+ this_rdc->skip_txfm = (best_skip || best_early_term);
+ *this_early_term = best_early_term;
+  if (best_mode_index < MOTION_MODE_SEARCH_SIZE - 1) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ }
+}
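+
+// search_motion_mode() costs SIMPLE_TRANSLATION first and, when warped motion
+// is allowed and projection samples exist, WARPED_CAUSAL: warp parameters are
+// fitted to the collected samples with a least squares projection, the NEWMV
+// case is optionally refined in a small range, and the refined MV is kept
+// only if it differs from the starting MV; otherwise the pre-refinement MV
+// and warp parameters are restored.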
+#endif // !CONFIG_REALTIME_ONLY
+
+#define COLLECT_NON_SQR_STAT 0
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+
+static AOM_INLINE void print_stage_time(const char *stage_name,
+ int64_t stage_time,
+ int64_t total_time) {
+ printf(" %s: %ld (%f%%)\n", stage_name, stage_time,
+ 100 * stage_time / (float)total_time);
+}
+
+static void print_time(const mode_search_stat_nonrd *const ms_stat,
+ BLOCK_SIZE bsize, int mi_rows, int mi_cols, int mi_row,
+ int mi_col) {
+ if ((mi_row + mi_size_high[bsize] >= mi_rows) &&
+ (mi_col + mi_size_wide[bsize] >= mi_cols)) {
+ int64_t total_time = 0l;
+ int32_t total_blocks = 0;
+ for (BLOCK_SIZE bs = 0; bs < BLOCK_SIZES; bs++) {
+ total_time += ms_stat->total_block_times[bs];
+ total_blocks += ms_stat->num_blocks[bs];
+ }
+
+ printf("\n");
+ for (BLOCK_SIZE bs = 0; bs < BLOCK_SIZES; bs++) {
+ if (ms_stat->num_blocks[bs] == 0) {
+ continue;
+ }
+ if (!COLLECT_NON_SQR_STAT && block_size_wide[bs] != block_size_high[bs]) {
+ continue;
+ }
+
+ printf("BLOCK_%dX%d Num %d, Time: %ld (%f%%), Avg_time %f:\n",
+ block_size_wide[bs], block_size_high[bs], ms_stat->num_blocks[bs],
+ ms_stat->total_block_times[bs],
+ 100 * ms_stat->total_block_times[bs] / (float)total_time,
+ (float)ms_stat->total_block_times[bs] / ms_stat->num_blocks[bs]);
+ for (int j = 0; j < MB_MODE_COUNT; j++) {
+ if (ms_stat->nonskipped_search_times[bs][j] == 0) {
+ continue;
+ }
+
+ int64_t total_mode_time = ms_stat->nonskipped_search_times[bs][j];
+ printf(" Mode %d, %d/%d tps %f\n", j,
+ ms_stat->num_nonskipped_searches[bs][j],
+ ms_stat->num_searches[bs][j],
+ ms_stat->num_nonskipped_searches[bs][j] > 0
+ ? (float)ms_stat->nonskipped_search_times[bs][j] /
+ ms_stat->num_nonskipped_searches[bs][j]
+ : 0l);
+ if (j >= INTER_MODE_START) {
+ total_mode_time = ms_stat->ms_time[bs][j] + ms_stat->ifs_time[bs][j] +
+ ms_stat->model_rd_time[bs][j] +
+ ms_stat->txfm_time[bs][j];
+ print_stage_time("Motion Search Time", ms_stat->ms_time[bs][j],
+ total_time);
+ print_stage_time("Filter Search Time", ms_stat->ifs_time[bs][j],
+ total_time);
+ print_stage_time("Model RD Time", ms_stat->model_rd_time[bs][j],
+ total_time);
+          print_stage_time("Transform Search Time", ms_stat->txfm_time[bs][j],
+ total_time);
+ }
+ print_stage_time("Total Mode Time", total_mode_time, total_time);
+ }
+ printf("\n");
+ }
+ printf("Total time = %ld. Total blocks = %d\n", total_time, total_blocks);
+ }
+}
+#endif // COLLECT_NONRD_PICK_MODE_STAT
+
+static bool should_prune_intra_modes_using_neighbors(
+ const MACROBLOCKD *xd, bool enable_intra_mode_pruning_using_neighbors,
+ PREDICTION_MODE this_mode, PREDICTION_MODE above_mode,
+ PREDICTION_MODE left_mode) {
+ if (!enable_intra_mode_pruning_using_neighbors) return false;
+
+ // Avoid pruning of DC_PRED as it is the most probable mode to win as per the
+ // statistics generated for nonrd intra mode evaluations.
+ if (this_mode == DC_PRED) return false;
+
+ // Enable the pruning for current mode only if it is not the winner mode of
+ // both the neighboring blocks (left/top).
+ return xd->up_available && this_mode != above_mode && xd->left_available &&
+ this_mode != left_mode;
+}
+
+void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ RD_STATS this_rdc, best_rdc;
+ struct estimate_block_intra_args args;
+ init_estimate_block_intra_args(&args, cpi, x);
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ mi->tx_size =
+ AOMMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]);
+ assert(IMPLIES(xd->lossless[mi->segment_id], mi->tx_size == TX_4X4));
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[mi->tx_size];
+
+ // If the current block size is the same as the transform block size, enable
+ // mode pruning based on the best SAD so far.
+ if (cpi->sf.rt_sf.prune_intra_mode_using_best_sad_so_far && bsize == tx_bsize)
+ args.prune_mode_based_on_sad = true;
+
+ int *bmode_costs;
+ PREDICTION_MODE best_mode = DC_PRED;
+ const MB_MODE_INFO *above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *left_mi = xd->left_mbmi;
+ const PREDICTION_MODE A = av1_above_block_mode(above_mi);
+ const PREDICTION_MODE L = av1_left_block_mode(left_mi);
+ const int above_ctx = intra_mode_context[A];
+ const int left_ctx = intra_mode_context[L];
+ const unsigned int source_variance = x->source_variance;
+ bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx];
+
+ av1_invalid_rd_stats(&best_rdc);
+ av1_invalid_rd_stats(&this_rdc);
+
+ init_mbmi_nonrd(mi, DC_PRED, INTRA_FRAME, NONE_FRAME, cm);
+ mi->mv[0].as_int = mi->mv[1].as_int = INVALID_MV;
+
+ // Change the limit of this loop to add other intra prediction
+ // mode tests.
+ for (int mode_index = 0; mode_index < RTC_INTRA_MODES; ++mode_index) {
+ PREDICTION_MODE this_mode = intra_mode_list[mode_index];
+
+ // As per the statistics generated for intra mode evaluation in the nonrd
+ // path, it is found that the probability of H_PRED mode being the winner is
+ // very low when the best mode so far is V_PRED (out of DC_PRED and V_PRED).
+ // If V_PRED is the winner mode out of DC_PRED and V_PRED, it could imply
+ // the presence of a vertically dominant pattern. Hence, H_PRED mode is not
+ // evaluated.
+ if (cpi->sf.rt_sf.prune_h_pred_using_best_mode_so_far &&
+ this_mode == H_PRED && best_mode == V_PRED)
+ continue;
+
+ if (should_prune_intra_modes_using_neighbors(
+ xd, cpi->sf.rt_sf.enable_intra_mode_pruning_using_neighbors,
+ this_mode, A, L)) {
+ // Prune V_PRED and H_PRED if source variance of the block is less than
+ // or equal to 50. The source variance threshold is obtained empirically.
+ if ((this_mode == V_PRED || this_mode == H_PRED) && source_variance <= 50)
+ continue;
+
+ // As per the statistics, probability of SMOOTH_PRED being the winner is
+ // low when best mode so far is DC_PRED (out of DC_PRED, V_PRED and
+ // H_PRED). Hence, SMOOTH_PRED mode is not evaluated.
+ if (best_mode == DC_PRED && this_mode == SMOOTH_PRED) continue;
+ }
+
+ this_rdc.dist = this_rdc.rate = 0;
+ args.mode = this_mode;
+ args.skippable = 1;
+ args.rdc = &this_rdc;
+ mi->mode = this_mode;
+ av1_foreach_transformed_block_in_plane(xd, bsize, AOM_PLANE_Y,
+ av1_estimate_block_intra, &args);
+
+ if (this_rdc.rate == INT_MAX) continue;
+
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ if (args.skippable) {
+ this_rdc.rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+ } else {
+ this_rdc.rate += x->mode_costs.skip_txfm_cost[skip_ctx][0];
+ }
+ this_rdc.rate += bmode_costs[this_mode];
+ this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = this_rdc;
+ best_mode = this_mode;
+ if (!this_rdc.skip_txfm) {
+ memset(ctx->blk_skip, 0,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ }
+ }
+
+ mi->mode = best_mode;
+ // Keep DC for UV since mode test is based on Y channel only.
+ mi->uv_mode = UV_DC_PRED;
+ *rd_cost = best_rdc;
+
+ // For lossless: always force the skip flags off.
+ // Even though the blk_skip is set to 0 above in the rdcost comparison,
+ // do it here again in case the above logic changes.
+ if (is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+ x->txfm_search_info.skip_txfm = 0;
+ memset(ctx->blk_skip, 0,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ }
+
+#if CONFIG_INTERNAL_STATS
+ store_coding_context_nonrd(x, ctx, mi->mode);
+#else
+ store_coding_context_nonrd(x, ctx);
+#endif // CONFIG_INTERNAL_STATS
+}
+
+static AOM_INLINE int is_same_gf_and_last_scale(AV1_COMMON *cm) {
+ struct scale_factors *const sf_last = get_ref_scale_factors(cm, LAST_FRAME);
+ struct scale_factors *const sf_golden =
+ get_ref_scale_factors(cm, GOLDEN_FRAME);
+ return ((sf_last->x_scale_fp == sf_golden->x_scale_fp) &&
+ (sf_last->y_scale_fp == sf_golden->y_scale_fp));
+}
+
+static AOM_INLINE void get_ref_frame_use_mask(AV1_COMP *cpi, MACROBLOCK *x,
+ MB_MODE_INFO *mi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int gf_temporal_ref,
+ int use_ref_frame[],
+ int *force_skip_low_temp_var) {
+ AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
+
+ // When the ref_frame_config is used to set the reference frame structure
+ // then the usage of alt_ref is determined by the ref_frame_flags
+ // (and not the speed feature use_nonrd_altref_frame).
+ int use_alt_ref_frame = cpi->ppi->rtc_ref.set_ref_frame_config ||
+ cpi->sf.rt_sf.use_nonrd_altref_frame;
+
+ int use_golden_ref_frame = 1;
+ int use_last_ref_frame = 1;
+
+  // When the ref_frame_config is used to set the reference frame structure:
+  // check if LAST is used as a reference, and only remove golden and altref
+  // references below if it is.
+ if (cpi->ppi->rtc_ref.set_ref_frame_config)
+ use_last_ref_frame =
+ cpi->ref_frame_flags & AOM_LAST_FLAG ? use_last_ref_frame : 0;
+
+  // frames_since_golden is not used when the user sets the reference
+  // structure.
+ if (!cpi->ppi->rtc_ref.set_ref_frame_config && use_last_ref_frame &&
+ cpi->rc.frames_since_golden == 0 && gf_temporal_ref) {
+ use_golden_ref_frame = 0;
+ }
+
+ if (use_last_ref_frame && cpi->sf.rt_sf.short_circuit_low_temp_var &&
+ x->nonrd_prune_ref_frame_search) {
+ if (is_small_sb)
+ *force_skip_low_temp_var = av1_get_force_skip_low_temp_var_small_sb(
+ &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+ else
+ *force_skip_low_temp_var = av1_get_force_skip_low_temp_var(
+ &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+ // If force_skip_low_temp_var is set, skip golden reference.
+ if (*force_skip_low_temp_var) {
+ use_golden_ref_frame = 0;
+ use_alt_ref_frame = 0;
+ }
+ }
+
+ if (use_last_ref_frame &&
+ (x->nonrd_prune_ref_frame_search > 2 || x->force_zeromv_skip_for_blk ||
+ (x->nonrd_prune_ref_frame_search > 1 && bsize > BLOCK_64X64))) {
+ use_golden_ref_frame = 0;
+ use_alt_ref_frame = 0;
+ }
+
+ if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
+ use_golden_ref_frame = 1;
+ use_alt_ref_frame = 0;
+ }
+
+ // Skip golden/altref reference if color is set, on flat blocks with motion.
+ // For screen: always skip golden/alt (if color_sensitivity_sb_g/alt is set)
+ // except when x->nonrd_prune_ref_frame_search = 0. This latter flag
+ // may be set in the variance partition when golden is a much better
+ // reference than last, in which case it may not be worth skipping
+ // golden/altref completely.
+ // Condition on use_last_ref to make sure there remains at least one
+ // reference.
+ if (use_last_ref_frame &&
+ ((cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ x->nonrd_prune_ref_frame_search != 0) ||
+ (x->source_variance < 200 &&
+ x->content_state_sb.source_sad_nonrd >= kLowSad))) {
+ if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ use_golden_ref_frame = 0;
+ if (x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ use_alt_ref_frame = 0;
+ }
+
+  // For non-screen: if golden and altref are not being selected as references
+  // (use_golden_ref_frame/use_alt_ref_frame = 0), check whether to allow
+  // golden back based on the sad of the nearest/nearmv of the LAST ref. If
+  // this block's sad is large, keep golden as a reference. Only do this for
+  // the aggressive pruning mode and avoid it when color is set for the golden
+  // reference.
+ if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
+ (cpi->ref_frame_flags & AOM_LAST_FLAG) && !use_golden_ref_frame &&
+ !use_alt_ref_frame && x->pred_mv_sad[LAST_FRAME] != INT_MAX &&
+ x->nonrd_prune_ref_frame_search > 2 &&
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) {
+ int thr = (cm->width * cm->height > RESOLUTION_288P) ? 100 : 150;
+ int pred = x->pred_mv_sad[LAST_FRAME] >>
+ (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ if (pred > thr) use_golden_ref_frame = 1;
+ }
+
+ use_alt_ref_frame =
+ cpi->ref_frame_flags & AOM_ALT_FLAG ? use_alt_ref_frame : 0;
+ use_golden_ref_frame =
+ cpi->ref_frame_flags & AOM_GOLD_FLAG ? use_golden_ref_frame : 0;
+
+ // For spatial layers: enable golden ref if it is set by user and
+ // corresponds to the lower spatial layer.
+ if (cpi->svc.spatial_layer_id > 0 && (cpi->ref_frame_flags & AOM_GOLD_FLAG) &&
+ x->content_state_sb.source_sad_nonrd < kHighSad) {
+ const int buffslot_golden =
+ cpi->ppi->rtc_ref.ref_idx[GOLDEN_FRAME - LAST_FRAME];
+ if (cpi->ppi->rtc_ref.buffer_time_index[buffslot_golden] ==
+ cpi->svc.current_superframe)
+ use_golden_ref_frame = 1;
+ }
+
+ use_ref_frame[ALTREF_FRAME] = use_alt_ref_frame;
+ use_ref_frame[GOLDEN_FRAME] = use_golden_ref_frame;
+ use_ref_frame[LAST_FRAME] = use_last_ref_frame;
+ // Keep this assert on, as only 3 references are used in nonrd_pickmode
+ // (LAST, GOLDEN, ALTREF), and if all 3 are not set by user then this
+ // frame must be an intra-only frame and hence should never enter the
+ // pickmode here for inter frames.
+ assert(use_last_ref_frame || use_golden_ref_frame || use_alt_ref_frame);
+}
+
+static AOM_INLINE int is_filter_search_enabled_blk(
+ AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int segment_id, int cb_pred_filter_search, InterpFilter *filt_select) {
+ const AV1_COMMON *const cm = &cpi->common;
+ // filt search disabled
+ if (!cpi->sf.rt_sf.use_nonrd_filter_search) return 0;
+ // filt search purely based on mode properties
+ if (!cb_pred_filter_search) return 1;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int enable_interp_search = 0;
+ if (!(xd->left_mbmi && xd->above_mbmi)) {
+ // neighbors info unavailable
+ enable_interp_search = 2;
+ } else if (!(is_inter_block(xd->left_mbmi) &&
+ is_inter_block(xd->above_mbmi))) {
+ // neighbor is INTRA
+ enable_interp_search = 2;
+ } else if (xd->left_mbmi->interp_filters.as_int !=
+ xd->above_mbmi->interp_filters.as_int) {
+ // filters are different
+ enable_interp_search = 2;
+ } else if ((cb_pred_filter_search == 1) &&
+ (xd->left_mbmi->interp_filters.as_filters.x_filter !=
+ EIGHTTAP_REGULAR)) {
+ // not regular
+ enable_interp_search = 2;
+ } else {
+ // enable prediction based on chessboard pattern
+ if (xd->left_mbmi->interp_filters.as_filters.x_filter == EIGHTTAP_SMOOTH)
+ *filt_select = EIGHTTAP_SMOOTH;
+ const int bsl = mi_size_wide_log2[bsize];
+ enable_interp_search =
+ (bool)((((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_frame.frame_number)) &
+ 0x1);
+ if (cyclic_refresh_segment_id_boosted(segment_id)) enable_interp_search = 1;
+ }
+ return enable_interp_search;
+}
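+
+// A sketch of the chessboard gating above, assuming a 64x64 block so that
+// bsl = mi_size_wide_log2[bsize] = 4: the enable flag becomes
+// (((mi_row + mi_col) >> 4) + chessboard_index) & 1, which alternates along
+// block diagonals and flips with the frame number, so roughly half of the
+// blocks run the full filter search on any given frame. Boosted cyclic
+// refresh segments always enable the search.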
+
+static AOM_INLINE int skip_mode_by_threshold(
+ PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, int_mv mv,
+ int frames_since_golden, const int *const rd_threshes,
+ const int *const rd_thresh_freq_fact, int64_t best_cost, int best_skip,
+ int extra_shift) {
+ int skip_this_mode = 0;
+ const THR_MODES mode_index = mode_idx[ref_frame][INTER_OFFSET(mode)];
+ int64_t mode_rd_thresh =
+ best_skip ? ((int64_t)rd_threshes[mode_index]) << (extra_shift + 1)
+ : ((int64_t)rd_threshes[mode_index]) << extra_shift;
+
+ // Increase mode_rd_thresh value for non-LAST for improved encoding
+ // speed
+ if (ref_frame != LAST_FRAME) {
+ mode_rd_thresh = mode_rd_thresh << 1;
+ if (ref_frame == GOLDEN_FRAME && frames_since_golden > 4)
+ mode_rd_thresh = mode_rd_thresh << (extra_shift + 1);
+ }
+
+ if (rd_less_than_thresh(best_cost, mode_rd_thresh,
+ rd_thresh_freq_fact[mode_index]))
+ if (mv.as_int != 0) skip_this_mode = 1;
+
+ return skip_this_mode;
+}
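+
+// A worked example of the threshold scaling above with extra_shift = 0: with
+// best_skip set the base threshold is doubled; a non-LAST reference doubles
+// it again, and GOLDEN with frames_since_golden > 4 doubles it once more, so
+// a base rd_threshes[] value of T can grow to 8T before
+// rd_less_than_thresh() is consulted. Only modes with a non-zero MV are ever
+// skipped here.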
+
+static AOM_INLINE int skip_mode_by_low_temp(
+ PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize,
+ CONTENT_STATE_SB content_state_sb, int_mv mv, int force_skip_low_temp_var) {
+ // Skip non-zeromv mode search for non-LAST frame if force_skip_low_temp_var
+ // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
+ // later.
+ if (force_skip_low_temp_var && ref_frame != LAST_FRAME && mv.as_int != 0) {
+ return 1;
+ }
+
+ if (content_state_sb.source_sad_nonrd != kHighSad && bsize >= BLOCK_64X64 &&
+ force_skip_low_temp_var && mode == NEWMV) {
+ return 1;
+ }
+ return 0;
+}
+
+static AOM_INLINE int skip_mode_by_bsize_and_ref_frame(
+ PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE bsize,
+ int extra_prune, unsigned int sse_zeromv_norm, int more_prune) {
+ const unsigned int thresh_skip_golden = 500;
+
+ if (ref_frame != LAST_FRAME && sse_zeromv_norm < thresh_skip_golden &&
+ mode == NEWMV)
+ return 1;
+
+ if (bsize == BLOCK_128X128 && mode == NEWMV) return 1;
+
+ // Skip testing non-LAST if this flag is set.
+ if (extra_prune) {
+ if (extra_prune > 1 && ref_frame != LAST_FRAME &&
+ (bsize > BLOCK_16X16 && mode == NEWMV))
+ return 1;
+
+ if (ref_frame != LAST_FRAME && mode == NEARMV) return 1;
+
+ if (more_prune && bsize >= BLOCK_32X32 && mode == NEARMV) return 1;
+ }
+ return 0;
+}
+
+static void set_block_source_sad(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ struct buf_2d *yv12_mb) {
+ struct macroblock_plane *const p = &x->plane[0];
+ const int y_sad = cpi->ppi->fn_ptr[bsize].sdf(p->src.buf, p->src.stride,
+ yv12_mb->buf, yv12_mb->stride);
+ if (y_sad == 0) x->block_is_zero_sad = 1;
+}
+
+static void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int y_sad,
+ unsigned int source_variance,
+ struct buf_2d yv12_mb[MAX_MB_PLANE]) {
+ const int subsampling_x = cpi->common.seq_params->subsampling_x;
+ const int subsampling_y = cpi->common.seq_params->subsampling_y;
+ const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd;
+ const int high_res = cpi->common.width * cpi->common.height >= 640 * 360;
+ if (bsize == cpi->common.seq_params->sb_size) {
+    // At the superblock level color_sensitivity is already set to 0, 1, or 2.
+    // Level 2 is the middle/uncertain level. To avoid additional sad
+    // computations when bsize = sb_size, force level 2 to 1 (certain color)
+    // for motion areas and to 0 otherwise.
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 2) {
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] =
+ source_sad_nonrd >= kMedSad ? 1 : 0;
+ }
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 2) {
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] =
+ source_sad_nonrd >= kMedSad ? 1 : 0;
+ }
+ return;
+ }
+ int shift = 3;
+ unsigned int source_var_thr = 50;
+ int uv_sad_thr = 100;
+ if (source_sad_nonrd >= kMedSad && x->source_variance > 0 && high_res)
+ shift = 4;
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ if (cpi->rc.high_source_sad) shift = 6;
+ if (source_sad_nonrd > kMedSad) {
+ source_var_thr = 1200;
+ uv_sad_thr = 10;
+ }
+ }
+ NOISE_LEVEL noise_level = kLow;
+ int norm_sad =
+ y_sad >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ unsigned int thresh_spatial = (cpi->common.width > 1920) ? 5000 : 1000;
+  // If the spatial source variance is high and the normalized y_sad
+  // is low, then the y channel is likely good enough for mode estimation,
+  // so keep color_sensitivity off. For now this is restricted to low-noise
+  // content, since there is some bdrate regression on noisy color clips.
+ if (cpi->noise_estimate.enabled)
+ noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate);
+ if (noise_level == kLow && source_variance > thresh_spatial &&
+ cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN && norm_sad < 50) {
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = 0;
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = 0;
+ return;
+ }
+ const int num_planes = av1_num_planes(&cpi->common);
+
+ for (int plane = AOM_PLANE_U; plane < num_planes; ++plane) {
+    // Always check if level = 2. If level = 0, check again for
+    // motion areas at higher resolutions, where color artifacts
+    // are more noticeable.
+ if (x->color_sensitivity[COLOR_SENS_IDX(plane)] == 2 ||
+ (x->color_sensitivity[COLOR_SENS_IDX(plane)] == 0 &&
+ source_sad_nonrd >= kMedSad && high_res)) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(bsize, subsampling_x, subsampling_y);
+
+ const int uv_sad = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, yv12_mb[plane].buf, yv12_mb[plane].stride);
+
+ const int norm_uv_sad =
+ uv_sad >> (b_width_log2_lookup[bs] + b_height_log2_lookup[bs]);
+ x->color_sensitivity[COLOR_SENS_IDX(plane)] =
+ uv_sad > (y_sad >> shift) && norm_uv_sad > 40;
+ if (source_variance < source_var_thr && norm_uv_sad > uv_sad_thr)
+ x->color_sensitivity[COLOR_SENS_IDX(plane)] = 1;
+ }
+ }
+}
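+
+// Note on the normalization above: b_width_log2_lookup/b_height_log2_lookup
+// give the block dimensions in units of 4 samples, so `sad >> (bwl + bhl)`
+// is the SAD per 4x4 sub-block. For example, for BLOCK_16X16 the shift is
+// 2 + 2 = 4, dividing the SAD by the 16 4x4 sub-blocks it covers.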
+
+static void setup_compound_prediction(const AV1_COMMON *cm, MACROBLOCK *x,
+ struct buf_2d yv12_mb[8][MAX_MB_PLANE],
+ const int *use_ref_frame_mask,
+ const MV_REFERENCE_FRAME *rf,
+ int *ref_mv_idx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ MV_REFERENCE_FRAME ref_frame_comp;
+ if (!use_ref_frame_mask[rf[1]]) {
+    // Need to set up pred_block if it hasn't been done in find_predictors.
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, rf[1]);
+ const int num_planes = av1_num_planes(cm);
+ if (yv12 != NULL) {
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, rf[1]);
+ av1_setup_pred_block(xd, yv12_mb[rf[1]], yv12, sf, sf, num_planes);
+ }
+ }
+ ref_frame_comp = av1_ref_frame_type(rf);
+ mbmi_ext->mode_context[ref_frame_comp] = 0;
+ mbmi_ext->ref_mv_count[ref_frame_comp] = UINT8_MAX;
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame_comp, mbmi_ext->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+ mbmi_ext->mode_context);
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_comp);
+ *ref_mv_idx = mbmi->ref_mv_idx + 1;
+}
+
+static void set_compound_mode(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+ MV_REFERENCE_FRAME ref_frame2, int ref_mv_idx,
+ int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
+ PREDICTION_MODE this_mode) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ mi->ref_frame[0] = ref_frame;
+ mi->ref_frame[1] = ref_frame2;
+ mi->compound_idx = 1;
+ mi->comp_group_idx = 0;
+ mi->interinter_comp.type = COMPOUND_AVERAGE;
+ MV_REFERENCE_FRAME ref_frame_comp = av1_ref_frame_type(mi->ref_frame);
+ if (this_mode == GLOBAL_GLOBALMV) {
+ frame_mv[this_mode][ref_frame].as_int = 0;
+ frame_mv[this_mode][ref_frame2].as_int = 0;
+ } else if (this_mode == NEAREST_NEARESTMV) {
+ frame_mv[this_mode][ref_frame].as_int =
+ xd->ref_mv_stack[ref_frame_comp][0].this_mv.as_int;
+ frame_mv[this_mode][ref_frame2].as_int =
+ xd->ref_mv_stack[ref_frame_comp][0].comp_mv.as_int;
+ } else if (this_mode == NEAR_NEARMV) {
+ frame_mv[this_mode][ref_frame].as_int =
+ xd->ref_mv_stack[ref_frame_comp][ref_mv_idx].this_mv.as_int;
+ frame_mv[this_mode][ref_frame2].as_int =
+ xd->ref_mv_stack[ref_frame_comp][ref_mv_idx].comp_mv.as_int;
+ }
+}
+
+// Prune compound mode if the single mode variance is lower than a fixed
+// percentage of the median value.
+static bool skip_comp_based_on_var(
+ const unsigned int (*single_vars)[REF_FRAMES], BLOCK_SIZE bsize) {
+ unsigned int best_var = UINT_MAX;
+ for (int cur_mode_idx = 0; cur_mode_idx < RTC_INTER_MODES; cur_mode_idx++) {
+ for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
+ best_var = AOMMIN(best_var, single_vars[cur_mode_idx][ref_idx]);
+ }
+ }
+ const unsigned int thresh_64 = (unsigned int)(0.57356805f * 8659);
+ const unsigned int thresh_32 = (unsigned int)(0.23964763f * 4281);
+
+  // Currently, the thresholds for 128 and 16 are not well-tuned. We are using
+  // the results from 64 and 32 as a heuristic.
+ switch (bsize) {
+ case BLOCK_128X128: return best_var < 4 * thresh_64;
+ case BLOCK_64X64: return best_var < thresh_64;
+ case BLOCK_32X32: return best_var < thresh_32;
+ case BLOCK_16X16: return best_var < thresh_32 / 4;
+ default: return false;
+ }
+}
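+
+// The float products above bake empirically tuned values into integer
+// thresholds of roughly 4966 (thresh_64) and 1025 (thresh_32); the
+// BLOCK_128X128 and BLOCK_16X16 cases extrapolate them by the 4x area ratio
+// between adjacent square block sizes.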
+
+static AOM_FORCE_INLINE void fill_single_inter_mode_costs(
+ int (*single_inter_mode_costs)[REF_FRAMES], int num_inter_modes,
+ const REF_MODE *reference_mode_set, const ModeCosts *mode_costs,
+ const int16_t *mode_context) {
+ bool ref_frame_used[REF_FRAMES] = { false };
+ for (int idx = 0; idx < num_inter_modes; idx++) {
+ ref_frame_used[reference_mode_set[idx].ref_frame] = true;
+ }
+
+ for (int this_ref_frame = LAST_FRAME; this_ref_frame < REF_FRAMES;
+ this_ref_frame++) {
+ if (!ref_frame_used[this_ref_frame]) {
+ continue;
+ }
+
+ const MV_REFERENCE_FRAME rf[2] = { this_ref_frame, NONE_FRAME };
+ const int16_t mode_ctx = av1_mode_context_analyzer(mode_context, rf);
+ for (PREDICTION_MODE this_mode = NEARESTMV; this_mode <= NEWMV;
+ this_mode++) {
+ single_inter_mode_costs[INTER_OFFSET(this_mode)][this_ref_frame] =
+ cost_mv_ref(mode_costs, this_mode, mode_ctx);
+ }
+ }
+}
+
+static AOM_INLINE bool is_globalmv_better(
+ PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame, int rate_mv,
+ const ModeCosts *mode_costs,
+ const int (*single_inter_mode_costs)[REF_FRAMES],
+ const MB_MODE_INFO_EXT *mbmi_ext) {
+ const int globalmv_mode_cost =
+ single_inter_mode_costs[INTER_OFFSET(GLOBALMV)][ref_frame];
+ int this_mode_cost =
+ rate_mv + single_inter_mode_costs[INTER_OFFSET(this_mode)][ref_frame];
+ if (this_mode == NEWMV || this_mode == NEARMV) {
+ const MV_REFERENCE_FRAME rf[2] = { ref_frame, NONE_FRAME };
+ this_mode_cost += get_drl_cost(
+ NEWMV, 0, mbmi_ext, mode_costs->drl_mode_cost0, av1_ref_frame_type(rf));
+ }
+ return this_mode_cost > globalmv_mode_cost;
+}
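+
+// is_globalmv_better() compares signaling cost only: the candidate's mode
+// context cost plus MV rate (and, for NEWMV/NEARMV, the DRL index cost)
+// against the GLOBALMV mode cost. Distortion is not considered, which makes
+// this a cheap tie-break toward GLOBALMV.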
+
+// Set up the mv/ref_frames etc based on the comp_index. Returns 1 if it
+// succeeds, 0 if it fails.
+static AOM_INLINE int setup_compound_params_from_comp_idx(
+ const AV1_COMP *cpi, MACROBLOCK *x, struct buf_2d yv12_mb[8][MAX_MB_PLANE],
+ PREDICTION_MODE *this_mode, MV_REFERENCE_FRAME *ref_frame,
+ MV_REFERENCE_FRAME *ref_frame2, int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
+ const int *use_ref_frame_mask, int comp_index,
+ bool comp_use_zero_zeromv_only, MV_REFERENCE_FRAME *last_comp_ref_frame,
+ BLOCK_SIZE bsize) {
+ const MV_REFERENCE_FRAME *rf = comp_ref_mode_set[comp_index].ref_frame;
+ int skip_gf = 0;
+ int skip_alt = 0;
+ *this_mode = comp_ref_mode_set[comp_index].pred_mode;
+ *ref_frame = rf[0];
+ *ref_frame2 = rf[1];
+ assert(*ref_frame == LAST_FRAME);
+ assert(*this_mode == GLOBAL_GLOBALMV || *this_mode == NEAREST_NEARESTMV);
+ if (x->source_variance < 50 && bsize > BLOCK_16X16) {
+ if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ skip_gf = 1;
+ if (x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity_sb_alt[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ skip_alt = 1;
+ }
+ if (comp_use_zero_zeromv_only && *this_mode != GLOBAL_GLOBALMV) {
+ return 0;
+ }
+ if (*ref_frame2 == GOLDEN_FRAME &&
+ (cpi->sf.rt_sf.ref_frame_comp_nonrd[0] == 0 || skip_gf ||
+ !(cpi->ref_frame_flags & AOM_GOLD_FLAG))) {
+ return 0;
+ } else if (*ref_frame2 == LAST2_FRAME &&
+ (cpi->sf.rt_sf.ref_frame_comp_nonrd[1] == 0 ||
+ !(cpi->ref_frame_flags & AOM_LAST2_FLAG))) {
+ return 0;
+ } else if (*ref_frame2 == ALTREF_FRAME &&
+ (cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 0 || skip_alt ||
+ !(cpi->ref_frame_flags & AOM_ALT_FLAG))) {
+ return 0;
+ }
+ int ref_mv_idx = 0;
+ if (*last_comp_ref_frame != rf[1]) {
+ // Only needs to be done once per reference pair.
+ setup_compound_prediction(&cpi->common, x, yv12_mb, use_ref_frame_mask, rf,
+ &ref_mv_idx);
+ *last_comp_ref_frame = rf[1];
+ }
+ set_compound_mode(x, *ref_frame, *ref_frame2, ref_mv_idx, frame_mv,
+ *this_mode);
+ if (*this_mode != GLOBAL_GLOBALMV &&
+ frame_mv[*this_mode][*ref_frame].as_int == 0 &&
+ frame_mv[*this_mode][*ref_frame2].as_int == 0) {
+ return 0;
+ }
+
+ return 1;
+}
+
+static AOM_INLINE bool previous_mode_performed_poorly(
+ PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame,
+ const unsigned int (*vars)[REF_FRAMES],
+ const int64_t (*uv_dist)[REF_FRAMES]) {
+ unsigned int best_var = UINT_MAX;
+ int64_t best_uv_dist = INT64_MAX;
+ for (int midx = 0; midx < RTC_INTER_MODES; midx++) {
+ best_var = AOMMIN(best_var, vars[midx][ref_frame]);
+ best_uv_dist = AOMMIN(best_uv_dist, uv_dist[midx][ref_frame]);
+ }
+ assert(best_var != UINT_MAX && "Invalid variance data.");
+ const float mult = 1.125f;
+ bool var_bad = mult * best_var < vars[INTER_OFFSET(mode)][ref_frame];
+ if (uv_dist[INTER_OFFSET(mode)][ref_frame] < INT64_MAX &&
+ best_uv_dist != uv_dist[INTER_OFFSET(mode)][ref_frame]) {
+ // If we have chroma info, then take it into account
+ var_bad &= mult * best_uv_dist < uv_dist[INTER_OFFSET(mode)][ref_frame];
+ }
+ return var_bad;
+}
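+
+// With mult = 1.125f, a mode counts as having performed poorly when its
+// variance (and its chroma distortion, when available) exceeds the best
+// value observed for the same reference frame by more than 12.5%.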
+
+static AOM_INLINE bool prune_compoundmode_with_singlemode_var(
+ PREDICTION_MODE compound_mode, MV_REFERENCE_FRAME ref_frame,
+ MV_REFERENCE_FRAME ref_frame2, const int_mv (*frame_mv)[REF_FRAMES],
+ const uint8_t (*mode_checked)[REF_FRAMES],
+ const unsigned int (*vars)[REF_FRAMES],
+ const int64_t (*uv_dist)[REF_FRAMES]) {
+ const PREDICTION_MODE single_mode0 = compound_ref0_mode(compound_mode);
+ const PREDICTION_MODE single_mode1 = compound_ref1_mode(compound_mode);
+
+ bool first_ref_valid = false, second_ref_valid = false;
+ bool first_ref_bad = false, second_ref_bad = false;
+ if (mode_checked[single_mode0][ref_frame] &&
+ frame_mv[single_mode0][ref_frame].as_int ==
+ frame_mv[compound_mode][ref_frame].as_int &&
+ vars[INTER_OFFSET(single_mode0)][ref_frame] < UINT_MAX) {
+ first_ref_valid = true;
+ first_ref_bad =
+ previous_mode_performed_poorly(single_mode0, ref_frame, vars, uv_dist);
+ }
+ if (mode_checked[single_mode1][ref_frame2] &&
+ frame_mv[single_mode1][ref_frame2].as_int ==
+ frame_mv[compound_mode][ref_frame2].as_int &&
+ vars[INTER_OFFSET(single_mode1)][ref_frame2] < UINT_MAX) {
+ second_ref_valid = true;
+ second_ref_bad =
+ previous_mode_performed_poorly(single_mode1, ref_frame2, vars, uv_dist);
+ }
+ if (first_ref_valid && second_ref_valid) {
+ return first_ref_bad && second_ref_bad;
+ } else if (first_ref_valid || second_ref_valid) {
+ return first_ref_bad || second_ref_bad;
+ }
+ return false;
+}
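+
+// The pruning above only trusts a single-reference observation when that
+// mode was actually checked with the same MV as the compound candidate and
+// has valid variance data. If both constituents were observed, both must
+// have performed poorly to prune; if only one was observed, that one alone
+// is decisive.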
+
+// Function to set up parameters used for inter mode evaluation in non-rd.
+static AOM_FORCE_INLINE void set_params_nonrd_pick_inter_mode(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ RD_STATS *rd_cost, int *force_skip_low_temp_var, int mi_row, int mi_col,
+ int gf_temporal_ref, unsigned char segment_id, BLOCK_SIZE bsize
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ ,
+ PICK_MODE_CONTEXT *ctx, int denoise_svc_pickmode
+#endif
+) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const ModeCosts *mode_costs = &x->mode_costs;
+ int skip_pred_mv = 0;
+
+ // Initialize variance and distortion (chroma) for all modes and reference
+ // frames
+ for (int idx = 0; idx < RTC_INTER_MODES; idx++) {
+ for (int ref = 0; ref < REF_FRAMES; ref++) {
+ search_state->vars[idx][ref] = UINT_MAX;
+ search_state->uv_dist[idx][ref] = INT64_MAX;
+ }
+ }
+
+ // Initialize values of color sensitivity with sb level color sensitivity
+ av1_copy(x->color_sensitivity, x->color_sensitivity_sb);
+
+ init_best_pickmode(&search_state->best_pickmode);
+
+ // Estimate cost for single reference frames
+ estimate_single_ref_frame_costs(cm, xd, mode_costs, segment_id, bsize,
+ search_state->ref_costs_single);
+
+  // Reset the flags indicating which modes have been evaluated.
+ av1_zero(search_state->mode_checked);
+
+ txfm_info->skip_txfm = 0;
+
+ // Initialize mode decisions
+ av1_invalid_rd_stats(&search_state->best_rdc);
+ av1_invalid_rd_stats(&search_state->this_rdc);
+ av1_invalid_rd_stats(rd_cost);
+ for (int ref_idx = 0; ref_idx < REF_FRAMES; ++ref_idx) {
+ x->warp_sample_info[ref_idx].num = -1;
+ }
+
+ mi->bsize = bsize;
+ mi->ref_frame[0] = NONE_FRAME;
+ mi->ref_frame[1] = NONE_FRAME;
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ // if (cpi->ppi->use_svc) denoise_svc_pickmode =
+ // av1_denoise_svc_non_key(cpi);
+ if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode)
+ av1_denoiser_reset_frame_stats(ctx);
+ }
+#endif
+
+  // Populate predicted motion vectors for LAST_FRAME.
+ if (cpi->ref_frame_flags & AOM_LAST_FLAG) {
+ find_predictors(cpi, x, LAST_FRAME, search_state->frame_mv,
+ search_state->yv12_mb, bsize, *force_skip_low_temp_var,
+ x->force_zeromv_skip_for_blk,
+ &search_state->use_scaled_ref_frame[LAST_FRAME]);
+ }
+  // Update the mask to use all reference frames.
+ get_ref_frame_use_mask(cpi, x, mi, mi_row, mi_col, bsize, gf_temporal_ref,
+ search_state->use_ref_frame_mask,
+ force_skip_low_temp_var);
+
+ skip_pred_mv = x->force_zeromv_skip_for_blk ||
+ (x->nonrd_prune_ref_frame_search > 2 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 2 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2);
+
+  // Populate predicted motion vectors for the other single reference frames,
+  // starting at LAST_FRAME + 1.
+ for (MV_REFERENCE_FRAME ref_frame_iter = LAST_FRAME + 1;
+ ref_frame_iter <= ALTREF_FRAME; ++ref_frame_iter) {
+ if (search_state->use_ref_frame_mask[ref_frame_iter]) {
+ find_predictors(cpi, x, ref_frame_iter, search_state->frame_mv,
+ search_state->yv12_mb, bsize, *force_skip_low_temp_var,
+ skip_pred_mv,
+ &search_state->use_scaled_ref_frame[ref_frame_iter]);
+ }
+ }
+}
+
+// Function to check whether the inter mode can be skipped based on mode
+// statistics and speed feature settings.
+static AOM_FORCE_INLINE bool skip_inter_mode_nonrd(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ int64_t *thresh_sad_pred, int *force_mv_inter_layer, int *is_single_pred,
+ PREDICTION_MODE *this_mode, MV_REFERENCE_FRAME *last_comp_ref_frame,
+ MV_REFERENCE_FRAME *ref_frame, MV_REFERENCE_FRAME *ref_frame2, int idx,
+ int_mv svc_mv, int force_skip_low_temp_var, unsigned int sse_zeromv_norm,
+ int num_inter_modes, unsigned char segment_id, BLOCK_SIZE bsize,
+ bool comp_use_zero_zeromv_only, bool check_globalmv) {
+ AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ const SVC *const svc = &cpi->svc;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+
+  // Skip compound modes based on the reference frame mask and the mode type;
+  // for allowed compound modes, set up the ref mv stack and reference frames.
+ if (idx >= num_inter_modes) {
+ const int comp_index = idx - num_inter_modes;
+ if (!setup_compound_params_from_comp_idx(
+ cpi, x, search_state->yv12_mb, this_mode, ref_frame, ref_frame2,
+ search_state->frame_mv, search_state->use_ref_frame_mask,
+ comp_index, comp_use_zero_zeromv_only, last_comp_ref_frame,
+ bsize)) {
+ return true;
+ }
+ *is_single_pred = 0;
+ } else {
+ *this_mode = ref_mode_set[idx].pred_mode;
+ *ref_frame = ref_mode_set[idx].ref_frame;
+ *ref_frame2 = NONE_FRAME;
+ }
+
+ if (x->sb_me_block && *ref_frame == LAST_FRAME) {
+ // We want to make sure to test the superblock MV:
+ // so don't skip (return false) for NEAREST_LAST or NEAR_LAST if they
+ // have this sb MV. And don't skip NEWMV_LAST: this will be set to
+ // sb MV in handle_inter_mode_nonrd(), in case NEAREST or NEAR don't
+ // have it.
+ if (*this_mode == NEARESTMV &&
+ search_state->frame_mv[NEARESTMV][LAST_FRAME].as_int ==
+ x->sb_me_mv.as_int) {
+ return false;
+ }
+ if (*this_mode == NEARMV &&
+ search_state->frame_mv[NEARMV][LAST_FRAME].as_int ==
+ x->sb_me_mv.as_int) {
+ return false;
+ }
+ if (*this_mode == NEWMV) {
+ return false;
+ }
+ }
+
+  // Skip the single reference mode if its mode-checked flag is already set.
+ if (*is_single_pred && search_state->mode_checked[*this_mode][*ref_frame]) {
+ return true;
+ }
+
+ // Skip GLOBALMV mode if check_globalmv flag is not enabled.
+ if (!check_globalmv && *this_mode == GLOBALMV) {
+ return true;
+ }
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer1);
+ x->ms_stat_nonrd.num_searches[bsize][*this_mode]++;
+#endif
+ mi->mode = *this_mode;
+ mi->ref_frame[0] = *ref_frame;
+ mi->ref_frame[1] = *ref_frame2;
+
+  // Skip the mode if the reference frame's entry in the use mask is not set.
+ if (!search_state->use_ref_frame_mask[*ref_frame]) return true;
+
+  // Skip some modes and reference frames when the force_zeromv_skip_for_blk
+  // flag is true.
+ if (x->force_zeromv_skip_for_blk &&
+ ((!(*this_mode == NEARESTMV &&
+ search_state->frame_mv[*this_mode][*ref_frame].as_int == 0) &&
+ *this_mode != GLOBALMV) ||
+ *ref_frame != LAST_FRAME))
+ return true;
+
+ // Skip compound mode based on variance of previously evaluated single
+ // reference modes.
+ if (rt_sf->prune_compoundmode_with_singlemode_var && !*is_single_pred &&
+ prune_compoundmode_with_singlemode_var(
+ *this_mode, *ref_frame, *ref_frame2, search_state->frame_mv,
+ search_state->mode_checked, search_state->vars,
+ search_state->uv_dist)) {
+ return true;
+ }
+
+ *force_mv_inter_layer = 0;
+ if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+ ((*ref_frame == LAST_FRAME && svc->skip_mvsearch_last) ||
+ (*ref_frame == GOLDEN_FRAME && svc->skip_mvsearch_gf) ||
+ (*ref_frame == ALTREF_FRAME && svc->skip_mvsearch_altref))) {
+ // Only test mode if NEARESTMV/NEARMV is (svc_mv.mv.col, svc_mv.mv.row),
+ // otherwise set NEWMV to (svc_mv.mv.col, svc_mv.mv.row).
+ // Skip newmv and filter search.
+ *force_mv_inter_layer = 1;
+ if (*this_mode == NEWMV) {
+ search_state->frame_mv[*this_mode][*ref_frame] = svc_mv;
+ } else if (search_state->frame_mv[*this_mode][*ref_frame].as_int !=
+ svc_mv.as_int) {
+ return true;
+ }
+ }
+
+  // If the segment reference frame feature is enabled, skip the mode when the
+  // current ref frame is not allowed.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)(*ref_frame))
+ return true;
+
+ // For screen content: skip mode testing based on source_sad.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+    // If source_sad is computed: skip modes with non-zero motion for
+    // stationary (super)blocks. Otherwise, if the superblock has motion,
+    // skip modes with zero motion on the last reference for flat blocks
+    // where color is not set.
+    // The latter condition should also apply to newmv when its MV turns out
+    // to be (0, 0), so it is repeated below after search_new_mv.
+ if (rt_sf->source_metrics_sb_nonrd) {
+ if ((search_state->frame_mv[*this_mode][*ref_frame].as_int != 0 &&
+ x->content_state_sb.source_sad_nonrd == kZeroSad) ||
+ (search_state->frame_mv[*this_mode][*ref_frame].as_int == 0 &&
+ x->block_is_zero_sad == 0 && *ref_frame == LAST_FRAME &&
+ ((x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
+ cpi->rc.high_source_sad) &&
+ x->source_variance == 0))
+ return true;
+ }
+ // Skip NEWMV search for flat blocks.
+ if (*this_mode == NEWMV && x->source_variance < 100) return true;
+ // Skip non-LAST for color on flat blocks.
+ if (*ref_frame > LAST_FRAME && x->source_variance == 0 &&
+ (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] == 1 ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] == 1))
+ return true;
+ }
+
+  // Skip the mode based on block size, reference frame, mode type and other
+  // block properties.
+ if (skip_mode_by_bsize_and_ref_frame(
+ *this_mode, *ref_frame, bsize, x->nonrd_prune_ref_frame_search,
+ sse_zeromv_norm, rt_sf->nonrd_aggressive_skip))
+ return true;
+
+  // Skip the mode based on low temporal variance and source sad.
+ if (skip_mode_by_low_temp(*this_mode, *ref_frame, bsize, x->content_state_sb,
+ search_state->frame_mv[*this_mode][*ref_frame],
+ force_skip_low_temp_var))
+ return true;
+
+ // Disable this drop out case if the ref frame segment level feature is
+ // enabled for this segment. This is to prevent the possibility that we
+ // end up unable to pick any mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+    // Check for skipping GOLDEN and ALTREF based on pred_mv_sad.
+ if (rt_sf->nonrd_prune_ref_frame_search > 0 &&
+ x->pred_mv_sad[*ref_frame] != INT_MAX && *ref_frame != LAST_FRAME) {
+ if ((int64_t)(x->pred_mv_sad[*ref_frame]) > *thresh_sad_pred) return true;
+ }
+ }
+
+ // Check for skipping NEARMV based on pred_mv_sad.
+ if (*this_mode == NEARMV && x->pred_mv1_sad[*ref_frame] != INT_MAX &&
+ x->pred_mv1_sad[*ref_frame] > (x->pred_mv0_sad[*ref_frame] << 1))
+ return true;
+
+ // Skip single reference mode based on rd threshold.
+ if (*is_single_pred) {
+ if (skip_mode_by_threshold(
+ *this_mode, *ref_frame,
+ search_state->frame_mv[*this_mode][*ref_frame],
+ cpi->rc.frames_since_golden, cpi->rd.threshes[segment_id][bsize],
+ x->thresh_freq_fact[bsize], search_state->best_rdc.rdcost,
+ search_state->best_pickmode.best_mode_skip_txfm,
+ (rt_sf->nonrd_aggressive_skip ? 1 : 0)))
+ return true;
+ }
+ return false;
+}
+
+// Function to perform inter mode evaluation for non-rd
+static AOM_FORCE_INLINE bool handle_inter_mode_nonrd(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ PICK_MODE_CONTEXT *ctx, PRED_BUFFER **this_mode_pred,
+ PRED_BUFFER *tmp_buffer, InterPredParams inter_pred_params_sr,
+ int *best_early_term, unsigned int *sse_zeromv_norm, bool *check_globalmv,
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ int64_t *zero_last_cost_orig, int denoise_svc_pickmode,
+#endif
+ int idx, int force_mv_inter_layer, int is_single_pred, int gf_temporal_ref,
+ int use_model_yrd_large, int filter_search_enabled_blk, BLOCK_SIZE bsize,
+ PREDICTION_MODE this_mode, InterpFilter filt_select,
+ int cb_pred_filter_search, int reuse_inter_pred,
+ int *sb_me_has_been_tested) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ const int bw = block_size_wide[bsize];
+ const InterpFilter filter_ref = cm->features.interp_filter;
+ const InterpFilter default_interp_filter = EIGHTTAP_REGULAR;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+ BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode;
+
+ MV_REFERENCE_FRAME ref_frame = mi->ref_frame[0];
+ MV_REFERENCE_FRAME ref_frame2 = mi->ref_frame[1];
+ int_mv *const this_mv = &search_state->frame_mv[this_mode][ref_frame];
+ unsigned int var = UINT_MAX;
+ int this_early_term = 0;
+ int rate_mv = 0;
+ int is_skippable;
+ int skip_this_mv = 0;
+ unsigned int var_threshold = UINT_MAX;
+ PREDICTION_MODE this_best_mode;
+ RD_STATS nonskip_rdc;
+ av1_invalid_rd_stats(&nonskip_rdc);
+
+ if (x->sb_me_block && this_mode == NEWMV && ref_frame == LAST_FRAME) {
+ // Set the NEWMV_LAST to the sb MV.
+ search_state->frame_mv[NEWMV][LAST_FRAME].as_int = x->sb_me_mv.as_int;
+ } else if (this_mode == NEWMV && !force_mv_inter_layer) {
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer2);
+#endif
+ // Find the best motion vector for single/compound mode.
+ const bool skip_newmv = search_new_mv(
+ cpi, x, search_state->frame_mv, ref_frame, gf_temporal_ref, bsize,
+ mi_row, mi_col, &rate_mv, &search_state->best_rdc);
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer2);
+ x->ms_stat_nonrd.ms_time[bsize][this_mode] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2);
+#endif
+    // Skip NEWMV mode:
+    // (i) for bsize smaller than 16X16,
+    // (ii) based on the SAD of the predicted MV w.r.t. LAST_FRAME,
+    // (iii) when the motion vector is the same as the reference MV.
+ if (skip_newmv) {
+ return true;
+ }
+ }
+
+  // Check whether the current motion vector is the same as one of the
+  // previously evaluated motion vectors.
+ for (PREDICTION_MODE inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV;
+ inter_mv_mode++) {
+ if (inter_mv_mode == this_mode) continue;
+ if (is_single_pred &&
+ search_state->mode_checked[inter_mv_mode][ref_frame] &&
+ this_mv->as_int ==
+ search_state->frame_mv[inter_mv_mode][ref_frame].as_int) {
+ skip_this_mv = 1;
+ break;
+ }
+ }
+
+  // Skip single reference mode if the current motion vector is the same as
+  // that of a previously evaluated motion vector.
+ if (skip_this_mv && is_single_pred) return true;
+
+  // For screen: for spatially flat blocks with non-zero motion,
+  // skip NEWMV if its motion vector is (0, 0) on LAST_FRAME and color
+  // sensitivity is not set.
+ if (this_mode == NEWMV && cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ cpi->svc.spatial_layer_id == 0 && rt_sf->source_metrics_sb_nonrd) {
+ if (this_mv->as_int == 0 && ref_frame == LAST_FRAME &&
+ x->block_is_zero_sad == 0 &&
+ ((x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 &&
+ x->color_sensitivity_sb[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) ||
+ cpi->rc.high_source_sad) &&
+ x->source_variance == 0)
+ return true;
+ }
+
+ mi->mode = this_mode;
+ mi->mv[0].as_int = this_mv->as_int;
+ mi->mv[1].as_int = 0;
+ if (!is_single_pred)
+ mi->mv[1].as_int = search_state->frame_mv[this_mode][ref_frame2].as_int;
+
+ // Set buffers to store predicted samples for reuse
+ if (reuse_inter_pred) {
+ if (!*this_mode_pred) {
+ *this_mode_pred = &tmp_buffer[3];
+ } else {
+ *this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
+ pd->dst.buf = (*this_mode_pred)->data;
+ pd->dst.stride = bw;
+ }
+ }
+
+ mi->motion_mode = SIMPLE_TRANSLATION;
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->oxcf.motion_mode_cfg.allow_warped_motion) {
+ calc_num_proj_ref(cpi, x, mi);
+ }
+#endif
+ // set variance threshold for compound mode pruning
+ if (rt_sf->prune_compoundmode_with_singlecompound_var && !is_single_pred &&
+ use_model_yrd_large) {
+ const PREDICTION_MODE single_mode0 = compound_ref0_mode(this_mode);
+ const PREDICTION_MODE single_mode1 = compound_ref1_mode(this_mode);
+ var_threshold =
+ AOMMIN(var_threshold,
+ search_state->vars[INTER_OFFSET(single_mode0)][ref_frame]);
+ var_threshold =
+ AOMMIN(var_threshold,
+ search_state->vars[INTER_OFFSET(single_mode1)][ref_frame2]);
+ }
+
+ // decide interpolation filter, build prediction signal, get sse
+ const bool is_mv_subpel =
+ (mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07);
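+  // Note: motion vectors are stored in 1/8-pel units, so a nonzero value in
+  // the low three bits (mask 0x07) indicates a fractional-pel position. For
+  // example, a row component of 12 corresponds to 12 / 8 = 1.5 pixels, where
+  // the choice of interpolation filter matters.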
+ const bool enable_filt_search_this_mode =
+ (filter_search_enabled_blk == 2)
+ ? true
+ : (filter_search_enabled_blk && !force_mv_inter_layer &&
+ is_single_pred &&
+ (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search));
+ if (is_mv_subpel && enable_filt_search_this_mode) {
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer2);
+#endif
+ search_filter_ref(
+ cpi, x, &search_state->this_rdc, &inter_pred_params_sr, mi_row, mi_col,
+ tmp_buffer, bsize, reuse_inter_pred, this_mode_pred, &this_early_term,
+ &var, use_model_yrd_large, best_pickmode->best_sse, is_single_pred);
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer2);
+ x->ms_stat_nonrd.ifs_time[bsize][this_mode] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2);
+#endif
+#if !CONFIG_REALTIME_ONLY
+ } else if (cpi->oxcf.motion_mode_cfg.allow_warped_motion &&
+ this_mode == NEWMV) {
+ // Find the best motion mode when current mode is NEWMV
+ search_motion_mode(cpi, x, &search_state->this_rdc, mi_row, mi_col, bsize,
+ &this_early_term, use_model_yrd_large, &rate_mv,
+ best_pickmode->best_sse);
+ if (this_mode == NEWMV) {
+ this_mv[0] = mi->mv[0];
+ }
+#endif
+ } else {
+ mi->interp_filters =
+ (filter_ref == SWITCHABLE)
+ ? av1_broadcast_interp_filter(default_interp_filter)
+ : av1_broadcast_interp_filter(filter_ref);
+ if (force_mv_inter_layer)
+ mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ // If it is sub-pel motion and cb_pred_filter_search is enabled, select
+ // the pre-decided filter
+ if (is_mv_subpel && cb_pred_filter_search)
+ mi->interp_filters = av1_broadcast_interp_filter(filt_select);
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer2);
+#endif
+ if (is_single_pred) {
+ SubpelParams subpel_params;
+ // Initialize inter mode level params for single reference mode.
+ init_inter_mode_params(&mi->mv[0].as_mv, &inter_pred_params_sr,
+ &subpel_params, xd->block_ref_scale_factors[0],
+ pd->pre->width, pd->pre->height);
+ av1_enc_build_inter_predictor_y_nonrd(xd, &inter_pred_params_sr,
+ &subpel_params);
+ } else {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ }
+
+ if (use_model_yrd_large) {
+ model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+ &search_state->this_rdc, &this_early_term, 0,
+ best_pickmode->best_sse, &var, var_threshold);
+ } else {
+ model_rd_for_sb_y(cpi, bsize, x, xd, &search_state->this_rdc, &var, 0,
+ &this_early_term);
+ }
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer2);
+ x->ms_stat_nonrd.model_rd_time[bsize][this_mode] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2);
+#endif
+ }
+
+ // update variance for single mode
+ if (is_single_pred) {
+ search_state->vars[INTER_OFFSET(this_mode)][ref_frame] = var;
+ if (this_mv->as_int == 0) {
+ search_state->vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var;
+ }
+ }
+ // prune compound mode based on single mode var threshold
+ if (!is_single_pred && var > var_threshold) {
+ if (reuse_inter_pred) free_pred_buffer(*this_mode_pred);
+ return true;
+ }
+
+ if (ref_frame == LAST_FRAME && this_mv->as_int == 0) {
+ *sse_zeromv_norm = (unsigned int)(search_state->this_rdc.sse >>
+ (b_width_log2_lookup[bsize] +
+ b_height_log2_lookup[bsize]));
+ }
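+  // A worked example of the normalization above (assuming the lookups give
+  // log2 of the block dimensions in units of 4 samples): for BLOCK_64X64 the
+  // shift is 4 + 4 = 8, so sse_zeromv_norm is the SSE averaged over the
+  // block's 256 4x4 sub-blocks, making thresholds block-size independent.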
+
+ // Perform early termination based on sse.
+ if (rt_sf->sse_early_term_inter_search &&
+ early_term_inter_search_with_sse(rt_sf->sse_early_term_inter_search,
+ bsize, search_state->this_rdc.sse,
+ best_pickmode->best_sse, this_mode)) {
+ if (reuse_inter_pred) free_pred_buffer(*this_mode_pred);
+ return true;
+ }
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ x->ms_stat_nonrd.num_nonskipped_searches[bsize][this_mode]++;
+#endif
+
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][1];
+ const int no_skip_txfm_cost = mode_costs->skip_txfm_cost[skip_ctx][0];
+ const int64_t sse_y = search_state->this_rdc.sse;
+
+ if (this_early_term) {
+ search_state->this_rdc.skip_txfm = 1;
+ search_state->this_rdc.rate = skip_txfm_cost;
+ search_state->this_rdc.dist = search_state->this_rdc.sse << 4;
+ } else {
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer2);
+#endif
+ // Calculates RD Cost using Hadamard transform.
+ av1_block_yrd(x, &search_state->this_rdc, &is_skippable, bsize,
+ mi->tx_size);
+ if (search_state->this_rdc.skip_txfm ||
+ RDCOST(x->rdmult, search_state->this_rdc.rate,
+ search_state->this_rdc.dist) >=
+ RDCOST(x->rdmult, 0, search_state->this_rdc.sse)) {
+ if (!search_state->this_rdc.skip_txfm) {
+ // Need to store "real" rdc for possible future use if UV rdc
+ // disallows tx skip
+ nonskip_rdc = search_state->this_rdc;
+ nonskip_rdc.rate += no_skip_txfm_cost;
+ }
+ search_state->this_rdc.rate = skip_txfm_cost;
+ search_state->this_rdc.skip_txfm = 1;
+ search_state->this_rdc.dist = search_state->this_rdc.sse;
+ } else {
+ search_state->this_rdc.rate += no_skip_txfm_cost;
+ }
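+    // The comparison above implements the forced-skip rule: ignoring
+    // RDCOST's fixed-point scaling, coding the residual is kept only if
+    //   rdmult * rate + dist < rdmult * 0 + sse,
+    // where the right-hand side is the cost of skipping coefficients
+    // entirely (zero coefficient rate, distortion equal to the prediction
+    // SSE). Otherwise rate is replaced by the skip-flag cost and dist by
+    // sse.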
+
+ // Populate predicted sample for chroma planes based on color sensitivity.
+ if ((x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])) {
+ RD_STATS rdc_uv;
+ const BLOCK_SIZE uv_bsize =
+ get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_U, AOM_PLANE_U);
+ }
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_V, AOM_PLANE_V);
+ }
+ // Compute sse for chroma planes.
+ const int64_t sse_uv = av1_model_rd_for_sb_uv(
+ cpi, uv_bsize, x, xd, &rdc_uv, AOM_PLANE_U, AOM_PLANE_V);
+ if (rdc_uv.dist < x->min_dist_inter_uv)
+ x->min_dist_inter_uv = rdc_uv.dist;
+ search_state->this_rdc.sse += sse_uv;
+ // Restore Y rdc if UV rdc disallows txfm skip
+ if (search_state->this_rdc.skip_txfm && !rdc_uv.skip_txfm &&
+ nonskip_rdc.rate != INT_MAX)
+ search_state->this_rdc = nonskip_rdc;
+ if (is_single_pred) {
+ search_state->uv_dist[INTER_OFFSET(this_mode)][ref_frame] = rdc_uv.dist;
+ }
+ search_state->this_rdc.rate += rdc_uv.rate;
+ search_state->this_rdc.dist += rdc_uv.dist;
+ search_state->this_rdc.skip_txfm =
+ search_state->this_rdc.skip_txfm && rdc_uv.skip_txfm;
+ }
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer2);
+ x->ms_stat_nonrd.txfm_time[bsize][this_mode] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer2);
+#endif
+ }
+
+ this_best_mode = this_mode;
+ // TODO(kyslov) account for UV prediction cost
+ search_state->this_rdc.rate += rate_mv;
+ if (!is_single_pred) {
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame);
+ search_state->this_rdc.rate += cost_mv_ref(mode_costs, this_mode, mode_ctx);
+ } else {
+ // If the current mode has zeromv but is not GLOBALMV, compare the rate
+ // cost. If GLOBALMV is cheaper, use GLOBALMV instead.
+ if (this_mode != GLOBALMV &&
+ this_mv->as_int == search_state->frame_mv[GLOBALMV][ref_frame].as_int) {
+ if (is_globalmv_better(this_mode, ref_frame, rate_mv, mode_costs,
+ search_state->single_inter_mode_costs, mbmi_ext)) {
+ this_best_mode = GLOBALMV;
+ }
+ }
+
+ search_state->this_rdc.rate +=
+ search_state
+ ->single_inter_mode_costs[INTER_OFFSET(this_best_mode)][ref_frame];
+ }
+
+ if (is_single_pred && this_mv->as_int == 0 && var < UINT_MAX) {
+ search_state->vars[INTER_OFFSET(GLOBALMV)][ref_frame] = var;
+ }
+
+ search_state->this_rdc.rate += search_state->ref_costs_single[ref_frame];
+
+ search_state->this_rdc.rdcost = RDCOST(x->rdmult, search_state->this_rdc.rate,
+ search_state->this_rdc.dist);
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR && is_single_pred) {
+ newmv_diff_bias(xd, this_best_mode, &search_state->this_rdc, bsize,
+ search_state->frame_mv[this_best_mode][ref_frame].as_mv.row,
+ search_state->frame_mv[this_best_mode][ref_frame].as_mv.col,
+ cpi->speed, x->source_variance, x->content_state_sb);
+ }
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode &&
+ cpi->denoiser.denoising_level > kDenLowLow) {
+ av1_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx);
+ // Keep track of zero_last cost.
+ if (ref_frame == LAST_FRAME && this_mv->as_int == 0)
+ *zero_last_cost_orig = search_state->this_rdc.rdcost;
+ }
+#else
+ (void)(sse_y);
+#endif
+
+ search_state->mode_checked[this_mode][ref_frame] = 1;
+ search_state->mode_checked[this_best_mode][ref_frame] = 1;
+
+ if (*check_globalmv) {
+ int32_t abs_mv =
+ abs(search_state->frame_mv[this_best_mode][ref_frame].as_mv.row) +
+ abs(search_state->frame_mv[this_best_mode][ref_frame].as_mv.col);
+ // Early exit check: if the magnitude of this_best_mode's mv is small
+ // enough, we skip GLOBALMV check in the next loop iteration.
+ if (abs_mv < 2) {
+ *check_globalmv = false;
+ }
+ }
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer1);
+ x->ms_stat_nonrd.nonskipped_search_times[bsize][this_mode] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer1);
+#endif
+
+ if (x->sb_me_block && ref_frame == LAST_FRAME &&
+ search_state->frame_mv[this_best_mode][ref_frame].as_int ==
+ x->sb_me_mv.as_int)
+ *sb_me_has_been_tested = 1;
+
+ // Copy best mode params to search state
+ if (search_state->this_rdc.rdcost < search_state->best_rdc.rdcost) {
+ search_state->best_rdc = search_state->this_rdc;
+ *best_early_term = this_early_term;
+ update_search_state_nonrd(search_state, mi, txfm_info, &nonskip_rdc, ctx,
+ this_best_mode, sse_y);
+
+ // This is needed for the compound modes.
+ search_state->frame_mv_best[this_best_mode][ref_frame].as_int =
+ search_state->frame_mv[this_best_mode][ref_frame].as_int;
+ if (ref_frame2 > NONE_FRAME) {
+ search_state->frame_mv_best[this_best_mode][ref_frame2].as_int =
+ search_state->frame_mv[this_best_mode][ref_frame2].as_int;
+ }
+
+ if (reuse_inter_pred) {
+ free_pred_buffer(best_pickmode->best_pred);
+ best_pickmode->best_pred = *this_mode_pred;
+ }
+ } else {
+ if (reuse_inter_pred) free_pred_buffer(*this_mode_pred);
+ }
+
+ if (*best_early_term && (idx > 0 || rt_sf->nonrd_aggressive_skip)) {
+ txfm_info->skip_txfm = 1;
+ if (!x->sb_me_block || *sb_me_has_been_tested) return false;
+ }
+ return true;
+}
+
+// Function to perform screen content mode evaluation for non-rd
+static AOM_FORCE_INLINE void handle_screen_content_mode_nonrd(
+ AV1_COMP *cpi, MACROBLOCK *x, InterModeSearchStateNonrd *search_state,
+ PRED_BUFFER *this_mode_pred, PICK_MODE_CONTEXT *ctx,
+ PRED_BUFFER *tmp_buffer, struct buf_2d *orig_dst, int skip_idtx_palette,
+ int try_palette, BLOCK_SIZE bsize, int reuse_inter_pred, int mi_col,
+ int mi_row) {
+ AV1_COMMON *const cm = &cpi->common;
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ BEST_PICKMODE *const best_pickmode = &search_state->best_pickmode;
+
+ // TODO(marpan): Only allow for 8 bit-depth for now, re-enable for 10/12 bit
+ // when issue 3359 is fixed.
+ if (cm->seq_params->bit_depth == 8 &&
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && !skip_idtx_palette &&
+ !cpi->oxcf.txfm_cfg.use_inter_dct_only && !x->force_zeromv_skip_for_blk &&
+ is_inter_mode(best_pickmode->best_mode) &&
+ best_pickmode->best_pred != NULL &&
+ (!rt_sf->prune_idtx_nonrd ||
+ (rt_sf->prune_idtx_nonrd && bsize <= BLOCK_32X32 &&
+ best_pickmode->best_mode_skip_txfm != 1 && x->source_variance > 200))) {
+ RD_STATS idtx_rdc;
+ av1_init_rd_stats(&idtx_rdc);
+ int is_skippable;
+ this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
+ pd->dst.buf = this_mode_pred->data;
+ pd->dst.stride = bw;
+ const PRED_BUFFER *const best_pred = best_pickmode->best_pred;
+ av1_block_yrd_idtx(x, best_pred->data, best_pred->stride, &idtx_rdc,
+ &is_skippable, bsize, mi->tx_size);
+ int64_t idx_rdcost_y = RDCOST(x->rdmult, idtx_rdc.rate, idtx_rdc.dist);
+ int allow_idtx = 1;
+ // Incorporate color into rd cost.
+ if ((x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])) {
+ RD_STATS rdc_uv;
+ const BLOCK_SIZE uv_bsize =
+ get_plane_block_size(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)]) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_U, AOM_PLANE_U);
+ }
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_V, AOM_PLANE_V);
+ }
+ av1_model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, AOM_PLANE_U,
+ AOM_PLANE_V);
+ if (rdc_uv.dist < x->min_dist_inter_uv)
+ x->min_dist_inter_uv = rdc_uv.dist;
+ idtx_rdc.rate += rdc_uv.rate;
+ idtx_rdc.dist += rdc_uv.dist;
+ idtx_rdc.skip_txfm = idtx_rdc.skip_txfm && rdc_uv.skip_txfm;
+ if (idx_rdcost_y == 0 && rdc_uv.dist > 0 && x->source_variance < 3000 &&
+ x->content_state_sb.source_sad_nonrd > kMedSad)
+ allow_idtx = 0;
+ }
+ int64_t idx_rdcost = RDCOST(x->rdmult, idtx_rdc.rate, idtx_rdc.dist);
+ if (allow_idtx && idx_rdcost < search_state->best_rdc.rdcost) {
+ best_pickmode->tx_type = IDTX;
+ search_state->best_rdc.rdcost = idx_rdcost;
+ best_pickmode->best_mode_skip_txfm = idtx_rdc.skip_txfm;
+ if (!idtx_rdc.skip_txfm) {
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ xd->tx_type_map[0] = best_pickmode->tx_type;
+ memset(ctx->tx_type_map, best_pickmode->tx_type, ctx->num_4x4_blk);
+ memset(xd->tx_type_map, best_pickmode->tx_type, ctx->num_4x4_blk);
+ }
+ pd->dst = *orig_dst;
+ }
+
+ if (!try_palette) return;
+ const unsigned int intra_ref_frame_cost =
+ search_state->ref_costs_single[INTRA_FRAME];
+
+ if (!is_mode_intra(best_pickmode->best_mode)) {
+ PRED_BUFFER *const best_pred = best_pickmode->best_pred;
+ if (reuse_inter_pred && best_pred != NULL) {
+ if (best_pred->data == orig_dst->buf) {
+ this_mode_pred = &tmp_buffer[get_pred_buffer(tmp_buffer, 3)];
+ aom_convolve_copy(best_pred->data, best_pred->stride,
+ this_mode_pred->data, this_mode_pred->stride, bw, bh);
+ best_pickmode->best_pred = this_mode_pred;
+ }
+ }
+ pd->dst = *orig_dst;
+ }
+ // Search palette mode for Luma plane in inter frame.
+ av1_search_palette_mode_luma(cpi, x, bsize, intra_ref_frame_cost, ctx,
+ &search_state->this_rdc,
+ search_state->best_rdc.rdcost);
+ // Update best mode data in search_state
+ if (search_state->this_rdc.rdcost < search_state->best_rdc.rdcost) {
+ best_pickmode->pmi = mi->palette_mode_info;
+ best_pickmode->best_mode = DC_PRED;
+ mi->mv[0].as_int = INVALID_MV;
+ mi->mv[1].as_int = INVALID_MV;
+ best_pickmode->best_ref_frame = INTRA_FRAME;
+ best_pickmode->best_second_ref_frame = NONE;
+ search_state->best_rdc.rate = search_state->this_rdc.rate;
+ search_state->best_rdc.dist = search_state->this_rdc.dist;
+ search_state->best_rdc.rdcost = search_state->this_rdc.rdcost;
+ best_pickmode->best_mode_skip_txfm = search_state->this_rdc.skip_txfm;
+ // Keep the skip_txfm off if the color_sensitivity is set.
+ if (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])
+ search_state->this_rdc.skip_txfm = 0;
+ if (!search_state->this_rdc.skip_txfm) {
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ if (xd->tx_type_map[0] != DCT_DCT)
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ }
+}
+
+/*!\brief AV1 inter mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * Top level function for Non-RD optimized inter mode selection.
+ * This function will loop over a subset of inter modes and select the best one
+ * based on calculated modelled RD cost. While making decisions which modes to
+ * check, this function applies heuristics based on previously checked modes,
+ * block residual variance, block size, and other factors to prune certain
+ * modes and reference frames. Currently only single reference frame modes
+ * are checked. Additional heuristics are applied to decide if intra modes
+ * need to be checked.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in]    tile_data       Pointer to struct holding adaptive
+ *                               data/contexts/models for the tile during
+ *                               encoding
+ * \param[in]    x               Pointer to structure holding all the data for
+ *                               the current macroblock
+ * \param[in]    rd_cost         Struct to keep track of the RD information
+ * \param[in]    bsize           Current block size
+ * \param[in]    ctx             Structure to hold snapshot of coding context
+ *                               during the mode picking process
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
+ MACROBLOCK *x, RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+ AV1_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ MV_REFERENCE_FRAME ref_frame, ref_frame2;
+ const unsigned char segment_id = mi->segment_id;
+ int best_early_term = 0;
+ int force_skip_low_temp_var = 0;
+ unsigned int sse_zeromv_norm = UINT_MAX;
+ const int num_inter_modes = NUM_INTER_MODES;
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+ bool check_globalmv = rt_sf->check_globalmv_on_single_ref;
+ PRED_BUFFER tmp_buffer[4];
+ DECLARE_ALIGNED(16, uint8_t, pred_buf[MAX_MB_PLANE * MAX_SB_SQUARE]);
+ PRED_BUFFER *this_mode_pred = NULL;
+ const int reuse_inter_pred =
+ rt_sf->reuse_inter_pred_nonrd && cm->seq_params->bit_depth == AOM_BITS_8;
+ InterModeSearchStateNonrd search_state;
+ av1_zero(search_state.use_ref_frame_mask);
+ av1_zero(search_state.use_scaled_ref_frame);
+ BEST_PICKMODE *const best_pickmode = &search_state.best_pickmode;
+ (void)tile_data;
+
+ const int bh = block_size_high[bsize];
+ const int bw = block_size_wide[bsize];
+ const int pixels_in_block = bh * bw;
+ struct buf_2d orig_dst = pd->dst;
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+#if COLLECT_NONRD_PICK_MODE_STAT
+ // Mode statistics can be collected only when num_workers is 1
+ assert(cpi->mt_info.num_workers <= 1);
+ aom_usec_timer_start(&x->ms_stat_nonrd.bsize_timer);
+#endif
+ int64_t thresh_sad_pred = INT64_MAX;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ int_mv svc_mv = { .as_int = 0 };
+ int force_mv_inter_layer = 0;
+ bool comp_use_zero_zeromv_only = 0;
+ int tot_num_comp_modes = NUM_COMP_INTER_MODES_RT;
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ const int denoise_recheck_zeromv = 1;
+ AV1_PICKMODE_CTX_DEN ctx_den;
+ int64_t zero_last_cost_orig = INT64_MAX;
+ int denoise_svc_pickmode = 1;
+ const int resize_pending = is_frame_resize_pending(cpi);
+#endif
+ const ModeCosts *mode_costs = &x->mode_costs;
+ struct scale_factors sf_no_scale;
+ av1_setup_scale_factors_for_frame(&sf_no_scale, cm->width, cm->height,
+ cm->width, cm->height);
+ if (reuse_inter_pred) {
+ for (int buf_idx = 0; buf_idx < 3; buf_idx++) {
+ tmp_buffer[buf_idx].data = &pred_buf[pixels_in_block * buf_idx];
+ tmp_buffer[buf_idx].stride = bw;
+ tmp_buffer[buf_idx].in_use = 0;
+ }
+ tmp_buffer[3].data = pd->dst.buf;
+ tmp_buffer[3].stride = pd->dst.stride;
+ tmp_buffer[3].in_use = 0;
+ }
+
+ const int gf_temporal_ref = is_same_gf_and_last_scale(cm);
+
+  // If the lower spatial layer uses an averaging filter for downsampling
+  // (phase = 8), the target decimated pixel is shifted by (1/2, 1/2) relative
+  // to the source, so use a subpel motion vector to compensate. The nonzero
+  // motion is half a pixel to the left and top, i.e. (-4, -4). This has more
+  // effect on higher resolutions, so condition it on that for now.
+  // Exclude quality layers, which have the same resolution and hence no shift.
+ if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+ !svc->has_lower_quality_layer &&
+ svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 &&
+ cm->width * cm->height > 640 * 480) {
+ svc_mv.as_mv.row = -4;
+ svc_mv.as_mv.col = -4;
+ }
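+  // Sanity check on the units: motion vectors are in 1/8-pel, so (-4, -4)
+  // is a (-0.5, -0.5) pixel shift, exactly cancelling the (1/2, 1/2) phase
+  // offset introduced by the averaging downsampler.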
+
+ // Setup parameters used for inter mode evaluation.
+ set_params_nonrd_pick_inter_mode(cpi, x, &search_state, rd_cost,
+ &force_skip_low_temp_var, mi_row, mi_col,
+ gf_temporal_ref, segment_id, bsize
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ ,
+ ctx, denoise_svc_pickmode
+#endif
+ );
+
+ if (rt_sf->use_comp_ref_nonrd && is_comp_ref_allowed(bsize)) {
+    // Only search compound if bsize > BLOCK_16X16.
+ if (bsize > BLOCK_16X16) {
+ comp_use_zero_zeromv_only = rt_sf->check_only_zero_zeromv_on_large_blocks;
+ } else {
+ tot_num_comp_modes = 0;
+ }
+ } else {
+ tot_num_comp_modes = 0;
+ }
+
+ if (x->pred_mv_sad[LAST_FRAME] != INT_MAX) {
+ thresh_sad_pred = ((int64_t)x->pred_mv_sad[LAST_FRAME]) << 1;
+ // Increase threshold for less aggressive pruning.
+ if (rt_sf->nonrd_prune_ref_frame_search == 1)
+ thresh_sad_pred += (x->pred_mv_sad[LAST_FRAME] >> 2);
+ }
+
+ const int use_model_yrd_large = get_model_rd_flag(cpi, xd, bsize);
+
+  // Decide block-level interp filter search flags:
+  // filter_search_enabled_blk:
+  //   0: disabled
+  //   1: filter search depends on mode properties
+  //   2: filter search forced since prediction is unreliable
+  // cb_pred_filter_search:
+  //   0: cb prediction filter search disabled
+ InterpFilter filt_select = EIGHTTAP_REGULAR;
+ const int cb_pred_filter_search =
+ x->content_state_sb.source_sad_nonrd > kVeryLowSad
+ ? cpi->sf.interp_sf.cb_pred_filter_search
+ : 0;
+ const int filter_search_enabled_blk =
+ is_filter_search_enabled_blk(cpi, x, mi_row, mi_col, bsize, segment_id,
+ cb_pred_filter_search, &filt_select);
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ x->ms_stat_nonrd.num_blocks[bsize]++;
+#endif
+ init_mbmi_nonrd(mi, DC_PRED, NONE_FRAME, NONE_FRAME, cm);
+ mi->tx_size = AOMMIN(
+ AOMMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
+ TX_16X16);
+
+ fill_single_inter_mode_costs(search_state.single_inter_mode_costs,
+ num_inter_modes, ref_mode_set, mode_costs,
+ mbmi_ext->mode_context);
+
+ MV_REFERENCE_FRAME last_comp_ref_frame = NONE_FRAME;
+
+ // Initialize inter prediction params at block level for single reference
+ // mode.
+ InterPredParams inter_pred_params_sr;
+ init_inter_block_params(&inter_pred_params_sr, pd->width, pd->height,
+ mi_row * MI_SIZE, mi_col * MI_SIZE, pd->subsampling_x,
+ pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd),
+ /*is_intrabc=*/0);
+ inter_pred_params_sr.conv_params =
+ get_conv_params(/*do_average=*/0, AOM_PLANE_Y, xd->bd);
+
+ x->block_is_zero_sad = x->content_state_sb.source_sad_nonrd == kZeroSad;
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ !x->force_zeromv_skip_for_blk &&
+ x->content_state_sb.source_sad_nonrd != kZeroSad &&
+ x->source_variance == 0 && bsize < cm->seq_params->sb_size &&
+ search_state.yv12_mb[LAST_FRAME][0].width == cm->width &&
+ search_state.yv12_mb[LAST_FRAME][0].height == cm->height) {
+ set_block_source_sad(cpi, x, bsize, &search_state.yv12_mb[LAST_FRAME][0]);
+ }
+
+ int sb_me_has_been_tested = 0;
+ x->sb_me_block = x->sb_me_partition;
+ // Only use this feature (force testing of superblock motion) if coding
+ // block size is large.
+ if (x->sb_me_block) {
+ if (cm->seq_params->sb_size == BLOCK_128X128 && bsize < BLOCK_64X64)
+ x->sb_me_block = 0;
+ else if (cm->seq_params->sb_size == BLOCK_64X64 && bsize < BLOCK_32X32)
+ x->sb_me_block = 0;
+ }
+
+ x->min_dist_inter_uv = INT64_MAX;
+ for (int idx = 0; idx < num_inter_modes + tot_num_comp_modes; ++idx) {
+ // If we are at the first compound mode, and the single modes already
+ // perform well, then end the search.
+ if (rt_sf->skip_compound_based_on_var && idx == num_inter_modes &&
+ skip_comp_based_on_var(search_state.vars, bsize)) {
+ break;
+ }
+
+ int is_single_pred = 1;
+ PREDICTION_MODE this_mode;
+
+ if (idx == 0 && !x->force_zeromv_skip_for_blk) {
+ // Set color sensitivity on first tested mode only.
+ // Use y-sad already computed in find_predictors: take the sad with motion
+ // vector closest to 0; the uv-sad computed below in set_color_sensitivity
+ // is for zeromv.
+ // For screen: first check if golden reference is being used, if so,
+ // force color_sensitivity on (=1) if the color sensitivity for sb_g is 1.
+ // The check in set_color_sensitivity() will then follow and check for
+ // setting the flag if the level is still 2 or 0.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ search_state.use_ref_frame_mask[GOLDEN_FRAME]) {
+ if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 1)
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] = 1;
+ if (x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 1)
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] = 1;
+ }
+ if (search_state.use_ref_frame_mask[LAST_FRAME] &&
+ x->pred_mv0_sad[LAST_FRAME] != INT_MAX) {
+ int y_sad = x->pred_mv0_sad[LAST_FRAME];
+ if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX &&
+ (abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.col) +
+ abs(search_state.frame_mv[NEARMV][LAST_FRAME].as_mv.row)) <
+ (abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) +
+ abs(search_state.frame_mv[NEARESTMV][LAST_FRAME].as_mv.row)))
+ y_sad = x->pred_mv1_sad[LAST_FRAME];
+ set_color_sensitivity(cpi, x, bsize, y_sad, x->source_variance,
+ search_state.yv12_mb[LAST_FRAME]);
+ }
+ }
+
+ // Check the inter mode can be skipped based on mode statistics and speed
+ // features settings.
+ if (skip_inter_mode_nonrd(cpi, x, &search_state, &thresh_sad_pred,
+ &force_mv_inter_layer, &is_single_pred,
+ &this_mode, &last_comp_ref_frame, &ref_frame,
+ &ref_frame2, idx, svc_mv, force_skip_low_temp_var,
+ sse_zeromv_norm, num_inter_modes, segment_id,
+ bsize, comp_use_zero_zeromv_only, check_globalmv))
+ continue;
+
+ // Select prediction reference frames.
+ for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
+ xd->plane[plane].pre[0] = search_state.yv12_mb[ref_frame][plane];
+ if (!is_single_pred)
+ xd->plane[plane].pre[1] = search_state.yv12_mb[ref_frame2][plane];
+ }
+
+ mi->ref_frame[0] = ref_frame;
+ mi->ref_frame[1] = ref_frame2;
+ set_ref_ptrs(cm, xd, ref_frame, ref_frame2);
+
+ // Check if the scaled reference frame should be used. This is set in the
+ // find_predictors() for each usable reference. If so, set the
+ // block_ref_scale_factors[] to no reference scaling.
+ if (search_state.use_scaled_ref_frame[ref_frame]) {
+ xd->block_ref_scale_factors[0] = &sf_no_scale;
+ }
+ if (!is_single_pred && search_state.use_scaled_ref_frame[ref_frame2]) {
+ xd->block_ref_scale_factors[1] = &sf_no_scale;
+ }
+
+ // Perform inter mode evaluation for non-rd
+ if (!handle_inter_mode_nonrd(
+ cpi, x, &search_state, ctx, &this_mode_pred, tmp_buffer,
+ inter_pred_params_sr, &best_early_term, &sse_zeromv_norm,
+ &check_globalmv,
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ &zero_last_cost_orig, denoise_svc_pickmode,
+#endif
+ idx, force_mv_inter_layer, is_single_pred, gf_temporal_ref,
+ use_model_yrd_large, filter_search_enabled_blk, bsize, this_mode,
+ filt_select, cb_pred_filter_search, reuse_inter_pred,
+ &sb_me_has_been_tested)) {
+ break;
+ }
+ }
+
+ // Restore mode data of best inter mode
+ mi->mode = best_pickmode->best_mode;
+ mi->motion_mode = best_pickmode->best_motion_mode;
+ mi->wm_params = best_pickmode->wm_params;
+ mi->num_proj_ref = best_pickmode->num_proj_ref;
+ mi->interp_filters = best_pickmode->best_pred_filter;
+ mi->tx_size = best_pickmode->best_tx_size;
+ memset(mi->inter_tx_size, mi->tx_size, sizeof(mi->inter_tx_size));
+ mi->ref_frame[0] = best_pickmode->best_ref_frame;
+ mi->mv[0].as_int = search_state
+ .frame_mv_best[best_pickmode->best_mode]
+ [best_pickmode->best_ref_frame]
+ .as_int;
+ mi->mv[1].as_int = 0;
+ if (best_pickmode->best_second_ref_frame > INTRA_FRAME) {
+ mi->ref_frame[1] = best_pickmode->best_second_ref_frame;
+ mi->mv[1].as_int = search_state
+ .frame_mv_best[best_pickmode->best_mode]
+ [best_pickmode->best_second_ref_frame]
+ .as_int;
+ }
+ // Perform intra prediction search, if the best SAD is above a certain
+ // threshold.
+ mi->angle_delta[PLANE_TYPE_Y] = 0;
+ mi->angle_delta[PLANE_TYPE_UV] = 0;
+ mi->filter_intra_mode_info.use_filter_intra = 0;
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_start(&x->ms_stat_nonrd.timer1);
+ x->ms_stat_nonrd.num_searches[bsize][DC_PRED]++;
+ x->ms_stat_nonrd.num_nonskipped_searches[bsize][DC_PRED]++;
+#endif
+
+ int force_palette_test = 0;
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ x->content_state_sb.source_sad_nonrd != kZeroSad &&
+ bsize <= BLOCK_16X16) {
+ unsigned int thresh_sse = cpi->rc.high_source_sad ? 15000 : 200000;
+ unsigned int thresh_source_var = cpi->rc.high_source_sad ? 50 : 200;
+ unsigned int best_sse_inter_motion =
+ (unsigned int)(search_state.best_rdc.sse >>
+ (b_width_log2_lookup[bsize] +
+ b_height_log2_lookup[bsize]));
+ if (best_sse_inter_motion > thresh_sse &&
+ x->source_variance > thresh_source_var)
+ force_palette_test = 1;
+ }
+
+ // Evaluate Intra modes in inter frame
+ if (!x->force_zeromv_skip_for_blk)
+ av1_estimate_intra_mode(cpi, x, bsize, best_early_term,
+ search_state.ref_costs_single[INTRA_FRAME],
+ reuse_inter_pred, &orig_dst, tmp_buffer,
+ &this_mode_pred, &search_state.best_rdc,
+ best_pickmode, ctx);
+
+ int skip_idtx_palette = (x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]) &&
+ x->content_state_sb.source_sad_nonrd != kZeroSad &&
+ !cpi->rc.high_source_sad;
+
+ int try_palette =
+ !skip_idtx_palette && cpi->oxcf.tool_cfg.enable_palette &&
+ av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+ mi->bsize);
+ try_palette =
+ try_palette &&
+ (is_mode_intra(best_pickmode->best_mode) || force_palette_test) &&
+ x->source_variance > 0 && !x->force_zeromv_skip_for_blk &&
+ (cpi->rc.high_source_sad || x->source_variance > 300);
+
+ if (rt_sf->prune_palette_nonrd && bsize > BLOCK_16X16) try_palette = 0;
+
+ // Perform screen content mode evaluation for non-rd
+ handle_screen_content_mode_nonrd(
+ cpi, x, &search_state, this_mode_pred, ctx, tmp_buffer, &orig_dst,
+ skip_idtx_palette, try_palette, bsize, reuse_inter_pred, mi_col, mi_row);
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.timer1);
+ x->ms_stat_nonrd.nonskipped_search_times[bsize][DC_PRED] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.timer1);
+#endif
+
+ pd->dst = orig_dst;
+ // Best mode is finalized. Restore the mode data to mbmi
+ if (try_palette) mi->palette_mode_info = best_pickmode->pmi;
+ mi->mode = best_pickmode->best_mode;
+ mi->ref_frame[0] = best_pickmode->best_ref_frame;
+ mi->ref_frame[1] = best_pickmode->best_second_ref_frame;
+ // For lossless: always force the skip flags off.
+ if (is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+ txfm_info->skip_txfm = 0;
+ memset(ctx->blk_skip, 0, sizeof(ctx->blk_skip[0]) * ctx->num_4x4_blk);
+ } else {
+ txfm_info->skip_txfm = best_pickmode->best_mode_skip_txfm;
+ }
+ if (has_second_ref(mi)) {
+ mi->comp_group_idx = 0;
+ mi->compound_idx = 1;
+ mi->interinter_comp.type = COMPOUND_AVERAGE;
+ }
+
+ if (!is_inter_block(mi)) {
+ mi->interp_filters = av1_broadcast_interp_filter(SWITCHABLE_FILTERS);
+ } else {
+ // If inter mode is selected and ref_frame was one that uses the
+ // scaled reference frame, then we can't use reuse_inter_pred.
+ if (search_state.use_scaled_ref_frame[best_pickmode->best_ref_frame] ||
+ (has_second_ref(mi) &&
+ search_state
+ .use_scaled_ref_frame[best_pickmode->best_second_ref_frame]))
+ x->reuse_inter_pred = 0;
+ }
+
+ // Restore the predicted samples of best mode to final buffer
+ if (reuse_inter_pred && best_pickmode->best_pred != NULL) {
+ PRED_BUFFER *const best_pred = best_pickmode->best_pred;
+ if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {
+ aom_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ }
+ }
+
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && resize_pending == 0 &&
+ denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow &&
+ cpi->denoiser.reset == 0) {
+ AV1_DENOISER_DECISION decision = COPY_BLOCK;
+ ctx->sb_skip_denoising = 0;
+ av1_pickmode_ctx_den_update(
+ &ctx_den, zero_last_cost_orig, search_state.ref_costs_single,
+ search_state.frame_mv, reuse_inter_pred, best_pickmode);
+ av1_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision,
+ gf_temporal_ref);
+ if (denoise_recheck_zeromv)
+ recheck_zeromv_after_denoising(
+ cpi, mi, x, xd, decision, &ctx_den, search_state.yv12_mb,
+ &search_state.best_rdc, best_pickmode, bsize, mi_row, mi_col);
+ best_pickmode->best_ref_frame = ctx_den.best_ref_frame;
+ }
+#endif
+
+ // Update the factors used for RD thresholding for all modes.
+ if (cpi->sf.inter_sf.adaptive_rd_thresh && !has_second_ref(mi)) {
+ THR_MODES best_mode_idx =
+ mode_idx[best_pickmode->best_ref_frame][mode_offset(mi->mode)];
+ if (best_pickmode->best_ref_frame == INTRA_FRAME) {
+ // Only consider the modes that are included in the intra_mode_list.
+ int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE);
+ for (int mode_index = 0; mode_index < intra_modes; mode_index++) {
+ update_thresh_freq_fact(cpi, x, bsize, INTRA_FRAME, best_mode_idx,
+ intra_mode_list[mode_index]);
+ }
+ } else {
+ PREDICTION_MODE this_mode;
+ for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+ update_thresh_freq_fact(cpi, x, bsize, best_pickmode->best_ref_frame,
+ best_mode_idx, this_mode);
+ }
+ }
+ }
+
+#if CONFIG_INTERNAL_STATS
+ store_coding_context_nonrd(x, ctx, mi->mode);
+#else
+ store_coding_context_nonrd(x, ctx);
+#endif // CONFIG_INTERNAL_STATS
+
+#if COLLECT_NONRD_PICK_MODE_STAT
+ aom_usec_timer_mark(&x->ms_stat_nonrd.bsize_timer);
+ x->ms_stat_nonrd.total_block_times[bsize] +=
+ aom_usec_timer_elapsed(&x->ms_stat_nonrd.bsize_timer);
+ print_time(&x->ms_stat_nonrd, bsize, cm->mi_params.mi_rows,
+ cm->mi_params.mi_cols, mi_row, mi_col);
+#endif // COLLECT_NONRD_PICK_MODE_STAT
+
+ *rd_cost = search_state.best_rdc;
+
+  // Reset the xd->block_ref_scale_factors[i], as they may have been set to
+  // point at &sf_no_scale, which becomes invalid after this function
+  // returns.
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+}
diff --git a/third_party/aom/av1/encoder/optical_flow.c b/third_party/aom/av1/encoder/optical_flow.c
new file mode 100644
index 0000000000..dc168e7aee
--- /dev/null
+++ b/third_party/aom/av1/encoder/optical_flow.c
@@ -0,0 +1,1113 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <math.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/mathutils.h"
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/optical_flow.h"
+#include "av1/encoder/sparse_linear_solver.h"
+#include "av1/encoder/reconinter_enc.h"
+
+#if CONFIG_OPTICAL_FLOW_API
+
+void av1_init_opfl_params(OPFL_PARAMS *opfl_params) {
+ opfl_params->pyramid_levels = OPFL_PYRAMID_LEVELS;
+ opfl_params->warping_steps = OPFL_WARPING_STEPS;
+ opfl_params->lk_params = NULL;
+}
+
+void av1_init_lk_params(LK_PARAMS *lk_params) {
+ lk_params->window_size = OPFL_WINDOW_SIZE;
+}
+
+// Helper function to determine whether a frame is encoded with high bit-depth.
+static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
+ return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+}
+
+// Helper function to determine whether optical flow method is sparse.
+static INLINE int is_sparse(const OPFL_PARAMS *opfl_params) {
+ return (opfl_params->flags & OPFL_FLAG_SPARSE) ? 1 : 0;
+}
+
+static void gradients_over_window(const YV12_BUFFER_CONFIG *frame,
+ const YV12_BUFFER_CONFIG *ref_frame,
+ const double x_coord, const double y_coord,
+ const int window_size, const int bit_depth,
+ double *ix, double *iy, double *it,
+ LOCALMV *mv);
+
+// coefficients for bilinear interpolation on unit square
+static int pixel_interp(const double x, const double y, const double b00,
+ const double b01, const double b10, const double b11) {
+ const int xint = (int)x;
+ const int yint = (int)y;
+ const double xdec = x - xint;
+ const double ydec = y - yint;
+ const double a = (1 - xdec) * (1 - ydec);
+ const double b = xdec * (1 - ydec);
+ const double c = (1 - xdec) * ydec;
+ const double d = xdec * ydec;
+  // if x, y are already integers, this reduces to b00
+ int interp = (int)round(a * b00 + b * b01 + c * b10 + d * b11);
+ return interp;
+}
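+
+// A worked example of the bilinear weights above: for (x, y) = (0.25, 0.5)
+// with corner samples b00 = 0, b01 = 8, b10 = 4, b11 = 12, the weights are
+// a = 0.375, b = 0.125, c = 0.375, d = 0.125, so
+// interp = round(0.125 * 8 + 0.375 * 4 + 0.125 * 12) = 4.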
+
+// Scharr filter to compute spatial gradient
+static void spatial_gradient(const YV12_BUFFER_CONFIG *frame, const int x_coord,
+ const int y_coord, const int direction,
+ double *derivative) {
+ double *filter;
+ // Scharr filters
+ double gx[9] = { -3, 0, 3, -10, 0, 10, -3, 0, 3 };
+ double gy[9] = { -3, -10, -3, 0, 0, 0, 3, 10, 3 };
+ if (direction == 0) { // x direction
+ filter = gx;
+ } else { // y direction
+ filter = gy;
+ }
+ int idx = 0;
+ double d = 0;
+ for (int yy = -1; yy <= 1; yy++) {
+ for (int xx = -1; xx <= 1; xx++) {
+ d += filter[idx] *
+ frame->y_buffer[(y_coord + yy) * frame->y_stride + (x_coord + xx)];
+ idx++;
+ }
+ }
+ // normalization scaling factor for scharr
+ *derivative = d / 32.0;
+}
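+
+// Sanity check on the 1/32 normalization above: on a horizontal unit ramp
+// (pixel value equal to its x coordinate), the gx taps contribute
+// 2 * (3 + 10 + 3) = 32, so the returned derivative is exactly 1.0 per
+// pixel, as expected for a unit slope.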
+
+// Determine the spatial gradient at subpixel locations
+// For example, when reducing images for pyramidal LK,
+// corners found in original image may be at subpixel locations.
+static void gradient_interp(double *fullpel_deriv, const double x_coord,
+ const double y_coord, const int w, const int h,
+ double *derivative) {
+ const int xint = (int)x_coord;
+ const int yint = (int)y_coord;
+ double interp;
+ if (xint + 1 > w - 1 || yint + 1 > h - 1) {
+ interp = fullpel_deriv[yint * w + xint];
+ } else {
+ interp = pixel_interp(x_coord, y_coord, fullpel_deriv[yint * w + xint],
+ fullpel_deriv[yint * w + (xint + 1)],
+ fullpel_deriv[(yint + 1) * w + xint],
+ fullpel_deriv[(yint + 1) * w + (xint + 1)]);
+ }
+
+ *derivative = interp;
+}
+
+static void temporal_gradient(const YV12_BUFFER_CONFIG *frame,
+ const YV12_BUFFER_CONFIG *frame2,
+ const double x_coord, const double y_coord,
+ const int bit_depth, double *derivative,
+ LOCALMV *mv) {
+ const int w = 2;
+ const int h = 2;
+ uint8_t pred1[4];
+ uint8_t pred2[4];
+
+ const int y = (int)y_coord;
+ const int x = (int)x_coord;
+ const double ydec = y_coord - y;
+ const double xdec = x_coord - x;
+ const int is_intrabc = 0; // Is intra-copied?
+ const int is_high_bitdepth = is_frame_high_bitdepth(frame2);
+ const int subsampling_x = 0, subsampling_y = 0; // for y-buffer
+ const int_interpfilters interp_filters =
+ av1_broadcast_interp_filter(MULTITAP_SHARP);
+ const int plane = 0; // y-plane
+ const struct buf_2d ref_buf2 = { NULL, frame2->y_buffer, frame2->y_crop_width,
+ frame2->y_crop_height, frame2->y_stride };
+ struct scale_factors scale;
+ av1_setup_scale_factors_for_frame(&scale, frame->y_crop_width,
+ frame->y_crop_height, frame->y_crop_width,
+ frame->y_crop_height);
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
+ subsampling_y, bit_depth, is_high_bitdepth, is_intrabc,
+ &scale, &ref_buf2, interp_filters);
+ inter_pred_params.interp_filter_params[0] =
+ &av1_interp_filter_params_list[interp_filters.as_filters.x_filter];
+ inter_pred_params.interp_filter_params[1] =
+ &av1_interp_filter_params_list[interp_filters.as_filters.y_filter];
+ inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+ MV newmv = { .row = (int16_t)round((mv->row + xdec) * 8),
+ .col = (int16_t)round((mv->col + ydec) * 8) };
+ av1_enc_build_one_inter_predictor(pred2, w, &newmv, &inter_pred_params);
+ const struct buf_2d ref_buf1 = { NULL, frame->y_buffer, frame->y_crop_width,
+ frame->y_crop_height, frame->y_stride };
+ av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
+ subsampling_y, bit_depth, is_high_bitdepth, is_intrabc,
+ &scale, &ref_buf1, interp_filters);
+ inter_pred_params.interp_filter_params[0] =
+ &av1_interp_filter_params_list[interp_filters.as_filters.x_filter];
+ inter_pred_params.interp_filter_params[1] =
+ &av1_interp_filter_params_list[interp_filters.as_filters.y_filter];
+ inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+ MV zeroMV = { .row = (int16_t)round(xdec * 8),
+ .col = (int16_t)round(ydec * 8) };
+ av1_enc_build_one_inter_predictor(pred1, w, &zeroMV, &inter_pred_params);
+
+ *derivative = pred2[0] - pred1[0];
+}
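+
+// Note on the temporal derivative: it is computed as pred2[0] - pred1[0],
+// i.e. frame2 sampled at (x, y) displaced by mv, minus frame sampled at
+// (x, y), with both predictions built at the same subpel phase through the
+// encoder's MULTITAP_SHARP interpolator, keeping the interpolation bias
+// consistent between the two samples.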
+
+// Numerically differentiate over a window_size x window_size window
+// surrounding the (x, y) location. Alters ix, iy, it to contain the
+// numerical partial derivatives.
+static void gradients_over_window(const YV12_BUFFER_CONFIG *frame,
+ const YV12_BUFFER_CONFIG *ref_frame,
+ const double x_coord, const double y_coord,
+ const int window_size, const int bit_depth,
+ double *ix, double *iy, double *it,
+ LOCALMV *mv) {
+ const double left = x_coord - window_size / 2.0;
+ const double top = y_coord - window_size / 2.0;
+ // gradient operators need pixel before and after (start at 1)
+ const double x_start = AOMMAX(1, left);
+ const double y_start = AOMMAX(1, top);
+ const int frame_height = frame->y_crop_height;
+ const int frame_width = frame->y_crop_width;
+ double deriv_x;
+ double deriv_y;
+ double deriv_t;
+
+ const double x_end = AOMMIN(x_coord + window_size / 2.0, frame_width - 2);
+ const double y_end = AOMMIN(y_coord + window_size / 2.0, frame_height - 2);
+ const int xs = (int)AOMMAX(1, x_start - 1);
+ const int ys = (int)AOMMAX(1, y_start - 1);
+ const int xe = (int)AOMMIN(x_end + 2, frame_width - 2);
+ const int ye = (int)AOMMIN(y_end + 2, frame_height - 2);
+ // with normalization, gradients may be double values
+ double *fullpel_dx = aom_malloc((ye - ys) * (xe - xs) * sizeof(deriv_x));
+ double *fullpel_dy = aom_malloc((ye - ys) * (xe - xs) * sizeof(deriv_y));
+ if (!fullpel_dx || !fullpel_dy) {
+ aom_free(fullpel_dx);
+ aom_free(fullpel_dy);
+ return;
+ }
+
+  // TODO(any): This could be more efficient in the case that x_coord
+  // and y_coord are integers, but it may look messier.
+
+ // calculate spatial gradients at full pixel locations
+ for (int j = ys; j < ye; j++) {
+ for (int i = xs; i < xe; i++) {
+ spatial_gradient(frame, i, j, 0, &deriv_x);
+ spatial_gradient(frame, i, j, 1, &deriv_y);
+ int idx = (j - ys) * (xe - xs) + (i - xs);
+ fullpel_dx[idx] = deriv_x;
+ fullpel_dy[idx] = deriv_y;
+ }
+ }
+ // compute numerical differentiation for every pixel in window
+ // (this potentially includes subpixels)
+ for (double j = y_start; j < y_end; j++) {
+ for (double i = x_start; i < x_end; i++) {
+ temporal_gradient(frame, ref_frame, i, j, bit_depth, &deriv_t, mv);
+ gradient_interp(fullpel_dx, i - xs, j - ys, xe - xs, ye - ys, &deriv_x);
+ gradient_interp(fullpel_dy, i - xs, j - ys, xe - xs, ye - ys, &deriv_y);
+ int idx = (int)(j - top) * window_size + (int)(i - left);
+ ix[idx] = deriv_x;
+ iy[idx] = deriv_y;
+ it[idx] = deriv_t;
+ }
+ }
+ // TODO(any): to avoid setting deriv arrays to zero for every iteration,
+ // could instead pass these two values back through function call
+ // int first_idx = (int)(y_start - top) * window_size + (int)(x_start - left);
+ // int width = window_size - ((int)(x_start - left) + (int)(left + window_size
+ // - x_end));
+
+ aom_free(fullpel_dx);
+ aom_free(fullpel_dy);
+}
+
+// To compute eigenvalues of 2x2 matrix: Solve for lambda where
+// Determinant(matrix - lambda*identity) == 0
+static void eigenvalues_2x2(const double *matrix, double *eig) {
+ const double a = 1;
+ const double b = -1 * matrix[0] - matrix[3];
+ const double c = -1 * matrix[1] * matrix[2] + matrix[0] * matrix[3];
+ // quadratic formula
+ const double discriminant = b * b - 4 * a * c;
+ eig[0] = (-b - sqrt(discriminant)) / (2.0 * a);
+ eig[1] = (-b + sqrt(discriminant)) / (2.0 * a);
+  // ensure the eigenvalues are ordered by increasing magnitude
+ if (fabs(eig[0]) > fabs(eig[1])) {
+ double tmp = eig[0];
+ eig[0] = eig[1];
+ eig[1] = tmp;
+ }
+}
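+
+// A worked example of the quadratic above: for the row-major matrix
+// {2, 0, 0, 3}, b = -(2 + 3) = -5 and c = 2 * 3 - 0 = 6, so the
+// discriminant is 25 - 24 = 1 and eig = {(5 - 1) / 2, (5 + 1) / 2} = {2, 3},
+// already ordered by magnitude.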
+
+// Shi-Tomasi corner detection criteria
+static double corner_score(const YV12_BUFFER_CONFIG *frame_to_filter,
+ const YV12_BUFFER_CONFIG *ref_frame, const int x,
+ const int y, double *i_x, double *i_y, double *i_t,
+ const int n, const int bit_depth) {
+ double eig[2];
+ LOCALMV mv = { .row = 0, .col = 0 };
+  // TODO(any): Technically, ref_frame and i_t are not used by the corner
+  // score, so these could be replaced by dummy variables, or this could be
+  // changed to a spatial-gradient-only function over the window.
+ gradients_over_window(frame_to_filter, ref_frame, x, y, n, bit_depth, i_x,
+ i_y, i_t, &mv);
+ double Mres1[1] = { 0 }, Mres2[1] = { 0 }, Mres3[1] = { 0 };
+ multiply_mat(i_x, i_x, Mres1, 1, n * n, 1);
+ multiply_mat(i_x, i_y, Mres2, 1, n * n, 1);
+ multiply_mat(i_y, i_y, Mres3, 1, n * n, 1);
+ double M[4] = { Mres1[0], Mres2[0], Mres2[0], Mres3[0] };
+ eigenvalues_2x2(M, eig);
+ return fabs(eig[0]);
+}
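+
+// Note: M above is the 2x2 structure tensor of the window,
+//   M = [ sum(i_x * i_x)  sum(i_x * i_y) ]
+//       [ sum(i_x * i_y)  sum(i_y * i_y) ],
+// and the Shi-Tomasi score is the magnitude of its smaller eigenvalue: a
+// large minimum eigenvalue means strong gradients in two independent
+// directions, i.e. a corner.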
+
+// Finds corners in frame_to_filter
+// For less strict requirements (i.e. more corners), decrease threshold
+static int detect_corners(const YV12_BUFFER_CONFIG *frame_to_filter,
+ const YV12_BUFFER_CONFIG *ref_frame,
+ const int maxcorners, int *ref_corners,
+ const int bit_depth) {
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+  // TODO(any): Currently, if maxcorners is decreased, it only means corners
+  // will be omitted from the bottom-right of the image. If maxcorners is
+  // actually used, then this algorithm would need to re-iterate and choose a
+  // threshold based on that.
+ assert(maxcorners == frame_height * frame_width);
+ int countcorners = 0;
+ const double threshold = 0.1;
+ double score;
+ const int n = 3;
+ double i_x[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ double i_y[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ double i_t[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ const int fromedge = n;
+ double max_score = corner_score(frame_to_filter, ref_frame, fromedge,
+ fromedge, i_x, i_y, i_t, n, bit_depth);
+ // rough estimate of max corner score in image
+ for (int x = fromedge; x < frame_width - fromedge; x += 1) {
+ for (int y = fromedge; y < frame_height - fromedge; y += frame_height / 5) {
+ for (int i = 0; i < n * n; i++) {
+ i_x[i] = 0;
+ i_y[i] = 0;
+ i_t[i] = 0;
+ }
+ score = corner_score(frame_to_filter, ref_frame, x, y, i_x, i_y, i_t, n,
+ bit_depth);
+ if (score > max_score) {
+ max_score = score;
+ }
+ }
+ }
+ // score all the points and choose corners over threshold
+ for (int x = fromedge; x < frame_width - fromedge; x += 1) {
+ for (int y = fromedge;
+ (y < frame_height - fromedge) && countcorners < maxcorners; y += 1) {
+ for (int i = 0; i < n * n; i++) {
+ i_x[i] = 0;
+ i_y[i] = 0;
+ i_t[i] = 0;
+ }
+ score = corner_score(frame_to_filter, ref_frame, x, y, i_x, i_y, i_t, n,
+ bit_depth);
+ if (score > threshold * max_score) {
+ ref_corners[countcorners * 2] = x;
+ ref_corners[countcorners * 2 + 1] = y;
+ countcorners++;
+ }
+ }
+ }
+ return countcorners;
+}
+
+// weights is an nxn matrix filled with a Gaussian function of the distance
+// from the center point, optionally normalized to sum to 1.
+static void gaussian(const double sigma, const int n, const int normalize,
+ double *weights) {
+ double total_weight = 0;
+ for (int j = 0; j < n; j++) {
+ for (int i = 0; i < n; i++) {
+ double distance = sqrt(pow(n / 2 - i, 2) + pow(n / 2 - j, 2));
+ double weight = exp(-0.5 * pow(distance / sigma, 2));
+ weights[j * n + i] = weight;
+ total_weight += weight;
+ }
+ }
+ if (normalize == 1) {
+    // Normalize over all n * n entries of the weight matrix.
+    for (int j = 0; j < n * n; j++) {
+ weights[j] = weights[j] / total_weight;
+ }
+ }
+}
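+
+// An unnormalized example: for n = 5 and sigma = 1.0, the center sample
+// (distance 0) gets weight exp(0) = 1.0, while a corner sample at distance
+// sqrt(8) gets exp(-0.5 * 8) = exp(-4), roughly 0.018, so samples near the
+// window center dominate any weighted fit that uses these weights.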
+
+static double convolve(const double *filter, const int *img, const int size) {
+ double result = 0;
+ for (int i = 0; i < size; i++) {
+ result += filter[i] * img[i];
+ }
+ return result;
+}
+
+// Applies a Gaussian low-pass smoothing filter to produce
+// a corresponding lower resolution image with halved dimensions
+static void reduce(uint8_t *img, int height, int width, int stride,
+ uint8_t *reduced_img) {
+ const int new_width = width / 2;
+ const int window_size = 5;
+ const double gaussian_filter[25] = {
+ 1. / 256, 1.0 / 64, 3. / 128, 1. / 64, 1. / 256, 1. / 64, 1. / 16,
+ 3. / 32, 1. / 16, 1. / 64, 3. / 128, 3. / 32, 9. / 64, 3. / 32,
+ 3. / 128, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 1. / 256,
+ 1. / 64, 3. / 128, 1. / 64, 1. / 256
+ };
+ // filter is 5x5 so need prev and forward 2 pixels
+ int img_section[25];
+ for (int y = 0; y < height - 1; y += 2) {
+ for (int x = 0; x < width - 1; x += 2) {
+ int i = 0;
+ for (int yy = y - window_size / 2; yy <= y + window_size / 2; yy++) {
+ for (int xx = x - window_size / 2; xx <= x + window_size / 2; xx++) {
+ int yvalue = yy;
+ int xvalue = xx;
+          // Replicate edge pixels for coordinates outside the boundary.
+ if (yvalue < 0) yvalue = 0;
+ if (xvalue < 0) xvalue = 0;
+ if (yvalue >= height) yvalue = height - 1;
+ if (xvalue >= width) xvalue = width - 1;
+ img_section[i++] = img[yvalue * stride + xvalue];
+ }
+ }
+ reduced_img[(y / 2) * new_width + (x / 2)] = (uint8_t)convolve(
+ gaussian_filter, img_section, window_size * window_size);
+ }
+ }
+}
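+
+// Note: the 5x5 kernel above is the outer product of the binomial filter
+// [1, 4, 6, 4, 1] / 16 with itself (entries 1/256 through 9/64), the usual
+// Gaussian-pyramid reduction kernel. Each call halves both dimensions, so
+// repeated calls build the pyramid used for coarse-to-fine Lucas-Kanade.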
+
+static int cmpfunc(const void *a, const void *b) {
+ return (*(int *)a - *(int *)b);
+}
+static void filter_mvs(const MV_FILTER_TYPE mv_filter, const int frame_height,
+ const int frame_width, LOCALMV *localmvs, MV *mvs) {
+ const int n = 5; // window size
+ // for smoothing filter
+ const double gaussian_filter[25] = {
+ 1. / 256, 1. / 64, 3. / 128, 1. / 64, 1. / 256, 1. / 64, 1. / 16,
+ 3. / 32, 1. / 16, 1. / 64, 3. / 128, 3. / 32, 9. / 64, 3. / 32,
+ 3. / 128, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 1. / 256,
+ 1. / 64, 3. / 128, 1. / 64, 1. / 256
+ };
+ // for median filter
+ int mvrows[25];
+ int mvcols[25];
+ if (mv_filter != MV_FILTER_NONE) {
+ for (int y = 0; y < frame_height; y++) {
+ for (int x = 0; x < frame_width; x++) {
+ int center_idx = y * frame_width + x;
+ int i = 0;
+ double filtered_row = 0;
+ double filtered_col = 0;
+ for (int yy = y - n / 2; yy <= y + n / 2; yy++) {
+ for (int xx = x - n / 2; xx <= x + n / 2; xx++) {
+ int yvalue = yy;
+ int xvalue = xx;
+            // Replicate edge pixels for coordinates outside the boundary.
+ if (yvalue < 0) yvalue = 0;
+ if (xvalue < 0) xvalue = 0;
+ if (yvalue >= frame_height) yvalue = frame_height - 1;
+ if (xvalue >= frame_width) xvalue = frame_width - 1;
+ int index = yvalue * frame_width + xvalue;
+ if (mv_filter == MV_FILTER_SMOOTH) {
+ filtered_row += mvs[index].row * gaussian_filter[i];
+ filtered_col += mvs[index].col * gaussian_filter[i];
+ } else if (mv_filter == MV_FILTER_MEDIAN) {
+ mvrows[i] = mvs[index].row;
+ mvcols[i] = mvs[index].col;
+ }
+ i++;
+ }
+ }
+
+ MV mv = mvs[center_idx];
+ if (mv_filter == MV_FILTER_SMOOTH) {
+ mv.row = (int16_t)filtered_row;
+ mv.col = (int16_t)filtered_col;
+ } else if (mv_filter == MV_FILTER_MEDIAN) {
+ qsort(mvrows, 25, sizeof(mv.row), cmpfunc);
+ qsort(mvcols, 25, sizeof(mv.col), cmpfunc);
+ mv.row = mvrows[25 / 2];
+ mv.col = mvcols[25 / 2];
+ }
+        LOCALMV localmv = { .row = ((double)mv.row) / 8,
+                            .col = ((double)mv.col) / 8 };
+ localmvs[y * frame_width + x] = localmv;
+ // if mvs array is immediately updated here, then the result may
+ // propagate to other pixels.
+ }
+ }
+ for (int i = 0; i < frame_height * frame_width; i++) {
+ MV mv = { .row = (int16_t)round(8 * localmvs[i].row),
+ .col = (int16_t)round(8 * localmvs[i].col) };
+ mvs[i] = mv;
+ }
+ }
+}
+
+// Computes optical flow at a single pyramid level
+// using the Lucas-Kanade algorithm.
+// Modifies the mvs array.
+static void lucas_kanade(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame, const int level,
+ const LK_PARAMS *lk_params, const int num_ref_corners,
+ int *ref_corners, const int mv_stride,
+ const int bit_depth, LOCALMV *mvs) {
+ assert(lk_params->window_size > 0 && lk_params->window_size % 2 == 0);
+ const int n = lk_params->window_size;
+ // algorithm is sensitive to window size
+ double *i_x = (double *)aom_malloc(n * n * sizeof(*i_x));
+ double *i_y = (double *)aom_malloc(n * n * sizeof(*i_y));
+ double *i_t = (double *)aom_malloc(n * n * sizeof(*i_t));
+ double *weights = (double *)aom_malloc(n * n * sizeof(*weights));
+ if (!i_x || !i_y || !i_t || !weights) goto free_lk_buf;
+
+ const int expand_multiplier = (int)pow(2, level);
+ double sigma = 0.2 * n;
+  // Normalizing doesn't affect the solution, since it scales every
+  // component of M and b equally.
+ gaussian(sigma, n, 0, weights);
+ for (int i = 0; i < num_ref_corners; i++) {
+ const double x_coord = 1.0 * ref_corners[i * 2] / expand_multiplier;
+ const double y_coord = 1.0 * ref_corners[i * 2 + 1] / expand_multiplier;
+ int highres_x = ref_corners[i * 2];
+ int highres_y = ref_corners[i * 2 + 1];
+ int mv_idx = highres_y * (mv_stride) + highres_x;
+ LOCALMV mv_old = mvs[mv_idx];
+ mv_old.row = mv_old.row / expand_multiplier;
+ mv_old.col = mv_old.col / expand_multiplier;
+    // Zero-initialize with a loop instead of memset, since all-bits-zero is
+    // not guaranteed to represent 0.0 for doubles.
+ for (int j = 0; j < n * n; j++) {
+ i_x[j] = 0;
+ i_y[j] = 0;
+ i_t[j] = 0;
+ }
+ gradients_over_window(from_frame, to_frame, x_coord, y_coord, n, bit_depth,
+ i_x, i_y, i_t, &mv_old);
+    double m11 = 0, m12 = 0, m22 = 0;
+    double b1 = 0, b2 = 0;
+    for (int j = 0; j < n * n; j++) {
+      m11 += weights[j] * i_x[j] * i_x[j];
+      m12 += weights[j] * i_x[j] * i_y[j];
+      m22 += weights[j] * i_y[j] * i_y[j];
+      b1 += weights[j] * i_x[j] * i_t[j];
+      b2 += weights[j] * i_y[j] * i_t[j];
+    }
+    // Structure tensor M and right-hand side b of the LK normal equations.
+    double M[4] = { m11, m12, m12, m22 };
+    double b[2] = { -b1, -b2 };
+ double eig[2] = { 1, 1 };
+ eigenvalues_2x2(M, eig);
+    const double threshold = 0.1;
+    if (fabs(eig[0]) > threshold) {
+      // M is well-conditioned enough to invert; otherwise the displacement
+      // defaults to zero.
+ double u[2] = { 0, 0 };
+ linsolve(2, M, 2, b, u);
+ int mult = 1;
+ if (level != 0)
+ mult = expand_multiplier; // mv doubles when resolution doubles
+ LOCALMV mv = { .row = (mult * (u[0] + mv_old.row)),
+ .col = (mult * (u[1] + mv_old.col)) };
+      mvs[mv_idx] = mv;
+ }
+ }
+free_lk_buf:
+ aom_free(weights);
+ aom_free(i_t);
+ aom_free(i_x);
+ aom_free(i_y);
+}
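+
+// For the 2x2 system solved above, linsolve() is equivalent to the closed
+// form below (Cramer's rule). A minimal sketch, assuming det(M) != 0, which
+// the eigenvalue threshold above is meant to ensure:
+#if 0
+static void solve_2x2_example(const double M[4], const double b[2],
+                              double u[2]) {
+  const double det = M[0] * M[3] - M[1] * M[2];
+  u[0] = (b[0] * M[3] - M[1] * b[1]) / det;
+  u[1] = (M[0] * b[1] - M[2] * b[0]) / det;
+}
+#endif  // 0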
+
+// Warp src_frame into warped_frame according to mvs; each mv points from a
+// pixel location in warped_frame to its source location in src_frame.
+static void warp_back_frame(YV12_BUFFER_CONFIG *warped_frame,
+ const YV12_BUFFER_CONFIG *src_frame,
+ const LOCALMV *mvs, int mv_stride) {
+ int w, h;
+ const int fw = src_frame->y_crop_width;
+ const int fh = src_frame->y_crop_height;
+ const int src_fs = src_frame->y_stride, warped_fs = warped_frame->y_stride;
+ const uint8_t *src_buf = src_frame->y_buffer;
+ uint8_t *warped_buf = warped_frame->y_buffer;
+ double temp;
+ for (h = 0; h < fh; h++) {
+ for (w = 0; w < fw; w++) {
+ double cord_x = (double)w + mvs[h * mv_stride + w].col;
+ double cord_y = (double)h + mvs[h * mv_stride + w].row;
+ cord_x = fclamp(cord_x, 0, (double)(fw - 1));
+ cord_y = fclamp(cord_y, 0, (double)(fh - 1));
+ const int floorx = (int)floor(cord_x);
+ const int floory = (int)floor(cord_y);
+ const double fracx = cord_x - (double)floorx;
+ const double fracy = cord_y - (double)floory;
+
+ temp = 0;
+ for (int hh = 0; hh < 2; hh++) {
+ const double weighth = hh ? (fracy) : (1 - fracy);
+ for (int ww = 0; ww < 2; ww++) {
+ const double weightw = ww ? (fracx) : (1 - fracx);
+ int y = floory + hh;
+ int x = floorx + ww;
+ y = clamp(y, 0, fh - 1);
+ x = clamp(x, 0, fw - 1);
+ temp += (double)src_buf[y * src_fs + x] * weightw * weighth;
+ }
+ }
+ warped_buf[h * warped_fs + w] = (uint8_t)round(temp);
+ }
+ }
+}
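+
+// Note on warp_back_frame(): the four taps above implement plain bilinear
+// interpolation. The weights (1 - fracx) * (1 - fracy), fracx * (1 - fracy),
+// (1 - fracx) * fracy and fracx * fracy always sum to 1, so flat regions are
+// preserved exactly.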
+
+// Same as warp_back_frame, but using a better interpolation filter.
+static void warp_back_frame_intp(YV12_BUFFER_CONFIG *warped_frame,
+ const YV12_BUFFER_CONFIG *src_frame,
+ const LOCALMV *mvs, int mv_stride) {
+ int w, h;
+ const int fw = src_frame->y_crop_width;
+ const int fh = src_frame->y_crop_height;
+ const int warped_fs = warped_frame->y_stride;
+ uint8_t *warped_buf = warped_frame->y_buffer;
+ const int blk = 2;
+ uint8_t temp_blk[4];
+
+ const int is_intrabc = 0; // Is intra-copied?
+ const int is_high_bitdepth = is_frame_high_bitdepth(src_frame);
+ const int subsampling_x = 0, subsampling_y = 0; // for y-buffer
+ const int_interpfilters interp_filters =
+ av1_broadcast_interp_filter(MULTITAP_SHARP2);
+ const int plane = 0; // y-plane
+ const struct buf_2d ref_buf2 = { NULL, src_frame->y_buffer,
+ src_frame->y_crop_width,
+ src_frame->y_crop_height,
+ src_frame->y_stride };
+ const int bit_depth = src_frame->bit_depth;
+ struct scale_factors scale;
+ av1_setup_scale_factors_for_frame(
+ &scale, src_frame->y_crop_width, src_frame->y_crop_height,
+ src_frame->y_crop_width, src_frame->y_crop_height);
+
+ for (h = 0; h < fh; h++) {
+ for (w = 0; w < fw; w++) {
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, blk, blk, h, w, subsampling_x,
+ subsampling_y, bit_depth, is_high_bitdepth,
+ is_intrabc, &scale, &ref_buf2, interp_filters);
+ inter_pred_params.interp_filter_params[0] =
+ &av1_interp_filter_params_list[interp_filters.as_filters.x_filter];
+ inter_pred_params.interp_filter_params[1] =
+ &av1_interp_filter_params_list[interp_filters.as_filters.y_filter];
+ inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+ MV newmv = { .row = (int16_t)round((mvs[h * mv_stride + w].row) * 8),
+ .col = (int16_t)round((mvs[h * mv_stride + w].col) * 8) };
+ av1_enc_build_one_inter_predictor(temp_blk, blk, &newmv,
+ &inter_pred_params);
+ warped_buf[h * warped_fs + w] = temp_blk[0];
+ }
+ }
+}
+
+#define DERIVATIVE_FILTER_LENGTH 7
+static const double filter[DERIVATIVE_FILTER_LENGTH] = {
+  -1.0 / 60, 9.0 / 60, -45.0 / 60, 0, 45.0 / 60, -9.0 / 60, 1.0 / 60
+};
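+
+// This is the standard 7-tap central-difference kernel for the first
+// derivative, sampled at offsets -3..3. As a sanity check, the first-moment
+// condition sum(k * filter[k + 3]) over k = -3..3 evaluates to
+// (3 - 18 + 45 + 0 + 45 - 18 + 3) / 60 = 1, so a linear ramp of slope s
+// yields a gradient of exactly s.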
+
+// Get gradient of the whole frame
+static void get_frame_gradients(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame, double *ix,
+ double *iy, double *it, int grad_stride) {
+ int w, h, k, idx;
+ const int fw = from_frame->y_crop_width;
+ const int fh = from_frame->y_crop_height;
+ const int from_fs = from_frame->y_stride, to_fs = to_frame->y_stride;
+ const uint8_t *from_buf = from_frame->y_buffer;
+ const uint8_t *to_buf = to_frame->y_buffer;
+
+ const int lh = DERIVATIVE_FILTER_LENGTH;
+ const int hleft = (lh - 1) / 2;
+
+ for (h = 0; h < fh; h++) {
+ for (w = 0; w < fw; w++) {
+ // x
+ ix[h * grad_stride + w] = 0;
+ for (k = 0; k < lh; k++) {
+        // If we want to make this block-dependent, we need to extend the
+        // boundaries using other initializations.
+ idx = w + k - hleft;
+ idx = clamp(idx, 0, fw - 1);
+ ix[h * grad_stride + w] += filter[k] * 0.5 *
+ ((double)from_buf[h * from_fs + idx] +
+ (double)to_buf[h * to_fs + idx]);
+ }
+ // y
+ iy[h * grad_stride + w] = 0;
+ for (k = 0; k < lh; k++) {
+        // If we want to make this block-dependent, we need to extend the
+        // boundaries using other initializations.
+ idx = h + k - hleft;
+ idx = clamp(idx, 0, fh - 1);
+ iy[h * grad_stride + w] += filter[k] * 0.5 *
+ ((double)from_buf[idx * from_fs + w] +
+ (double)to_buf[idx * to_fs + w]);
+ }
+ // t
+ it[h * grad_stride + w] =
+ (double)to_buf[h * to_fs + w] - (double)from_buf[h * from_fs + w];
+ }
+ }
+}
+
+// Solve the linear equations given by the H-S method
+static void solve_horn_schunck(const double *ix, const double *iy,
+ const double *it, int grad_stride, int width,
+ int height, const LOCALMV *init_mvs,
+ int init_mv_stride, LOCALMV *mvs,
+ int mv_stride) {
+ // TODO(bohanli): May just need to allocate the buffers once per optical flow
+ // calculation
+ int *row_pos = aom_calloc(width * height * 28, sizeof(*row_pos));
+ int *col_pos = aom_calloc(width * height * 28, sizeof(*col_pos));
+ double *values = aom_calloc(width * height * 28, sizeof(*values));
+ double *mv_vec = aom_calloc(width * height * 2, sizeof(*mv_vec));
+ double *mv_init_vec = aom_calloc(width * height * 2, sizeof(*mv_init_vec));
+ double *temp_b = aom_calloc(width * height * 2, sizeof(*temp_b));
+ double *b = aom_calloc(width * height * 2, sizeof(*b));
+ if (!row_pos || !col_pos || !values || !mv_vec || !mv_init_vec || !temp_b ||
+ !b) {
+ goto free_hs_solver_buf;
+ }
+
+  // Location offsets of the neighboring pixels; k < 4 are the 4 direct
+  // neighbors.
+ const int check_locs_y[12] = { 0, 0, -1, 1, -1, -1, 1, 1, 0, 0, -2, 2 };
+ const int check_locs_x[12] = { -1, 1, 0, 0, -1, 1, -1, 1, -2, 2, 0, 0 };
+
+ int h, w, checkh, checkw, k, ret;
+ const int offset = height * width;
+ SPARSE_MTX A;
+ int c = 0;
+ const double lambda = 100;
+
+ for (w = 0; w < width; w++) {
+ for (h = 0; h < height; h++) {
+ mv_init_vec[w * height + h] = init_mvs[h * init_mv_stride + w].col;
+ mv_init_vec[w * height + h + offset] =
+ init_mvs[h * init_mv_stride + w].row;
+ }
+ }
+
+ // get matrix A
+ for (w = 0; w < width; w++) {
+ for (h = 0; h < height; h++) {
+ int center_num_direct = 4;
+ const int center_idx = w * height + h;
+ if (w == 0 || w == width - 1) center_num_direct--;
+ if (h == 0 || h == height - 1) center_num_direct--;
+ // diagonal entry for this row from the center pixel
+ double cor_w = center_num_direct * center_num_direct + center_num_direct;
+ row_pos[c] = center_idx;
+ col_pos[c] = center_idx;
+ values[c] = lambda * cor_w;
+ c++;
+ row_pos[c] = center_idx + offset;
+ col_pos[c] = center_idx + offset;
+ values[c] = lambda * cor_w;
+ c++;
+ // other entries from direct neighbors
+ for (k = 0; k < 4; k++) {
+ checkh = h + check_locs_y[k];
+ checkw = w + check_locs_x[k];
+ if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) {
+ continue;
+ }
+ int this_idx = checkw * height + checkh;
+ int this_num_direct = 4;
+ if (checkw == 0 || checkw == width - 1) this_num_direct--;
+ if (checkh == 0 || checkh == height - 1) this_num_direct--;
+ cor_w = -center_num_direct - this_num_direct;
+ row_pos[c] = center_idx;
+ col_pos[c] = this_idx;
+ values[c] = lambda * cor_w;
+ c++;
+ row_pos[c] = center_idx + offset;
+ col_pos[c] = this_idx + offset;
+ values[c] = lambda * cor_w;
+ c++;
+ }
+ // entries from neighbors on the diagonal corners
+ for (k = 4; k < 8; k++) {
+ checkh = h + check_locs_y[k];
+ checkw = w + check_locs_x[k];
+ if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) {
+ continue;
+ }
+ int this_idx = checkw * height + checkh;
+ cor_w = 2;
+ row_pos[c] = center_idx;
+ col_pos[c] = this_idx;
+ values[c] = lambda * cor_w;
+ c++;
+ row_pos[c] = center_idx + offset;
+ col_pos[c] = this_idx + offset;
+ values[c] = lambda * cor_w;
+ c++;
+ }
+ // entries from neighbors with dist of 2
+ for (k = 8; k < 12; k++) {
+ checkh = h + check_locs_y[k];
+ checkw = w + check_locs_x[k];
+ if (checkh < 0 || checkh >= height || checkw < 0 || checkw >= width) {
+ continue;
+ }
+ int this_idx = checkw * height + checkh;
+ cor_w = 1;
+ row_pos[c] = center_idx;
+ col_pos[c] = this_idx;
+ values[c] = lambda * cor_w;
+ c++;
+ row_pos[c] = center_idx + offset;
+ col_pos[c] = this_idx + offset;
+ values[c] = lambda * cor_w;
+ c++;
+ }
+ }
+ }
+ ret = av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height,
+ 2 * width * height, &A);
+ if (ret < 0) goto free_hs_solver_buf;
+ // subtract init mv part from b
+ av1_mtx_vect_multi_left(&A, mv_init_vec, temp_b, 2 * width * height);
+ for (int i = 0; i < 2 * width * height; i++) {
+ b[i] = -temp_b[i];
+ }
+ av1_free_sparse_mtx_elems(&A);
+
+ // add cross terms to A and modify b with ExEt / EyEt
+ for (w = 0; w < width; w++) {
+ for (h = 0; h < height; h++) {
+ int curidx = w * height + h;
+ // modify b
+ b[curidx] += -ix[h * grad_stride + w] * it[h * grad_stride + w];
+ b[curidx + offset] += -iy[h * grad_stride + w] * it[h * grad_stride + w];
+ // add cross terms to A
+ row_pos[c] = curidx;
+ col_pos[c] = curidx + offset;
+ values[c] = ix[h * grad_stride + w] * iy[h * grad_stride + w];
+ c++;
+ row_pos[c] = curidx + offset;
+ col_pos[c] = curidx;
+ values[c] = ix[h * grad_stride + w] * iy[h * grad_stride + w];
+ c++;
+ }
+ }
+ // Add diagonal terms to A
+ for (int i = 0; i < c; i++) {
+ if (row_pos[i] == col_pos[i]) {
+ if (row_pos[i] < offset) {
+ w = row_pos[i] / height;
+ h = row_pos[i] % height;
+ values[i] += pow(ix[h * grad_stride + w], 2);
+ } else {
+ w = (row_pos[i] - offset) / height;
+ h = (row_pos[i] - offset) % height;
+ values[i] += pow(iy[h * grad_stride + w], 2);
+ }
+ }
+ }
+
+ ret = av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height,
+ 2 * width * height, &A);
+ if (ret < 0) goto free_hs_solver_buf;
+
+ // solve for the mvs
+ ret = av1_conjugate_gradient_sparse(&A, b, 2 * width * height, mv_vec);
+ if (ret < 0) goto free_hs_solver_buf;
+
+ // copy mvs
+ for (w = 0; w < width; w++) {
+ for (h = 0; h < height; h++) {
+ mvs[h * mv_stride + w].col = mv_vec[w * height + h];
+ mvs[h * mv_stride + w].row = mv_vec[w * height + h + offset];
+ }
+ }
+free_hs_solver_buf:
+ aom_free(row_pos);
+ aom_free(col_pos);
+ aom_free(values);
+ aom_free(mv_vec);
+ aom_free(mv_init_vec);
+ aom_free(b);
+ aom_free(temp_b);
+ av1_free_sparse_mtx_elems(&A);
+}
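+
+// For reference, solve_horn_schunck() above assembles and solves the normal
+// equations of a Horn-Schunck style objective: a data term
+//   sum_p (Ix * u + Iy * v + It)^2
+// plus a lambda-weighted quadratic smoothness penalty on the motion field.
+// The lambda-weighted entries assembled above come from the smoothness term,
+// while the Ix/Iy products form the data term.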
+
+// Calculate optical flow from from_frame to to_frame using the H-S method.
+static void horn_schunck(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame, const int level,
+ const int mv_stride, const int mv_height,
+ const int mv_width, const OPFL_PARAMS *opfl_params,
+ LOCALMV *mvs) {
+  // mvs are always on level 0; here we define two new mv arrays that are
+  // sized for this level.
+ const int fw = from_frame->y_crop_width;
+ const int fh = from_frame->y_crop_height;
+ const int factor = (int)pow(2, level);
+ int w, h, k, init_mv_stride;
+ LOCALMV *init_mvs = NULL, *refine_mvs = NULL;
+ double *ix = NULL, *iy = NULL, *it = NULL;
+ YV12_BUFFER_CONFIG temp_frame;
+ temp_frame.y_buffer = NULL;
+ if (level == 0) {
+ init_mvs = mvs;
+ init_mv_stride = mv_stride;
+ } else {
+ init_mvs = aom_calloc(fw * fh, sizeof(*mvs));
+ if (!init_mvs) goto free_hs_buf;
+ init_mv_stride = fw;
+ for (h = 0; h < fh; h++) {
+ for (w = 0; w < fw; w++) {
+ init_mvs[h * init_mv_stride + w].row =
+ mvs[h * factor * mv_stride + w * factor].row / (double)factor;
+ init_mvs[h * init_mv_stride + w].col =
+ mvs[h * factor * mv_stride + w * factor].col / (double)factor;
+ }
+ }
+ }
+ refine_mvs = aom_calloc(fw * fh, sizeof(*mvs));
+ if (!refine_mvs) goto free_hs_buf;
+ // temp frame for warping
+ temp_frame.y_buffer =
+ (uint8_t *)aom_calloc(fh * fw, sizeof(*temp_frame.y_buffer));
+ if (!temp_frame.y_buffer) goto free_hs_buf;
+ temp_frame.y_crop_height = fh;
+ temp_frame.y_crop_width = fw;
+ temp_frame.y_stride = fw;
+ // gradient buffers
+ ix = aom_calloc(fw * fh, sizeof(*ix));
+ iy = aom_calloc(fw * fh, sizeof(*iy));
+ it = aom_calloc(fw * fh, sizeof(*it));
+ if (!ix || !iy || !it) goto free_hs_buf;
+ // For each warping step
+ for (k = 0; k < opfl_params->warping_steps; k++) {
+ // warp from_frame with init_mv
+ if (level == 0) {
+ warp_back_frame_intp(&temp_frame, to_frame, init_mvs, init_mv_stride);
+ } else {
+ warp_back_frame(&temp_frame, to_frame, init_mvs, init_mv_stride);
+ }
+ // calculate frame gradients
+ get_frame_gradients(from_frame, &temp_frame, ix, iy, it, fw);
+ // form linear equations and solve mvs
+ solve_horn_schunck(ix, iy, it, fw, fw, fh, init_mvs, init_mv_stride,
+ refine_mvs, fw);
+ // update init_mvs
+ for (h = 0; h < fh; h++) {
+ for (w = 0; w < fw; w++) {
+ init_mvs[h * init_mv_stride + w].col += refine_mvs[h * fw + w].col;
+ init_mvs[h * init_mv_stride + w].row += refine_mvs[h * fw + w].row;
+ }
+ }
+ }
+ // copy back the mvs if needed
+ if (level != 0) {
+ for (h = 0; h < mv_height; h++) {
+ for (w = 0; w < mv_width; w++) {
+ mvs[h * mv_stride + w].row =
+ init_mvs[h / factor * init_mv_stride + w / factor].row *
+ (double)factor;
+ mvs[h * mv_stride + w].col =
+ init_mvs[h / factor * init_mv_stride + w / factor].col *
+ (double)factor;
+ }
+ }
+ }
+free_hs_buf:
+ if (level != 0) aom_free(init_mvs);
+ aom_free(refine_mvs);
+ aom_free(temp_frame.y_buffer);
+ aom_free(ix);
+ aom_free(iy);
+ aom_free(it);
+}
+
+// Apply optical flow iteratively at each pyramid level
+static void pyramid_optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame,
+ const int bit_depth,
+ const OPFL_PARAMS *opfl_params,
+ const OPTFLOW_METHOD method, LOCALMV *mvs) {
+ assert(opfl_params->pyramid_levels > 0 &&
+ opfl_params->pyramid_levels <= MAX_PYRAMID_LEVELS);
+ int levels = opfl_params->pyramid_levels;
+ const int frame_height = from_frame->y_crop_height;
+ const int frame_width = from_frame->y_crop_width;
+  if ((frame_height / pow(2.0, levels - 1) < 50 ||
+       frame_width / pow(2.0, levels - 1) < 50) &&
+ levels > 1)
+ levels = levels - 1;
+ uint8_t *images1[MAX_PYRAMID_LEVELS] = { NULL };
+ uint8_t *images2[MAX_PYRAMID_LEVELS] = { NULL };
+ int *ref_corners = NULL;
+
+ images1[0] = from_frame->y_buffer;
+ images2[0] = to_frame->y_buffer;
+ YV12_BUFFER_CONFIG *buffers1 = aom_malloc(levels * sizeof(*buffers1));
+ YV12_BUFFER_CONFIG *buffers2 = aom_malloc(levels * sizeof(*buffers2));
+ if (!buffers1 || !buffers2) goto free_pyramid_buf;
+ buffers1[0] = *from_frame;
+ buffers2[0] = *to_frame;
+ int fw = frame_width;
+ int fh = frame_height;
+ for (int i = 1; i < levels; i++) {
+ // TODO(bohanli): may need to extend buffers for better interpolation SIMD
+ images1[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(*images1[i]));
+ images2[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(*images2[i]));
+ if (!images1[i] || !images2[i]) goto free_pyramid_buf;
+ int stride;
+ if (i == 1)
+ stride = from_frame->y_stride;
+ else
+ stride = fw;
+ reduce(images1[i - 1], fh, fw, stride, images1[i]);
+ reduce(images2[i - 1], fh, fw, stride, images2[i]);
+ fh /= 2;
+ fw /= 2;
+ YV12_BUFFER_CONFIG a = { .y_buffer = images1[i],
+ .y_crop_width = fw,
+ .y_crop_height = fh,
+ .y_stride = fw };
+ YV12_BUFFER_CONFIG b = { .y_buffer = images2[i],
+ .y_crop_width = fw,
+ .y_crop_height = fh,
+ .y_stride = fw };
+ buffers1[i] = a;
+ buffers2[i] = b;
+ }
+  // Compute corners for the source frame
+ int num_ref_corners = 0;
+ if (is_sparse(opfl_params)) {
+ int maxcorners = from_frame->y_crop_width * from_frame->y_crop_height;
+ ref_corners = aom_malloc(maxcorners * 2 * sizeof(*ref_corners));
+ if (!ref_corners) goto free_pyramid_buf;
+ num_ref_corners = detect_corners(from_frame, to_frame, maxcorners,
+ ref_corners, bit_depth);
+ }
+ const int stop_level = 0;
+ for (int i = levels - 1; i >= stop_level; i--) {
+ if (method == LUCAS_KANADE) {
+ assert(is_sparse(opfl_params));
+ lucas_kanade(&buffers1[i], &buffers2[i], i, opfl_params->lk_params,
+ num_ref_corners, ref_corners, buffers1[0].y_crop_width,
+ bit_depth, mvs);
+ } else if (method == HORN_SCHUNCK) {
+ assert(!is_sparse(opfl_params));
+ horn_schunck(&buffers1[i], &buffers2[i], i, buffers1[0].y_crop_width,
+ buffers1[0].y_crop_height, buffers1[0].y_crop_width,
+ opfl_params, mvs);
+ }
+ }
+free_pyramid_buf:
+ for (int i = 1; i < levels; i++) {
+ aom_free(images1[i]);
+ aom_free(images2[i]);
+ }
+ aom_free(ref_corners);
+ aom_free(buffers1);
+ aom_free(buffers2);
+}
+
+// Computes optical flow by applying the algorithm at multiple pyramid levels
+// (lower-resolution, smoothed images). This accounts for larger motions.
+// Inputs:
+//   from_frame: Frame buffer.
+//   to_frame: Frame buffer. MVs point from_frame -> to_frame.
+//   from_frame_idx: Index of from_frame.
+//   to_frame_idx: Index of to_frame. All-zero MVs are returned when the
+//                 indices are equal.
+//   bit_depth: Bit depth of the frames.
+//   opfl_params: Contains algorithm-specific parameters.
+//   mv_filter: MV_FILTER_NONE, MV_FILTER_SMOOTH, or MV_FILTER_MEDIAN.
+//   method: LUCAS_KANADE or HORN_SCHUNCK.
+//   mvs: Pointer to MVs. Contains the initialization and is modified based
+//        on optical flow. Must have dimensions
+//        from_frame->y_crop_width * from_frame->y_crop_height.
+void av1_optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame,
+ const int from_frame_idx, const int to_frame_idx,
+ const int bit_depth, const OPFL_PARAMS *opfl_params,
+ const MV_FILTER_TYPE mv_filter,
+ const OPTFLOW_METHOD method, MV *mvs) {
+ const int frame_height = from_frame->y_crop_height;
+ const int frame_width = from_frame->y_crop_width;
+ // TODO(any): deal with the case where frames are not of the same dimensions
+ assert(frame_height == to_frame->y_crop_height &&
+ frame_width == to_frame->y_crop_width);
+ if (from_frame_idx == to_frame_idx) {
+ // immediately return all zero mvs when frame indices are equal
+ for (int yy = 0; yy < frame_height; yy++) {
+ for (int xx = 0; xx < frame_width; xx++) {
+ MV mv = { .row = 0, .col = 0 };
+ mvs[yy * frame_width + xx] = mv;
+ }
+ }
+ return;
+ }
+
+ // Initialize double mvs based on input parameter mvs array
+ LOCALMV *localmvs =
+ aom_malloc(frame_height * frame_width * sizeof(*localmvs));
+ if (!localmvs) return;
+
+ filter_mvs(MV_FILTER_SMOOTH, frame_height, frame_width, localmvs, mvs);
+
+ for (int i = 0; i < frame_width * frame_height; i++) {
+ MV mv = mvs[i];
+ LOCALMV localmv = { .row = ((double)mv.row) / 8,
+ .col = ((double)mv.col) / 8 };
+ localmvs[i] = localmv;
+ }
+ // Apply optical flow algorithm
+ pyramid_optical_flow(from_frame, to_frame, bit_depth, opfl_params, method,
+ localmvs);
+
+ // Update original mvs array
+ for (int j = 0; j < frame_height; j++) {
+ for (int i = 0; i < frame_width; i++) {
+ int idx = j * frame_width + i;
+ if (j + localmvs[idx].row < 0 || j + localmvs[idx].row >= frame_height ||
+ i + localmvs[idx].col < 0 || i + localmvs[idx].col >= frame_width) {
+ continue;
+ }
+ MV mv = { .row = (int16_t)round(8 * localmvs[idx].row),
+ .col = (int16_t)round(8 * localmvs[idx].col) };
+ mvs[idx] = mv;
+ }
+ }
+
+ filter_mvs(mv_filter, frame_height, frame_width, localmvs, mvs);
+
+ aom_free(localmvs);
+}
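+
+// Illustrative usage sketch (not part of the encoder). It assumes
+// av1_init_opfl_params() fills in the defaults declared in optical_flow.h,
+// and that the caller owns a zero-initialized mvs array of
+// y_crop_width * y_crop_height entries:
+#if 0
+static void optical_flow_example(const YV12_BUFFER_CONFIG *a,
+                                 const YV12_BUFFER_CONFIG *b, int a_idx,
+                                 int b_idx, int bit_depth, MV *mvs) {
+  OPFL_PARAMS params;
+  av1_init_opfl_params(&params);
+  av1_optical_flow(a, b, a_idx, b_idx, bit_depth, &params, MV_FILTER_SMOOTH,
+                   HORN_SCHUNCK, mvs);
+}
+#endif  // 0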
+#endif
diff --git a/third_party/aom/av1/encoder/optical_flow.h b/third_party/aom/av1/encoder/optical_flow.h
new file mode 100644
index 0000000000..2fbe474d77
--- /dev/null
+++ b/third_party/aom/av1/encoder/optical_flow.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_OPTICAL_FLOW_H_
+#define AOM_AV1_ENCODER_OPTICAL_FLOW_H_
+
+#include "aom_scale/yv12config.h"
+#include "av1/common/mv.h"
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_OPTICAL_FLOW_API
+
+typedef enum { LUCAS_KANADE, HORN_SCHUNCK } OPTFLOW_METHOD;
+
+typedef enum {
+ MV_FILTER_NONE,
+ MV_FILTER_SMOOTH,
+ MV_FILTER_MEDIAN
+} MV_FILTER_TYPE;
+
+typedef struct LOCALMV {
+ double row;
+ double col;
+} LOCALMV;
+
+#define MAX_PYRAMID_LEVELS 5
+// default options for optical flow
+#define OPFL_WINDOW_SIZE 15
+#define OPFL_PYRAMID_LEVELS 3 // total levels
+#define OPFL_WARPING_STEPS 3
+
+// parameters specific to Lucas-Kanade
+typedef struct lk_params {
+ int window_size;
+} LK_PARAMS;
+
+// generic structure to contain parameters for all
+// optical flow algorithms
+typedef struct opfl_params {
+ int pyramid_levels;
+ int warping_steps;
+ LK_PARAMS *lk_params;
+ int flags;
+} OPFL_PARAMS;
+
+#define OPFL_FLAG_SPARSE 1
+
+void av1_init_opfl_params(OPFL_PARAMS *opfl_params);
+
+void av1_init_lk_params(LK_PARAMS *lk_params);
+
+void av1_optical_flow(const YV12_BUFFER_CONFIG *from_frame,
+ const YV12_BUFFER_CONFIG *to_frame,
+ const int from_frame_idx, const int to_frame_idx,
+ const int bit_depth, const OPFL_PARAMS *opfl_params,
+ const MV_FILTER_TYPE mv_filter,
+ const OPTFLOW_METHOD method, MV *mvs);
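+
+// A sparse Lucas-Kanade configuration might be set up as sketched below
+// (illustrative; the init functions above are assumed to fill in the OPFL_*
+// defaults before av1_optical_flow() is called):
+//   OPFL_PARAMS params;
+//   LK_PARAMS lk;
+//   av1_init_opfl_params(&params);
+//   av1_init_lk_params(&lk);
+//   params.lk_params = &lk;
+//   params.flags |= OPFL_FLAG_SPARSE;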
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_OPTICAL_FLOW_H_
diff --git a/third_party/aom/av1/encoder/palette.c b/third_party/aom/av1/encoder/palette.c
new file mode 100644
index 0000000000..7f79e9596e
--- /dev/null
+++ b/third_party/aom/av1/encoder/palette.c
@@ -0,0 +1,975 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "av1/common/pred_common.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/tx_search.h"
+
+#define AV1_K_MEANS_DIM 1
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+#define AV1_K_MEANS_DIM 2
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+
+static int int16_comparer(const void *a, const void *b) {
+ return (*(int16_t *)a - *(int16_t *)b);
+}
+
+int av1_remove_duplicates(int16_t *centroids, int num_centroids) {
+ int num_unique; // number of unique centroids
+ int i;
+ qsort(centroids, num_centroids, sizeof(*centroids), int16_comparer);
+ // Remove duplicates.
+ num_unique = 1;
+ for (i = 1; i < num_centroids; ++i) {
+ if (centroids[i] != centroids[i - 1]) { // found a new unique centroid
+ centroids[num_unique++] = centroids[i];
+ }
+ }
+ return num_unique;
+}
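+
+// Minimal sketch of av1_remove_duplicates() on illustrative values:
+#if 0
+static void remove_duplicates_example(void) {
+  int16_t centroids[5] = { 7, 3, 7, 12, 3 };
+  // Sorts to { 3, 3, 7, 7, 12 }, compacts to { 3, 7, 12 } and returns 3.
+  const int n_unique = av1_remove_duplicates(centroids, 5);
+  (void)n_unique;
+}
+#endif  // 0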
+
+static int delta_encode_cost(const int *colors, int num, int bit_depth,
+ int min_val) {
+ if (num <= 0) return 0;
+ int bits_cost = bit_depth;
+ if (num == 1) return bits_cost;
+ bits_cost += 2;
+ int max_delta = 0;
+ int deltas[PALETTE_MAX_SIZE];
+ const int min_bits = bit_depth - 3;
+ for (int i = 1; i < num; ++i) {
+ const int delta = colors[i] - colors[i - 1];
+ deltas[i - 1] = delta;
+ assert(delta >= min_val);
+ if (delta > max_delta) max_delta = delta;
+ }
+ int bits_per_delta = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits);
+ assert(bits_per_delta <= bit_depth);
+ int range = (1 << bit_depth) - colors[0] - min_val;
+ for (int i = 0; i < num - 1; ++i) {
+ bits_cost += bits_per_delta;
+ range -= deltas[i];
+ bits_per_delta = AOMMIN(bits_per_delta, av1_ceil_log2(range));
+ }
+ return bits_cost;
+}
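+
+// Worked example for delta_encode_cost(): colors = { 10, 18, 21 },
+// bit_depth = 8, min_val = 1. The header costs 8 + 2 bits. The deltas are
+// { 8, 3 }, so max_delta = 8 and bits_per_delta =
+// max(ceil_log2(8 + 1 - 1), 8 - 3) = 5. Both deltas then cost 5 bits each
+// (the remaining range never shrinks enough to lower bits_per_delta),
+// giving 8 + 2 + 5 + 5 = 20 bits in total.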
+
+int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
+ const uint16_t *colors, int n_colors,
+ uint8_t *cache_color_found, int *out_cache_colors) {
+ if (n_cache <= 0) {
+ for (int i = 0; i < n_colors; ++i) out_cache_colors[i] = colors[i];
+ return n_colors;
+ }
+ memset(cache_color_found, 0, n_cache * sizeof(*cache_color_found));
+ int n_in_cache = 0;
+ int in_cache_flags[PALETTE_MAX_SIZE];
+ memset(in_cache_flags, 0, sizeof(in_cache_flags));
+ for (int i = 0; i < n_cache && n_in_cache < n_colors; ++i) {
+ for (int j = 0; j < n_colors; ++j) {
+ if (colors[j] == color_cache[i]) {
+ in_cache_flags[j] = 1;
+ cache_color_found[i] = 1;
+ ++n_in_cache;
+ break;
+ }
+ }
+ }
+ int j = 0;
+ for (int i = 0; i < n_colors; ++i)
+ if (!in_cache_flags[i]) out_cache_colors[j++] = colors[i];
+ assert(j == n_colors - n_in_cache);
+ return j;
+}
+
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *zero_count,
+ int *min_bits) {
+ const int n = pmi->palette_size[1];
+ const int max_val = 1 << bit_depth;
+ int max_d = 0;
+ *min_bits = bit_depth - 4;
+ *zero_count = 0;
+ for (int i = 1; i < n; ++i) {
+ const int delta = pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] -
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1];
+ const int v = abs(delta);
+ const int d = AOMMIN(v, max_val - v);
+ if (d > max_d) max_d = d;
+ if (d == 0) ++(*zero_count);
+ }
+ return AOMMAX(av1_ceil_log2(max_d + 1), *min_bits);
+}
+
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
+ const uint16_t *color_cache, int n_cache,
+ int bit_depth) {
+ const int n = pmi->palette_size[0];
+ int out_cache_colors[PALETTE_MAX_SIZE];
+ uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
+ const int n_out_cache =
+ av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n,
+ cache_color_found, out_cache_colors);
+ const int total_bits =
+ n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 1);
+ return av1_cost_literal(total_bits);
+}
+
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+ const uint16_t *color_cache, int n_cache,
+ int bit_depth) {
+ const int n = pmi->palette_size[1];
+ int total_bits = 0;
+ // U channel palette color cost.
+ int out_cache_colors[PALETTE_MAX_SIZE];
+ uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
+ const int n_out_cache = av1_index_color_cache(
+ color_cache, n_cache, pmi->palette_colors + PALETTE_MAX_SIZE, n,
+ cache_color_found, out_cache_colors);
+ total_bits +=
+ n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 0);
+
+ // V channel palette color cost.
+ int zero_count = 0, min_bits_v = 0;
+ const int bits_v =
+ av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v);
+ const int bits_using_delta =
+ 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count;
+ const int bits_using_raw = bit_depth * n;
+ total_bits += 1 + AOMMIN(bits_using_delta, bits_using_raw);
+ return av1_cost_literal(total_bits);
+}
+
+// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x
+// new_height'. Extra rows and columns are filled in by copying last valid
+// row/column.
+static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map,
+ int orig_width, int orig_height,
+ int new_width, int new_height) {
+ int j;
+ assert(new_width >= orig_width);
+ assert(new_height >= orig_height);
+ if (new_width == orig_width && new_height == orig_height) return;
+
+ for (j = orig_height - 1; j >= 0; --j) {
+ memmove(color_map + j * new_width, color_map + j * orig_width, orig_width);
+ // Copy last column to extra columns.
+ memset(color_map + j * new_width + orig_width,
+ color_map[j * new_width + orig_width - 1], new_width - orig_width);
+ }
+ // Copy last row to extra rows.
+ for (j = orig_height; j < new_height; ++j) {
+ memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width,
+ new_width);
+ }
+}
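+
+// For example, extending the 2x2 map { 0, 1; 1, 0 } to 3x3 replicates the
+// last column and then the last row: { 0, 1, 1; 1, 0, 0; 1, 0, 0 }.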
+
+// Bias toward using colors in the cache.
+// TODO(huisu): Try other schemes to improve compression.
+static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache,
+ int n_cache, int n_colors,
+ int stride, int16_t *centroids,
+ int bit_depth) {
+ if (n_cache <= 0) return;
+ for (int i = 0; i < n_colors * stride; i += stride) {
+ int min_diff = abs((int)centroids[i] - (int)color_cache[0]);
+ int idx = 0;
+ for (int j = 1; j < n_cache; ++j) {
+ const int this_diff = abs((int)centroids[i] - (int)color_cache[j]);
+ if (this_diff < min_diff) {
+ min_diff = this_diff;
+ idx = j;
+ }
+ }
+ const int min_threshold = 4 << (bit_depth - 8);
+ if (min_diff <= min_threshold) centroids[i] = color_cache[idx];
+ }
+}
+
+/*!\brief Calculate the luma palette cost from a given color palette
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * Given the base colors as specified in centroids[], calculate the RD cost
+ * of palette mode.
+ */
+static AOM_INLINE void palette_rd_y(
+ const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+ BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int16_t *centroids,
+ int n, uint16_t *color_cache, int n_cache, bool do_header_rd_based_gating,
+ MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+ int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable,
+ int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip,
+ uint8_t *tx_type_map, int *beat_best_palette_rd,
+ bool *do_header_rd_based_breakout, int discount_color_cost) {
+ if (do_header_rd_based_breakout != NULL) *do_header_rd_based_breakout = false;
+ optimize_palette_colors(color_cache, n_cache, n, 1, centroids,
+ cpi->common.seq_params->bit_depth);
+ const int num_unique_colors = av1_remove_duplicates(centroids, n);
+ if (num_unique_colors < PALETTE_MIN_SIZE) {
+    // Too few unique colors to create a palette, and DC_PRED will work well
+    // for that case anyway, so skip.
+ return;
+ }
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ if (cpi->common.seq_params->use_highbitdepth) {
+ for (int i = 0; i < num_unique_colors; ++i) {
+ pmi->palette_colors[i] = clip_pixel_highbd(
+ (int)centroids[i], cpi->common.seq_params->bit_depth);
+ }
+ } else {
+ for (int i = 0; i < num_unique_colors; ++i) {
+ pmi->palette_colors[i] = clip_pixel(centroids[i]);
+ }
+ }
+ pmi->palette_size[0] = num_unique_colors;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ int block_width, block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+ &cols);
+ av1_calc_indices(data, centroids, color_map, rows * cols, num_unique_colors,
+ 1);
+ extend_palette_color_map(color_map, cols, rows, block_width, block_height);
+
+ RD_STATS tokenonly_rd_stats;
+ int this_rate;
+
+ if (do_header_rd_based_gating) {
+ assert(do_header_rd_based_breakout != NULL);
+ const int palette_mode_rate = intra_mode_info_cost_y(
+ cpi, x, mbmi, bsize, dc_mode_cost, discount_color_cost);
+ const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0);
+ // Less aggressive pruning when prune_luma_palette_size_search_level == 1.
+ const int header_rd_shift =
+ (cpi->sf.intra_sf.prune_luma_palette_size_search_level == 1) ? 1 : 0;
+ // Terminate further palette_size search, if the header cost corresponding
+ // to lower palette_size is more than *best_rd << header_rd_shift. This
+ // logic is implemented with a right shift in the LHS to prevent a possible
+ // overflow with the left shift in RHS.
+ if ((header_rd >> header_rd_shift) > *best_rd) {
+ *do_header_rd_based_breakout = true;
+ return;
+ }
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+ *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) return;
+ this_rate = tokenonly_rd_stats.rate + palette_mode_rate;
+ } else {
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+ *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) return;
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost,
+ discount_color_cost);
+ }
+
+ int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+ if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
+ tokenonly_rd_stats.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
+ }
+ // Collect mode stats for multiwinner mode processing
+ const int txfm_search_done = 1;
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, NULL, NULL, NULL, THR_DC, color_map, bsize,
+ this_rd, cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+    // Set the beat_best_rd flag because the current mode rd is better than
+    // best_rd. This flag needs to be updated only for palette evaluation in
+    // key frames.
+ if (beat_best_rd) *beat_best_rd = 1;
+ memcpy(best_palette_color_map, color_map,
+ block_width * block_height * sizeof(color_map[0]));
+ *best_mbmi = *mbmi;
+ memcpy(blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(x->txfm_search_info.blk_skip[0]) * ctx->num_4x4_blk);
+ av1_copy_array(tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ if (rate) *rate = this_rate;
+ if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
+ if (distortion) *distortion = tokenonly_rd_stats.dist;
+ if (skippable) *skippable = tokenonly_rd_stats.skip_txfm;
+ if (beat_best_palette_rd) *beat_best_palette_rd = 1;
+ }
+}
+
+static AOM_INLINE int is_iter_over(int curr_idx, int end_idx, int step_size) {
+ assert(step_size != 0);
+ return (step_size > 0) ? curr_idx >= end_idx : curr_idx <= end_idx;
+}
+
+// Performs count-based palette search with number of colors in interval
+// [start_n, end_n) with step size step_size. If step_size < 0, then end_n can
+// be less than start_n. Saves the last number searched in last_n_searched and
+// returns the best number of colors found.
+static AOM_INLINE int perform_top_color_palette_search(
+ const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+ BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data,
+ int16_t *top_colors, int start_n, int end_n, int step_size,
+ bool do_header_rd_based_gating, int *last_n_searched, uint16_t *color_cache,
+ int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+ int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+ uint8_t *best_blk_skip, uint8_t *tx_type_map, int discount_color_cost) {
+ int16_t centroids[PALETTE_MAX_SIZE];
+ int n = start_n;
+ int top_color_winner = end_n;
+ /* clang-format off */
+ assert(IMPLIES(step_size < 0, start_n > end_n));
+ /* clang-format on */
+ assert(IMPLIES(step_size > 0, start_n < end_n));
+ while (!is_iter_over(n, end_n, step_size)) {
+ int beat_best_palette_rd = 0;
+ bool do_header_rd_based_breakout = false;
+ memcpy(centroids, top_colors, n * sizeof(top_colors[0]));
+ palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+ color_cache, n_cache, do_header_rd_based_gating, best_mbmi,
+ best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+ tx_type_map, &beat_best_palette_rd,
+ &do_header_rd_based_breakout, discount_color_cost);
+ *last_n_searched = n;
+ if (do_header_rd_based_breakout) {
+ // Terminate palette_size search by setting last_n_searched to end_n.
+ *last_n_searched = end_n;
+ break;
+ }
+ if (beat_best_palette_rd) {
+ top_color_winner = n;
+ } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) {
+ // At search level 2, we return immediately if we don't see an improvement
+ return top_color_winner;
+ }
+ n += step_size;
+ }
+ return top_color_winner;
+}
+
+// Performs k-means based palette search with number of colors in interval
+// [start_n, end_n) with step size step_size. If step_size < 0, then end_n can
+// be less than start_n. Saves the last number searched in last_n_searched and
+// returns the best number of colors found.
+static AOM_INLINE int perform_k_means_palette_search(
+ const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+ BLOCK_SIZE bsize, int dc_mode_cost, const int16_t *data, int lower_bound,
+ int upper_bound, int start_n, int end_n, int step_size,
+ bool do_header_rd_based_gating, int *last_n_searched, uint16_t *color_cache,
+ int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+ int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+ uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
+ int data_points, int discount_color_cost) {
+ int16_t centroids[PALETTE_MAX_SIZE];
+ const int max_itr = 50;
+ int n = start_n;
+ int top_color_winner = end_n;
+ /* clang-format off */
+ assert(IMPLIES(step_size < 0, start_n > end_n));
+ /* clang-format on */
+ assert(IMPLIES(step_size > 0, start_n < end_n));
+ while (!is_iter_over(n, end_n, step_size)) {
+ int beat_best_palette_rd = 0;
+ bool do_header_rd_based_breakout = false;
+ for (int i = 0; i < n; ++i) {
+ centroids[i] =
+ lower_bound + (2 * i + 1) * (upper_bound - lower_bound) / n / 2;
+ }
+ av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
+ palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+ color_cache, n_cache, do_header_rd_based_gating, best_mbmi,
+ best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+ tx_type_map, &beat_best_palette_rd,
+ &do_header_rd_based_breakout, discount_color_cost);
+ *last_n_searched = n;
+ if (do_header_rd_based_breakout) {
+ // Terminate palette_size search by setting last_n_searched to end_n.
+ *last_n_searched = end_n;
+ break;
+ }
+ if (beat_best_palette_rd) {
+ top_color_winner = n;
+ } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) {
+ // At search level 2, we return immediately if we don't see an improvement
+ return top_color_winner;
+ }
+ n += step_size;
+ }
+ return top_color_winner;
+}
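+
+// The centroid initialization above places the n seeds at the midpoints of n
+// equal sub-intervals of [lower_bound, upper_bound]; e.g. with bounds 0 and
+// 255 and n = 4, the seeds (in integer arithmetic) are 31, 95, 159 and 223.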
+
+// Sets the parameters to search the current number of colors +- 1
+static AOM_INLINE void set_stage2_params(int *min_n, int *max_n, int *step_size,
+ int winner, int end_n) {
+ // Set min to winner - 1 unless we are already at the border, then we set it
+ // to winner + 1
+ *min_n = (winner == PALETTE_MIN_SIZE) ? (PALETTE_MIN_SIZE + 1)
+ : AOMMAX(winner - 1, PALETTE_MIN_SIZE);
+ // Set max to winner + 1 unless we are already at the border, then we set it
+ // to winner - 1
+ *max_n =
+ (winner == end_n) ? (winner - 1) : AOMMIN(winner + 1, PALETTE_MAX_SIZE);
+
+ // Set the step size to max_n - min_n so we only search those two values.
+ // If max_n == min_n, then set step_size to 1 to avoid infinite loop later.
+ *step_size = AOMMAX(1, *max_n - *min_n);
+}
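+
+// For example, with winner = 5 and end_n = 8, set_stage2_params() yields
+// min_n = 4, max_n = 6 and step_size = 2, so the stage-2 search evaluates
+// exactly the two neighbors 4 and 6 (callers pass max_n + 1 as the exclusive
+// end of the interval).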
+
+static AOM_INLINE void fill_data_and_get_bounds(const uint8_t *src,
+ const int src_stride,
+ const int rows, const int cols,
+ const int is_high_bitdepth,
+ int16_t *data, int *lower_bound,
+ int *upper_bound) {
+ if (is_high_bitdepth) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
+ *lower_bound = *upper_bound = src_ptr[0];
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ const int val = src_ptr[c];
+ data[c] = (int16_t)val;
+ *lower_bound = AOMMIN(*lower_bound, val);
+ *upper_bound = AOMMAX(*upper_bound, val);
+ }
+ src_ptr += src_stride;
+ data += cols;
+ }
+ return;
+ }
+
+ // low bit depth
+ *lower_bound = *upper_bound = src[0];
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ const int val = src[c];
+ data[c] = (int16_t)val;
+ *lower_bound = AOMMIN(*lower_bound, val);
+ *upper_bound = AOMMAX(*upper_bound, val);
+ }
+ src += src_stride;
+ data += cols;
+ }
+}
+
+/*! \brief Colors are sorted by their count: the higher the better.
+ */
+struct ColorCount {
+ //! Color index in the histogram.
+ int index;
+ //! Histogram count.
+ int count;
+};
+
+int color_count_comp(const void *c1, const void *c2) {
+ const struct ColorCount *color_count1 = (const struct ColorCount *)c1;
+ const struct ColorCount *color_count2 = (const struct ColorCount *)c2;
+ if (color_count1->count > color_count2->count) return -1;
+ if (color_count1->count < color_count2->count) return 1;
+ if (color_count1->index < color_count2->index) return -1;
+ return 1;
+}
+
+static void find_top_colors(const int *const count_buf, int bit_depth,
+ int n_colors, int16_t *top_colors) {
+ // Top color array, serving as a priority queue if more than n_colors are
+ // found.
+ struct ColorCount top_color_counts[PALETTE_MAX_SIZE] = { { 0 } };
+ int n_color_count = 0;
+ for (int i = 0; i < (1 << bit_depth); ++i) {
+ if (count_buf[i] > 0) {
+ if (n_color_count < n_colors) {
+ // Keep adding to the top colors.
+ top_color_counts[n_color_count].index = i;
+ top_color_counts[n_color_count].count = count_buf[i];
+ ++n_color_count;
+ if (n_color_count == n_colors) {
+ qsort(top_color_counts, n_colors, sizeof(top_color_counts[0]),
+ color_count_comp);
+ }
+ } else {
+ // Check the worst in the sorted top.
+ if (count_buf[i] > top_color_counts[n_colors - 1].count) {
+ int j = n_colors - 1;
+ // Move up to the best one.
+ while (j >= 1 && count_buf[i] > top_color_counts[j - 1].count) --j;
+ memmove(top_color_counts + j + 1, top_color_counts + j,
+ (n_colors - j - 1) * sizeof(top_color_counts[0]));
+ top_color_counts[j].index = i;
+ top_color_counts[j].count = count_buf[i];
+ }
+ }
+ }
+ }
+ assert(n_color_count == n_colors);
+
+ for (int i = 0; i < n_colors; ++i) {
+ top_colors[i] = top_color_counts[i].index;
+ }
+}
+
+void av1_rd_pick_palette_intra_sby(
+ const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int dc_mode_cost,
+ MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+ int *rate, int *rate_tokenonly, int64_t *distortion, uint8_t *skippable,
+ int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
+ uint8_t *tx_type_map) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+ bsize));
+ assert(PALETTE_MAX_SIZE == 8);
+ assert(PALETTE_MIN_SIZE == 2);
+
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *const src = x->plane[0].src.buf;
+ int block_width, block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+ &cols);
+ const SequenceHeader *const seq_params = cpi->common.seq_params;
+ const int is_hbd = seq_params->use_highbitdepth;
+ const int bit_depth = seq_params->bit_depth;
+ const int discount_color_cost = cpi->sf.rt_sf.use_nonrd_pick_mode;
+ int unused;
+
+ int count_buf[1 << 12]; // Maximum (1 << 12) color levels.
+ int colors, colors_threshold = 0;
+ if (is_hbd) {
+ int count_buf_8bit[1 << 8]; // Maximum (1 << 8) bins for hbd path.
+ av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth, count_buf,
+ count_buf_8bit, &colors_threshold, &colors);
+ } else {
+ av1_count_colors(src, src_stride, rows, cols, count_buf, &colors);
+ colors_threshold = colors;
+ }
+
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ int color_thresh_palette = 64;
+ // Allow for larger color_threshold for palette search, based on color,
+ // scene_change, and block source variance.
+ // Since palette is Y based, only allow larger threshold if block
+ // color_dist is below threshold.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+ cpi->sf.rt_sf.increase_color_thresh_palette && cpi->rc.high_source_sad &&
+ x->source_variance > 50) {
+ int64_t norm_color_dist = 0;
+ if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
+ norm_color_dist = x->min_dist_inter_uv >>
+ (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+ if (x->color_sensitivity[0] && x->color_sensitivity[1])
+ norm_color_dist = norm_color_dist >> 1;
+ }
+ if (norm_color_dist < 8000) color_thresh_palette += 20;
+ }
+ if (colors_threshold > 1 && colors_threshold <= color_thresh_palette) {
+ int16_t *const data = x->palette_buffer->kmeans_data_buf;
+ int16_t centroids[PALETTE_MAX_SIZE];
+ int lower_bound, upper_bound;
+ fill_data_and_get_bounds(src, src_stride, rows, cols, is_hbd, data,
+ &lower_bound, &upper_bound);
+
+ mbmi->mode = DC_PRED;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+
+ // Find the dominant colors, stored in top_colors[].
+ int16_t top_colors[PALETTE_MAX_SIZE] = { 0 };
+ find_top_colors(count_buf, bit_depth, AOMMIN(colors, PALETTE_MAX_SIZE),
+ top_colors);
+
+ // The following are the approaches used for header rdcost based gating
+ // for early termination for different values of prune_palette_search_level.
+ // 0: Pruning based on header rdcost for ascending order palette_size
+ // search.
+ // 1: When colors > PALETTE_MIN_SIZE, enabled only for coarse palette_size
+ // search and for finer search do_header_rd_based_gating parameter is
+ // explicitly passed as 'false'.
+ // 2: Enabled only for ascending order palette_size search and for
+ // descending order search do_header_rd_based_gating parameter is explicitly
+ // passed as 'false'.
+ const bool do_header_rd_based_gating =
+ cpi->sf.intra_sf.prune_luma_palette_size_search_level != 0;
+
+ // TODO(huisu@google.com): Try to avoid duplicate computation in cases
+ // where the dominant colors and the k-means results are similar.
+ if ((cpi->sf.intra_sf.prune_palette_search_level == 1) &&
+ (colors > PALETTE_MIN_SIZE)) {
+ // Start index and step size below are chosen to evaluate unique
+ // candidates in neighbor search, in case a winner candidate is found in
+ // coarse search. Example,
+ // 1) 8 colors (end_n = 8): 2,3,4,5,6,7,8. start_n is chosen as 2 and step
+ // size is chosen as 3. Therefore, coarse search will evaluate 2, 5 and 8.
+ // If winner is found at 5, then 4 and 6 are evaluated. Similarly, for 2
+ // (3) and 8 (7).
+ // 2) 7 colors (end_n = 7): 2,3,4,5,6,7. If start_n is chosen as 2 (same
+ // as for 8 colors) then step size should also be 2, to cover all
+ // candidates. Coarse search will evaluate 2, 4 and 6. If winner is either
+ // 2 or 4, 3 will be evaluated. Instead, if start_n=3 and step_size=3,
+ // coarse search will evaluate 3 and 6. For the winner, unique neighbors
+ // (3: 2,4 or 6: 5,7) would be evaluated.
+
+ // Start index for coarse palette search for dominant colors and k-means
+ const uint8_t start_n_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+ 3, 3, 2,
+ 3, 3, 2 };
+ // Step size for coarse palette search for dominant colors and k-means
+ const uint8_t step_size_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+ 3, 3, 3,
+ 3, 3, 3 };
+
+ // Choose the start index and step size for coarse search based on number
+ // of colors
+ const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE);
+ const int min_n = start_n_lookup_table[max_n];
+ const int step_size = step_size_lookup_table[max_n];
+ assert(min_n >= PALETTE_MIN_SIZE);
+ // Perform top color coarse palette search to find the winner candidate
+ const int top_color_winner = perform_top_color_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1,
+ step_size, do_header_rd_based_gating, &unused, color_cache, n_cache,
+ best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+ discount_color_cost);
+ // Evaluate neighbors for the winner color (if winner is found) in the
+ // above coarse search for dominant colors
+ if (top_color_winner <= max_n) {
+ int stage2_min_n, stage2_max_n, stage2_step_size;
+ set_stage2_params(&stage2_min_n, &stage2_max_n, &stage2_step_size,
+ top_color_winner, max_n);
+ // perform finer search for the winner candidate
+ perform_top_color_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, stage2_min_n,
+ stage2_max_n + 1, stage2_step_size,
+ /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache,
+ best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+ tx_type_map, discount_color_cost);
+ }
+ // K-means clustering.
+ // Perform k-means coarse palette search to find the winner candidate
+ const int k_means_winner = perform_k_means_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+ min_n, max_n + 1, step_size, do_header_rd_based_gating, &unused,
+ color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+ rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+ best_blk_skip, tx_type_map, color_map, rows * cols,
+ discount_color_cost);
+ // Evaluate neighbors for the winner color (if winner is found) in the
+ // above coarse search for k-means
+ if (k_means_winner <= max_n) {
+ int start_n_stage2, end_n_stage2, step_size_stage2;
+ set_stage2_params(&start_n_stage2, &end_n_stage2, &step_size_stage2,
+ k_means_winner, max_n);
+ // perform finer search for the winner candidate
+ perform_k_means_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+ start_n_stage2, end_n_stage2 + 1, step_size_stage2,
+ /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache,
+ best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+ tx_type_map, color_map, rows * cols, discount_color_cost);
+ }
+ } else {
+ const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE),
+ min_n = PALETTE_MIN_SIZE;
+ // Perform top color palette search in ascending order
+ int last_n_searched = min_n;
+ perform_top_color_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1,
+ 1, do_header_rd_based_gating, &last_n_searched, color_cache, n_cache,
+ best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+ distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+ discount_color_cost);
+ if (last_n_searched < max_n) {
+ // Search in descending order until we get to the previous best
+ perform_top_color_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, max_n,
+ last_n_searched, -1, /*do_header_rd_based_gating=*/false, &unused,
+ color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+ rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+ best_blk_skip, tx_type_map, discount_color_cost);
+ }
+ // K-means clustering.
+ if (colors == PALETTE_MIN_SIZE) {
+ // Special case: These colors automatically become the centroids.
+ assert(colors == 2);
+ centroids[0] = lower_bound;
+ centroids[1] = upper_bound;
+ palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors,
+ color_cache, n_cache, /*do_header_rd_based_gating=*/false,
+ best_mbmi, best_palette_color_map, best_rd, rate,
+ rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+ best_blk_skip, tx_type_map, NULL, NULL,
+ discount_color_cost);
+ } else {
+ // Perform k-means palette search in ascending order
+ last_n_searched = min_n;
+ perform_k_means_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+ min_n, max_n + 1, 1, do_header_rd_based_gating, &last_n_searched,
+ color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+ rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+ best_blk_skip, tx_type_map, color_map, rows * cols,
+ discount_color_cost);
+ if (last_n_searched < max_n) {
+ // Search in descending order until we get to the previous best
+ perform_k_means_palette_search(
+ cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
+ max_n, last_n_searched, -1, /*do_header_rd_based_gating=*/false,
+ &unused, color_cache, n_cache, best_mbmi, best_palette_color_map,
+ best_rd, rate, rate_tokenonly, distortion, skippable,
+ beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
+ rows * cols, discount_color_cost);
+ }
+ }
+ }
+ }
+
+ if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
+ memcpy(color_map, best_palette_color_map,
+ block_width * block_height * sizeof(best_palette_color_map[0]));
+ // Gather the stats to determine whether to use screen content tools in
+ // function av1_determine_sc_tools_with_encoding().
+ x->palette_pixels += (block_width * block_height);
+ }
+ *mbmi = *best_mbmi;
+}
+
+void av1_rd_pick_palette_intra_sbuv(const AV1_COMP *cpi, MACROBLOCK *x,
+ int dc_mode_cost,
+ uint8_t *best_palette_color_map,
+ MB_MODE_INFO *const best_mbmi,
+ int64_t *best_rd, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+ mbmi->bsize));
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const SequenceHeader *const seq_params = cpi->common.seq_params;
+ int this_rate;
+ int64_t this_rd;
+ int colors_u, colors_v;
+ int colors_threshold_u = 0, colors_threshold_v = 0, colors_threshold = 0;
+ const int src_stride = x->plane[1].src.stride;
+ const uint8_t *const src_u = x->plane[1].src.buf;
+ const uint8_t *const src_v = x->plane[2].src.buf;
+ uint8_t *const color_map = xd->plane[1].color_index_map;
+ RD_STATS tokenonly_rd_stats;
+ int plane_block_width, plane_block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+ &plane_block_height, &rows, &cols);
+
+ mbmi->uv_mode = UV_DC_PRED;
+ if (seq_params->use_highbitdepth) {
+ int count_buf[1 << 12]; // Maximum (1 << 12) color levels.
+ int count_buf_8bit[1 << 8]; // Maximum (1 << 8) bins for hbd path.
+ av1_count_colors_highbd(src_u, src_stride, rows, cols,
+ seq_params->bit_depth, count_buf, count_buf_8bit,
+ &colors_threshold_u, &colors_u);
+ av1_count_colors_highbd(src_v, src_stride, rows, cols,
+ seq_params->bit_depth, count_buf, count_buf_8bit,
+ &colors_threshold_v, &colors_v);
+ } else {
+ int count_buf[1 << 8];
+ av1_count_colors(src_u, src_stride, rows, cols, count_buf, &colors_u);
+ av1_count_colors(src_v, src_stride, rows, cols, count_buf, &colors_v);
+ colors_threshold_u = colors_u;
+ colors_threshold_v = colors_v;
+ }
+
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+
+ colors_threshold = colors_threshold_u > colors_threshold_v
+ ? colors_threshold_u
+ : colors_threshold_v;
+ if (colors_threshold > 1 && colors_threshold <= 64) {
+ int r, c, n, i, j;
+ const int max_itr = 50;
+ int lb_u, ub_u, val_u;
+ int lb_v, ub_v, val_v;
+ int16_t *const data = x->palette_buffer->kmeans_data_buf;
+ int16_t centroids[2 * PALETTE_MAX_SIZE];
+
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
+ uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
+ if (seq_params->use_highbitdepth) {
+ lb_u = src_u16[0];
+ ub_u = src_u16[0];
+ lb_v = src_v16[0];
+ ub_v = src_v16[0];
+ } else {
+ lb_u = src_u[0];
+ ub_u = src_u[0];
+ lb_v = src_v[0];
+ ub_v = src_v[0];
+ }
+
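+    // Pack the chroma samples interleaved as (U, V) pairs so that each 2-D
+    // k-means data point carries both channels, while tracking per-channel
+    // value ranges for centroid initialization.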
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) {
+ if (seq_params->use_highbitdepth) {
+ val_u = src_u16[r * src_stride + c];
+ val_v = src_v16[r * src_stride + c];
+ data[(r * cols + c) * 2] = val_u;
+ data[(r * cols + c) * 2 + 1] = val_v;
+ } else {
+ val_u = src_u[r * src_stride + c];
+ val_v = src_v[r * src_stride + c];
+ data[(r * cols + c) * 2] = val_u;
+ data[(r * cols + c) * 2 + 1] = val_v;
+ }
+ if (val_u < lb_u)
+ lb_u = val_u;
+ else if (val_u > ub_u)
+ ub_u = val_u;
+ if (val_v < lb_v)
+ lb_v = val_v;
+ else if (val_v > ub_v)
+ ub_v = val_v;
+ }
+ }
+
+ const int colors = colors_u > colors_v ? colors_u : colors_v;
+ const int max_colors =
+ colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors;
+ for (n = PALETTE_MIN_SIZE; n <= max_colors; ++n) {
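+      // Seed the i-th centroid at the midpoint of the i-th of n equal
+      // sub-intervals of [lb, ub] in each channel, then refine with k-means.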
+ for (i = 0; i < n; ++i) {
+ centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
+ centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
+ }
+ av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
+ optimize_palette_colors(color_cache, n_cache, n, 2, centroids,
+ cpi->common.seq_params->bit_depth);
+ // Sort the U channel colors in ascending order.
+ for (i = 0; i < 2 * (n - 1); i += 2) {
+ int min_idx = i;
+ int min_val = centroids[i];
+ for (j = i + 2; j < 2 * n; j += 2)
+ if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
+ if (min_idx != i) {
+ int temp_u = centroids[i], temp_v = centroids[i + 1];
+ centroids[i] = centroids[min_idx];
+ centroids[i + 1] = centroids[min_idx + 1];
+ centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
+ }
+ }
+ av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
+ extend_palette_color_map(color_map, cols, rows, plane_block_width,
+ plane_block_height);
+ pmi->palette_size[1] = n;
+ for (i = 1; i < 3; ++i) {
+ for (j = 0; j < n; ++j) {
+ if (seq_params->use_highbitdepth)
+ pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
+ (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
+ else
+ pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
+ clip_pixel((int)centroids[j * 2 + i - 1]);
+ }
+ }
+
+ if (cpi->sf.intra_sf.early_term_chroma_palette_size_search) {
+ const int palette_mode_rate =
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+ const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0);
+        // Terminate further palette_size search if the header cost at the
+        // current palette_size already exceeds *best_rd, since larger
+        // palette sizes can only increase the header cost.
+ if (header_rd >= *best_rd) break;
+ av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) continue;
+ this_rate = tokenonly_rd_stats.rate + palette_mode_rate;
+ } else {
+ av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) continue;
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+ }
+
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ *best_mbmi = *mbmi;
+ memcpy(best_palette_color_map, color_map,
+ plane_block_width * plane_block_height *
+ sizeof(best_palette_color_map[0]));
+ *rate = this_rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *skippable = tokenonly_rd_stats.skip_txfm;
+ }
+ }
+ }
+ if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
+ memcpy(color_map, best_palette_color_map,
+ plane_block_width * plane_block_height *
+ sizeof(best_palette_color_map[0]));
+ }
+}
+
+void av1_restore_uv_color_map(const AV1_COMP *cpi, MACROBLOCK *x) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ int src_stride = x->plane[1].src.stride;
+ const uint8_t *const src_u = x->plane[1].src.buf;
+ const uint8_t *const src_v = x->plane[2].src.buf;
+ int16_t *const data = x->palette_buffer->kmeans_data_buf;
+ int16_t centroids[2 * PALETTE_MAX_SIZE];
+ uint8_t *const color_map = xd->plane[1].color_index_map;
+ int r, c;
+ const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
+ const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
+ int plane_block_width, plane_block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+ &plane_block_height, &rows, &cols);
+
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) {
+ if (cpi->common.seq_params->use_highbitdepth) {
+ data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
+ data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
+ } else {
+ data[(r * cols + c) * 2] = src_u[r * src_stride + c];
+ data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
+ }
+ }
+ }
+
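+  // Rebuild the 2-D centroids from the stored palette: plane r (1 = U,
+  // 2 = V) supplies component r - 1 of each (U, V) centroid.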
+ for (r = 1; r < 3; ++r) {
+ for (c = 0; c < pmi->palette_size[1]; ++c) {
+ centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
+ }
+ }
+
+ av1_calc_indices(data, centroids, color_map, rows * cols,
+ pmi->palette_size[1], 2);
+ extend_palette_color_map(color_map, cols, rows, plane_block_width,
+ plane_block_height);
+}
diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h
new file mode 100644
index 0000000000..7da863a0cc
--- /dev/null
+++ b/third_party/aom/av1/encoder/palette.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Declares functions used in palette search.
+ */
+#ifndef AOM_AV1_ENCODER_PALETTE_H_
+#define AOM_AV1_ENCODER_PALETTE_H_
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct PICK_MODE_CONTEXT;
+struct macroblock;
+
+/*!\cond */
+#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim##_c
+
+void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int16_t *data, int16_t *centroids,
+ uint8_t *indices, int n, int k,
+ int max_itr);
+void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int16_t *data, int16_t *centroids,
+ uint8_t *indices, int n, int k,
+ int max_itr);
+/*!\endcond */
+
+/*!\brief Calculates the cluster to which each data point belongs.
+ *
+ * \ingroup palette_mode_search
+ * \param[in] data The data points whose cluster indices are
+ * to be computed. The data layout is
+ * NUM_DATA_POINTS X DATA_DIM.
+ * \param[in] centroids Pointer to the centroids. The data layout
+ * is NUM_CENTROIDS X DATA_DIM.
+ * \param[in] indices Pointer to store the computed indices.
+ * \param[in] n Number of data points.
+ * \param[in] k Number of clusters.
+ * \param[in] dim Data dimension.
+ *
+ * \remark Returns nothing, but saves each data point's cluster index in
+ * \a indices.
+ */
+static INLINE void av1_calc_indices(const int16_t *data,
+ const int16_t *centroids, uint8_t *indices,
+ int n, int k, int dim) {
+ assert(n > 0);
+ assert(k > 0);
+ if (dim == 1) {
+ av1_calc_indices_dim1(data, centroids, indices, /*total_dist=*/NULL, n, k);
+ } else if (dim == 2) {
+ av1_calc_indices_dim2(data, centroids, indices, /*total_dist=*/NULL, n, k);
+ } else {
+ assert(0 && "Untemplated k means dimension");
+ }
+}
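+
+/* Usage sketch (illustrative only, not part of the library): assign four
+ * interleaved (U, V) points to two 2-D centroids, using the
+ * NUM_DATA_POINTS X DATA_DIM layout documented above. All values below are
+ * arbitrary.
+ *
+ *   const int16_t data[4 * 2] = { 10, 200, 12, 198, 90, 40, 92, 38 };
+ *   const int16_t centroids[2 * 2] = { 11, 199, 91, 39 };
+ *   uint8_t indices[4];
+ *   av1_calc_indices(data, centroids, indices, 4, 2, 2);  // dim = 2
+ *   // indices is now { 0, 0, 1, 1 }.
+ */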
+
+/*!\brief Performs k-means cluster on the data.
+ *
+ * \ingroup palette_mode_search
+ * \param[in] data The data points to be clustered. The data
+ * layout is NUM_DATA_POINTS X DATA_DIM.
+ * \param[in] centroids Pointer to store the computed centroids.
+ * The data layout is
+ * NUM_CENTROIDS X DATA_DIM.
+ * \param[in]    indices            Pointer to store the computed indices, one
+ *                                  per training data point.
+ * \param[in] n Number of data points.
+ * \param[in] k Number of clusters.
+ * \param[in] dim Data dimension.
+ * \param[in] max_itr Maximum number of iterations to run.
+ *
+ * \remark Returns nothing, but saves each cluster's centroid in \a centroids
+ * and each data point's cluster index in \a indices.
+ *
+ * \attention The output centroids are rounded off to the nearest integers.
+ */
+static INLINE void av1_k_means(const int16_t *data, int16_t *centroids,
+ uint8_t *indices, int n, int k, int dim,
+ int max_itr) {
+ assert(n > 0);
+ assert(k > 0);
+ if (dim == 1) {
+ AV1_K_MEANS_RENAME(av1_k_means, 1)(data, centroids, indices, n, k, max_itr);
+ } else if (dim == 2) {
+ AV1_K_MEANS_RENAME(av1_k_means, 2)(data, centroids, indices, n, k, max_itr);
+ } else {
+ assert(0 && "Untemplated k means dimension");
+ }
+}
+
+/*!\brief Removes duplicated centroid indices.
+ *
+ * \ingroup palette_mode_search
+ * \param[in]    centroids          The list of centroids.
+ * \param[in] num_centroids Number of centroids.
+ *
+ * \return Returns the number of unique centroids and saves the unique
+ * centroids at the beginning of the centroids array.
+ *
+ * \attention The centroids should be rounded to integers before calling this
+ * method.
+ */
+int av1_remove_duplicates(int16_t *centroids, int num_centroids);
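+
+/* Usage sketch (illustrative only, not part of the library): k-means output
+ * centroids are already rounded to integers (see the \attention note above),
+ * so nearby clusters can collapse to the same value; a palette search
+ * compacts them before costing. The sample values and k are arbitrary.
+ *
+ *   int16_t data[8] = { 4, 5, 4, 60, 61, 60, 5, 61 };
+ *   int16_t centroids[4];
+ *   uint8_t indices[8];
+ *   for (int i = 0; i < 4; ++i)  // even spread over [4, 61], as in palette.c
+ *     centroids[i] = 4 + (2 * i + 1) * (61 - 4) / 4 / 2;
+ *   av1_k_means(data, centroids, indices, 8, 4, 1, 50);  // dim = 1
+ *   const int n_unique = av1_remove_duplicates(centroids, 4);
+ */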
+
+/*!\brief Checks what colors are in the color cache.
+ *
+ * \ingroup palette_mode_search
+ * \param[in] color_cache A cache of colors.
+ * \param[in] n_cache Number of colors in the cache.
+ * \param[in] colors New base colors.
+ * \param[in] n_colors Number of new colors.
+ * \param[in]    cache_color_found  Stores whether each cached color is
+ *                                  present in colors.
+ * \param[in] out_cache_colors Stores what colors are not in the cache.
+ *
+ * \return Returns the number of colors that are not in the cache. In addition,
+ * records whether each cache color is present in colors in cache_color_found,
+ * and stores the out-of-cache colors in out_cache_colors.
+ */
+int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
+ const uint16_t *colors, int n_colors,
+ uint8_t *cache_color_found, int *out_cache_colors);
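+
+/* Worked example of the contract above (illustrative only): with
+ * color_cache = { 10, 20 }, n_cache = 2, colors = { 10, 30 } and
+ * n_colors = 2, the call returns 1, sets cache_color_found to { 1, 0 }
+ * (only 10 was matched) and writes out_cache_colors = { 30 }.
+ */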
+
+/*!\brief Gets the rate cost for delta encoding each v palette color.
+ *
+ * \ingroup palette_mode_search
+ * \param[in] pmi Struct that stores the palette mode info.
+ * \param[in] bit_depth Pixel bitdepth of the sequence.
+ * \param[in] zero_count Stores the number of zero deltas.
+ * \param[in]    min_bits           Minimum bits for the deltas. Set to
+ * bit_depth - 4.
+ *
+ * \return Returns the number of bits used to transmit each v palette color
+ * delta and sets zero_count to the number of zero deltas.
+ */
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *zero_count, int *min_bits);
+
+/*!\brief Gets the rate cost for transmitting luma palette color values.
+ *
+ * \ingroup palette_mode_search
+ * \param[in] pmi Struct that stores the palette mode info.
+ * \param[in]    color_cache        Color cache available at the decoder.
+ * \param[in] n_cache Number of colors in the cache.
+ * \param[in] bit_depth Pixel bitdepth of the sequence.
+ *
+ * \return Returns the rate needed to transmit the palette. Note that this does
+ * not include the cost of transmitting the color map.
+ */
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
+ const uint16_t *color_cache, int n_cache,
+ int bit_depth);
+
+/*!\brief Gets the rate cost for transmitting chroma palette color values.
+ *
+ * \ingroup palette_mode_search
+ * \param[in] pmi Struct that stores the palette mode info.
+ * \param[in]    color_cache        Color cache available at the decoder.
+ * \param[in] n_cache Number of colors in the cache.
+ * \param[in] bit_depth Pixel bitdepth of the sequence.
+ *
+ * \return Returns the rate needed to transmit the palette. Note that this does
+ * not include the cost of transmitting the color map.
+ */
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+ const uint16_t *color_cache, int n_cache,
+ int bit_depth);
+
+/*!\brief Search for the best palette in the luma plane.
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * This function is used in both inter and intra frame coding.
+ */
+void av1_rd_pick_palette_intra_sby(
+ const struct AV1_COMP *cpi, struct macroblock *x, BLOCK_SIZE bsize,
+ int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+ int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable, int *beat_best_rd, struct PICK_MODE_CONTEXT *ctx,
+ uint8_t *best_blk_skip, uint8_t *tx_type_map);
+
+/*!\brief Search for the best palette in the chroma plane.
+ *
+ * \ingroup palette_mode_search
+ * \callergraph
+ * This function is used in both inter and intra frame coding.
+ */
+void av1_rd_pick_palette_intra_sbuv(const struct AV1_COMP *cpi,
+ struct macroblock *x, int dc_mode_cost,
+ uint8_t *best_palette_color_map,
+ MB_MODE_INFO *const best_mbmi,
+ int64_t *best_rd, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ uint8_t *skippable);
+
+/*!\brief Restores the palette color map for the chroma channels.
+ */
+void av1_restore_uv_color_map(const struct AV1_COMP *cpi, struct macroblock *x);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PALETTE_H_
diff --git a/third_party/aom/av1/encoder/partition_cnn_weights.h b/third_party/aom/av1/encoder/partition_cnn_weights.h
new file mode 100644
index 0000000000..504038c63a
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_cnn_weights.h
@@ -0,0 +1,2139 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_
+#define AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/cnn.h"
+#include "av1/encoder/ml.h"
+
+#define CNN_BRANCH_0_OUT_CH 20
+#define CNN_BRANCH_1_OUT_CH 4
+#define CNN_BRANCH_2_OUT_CH 20
+#define CNN_BRANCH_3_OUT_CH 20
+#define CNN_TOT_OUT_CH \
+ (((CNN_BRANCH_0_OUT_CH) + (CNN_BRANCH_1_OUT_CH) + (CNN_BRANCH_2_OUT_CH) + \
+ (CNN_BRANCH_3_OUT_CH)))
+#define CNN_BRANCH_0_OUT_SIZE (CNN_BRANCH_0_OUT_CH)
+#define CNN_BRANCH_1_OUT_SIZE ((CNN_BRANCH_1_OUT_CH)*2 * 2)
+#define CNN_BRANCH_2_OUT_SIZE ((CNN_BRANCH_2_OUT_CH)*4 * 4)
+#define CNN_BRANCH_3_OUT_SIZE ((CNN_BRANCH_3_OUT_CH)*8 * 8)
+#define CNN_OUT_BUF_SIZE \
+ (((CNN_BRANCH_0_OUT_SIZE) + (CNN_BRANCH_1_OUT_SIZE) + \
+ (CNN_BRANCH_2_OUT_SIZE) + (CNN_BRANCH_3_OUT_SIZE)))
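+
+// With the channel counts above, CNN_TOT_OUT_CH is 20 + 4 + 20 + 20 = 64 and
+// CNN_OUT_BUF_SIZE works out to 20 + 4 * 2 * 2 + 20 * 4 * 4 + 20 * 8 * 8 =
+// 20 + 16 + 320 + 1280 = 1636 floats.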
+
+#define NUM_DNN_BRANCHES 4
+#define NUM_CNN_LAYERS 5
+#define BRANCH_0_NUM_DNN_LAYERS 2
+#define BRANCH_1_NUM_DNN_LAYERS 2
+#define BRANCH_2_NUM_DNN_LAYERS 2
+#define BRANCH_3_NUM_DNN_LAYERS 2
+#define CNN_LAYER_0_HEIGHT 5
+#define CNN_LAYER_0_WIDTH 5
+#define CNN_LAYER_0_IN_CH 1
+#define CNN_LAYER_0_OUT_CH 20
+#define CNN_LAYER_0_HORZ_STRIDE 4
+#define CNN_LAYER_0_VERT_STRIDE 4
+#define CNN_LAYER_1_HEIGHT 2
+#define CNN_LAYER_1_WIDTH 2
+#define CNN_LAYER_1_IN_CH 20
+#define CNN_LAYER_1_OUT_CH 20
+#define CNN_LAYER_1_HORZ_STRIDE 2
+#define CNN_LAYER_1_VERT_STRIDE 2
+#define CNN_LAYER_2_HEIGHT 2
+#define CNN_LAYER_2_WIDTH 2
+#define CNN_LAYER_2_IN_CH 20
+#define CNN_LAYER_2_OUT_CH 20
+#define CNN_LAYER_2_HORZ_STRIDE 2
+#define CNN_LAYER_2_VERT_STRIDE 2
+#define CNN_LAYER_3_HEIGHT 2
+#define CNN_LAYER_3_WIDTH 2
+#define CNN_LAYER_3_IN_CH 20
+#define CNN_LAYER_3_OUT_CH 4
+#define CNN_LAYER_3_HORZ_STRIDE 2
+#define CNN_LAYER_3_VERT_STRIDE 2
+#define CNN_LAYER_4_HEIGHT 2
+#define CNN_LAYER_4_WIDTH 2
+#define CNN_LAYER_4_IN_CH 4
+#define CNN_LAYER_4_OUT_CH 20
+#define CNN_LAYER_4_HORZ_STRIDE 2
+#define CNN_LAYER_4_VERT_STRIDE 2
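+// Assuming a 64x64 input block, the stride-4 first layer followed by the
+// four stride-2 layers above yields 16x16 -> 8x8 -> 4x4 -> 2x2 -> 1x1
+// feature maps, matching the 8x8, 4x4, 2x2 and 1x1 branch output sizes
+// defined earlier.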
+#define BRANCH_0_NUM_DNN_FEATURES 37
+#define BRANCH_0_NUM_DNN_LAYER_0_UNITS 16
+#define BRANCH_0_NUM_DNN_LAYER_1_UNITS 24
+#define BRANCH_0_NUM_LOGITS 1
+#define BRANCH_1_NUM_DNN_FEATURES 25
+#define BRANCH_1_NUM_DNN_LAYER_0_UNITS 16
+#define BRANCH_1_NUM_DNN_LAYER_1_UNITS 24
+#define BRANCH_1_NUM_LOGITS 1
+#define BRANCH_2_NUM_DNN_FEATURES 25
+#define BRANCH_2_NUM_DNN_LAYER_0_UNITS 16
+#define BRANCH_2_NUM_DNN_LAYER_1_UNITS 24
+#define BRANCH_2_NUM_LOGITS 1
+#define BRANCH_3_NUM_DNN_FEATURES 41
+#define BRANCH_3_NUM_DNN_LAYER_0_UNITS 16
+#define BRANCH_3_NUM_DNN_LAYER_1_UNITS 24
+#define BRANCH_3_NUM_LOGITS 1
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_0_kernel[] = {
+ 0.131894f, -0.593536f, -0.212935f, -0.00220011f, -0.396949f,
+ 0.287753f, -0.91875f, -0.0095057f, 0.804197f, -0.395239f,
+ 0.516604f, 1.16439f, 0.445784f, -0.163349f, 0.746488f,
+ -0.33891f, -0.562652f, 0.481403f, 0.755378f, -0.200753f,
+ 0.0784307f, 0.105657f, 0.0205673f, -0.524089f, -0.476146f,
+ -0.161206f, -0.65079f, 0.137474f, 0.28584f, 0.508768f,
+ -0.643386f, 0.227068f, -0.899507f, -0.413382f, 0.631466f,
+ 0.398203f, -0.544392f, 0.825155f, 0.671847f, -0.249779f,
+ 0.323121f, 0.125357f, -0.719564f, -0.0714854f, -0.168472f,
+ -0.213246f, -0.674525f, 0.330148f, -0.138414f, 0.20462f,
+ -0.518571f, -0.15091f, -0.605116f, -0.448732f, -0.475599f,
+ 0.738f, -0.328526f, 0.755035f, 0.969414f, -0.321039f,
+ -0.23068f, 0.408567f, -0.377813f, -0.273974f, 1.0684f,
+ 0.373968f, -0.450305f, 0.439258f, -0.381846f, -0.267331f,
+ 0.30613f, -0.39369f, 0.622438f, -0.52877f, -0.334991f,
+ 0.263193f, -0.402121f, 0.64142f, 0.793048f, -0.0231174f,
+ -0.68474f, -0.293338f, -0.737511f, -0.462654f, 0.474629f,
+ 0.141397f, -0.152529f, 0.345879f, -0.499991f, 0.00174024f,
+ 0.337387f, -0.131151f, 0.427385f, -0.457449f, -0.879614f,
+ -0.425908f, -0.263172f, 0.0344974f, 1.07861f, -0.00416662f,
+ 0.0208952f, 0.233905f, 0.765965f, 0.0423685f, -0.117554f,
+ -0.248237f, 0.49848f, -0.845131f, 0.223648f, -0.838709f,
+ 0.5834f, 0.309956f, -0.0625093f, -0.619619f, 0.918957f,
+ 0.358271f, -0.668459f, 0.518783f, -0.418963f, -0.206788f,
+ 0.364983f, -0.0396087f, 0.624309f, -0.138679f, -0.142453f,
+ 0.28309f, 0.895092f, -0.215713f, 0.439025f, 0.659333f,
+ -0.366025f, -0.413518f, 0.66657f, -0.265919f, 0.473471f,
+ -1.0729f, -0.526702f, 0.2838f, 0.367648f, -0.61242f,
+ 0.121656f, 0.547727f, -0.0636793f, -0.33006f, -0.306604f,
+ -0.00897731f, 0.688242f, 0.0944626f, 0.321508f, 0.0437392f,
+ -0.560035f, -0.768334f, 0.0571051f, -0.0427601f, -0.0437806f,
+ -0.816209f, -0.395829f, 0.293733f, 0.217645f, -0.646428f,
+ 0.132448f, -0.435806f, -0.0556814f, 0.0218857f, 0.348525f,
+ -0.17296f, 0.669057f, 0.638604f, -0.0995596f, -0.024099f,
+ -0.262332f, -0.548975f, 0.357894f, 0.43873f, -0.688234f,
+ -0.425519f, 0.190986f, -0.074778f, 0.294232f, -0.548969f,
+ -0.731198f, 0.03616f, -0.475969f, -0.306075f, -0.111929f,
+ -0.234146f, 0.612669f, 0.882254f, -0.622893f, 0.262431f,
+ 0.465242f, 0.245384f, -0.811016f, 0.501798f, -0.925875f,
+ 0.264373f, 0.307766f, -0.26872f, 0.113027f, -0.158875f,
+ 0.0711483f, 0.220275f, -0.0699022f, -0.0111303f, -0.435384f,
+ -0.720014f, 0.593484f, -0.964082f, 0.750925f, 0.252433f,
+ 0.964332f, -0.256904f, -0.421715f, -0.403851f, -0.188081f,
+ 0.694014f, -1.00183f, 0.798921f, 0.0603123f, 0.213814f,
+ 0.739642f, -0.0203375f, 0.72569f, -0.260224f, 0.0199516f,
+ -0.322451f, 0.318204f, -0.38392f, 0.740994f, -0.265215f,
+ -0.54541f, -0.51479f, -0.458397f, 0.519564f, 0.0509182f,
+ 0.0363331f, -0.293051f, 0.317714f, -0.327488f, -0.0840401f,
+ 0.318437f, -0.619403f, 0.641094f, -0.288435f, -0.260185f,
+ 0.181083f, -0.169294f, 0.292645f, 0.140405f, 0.0572885f,
+ -0.637428f, -0.102616f, 0.288955f, 0.817314f, 0.116855f,
+ 0.635532f, 0.283334f, -0.236391f, -0.305035f, -0.217365f,
+ -0.033021f, -0.455858f, 0.439922f, -0.104039f, 0.373376f,
+ 0.310659f, 0.388789f, 0.266341f, 0.0746306f, -0.428192f,
+ -0.202695f, -0.347625f, 0.00585741f, 0.366203f, 0.221413f,
+ 0.518856f, 0.57245f, -0.375071f, -0.2436f, -0.511895f,
+ -1.03708f, 0.681455f, -0.111544f, -0.183563f, 0.109729f,
+ -0.422646f, -0.529777f, 0.747473f, -0.270223f, -0.11435f,
+ 0.378931f, 0.420456f, 0.236331f, 0.49261f, -0.0666801f,
+ 0.0475846f, 0.906095f, -0.4146f, -0.020588f, -0.653285f,
+ 0.135335f, 0.543846f, -0.309061f, 0.11899f, -0.639168f,
+ -0.719994f, -0.219706f, -0.645631f, -0.829049f, -0.0114746f,
+ 0.834604f, 0.0378035f, 0.107957f, 0.546929f, -0.674395f,
+ -0.854817f, -1.1443f, 0.223413f, -0.326324f, 0.440971f,
+ 0.383582f, -0.495084f, 0.280091f, -0.53116f, 0.0333923f,
+ -0.354339f, -0.0449156f, -0.538896f, -0.753355f, 0.463995f,
+ 0.000969967f, -0.2832f, 0.587276f, 0.853094f, -0.481985f,
+ -0.138202f, 0.180989f, -0.349044f, -0.417534f, 0.455591f,
+ 0.287332f, 0.251496f, 0.381416f, 0.339632f, -0.0825727f,
+ 0.352739f, 0.161697f, -0.319764f, -0.258015f, 0.668833f,
+ -0.553303f, -0.578815f, -0.3758f, 0.289f, 0.247368f,
+ 0.00681103f, 0.421092f, -0.191033f, -0.425868f, -0.1239f,
+ 0.0540422f, -0.0856856f, 0.481168f, -0.0283741f, -0.196018f,
+ 0.230923f, -0.145288f, 0.52188f, 0.00628462f, -0.604556f,
+ -0.562879f, 0.319282f, 0.323799f, 0.453941f, 0.271129f,
+ -0.0520196f, 0.684571f, -0.391779f, -0.404614f, 0.134097f,
+ -0.825482f, 0.0913949f, 0.483543f, 0.159084f, 0.301637f,
+ 0.427013f, 0.196153f, 0.460091f, -0.730573f, -0.12278f,
+ 0.221665f, 0.674622f, -0.623363f, -0.0761517f, 0.637979f,
+ -0.468498f, 0.527276f, -0.596894f, -0.34675f, -0.251241f,
+ 0.418533f, -0.476696f, -0.901267f, -0.0088241f, -0.12421f,
+ -0.660316f, -0.0222117f, -0.470898f, -1.10739f, -0.441645f,
+ 0.39516f, -0.0117906f, 0.254122f, 0.00722599f, -1.00697f,
+ 0.48908f, -0.122287f, -0.378608f, -0.339145f, 0.682463f,
+ 0.305606f, 0.453628f, -0.49923f, -0.791388f, -0.202515f,
+ 0.23214f, -0.434209f, -0.778283f, -0.538015f, 0.145769f,
+ 0.446281f, -0.339329f, -0.198478f, -0.183717f, -0.855441f,
+ -0.105778f, 0.575067f, -0.18592f, -0.348094f, 0.740614f,
+ 0.041549f, -0.109663f, 0.0434492f, 0.245242f, -1.22192f,
+ 0.685896f, -0.208115f, -0.0616216f, -1.00552f, 0.31045f,
+ -0.184394f, 0.466705f, -0.0984364f, -0.506252f, 0.144874f,
+ 0.357038f, 0.675221f, -0.822171f, -0.52729f, 0.991212f,
+ 0.432422f, 0.383493f, -0.372395f, 0.35651f, -0.25369f,
+ 0.660208f, -0.117745f, -0.142433f, -0.724115f, -1.0035f,
+ -0.59178f, 0.563444f, -0.282531f, -0.599989f, 0.507424f,
+ -0.782875f, 0.755029f, -0.754962f, -0.617825f, 0.565984f,
+ -0.826878f, -0.456563f, 0.0212161f, 0.469867f, -0.144864f,
+ 0.225748f, -0.279029f, 0.21052f, -0.440183f, 0.936069f,
+ 0.170595f, 0.40966f, 0.452453f, -0.576006f, 1.50696f,
+ 0.649049f, 0.094957f, -0.167706f, -0.258342f, 0.59269f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_0_bias[] = {
+ 0.00475215f, -0.00362332f, -0.00317542f, 0.190083f, 0.0488147f,
+ -0.0268093f, -0.00432231f, 0.0112229f, 0.0626653f, -0.0025698f,
+ 0.0018675f, -0.00368139f, -0.00159125f, -0.00034354f, 0.311437f,
+ 0.000136436f, 0.0667295f, 0.0251274f, 0.00226553f, -0.000638344f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_1_kernel[] = {
+ 0.228403f, 0.241933f, 0.181079f, 0.101728f, 0.278455f,
+ -0.222078f, 0.387578f, 0.0847356f, -0.0737012f, 0.26518f,
+ -1.0817f, 0.0404161f, -0.805199f, 0.336576f, -0.541494f,
+ 0.246264f, 0.116597f, -0.756804f, -0.914136f, 0.410265f,
+ 0.413294f, 0.07873f, 0.450017f, -0.264346f, 0.549095f,
+ 1.03755f, -0.203542f, 1.61018f, 0.374131f, 0.402515f,
+ -2.36115f, 0.116427f, -0.172157f, -0.231482f, -0.905736f,
+ -0.0183059f, -0.575746f, 0.110348f, -0.268018f, 0.140399f,
+ 0.427196f, 0.0718528f, 0.247936f, -0.326661f, 0.150404f,
+ -0.659979f, -0.157148f, 0.00826241f, -0.679275f, -0.131564f,
+ -1.04822f, 1.06039f, -0.207898f, 0.510167f, 0.484233f,
+ 0.138972f, -0.0801639f, -0.184416f, 0.0741107f, -0.0299281f,
+ 0.112263f, 0.380071f, -0.0185269f, -0.0821188f, 0.918796f,
+ -0.576106f, 0.593007f, 0.479446f, 0.0440703f, 0.322379f,
+ 0.176783f, -0.147111f, 0.0953247f, -0.636377f, 0.0702104f,
+ 0.130979f, 0.293892f, -0.0112124f, -0.040347f, -0.16034f,
+ 0.3252f, -0.586802f, 0.601786f, -0.487148f, -0.458777f,
+ 0.463835f, 0.144942f, 0.00339965f, -0.779966f, 0.0585298f,
+ -1.20758f, -0.275614f, 0.292346f, -0.132781f, 0.337892f,
+ -0.357677f, 1.48511f, 0.172907f, -0.148668f, 0.243184f,
+ -0.503392f, -0.0791543f, 0.0265389f, -0.102267f, 0.213294f,
+ 0.0657801f, 0.156996f, 0.0891168f, 0.120805f, 0.261285f,
+ -0.343025f, -0.0792235f, -0.106415f, 0.133878f, -0.112981f,
+ -0.00151126f, -0.0643829f, 0.0458938f, -0.0452731f, -0.00147422f,
+ 0.1871f, -0.0208793f, 0.0752037f, 0.0794674f, 0.167666f,
+ 0.198028f, -0.361015f, -0.0661721f, -0.10672f, -0.0773641f,
+ -1.15856f, -0.516443f, -0.322702f, 0.15668f, 0.0075841f,
+ -0.157731f, 0.270926f, -0.241551f, 0.0169097f, -0.0263953f,
+ -0.303556f, -0.239237f, 0.117792f, -0.137871f, 0.122054f,
+ -0.587381f, 0.112938f, 0.0867262f, -0.27909f, -0.203622f,
+ -0.622195f, 0.42623f, 0.670704f, 0.190826f, -0.304979f,
+ -0.570075f, -0.240699f, 0.43744f, 0.632896f, -0.563846f,
+ -0.0160434f, -0.0709745f, 0.816662f, 0.269999f, -0.358734f,
+ 0.193644f, 1.19339f, -0.118223f, -0.363291f, -0.723616f,
+ -1.58825f, 0.0222856f, 0.769852f, 0.322713f, 0.0857619f,
+ -0.669756f, -1.08414f, 1.18593f, 0.486166f, -0.520646f,
+ 0.0861854f, -0.134197f, 0.258337f, 0.223345f, 0.697639f,
+ -0.57261f, 0.54031f, 0.892644f, 0.497572f, -0.287076f,
+ -1.95928f, -0.0568128f, -0.253335f, 0.00233392f, -0.192787f,
+ -0.115203f, -0.0975649f, 0.277954f, 0.000704534f, -0.315884f,
+ 0.309583f, 0.357458f, 0.0939298f, -0.072701f, 0.433045f,
+ -0.536938f, 0.534523f, 0.184585f, -0.0415175f, -0.120909f,
+ -1.2622f, 0.412449f, -0.114741f, 0.290453f, -0.441671f,
+ -0.0242497f, -0.20746f, 0.139019f, -0.422668f, -0.146732f,
+ -0.688828f, -0.00339426f, 0.04166f, 0.41755f, 0.405675f,
+ 0.562564f, 0.0216812f, 0.0271391f, 0.215227f, 0.328183f,
+ -1.6442f, -0.827838f, 0.115491f, 0.0951442f, -0.133779f,
+ -0.0482928f, 0.203177f, 0.322953f, -0.513259f, 0.0676788f,
+ -0.0877928f, 0.224448f, 0.451957f, 0.314243f, 0.307403f,
+ 0.35653f, 0.0286278f, 2.27554f, 0.569313f, -0.0488753f,
+ -2.48809f, 0.274555f, -0.248375f, -0.635634f, -0.187663f,
+ 0.1827f, -0.409634f, -0.0280568f, -0.207119f, -0.208192f,
+ -0.410268f, -0.017669f, 0.134856f, 0.434551f, 0.165201f,
+ 0.584608f, -0.389997f, -0.088713f, 0.118087f, 0.00210905f,
+ -1.07698f, -0.520967f, -0.198742f, 0.190255f, -0.162639f,
+ 0.0122759f, 0.460774f, -0.684633f, -0.149512f, 0.167556f,
+ -0.295034f, -0.0650964f, 0.0868653f, -0.691352f, 0.089795f,
+ 0.0620608f, 0.0531289f, 0.0124286f, 0.151921f, 1.51067f,
+ -0.10586f, -0.0311871f, 0.114706f, 0.0565205f, -0.159634f,
+ -0.423987f, -0.226896f, 0.0605352f, -0.36324f, -0.142205f,
+ -0.252249f, 0.0666312f, 0.316655f, 0.00687196f, 0.131079f,
+ -0.128281f, -0.293468f, 1.3327f, 0.542277f, -0.060088f,
+ -1.73475f, 0.0542297f, -0.227522f, -0.376004f, -0.147028f,
+ 0.0228252f, 0.0569538f, -0.0796497f, 0.0937596f, -0.0660153f,
+ -0.979219f, -0.377322f, 0.0523787f, 0.467299f, 0.0824278f,
+ 0.437147f, 0.263637f, 0.0325681f, 0.303581f, 0.353479f,
+ -0.142369f, -0.394797f, 0.597185f, 0.116482f, -0.0782593f,
+ 0.364539f, -0.30396f, 0.119016f, -0.0022429f, -0.044292f,
+ -0.0110531f, 0.233571f, 0.000975879f, 0.447332f, -0.0320396f,
+ 0.541609f, 0.14232f, 0.163905f, 0.848609f, 0.19954f,
+ -0.186591f, -0.44465f, -0.431672f, 0.159037f, -0.129977f,
+ -0.141778f, 0.246818f, -0.197539f, -0.70115f, 0.185449f,
+ 0.400274f, -0.0350744f, 0.239727f, -0.290504f, 0.0698443f,
+ -0.180374f, -0.759591f, -0.0569088f, -0.50246f, -0.0986616f,
+ -0.892114f, 0.306737f, -0.133937f, 0.285625f, 0.495471f,
+ -0.686222f, -0.168647f, -0.0926158f, 0.351772f, -0.0215394f,
+ 0.361223f, 0.0657142f, 0.268229f, -0.616299f, 0.0564718f,
+ -0.294013f, -0.588019f, 0.0234195f, -0.426863f, -0.511253f,
+ -0.72177f, 0.420903f, 0.0987506f, 0.309368f, 0.523532f,
+ 1.06073f, -0.33028f, 0.0818142f, 0.0130354f, 0.0180882f,
+ 0.0316898f, -0.416614f, -0.566344f, -0.163083f, 0.285085f,
+ -0.0534352f, 0.385496f, 0.151068f, -0.208295f, -0.175648f,
+ 0.0476705f, 0.190428f, -0.643391f, 0.484004f, -0.421836f,
+ -0.19829f, -0.227574f, -0.0869152f, 1.09881f, 0.345129f,
+ -0.236732f, -0.381935f, -1.46271f, 0.465914f, 0.610375f,
+ 0.689968f, -0.688546f, 1.95033f, 0.420946f, 0.0282428f,
+ 0.147823f, 0.669393f, 0.429085f, -0.328385f, -0.150439f,
+ -0.419097f, -0.828102f, 0.248743f, 0.24644f, 0.0186131f,
+ -0.384319f, -0.126294f, -0.417067f, 0.271483f, -0.0128456f,
+ -0.881351f, 0.152581f, 0.185584f, -0.745827f, 0.0551359f,
+ 0.127083f, 0.936983f, -0.0225341f, 0.575861f, 0.767417f,
+ -0.140867f, -0.762518f, 0.422446f, -0.0611973f, 0.0515641f,
+ -0.144168f, -0.298882f, 0.308461f, 0.0208704f, 0.213872f,
+ -0.258708f, 1.13186f, 0.314083f, -0.347536f, -0.137768f,
+ 0.653953f, -0.217883f, -0.56112f, -0.864661f, 0.488836f,
+ 0.268133f, -0.548664f, -0.765226f, 0.117082f, 0.326798f,
+ -0.678246f, 0.477785f, -1.27584f, 0.198912f, -0.710395f,
+ 1.39096f, -0.411577f, -0.55119f, 0.51092f, -0.295023f,
+ 0.245983f, -0.0957192f, -0.312001f, 0.0175991f, 0.524423f,
+ -0.126379f, 0.124687f, -1.53945f, -0.342856f, 0.514072f,
+ 0.400884f, -0.00581101f, -0.219327f, 0.0977873f, 0.337551f,
+ -0.058603f, 0.20034f, 0.0429945f, 0.676803f, -0.273585f,
+ -0.173435f, -0.581596f, 0.226263f, -0.0946223f, -0.060088f,
+ -0.0100809f, -0.022242f, -0.22218f, -0.030463f, -0.141389f,
+ -0.190757f, -0.00526518f, -0.77519f, -0.0825695f, 0.308403f,
+ 0.262792f, -0.601842f, 0.0783697f, 0.197527f, 0.0714048f,
+ 0.0392629f, -0.388628f, 0.172541f, -0.0222009f, 0.252096f,
+ 0.0728652f, 0.173632f, 0.192914f, -0.00969965f, 0.0530136f,
+ -0.00765759f, 0.440234f, -0.0943323f, 0.112319f, 0.0878737f,
+ -0.739021f, 0.385305f, 0.133334f, -0.396697f, 0.177818f,
+ -0.0712558f, 0.516923f, 0.102174f, 0.17158f, -0.211068f,
+ 0.295795f, -0.36198f, 0.179087f, -0.845744f, -0.242514f,
+ -1.49073f, 0.272702f, 0.59011f, -0.408184f, -0.0731313f,
+ 0.234643f, 0.589642f, -0.100778f, 0.516921f, -0.700154f,
+ 0.316432f, 0.36117f, 0.0380282f, 0.480101f, -0.0975487f,
+ 0.941452f, 0.231705f, -0.151182f, -1.20305f, 0.28255f,
+ -0.0427662f, -0.00717175f, -0.842085f, -0.357376f, 0.545581f,
+ -0.290714f, 0.741498f, 1.00377f, 0.483864f, 0.150405f,
+ 0.0834512f, -0.10031f, 0.424054f, -0.0223491f, -0.0696701f,
+ -0.134479f, -0.747227f, 0.422208f, 0.123858f, -0.392624f,
+ -0.0299847f, -0.0376142f, -0.392536f, -0.0343114f, 0.298224f,
+ -0.375899f, 0.693119f, 0.27909f, -0.53463f, 0.105459f,
+ -0.0267383f, 0.5094f, -0.411557f, 0.451749f, -0.348479f,
+ -0.0497316f, -0.353913f, -0.14858f, 0.241838f, 0.331039f,
+ 0.756607f, -0.0701661f, -0.827264f, -0.367772f, 0.447201f,
+ 0.834616f, -0.00497265f, -0.0557285f, 0.055088f, -0.300115f,
+ -0.143833f, -1.07838f, -0.106896f, 0.16945f, 0.0170324f,
+ 0.108754f, 0.335893f, -0.0923708f, 0.450209f, -0.0713308f,
+ -0.0233037f, -0.0129902f, -1.40664f, -0.0996218f, 0.711236f,
+ 0.400716f, 0.227871f, 2.01499f, 0.572926f, 0.135673f,
+ -0.0340458f, -0.316736f, 0.24257f, -0.700768f, -0.194985f,
+ 0.312011f, -0.179599f, 0.128114f, 0.0725977f, -0.193816f,
+ 0.352143f, 0.070641f, -0.467808f, -0.399047f, 0.10136f,
+ 0.671574f, -0.553965f, 0.105729f, 0.210383f, 0.065048f,
+ 0.248198f, -0.731674f, 0.588725f, -0.308237f, 0.24511f,
+ 0.00608906f, 0.170906f, 0.246175f, 0.149521f, 0.106071f,
+ 0.160246f, 0.118487f, -0.104102f, 0.872823f, 0.227478f,
+ 0.0182631f, -0.115083f, 0.0142445f, 0.307947f, -0.884925f,
+ 0.0767105f, 0.0414042f, -0.448021f, -0.0400193f, -0.0765448f,
+ -0.411931f, -0.199624f, 0.333371f, 0.17267f, -0.0431816f,
+ 0.190826f, -0.0758961f, -1.02831f, -0.0414525f, 0.605374f,
+ -0.0188181f, -0.2207f, 1.30004f, -0.207005f, -0.0333617f,
+ 0.227145f, 0.105059f, -0.0473393f, -0.448752f, -0.0342152f,
+ -0.0244812f, 0.220329f, 0.0313591f, -0.0902074f, -0.0731945f,
+ 0.88488f, 0.306306f, -0.275613f, -0.476372f, 0.00678104f,
+ 0.442029f, 0.122049f, 0.118042f, 0.270527f, -0.462538f,
+ 0.0665021f, -0.260255f, 0.209182f, 0.162321f, 0.0629934f,
+ -0.244896f, -0.078863f, 0.655585f, -0.0506617f, -0.487128f,
+ 0.118765f, -0.34408f, 0.0930615f, -0.365632f, -0.0670776f,
+ 0.44428f, 0.286734f, 0.146608f, 0.686757f, -0.0738428f,
+ -0.10034f, -0.928438f, -0.172601f, -0.0959575f, -0.010532f,
+ 0.277549f, 0.28773f, -0.318883f, 0.71254f, 0.273593f,
+ -0.382845f, -0.0104587f, -0.647769f, 0.25541f, 0.194625f,
+ 0.265197f, -0.750938f, -0.0650515f, -0.567092f, 0.070613f,
+ 0.209531f, 0.429699f, 0.130676f, 0.514914f, 0.615778f,
+ 0.594535f, -0.0878778f, 0.40593f, -0.303383f, 0.0907863f,
+ -0.320068f, 0.0137162f, -0.303424f, 0.594207f, -0.236524f,
+ -0.692627f, -0.990063f, -0.0262934f, 0.222375f, 0.503412f,
+ 0.220224f, 0.676871f, -0.150996f, 0.379777f, 0.841339f,
+ -1.05981f, 0.259943f, -0.781745f, 0.0346478f, 0.115791f,
+ -0.25171f, -0.00872158f, 0.395561f, -0.0849893f, -1.20134f,
+ -0.313938f, 0.789542f, 0.159606f, -0.782095f, -0.229754f,
+ 0.266687f, -0.0354282f, -0.3041f, 0.0338618f, -0.390001f,
+ -0.28362f, -0.436144f, 0.777351f, 0.855321f, 0.653338f,
+ -0.0382912f, -0.204577f, 1.13828f, 0.220395f, -4.60853f,
+ 0.575694f, 0.0453189f, 1.76567f, 0.466151f, -0.366109f,
+ 0.594717f, 0.278891f, -0.750676f, -0.332739f, -0.942304f,
+ 0.280363f, 0.284561f, 0.209326f, 0.238347f, -0.0124311f,
+ -0.439463f, -0.036186f, 0.165997f, 0.374717f, -0.481148f,
+ -0.626417f, 0.0223598f, 0.039337f, -0.379918f, 0.211046f,
+ 0.0795812f, 0.863355f, -0.341448f, 0.421494f, 0.410477f,
+ -0.117025f, -0.511108f, 0.565193f, -0.063582f, -0.031349f,
+ -0.0750174f, 0.387941f, 0.541266f, 0.0919753f, 1.05041f,
+ 0.263004f, 0.289006f, 0.0439694f, -1.22439f, -0.247832f,
+ 0.260967f, 0.355794f, 0.599694f, -0.69418f, 0.372805f,
+ -0.161731f, 0.0720574f, 0.0394657f, 0.122772f, -0.458067f,
+ -0.370826f, -1.34495e-05f, -0.373404f, 0.0245539f, -2.3472f,
+ -2.61448f, 0.264794f, 0.0601582f, -0.968597f, -0.196022f,
+ -0.727067f, 0.167346f, 0.517478f, 0.0035377f, 0.777219f,
+ 0.553128f, 0.727211f, 0.606202f, -0.495604f, 2.41445f,
+ 0.465214f, -0.0443004f, 0.142972f, 0.141459f, -0.17771f,
+ 0.0156117f, 0.169264f, 0.0428022f, -0.164827f, -0.240632f,
+ 0.215289f, -0.213134f, -0.184163f, 0.0161321f, -0.20025f,
+ -0.0311616f, 0.00292108f, -0.0131921f, 0.0437664f, -0.104817f,
+ -0.131906f, 0.0822771f, 0.237307f, -0.347567f, -1.2485f,
+ 0.253616f, -0.442217f, 0.0514077f, 0.337561f, -0.0147658f,
+ -0.132888f, -0.643821f, 0.445573f, -0.0146213f, 0.235511f,
+ 0.53583f, -0.640644f, 0.0280044f, 0.00628834f, 0.143885f,
+ 0.380077f, -0.542342f, 0.363101f, 0.0647334f, -0.476556f,
+ -0.822676f, 0.482454f, -0.0467326f, -0.253083f, 0.116726f,
+ 0.317333f, 0.548131f, -0.234667f, 0.579923f, -0.420683f,
+ 0.595613f, -0.279864f, -0.753204f, -0.516844f, -0.436574f,
+ -0.120682f, -0.278939f, 0.752202f, -0.183443f, -0.14632f,
+ -0.0344068f, 0.127638f, -0.225245f, 0.489391f, 0.145082f,
+ -0.73672f, 0.980065f, -0.0367412f, 0.40632f, -0.802509f,
+ 0.356897f, 0.366172f, 1.23858f, -0.978381f, -0.684924f,
+ -0.0870693f, -0.353628f, 0.695788f, -0.244593f, -1.8897f,
+ -0.257803f, 0.686937f, 0.405155f, -0.125696f, 0.258075f,
+ 0.570584f, -0.439481f, -0.59798f, 0.0745711f, -0.235162f,
+ 0.133048f, -0.243033f, 0.0415527f, -0.00118735f, 0.00980514f,
+ -0.297429f, -0.144983f, 0.463093f, 0.0965441f, -0.338508f,
+ -0.651077f, 0.817577f, -0.0364773f, -0.388465f, 0.113288f,
+ 0.231198f, 0.316208f, -0.592201f, 0.530376f, -0.431434f,
+ 0.0200985f, 0.104303f, -0.130705f, 0.4374f, 0.362342f,
+ 0.70641f, 0.20037f, 0.309128f, -0.484535f, -1.18469f,
+ 0.513893f, 0.201236f, -0.022396f, 0.179638f, -0.361289f,
+ -0.0794946f, -1.04704f, -0.0281103f, 0.0494822f, 0.00196415f,
+ 0.0625478f, -0.229033f, 0.12018f, 0.542629f, -0.222423f,
+ -0.0123321f, -0.0988525f, 0.773192f, -0.192218f, -3.19156f,
+ 0.300606f, 0.462751f, 2.2968f, 0.137182f, 0.132539f,
+ 0.165884f, 0.128818f, -0.155856f, -0.558538f, -0.231742f,
+ -0.244377f, -0.442397f, 0.250947f, 0.0850658f, -0.00820139f,
+ 0.391284f, 0.17453f, 0.306003f, -0.531499f, -0.624451f,
+ 0.564584f, -0.343953f, -0.0278713f, 0.212664f, -0.135969f,
+ -0.0179867f, -0.687887f, 0.371065f, -0.0537029f, 0.0499509f,
+ 0.0980684f, -0.0438569f, 0.186731f, 0.182105f, 0.172254f,
+ -0.149446f, -0.0247637f, 0.148098f, 1.20772f, -0.136664f,
+ 0.00983112f, 0.0181381f, -0.0147549f, -0.0846561f, -0.827022f,
+ 0.00207177f, 0.0478215f, 0.0652549f, 0.0898219f, -0.0224959f,
+ -0.0274246f, 0.0166498f, -0.0211715f, -0.502932f, 0.0961452f,
+ 0.251206f, -0.0623632f, 0.741566f, 0.0078449f, -2.99162f,
+ -0.187244f, 0.0743479f, 1.46425f, 0.0737923f, 0.0133544f,
+ 0.20922f, -0.178671f, -0.0528492f, -0.526717f, 0.0282125f,
+ -0.0363201f, 0.37406f, -0.303658f, -0.066803f, 0.132237f,
+ 0.962057f, -0.399733f, 0.191765f, -0.452606f, -0.348732f,
+ 0.444939f, 0.153025f, 0.0796317f, 0.265985f, -0.319638f,
+ 0.0278161f, -0.333734f, 0.226108f, 0.147895f, -0.124066f,
+ -0.37306f, 0.19541f, 0.200175f, -0.0593244f, 0.0333887f,
+ -0.0284278f, 0.462491f, 0.0686487f, -0.332435f, -0.437166f,
+ 0.302795f, 0.100542f, 0.0265019f, 0.767212f, -0.140621f,
+ 0.11558f, -0.70584f, -0.00017415f, 0.00793092f, -0.0490901f,
+ 0.0598338f, 0.484876f, -0.13025f, 0.660349f, 0.147503f,
+ -0.462766f, 0.0843824f, 0.218493f, 0.310921f, -0.162284f,
+ 0.210404f, -0.788799f, 0.0698512f, -0.484799f, 0.0311505f,
+ -0.308243f, 0.417298f, 0.0593723f, 0.208908f, 0.451437f,
+ 0.354546f, -0.0700888f, -0.281678f, -0.311177f, 0.00914652f,
+ -0.372084f, 0.135036f, 0.185393f, 0.461347f, -0.114241f,
+ -0.402347f, -0.692327f, 0.0376155f, -0.200267f, 0.565963f,
+ -0.0627442f, 0.429677f, 0.170514f, 0.350565f, 0.699528f,
+ -0.948126f, -0.364205f, 0.348878f, -0.137832f, -0.0791649f,
+ -0.0462295f, -0.255078f, -0.398509f, 0.136783f, -0.0164628f,
+ -0.555472f, 0.690396f, 0.147715f, 0.000523095f, 0.14874f,
+ 0.524804f, 0.162974f, 0.797599f, 0.277473f, -0.500696f,
+ 0.189917f, -0.333309f, 0.00613646f, -1.07817f, 0.0470502f,
+ 0.210766f, 0.159768f, -0.447774f, -0.252968f, -1.72739f,
+ 0.0658259f, -0.448747f, 2.26511f, 0.349651f, 0.157232f,
+ 0.956842f, 0.856676f, 0.149227f, -0.626957f, -0.566771f,
+ -0.0980846f, 0.351668f, -0.362741f, -0.0272282f, -0.113632f,
+ 0.366015f, -0.00790003f, -0.458632f, -0.31157f, -0.182257f,
+ -0.953975f, 0.0583582f, 0.164721f, -0.900107f, -0.115542f,
+ 0.0654192f, 0.99056f, -0.247976f, 0.48254f, 0.670196f,
+ 0.098585f, -0.212855f, 0.310072f, 0.0894616f, 0.151944f,
+ 0.119629f, -0.26735f, 0.162257f, -0.0305818f, 0.681526f,
+ -0.229847f, 1.01556f, 0.29132f, 0.740113f, 0.0703937f,
+ 0.537892f, -0.18653f, -0.0252359f, -0.420014f, 0.197631f,
+ -0.176629f, 0.00674754f, 0.301288f, -0.162816f, 0.636235f,
+ -0.341362f, 0.197296f, -0.589747f, -0.749363f, -0.277197f,
+ -1.27291f, -0.0857908f, -0.147591f, -0.0956297f, -0.109097f,
+ 0.0717554f, 0.359078f, 0.301457f, 0.486934f, -0.260955f,
+ -0.126821f, 1.55756f, 0.477469f, -1.45363f, 1.42198f,
+ -0.360847f, -0.0211924f, -0.0184957f, -0.110706f, -0.152136f,
+ 0.104703f, 0.267615f, 0.127392f, 0.172996f, 0.258326f,
+ 0.268578f, -0.431123f, -0.114419f, 0.0101172f, -0.195671f,
+ 0.0792025f, -0.151505f, -0.064077f, 0.0479777f, -0.141882f,
+ 0.121492f, -0.139132f, -0.348252f, 0.341043f, -0.565367f,
+ -0.0791259f, -0.781086f, 0.0140045f, 0.571094f, -0.00875077f,
+ 0.217132f, -0.202345f, 0.157213f, 0.228445f, 0.366612f,
+ -0.529989f, 0.42241f, -0.540538f, -0.0425556f, -0.207774f,
+ -0.0663941f, 0.37836f, -0.0650245f, -0.0828694f, -0.0835478f,
+ -0.795512f, 0.470268f, 0.1551f, -0.69017f, -0.116735f,
+ 0.157614f, 0.555973f, -0.293311f, 0.245428f, -0.0853701f,
+ -0.449278f, -0.0551647f, -0.00137429f, 0.709439f, -0.456796f,
+ 0.132062f, -0.0449484f, -0.308599f, 0.180608f, -2.24196f,
+ 0.421478f, -0.640946f, -0.460397f, -0.920628f, -0.184949f,
+ -0.0416982f, 0.6484f, -0.22806f, 0.412229f, -0.468079f,
+ -0.72372f, -0.347698f, -1.3899f, 0.631876f, 0.0611046f,
+ 0.0294258f, -0.128091f, -0.205615f, 0.355348f, -0.267725f,
+ -0.644835f, 0.435879f, 0.517477f, -0.338123f, -0.157764f,
+ 0.32762f, -0.166454f, 0.221007f, -0.0438278f, -0.0777725f,
+ 0.10986f, 0.941545f, -0.542284f, -0.172312f, -0.256597f,
+ -0.0181391f, 0.220623f, -0.432456f, 0.0164074f, 0.250226f,
+ -0.522576f, 0.783109f, 0.198703f, -0.784554f, -0.0929628f,
+ 0.326861f, 0.470293f, 0.442684f, 0.271879f, -0.108256f,
+ 0.0483558f, -0.403151f, 0.36183f, -0.268186f, 0.270851f,
+ -0.696826f, -0.166037f, -0.354658f, 0.405977f, -0.473447f,
+ 0.649689f, -0.0863114f, -0.147319f, 0.0869966f, 0.319792f,
+ 0.493026f, -1.07456f, 0.354751f, 0.114605f, -0.120647f,
+ -0.238315f, 0.0290955f, -0.355299f, -0.45381f, 0.0812865f,
+ -0.0180434f, 0.00861318f, -0.892943f, -0.0127801f, -1.66398f,
+ 0.290505f, 0.126832f, 2.08173f, -0.0454847f, -0.162481f,
+ 1.07426f, 0.228566f, 0.280528f, -0.537625f, -0.175288f,
+ -0.118012f, 0.649114f, -0.349926f, -0.0189864f, -0.30934f,
+ -0.363178f, -0.119822f, -0.22656f, 0.484513f, -0.173269f,
+ 0.41987f, -0.448517f, -0.0950466f, 0.482443f, 0.061558f,
+ 0.4219f, -0.536388f, 0.0781972f, 0.212489f, 0.104229f,
+ -0.0792804f, 0.402066f, -0.676313f, -0.2272f, -0.16379f,
+ 0.260145f, -0.0504658f, -0.0826579f, -1.37749f, 0.00790747f,
+ 0.0841031f, -0.0671308f, -0.00301736f, -0.386206f, 0.190311f,
+ 0.0702639f, 0.0643968f, 0.133741f, -0.0141555f, -0.0365324f,
+ 0.87028f, 0.207894f, -0.421266f, 0.689256f, 0.145037f,
+ -0.270796f, 0.212604f, -0.345326f, 0.0074631f, -1.72379f,
+ 0.0672097f, -0.273153f, 1.30503f, -1.01324f, 0.00284696f,
+ 0.851459f, 0.176847f, 0.30948f, -0.57144f, -0.0596695f,
+ -0.111189f, 0.130361f, -0.298286f, 0.0567591f, -0.0885215f,
+ -0.847601f, 0.238624f, -0.162391f, 0.452357f, -0.0192713f,
+ 0.226661f, 0.0762922f, -0.0894055f, 0.332702f, 0.424484f,
+ 0.0443207f, -0.162345f, -0.601036f, 0.280527f, -0.137362f,
+ 0.266345f, 0.729438f, -0.887182f, 0.152943f, -0.573548f,
+ -0.0201383f, -0.56521f, 0.033582f, 0.300284f, -0.144472f,
+ 0.633026f, 0.30866f, 0.0653073f, 0.316901f, 0.0721326f,
+ 0.192252f, -0.833162f, 0.194292f, -0.08663f, -0.189401f,
+ -0.178242f, 0.111488f, 0.522487f, -0.65497f, 0.457049f,
+ 0.390654f, 0.0522936f, -0.39712f, -0.293717f, -0.374656f,
+ -0.118916f, -0.853076f, -0.0829578f, -0.17335f, -0.0218694f,
+ 0.367968f, 0.478469f, 0.0913813f, 0.519251f, 0.803526f,
+ -0.272516f, -0.341329f, 0.0897285f, 0.247653f, 0.000898686f,
+ 0.313196f, 0.000587979f, -0.314189f, -0.449439f, -0.0291611f,
+ -0.356287f, -0.722904f, -0.0480958f, -0.523758f, -0.576146f,
+ 0.133754f, 0.616921f, -0.085494f, 0.487487f, 0.745129f,
+ 0.993267f, 0.256555f, 0.0822743f, 0.0411971f, 0.139388f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_1_bias[] = {
+ 0.00447951f, 0.0202534f, 0.00970833f, -0.00460874f, 0.0942288f,
+ -0.0534704f, 0.00829869f, -0.0255174f, -0.0809143f, 0.00169117f,
+ 0.0177427f, 0.0259387f, 0.0291077f, -0.0267599f, 0.100275f,
+ -0.00389366f, 0.0315499f, 0.0265846f, -0.000206604f, 0.0302221f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_2_kernel[] = {
+ 0.153048f, 0.0725422f, 0.068901f, -0.475608f, 0.0736706f,
+ -0.134076f, 0.229289f, 0.0217921f, 0.0449205f, -1.00002f,
+ 0.149133f, 0.0497258f, 0.118988f, 0.0741764f, 0.0385486f,
+ 0.225181f, 0.012966f, 0.155593f, -3.07175f, -0.0641051f,
+ 0.09161f, 0.0259005f, -0.209998f, -0.420298f, 0.0587126f,
+ 0.00352744f, 0.0451313f, -0.049384f, 0.11516f, 0.083135f,
+ 0.103675f, -0.0185604f, 0.0623248f, -0.0993726f, 0.0448522f,
+ 0.0134017f, -0.294776f, -0.251924f, 0.0712635f, -0.0764298f,
+ -0.463766f, -0.0295011f, -0.579168f, 0.573853f, -0.00596607f,
+ 0.0237762f, -0.0500104f, -0.0969275f, 0.155573f, 0.0515382f,
+ -0.178454f, -0.154008f, -0.278299f, -0.166421f, 0.0149533f,
+ -0.0700236f, 0.239287f, -1.19545f, -0.0744625f, 0.143037f,
+ 0.141874f, 0.086302f, 0.0838633f, -0.454179f, 0.120308f,
+ -0.0896718f, 0.254909f, 0.0714462f, 0.00471098f, -0.869494f,
+ 0.209407f, 0.138285f, 0.0816641f, 0.0666266f, 0.0848555f,
+ 0.173313f, 0.0695633f, 0.285667f, -3.15384f, 0.00140275f,
+ -0.969824f, -0.0318689f, -0.00487396f, 0.412541f, 0.0263593f,
+ -0.249824f, 0.0897776f, 0.0208836f, -0.0982745f, -0.16049f,
+ -0.12719f, -0.186166f, 0.102338f, 0.273931f, -0.0886306f,
+ -0.19513f, -0.0135712f, -0.194127f, -0.0834291f, 0.426623f,
+ -0.0705446f, 0.0327476f, 0.0800862f, 0.478757f, -0.00849111f,
+ -0.554911f, -0.0489312f, -0.184029f, -0.227428f, 0.159989f,
+ -0.0677731f, -0.0901436f, 0.00308696f, -0.352243f, 0.278715f,
+ 0.306374f, -0.0772054f, -0.0122733f, -0.0693457f, 0.074365f,
+ -0.267458f, -0.123612f, -0.495954f, 0.552604f, -0.103951f,
+ -0.121771f, 0.179966f, -0.377947f, -1.35472f, 0.153294f,
+ -0.445284f, -0.089813f, -0.00529807f, 0.254047f, -0.0378426f,
+ 0.114597f, -0.143052f, 0.0815258f, -0.10528f, 0.00833533f,
+ -0.117508f, 0.129052f, 0.0706719f, -1.39506f, 0.0124731f,
+ 0.109831f, -0.0744156f, 0.181612f, 0.0787894f, 0.0293352f,
+ 0.494929f, 0.00997207f, -0.585882f, -0.0844138f, -0.00864134f,
+ -0.109943f, 0.0713114f, 0.14883f, 0.0610554f, 0.204145f,
+ -0.00390313f, 0.0184763f, -0.111387f, 0.175442f, -0.0840215f,
+ -0.178785f, -0.0693612f, -0.254507f, -0.191549f, 0.501561f,
+ -0.0858995f, -0.164921f, 0.0250706f, -0.0916282f, 0.247085f,
+ 0.13877f, -0.419487f, -0.295065f, -0.213812f, -0.10362f,
+ 0.138243f, 0.086985f, 0.113633f, -0.459273f, 0.12388f,
+ -0.139296f, 0.253792f, 0.0421624f, 0.0665065f, -0.977282f,
+ 0.199927f, 0.115194f, 0.099045f, 0.0534806f, 0.089283f,
+ 0.0815367f, 0.150901f, 0.253458f, -3.24825f, -0.0118163f,
+ -0.544565f, 0.0201825f, -0.0682201f, 0.759028f, 0.00479696f,
+ -0.00625607f, 0.058007f, -0.0811189f, -0.114617f, -0.0998578f,
+ 0.133312f, 0.0246256f, -0.0167416f, 0.196118f, 0.109823f,
+ 0.109489f, 0.474682f, -0.763475f, 0.0818745f, 0.0798777f,
+ -0.0994905f, -0.00138143f, -0.108563f, 0.697289f, -0.103702f,
+ -0.306085f, -0.0996705f, -0.142618f, -0.130989f, 0.0813303f,
+ -0.0909275f, -0.10786f, -0.0280431f, 0.206877f, -1.70798f,
+ 0.525568f, 0.559891f, -0.166132f, -0.227574f, -0.150955f,
+ 0.0849226f, 0.00497342f, -0.168667f, -0.282575f, 0.00537805f,
+ -0.0185572f, 0.0607167f, -0.0534948f, -0.0215776f, -0.14825f,
+ -0.0164577f, -0.0611978f, 0.0347562f, 0.286917f, 0.226598f,
+ 0.149497f, -0.478101f, -0.246006f, 0.0663239f, -0.121728f,
+ 0.267087f, 0.0802681f, -0.184741f, -0.558267f, 0.0437066f,
+ 0.13816f, -0.0710939f, 0.0725697f, 0.339857f, 0.161069f,
+ 0.304871f, 0.108138f, 0.193396f, 0.0891607f, -0.0701939f,
+ -0.182038f, -0.451873f, -0.233883f, 0.0444747f, 0.0436545f,
+ -0.245894f, -0.0721136f, 0.309013f, 0.278996f, 0.0259377f,
+ 0.0278116f, 0.0686773f, -0.271237f, 0.235082f, -0.0778285f,
+ -0.456541f, -0.109303f, -0.074565f, -0.407301f, -0.162191f,
+ -0.801819f, 0.372435f, -0.559083f, -0.039189f, 0.0477762f,
+ 0.0875363f, 0.0699926f, 0.116552f, -0.308217f, 0.0341607f,
+ -0.14202f, 0.135517f, 0.0316971f, 0.153297f, -0.759722f,
+ 0.12849f, 0.114229f, 0.0814893f, 0.275402f, 0.0403976f,
+ 0.0357503f, 0.212295f, 0.0673998f, -2.59822f, -0.0475021f,
+ -0.0594725f, 0.0659163f, 0.0469717f, -0.0370461f, -0.12863f,
+ -0.381743f, -0.0445055f, -0.106843f, -0.0880648f, 0.00591106f,
+ 0.235514f, -0.165162f, -0.0696645f, 0.115374f, 0.245558f,
+ 0.192049f, -0.388628f, -0.48291f, 0.154313f, -0.160207f,
+ 0.125928f, 0.122039f, 0.0713794f, -0.161244f, 0.128082f,
+ -0.234659f, 0.0680219f, 0.0597933f, 0.208421f, -0.163623f,
+ 0.196873f, 0.156603f, 0.184179f, -0.278331f, -0.0481286f,
+ 0.0828152f, 0.247004f, 0.0915582f, -0.0906229f, -0.20376f,
+ 0.136593f, 0.0740336f, -0.0134935f, -0.355048f, 0.0898485f,
+ -0.0962068f, 0.185804f, -0.0145596f, 0.0966589f, -0.515784f,
+ 0.121602f, 0.0320428f, 0.11093f, -0.0559421f, 0.0355484f,
+ 0.192128f, 0.0500888f, 0.133641f, -1.73282f, -0.0624599f,
+ 0.122524f, 0.0757292f, -0.0974648f, -0.193649f, 0.0561096f,
+ 0.0159959f, 0.0334472f, -0.0168832f, -0.12386f, -0.112419f,
+ 0.19552f, 0.0308502f, 0.0537643f, -0.0181012f, 0.0392183f,
+ 0.0461833f, -0.52623f, -0.238252f, 0.0821762f, -0.212384f,
+ 0.112901f, 0.096063f, 0.0540225f, 0.0773583f, 0.143045f,
+ -0.101551f, 0.282418f, 0.0176749f, -0.00244542f, -0.780154f,
+ -0.254428f, -5.82215f, 0.106638f, 0.11746f, 0.0486823f,
+ 0.164562f, 0.0303006f, 0.229614f, -2.41845f, -0.117122f,
+ 0.0451654f, 0.0237383f, -0.208731f, 0.0721137f, 0.0761163f,
+ -0.0569416f, -0.00830511f, -0.045256f, 0.14535f, -0.0189222f,
+ -0.283363f, -3.15502f, 0.0971161f, -0.035913f, 0.00813281f,
+ 0.0187974f, -0.361573f, -0.302067f, 0.118014f, -0.0956148f,
+ -0.596567f, 0.0105443f, -0.49019f, -0.0801959f, 0.0322344f,
+ -0.0280032f, 0.0555038f, -0.111495f, -0.0994456f, 0.0178021f,
+ 0.0358362f, 1.07063f, -0.0833138f, 0.0621246f, 0.0637157f,
+ 0.0999207f, 0.191975f, -1.2811f, 0.0341681f, 0.14818f,
+ 0.0957259f, 0.109909f, 0.0566115f, 0.0585633f, 0.179939f,
+ -0.104372f, 0.309091f, 0.0172941f, 0.0243182f, -0.935252f,
+ -0.296257f, -5.83634f, 0.0899249f, 0.455347f, 0.129505f,
+ 0.220212f, 0.0214801f, 0.284802f, -2.94585f, -0.0805413f,
+ -1.01819f, 0.00534034f, -0.057203f, 0.0869331f, 0.0207575f,
+ -0.124479f, -0.0465806f, 0.0894252f, 0.32203f, 0.0858497f,
+ 0.25178f, 0.0932205f, 0.0888455f, 0.233153f, -0.446398f,
+ -0.00791233f, 0.0909603f, -0.0904397f, 0.131835f, 0.475597f,
+ -0.1236f, 0.0231622f, 0.138602f, -0.097731f, -0.0282484f,
+ -0.549095f, -0.0457428f, -0.0895407f, -0.293965f, 0.166872f,
+ 0.46719f, 0.236254f, 0.0615991f, 0.499236f, 0.540366f,
+ 0.402035f, 0.0606324f, -0.0499928f, -0.0155198f, 0.0994403f,
+ -0.14773f, -0.183433f, -0.612093f, -0.334201f, -0.110877f,
+ -0.143441f, 0.05815f, -0.318586f, -0.344235f, 0.199593f,
+ 0.51109f, -0.252281f, -0.028834f, 0.0615421f, 0.0623699f,
+ 0.210745f, -0.236448f, 0.166279f, 0.127516f, -0.0971157f,
+ -0.204389f, 0.208112f, 0.0377023f, 0.271837f, -0.00859528f,
+ 0.0797081f, -0.00582115f, 0.140018f, -0.384865f, -0.0853243f,
+ -0.586727f, -0.0664489f, -0.631436f, -0.245828f, -0.0647894f,
+ -0.171912f, -0.0801706f, 0.0731614f, -0.11725f, 0.281478f,
+ -0.03047f, 0.0363488f, -0.0481651f, -0.326329f, -0.0155898f,
+ -0.428316f, -0.0989367f, -0.271902f, -0.00263837f, 0.366168f,
+ 0.325989f, 0.165463f, 0.0668512f, -0.142202f, 0.419992f,
+ 0.164971f, -0.515479f, -0.187585f, -0.151783f, -0.0682468f,
+ 0.0910191f, 0.117086f, 0.106579f, 0.0961825f, 0.162148f,
+ -0.129645f, 0.301039f, 0.000320343f, -0.0558097f, -0.844295f,
+ -0.218919f, -5.7571f, 0.0982612f, 0.238955f, 0.0703565f,
+ 0.0969388f, 0.107202f, 0.321585f, -3.00594f, -0.058755f,
+ -0.620004f, 0.052114f, 0.128423f, -0.177673f, -0.00341509f,
+ -0.146756f, -0.0414309f, -0.0893262f, -0.0584779f, -0.129552f,
+ 0.127629f, 0.13275f, -0.0973342f, -0.215617f, 0.0724309f,
+ 0.0102229f, 0.178137f, -0.943374f, -0.171465f, 0.304949f,
+ -0.0963836f, -0.0346437f, -0.138667f, -0.234184f, 0.0344159f,
+ -0.319592f, -0.0990766f, -0.16065f, 0.369432f, 0.194911f,
+ 0.363348f, -0.356009f, -0.00736217f, 0.241788f, -2.21311f,
+ 0.704816f, 0.697019f, 0.129186f, -0.132799f, -0.11861f,
+ 0.0383451f, 0.0247782f, -0.12687f, 0.0256552f, 0.048413f,
+ 0.00660549f, 0.0457962f, -0.012819f, 0.115991f, -0.1117f,
+ -0.291045f, -0.646138f, 0.0813613f, 0.112063f, 0.191675f,
+ 0.120835f, -0.444267f, -0.340385f, 0.0391936f, -0.151132f,
+ 0.184419f, 0.124998f, -0.14089f, 0.214087f, 0.00108535f,
+ 0.119611f, 0.0236965f, 0.0715074f, -0.225997f, -0.0126552f,
+ -0.459214f, -0.490444f, 0.173716f, 0.355811f, -0.13607f,
+ -0.191091f, -0.530085f, -0.400666f, 0.011221f, 0.10527f,
+ -0.11498f, -0.011864f, 0.364376f, 0.0319587f, -0.0528563f,
+ 0.0353899f, 0.0393453f, -0.289211f, -0.347785f, -0.0417157f,
+ 0.545848f, 0.741785f, -0.0732565f, -1.29687f, -0.0433128f,
+ -1.44162f, 0.318894f, -0.377784f, 0.123751f, -0.00444347f,
+ 0.0957118f, 0.0893616f, 0.0911595f, 0.092917f, 0.127681f,
+ -0.159929f, 0.190417f, -0.0297948f, -0.00132599f, -0.742756f,
+ -0.0364169f, -4.00108f, 0.0784767f, 0.223048f, 0.0430138f,
+ 0.0180493f, 0.212842f, 0.122987f, -2.83267f, -0.0641464f,
+ -0.173247f, 0.100946f, 0.0804885f, 0.0172631f, 0.0877408f,
+ -0.353222f, 0.0108262f, -0.0452121f, -0.116127f, 0.268154f,
+ -0.132587f, -0.27481f, -0.0316914f, 0.0610525f, 0.439691f,
+ 0.00966415f, -0.78962f, -0.424823f, -0.0214365f, -0.113846f,
+ 0.100793f, 0.126482f, 0.0415354f, 0.0427995f, 0.14273f,
+ -0.315674f, 0.110095f, 0.0061568f, 0.0320474f, -0.3596f,
+ -0.12533f, -1.28837f, 0.174673f, -0.235912f, 0.00495439f,
+ 0.0695473f, 0.266489f, 0.049248f, 0.0868526f, -0.0685969f,
+ 0.102984f, 0.0924639f, -0.027535f, 0.0709277f, 0.155776f,
+ -0.190944f, 0.188273f, -0.00897471f, 0.0964232f, -0.475822f,
+ -0.209374f, -5.00252f, 0.103495f, 0.110698f, 0.00682092f,
+ 0.208586f, 0.0489575f, 0.0966254f, -1.42973f, -0.0645128f,
+ 0.0515961f, 0.0571281f, -0.0992321f, 0.00791648f, 0.0087609f,
+ 0.0607367f, 0.0315705f, 0.0183317f, 0.0756087f, -0.0292847f,
+ -0.212932f, -0.782259f, 0.0899944f, 0.102677f, 0.0681135f,
+ 0.0447764f, -0.481969f, -0.221459f, 0.0794475f, -0.229157f,
+ 0.136781f, 0.0832359f, 0.0297807f, -0.00287225f, -5.97897f,
+ -0.0960581f, 0.250945f, -0.00133314f, -0.112396f, -0.856922f,
+ 0.115776f, 0.124536f, 0.0914194f, -0.160775f, 0.128684f,
+ 0.106718f, 0.100665f, 0.139579f, -0.86141f, -0.190323f,
+ 0.0884896f, 0.0363845f, -0.19831f, 0.121601f, 0.0264453f,
+ -0.00557822f, 0.0720238f, -0.0140132f, -0.166814f, -0.266214f,
+ 0.00500545f, 0.0146905f, 0.126035f, 0.0812372f, 0.0615973f,
+ 0.0766063f, -0.420156f, -0.126157f, -0.0284299f, -0.112513f,
+ -0.567008f, -0.0100263f, -0.607567f, 0.193053f, 0.0067527f,
+ -0.0753897f, 0.00134269f, -0.0512249f, -0.161661f, 0.0667741f,
+ -0.113702f, -0.071606f, -0.300563f, 0.276479f, -0.155318f,
+ -0.0512306f, 0.0896443f, -0.987911f, 0.0440889f, 0.430958f,
+ 0.175427f, 0.101385f, 0.0303662f, 0.0672653f, -6.62463f,
+ -0.10475f, 0.228249f, -0.00482173f, -0.0608713f, -0.895836f,
+ 0.187976f, 0.162173f, 0.0747544f, 0.219953f, 0.0682489f,
+ 0.142665f, 0.100287f, 0.301887f, -1.97736f, -0.295001f,
+ -1.0733f, -0.0562668f, -0.0604295f, 0.0304073f, 0.194274f,
+ -0.243593f, 0.0727137f, 0.0610967f, -0.0692415f, -0.02967f,
+ 0.055633f, 0.0192402f, 0.105841f, 0.102236f, -0.0757102f,
+ -0.0067639f, 0.0102317f, -0.257959f, -0.0638652f, 0.45521f,
+ -0.114967f, 0.0921177f, 0.223796f, 0.277072f, -0.0613282f,
+ -0.564693f, -0.151333f, -0.158035f, 0.228491f, 0.12997f,
+ -0.192625f, -0.125344f, 0.0983258f, -0.931206f, 0.618715f,
+ 0.273759f, -0.145527f, -0.099431f, -0.119551f, 0.0663484f,
+ -0.161419f, -0.202377f, -0.545393f, 0.0917645f, 0.042263f,
+ -0.17117f, -0.178622f, -0.336977f, 0.866715f, 0.0376922f,
+ -0.319728f, -0.127406f, 0.0599384f, 0.268804f, -0.0331844f,
+ 0.355326f, -0.103902f, 0.0425935f, 0.00525512f, -0.133687f,
+ -0.122695f, 0.145582f, 0.139013f, -0.0053352f, 0.0313566f,
+ 0.327295f, -0.0117993f, 0.233524f, 0.162388f, -0.0793262f,
+ 0.454543f, 0.0442224f, -0.742673f, -0.144882f, 0.0874983f,
+ -0.0707259f, 0.0219869f, 0.201728f, 0.0204537f, 0.0788857f,
+ -0.0374329f, 0.0724169f, 0.0743593f, -0.0193526f, -0.313546f,
+ -0.418882f, -0.0815754f, -0.197144f, 0.305053f, 0.330196f,
+ -0.131006f, -0.00113249f, 0.0750458f, -0.541764f, 0.299935f,
+ 0.308516f, -0.20547f, -0.333066f, 0.0285833f, 0.191147f,
+ 0.160372f, 0.0724649f, 0.0426326f, 0.153046f, -6.59656f,
+ -0.081237f, 0.219163f, 0.0147081f, -0.0109837f, -1.01487f,
+ 0.170055f, 0.163386f, 0.106413f, 0.150188f, 0.0688875f,
+ 0.0541359f, 0.156307f, 0.178844f, -1.51054f, -0.149477f,
+ -0.504503f, 0.017878f, -0.181821f, -0.0999659f, 0.0484548f,
+ -0.32211f, 0.0406744f, 0.0017627f, 0.0220593f, 0.0900512f,
+ -0.561625f, 0.107279f, -0.0861521f, -0.0862376f, 0.0816765f,
+ 0.168072f, 0.150063f, -0.816825f, -0.13569f, 0.557555f,
+ -0.155265f, 0.025135f, -0.109304f, -0.0487062f, -0.00347487f,
+ -0.454803f, -0.0394371f, -0.214597f, -0.248898f, 0.286501f,
+ -0.249246f, -0.138935f, 0.00391409f, -0.122544f, -2.14993f,
+ 0.588942f, 0.541231f, 0.0154047f, -0.359742f, 0.0520729f,
+ 0.0667058f, 0.0418163f, -0.132533f, -0.184759f, 0.0546118f,
+ -0.131198f, 0.109664f, -0.0714679f, -0.114163f, -0.243081f,
+ -0.0405089f, 0.0342795f, 0.0801825f, -0.268408f, 0.192207f,
+ 0.0800494f, -0.586539f, -0.118155f, -0.0508569f, -0.193987f,
+ 0.261478f, 0.105719f, -0.125361f, -0.0956201f, 0.0233802f,
+ 0.271098f, 0.0113352f, 0.0910447f, 0.00628244f, -0.071722f,
+ 0.21439f, 0.0747191f, 0.207765f, -0.0782454f, -0.0151716f,
+ -0.196505f, -0.44798f, -0.228597f, 0.0549039f, -0.120715f,
+ -0.19388f, -0.0768461f, 0.361102f, 0.122936f, -0.0334211f,
+ -0.202503f, -0.0450776f, -0.272345f, 0.662321f, 0.109247f,
+ -0.218026f, -0.0669386f, -0.0864701f, -0.633421f, -0.158007f,
+ -1.10778f, 0.351211f, -0.541458f, -0.0171707f, 0.149606f,
+ 0.106105f, 0.0880349f, 0.0968455f, 0.113269f, -5.01949f,
+ -0.106404f, 0.175578f, -0.030045f, -0.0267249f, -0.563713f,
+ 0.173885f, 0.130772f, 0.0334519f, 0.0770157f, 0.0394389f,
+ -0.0290326f, 0.220003f, 0.180901f, -1.62203f, -0.151858f,
+ -0.202386f, -0.0067836f, 0.0287665f, -0.194183f, -0.239834f,
+ -0.484159f, 0.00671722f, -0.122459f, 0.0808959f, -0.263769f,
+ -0.015066f, -0.0429868f, -0.111255f, -0.231872f, 0.219659f,
+ -0.0437412f, -0.536618f, -0.477831f, 0.0421895f, -0.0815851f,
+ 0.119638f, 0.0786293f, -0.000668378f, 0.0305567f, -0.0868189f,
+ -0.178327f, 0.0799657f, 0.0280923f, -0.211395f, -0.464577f,
+ 0.216912f, 0.0761976f, 0.160288f, -0.416372f, -0.10286f,
+ -0.0733786f, 0.261033f, 0.0493698f, 0.143137f, -0.179979f,
+ 0.15655f, 0.0897976f, -0.0258041f, -0.152852f, -6.15512f,
+ -0.118917f, 0.227283f, -0.0514043f, -0.0786432f, -0.523485f,
+ 0.1644f, 0.0869001f, 0.0984082f, -0.428288f, 0.0791992f,
+ 0.141904f, 0.0652073f, 0.104429f, -0.775125f, -0.121479f,
+ 0.0841637f, 0.0135705f, -0.208863f, -0.0629523f, 0.0455794f,
+ 0.0513898f, -0.0147657f, 0.0401145f, 0.0660079f, 0.0210609f,
+ -0.0151801f, 0.0562111f, 0.140308f, -0.0196394f, 0.0230753f,
+ -0.0336115f, -0.422411f, -0.196974f, -0.0405748f, -0.283428f,
+ 0.15458f, 0.0876296f, 0.0314038f, 0.16389f, -7.01385f,
+ -0.117146f, 0.197273f, -0.0400688f, 0.0143951f, -0.964007f,
+ -0.0618919f, 0.0406891f, 0.07992f, -0.144132f, 0.116416f,
+ 0.0326838f, 0.103641f, 0.171805f, -1.05158f, -0.182589f,
+ 0.116991f, 0.0530774f, -0.212454f, -0.016727f, -0.0565992f,
+ 0.0712873f, 0.0445466f, -0.000107032f, -0.121449f, -0.15148f,
+ 0.0220338f, 0.0762024f, 0.12253f, 0.0622466f, 0.0835822f,
+ 0.0465119f, -0.388743f, -0.34665f, -0.0720734f, -0.101581f,
+ -0.630565f, -0.0512685f, -0.520541f, 0.0530119f, -0.0245276f,
+ -0.19116f, -0.0144446f, -0.0604486f, 0.187251f, -0.021341f,
+ -0.217823f, 0.0510256f, -0.197946f, 0.060955f, -0.0617316f,
+ 0.0741673f, 0.117591f, -1.47844f, -0.0911093f, 0.359225f,
+ 0.145027f, 0.127513f, 0.0617905f, 0.141154f, -7.63868f,
+ -0.0808127f, 0.274843f, 0.00693195f, -0.0283113f, -0.853871f,
+ -0.15737f, 0.0858904f, 0.0746279f, 0.109912f, 0.193775f,
+ 0.0698094f, 0.174159f, 0.259556f, -1.49885f, -0.156706f,
+ -1.04113f, -0.0329546f, -0.0491449f, -0.0304125f, 0.0514892f,
+ -0.244284f, 0.126814f, -0.0387081f, -0.153173f, -0.0566748f,
+ 0.294111f, -0.0170534f, 0.102381f, 0.447606f, -0.0613267f,
+ -0.0636869f, -0.0347599f, -0.259572f, -0.0657846f, 0.454352f,
+ -0.169453f, -0.00177987f, 0.133279f, -0.0863932f, -0.134423f,
+ -0.475107f, -0.00448962f, -0.214607f, 0.111413f, 0.194377f,
+ -0.0710837f, 0.0562353f, 0.0401193f, 0.248595f, 0.538374f,
+ 0.449469f, -0.39111f, 0.0125057f, 0.0448811f, -0.00707751f,
+ -0.164894f, -0.317516f, -0.56231f, -0.270262f, 0.127016f,
+ -0.12092f, -0.0881587f, -0.323908f, 0.872344f, 0.103391f,
+ 0.267971f, -0.155088f, -0.0136683f, 0.309517f, 0.119901f,
+ 0.271307f, -0.188463f, 0.185121f, -0.142777f, -0.110535f,
+ -0.163107f, 0.175502f, 0.0801924f, 0.240499f, 0.0874759f,
+ 0.308907f, -0.00222504f, 0.193366f, 0.109018f, -0.0772158f,
+ -0.520675f, 0.0259432f, -0.736666f, -0.296579f, 0.043486f,
+ -0.128932f, 0.0417669f, 0.125747f, 0.157879f, 0.112857f,
+ -0.0595681f, 0.0611936f, -0.042125f, -0.270338f, 0.120072f,
+ -0.36675f, -0.0347962f, -0.119539f, 0.0873369f, 0.296432f,
+ -0.069501f, -0.0383859f, 0.0913597f, -0.40747f, 0.234276f,
+ 0.332536f, -0.732132f, -0.312291f, 0.137759f, 0.227593f,
+ 0.14165f, 0.129068f, 0.102734f, 0.135818f, -7.35883f,
+ -0.101533f, 0.256027f, -0.0142278f, -0.0561601f, -1.09899f,
+ -0.106538f, 0.0612256f, 0.099487f, -0.0605983f, 0.134311f,
+ 0.052226f, 0.143672f, 0.219944f, -1.47539f, -0.101828f,
+ -0.429979f, 0.010478f, -0.0132605f, 0.103363f, 0.0267373f,
+ -0.338865f, 0.0090188f, 0.0810085f, -0.124368f, -0.0133776f,
+ 0.595666f, -0.00162201f, -0.212444f, -0.26342f, 0.0913656f,
+ -0.106279f, 0.414515f, -0.709901f, -0.00198859f, 0.305288f,
+ -0.188536f, -0.0377482f, -0.131909f, -0.116099f, -0.236827f,
+ -0.36356f, 0.0179455f, -0.202143f, -0.00395508f, 0.177363f,
+ 0.0630679f, -0.145173f, -0.0558639f, -0.44879f, -1.55687f,
+ 0.473398f, 0.50531f, -0.0656231f, -0.137197f, 0.064707f,
+ 0.122083f, 0.0321111f, -0.167096f, 0.0406581f, -0.0793592f,
+ -0.0777081f, 0.0321379f, -0.0108834f, -0.0652323f, -0.102918f,
+ 0.0178664f, 0.0781873f, 0.0613189f, -0.04177f, 0.159566f,
+ 0.15134f, -0.445996f, -0.384905f, 0.0951659f, -0.175046f,
+ 0.255746f, 0.177047f, -0.150632f, 0.200522f, 0.00778549f,
+ 0.232168f, -0.0304652f, 0.083155f, -0.125395f, -0.0203289f,
+ -0.23874f, 0.0349836f, 0.231701f, -0.14849f, -0.204272f,
+ -0.198309f, -0.364955f, -0.228428f, 0.0614142f, -0.040976f,
+ -0.227785f, -0.0898404f, 0.271566f, -0.209196f, 0.0226431f,
+ -0.0911715f, 0.0840369f, -0.299411f, -0.529182f, 0.0622292f,
+ 0.202475f, 0.0155583f, -0.083114f, 0.124253f, -0.22721f,
+ -1.02565f, 0.193961f, -0.54287f, -0.00849364f, 0.11124f,
+ 0.0993531f, 0.120621f, 0.0959537f, 0.136274f, -5.23358f,
+ -0.107433f, 0.155286f, -0.0136043f, -0.0246768f, -0.631187f,
+ -0.0493852f, 0.0446751f, 0.0588353f, 0.160766f, -0.0354385f,
+ -0.0672548f, 0.243743f, 0.186004f, -1.20199f, -0.151872f,
+ -0.0760096f, -0.00775123f, -0.0122227f, 0.0891327f, -0.377876f,
+ -0.469926f, -0.134715f, -0.0969362f, 0.212542f, 0.0871489f,
+ 0.164638f, -0.0485785f, -0.167754f, -0.515052f, 0.13821f,
+ 0.0515572f, -0.430691f, -0.394719f, 0.143947f, -0.00670816f,
+ 0.129623f, 0.140299f, 0.0336978f, 0.153545f, -0.350927f,
+ -0.213485f, 0.0344809f, 0.0405889f, 0.0749967f, -0.369352f,
+ -0.109398f, 0.0350649f, 0.190893f, -0.284106f, -0.185376f,
+ 0.0105842f, 0.263692f, 0.160429f, 0.0998209f, -0.127779f,
+ 0.140558f, 0.108968f, -0.0122672f, 0.102875f, -5.72172f,
+ -0.161288f, 0.135935f, -0.0143087f, 0.106556f, -0.649813f,
+ -0.123049f, -0.0108861f, 0.102918f, -0.298137f, 0.0329013f,
+ 0.100763f, 0.12018f, 0.100782f, -0.648036f, -0.111122f,
+ 0.12363f, 0.0211952f, -0.225201f, 0.0506021f, 0.0167621f,
+ 0.0608759f, -0.0245646f, 0.0503477f, -0.0972749f, -0.0415155f,
+ -0.00578366f, -0.0977591f, 0.124867f, 0.0134788f, -0.0375816f,
+ -0.00581233f, -0.272292f, -0.250393f, 0.024511f, -0.184891f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_2_bias[] = {
+ 0.182474f, 0.0223202f, 0.204111f, 0.0573683f, 0.111143f,
+ 0.0800926f, -0.0364215f, 0.192371f, 0.00498262f, 0.302543f,
+ 0.0133081f, 0.119719f, 0.237522f, -0.266705f, 0.129427f,
+ 0.0695857f, 0.22068f, 0.231667f, 0.405829f, -0.0972567f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_3_kernel[] = {
+ -0.0393876f, -0.269924f, -0.0703231f, -0.0236484f, 0.170478f,
+ 0.245566f, 0.175963f, 0.104194f, -0.0490501f, -0.157605f,
+ -0.0275165f, -0.0169499f, -0.250725f, 0.215203f, -0.00733655f,
+ 0.0111298f, 0.205606f, 0.928046f, 0.15139f, 0.0955483f,
+ -0.015115f, -0.126643f, 0.0957605f, -0.140178f, -0.0246866f,
+ 0.097097f, 0.116287f, 0.177746f, 0.0570021f, -0.0518686f,
+ -0.0446482f, -0.0125318f, 0.0116092f, 0.102431f, 0.0898519f,
+ 0.0870372f, -0.843274f, 0.383311f, -0.102761f, -0.0246494f,
+ 0.0312555f, 0.19472f, 0.111573f, 0.0920392f, -0.0555618f,
+ 0.326461f, 0.219357f, -0.133727f, -0.118399f, -0.0611432f,
+ -0.169931f, 0.123733f, -0.204607f, 0.082592f, 0.0323181f,
+ 0.201618f, -0.00388867f, -0.053583f, 0.0266333f, -0.0951787f,
+ -0.0358283f, -0.0649549f, 0.0119263f, -0.11812f, 0.209851f,
+ -0.036616f, -0.014911f, -0.138096f, -0.139664f, -0.207395f,
+ 0.0128848f, -0.201816f, 0.0899419f, 0.343308f, -0.0096243f,
+ -0.212605f, -0.0905284f, -0.0597114f, -0.055261f, -0.0653405f,
+ 0.0330484f, -0.27681f, -0.0994095f, -0.0468272f, 0.145713f,
+ 0.267216f, 0.185335f, 0.1798f, -0.0437882f, -0.200401f,
+ -0.0398117f, -0.0736501f, -0.166349f, 0.203316f, 0.0710647f,
+ 0.061825f, 0.281131f, 0.733323f, 0.215488f, 0.00145659f,
+ -0.138995f, -0.0833713f, 0.107809f, -0.105343f, -0.0672139f,
+ 0.101852f, 0.135455f, 0.132903f, 0.0312017f, -0.0643586f,
+ -0.0274546f, -0.0687466f, -0.020233f, 0.109444f, 0.0774587f,
+ 0.139497f, -0.800587f, 0.325783f, -0.0546695f, -0.092003f,
+ -0.0773301f, 0.189672f, 0.0604666f, 0.0939425f, 0.679495f,
+ 0.114789f, -0.161153f, 0.12843f, -0.0345385f, -0.134641f,
+ -0.153995f, 0.0823055f, -0.0349296f, 0.0299183f, -0.0606872f,
+ 0.137588f, 0.0449805f, -0.0555399f, -0.00553351f, -0.120719f,
+ -0.204701f, -0.0739813f, 0.0584115f, -0.104833f, -0.110989f,
+ 0.00845446f, 0.0630702f, -0.147861f, 0.0268545f, -0.216419f,
+ 0.00531986f, -0.206641f, 0.253082f, 0.413215f, -0.05909f,
+ -0.0939983f, -0.116818f, -0.0450892f, -0.0551134f, -0.00696931f,
+ -0.113003f, -0.289192f, -0.00884866f, -0.0365724f, 0.0401887f,
+ 0.238622f, 0.149151f, 0.175751f, -0.157425f, -0.138924f,
+ -0.0277598f, -0.0285915f, 0.10165f, 0.209532f, 0.0862249f,
+ 0.0256428f, 0.623204f, -0.0941196f, 0.20345f, -0.132869f,
+ 0.00947298f, -0.14753f, 0.103918f, -0.161799f, 0.125566f,
+ 0.10916f, 0.115446f, 0.135627f, -0.0181667f, -0.0734694f,
+ -0.0154729f, -0.085849f, -0.000427605f, 0.113614f, 0.0776308f,
+ 0.111899f, -0.214917f, 0.393234f, -0.132223f, 0.020783f,
+ -0.074902f, 0.217477f, 0.107883f, 0.109466f, 0.146609f,
+ 0.317061f, 0.074379f, -0.0505457f, -0.0503772f, -0.0678954f,
+ -0.220003f, 0.114878f, 0.176014f, -0.00657996f, -0.0875497f,
+ 0.065582f, 0.00238612f, -0.063395f, 0.0295323f, -0.127126f,
+ 0.099813f, -0.115452f, 0.0106309f, -0.179632f, -0.0436553f,
+ 0.0120295f, 0.0652713f, -0.131512f, -0.081714f, -0.205363f,
+ -0.0374944f, -0.196707f, 0.680568f, -0.00991824f, -0.0212223f,
+ -0.186258f, -0.432361f, -0.0291303f, -0.0475983f, -0.071383f,
+ -0.0116416f, -0.28257f, -0.0635272f, -0.0576546f, -0.280129f,
+ 0.286528f, 0.199997f, 0.192851f, 0.323829f, -0.185006f,
+ -0.04791f, -0.0882187f, -0.0496895f, 0.293135f, 0.125539f,
+ 0.0341828f, 0.993452f, 0.0369177f, 0.0453796f, 0.0329807f,
+ 0.157673f, -0.153195f, 0.122383f, -0.161983f, -0.317619f,
+ 0.105129f, 0.155673f, 0.152489f, 0.0685417f, -0.0595907f,
+ -0.026657f, -0.0954336f, -0.0359557f, 0.105617f, 0.0825066f,
+ 0.100189f, -0.22125f, 0.382508f, -0.0247677f, -0.115807f,
+ -0.0639787f, 0.177786f, 0.0566206f, 0.0496389f, 1.31533f,
+ 0.0482907f, -0.118743f, 0.190632f, 0.172867f, -0.108446f,
+ -0.200186f, 0.122572f, 0.0897468f, 0.0155328f, -0.0380217f,
+ 0.125161f, -0.141723f, -0.023157f, 0.0270805f, -0.101961f,
+ 0.12358f, -0.0866255f, 0.00306761f, -0.131764f, -0.461118f,
+ -0.00803936f, 0.0895496f, -0.153905f, 0.207623f, -0.249099f,
+ -0.0198487f, -0.160013f, 0.81136f, -0.109978f, -0.0880332f,
+ -0.0761368f, -0.0755881f, -0.0384827f, -0.0554777f, -0.0750048f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_3_bias[] = {
+ 0.0106809f, 0.136699f, 0.285316f, 0.395746f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_4_kernel[] = {
+ -0.0161019f, -0.088871f, 0.0463358f, -0.198037f, 0.038122f,
+ 0.0135483f, -0.196641f, -0.433531f, 0.527972f, -0.143716f,
+ 0.558627f, 0.459889f, 0.322864f, -0.491514f, -0.190915f,
+ -0.0765601f, 0.210329f, 0.689389f, -0.100415f, -1.8788f,
+ 0.2228f, 0.292781f, -0.954838f, -0.0788763f, -0.131402f,
+ -0.17154f, 0.049934f, -0.0541183f, -0.530529f, -0.666165f,
+ 0.195492f, 0.218548f, -0.314895f, 0.0749444f, -0.191344f,
+ 0.349469f, 0.00811248f, -0.760157f, 0.0707434f, -0.0719285f,
+ -0.264495f, -0.432009f, -0.432686f, 0.155738f, -0.020197f,
+ 0.19278f, -0.658335f, -0.273143f, -0.286079f, 0.243402f,
+ 0.497701f, 0.0121003f, -0.666308f, 0.028172f, -0.547901f,
+ -0.11755f, 0.322028f, 0.0878274f, -0.0328334f, 0.311816f,
+ 0.0951026f, -1.11429f, -0.0417486f, 0.123467f, -0.0910681f,
+ -0.0154255f, 0.311201f, -0.0156158f, -0.600437f, 0.0274156f,
+ -0.174907f, -1.29313f, -0.178656f, 0.596556f, -0.421725f,
+ -0.289137f, 0.529297f, 0.114833f, -0.0155887f, -0.308232f,
+ -0.0228361f, 0.184017f, 0.138232f, 0.146347f, -0.117867f,
+ 0.248351f, -0.282846f, -0.18058f, 0.348355f, -0.415754f,
+ 0.0657168f, 0.431728f, -0.231043f, -0.186745f, 0.137401f,
+ -0.282329f, -0.159678f, 0.754262f, 0.037824f, -1.68521f,
+ -0.290175f, 0.289588f, -0.18683f, -0.300385f, 0.285449f,
+ -0.00386456f, 0.0563485f, -0.376541f, 0.159899f, -0.697312f,
+ 0.0284389f, 0.437307f, 0.3968f, -0.372082f, -0.232535f,
+ 0.394629f, 0.00315248f, -0.38374f, 0.0311291f, -0.624353f,
+ 0.498083f, -0.342663f, -0.125978f, 0.186797f, 0.187723f,
+ 0.149335f, -0.82727f, -0.0740974f, -0.659039f, 0.42671f,
+ -0.448835f, 0.150677f, 0.830742f, -0.233148f, -0.65308f,
+ -0.0878935f, -0.407797f, -0.511826f, -0.0739023f, 0.506305f,
+ -0.187451f, 0.0284968f, -0.822238f, 0.362523f, -0.270865f,
+ 0.032335f, 0.560413f, -0.00388247f, -0.446333f, 0.163147f,
+ -0.409633f, -0.372575f, 0.306993f, 0.55953f, -0.24362f,
+ -0.0929369f, -0.520298f, -0.444022f, 0.186077f, -0.0942208f,
+ 0.624049f, -0.429625f, -0.869528f, 0.405257f, -0.120445f,
+ 0.537685f, -0.3911f, 0.142142f, 0.0913808f, -0.00375967f,
+ 0.382781f, 0.60505f, -0.271608f, -0.0630436f, -0.150625f,
+ -0.0124598f, 0.0132878f, 0.138475f, -0.106264f, -0.416581f,
+ -0.518415f, 0.185127f, -0.464622f, -0.0102925f, 0.0389567f,
+ 0.406439f, -0.0414264f, -0.366185f, -0.511867f, -0.650255f,
+ 0.278252f, 0.0270234f, 0.262788f, -0.0294793f, 0.12651f,
+ 0.421537f, 0.0300837f, 0.0742187f, 0.281954f, -0.122069f,
+ -0.450145f, -0.312206f, -0.402633f, -0.0868137f, 0.190433f,
+ -0.149602f, -0.175029f, 0.00900023f, -0.266596f, 0.21721f,
+ -0.245079f, -1.09798f, 0.319409f, -0.337938f, 0.358514f,
+ 0.0771549f, 0.447087f, -0.305507f, -0.285492f, 0.383896f,
+ 0.145933f, -0.264944f, -0.118486f, 0.068805f, -0.194231f,
+ -1.79133f, 0.363408f, -0.17434f, -0.229629f, 0.132188f,
+ 0.207548f, -0.876264f, 0.265634f, 0.139332f, 0.236206f,
+ -0.0145184f, 0.562865f, 0.526612f, -0.0333508f, -0.421885f,
+ 0.273485f, -0.110882f, 0.425557f, 0.513303f, -0.422322f,
+ 0.0563155f, -0.0409693f, 0.194768f, -0.419828f, -0.107195f,
+ -1.19224f, 0.48552f, 0.132782f, -0.00932096f, -0.225484f,
+ -0.428484f, -0.0392684f, 0.750697f, 0.337615f, 0.158476f,
+ 0.413484f, 0.326017f, -0.757107f, -0.183962f, 0.00884361f,
+ 0.126507f, -0.0751588f, -0.308782f, -0.104237f, -0.703877f,
+ -0.491806f, -0.204251f, -0.317212f, 0.0815479f, 0.296323f,
+ 0.219632f, -0.039859f, 0.556257f, 0.176144f, -0.0750654f,
+ -0.106419f, 0.00400385f, -0.172266f, 0.000178763f, 0.146532f,
+ 0.255202f, -0.427235f, -0.182198f, -0.256557f, 0.260255f,
+ -0.0143364f, 0.0868664f, -0.564373f, -0.0876947f, 0.726289f,
+ 0.0160001f, -0.381562f, -0.638214f, -0.803803f, 0.25945f,
+ -0.371542f, -0.419611f, 0.238617f, 0.371834f, -0.226777f,
+ -0.894602f, 0.37458f, -0.354866f, 0.0249312f, 0.142374f,
+ 0.433813f, -0.0218183f, -0.33248f, 0.107223f, 0.390823f,
+ -0.0271108f, -0.616878f, -0.604984f, 0.517269f, -0.293573f
+};
+
+static const float av1_intra_mode_cnn_partition_cnn_layer_4_bias[] = {
+ -0.290371f, -0.0560272f, -0.118144f, -0.270583f, 0.401388f,
+ -0.308677f, 0.150729f, -0.0324442f, -0.135937f, 0.0875581f,
+ 0.0206493f, -0.212682f, -0.0266535f, -0.326656f, 0.0185105f,
+ -1.01429f, -0.00315052f, -0.0273938f, -0.0263379f, -0.171702f
+};
+
+static const CNN_CONFIG av1_intra_mode_cnn_partition_cnn_config = {
+ NUM_CNN_LAYERS, // num_layers
+ 0, // is_residue
+ 0, // ext_width
+ 0, // ext_height
+ 0, // strict_bounds
+ {
+ {
+ CNN_LAYER_0_IN_CH, // in_channels
+ CNN_LAYER_0_WIDTH, // filter_width
+ CNN_LAYER_0_WIDTH, // filter_height
+ CNN_LAYER_0_OUT_CH, // out_channels
+ CNN_LAYER_0_HORZ_STRIDE, // skip_width
+ CNN_LAYER_0_VERT_STRIDE, // skip_height
+ 0, // maxpool
+ av1_intra_mode_cnn_partition_cnn_layer_0_kernel, // weights
+ av1_intra_mode_cnn_partition_cnn_layer_0_bias, // bias
+ PADDING_VALID, // pad
+ RELU, // activation
+ 0, // deconvolve
+ 0, // branch
+ BRANCH_NO_COPY, // branch_copy_type
+ BRANCH_NOC, // branch_combine_type
+ NO_BRANCH_CONFIG, // branch_config
+ NO_BN_PARAMS, // bn_params
+ -1, // output_num
+ },
+ {
+ CNN_LAYER_1_IN_CH, // in_channels
+ CNN_LAYER_1_WIDTH, // filter_width
+ CNN_LAYER_1_WIDTH, // filter_height
+ CNN_LAYER_1_OUT_CH, // out_channels
+ CNN_LAYER_1_HORZ_STRIDE, // skip_width
+ CNN_LAYER_1_VERT_STRIDE, // skip_height
+ 0, // maxpool
+ av1_intra_mode_cnn_partition_cnn_layer_1_kernel, // weights
+ av1_intra_mode_cnn_partition_cnn_layer_1_bias, // bias
+ PADDING_VALID, // pad
+ RELU, // activation
+ 0, // deconvolve
+ 0, // branch
+ BRANCH_NO_COPY, // branch_copy_type
+ BRANCH_NOC, // branch_combine_type
+ NO_BRANCH_CONFIG, // branch_config
+ NO_BN_PARAMS, // bn_params
+ 3, // output_num
+ },
+ {
+ CNN_LAYER_2_IN_CH, // in_channels
+ CNN_LAYER_2_WIDTH, // filter_width
+ CNN_LAYER_2_WIDTH, // filter_height
+ CNN_LAYER_2_OUT_CH, // out_channels
+ CNN_LAYER_2_HORZ_STRIDE, // skip_width
+ CNN_LAYER_2_VERT_STRIDE, // skip_height
+ 0, // maxpool
+ av1_intra_mode_cnn_partition_cnn_layer_2_kernel, // weights
+ av1_intra_mode_cnn_partition_cnn_layer_2_bias, // bias
+ PADDING_VALID, // pad
+ RELU, // activation
+ 0, // deconvolve
+ 0, // branch
+ BRANCH_NO_COPY, // branch_copy_type
+ BRANCH_NOC, // branch_combine_type
+ NO_BRANCH_CONFIG, // branch_config
+ NO_BN_PARAMS, // bn_params
+ 2, // output_num
+ },
+ {
+ CNN_LAYER_3_IN_CH, // in_channels
+ CNN_LAYER_3_WIDTH, // filter_width
+ CNN_LAYER_3_WIDTH, // filter_height
+ CNN_LAYER_3_OUT_CH, // out_channels
+ CNN_LAYER_3_HORZ_STRIDE, // skip_width
+ CNN_LAYER_3_VERT_STRIDE, // skip_height
+ 0, // maxpool
+ av1_intra_mode_cnn_partition_cnn_layer_3_kernel, // weights
+ av1_intra_mode_cnn_partition_cnn_layer_3_bias, // bias
+ PADDING_VALID, // pad
+ RELU, // activation
+ 0, // deconvolve
+ 0, // branch
+ BRANCH_NO_COPY, // branch_copy_type
+ BRANCH_NOC, // branch_combine_type
+ NO_BRANCH_CONFIG, // branch_config
+ NO_BN_PARAMS, // bn_params
+ 1, // output_num
+ },
+ {
+ CNN_LAYER_4_IN_CH, // in_channels
+ CNN_LAYER_4_WIDTH, // filter_width
+ CNN_LAYER_4_WIDTH, // filter_height
+ CNN_LAYER_4_OUT_CH, // out_channels
+ CNN_LAYER_4_HORZ_STRIDE, // skip_width
+ CNN_LAYER_4_VERT_STRIDE, // skip_height
+ 0, // maxpool
+ av1_intra_mode_cnn_partition_cnn_layer_4_kernel, // weights
+ av1_intra_mode_cnn_partition_cnn_layer_4_bias, // bias
+ PADDING_VALID, // pad
+ RELU, // activation
+ 0, // deconvolve
+ 0, // branch
+ BRANCH_NO_COPY, // branch_copy_type
+ BRANCH_NOC, // branch_combine_type
+ NO_BRANCH_CONFIG, // branch_config
+ NO_BN_PARAMS, // bn_params
+ 0, // output_num
+ },
+ },
+};
+
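+/* Editorial sketch, not part of the upstream model data: the config above
+ * tags layers 1..4 with output_num 3, 2, 1 and 0 (layer 0 uses -1, i.e. no
+ * exported output), so a single forward pass yields four intermediate
+ * feature maps for the branch DNNs below. The likely consumer is
+ * av1_cnn_predict_img_multi_out() from av1/encoder/cnn.h, whose exact
+ * signature is not shown here; the helper below is a hypothetical
+ * stand-in that only illustrates the output_num indexing. */
+static void cnn_route_layer_output_sketch(float *const *out_bufs,
+                                          int output_num,
+                                          const float *activations, int n) {
+  if (output_num < 0) return; /* -1: layer output stays internal */
+  for (int i = 0; i < n; ++i) out_bufs[output_num][i] = activations[i];
+}
+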
+static const float
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel[] = {
+ 0.604356f, -0.236007f, 0.342172f, 0.531397f, -0.635698f,
+ -0.591573f, 0.833872f, 0.492814f, -0.100308f, 0.186385f,
+ 0.202779f, 0.263578f, 0.330001f, -0.15531f, 0.879584f,
+ -0.0048796f, 0.490796f, 0.242254f, -0.292211f, -0.696912f,
+ 0.746664f, 0.129371f, -0.0122443f, 0.196234f, -0.251605f,
+ -0.385617f, 0.157707f, 0.699963f, 0.0432536f, -0.11141f,
+ -0.0353473f, -0.0364045f, -0.113556f, -0.520842f, 0.231248f,
+ 0.230638f, -0.323852f, -1.08633f, -0.0469168f, -0.481821f,
+ 0.366838f, 0.189627f, -0.0637262f, -0.484917f, -0.109874f,
+ 0.292237f, 0.368702f, -0.183896f, -0.109038f, -1.22613f,
+ -0.880355f, -1.63768f, 0.337426f, -0.940994f, 0.413097f,
+ -0.37879f, -0.480525f, -0.594819f, -0.0172653f, -0.499436f,
+ -0.298395f, -0.840181f, -0.0758645f, -0.772089f, -0.232727f,
+ -0.815968f, 0.160785f, -0.0767165f, 0.0064244f, -0.540491f,
+ 0.417776f, -0.384337f, -0.497377f, 0.68414f, 0.00797514f,
+ 0.262626f, 0.203732f, 0.702047f, 0.0617544f, 0.0878249f,
+ -0.315032f, -0.0169776f, 0.403986f, 0.815872f, 0.135388f,
+ 0.0858594f, 0.169172f, -0.638227f, -1.65268f, -0.0476042f,
+ -0.982685f, 0.45707f, -0.0577537f, 0.367329f, 0.176513f,
+ -0.356454f, 0.0979095f, -0.277476f, 0.257271f, -0.333451f,
+ 0.0241497f, 0.0671127f, 0.221216f, 0.106065f, 0.537151f,
+ 0.0257329f, 0.265559f, -0.348353f, 0.285569f, -0.0610511f,
+ -1.59334f, -1.63826f, -0.164898f, -0.36605f, -0.489304f,
+ 0.729241f, 0.0197627f, 0.200291f, -0.231506f, -0.255715f,
+ -0.0932264f, -0.728793f, 0.468297f, -1.09592f, -0.079791f,
+ -1.76531f, -0.182904f, -2.05897f, -0.371894f, 0.207124f,
+ 0.255029f, 0.186501f, -0.005805f, 0.00160733f, -0.178206f,
+ -0.352757f, -0.164741f, -0.557583f, -0.559692f, -0.00731467f,
+ 0.149326f, 0.409735f, 0.22083f, -0.332572f, -0.1741f,
+ -0.0519008f, -0.266402f, 0.294031f, -2.4453f, 0.339851f,
+ -0.573747f, -5.97783f, -0.084142f, 0.20286f, -0.576038f,
+ -0.111081f, 0.101238f, -5.83427f, -1.98537f, 0.322796f,
+ -0.60171f, 0.212412f, 0.247176f, 0.603694f, -0.54357f,
+ -0.693439f, 0.250725f, -4.31988f, 0.0935924f, 0.43669f,
+ -0.139706f, -0.158391f, 0.244309f, 0.619213f, -0.309154f,
+ -0.135341f, 0.475815f, -0.290804f, -0.109038f, -0.0937104f,
+ 0.0385907f, -0.29105f, -0.0597651f, -0.451187f, -1.51821f,
+ 0.141772f, 0.822204f, -0.729661f, -0.109908f, 0.178217f,
+ -0.750278f, 0.113762f, -0.0959985f, 0.066579f, -0.104209f,
+ -0.951378f, 1.4087f, -1.13175f, -1.09103f, -1.50416f,
+ -0.182273f, -1.80129f, -0.152135f, 0.356931f, 0.205591f,
+ 0.183148f, -0.498671f, -0.183034f, -0.176428f, 0.395706f,
+ -0.589908f, -0.318276f, -0.421162f, 0.658766f, -0.186752f,
+ 0.0656253f, 0.248002f, 0.289618f, -0.458111f, -0.130789f,
+ -0.542988f, 0.405804f, -0.35364f, -0.311927f, 0.218339f,
+ 0.309215f, -0.130347f, -0.0257543f, 0.0413234f, -0.190205f,
+ -0.242382f, 0.819886f, -0.255157f, -0.181219f, -0.290903f,
+ -0.301995f, -0.0469988f, 0.702936f, 0.209122f, 0.0234243f,
+ 0.598637f, 0.0305196f, 0.0423457f, -0.618799f, 0.0190867f,
+ 0.420584f, -0.224752f, -0.410077f, 0.127854f, 0.395261f,
+ -0.393685f, -0.282822f, 0.0289504f, 0.0406515f, -0.511531f,
+ -0.497611f, 0.0252715f, 0.0812549f, 0.80205f, 1.29084f,
+ 0.764972f, 0.561258f, -0.23499f, 0.217594f, -0.690935f,
+ -0.26607f, 0.357955f, 0.391608f, 0.448352f, 0.458586f,
+ -0.790071f, 0.719959f, -0.468052f, 1.24579f, 0.220705f,
+ 0.284044f, 0.141346f, 0.246687f, 0.147826f, -0.403557f,
+ -0.00648195f, 0.398034f, -0.100464f, -0.77107f, -0.188274f,
+ -0.219245f, -0.0330375f, 0.367585f, -0.220391f, 0.308736f,
+ 0.221399f, 0.340292f, 0.037597f, 0.606083f, 0.665634f,
+ -0.755529f, -0.95989f, -0.243673f, 0.233709f, -0.454628f,
+ -0.110952f, 0.776062f, 0.731136f, -0.140422f, 0.19261f,
+ 0.355086f, 0.975026f, 0.190936f, 0.776205f, 0.982781f,
+ 0.555569f, 0.42382f, -0.409721f, 0.25053f, -0.271328f,
+ 0.859941f, -0.0210901f, 0.0176916f, -0.562895f, -0.0787431f,
+ -0.861032f, -0.34022f, -0.571995f, 0.205436f, 0.346968f,
+ 0.377033f, -1.08484f, 0.297007f, -1.01693f, 0.189463f,
+ -0.483242f, 0.147058f, 0.0159503f, 0.0908779f, -0.46962f,
+ 0.174024f, -0.490704f, -0.383501f, -0.0507626f, 0.00902188f,
+ -0.202495f, 0.205047f, 0.0562261f, -0.143371f, 0.219524f,
+ -0.317294f, -0.0575756f, -0.0595825f, -0.000625279f, -0.278864f,
+ -0.0516874f, -0.225259f, 0.429046f, -0.0952421f, 0.0799135f,
+ -0.122883f, -0.262308f, -0.481006f, -0.0466122f, -0.402822f,
+ 0.150595f, -0.0919558f, -0.356765f, -0.199222f, 0.219389f,
+ -0.214452f, -0.196361f, -0.095758f, -0.115891f, -0.143777f,
+ 0.549843f, -0.113036f, 0.764895f, -0.0114812f, -0.0684054f,
+ -0.98045f, -0.0170634f, 0.247719f, -0.18718f, -0.381566f,
+ 0.150758f, -0.526257f, 1.00851f, 0.776634f, 1.69728f,
+ -0.303058f, 0.228967f, -0.414134f, 0.0858226f, -0.285472f,
+ 0.431459f, 0.315318f, 0.587835f, 0.335737f, -0.0222039f,
+ 0.18945f, 0.274008f, 0.609263f, 0.320232f, -0.214137f,
+ -0.0297668f, 0.0439046f, -0.52821f, -0.0127375f, 0.431885f,
+ 0.508846f, -0.329189f, -0.166778f, -0.94338f, -0.358807f,
+ 0.208641f, -0.517986f, -0.128278f, 0.693464f, -0.24408f,
+ -0.0669412f, -0.410287f, 0.0444145f, -0.264179f, 0.143884f,
+ 0.276842f, 0.498934f, -0.682557f, -0.217198f, -0.8249f,
+ -0.40446f, -0.115376f, 0.417934f, 0.65605f, -0.00570035f,
+ -0.365742f, -0.367625f, 0.526824f, -0.0164913f, -0.255998f,
+ 0.247292f, 0.0846536f, 0.109302f, -0.302996f, 0.160564f,
+ 0.0228132f, 0.035211f, -0.236951f, 0.493801f, 1.37315f,
+ -0.182348f, 0.234437f, -0.256906f, 0.12523f, 0.667113f,
+ -0.437981f, -0.0721831f, 0.303976f, -0.041336f, -0.145894f,
+ -0.733741f, 0.436056f, 0.368542f, -0.149072f, -0.290281f,
+ 0.0946743f, -0.0579292f, 0.264539f, 0.170048f, 0.262411f,
+ 0.049679f, 0.371369f, 0.760675f, 0.482157f, -0.0196783f,
+ 0.260888f, 0.948856f, 0.170228f, -0.134432f, -0.942235f,
+ -1.23226f, -0.373963f, -0.0381773f, -0.17947f, 0.00947998f,
+ 0.01086f, 0.389578f, -0.380389f, -0.0865851f, -0.220328f,
+ -0.171901f, -0.384325f, -0.0787615f, 0.392678f, 0.123392f,
+ -0.0895824f, 0.00480886f, -0.162918f, 0.214336f, -0.00147339f,
+ 0.203899f, -0.00292344f, -0.148594f, 0.0425697f, -0.306896f,
+ -0.342225f, -0.45088f, -0.184454f, -0.00923638f, -0.521993f,
+ -0.334464f, 0.156497f, -0.0856832f, -0.277661f, -0.0721105f,
+ -0.488781f, -0.509543f, -0.012664f, 0.0940558f, -0.29869f,
+ 0.0434843f, -0.0178945f, -0.0525666f, -0.303178f, 0.713507f,
+ -0.137413f, -0.170289f, -0.142942f, -0.316002f, 0.229125f,
+ -0.277585f, 0.0125026f, 0.508316f, -1.20614f, -0.915129f,
+ -1.63389f, -0.454604f, -0.893951f, -0.447403f, -0.751423f,
+ 1.3886f, 0.617818f, 0.611458f, -0.884173f, -0.7779f,
+ -0.608639f, -0.164759f, -0.631846f, -0.176894f, -0.459361f,
+ -0.187119f, 0.173283f, -0.477191f, -0.156736f, 0.182675f,
+ 0.598854f, -0.489941f, -0.420493f, -0.162002f, 0.344418f,
+ 0.33832f, -0.187463f, -0.388721f, -0.0733151f, -0.138835f,
+ 0.313699f, 0.0625967f, -0.291488f, 0.114088f, -0.356843f,
+ 0.197506f, 0.0320749f, 1.16745f, -0.36081f, 1.63416f,
+ 0.198392f, 1.13928f, -0.317971f, 0.531019f, 0.526518f,
+ 0.185814f, 0.0923607f, 0.192858f, -0.234378f, 0.18091f,
+ -0.228837f, 0.397216f, 0.581501f, 0.284376f, -0.130434f,
+ 0.20076f, 0.242662f, -0.0480872f, 0.131746f, 0.362712f,
+ 0.0146821f, 0.475679f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias[] = {
+ 0.477356f, 0.385222f, 0.389122f, 0.539506f, -0.0272558f, 0.581605f,
+ -0.800961f, 0.142229f, 0.117549f, -0.0724944f, 0.102095f, -0.71319f,
+ -0.0162434f, -0.132858f, 0.543411f, -0.626599f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel[] = {
+ 0.195436f, -0.623354f, 1.27907f, 0.270071f, -0.677612f,
+ 0.0266141f, 0.272991f, -0.425446f, 0.891889f, -0.299836f,
+ -0.611825f, -0.0322273f, 0.185276f, 0.238639f, -0.150954f,
+ 0.083495f, -0.472106f, 0.573506f, 1.16465f, -0.154947f,
+ 0.640631f, -1.59467f, -9.8166f, -0.22889f, -0.189912f,
+ 0.227052f, -0.540787f, 0.0840873f, -3.04293f, -0.0209975f,
+ -6.10979f, -5.92801f, 0.288467f, -0.169476f, 0.0527948f,
+ -1.21202f, -0.280915f, 0.290863f, -0.601877f, 0.0598784f,
+ -0.592136f, -0.535588f, -0.0434018f, -0.653223f, 0.00339129f,
+ -0.133273f, 0.279463f, 0.483879f, 0.463664f, -0.14174f,
+ -1.56354f, 0.560043f, -1.44639f, 0.673528f, -0.108418f,
+ -0.707313f, 0.49633f, -0.0321971f, 0.411475f, -0.382184f,
+ -0.965501f, -0.0507655f, 0.540415f, -0.977297f, 0.370382f,
+ -0.375683f, 0.0844529f, -2.0002f, -0.346289f, 0.621251f,
+ -0.489855f, 0.191252f, -0.576629f, -0.35773f, 0.023167f,
+ 0.180793f, -0.417864f, 0.0587254f, 0.167824f, 0.0612058f,
+ -0.712108f, 0.155614f, 0.900036f, -0.480124f, 0.146117f,
+ 0.467011f, 0.412525f, 0.312724f, 0.551826f, -0.179601f,
+ 0.706261f, 0.00674965f, -0.495221f, 0.140829f, -0.0619195f,
+ -0.0697912f, 0.511967f, -0.0318237f, -0.285946f, -0.28608f,
+ 0.0894142f, 0.234351f, -0.272328f, -0.350369f, -0.392605f,
+ 0.287318f, 0.310426f, 0.293524f, 0.357681f, -0.157868f,
+ 0.149652f, -0.259363f, 0.192941f, -0.850096f, 0.456507f,
+ 0.387857f, -0.491187f, -0.0541993f, -0.28118f, 0.193991f,
+ -0.0956664f, 0.0679829f, 0.0341118f, 0.141826f, 0.271538f,
+ -0.285295f, -0.68666f, 0.306414f, 0.600678f, 0.494801f,
+ -1.11907f, 0.524849f, 0.151169f, 0.474068f, -0.43441f,
+ -0.229138f, 0.0345483f, 0.682888f, -0.471534f, -0.0457066f,
+ -2.36721f, 0.446407f, 0.20396f, -1.17868f, 0.815363f,
+ -1.13897f, 0.397217f, -0.593796f, -6.95512f, 0.650695f,
+ 0.771657f, 0.15227f, -0.824519f, 0.617854f, -0.295353f,
+ -0.101207f, 0.600989f, -0.550653f, -0.722371f, 0.292006f,
+ -0.451891f, 0.54544f, 0.354278f, 0.0136258f, 0.192003f,
+ 0.258275f, -0.0443647f, 0.0928186f, 0.667775f, 0.239558f,
+ 0.0523887f, 0.71586f, 0.292563f, 0.362479f, 0.373453f,
+ 0.250638f, -0.423037f, -0.486574f, -0.619397f, 0.343888f,
+ 0.974971f, 0.574218f, 0.273989f, -0.209956f, -0.274333f,
+ 0.0553766f, 0.263918f, 0.733824f, 0.038713f, -0.0788992f,
+ 0.292014f, 0.111808f, -0.197507f, 0.593668f, -0.0245337f,
+ 0.0873662f, 0.530997f, 0.620717f, 0.310697f, -1.54861f,
+ 1.12915f, 0.0991346f, -0.59214f, 0.422325f, -0.0157936f,
+ 0.380975f, 0.626403f, 0.268064f, -0.615231f, -1.43172f,
+ 0.0928048f, 0.0949026f, -0.470912f, -0.0867527f, -0.0381206f,
+ 0.178393f, -1.13737f, 0.12798f, 0.258214f, -0.803364f,
+ 0.177506f, 0.542718f, 0.660656f, 0.145091f, 0.183056f,
+ -0.47338f, 0.469287f, 0.10832f, 0.0994899f, -0.402719f,
+ 0.157287f, 0.523071f, -0.324493f, 0.343599f, 0.664839f,
+ -0.0375519f, -0.279238f, -0.0722333f, 0.395344f, -0.289316f,
+ 0.0259298f, -0.843245f, -0.160021f, 0.741429f, -1.38726f,
+ -0.2969f, -0.240443f, 0.247731f, -1.04088f, -0.280454f,
+ -0.237054f, -0.759227f, 0.0456369f, -0.647453f, -1.02372f,
+ -0.200395f, -0.546839f, -0.104226f, -0.152727f, -0.56685f,
+ -0.0559663f, -0.425494f, -0.610679f, -0.987096f, -0.575138f,
+ -0.0887979f, 0.463646f, -1.041f, -0.49412f, -0.175298f,
+ -0.463296f, -0.955177f, 0.17852f, -1.10694f, 0.181991f,
+ -0.18998f, 0.227818f, 0.688237f, -1.10444f, 0.549108f,
+ -0.171849f, -0.245614f, 0.120624f, 1.29571f, 0.607116f,
+ 0.00809927f, 0.1041f, -1.22918f, -0.212948f, 0.430239f,
+ -1.57341f, 0.482054f, 0.275905f, 0.939785f, -1.0209f,
+ -0.355534f, 0.397337f, -0.0593077f, -0.239603f, 0.475483f,
+ -0.999101f, -0.140578f, 1.04787f, -0.591981f, -0.306989f,
+ -0.879012f, -0.994715f, 0.0343158f, 0.218509f, 0.34704f,
+ 0.0672934f, -0.178941f, 0.20509f, -0.360031f, 0.161241f,
+ -0.324775f, -0.359531f, -0.0657085f, -0.864422f, -0.444865f,
+ 0.597095f, -0.948691f, 0.240001f, -0.783159f, -0.569422f,
+ 0.974205f, -1.04539f, 0.345915f, -0.681558f, -0.246047f,
+ 0.256174f, 0.493667f, 0.681324f, 0.155613f, 0.773309f,
+ -0.647027f, -0.214744f, -0.474202f, -0.661092f, -1.02316f,
+ 0.0572593f, -0.437082f, -0.119874f, -0.464877f, -0.58067f,
+ -0.218029f, 0.319516f, -0.378983f, -0.0698695f, 0.554693f,
+ -0.537875f, 0.126429f, -0.145113f, -0.594312f, -0.218021f,
+ -0.703569f, 0.0720548f, 0.261054f, -0.81438f, 0.249921f,
+ 0.165296f, -0.079028f, -0.322647f, 0.134458f, 0.0975046f,
+ 0.538594f, -0.250126f, 0.142309f, 0.526486f, 0.0532615f,
+ -0.383332f, -0.38143f, -0.101611f, 0.519776f, -0.278364f,
+ -0.23287f, -0.29139f, 0.22353f, 0.472085f, 0.366264f,
+ 0.741187f, 0.42019f, 0.0676459f, -0.230008f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias[] = {
+ -0.48603f, -0.578556f, 0.257639f, 0.459915f, 0.178156f, -1.16663f,
+ 0.828891f, 0.620291f, 0.413257f, -1.00508f, -0.574179f, -1.20623f,
+ -0.377837f, -0.0360333f, 0.681536f, 0.137189f, -0.458718f, 0.387131f,
+ 0.0233112f, 0.126045f, 0.361304f, 0.655317f, 0.413134f, 0.769947f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_0_logits_kernel[] = {
+ 0.67244f, -2.59179f, 0.50425f, -1.86481f, 1.15891f, -1.26447f,
+ 0.761081f, 0.645117f, -1.78594f, -0.872703f, -0.192054f, -1.82359f,
+ -0.560935f, 0.838959f, 0.502264f, -1.28958f, -0.205551f, 0.635671f,
+ -1.12619f, -1.68277f, 0.83361f, 1.57235f, 1.15839f, 0.35345f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_0_logits_bias[] = {
+ 1.14463f
+};
+
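+/* Editorial sketch, not upstream code: one plausible way to evaluate the
+ * branch-0 head above. Sizes are read off the arrays themselves: 592
+ * layer-0 weights with 16 biases imply a 37-float feature input, layer 1
+ * maps 16 -> 24, and the logits layer maps 24 -> 1. The row-major
+ * per-output-node weight layout and the ReLU on hidden layers mirror
+ * av1_nn_predict() and are assumptions here; a caller would typically
+ * squash the returned logit with a sigmoid to get a split probability. */
+static void nn_dense_sketch(const float *in, int n_in, const float *w,
+                            const float *b, float *out, int n_out, int relu) {
+  for (int o = 0; o < n_out; ++o) {
+    float v = b[o];
+    for (int i = 0; i < n_in; ++i) v += w[o * n_in + i] * in[i];
+    out[o] = (relu && v < 0.0f) ? 0.0f : v;
+  }
+}
+
+static float branch_0_logit_sketch(const float feat[37]) {
+  float h0[16], h1[24], logit;
+  nn_dense_sketch(feat, 37,
+                  av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel,
+                  av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias, h0,
+                  16, 1);
+  nn_dense_sketch(h0, 16,
+                  av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel,
+                  av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias, h1,
+                  24, 1);
+  nn_dense_sketch(h1, 24, av1_intra_mode_cnn_partition_branch_0_logits_kernel,
+                  av1_intra_mode_cnn_partition_branch_0_logits_bias, &logit, 1,
+                  0);
+  return logit;
+}
+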
+static const float
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel[] = {
+ 0.364612f, 0.237868f, -0.192821f, 0.12364f, 0.522205f,
+ -0.205785f, -0.503288f, -0.426503f, -0.083073f, 0.0164429f,
+ 0.184278f, -0.426055f, 0.0717997f, -0.261968f, 0.176412f,
+ -0.101226f, 0.0400285f, -0.332051f, 0.344385f, 0.189565f,
+ 0.441162f, 0.330462f, -0.719857f, -1.14209f, 0.557831f,
+ 0.104756f, 0.0562001f, -0.465923f, -0.344592f, -0.191554f,
+ -0.0656866f, -0.640162f, 0.419388f, 0.409308f, -1.68632f,
+ -1.10829f, 0.105485f, -0.14561f, -0.944738f, 0.104629f,
+ -0.146837f, 0.538823f, -0.153157f, 0.321081f, -1.77714f,
+ -0.0559296f, 0.324136f, -0.497023f, -1.15793f, -0.740144f,
+ -0.0888472f, 0.010059f, -0.18394f, -0.234405f, -0.10586f,
+ 0.130958f, -0.101944f, -0.186483f, -0.447049f, -0.900026f,
+ 0.128444f, 0.401696f, 0.128509f, 0.123778f, 0.062168f,
+ -0.321755f, -0.0691584f, 0.254468f, -0.115212f, -0.848885f,
+ 0.817005f, 0.0615853f, 0.153363f, 0.513855f, 0.789225f,
+ 0.356168f, 0.371613f, 0.269541f, 0.268173f, 0.220481f,
+ -0.109063f, -0.00620798f, -0.0334622f, 0.236267f, -0.0235294f,
+ -0.0800253f, 0.0294184f, 0.047131f, -0.224047f, 0.0890737f,
+ -0.356293f, 0.0989534f, 0.16799f, 0.498266f, 0.612581f,
+ -0.372897f, -0.75125f, 0.77698f, 1.1032f, -0.0764679f,
+ 0.0266299f, 0.309532f, 0.461305f, 0.0193521f, -0.0939161f,
+ -0.276156f, -0.102714f, -0.0828328f, 0.40003f, 0.122542f,
+ 0.0867203f, -0.170738f, 0.0850642f, -0.130762f, 0.082324f,
+ -0.115218f, -0.0244491f, 0.0434331f, 0.216453f, 0.443733f,
+ -0.173679f, -0.161617f, 0.316209f, -0.689656f, -1.52007f,
+ -0.421018f, 0.430833f, -0.00734122f, 0.284499f, -0.0207885f,
+ 0.0572024f, -0.878942f, 0.388264f, 0.0191589f, -0.123415f,
+ -0.0461196f, -0.0444461f, -0.00383171f, 0.0945655f, -0.0597219f,
+ -0.374918f, 0.0182124f, 0.523083f, 0.00519547f, 0.80513f,
+ -0.221433f, -1.30591f, -0.416917f, -0.718173f, 0.622999f,
+ 0.941798f, 0.0477536f, 0.0303772f, 0.268078f, 0.414778f,
+ 0.394325f, 0.299733f, -0.583208f, 0.309379f, 0.416581f,
+ 0.0299948f, -0.409145f, -0.161557f, -0.214082f, -0.0098119f,
+ 0.221912f, 0.107135f, 0.0692518f, 0.00490957f, 0.107613f,
+ -0.368404f, -0.548006f, 0.208274f, 0.550475f, 0.643678f,
+ -1.65859f, 0.095938f, -0.0434245f, -0.0792685f, 0.838109f,
+ -0.0138653f, -0.527573f, -0.123472f, -0.235618f, -0.677401f,
+ -0.125877f, -0.175604f, -0.203196f, 0.113478f, -0.228323f,
+ -0.53539f, 0.134458f, 0.0534899f, -0.213006f, -0.138679f,
+ -2.15023f, 0.186303f, 0.48566f, -1.22301f, -0.240982f,
+ -0.486836f, -0.121181f, -0.131382f, -0.0320283f, 0.278828f,
+ 0.342581f, -0.182257f, -0.365193f, -0.226351f, 0.108928f,
+ -0.100159f, 0.448355f, -0.0768947f, 0.0633719f, -0.104786f,
+ 0.0456653f, 0.0965752f, 0.156403f, -0.157337f, 0.212259f,
+ 0.317939f, 0.124193f, -0.329475f, 0.206868f, -2.15986f,
+ -0.108385f, -0.396769f, -0.0317231f, -0.271524f, -0.184697f,
+ 0.662615f, 0.412926f, -0.0217462f, -0.0285475f, -0.118826f,
+ 0.0252706f, -0.137091f, 0.198973f, 0.329509f, -0.0831966f,
+ -0.621237f, 0.0896179f, 0.805261f, -0.019675f, 0.962452f,
+ 0.307433f, 0.892168f, -0.537587f, -2.46145f, 0.125606f,
+ 0.920491f, 0.219462f, 0.292765f, -0.748238f, -0.0537239f,
+ -0.224326f, 0.505492f, 0.176426f, 0.0343168f, 0.16708f,
+ -0.581393f, 0.951726f, -1.1777f, -0.561914f, -1.53288f,
+ 0.864567f, -1.19648f, -1.24141f, -0.334688f, -0.622026f,
+ 0.666876f, -0.197005f, -0.600507f, -0.851924f, 0.492299f,
+ 0.31078f, -0.0736115f, 0.030999f, -6.02463e-05f, -0.0604341f,
+ -0.0254238f, 0.139222f, 0.333235f, 0.366534f, -0.191982f,
+ -0.0156092f, 0.44234f, -0.0193213f, 0.0938745f, -0.015709f,
+ -0.12043f, 0.00895591f, 0.0464401f, 0.0530699f, -0.623018f,
+ -1.23372f, -0.538647f, -1.12389f, 0.26742f, 0.548694f,
+ 0.00540655f, -0.219703f, 0.314894f, -0.573463f, -0.241555f,
+ 0.441851f, 0.422491f, 0.253785f, -0.384683f, 0.0370165f,
+ 0.226669f, 0.245587f, 0.215265f, -0.122272f, 0.0492235f,
+ 0.000658591f, -0.312877f, 0.436487f, -0.229199f, -0.174373f,
+ 0.904268f, -0.855845f, -0.877293f, -0.65409f, 0.313795f,
+ 0.461748f, -0.737766f, -0.228523f, 0.182181f, 0.334522f,
+ 0.0629676f, -0.151087f, 0.178798f, -0.325809f, -0.331672f,
+ 0.0865837f, -0.0684225f, 0.0252008f, -0.0820631f, 0.0481863f,
+ 0.209473f, -0.0242151f, -0.0898919f, -0.163828f, -0.164282f,
+ 0.581888f, 0.816896f, 0.0607674f, 0.364855f, -0.346512f,
+ -0.764174f, 0.595561f, 0.302872f, 0.206361f, 0.106917f,
+ -0.972338f, 0.176948f, 0.6415f, -0.131897f, -0.155802f,
+ 0.216337f, -0.342511f, 0.123743f, -0.123014f, 0.0205439f,
+ 0.15173f, -0.23801f, -1.00387f, 0.651328f, 0.237439f,
+ -0.542952f, 1.066f, -0.161107f, -0.593545f, 0.219343f,
+ -0.178094f, 0.0789992f, 0.428332f, 0.23827f, -0.327421f,
+ 0.416144f, 0.00394653f, 0.052046f, -0.238289f, 0.405942f,
+ 0.00141984f, 0.161017f, 0.077111f, 0.0823985f, 0.0981208f,
+ 0.109949f, -0.0428502f, 0.343629f, -0.722978f, -0.375269f,
+ -0.111634f, -0.271523f, 0.712093f, 0.684904f, -0.572331f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias[] = {
+ 0.583367f, -0.202004f, -0.207626f, 0.412451f, -0.258311f, 0.0304954f,
+ -0.102458f, 0.450087f, -0.376851f, -0.338702f, 0.335226f, 0.889072f,
+ 0.502411f, 0.649282f, 0.15345f, -0.0109896f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel[] = {
+ 0.0214882f, -0.934339f, -0.173335f, 0.8362f, -0.764234f,
+ 0.525163f, 0.409749f, 0.821539f, -0.784157f, -0.455593f,
+ 0.446099f, 0.406756f, 0.479242f, -0.814038f, -0.419332f,
+ 0.328869f, -0.340707f, 0.133219f, 0.0320347f, 0.25089f,
+ -0.324917f, -0.0684265f, 0.0377777f, -0.262556f, 0.673458f,
+ -0.0291454f, -0.417957f, -1.0075f, -0.481537f, 0.922105f,
+ -0.000516239f, -0.40034f, 0.242067f, -0.43178f, 0.32001f,
+ 0.143599f, -0.345172f, 0.126093f, 0.148518f, -1.12151f,
+ -1.03435f, 0.551691f, -0.310001f, -0.323194f, -0.595128f,
+ -0.395689f, 0.737268f, -0.729227f, 0.590804f, -0.590022f,
+ -1.01427f, -0.521159f, -0.617579f, 1.07292f, -0.613047f,
+ -0.619093f, 0.335268f, 0.473753f, -0.795027f, 1.24635f,
+ -0.556193f, 0.241046f, -0.0354181f, -0.354215f, 0.716752f,
+ -0.00200745f, -1.25171f, -0.440731f, -0.763918f, -0.588614f,
+ -0.183901f, -0.396056f, 0.226903f, 0.921471f, 1.10465f,
+ 0.207053f, 0.57681f, -0.555699f, 0.235469f, -0.92149f,
+ 0.625808f, 0.29653f, -0.81775f, -0.307889f, -1.41384f,
+ -0.136205f, -0.365314f, -0.516741f, 0.748052f, 0.617947f,
+ 0.0973239f, 0.839607f, 0.530668f, -0.227032f, -0.449044f,
+ -1.04725f, -0.244363f, -0.396888f, -0.146161f, 0.359789f,
+ 0.0436599f, 1.21645f, -0.336069f, 0.0534646f, -0.00200328f,
+ 0.658551f, -0.156142f, -1.0728f, 0.0951015f, 0.234837f,
+ -0.380525f, 0.041783f, -0.269273f, 0.0386013f, -0.455589f,
+ -0.174338f, 0.0345251f, 0.17116f, -0.507642f, 0.210453f,
+ 0.739987f, -0.0438776f, 0.570145f, -0.118811f, 0.0548662f,
+ 0.153458f, -0.89887f, 0.493704f, 0.283351f, 0.785441f,
+ -0.586002f, -0.0616167f, -0.714328f, -0.145941f, -0.449656f,
+ 0.850117f, 0.279997f, 0.204143f, -0.31356f, 0.947057f,
+ -0.135787f, 0.747071f, 0.0145968f, -0.81414f, 0.431009f,
+ -0.275824f, -0.342928f, -0.0528272f, -0.592183f, 0.433915f,
+ -0.251752f, -0.311815f, -1.47533f, -1.43677f, 0.0698436f,
+ 1.01341f, 0.305063f, -0.252003f, -0.428915f, -0.00104153f,
+ -0.368267f, -0.354523f, -0.27956f, -0.771664f, 0.232092f,
+ -0.428495f, 0.424952f, -0.343229f, 0.196899f, -0.761084f,
+ -0.0110293f, -0.335361f, 0.571637f, -0.423489f, -0.52773f,
+ 0.0108043f, -0.504715f, -1.1419f, -0.402904f, -0.160747f,
+ -0.329184f, 0.375374f, -1.02604f, -0.601371f, 0.631652f,
+ 0.0742486f, -0.464765f, 0.467445f, 0.240562f, -0.38211f,
+ -0.459004f, 0.704196f, 0.021357f, 0.860785f, -1.16731f,
+ -0.479029f, -0.139644f, -0.444087f, 0.322326f, -0.25455f,
+ 0.874399f, 0.477696f, 0.0464487f, 1.20658f, 0.0993356f,
+ 0.00682712f, -0.10163f, -0.371765f, -0.629513f, -0.679196f,
+ -0.193935f, 0.47405f, -0.18238f, 0.254918f, -0.35306f,
+ -0.375611f, 0.119771f, -0.257282f, -0.565124f, 0.162667f,
+ -0.356128f, 0.870351f, 0.241847f, -0.264712f, -0.384322f,
+ 0.31807f, 0.211621f, -0.180767f, 0.764944f, 0.368646f,
+ 0.186111f, 1.02458f, -0.494252f, -0.483375f, -0.699664f,
+ 0.00415657f, -0.189376f, -0.677103f, -0.030319f, 0.667087f,
+ 0.810951f, -0.488237f, -0.387355f, -0.726579f, -0.304763f,
+ 1.10392f, -0.775977f, -0.247731f, 0.532396f, 1.24089f,
+ 0.206621f, -0.670568f, -1.08142f, -0.342503f, 0.189854f,
+ -0.200846f, 0.784204f, 0.641112f, -0.509346f, 0.0805264f,
+ -1.40006f, 0.322084f, -0.823739f, -1.12965f, -0.215668f,
+ 0.099673f, 0.425966f, 0.771697f, 0.338834f, 0.345364f,
+ -0.297826f, -0.176746f, -0.297299f, -1.80029f, -0.178348f,
+ 0.421194f, -0.19155f, 0.417653f, 0.374441f, -0.135654f,
+ -0.895843f, 0.220647f, 0.368264f, 0.369233f, 0.382707f,
+ 0.0800511f, 0.542053f, 0.318896f, -0.385539f, 0.313305f,
+ -1.01166f, -0.222379f, -1.53708f, 1.32407f, -0.665444f,
+ -0.102348f, 0.0410504f, -0.616825f, 1.3108f, 0.405902f,
+ 1.27777f, 0.0630558f, -0.172696f, 0.16224f, -1.10111f,
+ -3.31326f, -0.242566f, 0.831422f, 0.917397f, 0.311749f,
+ -0.238613f, 0.438007f, -0.407089f, -0.0202555f, -1.82502f,
+ -0.907965f, -0.300031f, -0.616669f, -0.767921f, 0.285919f,
+ -0.112019f, 0.252677f, 0.350892f, 0.000214244f, 0.315915f,
+ 0.260344f, 0.327362f, -0.0211213f, -0.41241f, 0.0418355f,
+ 0.103328f, -0.0158439f, -0.230505f, -0.0215114f, 0.266739f,
+ -0.234376f, -0.352583f, 0.0709437f, -0.90649f, -0.535843f,
+ 1.21322f, -1.05144f, -0.983682f, -0.189956f, 1.14208f,
+ -0.0188492f, -0.254821f, -0.463214f, -0.708714f, 0.0447348f,
+ -0.220831f, 0.476299f, 0.102544f, 1.1173f, -0.36981f,
+ -0.814102f, 0.103604f, -0.247871f, 0.0610701f, -0.356616f,
+ -0.144093f, 1.66496f, 0.180206f, -1.04384f, -0.65883f,
+ 0.0290771f, -0.622728f, 0.761523f, -0.909091f, -0.0340348f,
+ 0.666895f, -0.0232575f, 0.962643f, -2.50103f, -1.69745f,
+ -0.0482305f, 0.771811f, -1.32233f, -0.778722f, -0.203309f,
+ 0.395875f, -0.171812f, 0.253794f, 0.432799f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias[] = {
+ -0.152159f, 0.552347f, -0.806068f, 0.227901f, 0.335896f, 0.180785f,
+ 0.75277f, 0.982208f, 0.409823f, -0.17755f, -0.125365f, 0.738114f,
+ 0.202331f, 0.751737f, -0.360511f, 0.149254f, 0.085073f, -0.214542f,
+ 0.529727f, -0.0348777f, -2.13162f, -0.893332f, -0.136952f, -0.71258f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_1_logits_kernel[] = {
+ -0.632145f, 0.738727f, -0.750737f, -0.931571f, -1.79763f, -2.31153f,
+ 0.912733f, 0.879995f, -1.00602f, -1.02467f, 0.0536835f, 1.76011f,
+ -0.898546f, 1.06959f, 1.60471f, -1.7312f, -0.877168f, -0.681185f,
+ -1.57286f, -1.16038f, -4.11303f, -3.06351f, -3.02536f, -2.92186f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_1_logits_bias[] = {
+ 1.33207f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel[] = {
+ 0.0419551f, 0.0924078f, -0.153084f, 0.191642f, 0.069586f,
+ -0.530661f, 0.431968f, 0.000453838f, 0.793047f, 0.0161817f,
+ -0.476075f, -0.156638f, -0.219066f, 0.372716f, -0.0642299f,
+ 0.156813f, -0.105819f, -0.0519422f, 0.149935f, 0.295544f,
+ 0.192037f, -0.0450383f, 0.828794f, -0.0510661f, -1.22549f,
+ -0.100293f, -0.178274f, 0.0304427f, -0.0664097f, -0.0438936f,
+ 0.948248f, 0.425486f, -0.238206f, 1.3744f, 0.336897f,
+ 0.0760769f, -0.583508f, 0.0735519f, -0.117024f, 0.0501598f,
+ 0.332212f, 0.199531f, 0.424764f, 0.206712f, 0.342868f,
+ 0.592673f, -0.0961148f, -0.190113f, -0.155027f, 0.00789871f,
+ -0.0514839f, -0.416154f, -0.290309f, 0.407541f, 0.48534f,
+ 0.126564f, 0.0709566f, -0.0469664f, 0.735403f, -0.365963f,
+ 0.150295f, -0.50147f, 0.021383f, 0.76514f, 0.0085721f,
+ -0.416384f, 1.22268f, 0.0832438f, 0.367813f, -0.12012f,
+ 0.823183f, -0.0525972f, -0.325526f, -0.0983032f, 0.370128f,
+ 0.368778f, 0.138971f, -0.0397997f, 0.411058f, -0.0400404f,
+ 0.588437f, -0.29963f, -0.107992f, -1.75238f, -0.274387f,
+ 0.430418f, 0.495152f, 0.283172f, -0.441166f, 0.195339f,
+ -0.436182f, -0.252613f, 0.176204f, -0.126541f, -0.474833f,
+ -0.0721603f, -0.496599f, -0.0608464f, 0.0333451f, -0.0621485f,
+ 0.0843859f, 0.0637854f, -0.145291f, 0.14876f, 0.181665f,
+ -0.675805f, 0.294903f, 0.301118f, -0.225957f, 0.0105897f,
+ -0.136427f, -0.555925f, -0.158853f, -0.216779f, 0.0612481f,
+ -0.107158f, 0.352451f, 0.140536f, -0.0148237f, 0.189371f,
+ -0.091046f, -0.0476226f, 0.366054f, -0.0723413f, 0.389883f,
+ -0.0213411f, 0.0279539f, 0.194827f, -0.271502f, -0.166474f,
+ 0.0690549f, 0.0584665f, 0.0198415f, -0.442348f, 0.1571f,
+ -0.113463f, -0.16822f, -0.0580659f, -0.13441f, -0.0022386f,
+ 0.251521f, -0.160494f, -0.0753547f, 0.0897289f, 0.137917f,
+ 0.129836f, 0.0816833f, -0.626288f, 0.0643293f, -1.20001f,
+ 0.085631f, -0.195602f, 0.251244f, 0.0321744f, 0.0493178f,
+ -0.220616f, 0.724075f, -0.00831514f, 2.00319f, 0.407932f,
+ 0.0710799f, -0.166128f, 0.0126611f, -0.229644f, -0.0984299f,
+ 0.632041f, -0.0946141f, 0.295315f, 0.100934f, 0.184883f,
+ -0.236173f, 0.158081f, 0.195775f, 0.413542f, 0.789801f,
+ 0.767741f, 0.166275f, -0.348271f, -0.384074f, -0.291648f,
+ -0.119899f, 0.0368354f, 0.0751987f, 1.04217f, -0.159002f,
+ -2.71592f, -0.788502f, -1.06268f, 0.536057f, 0.0575876f,
+ 1.06811f, 0.12033f, 0.198578f, -0.0419196f, 0.0631388f,
+ 0.623138f, -0.142226f, 1.33129f, 0.0868059f, -0.0287825f,
+ 0.139378f, -0.143037f, 0.307452f, 0.0363987f, -0.0976368f,
+ 0.040544f, 0.0269327f, -0.0845524f, 0.0674699f, 0.104501f,
+ -0.0351155f, 0.167071f, 0.00986971f, 0.10284f, 0.0300016f,
+ 0.192601f, 0.0397177f, 0.0251346f, -0.00912908f, -0.0452825f,
+ 0.0164356f, -0.0275149f, 0.194846f, 0.0943608f, 1.61674f,
+ 0.0124345f, 0.523787f, 0.0397258f, -0.17208f, -0.147808f,
+ -1.23583f, 0.676385f, 0.551994f, 0.0233041f, 0.0116391f,
+ -0.466706f, 0.154725f, -0.207371f, 0.606662f, 0.247286f,
+ 0.31216f, 0.173765f, -0.268033f, 0.224422f, 0.314649f,
+ 0.481922f, -0.190604f, -0.0129162f, 0.270552f, 0.135195f,
+ 0.0927735f, -0.226099f, 0.53897f, 0.103309f, -0.0257271f,
+ -0.0246776f, 0.442013f, -0.179246f, -1.02581f, 0.206176f,
+ -0.326365f, 0.391623f, -0.103549f, 0.115645f, 0.0269328f,
+ -0.584517f, -0.237502f, 0.157996f, 0.0447407f, -0.161f,
+ -0.126072f, -0.148967f, -0.416347f, 0.0236496f, -1.12612f,
+ 0.0120709f, -0.00979376f, 0.0507126f, -0.172262f, 0.0697059f,
+ -0.212334f, 0.335731f, -0.0301362f, -0.839583f, -0.238539f,
+ 0.0636752f, -0.0467217f, -0.0372118f, -0.144615f, -0.161773f,
+ -0.648242f, 0.158197f, -0.051471f, -0.0615805f, -0.0426936f,
+ -0.0745554f, 0.358975f, 0.358297f, 0.0568553f, -1.14383f,
+ -0.103955f, 0.728194f, -0.224945f, -0.31659f, -0.204458f,
+ 0.171763f, -0.465666f, 0.899234f, -0.37042f, -0.0894774f,
+ 0.11478f, -0.334957f, 0.0896514f, 0.413251f, 0.359471f,
+ 1.41597f, 0.558082f, 0.153486f, 0.0270558f, -0.0178797f,
+ 0.124983f, -0.12273f, -1.04516f, -0.125375f, 0.370336f,
+ -0.209423f, -0.36816f, -0.66077f, -0.0180773f, -0.628921f,
+ -0.178542f, 0.0346841f, 0.0319309f, -0.470138f, 0.172763f,
+ 0.0798846f, -0.259737f, -0.652461f, -0.386283f, -0.474447f,
+ -0.924054f, -0.0154613f, -0.613712f, -0.138068f, -0.337842f,
+ 0.217921f, -0.0711405f, 0.000404091f, -0.703766f, 0.0364683f,
+ 0.150173f, 0.0126249f, 0.170594f, 0.0371879f, -0.0862515f,
+ -0.23454f, -0.0144143f, 0.164947f, 0.45591f, 0.115703f,
+ 0.069752f, -0.011993f, 0.0402097f, 0.00697581f, 0.0811613f,
+ 0.384752f, 0.341977f, 0.06087f, 0.0590107f, 0.00812679f,
+ 0.121211f, -0.0612108f, 0.167851f, 0.195781f, -1.62162f,
+ 0.336292f, -0.0772523f, -0.310786f, 0.188257f, -0.0325804f,
+ -0.240098f, 0.158748f, -0.265264f, 3.19593f, -0.449251f,
+ -1.33102f, -0.482856f, -0.435731f, 0.300808f, 0.346503f,
+ 2.67378f, -0.152379f, 0.219322f, -0.146119f, -0.0584806f,
+ -0.0276895f, -0.21955f, -0.479179f, -0.689545f, 0.152799f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias[] = {
+ -0.296575f, 0.101072f, -0.208429f, 0.111585f, 0.699552f, -0.379484f,
+ 0.313244f, -0.746369f, 0.867757f, 0.457318f, -0.0190943f, -0.290745f,
+ 0.45592f, -0.160465f, -0.634243f, 0.0829737f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel[] = {
+ 0.27511f, -2.14172f, 1.25755f, -0.554772f, 0.589508f,
+ 0.228307f, 0.0754914f, 1.07061f, 0.293323f, 0.65162f,
+ -0.272016f, -1.33519f, -0.606759f, -0.57827f, 0.368807f,
+ -1.48668f, 0.162439f, 0.0821667f, 0.225535f, -0.795996f,
+ 0.0328293f, 0.975476f, -0.187514f, 2.47069f, -1.5638f,
+ -0.461524f, 0.00310062f, 1.1556f, -0.286206f, 0.00426021f,
+ 0.585836f, 0.900007f, 0.384055f, 0.189435f, -0.157291f,
+ -0.0710573f, -0.0663986f, -0.710772f, -0.669136f, -0.379493f,
+ -1.2634f, -0.377524f, 0.824094f, 0.312308f, 0.125368f,
+ -0.382737f, 0.637109f, 0.61907f, -0.741184f, 0.00257198f,
+ -0.0151343f, -0.669826f, -0.439855f, 0.564852f, -0.0588036f,
+ -1.38123f, -1.1126f, 0.701831f, 0.198686f, 0.266866f,
+ 0.270172f, -0.692401f, 0.272533f, -1.70914f, 0.66064f,
+ 0.0886659f, -0.132233f, 0.270531f, -0.479581f, 0.704338f,
+ -0.307039f, -0.111792f, -2.05753f, -0.231749f, 0.300528f,
+ 0.383266f, -0.130857f, -0.373944f, 1.21025f, 0.704655f,
+ -0.589422f, 0.267185f, -0.109065f, -0.195991f, 0.20209f,
+ -0.0676526f, -0.183926f, 0.164894f, 0.0877923f, 0.565943f,
+ -0.0610466f, -0.86354f, -0.80853f, -0.176111f, -1.45016f,
+ -2.29078f, -0.124524f, -0.139305f, -0.187858f, -0.0250151f,
+ -0.572544f, 0.185336f, -0.69275f, -0.430354f, -0.30861f,
+ -0.754258f, -0.468221f, -0.160487f, -0.766692f, -0.636418f,
+ -0.71016f, 0.576125f, -0.240476f, -0.954556f, -0.104693f,
+ 0.155557f, -0.840224f, -0.685457f, -0.0346927f, -0.644882f,
+ -1.92475f, -0.314544f, 0.463569f, 0.323569f, -0.990124f,
+ -0.213658f, 0.407183f, 1.19797f, -4.77004f, -0.0613379f,
+ -2.40345f, -0.0591791f, -0.477622f, -0.303556f, 0.104077f,
+ -0.974128f, -0.035172f, 1.47064f, 0.233727f, -0.0754056f,
+ 0.158553f, 0.0614361f, -1.38865f, 0.690729f, 0.568455f,
+ 0.205866f, -0.0236852f, -0.0921077f, -0.538954f, 0.336613f,
+ -0.427115f, 0.791754f, -1.819f, -0.404432f, 0.670242f,
+ -0.0343869f, -0.37191f, 0.0271262f, 0.988161f, -0.547343f,
+ 0.925304f, 0.548079f, -0.430343f, -0.214109f, 0.242013f,
+ 1.39027f, 0.37648f, -1.63524f, -0.158864f, -0.572779f,
+ -0.766801f, -2.62032f, 0.47799f, -1.12025f, -0.115283f,
+ 1.22349f, -0.262132f, -0.151274f, 0.390483f, -0.496482f,
+ 1.06166f, -0.183052f, 0.54647f, 0.847486f, 0.0229506f,
+ 0.653309f, -0.020736f, -1.27453f, 0.48386f, -0.366625f,
+ -0.515725f, -1.31196f, 0.140701f, -0.183636f, 0.000413912f,
+ 0.300993f, -0.849529f, -0.59764f, -0.212992f, -0.933365f,
+ -1.4054f, -0.091982f, 0.41695f, 0.264004f, -0.26379f,
+ -0.0738219f, 0.434052f, 1.16617f, -0.639624f, -0.146465f,
+ 0.0409936f, -0.900182f, 0.73517f, 0.805746f, -0.208088f,
+ 1.74459f, -0.0592751f, 0.624865f, -0.62325f, -0.446315f,
+ 0.150526f, 0.0526697f, 0.374254f, -0.658043f, 1.02623f,
+ -0.941758f, 0.381217f, -0.359448f, 0.160051f, 0.556455f,
+ 0.239382f, 0.75851f, 0.437583f, -0.122221f, 0.746136f,
+ 0.218286f, -0.426729f, 0.0353903f, -0.830513f, -0.877586f,
+ 0.488077f, -0.132354f, -0.180756f, 0.736163f, -0.202934f,
+ -0.882534f, 0.166305f, 0.183122f, 0.0599858f, 0.442687f,
+ 0.0522908f, -1.17755f, -1.03733f, 0.392363f, 0.672718f,
+ -1.44704f, 0.360623f, 0.390298f, -0.213968f, 0.169783f,
+ -0.717536f, -0.830984f, -0.445049f, 0.196772f, -0.730634f,
+ -1.09497f, 0.344012f, -0.292802f, -0.67966f, 0.138515f,
+ -0.361803f, 0.936778f, -0.189802f, 0.197777f, -0.367507f,
+ -0.293653f, 0.447759f, -0.409245f, -0.687568f, -0.431301f,
+ -0.271234f, -0.585413f, -0.936414f, -0.396049f, -0.29388f,
+ -0.0930843f, 0.0179339f, 0.262463f, -0.166598f, 0.0171466f,
+ -0.329641f, 0.39343f, 0.657445f, -0.579052f, -0.312444f,
+ -0.0915881f, -0.432622f, -0.247645f, 0.485749f, -0.602508f,
+ -0.347936f, 0.287353f, 0.288705f, 0.168397f, 0.568228f,
+ -0.493586f, 1.04155f, -0.097956f, 0.658928f, -0.561007f,
+ 0.0457783f, 2.12744f, 0.182683f, -0.690282f, 0.183302f,
+ 0.0309499f, -0.722251f, 0.0660448f, -0.333277f, 0.198929f,
+ -0.724102f, -0.405597f, 0.614868f, -0.292862f, 0.886513f,
+ 0.142353f, -1.48934f, -0.97273f, 0.199683f, 0.522121f,
+ 0.0877478f, -0.172593f, -1.58858f, 0.113191f, -0.436178f,
+ 0.640895f, -0.504676f, 0.0658654f, -0.361301f, 0.604323f,
+ 0.315196f, -0.423021f, -0.323484f, -0.563163f, 0.118989f,
+ -0.404508f, -0.0550995f, -0.0359236f, -0.126574f, -0.357288f,
+ -0.0494502f, 1.04959f, -0.31646f, -0.0376684f, -0.300744f,
+ -0.135016f, 0.102696f, -0.392333f, -1.17502f, 0.505227f,
+ 0.337608f, -0.348831f, -0.420815f, 0.202791f, -0.154264f,
+ -0.563686f, 0.0942187f, 0.353862f, 0.0303509f, -0.132794f,
+ 0.420746f, 0.143529f, 0.455822f, -1.28348f, -1.35662f,
+ -0.850688f, -1.76361f, -0.717546f, 0.443111f, 0.227155f,
+ -0.863307f, -0.452033f, -0.278151f, 1.86233f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias[] = {
+ -0.103218f, -0.359587f, 0.619666f, -0.473497f, -0.649803f, 0.86992f,
+ -0.115561f, 0.335114f, -0.285044f, -0.59295f, 0.24497f, 0.611583f,
+ 0.38568f, 0.137913f, -0.281191f, -0.0107777f, 0.487236f, -0.262363f,
+ 0.696962f, 0.121565f, 0.312511f, 0.430916f, 0.694134f, 0.393632f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_2_logits_kernel[] = {
+ -2.42496f, -1.239f, 0.832673f, 1.56923f, -2.6175f, -1.42492f,
+ -0.311387f, -1.94237f, 0.54071f, -2.50391f, 0.352205f, -0.96572f,
+ 1.47144f, -2.04702f, -1.12372f, -0.709186f, 0.812238f, 0.310389f,
+ 0.789163f, -0.65236f, 1.77018f, 0.273867f, 1.19506f, 1.07022f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_2_logits_bias[] = {
+ 0.953424f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel[] = {
+ 0.0485154f, 0.0496279f, 0.0268229f, -0.0584843f, -0.166928f,
+ 0.0316731f, -0.0895094f, -0.0433243f, -0.00893639f, -0.0886265f,
+ -0.0345622f, -0.235395f, -0.213754f, -0.00212398f, 0.0218857f,
+ -0.0054983f, -0.0248236f, 0.081822f, -0.0355708f, -0.0795593f,
+ -0.106995f, -0.0596378f, 0.0350686f, -0.133863f, -0.00582928f,
+ 0.114963f, 0.193906f, -0.00419085f, 0.0430529f, -0.128318f,
+ 0.0614715f, -0.000952935f, -0.0345722f, -0.109459f, 0.074204f,
+ -0.0865131f, 0.0649158f, -0.0942417f, -0.10122f, -0.047551f,
+ -1.27825f, -0.0125456f, -0.019722f, -0.152058f, 0.280306f,
+ -0.121231f, -0.0565484f, 0.0959188f, 0.0603919f, 0.0457468f,
+ 0.967589f, 0.105892f, -0.118326f, 0.198933f, 0.163437f,
+ -0.056824f, -0.0302956f, -0.07366f, -0.681407f, -0.0781575f,
+ 0.255732f, -0.0712105f, 0.177882f, 0.709206f, -0.232457f,
+ 1.33809f, -0.0328557f, 0.0572231f, -1.01361f, 0.130676f,
+ -0.205159f, 0.975398f, 0.356293f, 0.0766364f, -0.297397f,
+ -0.0261066f, -0.0933549f, 0.0568851f, -0.0123034f, -0.0433538f,
+ 0.131003f, 0.890705f, 0.0084565f, 0.00547395f, 0.00157634f,
+ 0.0047937f, -0.0511092f, 0.0300034f, -0.00604993f, -0.0133502f,
+ -0.000274302f, 0.129728f, -0.00532916f, 0.0855351f, 0.136885f,
+ 0.0175562f, -0.0123633f, -0.000512229f, -0.019924f, -0.0316328f,
+ 0.422972f, 0.0460336f, 0.0170841f, -0.00086795f, -0.0655137f,
+ 0.0287308f, -0.0375644f, -0.0329215f, -0.0273072f, 0.0241426f,
+ -0.0429052f, 0.0221593f, -0.063881f, -0.0347391f, -6.44339e-07f,
+ 0.0476934f, -0.0150068f, 0.0146403f, -0.0653099f, 0.0107635f,
+ 0.012407f, 0.0048935f, 1.50975f, 0.322256f, 0.17881f,
+ 0.0943775f, -0.100583f, -0.367022f, -0.156525f, -0.0397161f,
+ 0.0752784f, -0.00219022f, -0.887456f, 0.0153415f, -0.0148185f,
+ -0.56435f, 0.163996f, -0.0221024f, -0.0115872f, -0.0529284f,
+ 0.156838f, -1.13813f, -0.207863f, -0.00484959f, 0.135719f,
+ 0.131004f, 0.0417939f, 0.31453f, 0.121719f, -0.101515f,
+ 0.267951f, 0.219727f, 0.0398821f, 0.0713504f, 3.65918e-06f,
+ -0.00659998f, 0.477343f, -0.128426f, 0.0648877f, 0.111884f,
+ 0.224552f, 0.0617426f, 0.117742f, 0.031377f, 0.0586865f,
+ -0.459293f, 0.100211f, -0.14127f, 0.624412f, 0.014659f,
+ -1.41807f, -0.382452f, -0.695931f, -0.103153f, 0.145808f,
+ 0.333526f, -0.256367f, 0.096842f, 0.102458f, -0.181224f,
+ 0.729272f, 0.151177f, 1.46729f, 0.111044f, -4.28813f,
+ 0.0178379f, 0.47641f, -6.57533f, 0.0633335f, 0.496934f,
+ -0.154657f, -9.07298e-05f, 0.848937f, -5.40143f, 0.375685f,
+ 0.23586f, -0.166591f, -0.0191648f, -0.039862f, -3.25093f,
+ 0.168472f, -0.260317f, -5.51548f, 0.0575334f, 0.328979f,
+ 0.112644f, 0.231339f, -0.122641f, 0.0567331f, 1.19541f,
+ -0.038735f, 0.0630576f, 0.176668f, 0.0757184f, -0.833104f,
+ 0.133669f, 0.982669f, 0.0311783f, 0.0908558f, -0.10065f,
+ -0.0386599f, -0.231587f, -0.83876f, -0.347148f, 0.225529f,
+ -1.29625f, 0.0806834f, 0.369648f, -1.63367f, 0.118057f,
+ -0.311948f, 0.95022f, -0.354807f, -0.648657f, -1.72048f,
+ 0.260397f, 0.915555f, 0.057737f, -0.162019f, -0.453543f,
+ -1.70388f, -0.311632f, -0.731593f, -0.678089f, 0.10438f,
+ -0.293911f, 0.144864f, 0.039212f, 0.0289241f, -0.0685266f,
+ 0.634592f, -0.0798614f, -0.119197f, -0.00517433f, -0.04653f,
+ -0.127568f, -0.0582645f, 0.0735302f, -0.0946823f, 0.00865585f,
+ 0.0115748f, 0.0194847f, 0.0455664f, 0.181006f, -0.0824601f,
+ 0.0869093f, 0.264767f, -0.0750432f, 0.135136f, 0.316511f,
+ 0.399015f, 0.0994808f, -0.166944f, -0.102126f, 0.457858f,
+ 0.300488f, 0.467582f, 0.830244f, -0.0511439f, -0.522892f,
+ -0.183049f, 0.2626f, 0.118382f, 0.241674f, 0.250399f,
+ -0.0963507f, -0.83231f, -0.227699f, -0.133314f, 0.231718f,
+ -0.0700274f, 0.891311f, 0.224742f, -0.572836f, 0.402798f,
+ -0.191576f, 0.740922f, -0.00374073f, 0.658178f, -0.209364f,
+ -0.416259f, 0.166297f, 0.0095577f, -0.0876076f, 0.424954f,
+ 0.265226f, -0.129343f, -0.203146f, -0.194637f, -0.818142f,
+ -0.164152f, -0.368962f, 0.273373f, 0.599927f, -0.19859f,
+ 0.0939651f, -0.12458f, -0.751816f, -0.302997f, -0.139176f,
+ -0.372737f, 0.332704f, -0.206045f, -0.00593763f, -0.452363f,
+ -0.2704f, -0.198846f, 0.0976308f, -0.216124f, 0.110122f,
+ -0.220342f, 0.00763426f, -0.0272775f, -0.190395f, -0.0359411f,
+ -0.0395759f, 0.000941162f, -1.49959f, 0.0914233f, 0.448346f,
+ -0.420435f, -0.0102102f, -0.0757978f, -0.0177687f, -0.0231492f,
+ -0.142125f, 1.31774f, 0.0269368f, 0.134566f, 0.152079f,
+ -0.139933f, 0.139226f, -0.214467f, -0.194446f, -0.555893f,
+ 0.271197f, -0.111047f, 0.0888069f, -0.198121f, 0.0871713f,
+ 0.100612f, 0.429782f, -0.3787f, 0.123147f, -0.12538f,
+ 0.235678f, 0.139237f, 0.223326f, 0.85806f, -0.00554756f,
+ 0.285095f, 0.0954683f, 0.0464989f, 0.100806f, -0.0211297f,
+ 0.121672f, 0.242473f, 0.0810475f, -0.834356f, 0.119629f,
+ 0.111338f, -0.227126f, 0.159296f, -0.0584685f, -0.108265f,
+ -0.0909221f, -0.21749f, 0.0929309f, -0.176815f, 0.178067f,
+ -0.0025905f, 0.317883f, 0.313045f, 0.26774f, -0.589329f,
+ -1.19882f, -0.285513f, -0.109478f, 0.309441f, -0.0604479f,
+ 0.947461f, -0.142342f, -0.9086f, -0.814788f, 0.184588f,
+ -0.0736317f, 0.276237f, 0.13132f, -0.3931f, -0.381744f,
+ -0.0122719f, 0.0246101f, -0.0920412f, 0.11331f, -0.110355f,
+ 0.00848064f, 0.0931248f, -0.0638655f, -4.30869e-05f, -0.300367f,
+ 0.0489508f, 0.464441f, -0.0466243f, -0.0137732f, 0.0099241f,
+ -0.223972f, 0.188966f, -0.653173f, -0.354322f, 0.189237f,
+ -0.624276f, -1.46218f, -0.075161f, -0.516172f, 0.40993f,
+ 0.291178f, -1.95088f, -0.0352157f, 0.196354f, -0.335897f,
+ 0.0857039f, 0.605319f, -1.12923f, -0.638387f, 1.41868f,
+ 0.0955757f, -0.00913477f, 0.315935f, -0.671223f, -0.851436f,
+ -0.157464f, -0.296763f, 0.182277f, -0.139309f, 0.232789f,
+ 0.869562f, 0.248894f, 0.242709f, 0.195479f, 0.106153f,
+ 0.358881f, 0.167443f, 0.982987f, 0.104767f, -0.033925f,
+ -0.0263185f, 0.0045304f, 0.0722479f, -0.111307f, 0.00128896f,
+ 0.406128f, -0.00944947f, 0.121592f, 0.546284f, -0.00175696f,
+ 0.776588f, 0.238846f, 0.064469f, 0.27082f, 0.269187f,
+ 0.0294455f, 0.62364f, -0.27872f, -0.0488013f, 0.229024f,
+ 0.154457f, 0.0445898f, 0.349943f, 0.0710998f, 0.0820674f,
+ 0.0279449f, 0.172826f, -0.122156f, -0.164688f, 0.0292124f,
+ 0.0496112f, -0.741762f, 0.0673926f, 0.108159f, -0.0942327f,
+ -0.0562883f, 0.558231f, 0.0552399f, 0.211393f, 0.0376817f,
+ -0.275788f, 0.0548436f, 0.212732f, 0.163603f, 0.0663363f,
+ -0.0252315f, 0.164533f, 0.0826088f, 0.0301389f, 0.345705f,
+ -0.0378046f, -0.139581f, 1.30162f, 1.23551f, -0.446693f,
+ 0.682534f, -0.0831157f, -0.0121595f, 1.50505f, 0.0839017f,
+ -0.953413f, 0.0820985f, -0.125556f, 0.699796f, -0.140453f,
+ 0.168438f, -0.110966f, 0.173806f, 0.114683f, 0.132502f,
+ -0.0453539f, -0.133096f, 0.511947f, -0.180657f, -0.0298605f,
+ 0.291437f, -0.0275017f, -0.229703f, -0.0504205f, 0.559622f,
+ 0.384601f, 0.111024f, -0.0773559f, -0.0591752f, -0.0866182f,
+ -0.189437f, -0.262345f, -0.0372182f, 0.149925f, 0.154644f,
+ -0.188298f, 0.236949f, -0.199328f, -0.378909f, -0.680128f,
+ 0.277184f, -0.172784f, 0.184717f, -0.23899f, 0.0712069f,
+ 0.0235425f, 0.4225f, -0.441487f, 0.177434f, -0.298303f,
+ 0.295696f, 0.17346f, 0.220542f, -0.680116f, 0.00266223f,
+ -0.0408459f, -0.15486f, 0.24335f, 0.237258f, -0.0283245f,
+ 0.19703f, -0.100027f, 0.0554843f, -1.03081f, 0.151745f,
+ 0.538582f, 0.370368f, 0.196683f, 0.0222123f, -0.0831401f,
+ -0.0832803f, -0.286743f, -0.686003f, 0.0995004f, 0.148901f,
+ -0.0436037f, -0.316508f, 0.00391835f, -0.228452f, 0.940058f,
+ 0.520047f, -0.334211f, 0.652142f, -0.0755971f, 0.0965123f,
+ -0.98191f, 0.394096f, -0.420466f, 0.327284f, -0.134651f,
+ 0.849297f, -0.523372f, 0.010327f, 0.133636f, 0.298119f,
+ -0.257389f, 0.0376153f, -0.198298f, 0.0736235f, 0.608809f,
+ 0.0291836f, -0.290005f, -0.141316f, 0.0184599f, 0.0554437f,
+ 0.0621519f, 0.485276f, 0.617062f, -0.0924811f, -0.0120834f,
+ 0.0817611f, 0.100421f, -0.0153553f, -0.135958f, -0.0185322f,
+ -0.395803f, -0.204862f, 0.547916f, -0.438117f, 0.0229788f,
+ 0.406981f, 0.795584f, -2.02756f, -0.8355f, -0.386789f,
+ 0.00968368f, 1.2147f, -0.740869f, -1.18415f, -0.954918f,
+ -0.541142f, 0.0596003f, 0.107189f, -0.411708f, -0.964593f,
+ 0.511906f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias[] = {
+ -0.485545f, 0.131552f, 0.796833f, -0.157582f, -0.0948124f, 0.00818613f,
+ -0.485562f, 0.3826f, -0.0839326f, 0.170998f, 0.279545f, -0.287143f,
+ 0.184986f, -0.0719864f, 0.19748f, 0.404145f
+};
+
+static const float
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel[] = {
+ 1.30172f, 0.720189f, 0.261675f, -0.466201f, 1.21773f,
+ 0.495525f, 0.62398f, 0.44567f, -0.330993f, -0.269798f,
+ 0.835161f, -0.294874f, 0.186981f, 0.0162467f, 0.367654f,
+ 0.658468f, 1.08325f, 1.01558f, 0.12783f, -0.280581f,
+ 2.2204f, 0.0337286f, -0.403649f, -0.230908f, -0.35188f,
+ 0.437712f, -0.103634f, -0.645929f, 1.17407f, 0.157385f,
+ 0.212438f, 1.41874f, 0.284242f, -0.493105f, 1.0703f,
+ 0.00632116f, 1.18222f, -0.26003f, 0.276795f, -0.823156f,
+ 0.29577f, -0.157467f, -0.18092f, 0.0237336f, 0.205715f,
+ -0.295679f, 0.165443f, -0.628279f, 1.00804f, 0.361232f,
+ 0.646155f, -0.028651f, 1.64317f, 0.334251f, -1.50713f,
+ -1.51685f, -0.488522f, 0.169694f, -0.593176f, -0.372682f,
+ -1.50223f, 0.35076f, -0.24641f, -0.237189f, 0.190502f,
+ -0.948191f, -0.303346f, 0.45108f, -0.794368f, -2.3116f,
+ 0.404008f, -2.67269f, -0.941992f, -0.45336f, 0.0655987f,
+ -0.288432f, 0.106068f, 0.286978f, 0.121403f, 0.462739f,
+ 0.0130292f, 0.240597f, -2.30983f, -0.453309f, -0.149335f,
+ 0.856424f, -0.186576f, 0.769961f, -0.0657097f, -0.976188f,
+ 0.972971f, -0.532728f, -0.699334f, -0.168803f, 0.361945f,
+ 0.950769f, 1.5368f, -0.223899f, 1.17547f, -0.281483f,
+ 0.533619f, 0.315344f, 0.0854543f, 0.464701f, 0.346828f,
+ 0.271794f, -0.0185388f, 0.109517f, 0.371662f, -0.10852f,
+ 0.244092f, 0.491959f, -0.750281f, 1.41865f, -3.51221f,
+ 0.298194f, -0.0790832f, -0.134158f, -0.424084f, 0.189593f,
+ -0.238361f, -0.407872f, -0.366222f, -0.606813f, -0.230498f,
+ 0.387248f, -0.102734f, -0.190544f, -1.43649f, 0.141338f,
+ -0.0438917f, 0.204628f, 1.57033f, 0.0366937f, -0.14733f,
+ 0.048198f, -0.122631f, 0.183354f, 0.0658753f, -0.243381f,
+ 0.0246889f, -0.768798f, -0.0644054f, 0.775073f, 1.63419f,
+ 0.491624f, 0.21898f, -0.358944f, 3.31304f, 0.0195916f,
+ 0.236174f, 0.530704f, 0.140124f, 0.0736778f, -0.27361f,
+ -0.598836f, -1.01659f, 0.361765f, 0.00455986f, -0.345222f,
+ 1.68731f, 0.764082f, 0.193555f, 0.322782f, 1.19801f,
+ 0.538935f, -0.0393231f, -0.0248292f, -0.151168f, 0.479879f,
+ -0.208582f, 0.22798f, 0.335473f, -0.00295455f, 0.139539f,
+ 0.400814f, 0.478307f, -0.189376f, 0.540084f, 0.466072f,
+ 0.920231f, 0.398774f, -0.472403f, -0.0431972f, -0.581665f,
+ -0.990058f, 0.258995f, -0.0148889f, 0.27105f, 0.340334f,
+ 0.223576f, -0.0405193f, -1.23888f, -1.45229f, -1.44543f,
+ -0.376146f, 0.132601f, -0.4064f, -0.583611f, -0.374588f,
+ 0.0659428f, 0.325652f, -0.338456f, 0.253767f, -0.0181164f,
+ 0.681732f, 0.222041f, 0.837496f, 1.09735f, 0.156328f,
+ 0.177236f, -0.702702f, 0.473689f, 0.322118f, 0.43343f,
+ 0.315441f, -0.40798f, 0.0811291f, 0.631431f, 0.361929f,
+ 0.0723276f, 0.0164498f, 0.0293847f, 0.156406f, -1.10453f,
+ 0.837977f, -1.03449f, -0.348408f, 1.71953f, -0.401765f,
+ 0.64272f, -0.182438f, -0.233954f, 0.364597f, 0.269177f,
+ -0.578512f, 0.397216f, 0.0425122f, -0.258728f, 1.41621f,
+ -0.688768f, 0.0944726f, 0.253163f, -0.989037f, 1.72726f,
+ 1.15976f, -0.0460612f, 0.534186f, -0.136814f, 0.49327f,
+ 0.115744f, -0.633052f, -0.433855f, -1.01874f, -0.324035f,
+ 0.489487f, 1.08696f, 0.836376f, -0.423477f, -0.421309f,
+ 1.07348f, 0.323266f, 0.717604f, 0.366422f, 0.32983f,
+ 0.336583f, 0.749292f, -0.210666f, 0.387101f, -0.583376f,
+ 0.0391101f, -1.07537f, 0.914591f, -0.51303f, 1.15023f,
+ -0.0378782f, 0.262889f, -0.841128f, 0.41619f, -0.669704f,
+ -0.109995f, 1.01825f, -0.194853f, 0.120739f, 0.627889f,
+ -0.00269221f, 0.751152f, -0.529865f, -1.50238f, 0.184521f,
+ 0.795464f, 0.106099f, 1.83117f, 0.0883305f, 0.306844f,
+ -0.0671504f, -0.169306f, -0.214575f, -0.121606f, -0.234965f,
+ 0.109752f, -0.35831f, -0.07894f, 0.497203f, -2.63013f,
+ 0.815608f, -0.193593f, -0.62292f, 0.338941f, 0.0970922f,
+ -0.531178f, 0.723346f, 0.35063f, 0.182647f, -0.257013f,
+ 0.784924f, -0.217915f, -0.0797363f, -0.399706f, -0.485602f,
+ 1.23155f, 0.345998f, 0.322949f, -0.168196f, -0.173313f,
+ 0.282205f, 0.45117f, 0.918706f, -0.046172f, -0.0873883f,
+ 0.56103f, -0.485768f, 0.546199f, 0.254997f, 0.394296f,
+ 0.607178f, 0.667532f, -0.343883f, 0.374402f, -0.531439f,
+ 2.27782f, -1.13255f, 0.505867f, -0.514742f, 0.998571f,
+ -1.60984f, -0.172873f, -0.0604094f, 0.719791f, -0.733982f,
+ 0.348905f, 1.39008f, -0.895343f, -0.677064f, -1.84221f,
+ 0.0434018f, -0.534794f, 0.0434753f, -0.266576f, 0.268099f,
+ -0.242935f, 0.00166289f, 0.0263789f, -0.224794f, -0.113493f,
+ -0.236397f, 0.0879936f, 0.510895f, -0.511789f, -1.48962f,
+ -2.78268f, -0.0495784f, -0.0343907f, 0.440459f, -0.364209f,
+ 0.833223f, -0.0589337f, 0.00181418f, 0.455499f, 0.101762f,
+ -1.16424f, 0.270405f, 0.219033f, -4.91105f
+ };
+
+static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias[] = {
+ -0.40114f, -0.372342f, -0.216186f, -0.240014f, -0.341773f, -0.344489f,
+ -0.113037f, 0.198479f, 0.482958f, -0.630072f, -0.728704f, -0.171963f,
+ 0.519883f, 0.253003f, -0.121618f, -0.0569875f, -0.485568f, -0.147577f,
+ 0.533305f, -0.587251f, -0.120837f, -0.483953f, 0.445641f, -0.125136f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_3_logits_kernel[] = {
+ -1.57431f, -1.09069f, 1.67996f, -0.669702f, 0.499807f, -3.03145f,
+ -0.878135f, 0.637818f, -1.58419f, -3.79756f, 0.62755f, -0.446646f,
+ 0.653269f, -0.667854f, -2.19774f, -3.53349f, 2.6107f, -0.685892f,
+ -1.2603f, -0.89707f, -0.715551f, 0.382202f, 2.09574f, 0.469386f
+};
+
+static const float av1_intra_mode_cnn_partition_branch_3_logits_bias[] = {
+ -0.022787f
+};
+
+static const NN_CONFIG av1_intra_mode_cnn_partition_branch_0_dnn_config = {
+ BRANCH_0_NUM_DNN_FEATURES,
+ BRANCH_0_NUM_LOGITS,
+ BRANCH_0_NUM_DNN_LAYERS,
+ {
+ BRANCH_0_NUM_DNN_LAYER_0_UNITS,
+ BRANCH_0_NUM_DNN_LAYER_1_UNITS,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel,
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel,
+ av1_intra_mode_cnn_partition_branch_0_logits_kernel,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias,
+ av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias,
+ av1_intra_mode_cnn_partition_branch_0_logits_bias,
+ },
+};
+static const NN_CONFIG av1_intra_mode_cnn_partition_branch_1_dnn_config = {
+ BRANCH_1_NUM_DNN_FEATURES,
+ BRANCH_1_NUM_LOGITS,
+ BRANCH_1_NUM_DNN_LAYERS,
+ {
+ BRANCH_1_NUM_DNN_LAYER_0_UNITS,
+ BRANCH_1_NUM_DNN_LAYER_1_UNITS,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel,
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel,
+ av1_intra_mode_cnn_partition_branch_1_logits_kernel,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias,
+ av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias,
+ av1_intra_mode_cnn_partition_branch_1_logits_bias,
+ },
+};
+static const NN_CONFIG av1_intra_mode_cnn_partition_branch_2_dnn_config = {
+ BRANCH_2_NUM_DNN_FEATURES,
+ BRANCH_2_NUM_LOGITS,
+ BRANCH_2_NUM_DNN_LAYERS,
+ {
+ BRANCH_2_NUM_DNN_LAYER_0_UNITS,
+ BRANCH_2_NUM_DNN_LAYER_1_UNITS,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel,
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel,
+ av1_intra_mode_cnn_partition_branch_2_logits_kernel,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias,
+ av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias,
+ av1_intra_mode_cnn_partition_branch_2_logits_bias,
+ },
+};
+static const NN_CONFIG av1_intra_mode_cnn_partition_branch_3_dnn_config = {
+ BRANCH_3_NUM_DNN_FEATURES,
+ BRANCH_3_NUM_LOGITS,
+ BRANCH_3_NUM_DNN_LAYERS,
+ {
+ BRANCH_3_NUM_DNN_LAYER_0_UNITS,
+ BRANCH_3_NUM_DNN_LAYER_1_UNITS,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel,
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel,
+ av1_intra_mode_cnn_partition_branch_3_logits_kernel,
+ },
+ {
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias,
+ av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias,
+ av1_intra_mode_cnn_partition_branch_3_logits_bias,
+ },
+};
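+
+// A minimal sketch (not part of the library) of the forward pass the configs
+// above describe, assuming the weight layout used by av1_nn_predict() in
+// av1/encoder/ml.c: weights[l] stores each unit's input weights contiguously
+// (unit-major), hidden units use ReLU, and the final logits layer is linear.
+// NN_MAX_NODES_PER_LAYER comes from av1/encoder/ml.h;
+// cnn_partition_dnn_forward_sketch is a hypothetical name for illustration.
+static inline void cnn_partition_dnn_forward_sketch(const float *input,
+                                                    const NN_CONFIG *cfg,
+                                                    float *output) {
+  float buf[2][NN_MAX_NODES_PER_LAYER];
+  int num_inputs = cfg->num_inputs;
+  int buf_index = 0;
+  for (int layer = 0; layer <= cfg->num_hidden_layers; ++layer) {
+    const int is_logits = layer == cfg->num_hidden_layers;
+    const int num_units =
+        is_logits ? cfg->num_outputs : cfg->num_hidden_nodes[layer];
+    float *out = is_logits ? output : buf[buf_index];
+    for (int node = 0; node < num_units; ++node) {
+      float val = cfg->bias[layer][node];
+      for (int i = 0; i < num_inputs; ++i)
+        val += cfg->weights[layer][node * num_inputs + i] * input[i];
+      // ReLU on hidden layers; leave the logits linear.
+      out[node] = is_logits ? val : (val > 0.0f ? val : 0.0f);
+    }
+    input = out;
+    num_inputs = num_units;
+    buf_index = 1 - buf_index;
+  }
+}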
+
+#undef NUM_DNN_BRANCHES
+#undef NUM_CNN_LAYERS
+#undef BRANCH_0_NUM_DNN_LAYERS
+#undef BRANCH_1_NUM_DNN_LAYERS
+#undef BRANCH_2_NUM_DNN_LAYERS
+#undef BRANCH_3_NUM_DNN_LAYERS
+#undef CNN_LAYER_0_HEIGHT
+#undef CNN_LAYER_0_WIDTH
+#undef CNN_LAYER_0_IN_CH
+#undef CNN_LAYER_0_OUT_CH
+#undef CNN_LAYER_0_HORZ_STRIDE
+#undef CNN_LAYER_0_VERT_STRIDE
+#undef CNN_LAYER_1_HEIGHT
+#undef CNN_LAYER_1_WIDTH
+#undef CNN_LAYER_1_IN_CH
+#undef CNN_LAYER_1_OUT_CH
+#undef CNN_LAYER_1_HORZ_STRIDE
+#undef CNN_LAYER_1_VERT_STRIDE
+#undef CNN_LAYER_2_HEIGHT
+#undef CNN_LAYER_2_WIDTH
+#undef CNN_LAYER_2_IN_CH
+#undef CNN_LAYER_2_OUT_CH
+#undef CNN_LAYER_2_HORZ_STRIDE
+#undef CNN_LAYER_2_VERT_STRIDE
+#undef CNN_LAYER_3_HEIGHT
+#undef CNN_LAYER_3_WIDTH
+#undef CNN_LAYER_3_IN_CH
+#undef CNN_LAYER_3_OUT_CH
+#undef CNN_LAYER_3_HORZ_STRIDE
+#undef CNN_LAYER_3_VERT_STRIDE
+#undef CNN_LAYER_4_HEIGHT
+#undef CNN_LAYER_4_WIDTH
+#undef CNN_LAYER_4_IN_CH
+#undef CNN_LAYER_4_OUT_CH
+#undef CNN_LAYER_4_HORZ_STRIDE
+#undef CNN_LAYER_4_VERT_STRIDE
+#undef BRANCH_0_NUM_DNN_FEATURES
+#undef BRANCH_0_NUM_DNN_LAYER_0_UNITS
+#undef BRANCH_0_NUM_DNN_LAYER_1_UNITS
+#undef BRANCH_0_NUM_LOGITS
+#undef BRANCH_1_NUM_DNN_FEATURES
+#undef BRANCH_1_NUM_DNN_LAYER_0_UNITS
+#undef BRANCH_1_NUM_DNN_LAYER_1_UNITS
+#undef BRANCH_1_NUM_LOGITS
+#undef BRANCH_2_NUM_DNN_FEATURES
+#undef BRANCH_2_NUM_DNN_LAYER_0_UNITS
+#undef BRANCH_2_NUM_DNN_LAYER_1_UNITS
+#undef BRANCH_2_NUM_LOGITS
+#undef BRANCH_3_NUM_DNN_FEATURES
+#undef BRANCH_3_NUM_DNN_LAYER_0_UNITS
+#undef BRANCH_3_NUM_DNN_LAYER_1_UNITS
+#undef BRANCH_3_NUM_LOGITS
+
+static const float av1_intra_mode_cnn_partition_split_thresh_hdres[5] = {
+ 100.000000f, 4.750139f, 1.655964f, 3.711212f, 0.963839f,
+};
+
+static const float av1_intra_mode_cnn_partition_no_split_thresh_hdres[5] = {
+ -100.000000f, -2.404842f, -3.858223f, -2.041206f, -1.573735f,
+};
+
+static const float av1_intra_mode_cnn_partition_split_thresh_midres[5] = {
+ 100.000000f, 3.218737f, 2.657764f, 0.868458f, 2.454447f,
+};
+
+static const float av1_intra_mode_cnn_partition_no_split_thresh_midres[5] = {
+ -100.000000f, -3.842426f, -4.005076f, -3.642994f, -2.467197f,
+};
+
+static const float av1_intra_mode_cnn_partition_split_thresh_lowres[5] = {
+ 100.000000f, 1.890757f, 2.658417f, 1.450626f, 1.833180f,
+};
+
+static const float av1_intra_mode_cnn_partition_no_split_thresh_lowres[5] = {
+ -100.000000f, -4.100921f, -4.564202f, -5.695176f, -1.483546f,
+};
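+
+// Reading of the six threshold arrays above (inferred from the naming, not
+// stated here): each entry corresponds to one quadtree level, a branch logit
+// above the split threshold forces a split, one below the no-split threshold
+// prunes the split, and the +/-100 sentinels at index 0 effectively disable
+// the shortcut at that level.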
+
+static const float av1_intra_mode_cnn_partition_mean[1] = {
+ 1.191922f,
+};
+
+static const float av1_intra_mode_cnn_partition_std[1] = {
+ 1.730044f,
+};
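+
+// A single mean/std pair, presumably used to standardize a scalar model
+// input or output as (x - mean) / std before it reaches the thresholds.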
+
+static const int quad_to_linear_0[1] = { 0 };
+static const int quad_to_linear_1[4] = { 0, 1, 2, 3 };
+static const int quad_to_linear_2[16] = { 0, 1, 4, 5, 2, 3, 6, 7,
+ 8, 9, 12, 13, 10, 11, 14, 15 };
+static const int quad_to_linear_3[64] = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 16, 17, 24, 25, 18, 19, 26, 27,
+ 4, 5, 12, 13, 6, 7, 14, 15, 20, 21, 28, 29, 22, 23, 30, 31,
+ 32, 33, 40, 41, 34, 35, 42, 43, 48, 49, 56, 57, 50, 51, 58, 59,
+ 36, 37, 44, 45, 38, 39, 46, 47, 52, 53, 60, 61, 54, 55, 62, 63
+};
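+
+// The quad_to_linear_* tables above map the i-th block visited in quadtree
+// order (top-left, top-right, bottom-left, bottom-right quadrants,
+// recursively) to its raster-scan index in a 2^k x 2^k grid. A minimal
+// sketch of how such a table can be generated (illustration only;
+// fill_quad_to_linear is a hypothetical helper, not part of the encoder):
+static inline void fill_quad_to_linear(int *table, int level, int stride,
+                                       int row, int col, int *next) {
+  if (level == 0) {
+    // Leaf block: record its raster index in the next quad-order slot.
+    table[(*next)++] = row * stride + col;
+    return;
+  }
+  const int half = 1 << (level - 1);
+  fill_quad_to_linear(table, level - 1, stride, row, col, next);         // TL
+  fill_quad_to_linear(table, level - 1, stride, row, col + half, next);  // TR
+  fill_quad_to_linear(table, level - 1, stride, row + half, col, next);  // BL
+  fill_quad_to_linear(table, level - 1, stride, row + half, col + half,
+                      next);  // BR
+}
+// e.g. { int t[16], n = 0; fill_quad_to_linear(t, 2, 4, 0, 0, &n); }
+// reproduces quad_to_linear_2.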
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/partition_model_weights.h b/third_party/aom/av1/encoder/partition_model_weights.h
new file mode 100644
index 0000000000..71c1ace782
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_model_weights.h
@@ -0,0 +1,5646 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// TODO(chiyotsai@google.com): The performance of these models is getting
+// worse due to the changes in the encoder. We should retrain the models here
+// to get better performance once we have the time.
+
+#define FEATURE_SIZE 10
+#define LABEL_SIZE 16
+// nn model for ab partition pruning, 128x128.
+static const float av1_ab_partition_nn_weights_128_layer0[FEATURE_SIZE * 64] = {
+ -0.715251f, -0.015767f, -0.667353f, -0.345255f, 0.177887f, -0.469759f,
+ 0.426152f, 0.489798f, 0.469865f, 0.773821f, 0.088517f, 0.074585f,
+ 0.838754f, 0.048449f, -0.007584f, 0.638968f, 0.233305f, -0.319236f,
+ -0.257124f, -0.170869f, 0.137180f, 0.114852f, -0.721241f, -0.947962f,
+ -0.411298f, 0.494306f, -0.060435f, -0.648421f, -0.126624f, 0.072686f,
+ -0.143904f, -0.115839f, -0.175527f, -0.117728f, 0.040686f, -0.189925f,
+ 0.134361f, -0.258070f, -0.177558f, 0.158049f, 0.168668f, -0.062919f,
+ 0.341986f, 0.038100f, -0.435577f, -0.321255f, 0.203213f, 0.213061f,
+ 0.533304f, 0.359296f, -0.079558f, 0.004637f, 0.663904f, 0.043779f,
+ 0.383018f, 1.136559f, -0.084155f, 0.333057f, -0.199011f, 0.152059f,
+ -0.078419f, -0.167752f, -0.093651f, 0.083171f, -0.190143f, 0.086195f,
+ -0.280632f, -0.160663f, -0.017298f, 0.122628f, -0.138116f, 0.062927f,
+ 0.222462f, 0.626979f, 0.426928f, 0.117170f, -0.240457f, 0.053750f,
+ 0.038017f, 0.007359f, -0.017595f, 0.101407f, 0.332891f, 0.074933f,
+ 0.306498f, 0.219380f, -0.151638f, -0.247976f, 0.343405f, 0.121256f,
+ 0.049173f, 0.171474f, -0.139608f, -1.016599f, -0.345553f, -0.901138f,
+ 0.243401f, 0.059928f, -0.089396f, -0.195565f, 0.364705f, -0.020400f,
+ -1.383672f, 0.413018f, 0.536950f, -0.020904f, -1.335306f, -0.732290f,
+ 0.102885f, 0.315290f, -0.208521f, -0.081811f, 0.182300f, 0.125712f,
+ -0.593833f, -0.220639f, -0.314155f, 0.188327f, 0.118503f, 0.524427f,
+ -1.083859f, -1.130640f, 0.390352f, -0.045591f, 0.113160f, -0.009149f,
+ -0.096183f, 0.115829f, 0.377752f, 0.318396f, -0.591983f, 0.004797f,
+ -0.497377f, -0.342248f, 0.079546f, -0.025249f, -0.295972f, 0.615501f,
+ -0.464372f, 0.418315f, -0.173556f, 0.105217f, 0.298073f, 0.082478f,
+ 0.033223f, 0.977341f, -0.372982f, -0.052337f, 0.154124f, 0.396787f,
+ 0.536654f, -0.139061f, -0.223702f, 0.229666f, -0.846766f, 0.107723f,
+ 0.563839f, -0.483141f, 0.304813f, -0.765283f, 0.070964f, 0.151101f,
+ 0.275188f, 0.490303f, 1.175892f, 0.085377f, -0.191200f, 0.544532f,
+ -0.365075f, 0.167546f, 0.052183f, -0.220529f, -0.212227f, -0.144988f,
+ -0.273356f, -0.062023f, 0.103993f, -0.238493f, -0.161204f, -0.054611f,
+ -0.166672f, 0.128327f, 0.461751f, -0.545822f, 0.739798f, 0.594386f,
+ -0.163192f, -0.332501f, 0.363834f, -0.065043f, 0.474812f, -0.138811f,
+ 0.170924f, -0.778142f, -0.316474f, -0.508065f, -0.039986f, -0.478001f,
+ 0.340591f, 0.041783f, 0.055419f, 0.015155f, -0.981830f, -1.355237f,
+ 0.347516f, 1.155327f, 0.081319f, 0.274163f, -0.327230f, -0.113478f,
+ 0.556552f, -0.055986f, 0.217318f, -0.445351f, 0.325759f, 0.526547f,
+ -0.657434f, -0.572214f, -0.037087f, 0.081384f, 0.064518f, 0.014892f,
+ 0.215279f, 1.834504f, -0.242107f, 0.079810f, 0.129558f, 0.079588f,
+ -0.035189f, -0.221745f, -0.163414f, 0.043978f, -1.028662f, -0.623609f,
+ 1.130336f, 0.664661f, -0.063975f, -0.415863f, 0.018581f, 0.157758f,
+ 0.200570f, 0.063420f, 0.901039f, -0.746286f, 0.196230f, -0.290592f,
+ 0.042373f, -0.502500f, 0.183638f, 0.103394f, -0.298858f, 0.145436f,
+ 0.196916f, 0.108319f, -0.448572f, -0.881385f, 0.302497f, 0.121679f,
+ -0.021327f, 0.025150f, 0.481306f, -0.359634f, 0.350257f, -0.228647f,
+ -0.669860f, 0.260025f, -0.034182f, 0.619247f, -0.158826f, -0.405864f,
+ 0.674112f, -0.027885f, -0.325274f, -0.241492f, 0.036024f, -0.437685f,
+ -0.091458f, -0.109295f, -0.350676f, 0.044706f, 0.297059f, 0.016290f,
+ 1.121203f, 1.289062f, -1.299476f, -1.129221f, 0.103752f, 0.131302f,
+ -0.263265f, 0.222155f, -0.229908f, 0.013922f, -0.226001f, -0.248383f,
+ -0.004415f, -0.020958f, 0.055634f, 0.086200f, 0.114556f, -0.184061f,
+ -0.096210f, -0.146466f, -0.249618f, -0.195998f, 0.088758f, 0.023781f,
+ -0.264460f, 0.157026f, -0.235228f, -0.102564f, 0.043463f, -0.187823f,
+ -0.257500f, -0.199049f, -0.242210f, 0.030448f, 0.221604f, 0.151804f,
+ -0.100404f, -0.073931f, 0.144749f, -0.001572f, -1.438079f, -0.233716f,
+ 0.733422f, 1.727080f, -0.036397f, 0.027551f, 0.425321f, 0.085703f,
+ 0.031186f, 0.032333f, -0.675130f, 1.437733f, -0.202392f, -0.525003f,
+ 0.087048f, 0.328194f, -0.079989f, -0.391088f, -0.238732f, -0.120660f,
+ -0.139600f, 0.154665f, 0.026202f, -0.233501f, -0.009046f, -0.149187f,
+ -0.199646f, 0.115375f, 0.209762f, -0.014875f, 0.124038f, -0.119985f,
+ 1.079625f, -0.461513f, 0.614114f, 0.021003f, 0.439449f, -0.824834f,
+ -0.299701f, 0.193817f, -0.870551f, -1.262313f, -0.079517f, 0.341570f,
+ 0.305310f, -0.089721f, -0.317314f, -0.075631f, 0.127172f, -0.208635f,
+ 1.191922f, 0.163141f, 0.564285f, 0.286352f, 0.480865f, 0.173094f,
+ -0.094034f, -0.071339f, -0.328992f, -0.006382f, 0.314705f, 0.090258f,
+ -0.016099f, 0.193230f, 0.188061f, 0.398144f, 0.722781f, 0.769949f,
+ 0.025442f, -0.162016f, 0.070192f, -0.056946f, -0.100957f, -0.219934f,
+ -0.203492f, -0.015454f, -0.013272f, -0.098008f, 0.051707f, -0.017493f,
+ 0.527446f, 0.083605f, 0.588318f, 0.878215f, 0.028747f, -0.146479f,
+ -0.345170f, -0.136059f, -0.152005f, -0.203634f, 0.232702f, -0.101340f,
+ -0.027733f, -0.282611f, 0.265366f, 0.082362f, -0.265420f, -0.131124f,
+ 0.166303f, 0.040194f, -0.100710f, 0.579151f, -0.530136f, 0.163422f,
+ -0.998821f, -1.565311f, -1.774785f, -2.493372f, 0.116970f, -0.090302f,
+ 1.723272f, 0.552370f, -0.295954f, -0.439095f, -0.266730f, 0.027936f,
+ 0.539616f, -0.234902f, -0.167601f, -0.149877f, -0.242983f, 0.122353f,
+ -0.121620f, -0.205517f, -0.180144f, -0.264208f, 0.151500f, -0.159378f,
+ 0.029145f, -0.050892f, -0.223407f, -0.246239f, 0.043152f, -0.018460f,
+ 0.169972f, -0.187769f, -0.034670f, -0.238330f, 0.288070f, -0.093243f,
+ -0.437105f, -0.573376f, 0.660073f, 0.285727f, 0.408470f, 0.158475f,
+ 0.032699f, 0.056280f, -0.237176f, -0.083003f, 0.105598f, -0.169522f,
+ -0.260420f, -0.121100f, -0.173983f, -0.195693f, -0.232028f, 0.224940f,
+ 0.029124f, 0.009580f, -0.252034f, 0.103087f, 1.156561f, 0.603848f,
+ -0.562805f, -1.652742f, -0.568288f, -1.829395f, 0.046169f, 0.076095f,
+ 1.490819f, 0.415893f, -0.277788f, -0.115787f, 0.093750f, 0.270726f,
+ -0.395983f, -0.353742f, 0.034605f, 0.005342f, 0.184537f, 0.086445f,
+ 0.156417f, 1.476367f, 0.122587f, 0.002145f, 0.431057f, -0.381184f,
+ -1.646457f, -0.014009f, -0.671224f, 0.193726f, -0.019247f, -0.031267f,
+ -0.046208f, 0.298733f, 0.064734f, 0.616984f, 0.039381f, 0.182722f,
+ -0.116670f, 0.233093f, -1.214374f, -0.817970f, -0.064394f, -0.584783f,
+ 0.077697f, -0.266720f, 0.130875f, -0.235295f, -0.265754f, -0.159999f,
+ -0.250114f, -0.183017f, 0.194403f, -0.105808f, -0.169215f, -0.240866f,
+ -0.026662f, -0.045123f, -0.036175f, -0.167471f, -0.192908f, -0.232602f,
+ -0.267036f, -0.112500f, -0.257944f, -0.111909f, -0.802226f, -0.008800f,
+ 0.881460f, -0.678603f, 0.008666f, -0.252053f, -0.341035f, -0.175290f,
+ 0.183012f, 0.385991f, 0.079888f, -0.014039f, -0.148653f, 0.671778f,
+ -0.130219f, 1.086467f, 0.129267f, -0.040400f, -0.201221f, -0.077005f,
+ 0.015890f, 0.000781f, 0.137764f, 1.389546f, 0.172152f, 0.047279f,
+ -0.042783f, 0.127740f, 0.141467f, -0.335738f, -1.396392f, 0.031496f,
+ 0.357385f, 0.343602f, -0.714553f, 0.311014f, 0.132845f, 0.061149f,
+ 0.006796f, 0.568106f, -0.255949f, 0.104134f, -0.993447f, 0.298135f,
+ -0.406590f, -0.049228f, -0.578570f, -0.188561f, -0.107046f, 0.374095f,
+ 0.068481f, 0.036240f, -0.495801f, 0.180574f, -0.766129f, 0.886967f,
+ -0.568868f, -0.936062f, -0.418886f, -0.058735f, -0.511964f, -0.438596f,
+ 0.019016f, -0.015837f, 0.600197f, 0.429773f, 0.315026f, 0.319667f,
+ 0.214617f, -0.017316f, 0.270257f, -0.040524f, 0.695803f, -0.015223f,
+ -1.554965f, 0.356997f, -1.472428f, 0.024637f, -0.562958f, 0.870351f,
+ 0.193635f, 0.036063f, 0.328638f, 0.200274f, -1.634707f, 0.110534f,
+ 0.420104f, -0.072042f, -0.006404f, 0.171680f,
+};
+
+static const float av1_ab_partition_nn_bias_128_layer0[64] = {
+ 0.643147f, -1.348826f, 0.431627f, 0.000000f, 0.102717f, -0.772628f,
+ -0.034351f, -0.761977f, -0.638397f, 0.541969f, -0.391311f, 0.563076f,
+ 0.148553f, 0.267217f, -0.788092f, 0.544573f, -0.546280f, 0.000000f,
+ -0.446945f, 0.127732f, 0.270624f, -0.219435f, -1.220203f, 0.324584f,
+ 0.110885f, 0.276547f, 0.179726f, -0.375160f, 0.026401f, -0.032595f,
+ 0.000000f, -0.047932f, -0.648602f, -0.512637f, -0.031661f, -0.236761f,
+ 0.476453f, -0.028021f, -0.013673f, -0.015578f, -0.920077f, 0.000000f,
+ 0.915351f, -0.209962f, 0.000000f, -0.025731f, 0.218288f, 0.000000f,
+ 0.047726f, -0.813077f, -1.263281f, 0.239087f, 0.278614f, -0.030753f,
+ 0.000000f, 0.346744f, -0.948543f, -1.174211f, 0.216377f, 0.498913f,
+ 0.853918f, 0.002504f, -0.190403f, 0.452050f,
+};
+
+static const float av1_ab_partition_nn_weights_128_layer1[64 * LABEL_SIZE] = {
+ 0.179769f, 1.499417f, -0.445135f, -0.142278f, -0.337661f, 0.682064f,
+ -0.203213f, 0.302171f, 0.226877f, -0.422169f, 1.687586f, 0.783773f,
+ 0.220995f, 0.253482f, 0.370435f, -1.342775f, 0.337229f, -0.271473f,
+ 0.291796f, 1.362227f, -1.751397f, -0.086178f, 0.725496f, -0.118597f,
+ 0.227963f, -0.501577f, 0.223849f, -0.122421f, -0.123437f, -0.051045f,
+ -0.020115f, 0.212711f, 0.246025f, 0.088120f, -0.168995f, 1.740190f,
+ -0.195098f, 0.680339f, -0.589572f, -0.075244f, 0.878766f, 0.064092f,
+ -3.548527f, 0.001660f, 0.107926f, -0.169501f, -0.455212f, 0.123045f,
+ -1.836998f, 0.330365f, 1.301475f, 0.454761f, -0.576552f, -0.190761f,
+ 0.208459f, 0.618483f, 1.383364f, 0.970718f, 0.390174f, 0.406252f,
+ -0.564519f, -0.312062f, 1.345712f, -0.151873f, 0.109290f, 0.408847f,
+ 0.391243f, 0.152024f, 0.181764f, -0.036263f, -0.160466f, 0.153595f,
+ 0.049163f, -0.753012f, -1.804062f, 0.347475f, -2.746580f, 0.575618f,
+ 0.261799f, 0.210505f, -0.302054f, -0.109872f, 0.199506f, -1.182971f,
+ 0.723668f, 0.177758f, -0.338202f, 0.254396f, -0.220023f, 0.043504f,
+ 0.669866f, -0.040816f, -0.402730f, 0.017990f, 0.215523f, -0.216816f,
+ 0.454826f, -0.726067f, -0.018750f, -0.928679f, 0.154315f, -0.465641f,
+ 0.144566f, -0.030064f, -0.054667f, -0.154055f, 0.625384f, 1.323795f,
+ -0.159496f, 0.097072f, -0.463197f, -0.057938f, 0.750290f, -0.233061f,
+ 0.412631f, -0.535223f, -0.151423f, -0.154583f, 0.024721f, -0.494448f,
+ 0.230594f, -0.980138f, -0.653968f, 0.126079f, 0.051814f, -0.053219f,
+ -0.421708f, -0.228853f, 0.237885f, 0.888157f, 0.059655f, 0.241295f,
+ 0.210443f, 0.228238f, 0.119127f, -0.051989f, -0.355408f, 0.182215f,
+ 0.244277f, -0.104577f, -0.558035f, -0.023270f, 0.054571f, 0.700646f,
+ -0.223006f, 0.115523f, 0.023391f, 0.437264f, 0.709477f, -0.531212f,
+ -0.094731f, 0.328161f, -0.105418f, -0.133511f, 0.497168f, -0.030948f,
+ -0.407132f, -0.043943f, 0.155505f, 0.251945f, 0.205010f, 0.167160f,
+ 0.083654f, -0.636810f, 0.401315f, -0.398414f, 0.290046f, 0.206846f,
+ 0.042218f, 0.168150f, 0.843181f, -0.671242f, -0.202392f, -0.073301f,
+ 0.142895f, 0.237466f, 0.212145f, -0.091828f, 0.187038f, -0.720841f,
+ -0.616069f, -0.238021f, 0.065365f, 0.434119f, 0.179023f, -0.040107f,
+ -0.430734f, -0.297368f, 0.575954f, 0.382619f, -0.709787f, -0.320810f,
+ 0.242342f, -0.047614f, 0.705216f, 0.098077f, 0.357179f, 0.046017f,
+ 0.115074f, -0.412305f, -0.272304f, 0.048096f, -0.803811f, 0.275000f,
+ 0.642198f, 0.180286f, -0.087178f, -0.112707f, -0.394443f, 0.201989f,
+ 0.241759f, -1.038870f, 0.728124f, 0.800559f, -1.296268f, 0.198612f,
+ -0.053478f, 0.414344f, -0.510529f, 0.124179f, -2.219115f, -0.074583f,
+ -0.143055f, 0.001697f, 0.810811f, -0.657140f, 0.186818f, -0.936414f,
+ 0.539578f, -0.308244f, -0.126624f, -0.204767f, 0.091145f, -0.049340f,
+ 0.252014f, 0.394582f, 0.018764f, -0.060377f, -0.019133f, 0.064083f,
+ 0.069211f, -0.526693f, 0.209850f, -0.481466f, -0.468302f, -0.100407f,
+ 0.241018f, -1.037781f, 0.038539f, -2.113840f, -0.974895f, 0.163187f,
+ 0.425132f, -0.772546f, -1.261254f, -0.217488f, -0.971748f, -0.805640f,
+ -0.745175f, -0.177077f, 0.217658f, 0.381431f, -0.052338f, 0.087176f,
+ -0.165972f, 0.085937f, 0.472564f, -0.796627f, -2.453307f, 0.569664f,
+ -0.233010f, -0.192134f, 0.064339f, -0.111411f, -0.262469f, -0.410022f,
+ 0.519993f, -0.684620f, 0.393460f, -0.277753f, -0.153624f, 0.528984f,
+ -0.415558f, -0.445863f, 0.588512f, -0.142439f, -0.132127f, 0.199776f,
+ -0.579284f, 0.119488f, -0.033590f, -0.503846f, -0.674979f, 0.335125f,
+ 0.020519f, 0.233973f, -0.297998f, -0.051511f, 0.518626f, -0.412782f,
+ -0.074045f, 0.130523f, 0.465751f, -0.117795f, 2.535813f, 0.352108f,
+ -0.499228f, 0.379784f, 0.056699f, 0.173142f, -0.076519f, -0.026666f,
+ 0.017834f, 0.492333f, 0.093364f, 0.037867f, -0.165420f, -0.356429f,
+ -0.562334f, 0.057656f, -0.307544f, 0.085857f, -0.559851f, 0.107230f,
+ -0.398633f, 0.152618f, -0.216835f, -0.024539f, 0.026044f, -0.249519f,
+ -0.563594f, -0.746025f, 0.025265f, -0.298888f, -0.185243f, 0.058794f,
+ 0.233696f, -0.115223f, 0.144617f, -0.864390f, 0.619944f, -0.023980f,
+ 0.019481f, 0.225252f, 0.416552f, -0.115993f, 0.935387f, 0.744386f,
+ 0.053353f, -0.052582f, -0.065650f, 0.228488f, -0.032042f, -0.371252f,
+ -0.003638f, -0.736984f, -0.203776f, 0.030922f, -0.065577f, -0.031643f,
+ -0.049253f, -0.054640f, 0.787134f, 0.545414f, -0.140297f, -0.124274f,
+ -0.110011f, -0.029552f, 0.657005f, 0.214973f, -0.374300f, 0.251642f,
+ 0.276591f, 0.030566f, -0.145470f, 0.350579f, -0.356436f, -0.052694f,
+ -0.063966f, -0.751008f, -1.042392f, 0.328892f, -0.425058f, -0.421571f,
+ -0.571889f, -1.141472f, -0.125216f, 0.212713f, -0.485170f, -0.088791f,
+ 0.124589f, 0.023237f, 0.077635f, 0.020901f, -0.271402f, -0.321424f,
+ -0.513946f, -0.867872f, -0.284593f, 0.106276f, 0.220192f, -0.143532f,
+ -0.014648f, 0.073402f, 0.327256f, -0.139803f, 0.168763f, 0.048199f,
+ -0.122526f, 0.111713f, -0.134257f, 0.810364f, -0.085222f, -0.259221f,
+ -0.239349f, 0.044448f, 0.205031f, 0.413113f, -0.107720f, -0.018816f,
+ -0.247741f, -0.004963f, 0.041170f, -0.158019f, 0.134839f, 0.129502f,
+ 0.800488f, -1.041584f, -0.129336f, 0.170834f, 0.566586f, -0.230443f,
+ 0.437937f, -0.149922f, -0.046665f, -0.094646f, 0.200070f, 0.072943f,
+ -0.076943f, -0.084971f, -0.515843f, -0.146720f, 0.472869f, -0.444731f,
+ -0.100877f, 0.545196f, -1.786626f, -0.482946f, 0.500509f, -0.843257f,
+ 0.200374f, 0.045103f, -0.575718f, -0.164335f, -0.232522f, -0.021825f,
+ -0.139490f, 0.356058f, -0.352075f, 0.061751f, -0.200616f, -1.180921f,
+ -0.181355f, -0.137459f, 0.247574f, 0.181541f, 0.184314f, -0.961482f,
+ 0.493615f, 0.910261f, -2.279238f, 0.648631f, -0.055526f, -0.037137f,
+ 0.038643f, 0.136609f, -0.819373f, -0.040840f, -0.265989f, 0.006877f,
+ 0.454651f, -0.595323f, -0.099500f, -0.263717f, 0.150456f, 0.245077f,
+ -0.268666f, 0.162232f, -0.516451f, -0.024501f, 0.188046f, -0.002262f,
+ 0.261319f, 0.004173f, 0.746982f, 0.174761f, 0.470447f, -0.159558f,
+ -0.385240f, 0.023084f, -0.133520f, -0.220607f, -0.018731f, -0.373558f,
+ -0.707763f, -1.850150f, -0.807404f, -0.168063f, -0.071435f, -0.160740f,
+ -0.478789f, -1.070674f, -0.489740f, -0.255796f, 0.100486f, -0.153361f,
+ 0.334394f, -0.569472f, -0.198118f, 0.255922f, 0.104717f, -0.065179f,
+ 0.111879f, -0.447237f, 1.373623f, -0.190191f, -0.063311f, 0.337529f,
+ -0.138800f, 0.057009f, -0.137006f, 0.641378f, 0.883147f, -0.679655f,
+ 0.267717f, -0.351602f, -0.135225f, 0.229398f, -0.513225f, -1.120345f,
+ 0.528786f, -0.051081f, 0.086653f, 0.140141f, -0.563969f, 0.333402f,
+ -0.174745f, 0.321093f, -0.438641f, -0.005131f, 0.247415f, 0.110120f,
+ -0.076308f, -0.083244f, 0.838944f, -0.113043f, -0.013258f, -0.175028f,
+ -0.179941f, 0.272676f, -0.047946f, -0.088076f, -0.450031f, 0.053929f,
+ -0.083549f, -0.089952f, -0.186253f, 0.257483f, 0.011019f, 0.586435f,
+ 0.060580f, -0.052078f, 0.090277f, -0.780869f, 0.969811f, -0.025349f,
+ -0.281917f, 0.014857f, 0.231863f, -0.228601f, -0.003861f, 0.226550f,
+ 0.141825f, -0.102171f, -0.010387f, 0.220378f, -2.561975f, -0.497071f,
+ -0.315117f, 0.371981f, 0.138247f, 0.625031f, -0.308133f, -0.217876f,
+ 0.005615f, -0.860179f, 0.747491f, 0.006356f, -0.057024f, -0.483189f,
+ 0.055592f, -0.316834f, 0.069858f, 0.218788f, -0.200044f, 0.227588f,
+ 0.215496f, -0.055324f, -0.393147f, -0.394062f, -0.253264f, -0.075619f,
+ -0.152512f, -0.332995f, 0.129053f, 0.178668f, -0.302694f, 0.030678f,
+ 0.925896f, 0.964375f, 0.169021f, -0.218657f, -0.627204f, 0.206437f,
+ -0.521336f, 0.176206f, 0.142733f, 0.139248f, 0.411682f, 0.181544f,
+ 0.224850f, -0.935547f, -0.558208f, 0.348096f, 0.342129f, -0.389340f,
+ -0.236308f, -0.132099f, 0.073642f, 0.089391f, -0.306901f, -0.397842f,
+ 0.444282f, 0.074623f, -0.051075f, -0.106617f, -0.184037f, -0.239046f,
+ -0.138761f, 0.120794f, -0.647577f, -0.336471f, 0.527899f, -0.164234f,
+ -0.028354f, 1.083678f, -0.251534f, -0.145903f, -0.182783f, 0.070976f,
+ -0.199590f, -0.400306f, -0.029763f, -0.548042f, -0.266270f, -0.118084f,
+ -1.152632f, 0.383685f, -0.105895f, -0.096829f, 0.118382f, 0.047447f,
+ -0.019051f, 0.310180f, -0.162793f, -0.029574f, 0.058054f, -0.636017f,
+ 0.490639f, 0.158347f, -0.385701f, -0.147057f, 1.285825f, -1.276083f,
+ -0.021795f, -0.101600f, 0.163254f, 0.267160f, -2.317864f, -0.098598f,
+ -0.296337f, -0.309017f, 0.164127f, -0.270012f, -0.071187f, -0.262270f,
+ 0.075415f, -0.368328f, 0.186728f, -0.158031f, 0.481663f, 0.515950f,
+ -0.162551f, 0.497981f, 0.262196f, 0.168479f, 0.726066f, -0.243856f,
+ -0.058998f, 0.140168f, 0.053242f, -0.624623f, -0.249480f, 0.055197f,
+ -1.376804f, 0.417571f, 0.203784f, 0.174370f, -0.155531f, -0.029400f,
+ -0.491473f, 0.079811f, -0.080123f, 1.345900f, 0.637077f, 0.434862f,
+ -1.787438f, 0.005756f, -0.362706f, 0.179458f, -0.288263f, 0.516788f,
+ -0.921248f, 0.043794f, -0.137729f, -0.196171f, -0.046295f, -0.793781f,
+ -0.156532f, -0.132566f, 0.517989f, -0.154321f, -0.054174f, -0.077900f,
+ -0.373316f, -0.117718f, 0.188986f, -0.476188f, -0.245312f, 0.181439f,
+ -0.161024f, -0.229059f, -3.079907f, -0.225452f, -0.594355f, -0.558027f,
+ -0.135429f, 0.125766f, -0.081314f, -0.350894f, -0.163165f, -1.936507f,
+ -0.205966f, 0.031472f, 0.744446f, -0.006680f, -0.837551f, 0.605862f,
+ -0.854929f, -1.543750f, -0.307704f, -0.240517f, 0.178240f, -0.183586f,
+ -0.010307f, 0.099373f, -0.228278f, 0.175236f, -0.000133f, 0.104491f,
+ -1.540545f, -0.570971f, -0.252885f, 0.483036f, 0.052531f, 0.260214f,
+ -0.515016f, -0.602081f, -0.485690f, -0.730710f, 0.163719f, -1.775975f,
+ -0.298634f, 0.323626f, -0.373579f, -0.872977f, 0.619574f, 0.026862f,
+ -0.122531f, -0.084698f, -2.436297f, 0.483996f, -0.203640f, -0.302157f,
+ -0.150666f, -0.238320f, 0.089250f, 0.236485f, -0.668654f, -0.122863f,
+ 0.491152f, -0.226444f, -0.181248f, 0.120158f, 0.294027f, 0.250056f,
+ 0.307601f, 0.357875f, -1.746455f, -0.175670f, 0.385447f, -0.108808f,
+ -0.090235f, -0.642504f, -0.486004f, -0.055160f, -0.068692f, 0.009736f,
+ 0.607555f, -0.489426f, 0.150624f, 0.598114f, -0.128816f, -0.445793f,
+ -0.066524f, -0.254380f, 0.227106f, -0.406495f, -0.121632f, -0.275960f,
+ -0.136494f, 0.339457f, -1.318132f, -0.417572f, -2.614077f, 0.324603f,
+ -0.001211f, 0.375192f, -0.473448f, -0.162510f, 0.099329f, -0.277965f,
+ 0.101221f, -0.060263f, 0.121867f, -1.042140f, 0.440851f, 0.078898f,
+ -0.209007f, -0.243699f, 0.715197f, -0.093997f, 0.086022f, -0.178203f,
+ -2.275496f, -0.098413f, 0.199352f, -0.526791f, -0.162086f, -0.197806f,
+ -0.231657f, -0.269202f, -0.794294f, -0.223461f, 0.503584f, 0.416236f,
+ 0.064082f, 0.197655f, 0.340871f, -0.186645f, -0.291498f, 0.433938f,
+ -1.110063f, 0.003751f, 0.392738f, 0.069360f, 0.102088f, -0.302128f,
+ -1.518457f, 0.106939f, 0.404527f, -0.306868f, -0.286928f, 0.729276f,
+ -0.531710f, 0.745048f, -0.168837f, -1.953886f, -0.258828f, -0.190252f,
+ 0.241877f, -0.916744f, -0.030326f, -0.070541f, -0.271037f, 0.211303f,
+ -0.489957f, 0.100850f, 0.323999f, -0.802837f, -0.462408f, -0.079350f,
+ -0.029374f, 0.131213f, -0.825032f, 0.040202f, 0.351821f, 0.002869f,
+ -0.132516f, -0.471264f, -0.297002f, 0.263913f, 0.033478f, 0.146161f,
+ 0.533229f, -0.228608f, -0.200639f, -0.170955f, -0.915037f, 0.724491f,
+ 0.005151f, 0.018584f, -0.029771f, -0.396038f, -0.159236f, 0.038691f,
+ -1.197056f, 0.146302f, 0.226840f, -0.852126f, 0.031214f, 0.108880f,
+ 0.562000f, -0.134633f, -0.713343f, -0.342252f, -1.764521f, -0.114653f,
+ 0.515073f, -0.080515f, -0.121155f, -0.865139f, -0.833694f, -0.368553f,
+ 0.347673f, 0.623379f, 0.722067f, -0.492458f, -0.513263f, 0.585167f,
+ 0.721518f, -0.693499f, 0.343725f, -0.273861f, -0.040230f, -0.785664f,
+ -0.157500f, -0.308445f, 0.054062f, 0.600131f, -0.860887f, 0.434470f,
+ -0.191382f, -0.306150f, -0.243965f, 0.705444f, 0.007789f, -0.146154f,
+ -0.054499f, -0.073500f, -1.067364f, 0.404936f, -2.864590f, 0.182323f,
+ 0.326126f, 0.102405f, -0.135800f, 1.128095f, -0.012267f, -0.023996f,
+ -0.264834f, -0.108967f, -1.176746f, -0.926666f, 0.082999f, -0.498361f,
+ 0.083560f, -0.210074f, 0.019225f, -0.201614f, -0.904760f, 0.181421f,
+ 0.586384f, -0.177706f, 0.065471f, 0.168552f, 0.054705f, 0.045241f,
+ 0.048057f, -0.410957f, -2.188854f, -0.169812f, 0.015521f, 0.176856f,
+ -0.179331f, -0.352640f, -0.491735f, -1.743206f, 0.044227f, 0.010454f,
+ 0.823643f, -0.119781f, -0.098359f, 0.093119f,
+};
+
+static const float av1_ab_partition_nn_bias_128_layer1[LABEL_SIZE] = {
+ -0.433195f, -0.120488f, -0.116721f, 0.112134f, 0.118170f, -0.259769f,
+ -0.077530f, 0.394044f, 0.279167f, -0.317988f, 0.189538f, 0.314776f,
+ 0.325655f, -0.107123f, 0.591049f, 0.358744f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_128_layer0,
+ av1_ab_partition_nn_weights_128_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_128_layer0,
+ av1_ab_partition_nn_bias_128_layer1,
+ },
+};
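+
+// Hypothetical usage sketch (ab_partition_scores_sketch is not an encoder
+// function): assuming av1_nn_predict() from av1/encoder/ml.h keeps its
+// (input, config, reduce_prec, output) signature, the config above turns
+// FEATURE_SIZE block features into LABEL_SIZE raw partition scores that the
+// caller can threshold to prune AB partition candidates.
+static inline void ab_partition_scores_sketch(
+    const float features[FEATURE_SIZE], float scores[LABEL_SIZE]) {
+  av1_nn_predict(features, &av1_ab_partition_nnconfig_128, 1, scores);
+}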
+
+// nn model for ab partition pruning, 64x64.
+static const float av1_ab_partition_nn_weights_64_layer0[FEATURE_SIZE * 64] = {
+ -0.495347f, -0.049498f, -0.026804f, 0.030474f, -0.289308f, -0.264193f,
+ -0.141121f, -0.072562f, -0.391665f, -0.051491f, -0.234761f, 0.027155f,
+ -0.038217f, 0.014872f, -0.289728f, -0.233577f, -0.415875f, -0.343615f,
+ -0.442543f, -0.482492f, 0.073510f, 0.007503f, 2.162329f, -0.362849f,
+ 2.145915f, -0.883135f, 0.185636f, -0.062859f, -0.465574f, -0.486205f,
+ -0.056710f, -0.330642f, -0.321860f, 0.042321f, -0.348965f, 0.003542f,
+ -0.291365f, -0.078164f, -0.345093f, -0.220272f, -0.471270f, -0.763853f,
+ 0.246622f, 0.199651f, -0.663420f, -0.154152f, -1.220383f, 0.047138f,
+ 0.816811f, 0.083247f, -0.218839f, 0.038143f, -0.063436f, 0.015517f,
+ -0.307320f, -0.166956f, -0.169499f, -0.399005f, -0.234638f, -0.162266f,
+ 0.050425f, -0.221723f, -0.256942f, -0.287285f, 0.144011f, -0.033245f,
+ 0.083649f, 0.119428f, -0.056706f, -0.117805f, 0.021866f, -0.257300f,
+ -0.201378f, -0.217484f, -0.413780f, -0.145793f, 0.082792f, -0.347247f,
+ 0.042539f, -0.302697f, 1.652316f, 0.000701f, -0.482843f, -0.160332f,
+ -0.450099f, 0.212399f, -4.715360f, -5.336774f, -5.375758f, -6.048339f,
+ 0.085956f, -0.037767f, 1.052409f, -0.931924f, -2.221907f, 0.268946f,
+ 0.015512f, 1.237094f, -1.092185f, 0.418247f, -0.082143f, -0.076914f,
+ -0.060749f, -0.325440f, -0.296960f, -0.066815f, -0.158477f, -0.373945f,
+ -0.122322f, -0.113495f, -0.097978f, -0.192816f, -0.270418f, 0.035840f,
+ -0.015458f, -0.121071f, -0.279582f, -0.067683f, 0.097855f, 0.019839f,
+ 0.451127f, 0.004376f, 1.410392f, 3.255835f, -0.344815f, 0.145202f,
+ 0.204132f, 0.171948f, -0.527736f, -0.110353f, 0.901448f, 0.003238f,
+ -3.822090f, 0.235462f, 1.024823f, -0.821244f, 0.876056f, 2.553762f,
+ -3.478597f, -2.076582f, -0.265515f, -0.055923f, -0.156980f, -0.164097f,
+ -0.246040f, 0.039430f, -0.071769f, -0.118847f, -0.304053f, -0.281541f,
+ -0.226021f, -0.263091f, -0.127359f, -0.249410f, -0.051023f, 0.083911f,
+ 0.084721f, 0.168089f, -0.272169f, -0.204998f, -0.008303f, -0.173998f,
+ 0.079376f, -0.197426f, -0.199052f, -0.118794f, -0.063753f, -0.094769f,
+ 0.066176f, -0.175832f, -0.238752f, -0.287960f, -0.134307f, -0.185953f,
+ -0.385845f, 0.119769f, -0.006567f, -0.382126f, -0.214221f, 0.038449f,
+ -0.253484f, -0.282766f, -0.020249f, -0.193929f, 0.016281f, -0.114423f,
+ -0.145940f, -0.281621f, -0.007588f, -0.131470f, -0.189012f, -0.185699f,
+ -0.279011f, -0.008132f, 0.208463f, 0.020569f, -0.206803f, -0.213408f,
+ -0.206131f, -0.290245f, 0.069701f, -0.000371f, -0.307572f, -0.451785f,
+ -0.300838f, -0.453186f, -0.301691f, 0.046327f, -0.312668f, 0.058272f,
+ -0.303131f, -0.376252f, 0.108384f, -0.086623f, -0.100630f, -0.027330f,
+ -0.003969f, 0.089502f, -0.200722f, -0.107889f, 0.061843f, -0.008478f,
+ -0.265057f, -0.271132f, -0.073562f, 0.129337f, -0.283698f, -0.353414f,
+ 0.076420f, -0.244280f, -0.119537f, -0.105366f, -0.184692f, -0.038817f,
+ -0.478507f, -0.118808f, -0.472979f, -0.305884f, -0.462813f, -0.189581f,
+ -0.011932f, -0.585700f, 0.253212f, -1.061900f, -0.205116f, -0.336407f,
+ -0.762199f, 0.577737f, 0.230832f, 0.434440f, -0.096713f, 0.038552f,
+ -0.147800f, -0.213553f, 0.041740f, -0.281907f, -0.026154f, -0.082356f,
+ -0.331871f, -0.408247f, -0.129022f, -0.037550f, -0.310233f, -0.320883f,
+ -0.391963f, -0.467392f, 0.027453f, -0.394761f, -0.045544f, 0.076052f,
+ 0.483985f, 0.067093f, 0.141361f, 0.576772f, 0.859718f, 2.566515f,
+ -0.025476f, 0.769738f, -0.680235f, -1.683309f, -2.394131f, -0.000714f,
+ -0.615021f, -0.195856f, -0.434035f, -0.295010f, -0.668659f, -0.245959f,
+ 0.551148f, 1.777227f, -0.461630f, 0.043093f, 0.012293f, -0.255841f,
+ -0.097070f, -0.371156f, -0.146323f, -0.015508f, -0.103873f, -0.087476f,
+ -0.297266f, -0.128699f, -0.149555f, 0.016534f, -0.375498f, -0.346759f,
+ -0.455156f, -0.147509f, -0.427076f, -0.354431f, -0.158025f, -0.164604f,
+ -0.237038f, -0.010314f, -0.092884f, -0.397084f, -0.217980f, -0.127184f,
+ -0.048421f, -0.144133f, 0.889073f, 0.012606f, 3.007608f, -0.602584f,
+ -1.849480f, -0.373159f, -1.890695f, -3.609938f, 0.811923f, -1.867208f,
+ -0.244326f, -0.018012f, -0.211192f, -0.220196f, 0.169363f, 0.119141f,
+ -0.230715f, 0.083247f, 0.020367f, -0.128629f, -0.217455f, -0.159640f,
+ 1.815952f, -0.369238f, -1.186447f, -0.658753f, -0.511026f, -0.096934f,
+ 0.662971f, 0.486475f, 0.159746f, -0.018932f, 3.692397f, 1.384353f,
+ -0.401984f, -0.248380f, -0.140861f, 0.215248f, -0.023711f, 0.059679f,
+ -0.072260f, 0.004271f, 0.039545f, -0.347971f, -0.081851f, -0.474896f,
+ -0.181572f, 0.066736f, -0.157822f, -0.163760f, -0.171113f, -0.089935f,
+ -0.338281f, -0.421444f, -0.306687f, -0.085283f, -0.377953f, -0.138750f,
+ -0.102701f, -0.312336f, 0.149831f, 0.007229f, -0.155700f, -0.173611f,
+ 4.074261f, 1.342306f, -1.272712f, 1.570899f, -0.545093f, -0.317605f,
+ -0.189440f, -0.133910f, -0.273190f, -0.108020f, -0.166107f, 0.021413f,
+ -0.239130f, -0.067211f, 0.041957f, -0.039234f, -1.003587f, -0.094412f,
+ 0.532512f, -0.870538f, -1.118023f, -1.160983f, -0.736307f, -0.418752f,
+ 0.419466f, 0.492122f, -0.004368f, -0.022096f, -1.115132f, 0.150886f,
+ 2.396852f, 2.660000f, -0.376537f, 0.468628f, 0.149413f, -0.074898f,
+ -0.067154f, 0.021245f, 0.127857f, 0.294189f, 0.508056f, 0.390232f,
+ -3.899177f, -3.414681f, -3.929195f, -4.160545f, -0.274323f, -0.052583f,
+ -0.003545f, -0.433084f, -0.404891f, -0.145051f, -0.312367f, 0.004579f,
+ -0.398724f, -0.372068f, -0.234279f, 0.017799f, -0.424760f, -0.646717f,
+ -0.047568f, 2.924664f, -0.644165f, 0.359349f, -0.294800f, 0.591746f,
+ -0.404710f, -0.092358f, -0.250729f, 0.030829f, -0.147149f, -0.476023f,
+ -0.071803f, -0.482516f, -0.293117f, -0.215923f, -0.373122f, -0.085315f,
+ -0.377052f, -0.449899f, -0.056452f, 0.138081f, -0.085350f, -0.308391f,
+ 0.106661f, 0.176234f, 0.258869f, -0.230172f, -0.233029f, -0.241208f,
+ -0.067509f, -0.223172f, -0.118353f, -0.302478f, -0.579632f, -0.561326f,
+ -0.158114f, -0.223167f, -0.026689f, 0.051863f, 0.212834f, -0.304714f,
+ -0.169071f, -0.193695f, -0.075682f, -0.170860f, -0.241008f, -0.044648f,
+ 0.280815f, -0.002585f, -0.283552f, -0.037701f, -0.681169f, -0.274535f,
+ -0.380595f, 0.109504f, -0.111141f, -0.437685f, -0.094459f, 0.144206f,
+ -0.106139f, -0.211832f, -0.054742f, -0.172813f, -0.295905f, -0.071907f,
+ -0.418429f, -0.183240f, 0.031319f, -0.095785f, -0.315447f, 0.069404f,
+ -0.422910f, -0.029867f, -0.357321f, -0.199976f, -0.337707f, -0.070188f,
+ -0.178198f, 0.177208f, 0.134688f, -0.081933f, -0.229452f, -0.208872f,
+ 0.026287f, -0.364040f, -0.063696f, -0.227443f, -0.234401f, -0.205699f,
+ -0.267238f, -0.494125f, -0.056255f, 0.053715f, -0.487754f, 0.014818f,
+ 0.087383f, -0.077556f, -0.168085f, -0.436851f, -0.276286f, -0.137845f,
+ -0.107606f, -0.103653f, -0.233766f, -0.419083f, 0.169185f, 0.010186f,
+ -0.001587f, 0.086735f, -2.465718f, 1.482185f, 1.621193f, -2.081680f,
+ 1.386553f, -3.204335f, -0.267111f, -0.004508f, 0.164712f, 0.274147f,
+ 1.724306f, -2.273659f, 0.749574f, -0.891905f, 0.105965f, -0.030428f,
+ -0.416018f, -0.300762f, 0.122911f, -0.316908f, -0.292504f, 0.138666f,
+ -0.161327f, -0.042143f, -0.249128f, 0.149210f, -0.088987f, -0.654101f,
+ -1.501843f, 0.216777f, 0.955914f, 0.524158f, -1.642561f, -1.643626f,
+ 0.864797f, -0.425451f, -2.115764f, -0.012502f, 0.065172f, 1.297270f,
+ 0.018845f, 1.167276f, -0.470970f, -0.244995f, 0.374782f, -1.811056f,
+ -0.055430f, -0.024102f, -0.376519f, -0.339640f, -0.119177f, -0.277995f,
+ -0.290095f, -0.081362f, -0.144139f, -0.118037f, -0.180357f, -0.217559f,
+ -0.370683f, 0.172816f, -0.265069f, 0.194321f, -0.273478f, 0.037442f,
+ -0.235552f, -0.078625f, -0.447541f, 0.016836f, -0.271123f, -0.171481f,
+ -0.321477f, -0.184826f, -0.442981f, -0.227273f, -0.370666f, -0.237232f,
+ -0.257493f, -0.225714f, -0.153716f, -0.283487f, -0.155399f, 0.067697f,
+ 0.230343f, -0.034318f, -0.022687f, -0.047090f,
+};
+
+static const float av1_ab_partition_nn_bias_64_layer0[64] = {
+ -0.212182f, -0.233725f, -0.758846f, -0.158162f, 0.614743f, -0.150944f,
+ -0.075727f, -0.208414f, 1.054996f, 0.713758f, -0.300051f, -0.151482f,
+ -2.443570f, 0.430590f, -0.129001f, -0.160733f, -0.230547f, -0.143228f,
+ -0.140577f, -0.086812f, -0.212298f, -0.159557f, -0.055647f, -0.211423f,
+ 0.578161f, -0.220318f, -0.210107f, -3.111584f, 0.604419f, -0.232622f,
+ -0.209924f, -0.130794f, -0.084097f, -0.036005f, 0.294594f, -2.535531f,
+ -0.209783f, -0.211189f, -2.766337f, 0.000000f, 0.450177f, -1.754884f,
+ 3.262664f, -0.209691f, -0.614886f, -0.211257f, -0.109096f, -0.190492f,
+ -0.109007f, -0.026910f, -0.136035f, -0.212321f, -0.139320f, -0.212233f,
+ -0.305430f, 0.739171f, 0.991277f, -0.088150f, 0.086313f, -0.023379f,
+ -0.125366f, -0.063576f, -0.212169f, -0.047463f,
+};
+
+static const float av1_ab_partition_nn_weights_64_layer1[64 * LABEL_SIZE] = {
+ -0.036800f, 0.528721f, 0.490767f, 0.144409f, 1.103640f, 0.361910f,
+ -0.180069f, 0.068033f, -14.868382f, 0.359013f, 0.322567f, -0.199212f,
+ 0.906164f, -0.488254f, 0.149653f, -0.216394f, -0.099347f, 0.004936f,
+ -0.111391f, 0.074848f, -0.041709f, 0.147627f, -0.018905f, 0.096116f,
+ 0.184817f, -0.016241f, 0.115739f, 2.376754f, 0.637097f, 0.052954f,
+ 0.136428f, 0.225267f, -0.181873f, -0.142876f, 0.684048f, 0.658791f,
+ 0.105795f, 0.241705f, 1.381114f, -0.209379f, 1.145949f, 0.795293f,
+ -9.361877f, 0.198302f, 0.539600f, 0.092317f, -0.081695f, 0.200777f,
+ 0.102334f, 0.081583f, 0.060948f, -0.025110f, 0.160951f, -0.020170f,
+ 0.234006f, -0.029369f, 0.375036f, 0.270209f, -0.556529f, 1.402949f,
+ 0.101777f, -0.027331f, 0.004502f, -0.153166f, -0.116651f, 0.151573f,
+ -0.022187f, 0.144044f, -0.108719f, -0.129942f, -0.270321f, 0.227363f,
+ 1.892330f, -0.661052f, -0.219398f, -0.229417f, -0.856438f, -1.196988f,
+ -0.081774f, 0.078847f, -0.207057f, -0.048947f, 0.152073f, -0.243056f,
+ -0.233329f, -0.288689f, -0.158333f, -0.141177f, -0.715436f, 0.016947f,
+ -0.093752f, 0.204984f, -1.209782f, 0.155683f, 0.092239f, 0.146495f,
+ 0.813146f, -0.027757f, 0.330982f, 2.173948f, -0.028867f, -0.141815f,
+ 0.292708f, -0.204794f, 0.014496f, 1.032799f, 1.312155f, 0.107020f,
+ 0.824752f, -0.013945f, 0.184829f, -0.041633f, 0.215300f, -0.476088f,
+ -0.053213f, 0.126862f, -0.020777f, 0.082893f, -0.223727f, -0.923063f,
+ 0.466529f, 0.082140f, -0.845758f, -1.140791f, -0.262033f, 0.138491f,
+ 0.151717f, -0.182479f, -0.131128f, 0.055411f, 0.106771f, 0.125552f,
+ 0.297184f, -0.257403f, -0.059884f, -0.274903f, 2.694357f, -0.108244f,
+ 0.025377f, 0.043092f, -0.558317f, 3.517159f, -0.270833f, -0.240676f,
+ 0.205100f, -0.057068f, -0.140445f, -0.193449f, -0.030061f, -0.286762f,
+ -0.467523f, -0.012647f, 0.190564f, 0.022394f, -0.101479f, 0.339684f,
+ -0.902743f, -0.169578f, -0.178029f, -0.041836f, -3.952108f, -0.028298f,
+ -0.221137f, -0.733895f, -0.223895f, 0.039012f, 0.687867f, 0.021423f,
+ 0.113063f, 0.676087f, -0.961000f, -0.064847f, 0.712856f, -0.192765f,
+ -0.001132f, 0.016689f, -0.236020f, -0.766186f, -0.175729f, 0.012879f,
+ -0.251064f, -0.105523f, -0.039212f, -0.347584f, 0.304352f, -0.034174f,
+ -0.364258f, -0.685252f, -0.266115f, -0.247345f, -0.155905f, 0.152283f,
+ -0.156315f, 0.174082f, -0.757654f, 0.102303f, -2.192316f, -0.245815f,
+ 0.119882f, -0.086542f, 1.987246f, -1.353163f, -0.374813f, -0.233504f,
+ -1.980895f, 0.692093f, -0.168351f, 0.172700f, -0.009052f, -0.015734f,
+ 0.106679f, -0.060472f, -0.256813f, -0.074874f, -0.207488f, -0.329515f,
+ -0.418268f, -0.017940f, -0.036081f, 0.064719f, -1.488016f, 0.020591f,
+ -0.176325f, -0.141074f, 0.944494f, 0.150237f, -0.249805f, -0.277280f,
+ 0.012686f, 0.132483f, 0.116123f, 0.013737f, -0.116091f, 0.750340f,
+ 3.251343f, -0.188864f, 1.096992f, 0.058467f, -0.041433f, -0.037937f,
+ -0.133294f, -0.137908f, -0.171132f, 0.106362f, 0.069383f, -0.052662f,
+ -0.177883f, -0.408049f, 0.680221f, -0.117035f, -0.904240f, -1.395228f,
+ 0.154527f, 0.134427f, 0.022767f, -0.158886f, -0.230316f, 0.161096f,
+ 0.362213f, -0.235060f, -0.941620f, 0.055912f, -0.049458f, -0.166632f,
+ 0.481418f, 0.930146f, 0.041108f, 0.033674f, 1.372066f, -1.847709f,
+ 0.003324f, 0.259534f, 0.177014f, -0.202761f, -0.262017f, -0.190852f,
+ -0.102839f, 0.028338f, 0.187193f, -0.041684f, 0.123973f, -0.198576f,
+ -0.110369f, -1.431400f, 0.208369f, -0.302370f, -0.248549f, 0.062985f,
+ 0.673409f, 0.036662f, -0.711340f, -0.120584f, -0.189789f, 0.098812f,
+ 2.947819f, 0.216567f, -0.414472f, -0.181742f, 1.873779f, -0.222726f,
+ -0.782870f, 0.007889f, 0.015062f, -0.554328f, 0.182928f, -0.191430f,
+ 0.123636f, -0.215460f, -0.225245f, 0.251516f, -0.013025f, -1.359595f,
+ -0.750602f, 0.342667f, -0.141899f, -0.687493f, -0.072639f, 0.048018f,
+ -0.242107f, -0.031917f, -0.287472f, -0.046088f, 0.832197f, -0.016576f,
+ -1.553349f, -0.216341f, 0.023077f, -0.410867f, 4.243743f, -0.514878f,
+ -0.066007f, -0.160696f, -0.262678f, -0.648790f, -0.430586f, 0.199940f,
+ -0.202496f, -0.222241f, -0.016406f, -0.121473f, 0.000828f, -0.081584f,
+ -0.152641f, -0.190166f, 0.644400f, 0.040196f, -0.302104f, -1.143654f,
+ -0.160327f, -0.320780f, -0.187006f, 0.037311f, 0.440618f, -0.070733f,
+ -0.117785f, 1.527539f, -0.419310f, 0.001300f, 1.389956f, -0.036366f,
+ -0.269203f, 0.612265f, 2.721897f, -0.086836f, -0.446999f, 0.012525f,
+ -0.078317f, -0.287052f, -0.111188f, -0.085181f, -0.164667f, -0.010466f,
+ -0.569722f, -0.018888f, -0.101663f, -1.147130f, -0.465204f, 0.114524f,
+ -2.192402f, -0.221325f, 0.375748f, 0.206284f, -0.261548f, -0.246257f,
+ -0.143004f, -0.069981f, -0.057306f, -0.116481f, -0.435903f, -0.314970f,
+ 0.013210f, -0.010175f, 4.630571f, -0.473226f, -0.197199f, -0.028204f,
+ 0.122907f, 2.475548f, 0.025011f, -0.092603f, -0.127561f, -0.151330f,
+ -0.077295f, 0.245016f, -0.045005f, 0.183396f, -0.330556f, -0.384887f,
+ 0.356374f, -0.016618f, -0.463353f, -1.291546f, -0.071986f, -0.311599f,
+ 0.072385f, -0.430786f, -2.094788f, 0.202733f, -0.910109f, -1.336543f,
+ -0.086800f, -0.096413f, 1.544383f, 0.031860f, -0.796211f, 0.762786f,
+ 3.250022f, -0.441798f, -0.698537f, 0.062839f, 0.033525f, -0.362996f,
+ 0.027022f, -1.131264f, -0.228926f, 0.053885f, -0.338628f, 0.155037f,
+ -0.046844f, -0.888172f, -0.241767f, 0.084965f, -0.617743f, -0.049896f,
+ -0.036894f, -0.304783f, -0.002639f, 0.137957f, 0.052121f, -0.131161f,
+ -0.117200f, -0.253380f, -0.205561f, -0.302450f, -0.047397f, -0.330518f,
+ 3.613420f, -1.525951f, -0.026738f, 0.209150f, -2.103534f, 2.019689f,
+ -0.366199f, -0.095260f, 0.027417f, -0.242512f, 0.162579f, 0.052113f,
+ -0.293851f, -0.068138f, -0.005799f, -0.344696f, -0.114824f, -0.431107f,
+ -0.120058f, -1.139926f, -1.048379f, 0.036446f, -0.323020f, -0.432945f,
+ 0.454151f, -0.140058f, 0.050649f, -0.094900f, -0.017278f, -0.238719f,
+ 1.193153f, 0.120447f, -0.496061f, 0.917431f, 2.936126f, -0.115521f,
+ -0.347397f, -0.435325f, -0.004383f, -0.211864f, 0.162383f, -1.040726f,
+ 0.089537f, -0.128579f, -0.133505f, 0.107129f, -0.435657f, -0.180388f,
+ 0.043650f, 0.018709f, -0.773242f, -0.687192f, -0.120633f, -0.063626f,
+ 0.029912f, 0.113972f, -0.403502f, -0.127640f, -0.269625f, 0.129794f,
+ -0.188539f, 0.041641f, 0.029769f, -0.198374f, 1.401407f, 0.353887f,
+ -0.219925f, 0.260515f, 1.157034f, -2.992044f, -0.097618f, -0.064417f,
+ -0.203626f, -0.008217f, -0.112339f, -0.227407f, -0.155118f, 0.247705f,
+ -0.012304f, -0.248447f, -0.913463f, -0.064788f, -0.214619f, -0.251761f,
+ -0.386861f, -0.040574f, -0.163219f, -0.100700f, 1.488274f, -0.071684f,
+ -0.033626f, -0.006497f, -0.246945f, -0.145221f, -3.747390f, 0.149609f,
+ -0.263326f, -0.297385f, -1.039896f, -0.083174f, -0.025473f, -0.235586f,
+ -0.001087f, 0.254286f, 0.265106f, 0.007325f, 0.199239f, 0.134103f,
+ -0.578211f, -0.259801f, -0.062373f, 2.368348f, 0.560556f, -0.252260f,
+ 0.889997f, -0.447872f, -0.059218f, -0.095315f, -0.061667f, 0.183580f,
+ -0.157479f, 0.055387f, -0.831734f, 0.007606f, -1.104906f, 0.301180f,
+ -0.117115f, 0.212959f, 4.727223f, -0.243833f, -0.397495f, -0.025021f,
+ -0.367587f, -2.082058f, -0.217699f, 0.148111f, 0.252430f, 0.111088f,
+ -0.260692f, 0.095124f, -0.407774f, -0.322169f, 0.002927f, 0.126169f,
+ -1.272325f, -0.279772f, -0.373680f, -0.485177f, -0.605458f, 0.021225f,
+ -0.092031f, -0.226585f, 1.895162f, 0.037866f, -0.275475f, 1.614360f,
+ -0.014972f, -0.277679f, -3.449082f, -0.092060f, -0.747873f, 0.020716f,
+ 2.776178f, -0.049963f, 0.183999f, -0.295259f, -0.028868f, 0.221895f,
+ 0.001265f, 0.336823f, 0.219372f, 0.112824f, 0.408132f, -0.017940f,
+ -0.311666f, 1.489606f, -0.058093f, -0.305659f, -0.491933f, -0.143847f,
+ 0.166115f, 0.042867f, -0.123447f, -0.087099f, -0.305395f, -0.365079f,
+ -0.755801f, -0.160649f, 0.736260f, -0.008611f, 0.095836f, -0.017345f,
+ 5.697515f, -0.498971f, -0.125280f, 0.199907f, 0.300053f, 0.605026f,
+ -0.228225f, -0.259523f, 0.016384f, 0.146973f, 0.210258f, 0.226766f,
+ -0.075178f, -0.050924f, 0.188496f, -0.415266f, -0.484880f, -0.236384f,
+ 0.071931f, -0.331863f, -0.601243f, -0.232479f, -0.285272f, 0.123789f,
+ -1.341333f, 0.037082f, -0.315202f, -1.587215f, -0.271576f, 0.003216f,
+ -4.437186f, -0.256205f, -0.576589f, -0.114147f, 2.153916f, -0.369618f,
+ 0.271415f, 0.145036f, -0.158731f, -0.240938f, -0.187369f, 0.036325f,
+ 0.254771f, 0.211488f, -0.240297f, 0.098417f, -0.415011f, 2.334793f,
+ -0.127252f, 0.020069f, -0.168755f, -0.448922f, -0.219207f, 0.016232f,
+ -0.221935f, -0.269500f, -0.100636f, 0.102545f, -0.809376f, -0.054979f,
+ 0.360713f, -0.326541f, 0.112933f, 0.138073f, 4.229404f, -0.763801f,
+ -0.305429f, 0.199955f, -1.787713f, 0.272866f, 0.109895f, 0.138466f,
+ -0.250259f, -0.167162f, -0.212588f, -0.217589f, -0.067125f, -0.077490f,
+ -0.208970f, -0.006863f, -0.671146f, -0.298320f, -0.165509f, 0.044597f,
+ -1.408624f, -0.213957f, -0.220947f, 0.129718f, 1.316777f, -0.098928f,
+ -0.008121f, -0.558293f, -0.297290f, -0.218873f, -4.346638f, -0.228174f,
+ -0.204710f, -0.388864f, 2.697919f, 0.025260f, 0.857020f, 0.009921f,
+ 0.036915f, -0.320275f, -0.087937f, 0.022636f, 0.236667f, 0.135496f,
+ -0.059616f, -0.192955f, 0.009470f, 2.139589f, -0.200449f, 0.129818f,
+ 1.017444f, -0.608299f, 0.257914f, -0.134306f, -0.033327f, 0.002855f,
+ -0.338598f, 0.015559f, 0.117362f, -0.166760f, 0.086903f, -0.167666f,
+ 0.193523f, 0.033852f, -1.147686f, 0.489468f, -0.006969f, 0.125630f,
+ 1.557907f, -1.604449f, -0.071114f, 0.096178f, 0.007065f, 0.200013f,
+ 0.213393f, 0.168466f, -0.100568f, -0.117861f, -0.161542f, -0.072561f,
+ -1.069871f, -0.470138f, -0.352578f, -1.503513f, -0.001394f, -0.380109f,
+ 0.065089f, -0.281668f, 0.988953f, -0.002778f, -0.659026f, -0.470692f,
+ -0.407292f, 0.011710f, -1.362085f, 0.184738f, -0.135786f, -1.374241f,
+ 4.487930f, -0.067274f, -0.956404f, -0.233995f, 0.224527f, -0.454556f,
+ 0.037900f, -0.281658f, 0.208224f, -0.254753f, 0.045740f, 0.051444f,
+ -0.388281f, 0.257112f, -0.485030f, -0.082659f, 0.148103f, -1.007456f,
+ -0.022295f, 0.036984f, -0.369401f, -0.076943f, -0.007636f, -0.293022f,
+ 0.470466f, 0.199012f, -2.158182f, 0.036577f, -0.014725f, -0.229516f,
+ 2.236929f, 0.030945f, -0.400045f, 0.109348f, 0.214691f, -0.891516f,
+ -0.251379f, -0.217358f, 0.013733f, 0.205573f, -0.151725f, -0.191782f,
+ -0.339630f, -0.163905f, -0.119191f, -0.032516f, 0.503015f, 0.025772f,
+ 0.029094f, -1.146153f, 0.216723f, -0.330023f, 0.064695f, -0.262521f,
+ 0.425612f, -0.093080f, -0.489648f, 1.051293f, -0.092332f, 0.095557f,
+ -0.874132f, 0.218483f, -0.127648f, -1.605802f, 2.763617f, -0.186734f,
+ -1.243166f, -0.193514f, -0.173748f, 0.337822f, 0.183873f, -0.251594f,
+ -0.211582f, 0.144081f, 0.029620f, -0.024853f, -0.385140f, 0.467341f,
+ -0.928316f, -0.195442f, 0.917783f, 0.357084f, 0.174445f, -0.073659f,
+ -0.012811f, -0.115420f, -0.181147f, -0.364449f, -0.567395f, -0.012969f,
+ -1.680714f, 0.065323f, 0.198063f, -0.244201f, 1.428545f, -0.432539f,
+ -0.208931f, -0.091205f, 0.957125f, 0.813519f, -0.262677f, 0.246852f,
+ 0.015536f, 0.055026f, 0.067054f, 0.262103f, -0.358115f, -0.095206f,
+ -0.267522f, -0.402710f, -0.680397f, -0.123627f, -0.385590f, -1.504680f,
+ -0.169513f, -0.215338f, 0.043633f, -0.079052f, -0.464410f, 0.122894f,
+ -0.278231f, -2.456445f, -0.159917f, -0.015597f, -0.735449f, -0.078854f,
+ -0.400290f, -1.153870f, 3.657228f, -0.287093f, -1.174355f, -0.102001f,
+ -0.288281f, 0.185209f, -0.145228f, -0.200449f, -0.099914f, -0.138354f,
+ 0.254428f, -0.161751f, -0.118206f, 0.296043f, -0.482613f, 0.080932f,
+ 1.097605f, -0.010190f, 0.232439f, 0.447617f, -0.133508f, 0.115763f,
+ -0.388589f, 0.174695f, -0.236014f, 0.006284f, -1.374129f, 0.092015f,
+ -0.241419f, -0.231667f, 2.763950f, -0.922932f, -0.061605f, 0.208740f,
+ -1.597190f, 1.353325f, -0.198528f, 0.250498f, -0.013950f, -0.203861f,
+ -0.254563f, 0.081931f, -0.413369f, 0.011844f, 0.080961f, -0.231161f,
+ -1.234909f, -0.440843f, -0.174980f, -0.315283f, -0.337474f, -0.123243f,
+ -0.310001f, -0.271028f, 0.364179f, 0.022845f, -0.535517f, -0.772936f,
+ -0.188435f, 0.039667f, -0.807463f, 0.266550f, -0.288857f, -1.630789f,
+ 1.280155f, 0.065712f, -0.279960f, -0.300056f, 0.258440f, -0.073781f,
+ 0.213878f, 0.042196f, 0.021360f, 0.211698f, -0.003751f, -0.192673f,
+ -0.137008f, 0.247878f, -0.470604f, 0.073164f, 1.523241f, 0.734755f,
+ -0.114126f, -0.193834f, -0.025759f, 0.263183f,
+};
+
+static const float av1_ab_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+ -0.343508f, -0.706936f, -0.160676f, -0.877101f, -0.517567f, -0.253254f,
+ -0.148074f, 0.923430f, -0.364770f, 0.203550f, 0.401216f, 0.938246f,
+ -0.872737f, 0.718723f, 0.703398f, 2.560015f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_64_layer0,
+ av1_ab_partition_nn_weights_64_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_64_layer0,
+ av1_ab_partition_nn_bias_64_layer1,
+ },
+};
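+
+// A minimal sketch (not part of the upstream sources) of how a
+// one-hidden-layer NN_CONFIG such as av1_ab_partition_nnconfig_64 is
+// evaluated. In the encoder this is done by the generic evaluator
+// av1_nn_predict(); the helper below only illustrates the layout assumed
+// by the tables above: layer-0 weights stored row-major per hidden node,
+// ReLU on the hidden layer, and a linear output layer.
+#if 0  // illustrative only
+static void nn_forward_sketch(const float *features, const NN_CONFIG *cfg,
+                              float *scores) {
+  float hidden[64];  // all ab-partition models above use 64 hidden nodes
+  const int num_hidden = cfg->num_hidden_nodes[0];
+  for (int i = 0; i < num_hidden; ++i) {
+    const float *w = &cfg->weights[0][i * cfg->num_inputs];
+    float sum = cfg->bias[0][i];
+    for (int j = 0; j < cfg->num_inputs; ++j) sum += w[j] * features[j];
+    hidden[i] = sum > 0.0f ? sum : 0.0f;  // ReLU
+  }
+  for (int i = 0; i < cfg->num_outputs; ++i) {
+    const float *w = &cfg->weights[1][i * num_hidden];
+    float sum = cfg->bias[1][i];
+    for (int j = 0; j < num_hidden; ++j) sum += w[j] * hidden[j];
+    scores[i] = sum;  // raw scores; callers threshold or softmax these
+  }
+}
+#endif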
+
+// nn model for ab partition pruning, 32x32.
+static const float av1_ab_partition_nn_weights_32_layer0[FEATURE_SIZE * 64] = {
+ -0.323723f, -0.214013f, -0.007772f, -0.458851f, -0.125542f, -0.123860f,
+ -0.410973f, -0.209389f, -0.087580f, -0.272881f, -0.168500f, -1.130845f,
+ 0.344916f, -0.475017f, -0.362262f, -0.195662f, -0.566124f, 0.782163f,
+ 0.411575f, -0.013378f, -0.318650f, -0.124678f, -0.612909f, -0.315788f,
+ -0.263990f, -0.508783f, -0.048938f, -0.416407f, -0.402648f, -0.156644f,
+ 0.225887f, -0.000493f, 2.682241f, 0.871204f, 0.059014f, 0.803542f,
+ -1.407028f, -1.154669f, 1.388148f, -0.293348f, -0.003669f, -0.009607f,
+ 1.330030f, -0.337841f, 2.118617f, 1.033059f, -0.084788f, 0.212904f,
+ 0.082405f, -0.070579f, -0.494005f, -0.173392f, 0.039546f, -0.463865f,
+ 0.077163f, -0.434066f, 0.030835f, -0.427139f, -0.560520f, -0.031606f,
+ -0.368541f, -0.027458f, 0.370574f, 0.461418f, 1.087682f, -0.572137f,
+ -1.509596f, -0.765697f, -0.499383f, -0.277998f, -0.106492f, -0.129564f,
+ -0.169133f, -0.269834f, -0.114270f, -0.275431f, 0.016339f, -0.156744f,
+ -0.267922f, 0.171216f, 0.110556f, 0.002954f, -0.200327f, -0.187663f,
+ 3.691601f, 1.234152f, 0.186315f, -0.125370f, -0.211235f, -0.554432f,
+ -0.131072f, -0.124982f, -0.130339f, -0.235350f, 0.018903f, 0.012896f,
+ -0.159372f, -0.269571f, -0.025709f, -0.221251f, 0.061919f, 0.016307f,
+ 0.384673f, -0.134525f, -1.599126f, -0.416459f, -0.743052f, 0.670249f,
+ -0.169709f, 0.421681f, -0.033360f, -0.072817f, 0.003647f, -0.110632f,
+ -0.158651f, -0.095136f, 0.223759f, 0.165767f, -0.269129f, -0.196075f,
+ -0.023183f, -0.293420f, 0.014875f, 0.018688f, -0.153407f, -0.172009f,
+ -0.259947f, -0.124015f, 0.173653f, -0.089103f, -0.021001f, -0.334230f,
+ 0.027177f, 0.103371f, -0.183860f, -0.204051f, -0.023721f, -0.192297f,
+ -0.143771f, -0.247106f, 0.218116f, -0.013240f, 2.831783f, 1.483928f,
+ -0.877025f, -0.313462f, -0.411320f, -0.447825f, 0.605977f, 0.234684f,
+ -0.119150f, -0.075182f, -0.330463f, 0.071503f, -0.254924f, -0.360071f,
+ -0.037022f, 0.063261f, -0.148759f, -0.238254f, -0.462018f, -0.027166f,
+ 0.065318f, -0.235743f, -0.257194f, -0.094784f, 0.022423f, 0.055925f,
+ 0.086672f, -0.021010f, 0.009965f, -0.001648f, -0.104917f, -0.387443f,
+ -0.102673f, -0.281706f, 0.145923f, -0.233391f, -0.378365f, -0.145584f,
+ -0.077751f, -0.121166f, 1.134565f, -0.097500f, -0.749202f, -0.544566f,
+ -1.361374f, -0.102494f, 1.089275f, 0.375299f, -0.105091f, 0.037641f,
+ -0.054248f, -0.282691f, -0.377797f, -0.066427f, -0.253815f, -0.329677f,
+ -0.339326f, -0.128217f, -0.282905f, 0.014937f, 1.067185f, -0.171764f,
+ 0.484458f, 0.396706f, -0.557055f, -0.891596f, -0.257839f, -0.720879f,
+ -0.218449f, -0.004755f, 1.572857f, 0.006229f, 1.962895f, -0.029746f,
+ -4.137691f, -2.185991f, -2.763477f, -0.520437f, -0.208708f, 0.006444f,
+ -1.263078f, -0.304560f, 1.072374f, 2.556429f, 0.312850f, 0.257488f,
+ -0.634264f, 0.156769f, -0.188943f, 0.040295f, -0.389915f, 0.085250f,
+ -0.248525f, 0.045667f, -0.776115f, -0.274680f, -0.448145f, -0.566161f,
+ -1.285316f, 0.079060f, 0.389124f, -0.510401f, -0.015299f, -0.664661f,
+ 0.099901f, -0.470694f, -0.051593f, -1.076381f, -0.442104f, -0.197867f,
+ -0.330011f, -0.448523f, -0.301018f, -0.442093f, -0.491953f, -0.582091f,
+ -0.064569f, -0.156516f, 0.543522f, -0.005924f, 0.161432f, 0.974793f,
+ 0.273712f, 1.104850f, -0.290312f, 0.313417f, -0.125370f, 0.136234f,
+ -0.191227f, -0.165054f, 0.011872f, -0.298871f, 0.095740f, 0.142760f,
+ -0.215771f, -0.031437f, 0.101041f, -0.085620f, 0.435387f, 0.002786f,
+ 1.971375f, 0.018392f, -1.771940f, -0.401433f, 0.808263f, -3.350013f,
+ 2.296952f, -1.024403f, -0.041645f, -0.034799f, -0.024078f, -0.347301f,
+ -0.276088f, -0.455907f, 0.266021f, 0.087348f, -0.146566f, 0.040492f,
+ -0.539866f, -0.206851f, -0.387874f, -0.125508f, -0.496676f, -0.373845f,
+ -0.472356f, -0.357082f, -0.081254f, -0.456466f, 0.554713f, 0.002185f,
+ -4.225019f, 0.344025f, 0.728796f, -0.262936f, 1.383924f, 1.577300f,
+ -2.653320f, -2.516156f, -0.301604f, -0.204105f, -0.138252f, -0.587536f,
+ -0.097889f, -0.352414f, -0.288276f, -0.184340f, -0.122741f, -0.243376f,
+ 0.031970f, -0.373402f, -0.396079f, 0.045566f, 0.072595f, -0.222681f,
+ -0.243802f, -0.340129f, -0.258494f, -0.192041f, -0.386112f, -0.240940f,
+ -0.047268f, -0.555802f, -0.032514f, -0.241341f, -0.167463f, -0.478308f,
+ -0.205936f, -0.316275f, 0.103729f, -0.197893f, -0.128029f, -0.218796f,
+ -0.167362f, -0.111814f, -0.126062f, -0.394260f, -0.025357f, -0.402697f,
+ -0.587395f, -0.400385f, -0.259664f, -0.415588f, -0.338503f, -0.399166f,
+ -0.270504f, 0.234505f, 0.272144f, 0.266938f, -0.392395f, -0.011717f,
+ -0.384221f, -0.473446f, -0.038420f, -0.241101f, -0.234402f, -0.275567f,
+ -0.410454f, -0.377599f, -0.179099f, -0.138432f, -0.248083f, -0.543026f,
+ -0.428043f, -0.239895f, -0.333193f, -0.103346f, -0.039038f, -0.171109f,
+ -0.119432f, -0.222351f, 0.000450f, 0.208724f, -0.510526f, -0.144656f,
+ -0.316721f, -0.344846f, -0.244794f, -0.129134f, -0.045634f, -0.400183f,
+ 0.043714f, -0.235414f, 0.115594f, -0.195616f, -0.106693f, -0.124242f,
+ 0.083990f, 0.049110f, -0.196130f, -0.059860f, -0.464235f, -0.516443f,
+ -0.101521f, -0.422379f, -0.413955f, -0.042991f, -0.345263f, -0.129264f,
+ -0.106911f, -0.140156f, -0.457841f, -0.199848f, -0.218954f, -0.329850f,
+ -0.364097f, -0.335262f, -0.312254f, -0.299331f, -0.052710f, -0.251019f,
+ -0.023459f, -0.222538f, 0.028849f, -0.088038f, -0.301550f, -0.273566f,
+ 0.067295f, -0.174608f, -0.445784f, -0.158366f, -0.567275f, -0.557652f,
+ -0.353503f, -0.302092f, -0.302049f, -0.551793f, -0.034535f, -0.225190f,
+ -0.210733f, -0.219377f, -0.057197f, -0.430933f, -0.025185f, -0.388150f,
+ -0.086147f, -0.430088f, 0.058466f, -0.152129f, -0.058411f, -0.236392f,
+ -0.547669f, -0.613849f, -0.893774f, -0.351715f, -0.399227f, -0.454909f,
+ -0.324501f, 0.000490f, -0.282167f, -0.073163f, -0.281452f, 0.047932f,
+ -0.175500f, 0.165220f, -0.276212f, 0.062153f, -0.217054f, -0.255487f,
+ -0.146416f, -0.097718f, -0.173809f, -0.559328f, -0.055695f, -0.391193f,
+ -0.132020f, -0.561184f, -0.308666f, -0.474053f, -0.219149f, -0.246558f,
+ -0.158325f, 0.151907f, -0.266835f, -0.144697f, -0.193960f, -0.046587f,
+ -0.220028f, -0.247355f, 0.135584f, 0.016511f, 0.367705f, -1.855877f,
+ 0.435622f, 0.444710f, -3.372301f, -3.030489f, 1.013267f, 0.380951f,
+ -0.170011f, -0.111415f, -0.456146f, -0.107254f, -0.095220f, -0.053078f,
+ -0.135864f, -0.591949f, -0.252810f, -0.324799f, -0.094796f, -0.260969f,
+ -0.391981f, -0.063170f, -0.336130f, -0.470127f, -0.405168f, -0.433219f,
+ -0.309563f, -0.295462f, -0.552270f, -0.012300f, -0.057793f, -0.034494f,
+ -0.446843f, -0.640160f, -1.188681f, -0.791361f, 0.543271f, 1.189112f,
+ 1.458468f, -0.005876f, -0.927475f, 0.062038f, -1.170818f, 0.338227f,
+ -3.007096f, -4.559296f, -4.045457f, -5.953635f, -0.228386f, -0.266890f,
+ -0.092595f, -0.377440f, -0.044534f, -0.053565f, -0.349268f, -0.415030f,
+ -0.310094f, 0.062721f, 0.251422f, -0.014350f, -1.282910f, 1.619560f,
+ 1.180566f, -0.032163f, -1.322951f, -0.603601f, 1.443710f, 0.654650f,
+ -0.393227f, 0.003536f, 0.029725f, -0.108925f, -0.053911f, 0.133977f,
+ -0.036145f, -0.168438f, 0.046989f, -0.331463f, -0.176983f, -0.311922f,
+ -0.272389f, -0.379592f, -0.399993f, -0.297873f, -0.193425f, -0.177524f,
+ -0.258309f, -0.567312f, -0.260217f, -0.241869f, 0.024010f, -0.032867f,
+ -0.039424f, -0.063670f, 0.193808f, -0.303514f, -0.013376f, -0.057761f,
+ 0.187922f, 0.006938f, 0.031810f, 0.180594f, -1.198427f, 2.820662f,
+ 0.154986f, -0.375518f, 0.116925f, -0.795782f, -0.085139f, -0.079365f,
+ -0.197936f, -0.321468f, -0.205271f, -0.558203f, -0.296235f, -0.151193f,
+ -0.158282f, -0.245402f, -0.208504f, -0.042335f, -0.087426f, -0.557129f,
+ -0.381427f, -0.441551f, -0.541011f, -0.060567f, -0.469305f, -0.032326f,
+ -2.453587f, -0.045568f, -0.296932f, 0.613061f, -0.320284f, 0.191620f,
+ -0.827145f, -0.225277f, 0.275800f, 1.696635f,
+};
+
+static const float av1_ab_partition_nn_bias_32_layer0[64] = {
+ -0.176206f, 0.660189f, -0.186156f, -2.481963f, -1.564218f, -0.280424f,
+ 0.732684f, -0.135581f, -2.193132f, -0.172771f, 0.605001f, -0.060392f,
+ -0.067190f, -0.132969f, -1.410812f, -0.298701f, -0.105963f, -0.086173f,
+ 0.632779f, 0.005585f, 1.310169f, 1.392136f, -0.563860f, -0.051053f,
+ 0.660998f, -0.214726f, -1.894342f, -0.128288f, -0.330721f, -0.053988f,
+ -0.177726f, 1.200859f, -0.178902f, -0.172620f, -0.184476f, -0.175559f,
+ 0.538503f, -0.322158f, -0.219080f, -0.058208f, -0.171347f, -0.216060f,
+ -0.174950f, -0.295740f, -0.184820f, -0.213896f, 1.317728f, -0.020116f,
+ -0.208096f, 0.000000f, 1.246166f, -0.225421f, -0.181555f, 0.861761f,
+ 1.172429f, -0.172892f, -0.737092f, -0.189904f, -0.179385f, -0.114618f,
+ -1.384604f, -0.201713f, -0.271948f, 0.372351f,
+};
+
+static const float av1_ab_partition_nn_weights_32_layer1[64 * 16] = {
+ -0.037828f, 1.529029f, 0.004927f, 1.475763f, 0.627172f, 0.325872f,
+ -0.990757f, 0.129476f, 0.889958f, -0.082031f, 0.332133f, 0.074422f,
+ -0.176212f, -0.074355f, 0.774378f, 0.110987f, -0.155469f, 0.253310f,
+ 0.882538f, 0.253605f, 0.332436f, -5.389474f, 0.278470f, 0.168644f,
+ 0.914611f, 0.154165f, 0.809262f, -0.174734f, 0.923673f, 0.064716f,
+ -0.070228f, -0.228735f, 0.002312f, 0.112222f, -0.045502f, -0.046004f,
+ 0.514101f, 0.306480f, 0.021232f, -0.015955f, -0.288260f, 0.189177f,
+ -0.104158f, 0.103273f, 0.096910f, -0.086328f, 1.327289f, -0.154247f,
+ 0.056676f, -0.243327f, -0.646676f, 0.177221f, -0.086761f, 0.729729f,
+ -14.710893f, -0.044881f, 0.339003f, -0.134737f, 0.073621f, -0.162913f,
+ 1.215237f, 0.140723f, 0.138630f, 1.241719f, 0.204092f, -0.463080f,
+ -0.176086f, 1.125868f, 1.034814f, 0.225455f, -0.203421f, -0.078787f,
+ -0.527498f, 0.012491f, -0.563307f, -0.170792f, 0.002679f, 0.116153f,
+ 0.211348f, -0.191900f, -0.212505f, 0.263445f, -0.074679f, -0.081441f,
+ -0.815405f, 2.448215f, 0.781299f, 0.149542f, -1.045162f, 0.043014f,
+ 0.217381f, -0.094500f, -0.090427f, 0.025784f, -0.228906f, -2.741798f,
+ 0.230475f, -0.256112f, -0.103297f, 0.159121f, -0.229793f, -0.014883f,
+ -0.104131f, -0.123816f, 0.164148f, -0.052279f, -0.071845f, -0.041197f,
+ 0.208527f, -0.234197f, -0.542336f, 0.020053f, 0.088870f, 0.014346f,
+ 2.502164f, -0.010244f, -0.267792f, 0.844394f, 2.711486f, -0.015262f,
+ -0.868053f, -0.295704f, 0.222289f, -0.000286f, -0.352098f, -0.079000f,
+ 0.021267f, -0.721739f, -0.240558f, -0.384775f, 0.065974f, -2.161058f,
+ 0.195889f, 0.268966f, -0.009329f, 0.014949f, 0.314943f, 0.235885f,
+ 0.072591f, -0.127120f, 0.150784f, 0.105697f, -1.297403f, -0.207509f,
+ -0.217688f, -0.076752f, 0.170952f, -0.294235f, 0.449973f, -1.712690f,
+ 0.860989f, 0.054757f, -0.812627f, -0.105316f, -0.736230f, -0.133192f,
+ -3.741608f, 0.495660f, -0.288936f, 4.654852f, -0.021305f, -0.308916f,
+ 0.049205f, -0.259996f, 0.114248f, -0.252647f, -0.253180f, -0.449314f,
+ 0.022979f, 0.063281f, -0.196154f, 0.078295f, -0.322317f, -0.145142f,
+ 0.300573f, 0.048385f, -0.254787f, 0.123939f, -1.263088f, -0.228565f,
+ -0.389061f, 0.391084f, 2.322438f, 0.075009f, 0.225743f, -0.198808f,
+ -0.280538f, -0.173939f, -0.120543f, -0.070792f, -0.417187f, -0.781056f,
+ -0.102756f, -1.760965f, 0.019149f, -0.867342f, 0.347141f, 0.031588f,
+ 0.302572f, -0.203573f, -0.357320f, -0.096078f, -0.527528f, 0.046699f,
+ -0.108561f, -0.167077f, -2.851509f, -0.307116f, 0.202720f, -0.160280f,
+ -0.215525f, 0.064355f, -0.427220f, 1.516230f, 0.634453f, 0.099400f,
+ -1.013887f, -0.029740f, -0.093426f, -0.044272f, -1.297636f, -0.237614f,
+ -0.160953f, 0.399036f, -0.030685f, -0.113619f, -0.184704f, 0.040519f,
+ -0.588252f, -0.210235f, -0.067623f, -0.031841f, -0.107261f, -0.192582f,
+ -0.253959f, -0.430821f, -0.103184f, -0.280185f, -0.357723f, 0.197761f,
+ -0.175087f, -0.055171f, 1.642014f, -0.192559f, -0.288147f, 0.610311f,
+ 4.688195f, -0.128728f, -0.914869f, -0.108286f, 0.013789f, 0.092125f,
+ 0.019770f, -0.178386f, 0.074164f, -1.152658f, -0.216738f, -0.277286f,
+ 0.012381f, 0.418259f, -0.680727f, -0.221917f, -0.485946f, 0.101672f,
+ 2.009457f, 0.054302f, 1.019838f, -0.116170f, 0.165134f, -0.112567f,
+ 0.852632f, -0.385796f, -0.108666f, 0.053181f, -0.311797f, -0.372875f,
+ -0.675717f, 2.409268f, -0.514720f, -0.214245f, -0.646596f, 0.009756f,
+ 0.203993f, 0.093617f, -0.301290f, 0.253551f, -0.128909f, -1.448442f,
+ -0.186823f, -0.278001f, -0.294993f, -0.176928f, -0.473605f, 0.062049f,
+ -0.212084f, -0.137326f, 0.012505f, 0.087850f, -0.200413f, -0.394119f,
+ -0.132224f, 0.146917f, 0.155746f, 0.198725f, -0.322541f, 0.196391f,
+ -0.945500f, 0.036736f, -0.155646f, -0.677341f, 1.130545f, -0.339554f,
+ 0.411628f, -0.355813f, -0.249843f, 0.213694f, -2.035607f, 0.055694f,
+ -0.111669f, 0.408696f, -0.067043f, -0.048182f, 0.398110f, -0.067542f,
+ 1.459801f, 0.236833f, -0.178806f, 0.168758f, 0.492387f, 0.099691f,
+ -0.776680f, -0.172865f, 0.204225f, 0.193982f, 0.575685f, -0.062248f,
+ 0.011486f, 0.058571f, -0.493391f, 0.026893f, -0.900467f, 3.793129f,
+ -0.634613f, -0.064660f, -0.048262f, 0.361905f, 0.033641f, 0.245171f,
+ -0.064671f, 0.034954f, 0.204358f, -0.904023f, -0.052714f, -0.250134f,
+ 0.136700f, 0.000734f, -0.371720f, 0.226483f, 0.217958f, 0.060559f,
+ 0.180111f, 0.000970f, 0.079556f, -0.096775f, 0.093855f, -0.026224f,
+ -0.243664f, 0.004290f, 0.123281f, -0.239476f, 1.230374f, -0.107826f,
+ -0.101982f, -0.153917f, 5.464427f, 0.304375f, -0.809957f, 0.090564f,
+ -0.278416f, -0.245555f, -2.078421f, 0.243093f, -0.127666f, 0.052451f,
+ -0.126662f, -0.783505f, 0.025149f, -1.422675f, -0.207769f, -0.362547f,
+ 0.115310f, 0.133390f, 1.264754f, -0.027055f, -0.485312f, -0.240717f,
+ -0.239722f, 0.146818f, -1.265043f, -0.235553f, 0.267104f, -0.021357f,
+ -0.435949f, -0.309371f, 0.049920f, 1.302721f, -0.233978f, -0.097551f,
+ -0.240631f, -0.287821f, -0.378380f, -0.273131f, -3.075169f, 0.226404f,
+ -0.029361f, 2.703590f, -0.430659f, 0.067927f, -0.387520f, -0.370630f,
+ -0.229236f, 0.085653f, -0.370956f, -0.065556f, -0.187859f, 0.068309f,
+ -0.109299f, -0.259898f, -0.103644f, -0.271199f, -0.209350f, 0.140993f,
+ -0.196713f, -0.135508f, -1.423209f, -0.406385f, -0.019956f, -0.864694f,
+ 5.963707f, -0.201157f, 0.726377f, -0.011076f, 0.010553f, -0.102918f,
+ -2.230088f, -0.258098f, -0.039547f, -0.029262f, -0.082324f, -0.860222f,
+ -0.094735f, -1.381839f, 0.587298f, -0.173048f, 0.721360f, 0.241900f,
+ 0.764302f, -0.023609f, -1.173755f, 0.103912f, -0.185363f, 0.078435f,
+ -2.245062f, -0.127269f, 0.202234f, 0.158975f, -0.260909f, 0.098608f,
+ -0.348247f, 1.732502f, -0.412298f, -0.269602f, -0.425771f, -0.146243f,
+ -0.530730f, 0.125716f, -1.004419f, 0.145109f, -0.059289f, 1.096304f,
+ 0.012891f, 0.045033f, -0.306875f, 0.003514f, -0.176110f, 0.037544f,
+ -0.441537f, -0.518921f, -0.262149f, -0.060407f, -0.379419f, -0.141245f,
+ -0.128894f, -0.176537f, -1.161318f, -0.249100f, -0.118330f, 0.042816f,
+ 1.173404f, 0.088312f, -0.393568f, -0.175134f, 6.529819f, -0.326652f,
+ -0.631917f, -0.393476f, 0.057781f, -0.217748f, -1.781139f, -0.012614f,
+ -0.212621f, -0.720322f, -0.218498f, -0.388556f, -0.254796f, -0.248399f,
+ -0.608744f, -0.265146f, 0.238517f, 0.066882f, -2.916806f, 0.054642f,
+ 0.282590f, 0.075248f, 0.010188f, -0.133486f, 0.985945f, -0.045849f,
+ -0.347564f, 0.057320f, -0.417920f, 0.063664f, 0.387062f, -2.692059f,
+ -0.535549f, 0.263736f, 0.327889f, -0.070273f, -0.775254f, 0.147250f,
+ 3.309425f, -0.212191f, -0.067204f, -2.912663f, -0.061496f, 0.084233f,
+ 0.022907f, 0.138421f, -0.112159f, -0.288447f, -0.010799f, 0.056049f,
+ -0.036527f, 0.021525f, 0.106649f, -0.291883f, 0.088424f, -0.057773f,
+ -0.086031f, 0.015277f, -0.318505f, -0.269049f, -1.008913f, -0.224785f,
+ -0.025820f, -0.649037f, 0.706381f, 0.096410f, 0.643776f, -0.046743f,
+ -0.009654f, -0.024246f, 1.469255f, -0.183536f, -0.370046f, -0.048442f,
+ -0.376527f, -0.431264f, -0.245109f, -0.093951f, 0.203683f, -0.099872f,
+ 0.087210f, 0.160692f, -3.527694f, -0.068891f, -0.228994f, -0.231817f,
+ -0.241949f, 0.193613f, 0.979597f, -0.091259f, 0.414424f, -0.047341f,
+ -0.209582f, -0.295134f, -0.016824f, 0.460327f, -0.072671f, 0.246234f,
+ 0.235896f, 0.127238f, -1.068683f, 0.035648f, 2.254888f, 0.180105f,
+ -0.260098f, -2.322120f, -0.184249f, -0.314801f, -0.099969f, -0.272117f,
+ -0.237916f, 0.031103f, -0.274063f, -0.049384f, -0.044917f, 0.102477f,
+ -0.342148f, -0.257558f, -0.346300f, 0.115333f, -0.115456f, 0.208354f,
+ -0.359301f, -0.167395f, 1.146514f, -0.177861f, -0.098658f, -0.444570f,
+ 6.759993f, -0.369772f, -0.831118f, 0.001866f, -0.073298f, -0.072095f,
+ 0.811902f, -0.431997f, -0.286587f, -0.269500f, 0.111492f, -0.525364f,
+ -0.351785f, -2.463474f, -1.852659f, 0.135325f, 0.138267f, 0.100643f,
+ -2.373278f, -0.285514f, -0.395388f, -0.185016f, -0.030249f, -0.005767f,
+ -0.716424f, -0.031674f, 0.011147f, 0.057405f, -0.215873f, -0.094401f,
+ 0.573528f, -1.223820f, 0.414852f, -0.059053f, -0.076488f, -0.287168f,
+ -0.842640f, 0.174084f, -0.567186f, 0.336629f, -0.062514f, 2.075448f,
+ -0.061680f, -0.131529f, -0.098994f, -0.204111f, -0.347865f, 0.108516f,
+ -0.049616f, -0.069212f, -0.273935f, -0.096545f, -0.210784f, -0.284698f,
+ 0.141501f, -0.176924f, -0.361341f, -0.251197f, -0.286694f, 0.245569f,
+ -1.521661f, -0.122639f, -0.015760f, -0.718912f, 5.877828f, 0.146916f,
+ 0.151767f, 0.220785f, -0.032298f, 0.230902f, 0.663943f, -0.252613f,
+ 0.057718f, -0.436038f, -0.323994f, -1.139787f, -0.042489f, -1.326298f,
+ -1.031206f, -0.104136f, 0.389897f, 0.127602f, -2.667789f, -0.212366f,
+ -0.506262f, -0.009115f, -0.213202f, 0.076167f, -1.629405f, 0.055129f,
+ 0.375393f, -0.150272f, -0.241515f, -0.326497f, 0.100069f, 0.410703f,
+ 0.340622f, 0.042437f, -0.349945f, 0.041176f, -1.178950f, 0.030992f,
+ 0.933908f, -0.035844f, -0.098660f, 1.030584f, -0.092043f, -0.355739f,
+ -0.305562f, 0.036161f, -0.049558f, -0.033225f, -0.403856f, -0.088276f,
+ 0.215493f, -0.149105f, -0.013363f, 0.025886f, -0.101306f, -0.205781f,
+ -1.072487f, -0.076019f, 0.077555f, 0.131003f, 1.267763f, -0.008954f,
+ -0.327617f, -0.246539f, 6.664081f, -0.404403f, -1.442489f, 0.191301f,
+ -0.336361f, 0.181156f, 0.833108f, 0.007879f, -0.194464f, -1.029408f,
+ -0.036268f, -0.927110f, -0.379190f, -0.293443f, -1.848579f, -0.242548f,
+ -0.065990f, 0.203160f, -0.291788f, 0.000680f, 0.587011f, -0.241289f,
+ 0.037034f, 0.000552f, 1.072308f, -0.387230f, -0.230050f, 0.292322f,
+ -0.720001f, 0.034109f, -0.467260f, 2.211644f, -1.839191f, -0.048797f,
+ -0.083469f, -0.334686f, -0.269056f, 0.051295f, 1.319904f, -0.035603f,
+ -0.018457f, -0.824915f, -0.212285f, -0.230516f, -0.035093f, -0.400843f,
+ -0.305469f, -0.099011f, 0.014225f, -0.452772f, 0.170331f, -0.389312f,
+ -0.115084f, -0.014770f, -0.429387f, -0.155961f, -0.568200f, -0.037853f,
+ -0.125137f, 0.067228f, -1.329271f, -0.117874f, -0.132499f, -0.218376f,
+ -0.588325f, -0.320024f, 0.085695f, -0.235047f, -0.217790f, 0.103015f,
+ -0.698644f, 0.017766f, -0.058299f, 0.199411f, -0.122485f, -0.563949f,
+ -0.349011f, -0.557045f, -0.131165f, 0.002281f, 0.118559f, -0.210302f,
+ -1.153815f, 0.116738f, -0.236007f, -0.003487f, -0.006885f, -0.244816f,
+ 0.953222f, 0.093748f, 0.266869f, 0.241869f, -0.860832f, -0.387012f,
+ -0.338986f, 2.097515f, -1.942512f, -0.298021f, 0.543911f, -0.043214f,
+ 0.082125f, -0.120242f, 0.712231f, 0.213327f, -0.301687f, -0.544011f,
+ -0.392131f, 0.004302f, 0.004825f, -0.317440f, -0.107518f, -0.293407f,
+ -0.159111f, -0.080367f, 0.132663f, -0.017726f, -0.237521f, -0.190297f,
+ -0.361633f, 0.200518f, -0.538296f, -0.027975f, -0.381704f, -0.016963f,
+ 0.630105f, -0.190997f, -0.287840f, -0.603488f, 3.605598f, -0.276614f,
+ -1.346383f, 0.186912f, -0.047575f, -0.189232f, -1.519072f, 0.097816f,
+ -0.223722f, 0.304924f, -0.213022f, -1.052433f, -0.322283f, -1.706734f,
+ -2.458027f, 0.237976f, 0.171050f, -0.103139f, -0.278689f, 0.329824f,
+ -0.262448f, -0.122916f, -0.236398f, -0.013848f, -0.969160f, -0.374907f,
+ 0.091018f, -0.386471f, -0.723940f, 0.064956f, -0.057652f, 1.321024f,
+ -1.397418f, -0.143136f, 0.272468f, -0.030749f, 0.037324f, 0.069316f,
+ -0.904925f, -0.333693f, -0.117709f, 2.279598f, -0.428065f, -0.131157f,
+ -0.014288f, -0.402862f, -0.666090f, 0.017070f, -0.028333f, 0.002481f,
+ 0.197156f, -0.038120f, -0.271062f, -0.188275f, -0.021370f, -0.070849f,
+ -0.905007f, -0.095886f, -0.093055f, -0.121821f, -1.239812f, -0.411799f,
+ -0.089948f, -0.936827f, 1.437569f, -0.388908f, 0.126170f, 0.186162f,
+ -0.018819f, -0.138364f, -1.066412f, -0.138222f, -0.022186f, 0.107331f,
+ -0.230436f, -1.352605f, -0.161323f, -1.081810f, -0.933825f, -0.136675f,
+ 0.378157f, 0.113377f, -0.850610f, 0.080245f, -0.087305f, -0.002852f,
+ 0.044408f, -0.188172f, -1.891998f, 0.092189f, 0.125325f, -0.105090f,
+ -0.848510f, -0.396308f, -0.384130f, 2.007509f, -1.480787f, -0.126946f,
+ 0.314767f, 0.000195f, -0.285628f, -0.110442f, -0.293948f, 0.258559f,
+ -0.417603f, 1.570705f, 0.092459f, -0.340974f, -0.284754f, -0.007801f,
+ -0.324610f, -0.004734f, -0.207716f, -0.057175f, 0.055467f, -0.210830f,
+ -0.113005f, -0.299177f, 0.068074f, 0.017929f, -2.897598f, -0.260074f,
+ -0.014422f, -0.206467f, 1.246997f, -0.372863f, -0.214160f, -0.114035f,
+ 5.805862f, 0.003611f, -1.340990f, -0.021085f, -0.260431f, -0.002720f,
+ -1.251640f, -0.353531f, -0.304009f, -0.153376f,
+};
+
+static const float av1_ab_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+ -0.521497f, -1.061572f, -0.078756f, -0.660662f, -0.403741f, -0.960163f,
+ 0.001427f, 0.523607f, 0.225068f, -0.055273f, 1.019519f, 1.181880f,
+ -0.010198f, 0.130597f, 1.276752f, 2.028188f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_32_layer0,
+ av1_ab_partition_nn_weights_32_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_32_layer0,
+ av1_ab_partition_nn_bias_32_layer1,
+ },
+};
+
+// nn model for ab partition pruning, 16x16.
+static const float av1_ab_partition_nn_weights_16_layer0[FEATURE_SIZE * 64] = {
+ 0.151902f, 0.007947f, -1.788454f, 0.431869f, -2.971387f, 0.923566f,
+ 1.632542f, -1.665136f, -0.338632f, -5.075884f, 0.398267f, 0.030467f,
+ 2.263534f, -0.045532f, -1.066128f, 0.915139f, -0.560500f, -3.293125f,
+ 2.072793f, -1.011414f, 0.122716f, -0.060169f, -0.388860f, 0.031019f,
+ -0.381861f, 0.001551f, -0.328472f, 0.038296f, -0.060398f, -0.375556f,
+ 0.209226f, 0.014764f, -1.443469f, -0.345486f, 2.409269f, 1.524846f,
+ -0.640666f, 1.322139f, -2.074771f, -0.580944f, -0.203960f, -0.072893f,
+ 0.329701f, 0.115339f, -1.339542f, 0.249024f, -0.421545f, -0.409151f,
+ -0.258293f, 0.836288f, -0.073685f, -0.009624f, 0.895712f, 0.320639f,
+ 0.451002f, -1.544558f, 0.193709f, -1.389012f, 1.305451f, 0.089795f,
+ 0.050338f, -0.017433f, -0.304667f, 0.500729f, 0.504346f, 0.073757f,
+ 0.582649f, -0.993623f, 1.766766f, -3.067265f, -0.415774f, -0.006036f,
+ -1.245281f, 0.253205f, -0.591245f, -0.626238f, 0.551852f, 0.593755f,
+ 0.491023f, 1.099384f, -0.348448f, 0.054564f, -0.451422f, -0.375781f,
+ -0.248390f, -0.052548f, -0.380069f, -0.165391f, -0.297968f, -0.052142f,
+ -0.316381f, -0.045246f, -0.243905f, -0.034169f, -0.247523f, -0.180773f,
+ 0.068066f, -0.374920f, 0.057536f, -0.189748f, 0.058375f, -0.267749f,
+ -0.147286f, -0.246153f, 0.006183f, -0.202029f, -0.059128f, 0.116852f,
+ 0.134719f, -0.126900f, -0.064646f, -0.196458f, -0.182331f, 0.108029f,
+ -0.264499f, 0.155816f, -0.107255f, -0.056983f, -0.209771f, -0.099070f,
+ 0.007313f, -0.254124f, -0.231964f, -0.275972f, 0.032098f, -0.264564f,
+ -0.208743f, 0.155599f, -0.121511f, -0.156145f, -0.162315f, -0.059788f,
+ -0.257073f, -0.076654f, -0.110616f, -0.321675f, -0.051952f, 0.006301f,
+ -0.154114f, 0.017032f, -0.017364f, -0.233247f, 0.009918f, -0.179289f,
+ -0.190722f, 0.147106f, -0.063910f, -0.396872f, -0.263123f, -0.003850f,
+ -0.040718f, -0.324699f, 0.118660f, -0.170727f, -0.316788f, 0.100886f,
+ -0.202842f, 0.045371f, 0.150561f, -0.057054f, -0.308150f, 0.028346f,
+ -0.381473f, -0.195365f, 0.026221f, -0.281795f, 0.087204f, 0.047689f,
+ -0.027643f, -0.104724f, -0.089030f, -0.117661f, -0.349160f, 0.056982f,
+ -0.340273f, 0.048086f, 0.046103f, -0.121527f, 0.021697f, 0.054109f,
+ -0.002768f, -0.008461f, -2.297240f, 0.124651f, 3.621661f, -0.057120f,
+ -1.151656f, 2.296894f, -3.678720f, -0.290240f, 0.087683f, -0.186389f,
+ 0.007656f, -0.090236f, -0.245217f, 0.110389f, -0.251719f, -0.029084f,
+ -0.128203f, -0.100005f, -0.032779f, 0.007281f, -0.366596f, -0.267870f,
+ -0.215620f, 0.047687f, 0.010303f, 0.097980f, -0.191569f, -0.341162f,
+ 0.119249f, 0.026279f, -2.161546f, 0.459591f, 1.290566f, 1.791797f,
+ -0.409835f, 0.127081f, -1.156367f, 0.198286f, 0.099561f, -0.067445f,
+ -0.034352f, 0.017966f, -0.277380f, -0.057220f, -0.174198f, -0.014164f,
+ 0.146090f, -0.357530f, 0.097644f, -0.000932f, 0.446603f, -0.066793f,
+ 2.448620f, 0.937617f, -1.232922f, 0.313183f, 0.816827f, -0.275115f,
+ -0.245205f, -0.126895f, 0.156668f, -0.186977f, -0.273505f, 0.013315f,
+ 0.168629f, -0.089084f, 0.006166f, -0.116107f, -0.199316f, -0.024010f,
+ -0.242303f, 0.011612f, -0.218485f, -0.229661f, -0.123922f, 0.136699f,
+ 0.006732f, -0.148718f, -0.164225f, 0.116063f, 1.587898f, 0.690519f,
+ 0.360566f, 0.009739f, -0.678702f, -0.046003f, 0.126984f, 0.605212f,
+ 1.240663f, -0.000228f, -1.119369f, -0.415589f, -0.721003f, 0.097936f,
+ -1.410586f, -2.358833f, -2.773129f, -3.983361f, -0.087144f, -0.050029f,
+ -0.242255f, 0.137424f, -0.307490f, -0.084637f, -0.023812f, -0.196582f,
+ -0.078695f, 0.038257f, -0.012110f, -0.263521f, 0.009839f, -0.109125f,
+ -0.226036f, 0.060712f, 0.093671f, 0.153143f, 0.039116f, -0.290891f,
+ 0.227057f, -0.204633f, -0.207539f, -0.148242f, 0.046204f, -0.231268f,
+ -0.209315f, -0.307579f, -0.436556f, 0.023475f, 0.131793f, -0.038301f,
+ 1.650584f, 0.392570f, 1.446576f, 1.254380f, -0.516867f, -0.057116f,
+ 0.149320f, 0.414424f, -0.246309f, 0.003877f, -0.480238f, -1.037035f,
+ -0.830779f, -1.122244f, -0.408267f, -0.253956f, 0.382005f, 0.940609f,
+ -1.113370f, -0.018554f, 0.141064f, -0.182504f, 1.270707f, 0.414904f,
+ -0.216036f, 0.203831f, 0.450716f, -0.452909f, 0.139358f, -0.027143f,
+ 1.956892f, 1.643732f, -0.867839f, -0.620520f, -0.334607f, -0.519982f,
+ 0.205023f, 0.661159f, -0.000809f, 0.049033f, -0.348579f, -0.200338f,
+ -0.362144f, -0.346590f, -0.230096f, 0.180746f, -0.149954f, -0.253429f,
+ -0.378170f, -0.040724f, -0.041597f, 0.243659f, -0.472181f, 0.015401f,
+ -0.180376f, 0.153139f, -0.247738f, -0.010485f, -0.157158f, 0.016825f,
+ -0.238925f, -0.265798f, -0.318374f, 0.142352f, -0.210520f, 0.051928f,
+ -0.352190f, -0.179052f, -0.185498f, 0.025540f, -0.111667f, -0.235187f,
+ -0.215454f, 0.010931f, -0.238372f, -0.126659f, 0.075691f, -0.091167f,
+ -2.462379f, -0.007950f, -0.637990f, 0.285554f, -0.051275f, 0.282279f,
+ -0.744083f, -0.570646f, 0.592198f, 1.421332f, -0.256027f, -0.140315f,
+ 0.160247f, -0.063185f, -0.055895f, -0.199864f, -0.287353f, -0.074561f,
+ -0.071228f, 0.055864f, -1.084764f, -0.263409f, 0.779266f, 0.228187f,
+ 0.375013f, 0.121204f, -0.656948f, 0.533561f, 0.272671f, -0.015423f,
+ -0.124180f, -0.009127f, 2.934838f, -0.150998f, 1.163152f, 0.081997f,
+ -4.715939f, -3.676595f, -1.524886f, -0.167593f, 0.281186f, 0.024046f,
+ -1.451709f, 0.332558f, 0.990504f, 0.376290f, -1.466773f, -0.448439f,
+ -2.929108f, -4.255188f, 0.065238f, 0.019950f, 1.372393f, 0.444052f,
+ -2.538772f, 1.579767f, -0.464911f, -1.866114f, 1.053958f, 0.434467f,
+ -0.125964f, 0.034671f, 0.077116f, -0.138466f, -0.413395f, -0.223453f,
+ -0.172127f, -0.251265f, -0.048239f, -0.395519f, 0.023141f, 0.037459f,
+ -0.249593f, -0.062215f, -0.047209f, -0.435189f, -0.164155f, -0.077590f,
+ -0.241164f, -0.126128f, -0.038243f, -0.180888f, 0.198840f, -0.328036f,
+ -0.169790f, 0.036506f, 0.052572f, -0.183570f, -0.073617f, -0.244959f,
+ 0.266498f, 0.032846f, -1.902106f, 0.486078f, 2.414993f, 0.975182f,
+ -0.382875f, 1.647810f, -2.197017f, -0.890107f, 0.221287f, 0.010889f,
+ 3.817042f, 0.572728f, 0.092466f, 0.473337f, -1.634659f, -1.069455f,
+ 1.486776f, -1.023850f, 0.088184f, 0.008842f, 0.518202f, 0.270259f,
+ 1.757191f, -0.121839f, -2.912229f, -1.250866f, -2.381808f, 0.335309f,
+ -0.120079f, -0.061294f, -0.058725f, -0.315169f, -0.262443f, 0.072434f,
+ -0.267836f, -0.319354f, -0.274975f, 0.068970f, -0.406467f, 0.044074f,
+ -0.152311f, -0.333656f, -0.228355f, -0.185613f, 0.017346f, -0.177674f,
+ -0.090675f, -0.102047f, -0.011768f, -0.025280f, -0.271661f, 0.098099f,
+ -0.312272f, -0.222217f, -0.100548f, 0.106260f, -0.034655f, 0.135109f,
+ -0.021276f, 0.018177f, -0.353097f, -0.011128f, 0.061136f, -0.511662f,
+ -0.223236f, -0.308841f, 0.118789f, -0.154628f, -0.053178f, -0.055973f,
+ 0.013175f, -0.368337f, -0.090863f, -0.116920f, 0.178990f, -0.025278f,
+ -0.190553f, -0.238092f, 0.303943f, -0.024944f, 0.719373f, 0.384332f,
+ -0.378480f, -0.423316f, 0.709922f, 0.758514f, -1.559023f, -2.503173f,
+ 0.068652f, -0.234741f, -0.182932f, 0.037878f, 0.020684f, -0.174142f,
+ -0.182300f, -0.052796f, -0.219145f, 0.113028f, -1.041826f, 0.035317f,
+ 0.919904f, -0.676011f, 0.652297f, 1.456447f, -0.166904f, -0.861823f,
+ 0.895827f, 0.429821f, -0.180376f, -0.076587f, -0.273945f, -0.288990f,
+ -0.206692f, -0.080745f, -0.085444f, 0.186953f, -0.050135f, 0.044243f,
+ -0.391706f, -0.160498f, -0.292268f, 0.164060f, 0.412649f, 0.211611f,
+ -0.327294f, -0.919399f, 0.320297f, 0.385284f, -0.088848f, -0.072556f,
+ -0.384813f, -0.176267f, -0.065918f, 0.134724f, -0.231104f, -0.337707f,
+ -0.195442f, -0.263569f, 0.098090f, -0.341411f, -0.189211f, -0.439276f,
+ -0.404046f, 0.262491f, -0.311093f, -0.086454f, -0.013400f, -0.061447f,
+ -0.026945f, -0.112036f, -0.322985f, 0.078500f, -0.230205f, -0.344535f,
+ -0.021087f, 0.110220f, -0.128671f, 0.044219f,
+};
+
+static const float av1_ab_partition_nn_bias_16_layer0[64] = {
+ 2.936406f, -0.396539f, -0.110456f, -1.254954f, 0.785350f, 0.516290f,
+ -0.172341f, 0.254386f, -0.192465f, -0.106751f, -0.055518f, -0.094994f,
+ 0.000000f, -0.065018f, -0.004908f, -0.130483f, -0.119580f, -0.142072f,
+ 0.457446f, -0.125051f, -0.107712f, 0.714607f, -0.140809f, -1.788650f,
+ -0.087199f, 0.000000f, -1.290050f, 0.443930f, -0.110634f, -0.109380f,
+ -0.188213f, -1.414179f, 1.193579f, 0.388775f, -0.873193f, -0.110050f,
+ -0.072565f, -0.117050f, -0.119132f, 0.456959f, -0.132069f, 0.131974f,
+ 1.160474f, 1.746465f, 0.442628f, -0.188849f, -0.207794f, -0.108364f,
+ -0.856655f, -2.141620f, 0.335476f, -0.105508f, -0.212162f, -0.109319f,
+ -0.237213f, -0.109980f, -0.291044f, -0.137877f, 0.470191f, -0.023908f,
+ 0.123809f, -0.109797f, 0.200510f, -0.147542f,
+};
+
+static const float av1_ab_partition_nn_weights_16_layer1[64 * LABEL_SIZE] = {
+ -6.823716f, 1.406568f, -0.144009f, 2.228765f, 0.838336f, 0.738107f,
+ -0.319014f, -0.148756f, 0.240862f, -0.111089f, -0.004241f, 0.025758f,
+ -0.193820f, -0.246362f, -0.181363f, -0.201556f, 0.024268f, 0.252994f,
+ -0.289443f, 0.194932f, 0.057467f, 0.724735f, 0.014063f, 1.361352f,
+ 0.025191f, 0.024274f, 0.231462f, -7.227959f, -0.094515f, 0.039946f,
+ 0.412719f, 0.812318f, 3.038903f, -0.286289f, 0.647482f, -0.115114f,
+ 0.053590f, 0.066069f, 0.153134f, 0.996250f, -0.125700f, 0.951365f,
+ -6.243494f, -4.827697f, 0.566320f, 0.239515f, -0.099702f, 0.054546f,
+ 1.847330f, 3.680076f, -3.049829f, -0.127709f, 0.068469f, -0.017794f,
+ 0.223864f, -0.106778f, -0.020425f, -0.040226f, -0.251890f, -0.168673f,
+ -0.552073f, 0.043311f, 0.218668f, 0.033209f, -3.199210f, 0.193079f,
+ 0.321406f, 0.718307f, -0.181418f, -0.459612f, -1.981170f, 0.968496f,
+ -0.029757f, -0.130065f, 0.043782f, 0.072394f, -0.088686f, 0.025322f,
+ 0.129882f, 0.101324f, 0.335707f, 0.072714f, -2.079774f, 0.203997f,
+ 0.239321f, -0.301757f, 0.257845f, 1.288382f, -0.031275f, -0.234194f,
+ 0.310722f, 2.045469f, 0.034716f, 0.135638f, -0.251388f, 0.320071f,
+ -1.065301f, -0.322731f, -0.545028f, 0.226276f, 0.090799f, 0.019289f,
+ 0.048950f, -1.079300f, 0.231938f, 0.083683f, 4.762127f, 0.145037f,
+ -0.145549f, 0.075592f, 0.172336f, 0.108175f, 0.333751f, 1.090501f,
+ 1.056114f, 0.047073f, 0.182052f, -0.081587f, 0.089900f, 0.339286f,
+ 2.049988f, 0.073585f, 0.537355f, -0.243322f, -0.010179f, -0.052601f,
+ -0.174915f, 0.117793f, 2.222990f, -2.520837f, -0.092699f, 1.199887f,
+ 0.138720f, 0.679918f, -0.463155f, -0.659496f, -0.109913f, -0.003398f,
+ 0.114633f, -0.128377f, 0.092970f, -0.107489f, -0.191078f, 0.185182f,
+ 0.216980f, -0.019343f, 3.443133f, 0.287953f, 0.099314f, 0.985958f,
+ 0.157268f, -0.606516f, 0.049418f, -0.221809f, -0.453081f, -0.344796f,
+ -0.003735f, -0.107269f, -0.128541f, -0.259543f, -0.934806f, -0.542456f,
+ -1.011192f, 0.022795f, 0.186363f, -0.076356f, -0.050932f, -0.165098f,
+ 0.168177f, -0.101596f, -5.270886f, 2.553943f, -0.440870f, -0.017494f,
+ 0.215208f, -0.017032f, 1.495915f, -4.304677f, 0.762211f, 0.182937f,
+ 0.254406f, -0.029433f, -0.088364f, -0.110160f, -0.108257f, -0.036538f,
+ 0.737697f, -0.234989f, 0.168095f, 0.245118f, -0.077262f, 0.195718f,
+ 0.753302f, -1.637869f, 0.126227f, 0.982129f, -0.121444f, -0.295570f,
+ -1.215799f, 0.147867f, -0.068496f, 0.132726f, -0.005772f, -0.181774f,
+ 0.126513f, 0.204723f, -0.366123f, 0.103906f, -0.148053f, -0.075272f,
+ 0.243884f, -0.104828f, 0.198988f, 0.501034f, -0.112671f, 0.111421f,
+ 0.167508f, -0.117803f, -0.738624f, 2.046292f, 0.124011f, 0.057983f,
+ -0.359154f, -0.648883f, -0.259462f, -0.459041f, -2.501223f, -0.065138f,
+ 0.122417f, 0.060291f, -0.129033f, -0.843086f, 0.268241f, -0.399927f,
+ 1.585888f, 1.816393f, -0.631427f, 0.127826f, 0.088105f, 0.073488f,
+ 0.717694f, -1.497362f, 2.608528f, 0.066896f, -0.079230f, 0.223436f,
+ -0.010530f, 0.175310f, 1.120365f, 0.034391f, 0.835312f, 0.071652f,
+ -0.080615f, 0.111395f, 0.162742f, 0.079927f, -3.859582f, -0.638431f,
+ -0.167880f, -0.992659f, -0.885355f, -1.276197f, 1.334344f, 0.931940f,
+ -0.078244f, -0.149030f, -0.070974f, -0.133566f, 0.200034f, 0.102793f,
+ -0.048546f, 0.063545f, 0.023864f, -0.190863f, 1.934257f, -0.136286f,
+ -0.107916f, -0.637468f, 0.066449f, 1.089693f, -0.214047f, -0.265780f,
+ 0.899660f, -0.130333f, 0.288311f, -0.049024f, 0.090202f, 0.487969f,
+ 0.339704f, 0.858479f, 0.841253f, -0.184100f, -0.637070f, -0.125071f,
+ -0.077650f, -0.087877f, 0.202268f, -0.027300f, 2.842862f, -0.100698f,
+ -0.259080f, 0.260556f, 0.157912f, -0.070364f, 0.467190f, 1.200037f,
+ 1.419317f, -0.033588f, -0.227824f, 0.292617f, 0.228574f, 0.213839f,
+ -1.091099f, -0.022258f, -1.294681f, 0.136118f, 0.081652f, -0.185359f,
+ -0.039706f, 0.191407f, -2.053219f, -0.261934f, 0.047812f, -0.029536f,
+ -0.823869f, -1.090534f, -0.755890f, 0.441035f, -0.167945f, 0.231441f,
+ -0.135013f, -0.260762f, 0.256872f, 0.130339f, -0.243751f, 0.189760f,
+ -0.288454f, 0.145363f, 0.338490f, 0.403898f, -0.022814f, -1.263598f,
+ -0.101315f, 0.860135f, 0.136511f, 0.028942f, 0.574047f, 2.656370f,
+ 0.037587f, -0.188690f, -0.125312f, 1.100435f, -1.080402f, 0.380905f,
+ 0.004635f, 0.097144f, -0.214309f, 0.085552f, -0.285066f, -0.705134f,
+ -0.054704f, -0.319951f, 5.486626f, 0.958158f, -1.380585f, 0.223340f,
+ -0.169167f, -0.170697f, -0.216748f, 0.324232f, 2.684204f, -0.008490f,
+ -0.211052f, -0.201190f, 0.123466f, -0.000234f, 0.579907f, 0.096938f,
+ -0.042745f, 0.201855f, 0.157195f, -0.261440f, 0.029699f, -0.046599f,
+ 1.618216f, -2.596280f, -0.377420f, -0.526725f, -0.493592f, -0.579615f,
+ 0.579699f, -0.100392f, 0.150694f, 0.061794f, 0.200425f, -0.062515f,
+ -0.179122f, 0.250112f, -0.344675f, -0.118359f, -0.095670f, 0.152311f,
+ 3.662276f, -0.154921f, -0.312991f, 0.972008f, -0.308596f, -0.190426f,
+ 0.133889f, -0.238673f, -0.094726f, 1.683835f, -0.215629f, -0.198890f,
+ -0.035278f, -0.367973f, -0.822435f, 0.240848f, -0.194656f, 0.034655f,
+ -0.079424f, 0.146670f, 0.026646f, -0.034507f, 0.059467f, -0.153109f,
+ -0.431033f, 2.552991f, -1.894091f, -0.180462f, -0.306839f, -0.025648f,
+ 1.026326f, -3.096230f, 1.346935f, 0.033633f, -0.181827f, 0.094376f,
+ 0.001696f, -0.379264f, -1.069503f, -0.140972f, -0.208769f, -0.195239f,
+ 0.281795f, -0.127251f, 0.180776f, 0.067763f, 0.697124f, -1.040779f,
+ 0.111280f, 0.188351f, -0.340234f, -0.207790f, -0.720075f, -0.137409f,
+ -0.070310f, -0.032918f, -0.060787f, 0.131484f, -0.077845f, -0.258652f,
+ 0.056911f, -0.062034f, 0.007663f, -0.185100f, 1.340361f, 0.014096f,
+ -0.124602f, 0.194241f, 0.128383f, 0.360465f, 0.082979f, -0.050475f,
+ -0.519294f, 3.323262f, 0.067014f, 0.221203f, -0.085082f, -0.228606f,
+ -0.916668f, -0.022643f, -1.386737f, -0.131902f, -0.349952f, -0.032874f,
+ -0.189190f, -0.898790f, -0.102394f, -1.017387f, 2.214050f, 1.790253f,
+ -1.913561f, -0.043716f, -0.214924f, -0.194598f, -0.064723f, -1.671793f,
+ 2.251166f, -0.146007f, 0.138527f, -0.003134f, 0.103665f, 0.006928f,
+ -0.240253f, -0.227464f, 0.578437f, -0.214724f, 0.503085f, 0.158093f,
+ 0.033091f, 0.008061f, 4.815371f, 2.132264f, 0.281850f, -2.288560f,
+ -0.145012f, 1.296832f, -0.362401f, -0.403252f, 0.109873f, 0.185746f,
+ 0.244764f, 0.172367f, -0.185588f, 0.139801f, -0.178254f, 0.068629f,
+ 0.358488f, -0.153969f, -6.433524f, 0.225983f, -0.138123f, -0.095971f,
+ -0.036089f, -1.400083f, 0.265908f, 0.257787f, 0.181144f, -1.647228f,
+ -0.136289f, -0.074206f, 0.122988f, -0.088895f, -1.266717f, 0.006010f,
+ 0.536681f, 0.263061f, -0.032207f, -0.155136f, 0.086431f, 0.441950f,
+ -0.060755f, -0.280683f, -0.783475f, -2.567033f, 1.093221f, 0.117667f,
+ -0.000408f, 0.225719f, -2.199698f, 0.141447f, -1.459051f, 0.051315f,
+ 0.203228f, 0.354432f, -0.005775f, -0.028073f, -0.965817f, 0.231083f,
+ -0.666884f, 0.026283f, -0.317486f, 0.210754f, 0.123897f, 0.223827f,
+ 4.214405f, 1.457334f, -0.253945f, -1.306733f, -0.391235f, 0.451154f,
+ -1.553888f, -0.353429f, 0.069533f, 0.159278f, -0.173836f, -0.004952f,
+ -0.137033f, 0.127012f, 0.143600f, 0.051587f, -0.070549f, 0.066509f,
+ -5.776547f, 0.180021f, -0.189183f, -1.288504f, -0.233575f, -1.473873f,
+ 0.140940f, 0.144451f, -0.104534f, 2.089873f, -0.168168f, 0.110726f,
+ 0.132134f, -0.215223f, -1.682754f, 0.157757f, -0.146163f, 0.064882f,
+ 0.117313f, -0.038780f, -0.124720f, -0.501697f, 0.092047f, -0.233992f,
+ 3.324976f, 0.516601f, 1.294202f, 0.119989f, 0.061055f, 0.043420f,
+ -2.750727f, -0.382812f, -0.648496f, -0.115353f, -0.334205f, 0.024354f,
+ -0.282998f, -0.282705f, 0.073798f, 0.169851f, 0.135651f, 0.182677f,
+ -0.040220f, 0.132462f, -0.303120f, -0.230113f, 6.165739f, -0.258596f,
+ 0.024127f, -1.388283f, -0.006042f, 0.572600f, 0.348411f, -0.387376f,
+ -0.075845f, 0.122319f, -0.029616f, 0.077873f, 0.154763f, 0.049073f,
+ 0.018597f, 0.102688f, -0.204165f, 0.020734f, -1.389133f, -0.032854f,
+ -0.147561f, 0.853944f, 0.132100f, -3.259659f, 0.243745f, 0.181529f,
+ -0.738414f, 1.509994f, 0.023470f, -0.005329f, 0.066115f, -1.345081f,
+ -1.455402f, -0.172023f, -0.194625f, 0.071885f, -0.201742f, -0.262402f,
+ 0.077601f, -0.048938f, 0.257993f, -0.504029f, -2.032415f, 1.158880f,
+ 0.448647f, -0.025633f, 0.117586f, -0.072275f, -0.673744f, -3.854342f,
+ -0.983843f, 0.047766f, -0.017193f, -0.215775f, -0.158743f, -0.232042f,
+ -0.509112f, 0.148812f, 0.130122f, 0.006486f, -0.099016f, 0.022514f,
+ -0.486850f, -0.059623f, 4.012731f, 0.025454f, 0.029059f, -0.783546f,
+ -0.295260f, 0.322521f, -0.473201f, -0.172100f, -0.100087f, -0.076516f,
+ -0.258367f, -0.112897f, 0.269364f, -0.065912f, 0.169022f, -0.178783f,
+ -0.095114f, 0.122089f, -2.790099f, -0.100431f, -0.087963f, -0.009431f,
+ -0.087819f, -2.774399f, -0.100757f, 0.013005f, -0.964533f, 3.236665f,
+ -0.354903f, -0.144169f, -0.166869f, -1.396513f, -0.931271f, -0.046261f,
+ -1.799262f, -0.365269f, 0.108611f, 0.037994f, 0.024747f, -1.073639f,
+ -0.203158f, -0.935006f, 1.880891f, 1.578385f, 0.726272f, -0.024546f,
+ -0.011626f, -0.151363f, -1.121716f, -1.787484f, 0.232806f, 0.075451f,
+ 0.182899f, 0.092215f, -0.207347f, -0.030111f, 0.054316f, 0.192481f,
+ 0.594639f, -0.247694f, 0.547471f, -0.032094f, -0.065000f, 0.007198f,
+ 1.605377f, -0.155945f, -0.066200f, -2.343716f, -1.016283f, -0.079321f,
+ 0.919365f, 0.599980f, 0.125545f, 0.265813f, 0.246884f, 0.095385f,
+ -0.260374f, -0.202916f, -0.042770f, 0.234967f, -0.233139f, -0.326994f,
+ -1.375256f, 0.121766f, 0.077433f, -1.103569f, 0.019497f, -1.029185f,
+ 0.253905f, 0.206569f, 0.187334f, -0.237089f, -0.294351f, 0.164137f,
+ 0.149696f, -0.749787f, -0.413433f, 0.976587f, 1.027976f, -0.285264f,
+ 0.209273f, -0.124762f, 0.050884f, 0.250764f, -0.082031f, -0.646520f,
+ 4.116680f, 0.437336f, 0.671684f, 0.129509f, -0.078462f, 0.014072f,
+ -0.678232f, 0.094831f, 1.125624f, 0.207070f, -0.154750f, -0.025780f,
+ -0.103030f, 0.118019f, -0.908186f, -0.263546f, -1.555324f, -0.236887f,
+ -0.217854f, -0.051790f, 0.017915f, 0.171001f, 1.355562f, 0.094603f,
+ -0.233929f, -1.282169f, -0.773183f, -0.161682f, -0.834565f, -0.286776f,
+ -0.298901f, 0.038162f, 0.251899f, 0.039612f, -0.022935f, -0.232308f,
+ -0.043855f, -0.192892f, -0.279009f, -0.182234f, -1.272808f, -0.070344f,
+ -0.092432f, -1.915946f, -0.134373f, -1.405496f, -0.067071f, -0.131922f,
+ 0.185269f, 1.465082f, 0.040240f, 0.112665f, 0.144329f, -0.286112f,
+ -0.617649f, 0.916177f, 0.221044f, -0.079867f, 0.170251f, -0.093638f,
+ -0.212620f, -0.305945f, -0.234356f, -0.482501f, 3.928472f, 1.241179f,
+ 0.355922f, -0.170848f, -0.189168f, 0.080225f, -1.357793f, 0.190890f,
+ 0.976800f, -0.068070f, -0.016295f, -0.088623f, -0.129560f, -0.212267f,
+ -0.071537f, -0.219501f, -0.655198f, -0.225188f, -0.116024f, 0.224174f,
+ -0.049715f, -0.178005f, 3.029985f, -1.141546f, 0.080066f, -1.932316f,
+ -0.641137f, -0.189564f, 0.935080f, 0.136119f, 0.015558f, -0.179331f,
+ 0.204571f, 0.020350f, 0.009362f, 0.108478f, 0.037076f, -0.049009f,
+ 0.081090f, -0.180202f, 1.455561f, -0.081559f, 0.059361f, 0.484971f,
+ 0.160923f, -2.170744f, -0.013204f, 0.126561f, -0.407122f, 1.223661f,
+ 0.044262f, 0.118044f, 0.058274f, -1.747100f, -0.171318f, 0.971374f,
+ 0.306995f, -0.103268f, -0.319443f, -0.333176f, -0.038608f, 0.119674f,
+ -0.106479f, -0.907933f, 1.121231f, 1.673840f, -0.421458f, -0.021146f,
+ -0.254838f, 0.097632f, 0.235109f, -2.901782f, 0.289518f, -0.355459f,
+ -0.068264f, -0.179121f, 0.068560f, -0.047570f, -0.522523f, -0.228963f,
+ -1.037158f, -0.163723f, 0.280563f, -0.000868f, -0.197220f, -0.239329f,
+ 1.985274f, -0.256181f, -0.064341f, -0.822417f, -0.465140f, -0.010942f,
+ -0.792024f, -0.114290f, 0.060969f, 0.104106f, -0.252123f, -0.150400f,
+ -0.133277f, 0.267147f, 0.274413f, 0.223744f, -0.180223f, -0.345415f,
+ -0.104883f, 0.119210f, -0.095041f, -0.301635f, 0.013175f, -2.128121f,
+ -0.147208f, -0.151509f, -0.692013f, 3.418555f, -0.016541f, 0.171511f,
+ 0.107159f, -1.516672f, 0.127408f, 0.687035f, -0.906486f, -0.145463f,
+ -0.169382f, -0.143906f, 0.125091f, -0.960645f, -0.180869f, -0.716908f,
+ 2.840951f, 1.904919f, -0.416268f, -0.425181f, -0.194697f, -0.075932f,
+ -0.950604f, -1.599800f, 0.943671f, -0.022744f, -0.270492f, 0.080843f,
+ -0.372916f, 0.047838f, -0.100300f, -0.026600f, 0.011733f, -0.226051f,
+ 0.172790f, -0.172982f, 0.041258f, -0.299379f,
+};
+
+static const float av1_ab_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+ -0.053805f, -1.248639f, 0.520965f, -0.904962f, -0.126425f, -0.118798f,
+ 0.748430f, 0.203096f, 0.059317f, 0.418219f, 0.841294f, 0.402693f,
+ -0.658522f, 0.723479f, 0.544264f, 1.035225f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_16_layer0,
+ av1_ab_partition_nn_weights_16_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_16_layer0,
+ av1_ab_partition_nn_bias_16_layer1,
+ },
+};
+
+#undef FEATURE_SIZE
+#undef LABEL_SIZE
+
+#define FEATURE_SIZE 18
+#define LABEL_SIZE 4
+
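+// nn model for 4-way partition pruning, 16x16.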
+static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 24] = {
+ -2.032866f, 0.056691f, 0.495960f, 0.778785f, 0.548153f, -0.806942f,
+ 0.481155f, 0.282298f, 0.584980f, 0.504688f, 0.209648f, 0.234616f,
+ 0.213484f, 0.221969f, 0.205862f, 0.235054f, 0.317863f, 0.257139f,
+ 0.529478f, 0.098122f, -0.657532f, 0.036296f, 0.327728f, 1.323180f,
+ -0.813082f, 0.160216f, -0.702030f, 0.722733f, -0.270576f, -0.347416f,
+ -0.264700f, -0.254248f, 0.159820f, 0.087995f, -0.184163f, 0.117357f,
+ 0.074194f, -0.667369f, 0.498246f, 0.420506f, 0.072409f, -0.121581f,
+ 0.315788f, 0.000525f, 0.414986f, 0.678166f, -0.011230f, 0.188131f,
+ -0.227749f, 0.009564f, 0.108672f, 0.106923f, -0.080695f, -0.279382f,
+ -0.061339f, -0.297835f, -0.134707f, 0.145865f, -0.009655f, -0.000842f,
+ -0.047436f, -0.159149f, -0.320353f, -0.089646f, -0.344765f, 0.313416f,
+ -0.143413f, 0.279668f, 0.000885f, -0.022380f, -0.140194f, -0.310473f,
+ 0.252699f, 0.066204f, 0.477568f, 0.994609f, -0.276000f, 1.213182f,
+ 0.277028f, -0.411570f, -0.211559f, 0.377815f, 0.121488f, -0.100559f,
+ -0.317082f, -0.251039f, -0.335181f, -0.154114f, -0.052726f, -0.332558f,
+ -0.143196f, -0.334035f, 0.162305f, 0.142279f, -0.001210f, -0.135252f,
+ -0.033562f, 0.204307f, -0.039757f, -0.394174f, 0.126617f, -0.128648f,
+ -0.410979f, 0.107641f, -0.117573f, -0.326512f, 0.235166f, 0.084959f,
+ 0.290063f, -0.005838f, 0.459894f, 1.023709f, -0.196145f, 1.100137f,
+ -0.319815f, -0.308526f, -0.443389f, -0.272769f, -0.035259f, -0.026932f,
+ -0.029743f, 0.125113f, -0.131024f, -0.321458f, -0.143996f, 0.008714f,
+ -0.101234f, 0.079706f, -1.128615f, -0.467381f, 0.220563f, -0.409900f,
+ -0.435353f, 0.759499f, -0.465799f, -0.394309f, 0.176282f, -0.086275f,
+ -0.161225f, -0.354814f, 0.562871f, 0.418253f, 0.414361f, 0.445480f,
+ -0.995903f, -0.086632f, -0.230645f, 0.354656f, -0.317576f, 0.079926f,
+ 0.424369f, 0.997232f, -0.304388f, 1.071667f, -0.023540f, 0.029677f,
+ 0.108564f, 0.183581f, -0.201395f, -0.054854f, -0.193039f, -0.049899f,
+ -0.271949f, -0.358483f, 0.304930f, 0.023823f, -0.009319f, -0.214247f,
+ 0.100712f, -0.050162f, 0.327103f, -0.212999f, -0.030496f, 0.316380f,
+ -0.439589f, -0.249959f, 0.229777f, -0.353664f, -0.384559f, 0.114236f,
+ 0.023119f, 0.007927f, 0.618368f, 0.957759f, -0.019780f, -1.002389f,
+ 0.564277f, -0.839531f, 1.040445f, 0.054340f, 0.031908f, -0.032893f,
+ -0.019170f, -0.042011f, 0.568928f, 0.362567f, -0.559999f, -0.605344f,
+ -0.586146f, -0.290778f, 0.195943f, -0.109580f, -0.088898f, -0.113054f,
+ 0.293282f, 0.429019f, 0.306136f, 0.863025f, 0.021234f, 0.125770f,
+ -0.097108f, -0.072659f, -0.137053f, -0.191631f, 0.106281f, 0.064151f,
+ 0.029883f, 0.076287f, 0.757543f, 0.276713f, -2.529775f, -0.351727f,
+ -1.832316f, 0.544780f, -0.944529f, 0.509705f, -0.010236f, -0.016181f,
+ 0.021520f, 0.086417f, 0.041312f, 0.296853f, -0.372378f, 0.354446f,
+ -1.366762f, 0.048875f, 0.464918f, -0.007450f, 0.750013f, -0.360261f,
+ 0.518532f, 0.753776f, 0.641448f, 0.710746f, 0.250866f, 0.257063f,
+ 0.283421f, 0.253585f, 0.170303f, 0.210426f, 0.208842f, 0.158000f,
+ -0.033144f, 0.130748f, 0.907147f, 0.409248f, -0.854301f, -0.981307f,
+ 0.294427f, -0.507137f, 1.079967f, 0.203203f, 0.383890f, 0.368278f,
+ 0.305122f, 0.449288f, -0.044507f, -0.547263f, -0.298245f, -0.497834f,
+ 0.007016f, -0.101982f, -0.073488f, -0.096111f, -0.479418f, -0.045497f,
+ 0.033502f, -0.018578f, -0.231531f, 0.177949f, 0.099564f, -0.010233f,
+ -0.333055f, -0.078586f, -0.417867f, 0.171271f, 0.013662f, -0.143599f,
+ -0.117296f, 0.135382f, 0.048321f, 0.000924f, -0.055024f, -0.405595f,
+ -0.068260f, -0.271011f, -0.436425f, 0.206751f, -0.899890f, 0.605510f,
+ 0.535649f, -0.238919f, -0.037619f, -0.213734f, -0.391360f, -0.132344f,
+ 0.004660f, 0.176644f, -1.008475f, -0.038895f, 0.155429f, -0.095229f,
+ -0.680124f, -0.258063f, -0.261901f, 0.110380f, -0.337649f, -0.505870f,
+ -1.428536f, 0.610629f, 0.254905f, 0.045098f, 0.044109f, 0.172329f,
+ 0.060001f, -0.234009f, -0.184855f, -0.153028f, -0.140897f, -0.152006f,
+ -0.312134f, 0.081261f, 0.160166f, 0.112690f, 0.266081f, 0.030175f,
+ -0.242746f, 0.000754f, -0.341811f, -0.149774f, -0.017484f, -0.301342f,
+ -0.121466f, 0.067300f, 0.342176f, 0.474538f, 0.085441f, -0.263935f,
+ 0.479235f, -0.003713f, -0.784840f, 0.119480f, 0.456632f, -0.640082f,
+ -0.080575f, -0.744403f, 0.259970f, 0.034667f, -0.274641f, -0.257594f,
+ -1.121124f, -0.003745f, -0.420693f, 0.300441f, -0.100976f, -1.049016f,
+ 0.201960f, 0.113054f, 0.187010f, 1.237427f, 0.054803f, -0.028673f,
+ 0.003596f, -0.034724f, 0.117246f, 0.190977f, 0.278915f, 0.224307f,
+ 0.017852f, -0.336233f, -0.372311f, -0.182284f, -0.143510f, 0.331466f,
+ 0.045698f, -0.301095f, 0.184447f, 0.348240f, -0.017021f, -0.145064f,
+ -0.000221f, -0.382256f, -0.302683f, -0.083927f, -0.008070f, 0.217907f,
+ 0.647597f, -0.050490f, -0.572736f, -0.985748f, -0.289943f, 0.041391f,
+ -0.795464f, -0.186680f, -0.354062f, -0.617400f, -0.282783f, -0.170450f,
+ -0.197197f, -0.146496f, -0.173692f, -0.106277f, -0.071004f, -0.124405f,
+ -0.971412f, 0.038542f, 0.705204f, 0.887113f, 0.150430f, -0.243676f,
+ 0.638410f, 0.320953f, 0.776676f, 0.527584f, 0.070389f, 0.051554f,
+ 0.177519f, 0.140451f, 0.128892f, 0.087771f, 0.197660f, 0.194764f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer0[24] = {
+ 0.614063f, -0.384872f, 0.084884f, -0.023980f, -0.378765f, -0.082312f,
+ -0.458271f, 0.189578f, -0.046169f, -0.073308f, -0.372322f, 0.162793f,
+ 0.148803f, 0.829214f, -0.221162f, -0.111157f, -0.017484f, -0.280596f,
+ -0.031905f, -0.143459f, 0.078823f, -0.021940f, 0.026834f, 0.257472f,
+};
+
+static const float av1_4_partition_nn_weights_16_layer1[24 * LABEL_SIZE] = {
+ -0.985391f, 0.587616f, 0.740683f, 0.192066f, 0.447080f, -0.016585f,
+ 0.680449f, 0.028983f, 0.643111f, 0.234338f, 0.107148f, 0.328456f,
+ -0.216394f, 1.106838f, -0.179062f, -0.129108f, -0.121655f, -0.151340f,
+ -0.306017f, -0.350989f, 0.859284f, -0.372831f, -0.954419f, 0.250495f,
+ 1.046732f, 0.287923f, -0.421088f, 0.326613f, -0.314396f, -0.084757f,
+ -0.474228f, 0.687999f, 0.052334f, 0.441708f, -0.630698f, -0.350348f,
+ -0.602067f, -0.434161f, -0.489824f, -0.313193f, 0.315568f, 0.603119f,
+ 0.120245f, 0.182920f, -1.117797f, -0.239594f, -0.296296f, -0.718093f,
+ 0.489497f, -0.527019f, 0.102453f, 0.426731f, 0.034606f, 0.311461f,
+ -0.012723f, -0.229877f, -0.284290f, 0.383227f, 0.065696f, -0.222400f,
+ 1.279248f, -0.862190f, 0.629766f, -0.250011f, -0.325060f, -0.360115f,
+ -0.159540f, -0.291856f, -0.038348f, 0.224639f, 0.600934f, 0.030205f,
+ 1.337615f, -0.286409f, -0.473710f, -0.418995f, -1.035249f, 0.004359f,
+ -0.481860f, 0.563625f, -0.154709f, -0.101198f, -0.758796f, -0.507616f,
+ -0.095253f, -0.711135f, 0.207759f, 0.076313f, -0.056087f, -0.162719f,
+ -0.232918f, -0.128402f, -0.444620f, -0.447344f, 1.126012f, -1.504446f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+ -0.462133f,
+ 0.465060f,
+ 0.062211f,
+ 0.401786f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_16_layer0,
+ av1_4_partition_nn_weights_16_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_16_layer0,
+ av1_4_partition_nn_bias_16_layer1,
+ },
+};
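+
+// A minimal usage sketch (not part of the upstream sources): the 4-way
+// models emit LABEL_SIZE (= 4) raw scores that a caller would convert to
+// probabilities and compare against a pruning threshold. av1_nn_predict()
+// and av1_nn_softmax() are the generic helpers declared in
+// av1/encoder/ml.h; the feature vector and threshold here are
+// placeholders, not the encoder's actual tuning.
+#if 0  // illustrative only
+static int prune_4way_sketch(const float features[FEATURE_SIZE],
+                             const NN_CONFIG *cfg, int label,
+                             float prune_thresh) {
+  float scores[LABEL_SIZE], probs[LABEL_SIZE];
+  av1_nn_predict(features, cfg, 1, scores);
+  av1_nn_softmax(scores, probs, LABEL_SIZE);
+  // Prune the candidate partition when its predicted probability is low.
+  return probs[label] < prune_thresh;
+}
+#endif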
+
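+// nn model for 4-way partition pruning, 32x32.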
+static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = {
+ -0.219494f, -0.428273f, 0.471006f, 0.448210f, -0.152935f, 0.440435f,
+ 0.922857f, -0.074436f, 1.002195f, 0.414176f, -0.327202f, -0.380066f,
+ -0.212346f, 0.061868f, -0.056620f, 0.594134f, 0.617995f, 0.308358f,
+ 0.232484f, 0.129849f, 1.483593f, -0.071460f, 1.984515f, 1.116422f,
+ -1.141762f, -0.306220f, 0.089075f, -0.271845f, 0.187524f, 0.050396f,
+ -0.061025f, 0.030809f, 0.172799f, -0.458151f, -0.318357f, 0.122052f,
+ -0.414329f, 0.089366f, 0.118898f, -0.376213f, -0.206151f, -0.519946f,
+ -0.463252f, -0.206694f, -0.254383f, -0.379487f, 0.093059f, -0.245280f,
+ -0.205044f, -0.280060f, -0.171229f, -0.045389f, -0.179481f, -0.306245f,
+ -0.500856f, 0.003388f, -0.527397f, -0.449330f, -0.174272f, 0.123769f,
+ 0.023005f, 0.157273f, 0.073400f, 0.019099f, -0.113848f, -0.098601f,
+ -0.290946f, -0.046770f, -0.314592f, -0.179914f, -0.391411f, -0.235631f,
+ -1.282604f, 0.048505f, -0.746382f, 0.093740f, -0.706583f, -0.085729f,
+ 0.947382f, -0.002961f, 1.175362f, 1.007309f, 0.141638f, -0.037608f,
+ -0.118807f, -0.021474f, -0.146763f, 0.069363f, -0.074372f, -0.215713f,
+ -0.004134f, -0.114110f, -0.330438f, -0.031136f, 0.111821f, -0.534598f,
+ -0.357759f, -0.455950f, 0.139469f, 0.036582f, -0.384743f, -0.168828f,
+ -0.239250f, 0.003520f, -0.049003f, 0.075702f, -0.025809f, -0.225972f,
+ -0.228905f, -0.412489f, 0.060570f, -0.328819f, -0.206446f, -0.080231f,
+ -0.372008f, -0.218118f, -0.011954f, 0.024155f, 0.156014f, 0.020679f,
+ 0.194398f, -0.283491f, -0.024463f, -0.275099f, 0.028031f, 0.026340f,
+ -0.254668f, 0.103637f, 2.178693f, 0.552284f, 0.109366f, -0.474806f,
+ -0.379286f, -0.026315f, 2.487924f, -0.089466f, 0.206428f, 0.114578f,
+ 0.152248f, 0.184050f, -0.631948f, -0.014793f, -0.283782f, -0.830353f,
+ 0.009343f, -0.021029f, -0.060534f, -0.025164f, 1.841311f, 1.842748f,
+ -1.979708f, 0.450985f, -1.606357f, -0.785454f, -0.212679f, -0.344342f,
+ 0.198991f, -0.258070f, 0.055974f, 0.224069f, 0.453051f, 0.408053f,
+ 0.027873f, -0.180538f, 0.056609f, 0.207654f, 0.104086f, -0.194426f,
+ -0.359789f, -0.381143f, -0.331212f, -0.203973f, -0.324313f, -0.160825f,
+ -0.160439f, -0.044856f, -0.346647f, 0.044859f, 0.231398f, -0.023643f,
+ -0.140316f, -0.260177f, 0.206965f, -0.425386f, -0.420268f, -0.409748f,
+ 0.006971f, 0.066186f, -0.034950f, -0.345518f, 0.018633f, -0.122489f,
+ -0.038506f, -0.330942f, 0.161236f, -0.314119f, -0.050202f, -0.179597f,
+ 0.731897f, -0.184481f, 0.153598f, -0.539501f, -0.301493f, -0.184967f,
+ -0.883754f, -0.586959f, -0.136292f, -1.772065f, -0.196276f, -0.053272f,
+ -0.101083f, -0.064142f, 0.161190f, 0.430826f, 0.355647f, 0.138266f,
+ 0.051114f, -0.028893f, -0.477673f, -0.238663f, -0.354117f, -0.056747f,
+ -0.334273f, -0.497688f, -0.486004f, -0.092033f, -0.241304f, -0.373250f,
+ 0.120193f, 0.011360f, -0.010475f, -0.092739f, -0.159650f, -0.033129f,
+ -0.259893f, -0.073217f, 0.200128f, 0.103407f, -0.229233f, 0.128831f,
+ -0.063450f, -0.241732f, -0.408428f, -0.342239f, -0.264326f, -0.105403f,
+ -0.442879f, -0.310456f, -0.112881f, 0.263696f, -0.205014f, -0.497936f,
+ -0.261734f, -0.382312f, -0.426807f, -0.021995f, -0.152794f, -0.301494f,
+ 0.117232f, -0.577809f, 0.154596f, -0.409522f, -0.413113f, -0.359199f,
+ 0.307294f, -0.008746f, -0.310522f, 0.347620f, -0.384845f, -0.451398f,
+ -0.226199f, 0.054154f, -0.167608f, 0.046836f, -0.013285f, -0.408119f,
+ -0.177973f, -0.248293f, -0.465830f, 0.035827f, -0.222208f, -0.221717f,
+ 0.066392f, -0.349769f, -0.428029f, -0.516692f, 0.022398f, -0.251682f,
+ 0.134746f, 0.011167f, -2.078787f, 0.173592f, -1.948348f, 0.330060f,
+ 1.993785f, -0.052859f, -0.004795f, -3.703177f, 0.013450f, -0.011687f,
+ 0.073079f, 0.034803f, 0.025515f, 0.005994f, 0.101731f, 0.074303f,
+ -0.109962f, -0.270825f, -0.068273f, -0.163268f, -0.252826f, 0.137190f,
+ 0.007667f, -0.358453f, 0.027412f, 0.033492f, 0.021197f, -0.049991f,
+ 0.104468f, -0.012157f, -0.056252f, -0.380756f, -0.338483f, 0.233235f,
+ -0.048631f, -0.441209f, -0.158482f, -0.148108f, -0.263453f, 0.138847f,
+ -0.304073f, -0.336312f, -0.017941f, -0.135563f, 0.075137f, -0.246475f,
+ -0.229144f, -0.087744f, -0.346909f, 0.172611f, 0.004377f, -0.009386f,
+ -0.023104f, 0.008000f, -0.029390f, -0.317842f, 0.549674f, -0.195337f,
+ -0.863979f, 0.160889f, -0.269014f, -0.442104f, -1.799191f, 1.396533f,
+ -0.112837f, 0.881303f, 0.000764f, -0.035415f, -0.141877f, 0.184831f,
+ -0.363566f, -0.178569f, 0.254134f, -0.326893f, 0.127325f, 0.310620f,
+ -0.384621f, 0.146058f, -0.287682f, -0.373447f, 0.026930f, 0.251650f,
+ 0.053817f, 0.227509f, 0.121396f, 0.396514f, -0.278381f, -0.038969f,
+ -1.538756f, -0.002856f, -0.892900f, 0.363426f, -1.257922f, 0.743795f,
+ 0.941177f, 0.219345f, 0.684189f, 1.396858f, 0.026299f, -0.093433f,
+ -0.066182f, 0.057868f, -0.089278f, -0.159680f, -0.262035f, -0.236656f,
+ 0.005349f, -0.031314f, 0.027917f, -0.182113f, -0.212086f, -0.160774f,
+ 0.051468f, 0.036787f, 0.183881f, -0.288205f, -0.349691f, 0.162511f,
+ 0.117878f, -0.294534f, -0.365037f, -0.246313f, 0.073977f, -0.072378f,
+ -0.173579f, -0.584560f, 0.547194f, 0.259853f, -0.405287f, -0.421146f,
+ 0.165788f, -0.146964f, 0.257415f, 0.772394f, -0.475302f, -0.310906f,
+ 0.058723f, 0.276833f, 0.586842f, 0.248998f, -0.061135f, 0.255779f,
+ 0.152158f, -0.024781f, 2.821834f, 1.365141f, 0.914744f, 0.165752f,
+ -1.048304f, -0.333891f, 1.804087f, -0.437028f, -0.120211f, -0.020443f,
+ 0.040077f, 0.258600f, -0.598893f, -0.494579f, -0.281054f, -0.517041f,
+ 0.005258f, 0.053986f, 0.322755f, 0.429495f, -1.992364f, -0.717192f,
+ -1.774802f, 2.047362f, -0.016194f, 0.312606f, 0.019331f, 0.060950f,
+ 0.116428f, 0.168458f, -0.307001f, -0.420734f, 0.475843f, 0.425346f,
+ -0.107119f, 0.049892f, -1.168619f, 0.010878f, 0.354872f, 0.902717f,
+ -0.391407f, 0.332772f, -1.335037f, -0.447100f, 0.481719f, -0.101069f,
+ -1.806565f, 0.925280f, 0.346999f, 0.093809f, 0.006275f, 0.270814f,
+ -0.691123f, 0.230748f, 0.137033f, 0.068228f, 1.555975f, -0.271637f,
+ -0.370403f, 0.236131f, 0.367464f, -0.136562f, 0.428838f, 0.181750f,
+ 0.338762f, 0.292449f, -0.748204f, -0.922731f, -0.959445f, -0.806418f,
+ -0.140501f, 0.070525f, 1.248748f, 0.637990f, -1.307246f, -0.514055f,
+ 0.393858f, -1.858727f, 0.713591f, -0.141044f, 0.080723f, 0.120220f,
+ -0.031175f, 0.224488f, 0.753818f, -0.833351f, -1.099132f, 0.651100f,
+ -0.135061f, -0.043820f, 0.026983f, -0.059259f, 0.001345f, -0.281775f,
+ 0.006958f, 0.046103f, -0.246539f, 0.057630f, -0.360778f, -0.160681f,
+ -0.414870f, -0.301979f, 0.000683f, 0.132957f, -0.477609f, 0.106110f,
+ -0.637769f, -0.078374f, -0.229494f, 0.583108f, -0.822973f, -0.107540f,
+ 1.063426f, -0.268346f, 1.105787f, 2.587550f, -0.020314f, -0.002161f,
+ -0.063836f, -0.099990f, -0.103975f, -0.114078f, -0.094199f, -0.065181f,
+ -0.019870f, -0.018920f, -0.219732f, 0.035608f, -1.789450f, 0.483032f,
+ -0.464729f, 1.563277f, -1.054195f, 0.359991f, 0.065204f, 0.135623f,
+ 0.158380f, -0.103815f, -1.398726f, -1.436666f, -0.356311f, 0.507752f,
+};
+
+static const float av1_4_partition_nn_bias_32_layer0[32] = {
+ 0.421645f, -0.620548f, -0.187819f, -0.189414f, -0.204975f, -0.189600f,
+ -0.174917f, -0.651928f, -0.799655f, -0.086105f, -0.163449f, -0.089212f,
+ -0.214495f, -0.108500f, -0.065777f, -0.127704f, 1.544948f, -0.032831f,
+ -0.165621f, 0.145844f, -0.032104f, -0.453246f, -0.113444f, 0.321589f,
+ -0.862375f, -0.108826f, -0.486259f, 0.685325f, 0.072569f, -0.187961f,
+ 0.109579f, -0.082685f,
+};
+
+static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = {
+ 0.255012f, 0.658860f, 0.216907f, 0.165947f, 0.241182f, 0.340854f,
+ 0.409445f, 0.165220f, 0.553373f, -0.242385f, -0.209571f, 0.255515f,
+ 0.222500f, 0.037032f, 0.238590f, 0.061624f, -2.038693f, 0.264167f,
+ -0.230144f, 0.129952f, -0.027979f, 0.847761f, 0.438922f, 0.462323f,
+ 0.555345f, 0.030689f, 0.336357f, -0.357326f, -0.113137f, 0.272631f,
+ 0.421022f, 0.367776f, -0.197094f, 0.157117f, -0.015008f, -0.056123f,
+ -0.283913f, 0.186417f, 0.178561f, -0.763041f, 0.602038f, 0.341092f,
+ 0.320453f, -0.312776f, -0.371240f, -0.356279f, 0.220117f, -0.131871f,
+ 1.517429f, 0.162223f, -0.255069f, 0.451861f, 0.045071f, -0.223257f,
+ 0.003257f, 0.015734f, -0.630447f, -0.672588f, 0.670164f, 0.571031f,
+ -0.657948f, 0.034506f, -0.249076f, 0.790293f, 0.066491f, -0.131245f,
+ 0.355173f, 0.564622f, 0.374048f, 0.033974f, 0.253970f, 0.495498f,
+ -0.556321f, -0.104651f, 0.276947f, 0.057148f, -0.039126f, -0.170050f,
+ -0.141542f, 0.158541f, 0.582763f, -0.100992f, 0.096705f, -0.209029f,
+ 0.008449f, 0.255865f, 0.103565f, 0.317719f, 0.479499f, 0.599126f,
+ -0.065613f, -0.268614f, 0.508736f, 0.180813f, -0.815868f, 0.051238f,
+ 0.001223f, -0.305423f, -0.270079f, 0.036180f, 0.304342f, 0.202634f,
+ 0.218348f, -0.304304f, -0.438297f, 0.241123f, 0.200230f, 0.151804f,
+ 0.051944f, 0.160422f, -0.262981f, -0.417412f, 1.845729f, -0.086183f,
+ 0.403517f, 0.059667f, 0.564543f, -0.081752f, 0.114907f, -0.284489f,
+ -0.673943f, 0.056965f, 0.362221f, 0.403224f, -0.000233f, -0.209552f,
+ -0.800926f, -0.134132f,
+};
+
+static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+ -0.019518f,
+ 0.198546f,
+ 0.339015f,
+ -0.261961f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_32_layer0,
+ av1_4_partition_nn_weights_32_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_32_layer0,
+ av1_4_partition_nn_bias_32_layer1,
+ },
+};
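+
+// A minimal sketch (illustrative, not part of the upstream model data) of how
+// a one-hidden-layer NN_CONFIG such as av1_4_partition_nnconfig_32 above would
+// be evaluated, assuming the field layout implied by the initializers and the
+// encoder's usual conventions: row-major weight matrices, ReLU on the hidden
+// layer, raw (linear) logits at the output. The helper name is hypothetical.
+#if 0
+static void nn_forward_sketch(const float *features, const NN_CONFIG *cfg,
+                              float *logits) {
+  float hidden[64];  // assumes num_hidden_nodes[0] <= 64 (32 for these models)
+  for (int i = 0; i < cfg->num_hidden_nodes[0]; ++i) {
+    float sum = cfg->bias[0][i];
+    for (int j = 0; j < cfg->num_inputs; ++j)
+      sum += cfg->weights[0][i * cfg->num_inputs + j] * features[j];
+    hidden[i] = sum > 0.0f ? sum : 0.0f;  // ReLU
+  }
+  for (int i = 0; i < cfg->num_outputs; ++i) {
+    float sum = cfg->bias[1][i];
+    for (int j = 0; j < cfg->num_hidden_nodes[0]; ++j)
+      sum += cfg->weights[1][i * cfg->num_hidden_nodes[0] + j] * hidden[j];
+    logits[i] = sum;  // linear; the caller applies softmax/sigmoid as needed
+  }
+}
+#endif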
+
+static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 24] = {
+ -0.152649f, 0.074509f, 1.000136f, 0.601661f, -1.416694f, -1.932396f,
+ -1.163850f, 0.640931f, -0.888625f, -0.345711f, 0.161799f, 0.103165f,
+ 0.147513f, 0.089956f, 0.204329f, 0.196922f, 0.014927f, 0.283714f,
+ -0.110422f, 0.062005f, -0.531870f, -0.075287f, -0.448349f, -0.218881f,
+ -0.005592f, -0.130490f, -0.015779f, 0.093521f, -0.158487f, 0.072241f,
+ 0.066879f, -0.418566f, -0.206281f, 0.025634f, 0.048334f, -0.534750f,
+ 0.302081f, 0.028707f, -1.543248f, 0.103799f, -1.214052f, 0.395870f,
+ 0.394754f, -0.272170f, -0.702953f, -4.057464f, -0.033497f, -0.042142f,
+ 0.014742f, 0.065263f, 0.000879f, -0.019768f, 0.101275f, 0.163059f,
+ -0.371392f, -0.283484f, 0.241915f, 0.012684f, -0.210101f, -0.166534f,
+ -0.024894f, 0.274696f, 0.098993f, 0.104086f, 0.055044f, -0.289378f,
+ 0.146571f, -0.147441f, 0.004056f, 0.112244f, -0.416162f, -0.033176f,
+ -0.214836f, -0.213787f, 0.023197f, -0.339043f, 0.301109f, -0.408551f,
+ 0.284922f, -0.344418f, -0.039255f, 0.158748f, -0.344169f, 0.078286f,
+ -0.043957f, -0.302162f, -0.310826f, 0.063425f, 0.198166f, -0.285324f,
+ -0.108252f, 0.038992f, -1.053110f, -1.663290f, -0.417185f, 1.504443f,
+ 0.643206f, -0.850240f, 0.889641f, -0.733214f, 0.147302f, 0.060291f,
+ -0.052954f, 0.167453f, 0.111870f, 0.085471f, 0.035107f, 0.064361f,
+ 0.176053f, 0.184373f, 0.676576f, 0.066164f, 1.455569f, 0.925111f,
+ -0.640845f, 0.803795f, -0.653782f, -0.201038f, 0.060033f, 0.016964f,
+ -0.047590f, 0.045908f, 0.354162f, 0.014812f, 0.156978f, 0.058792f,
+ -0.238119f, 0.002450f, -0.094388f, -0.155229f, 0.194858f, -0.355429f,
+ -0.187098f, -0.119264f, -0.088694f, -0.102845f, 0.184905f, -0.425339f,
+ -0.157808f, -0.104599f, -0.393248f, -0.379842f, 0.027741f, -0.185816f,
+ -0.317294f, 0.002453f, -0.498241f, -0.204302f, -0.079093f, 0.020646f,
+ -0.412850f, -0.426039f, -0.177050f, -0.419304f, -0.064478f, -0.191802f,
+ -0.146812f, 0.171111f, 0.090261f, -0.367033f, -0.299051f, -0.322132f,
+ 0.428192f, -0.252613f, 0.488498f, -0.559682f, 0.486720f, -0.511084f,
+ 0.992506f, 0.346765f, -0.118697f, -0.065127f, -0.376612f, -0.345137f,
+ -0.426517f, -0.516836f, 0.307083f, 0.609362f, 0.369555f, 0.093775f,
+ -0.375664f, -0.221595f, -0.025465f, 0.134374f, -0.387031f, 0.096236f,
+ 0.337465f, -0.124029f, -0.157340f, -0.368790f, -0.104490f, -0.279507f,
+ -0.247705f, 0.146559f, -0.236206f, -0.036073f, 0.064206f, -0.330919f,
+ 0.516591f, -0.013492f, 1.269568f, 1.182530f, -0.455390f, -1.328091f,
+ -0.200950f, -0.380513f, -0.195532f, -0.341479f, 0.016064f, 0.021176f,
+ 0.169119f, 0.103707f, -0.174504f, -0.462719f, -0.079445f, -0.247128f,
+ 0.459111f, 0.036129f, 0.769570f, -0.080405f, 1.667107f, 0.355567f,
+ -2.433896f, 0.627572f, -0.600090f, -0.651872f, -0.059769f, -0.041945f,
+ -0.009933f, 0.014864f, -0.049378f, -0.041561f, 0.075180f, 0.138307f,
+ 0.122366f, -0.160756f, 0.215327f, 0.013572f, 0.198194f, -0.762650f,
+ 0.054466f, 1.110332f, 1.692853f, 0.658654f, -0.409549f, 0.506085f,
+ 0.330962f, -0.223008f, 0.007448f, -0.289062f, -0.476231f, -0.228359f,
+ 0.013977f, -0.000609f, -0.673604f, 0.275996f, 0.405291f, 1.693561f,
+ -1.079768f, 1.122516f, -0.203227f, 0.099265f, -0.165207f, -0.323899f,
+ -0.269973f, -0.080122f, 0.127700f, 0.190201f, 0.219527f, 0.306194f,
+ 0.026049f, -0.003779f, 1.107357f, 1.720315f, 1.017908f, 0.078664f,
+ -1.599813f, -0.482636f, -0.117450f, 0.122249f, 0.030220f, 0.039794f,
+ 0.176350f, 0.129715f, -0.305755f, -0.274044f, -0.299640f, -0.187335f,
+ -0.073616f, -0.564507f, -0.127758f, 0.044855f, -0.191090f, 0.039095f,
+ 0.115378f, 0.969352f, -0.088360f, 0.301443f, 0.065726f, -0.019740f,
+ -0.102350f, -0.084913f, -0.194615f, 0.118582f, 0.920789f, -0.171615f,
+ -1.436553f, -0.026419f, -0.730864f, 0.615697f, -0.795079f, 0.119701f,
+ 0.601782f, 0.792902f, 0.184920f, 1.635090f, -0.085860f, -0.033187f,
+ -0.166883f, 0.008487f, -0.128300f, -0.089923f, -0.108781f, -0.133719f,
+ -0.011988f, -0.239816f, -0.092563f, -0.238471f, -0.339722f, 0.177432f,
+ -0.063101f, -0.121002f, 0.058072f, -0.031166f, 0.086413f, -0.016203f,
+ -0.305075f, -0.005420f, -0.168796f, 0.148745f, -0.116737f, -0.050222f,
+ -0.287952f, -0.290982f, -0.090449f, 0.076098f, -0.345632f, -0.061309f,
+ 0.142218f, 0.035692f, 0.304517f, -0.228031f, 0.119608f, -0.120350f,
+ 0.163404f, -0.105605f, -0.305462f, -0.176657f, 0.210070f, -0.227600f,
+ -0.081965f, -0.464027f, -0.053782f, -0.018367f, 0.119159f, 0.017162f,
+ -0.069792f, 0.305768f, -0.421095f, 0.187740f, -0.032059f, 0.575115f,
+ -0.064283f, -0.091828f, 0.772648f, -0.393189f, -0.297098f, 0.141420f,
+ 0.826389f, -0.071586f, -0.893968f, -0.346793f, -1.151655f, 0.039393f,
+ 1.546000f, -0.094029f, -0.005786f, -0.195764f, -0.169724f, -0.133167f,
+ -0.129312f, -0.418860f, -0.026553f, -0.053667f, -0.091976f, -0.106275f,
+ -0.492625f, 0.025350f, -0.332075f, -0.475638f, -0.076667f, -0.065779f,
+ 0.108957f, 0.246298f, -0.289007f, -0.442552f, -0.206692f, -0.257453f,
+ 0.073806f, -0.458606f, -0.410390f, -0.312674f, -0.144813f, 0.170128f,
+ 0.018810f, -0.098241f, 1.027369f, 0.479328f, 1.129707f, 0.484813f,
+ -0.085207f, 0.621873f, -0.520981f, 0.236175f, 0.273487f, 0.061426f,
+ 0.306085f, 0.161487f, 0.220991f, 0.223783f, -0.091826f, 0.391031f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer0[24] = {
+ 0.580225f, -0.191304f, 1.091767f, -0.134522f, -0.089361f, 0.398750f,
+ -0.882708f, -0.213102f, -0.119981f, 0.378296f, -0.075719f, 0.426598f,
+ -2.015505f, 0.202534f, -1.044792f, -0.841519f, 0.266421f, -0.047115f,
+ -0.131147f, -0.075066f, -0.009441f, 0.853007f, -0.175606f, -0.868306f,
+};
+
+static const float av1_4_partition_nn_weights_64_layer1[24 * LABEL_SIZE] = {
+ -0.851937f, -0.211148f, -2.289513f, -0.275071f, 0.251340f, -0.340847f,
+ 0.498032f, 0.308652f, -0.051574f, 0.323146f, -0.097547f, -0.040269f,
+ 1.909655f, 0.098348f, 0.588136f, 0.568112f, 0.313297f, 0.920848f,
+ -0.014486f, 0.386014f, 0.029199f, -0.537330f, -0.021502f, 0.349073f,
+ -0.524715f, -0.351848f, 1.565454f, -0.297148f, 0.020177f, 0.648369f,
+ 0.027321f, -0.096052f, -0.363163f, -0.132642f, 0.024292f, -0.734176f,
+ -0.782700f, 0.408299f, 0.476945f, -0.489512f, -0.728318f, -0.632042f,
+ 0.405417f, 0.184086f, -0.400730f, 0.359032f, 0.019710f, -0.217409f,
+ 0.519159f, -0.136316f, 0.993592f, -0.147128f, 0.097495f, 0.426189f,
+ -0.295233f, 0.278799f, 0.080667f, -0.025052f, -0.307757f, 0.418716f,
+ -0.853388f, -0.374878f, -0.322725f, 0.696335f, -0.380649f, -0.160356f,
+ -0.140060f, 0.502455f, 0.656728f, -0.095023f, -0.184198f, -0.347069f,
+ 0.456372f, -0.029754f, 0.907923f, 0.265710f, -0.065505f, 0.226763f,
+ -0.277798f, 0.413292f, -0.593899f, -0.060740f, -0.313358f, -0.249944f,
+ -0.627329f, -0.327151f, -0.853788f, -1.163807f, -0.388944f, -0.228788f,
+ -0.057382f, 0.334741f, -0.283083f, 0.368280f, -0.407197f, -0.441849f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+ -0.478735f,
+ 0.292948f,
+ 0.293172f,
+ 0.040013f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_64_layer0,
+ av1_4_partition_nn_weights_64_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_64_layer0,
+ av1_4_partition_nn_bias_64_layer1,
+ },
+};
+
+#undef FEATURE_SIZE
+#undef LABEL_SIZE
+
+#define FEATURE_SIZE 4
+static const float
+ av1_partition_breakout_nn_weights_128_layer0[FEATURE_SIZE * 32] = {
+ -0.331785f, 0.068675f, -0.323814f, 0.033714f, -0.237835f, 0.166316f,
+ -0.498766f, -0.545634f, -0.266173f, -0.476957f, -0.120409f, -0.021042f,
+ 0.124056f, -0.278750f, -0.110120f, -0.372812f, 4.547939f, 0.097618f,
+ -0.002710f, -0.064169f, -1.841173f, -0.403833f, 0.005536f, 0.067188f,
+ -0.434935f, -0.227421f, -0.000011f, -0.139961f, -0.174056f, -0.652384f,
+ -0.000015f, -0.262847f, -3.319706f, -0.947693f, 0.002981f, 0.016717f,
+ -10.408850f, -0.014568f, -0.000018f, 0.019084f, 1.523383f, 0.074525f,
+ -0.002076f, -0.020734f, 4.881495f, 0.002799f, 0.000342f, -0.019623f,
+ 1.786154f, 0.037462f, -0.019037f, 0.052833f, 11.408153f, -0.044602f,
+ 0.026155f, -0.518627f, -0.474499f, -0.427430f, -0.442733f, -0.011116f,
+ -22.379410f, -0.000549f, -0.001418f, 0.008090f, -0.295090f, -0.230268f,
+ -0.337278f, -0.001127f, -0.644282f, -0.598783f, -0.539417f, -0.003303f,
+ 9.189824f, 0.038066f, -0.004097f, -0.460045f, -0.308858f, -0.242691f,
+ -0.230835f, -0.273057f, 0.152226f, 0.179239f, -0.146382f, -0.004655f,
+ -0.242940f, -0.718862f, -0.001685f, -0.214736f, 3.263186f, 0.079463f,
+ -0.003854f, -0.187461f, -0.599144f, -0.419808f, -0.000597f, -0.136980f,
+ 0.184813f, -0.319525f, -0.007246f, 0.079709f, -0.883229f, -0.343748f,
+ -0.000077f, -0.172214f, -0.548759f, -0.194674f, -0.144786f, 0.043896f,
+ -0.176364f, -0.248394f, -0.090215f, -0.294743f, -0.280980f, -0.181436f,
+ -0.115681f, -0.071915f, -13.035494f, -0.075623f, 0.017052f, -0.171152f,
+ 5.910803f, 0.128344f, 0.010256f, -1.073301f, 2.387826f, 0.166183f,
+ -0.007193f, -0.257836f,
+ };
+
+static const float av1_partition_breakout_nn_bias_128_layer0[32] = {
+ 0.115591f, -0.100178f, -0.165523f, -0.122997f, 11.045759f, 1.034761f,
+ -0.323672f, -0.189087f, 2.850950f, 7.010029f, -21.447067f, 1.877031f,
+ 0.437442f, 5.929414f, -0.117274f, 4.462253f, -0.135198f, -0.145927f,
+ 8.727211f, 0.000000f, -3.532987f, -0.405898f, 11.364439f, -0.141728f,
+ -5.994947f, -0.362574f, 1.857687f, -0.100400f, -0.130312f, 0.006080f,
+ 0.429660f, -8.439470f,
+};
+
+static const float av1_partition_breakout_nn_weights_128_layer1[32] = {
+ -0.013738f, 0.022052f, -0.074437f, -0.211377f, -0.080433f, 0.015543f,
+ 0.002091f, 0.014252f, 0.134834f, 0.190263f, 0.244175f, -0.031747f,
+ 0.020068f, -0.068326f, 0.185471f, 0.660268f, -0.134898f, -0.010376f,
+ -0.276023f, -0.282921f, -0.022769f, 0.007070f, -0.186235f, 0.024407f,
+ -0.024837f, 0.005764f, 0.016599f, -0.040077f, 0.020990f, 0.095054f,
+ -0.039662f, 0.131499f,
+};
+
+static const float av1_partition_breakout_nn_bias_128_layer1[1] = {
+ 0.86678213f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_128_layer0,
+ av1_partition_breakout_nn_weights_128_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_128_layer0,
+ av1_partition_breakout_nn_bias_128_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_64_layer0[FEATURE_SIZE * 16] = {
+ 0.872892f, -0.235539f, -0.412159f, -0.142533f, -2.251479f, -0.057073f,
+ -0.001373f, 0.112147f, 5.281734f, 0.060704f, 0.000838f, -0.961554f,
+ 0.244995f, 0.154515f, -0.292654f, -0.167177f, -3.759112f, -0.486347f,
+ 0.003208f, -0.418226f, 2.618152f, 0.026832f, 0.003988f, -0.404406f,
+ -0.405434f, 0.102791f, -0.033406f, -0.029820f, -4.492342f, -0.154291f,
+ 0.012947f, -0.195075f, 0.009311f, -0.411410f, -0.010986f, -0.554822f,
+ 0.160576f, 0.020796f, -0.457230f, -0.191111f, -7.759542f, -0.065039f,
+ -0.001322f, 0.055691f, 0.291924f, -0.053076f, -0.148379f, -0.298383f,
+ 1.022023f, -0.033668f, -0.000804f, -0.825778f, -3.902254f, -0.085812f,
+ -0.052520f, -0.035012f, -0.465468f, -0.319231f, -0.497529f, -0.183068f,
+ -2.407131f, -0.062304f, 0.000874f, 0.108786f,
+ };
+
+static const float av1_partition_breakout_nn_bias_64_layer0[16] = {
+ 0.081425f, -14.404084f, 11.511393f, -0.930053f, 1.841889f, 15.020920f,
+ -1.872288f, 5.392535f, -0.329335f, -0.005358f, 12.600776f, 0.000000f,
+ -0.337413f, 4.492778f, 0.000000f, 17.043072f,
+};
+
+static const float av1_partition_breakout_nn_weights_64_layer1[16] = {
+ -0.465338f, -0.103023f, -0.174808f, -0.005156f, -0.016366f, -0.172494f,
+ 0.014185f, 0.067030f, -0.001939f, -0.175049f, 0.245992f, -0.181660f,
+ -0.038572f, 0.307899f, -0.294283f, 0.118323f,
+};
+
+static const float av1_partition_breakout_nn_bias_64_layer1[1] = {
+ -1.33438122f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_64_layer0,
+ av1_partition_breakout_nn_weights_64_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_64_layer0,
+ av1_partition_breakout_nn_bias_64_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_32_layer0[FEATURE_SIZE * 16] = {
+ -4.825528f, -0.145737f, 0.001907f, 0.145415f, -1.858153f, -0.080744f,
+ 0.000601f, 0.211991f, 0.384265f, -0.043945f, -0.521332f, -0.170622f,
+ -0.046866f, -0.600506f, -0.001216f, -0.332760f, -0.447677f, -0.605844f,
+ -0.121008f, -0.119936f, -0.215739f, -0.269665f, -0.668587f, 0.071318f,
+ -1.202551f, -0.729727f, -0.370084f, 0.088215f, -1.926800f, -0.086519f,
+ 0.000359f, 0.215120f, 0.718749f, 0.022942f, 0.003840f, -0.176518f,
+ 1.213451f, 0.080786f, 0.001557f, -1.053430f, 0.202698f, -0.583919f,
+ -0.535512f, -0.239927f, -0.110151f, -0.128832f, -0.441087f, -0.145575f,
+ -0.178518f, -0.585784f, 0.000029f, -0.833014f, -0.331358f, -0.520297f,
+ -0.088676f, -0.178487f, -1.430755f, 0.022981f, -0.106931f, 0.015573f,
+ -0.520814f, -0.045386f, -0.443123f, -0.484209f,
+ };
+
+static const float av1_partition_breakout_nn_bias_32_layer0[16] = {
+ 11.747026f, -9.337718f, 0.341648f, -0.155847f, -0.104005f, 4.666283f,
+ 6.669584f, 16.625504f, 9.885626f, 15.439183f, -0.346080f, 0.000000f,
+ -0.423808f, 0.000000f, 6.352258f, -0.155787f,
+};
+
+static const float av1_partition_breakout_nn_weights_32_layer1[16] = {
+ 0.168561f, -0.122519f, 0.524667f, 0.032474f, 0.059097f, 0.011900f,
+ 0.166445f, 0.127256f, -0.034838f, -0.212586f, -0.317973f, 0.348419f,
+ -0.004171f, 0.157694f, 0.117845f, 0.272115f,
+};
+
+static const float av1_partition_breakout_nn_bias_32_layer1[1] = {
+ 0.09049262f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_32_layer0,
+ av1_partition_breakout_nn_weights_32_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_32_layer0,
+ av1_partition_breakout_nn_bias_32_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_16_layer0[FEATURE_SIZE * 16] = {
+ 0.209371f, 0.028758f, 0.005764f, -0.384401f, -0.625777f, -0.005647f,
+ -0.316867f, 0.042985f, 0.127344f, 0.025461f, 0.011465f, -0.071043f,
+ -0.295977f, -0.076093f, -0.209681f, -0.311653f, -0.147538f, 0.009910f,
+ -0.130997f, -0.012326f, 0.024124f, -0.323578f, -0.005790f, -0.085664f,
+ -1.575066f, -0.119221f, 0.015018f, 0.187204f, 0.238117f, 0.084924f,
+ -0.004444f, -1.271538f, -0.709860f, -0.006226f, -0.903111f, 0.090573f,
+ -0.278642f, -0.011114f, 0.021162f, 0.081290f, -0.467486f, -0.040771f,
+ -0.224069f, -0.714390f, -0.281905f, -0.001336f, -0.761212f, -0.060385f,
+ -0.814479f, -0.050450f, -0.003666f, 0.085668f, -0.272589f, 0.057330f,
+ -0.206540f, -0.303418f, 0.075335f, -0.180468f, -0.064872f, -0.755948f,
+ -0.509287f, -0.048877f, -0.001512f, 0.077086f,
+ };
+
+static const float av1_partition_breakout_nn_bias_16_layer0[16] = {
+ 16.421495f, 4.012273f, -1.828571f, 0.000000f, -0.263564f, -0.201972f,
+ 6.564987f, 14.651000f, -3.227779f, 2.241833f, -0.137116f, 0.762876f,
+ 5.625762f, 0.615822f, 0.040057f, 16.668884f,
+};
+
+static const float av1_partition_breakout_nn_weights_16_layer1[16] = {
+ -0.096440f, 0.184316f, -0.021148f, 0.424974f, 0.003743f, 0.006310f,
+ 0.046266f, -0.219224f, -0.087004f, 0.024623f, -0.275798f, 0.120164f,
+ 0.269773f, -0.021105f, -0.146698f, 0.188764f,
+};
+
+static const float av1_partition_breakout_nn_bias_16_layer1[1] = {
+ 1.60751927f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_16_layer0,
+ av1_partition_breakout_nn_weights_16_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_16_layer0,
+ av1_partition_breakout_nn_bias_16_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_8_layer0[FEATURE_SIZE * 16] = {
+ -0.255885f, 0.109548f, -0.111054f, -0.476119f, -1.083031f, -0.342003f,
+ 0.048241f, -0.356013f, -0.085054f, 0.124908f, 0.000084f, -0.149906f,
+ -0.729829f, 0.133535f, -0.002125f, 0.207516f, -0.210163f, -0.567365f,
+ -0.590103f, 0.045308f, -0.539406f, 0.130550f, -0.663879f, -0.170549f,
+ 0.017587f, -0.054187f, 0.000550f, 0.038297f, -0.112891f, -0.012751f,
+ -0.048067f, 0.095564f, 0.079892f, 0.077285f, -0.749708f, -0.286312f,
+ -0.054334f, 0.132242f, -0.004152f, -0.209758f, -0.073407f, 0.082306f,
+ -0.001034f, -0.090990f, 0.122823f, -0.109794f, -0.230066f, -0.391155f,
+ -0.262245f, -0.004744f, -0.232246f, 0.099290f, -0.637484f, 0.111937f,
+ -0.548556f, -0.598344f, 0.123265f, -0.281395f, -0.399711f, -0.525671f,
+ -0.596269f, 0.098494f, -0.005765f, 0.173652f,
+ };
+
+static const float av1_partition_breakout_nn_bias_8_layer0[16] = {
+ 0.194141f, -0.111223f, 2.503733f, -7.155602f, -0.695068f, 0.114874f,
+ 2.056990f, 5.284306f, 0.639643f, -2.792049f, -2.232339f, -0.232209f,
+ 2.336705f, -0.278834f, 0.231905f, 7.954366f,
+};
+
+static const float av1_partition_breakout_nn_weights_8_layer1[16] = {
+ -0.014439f, 0.010171f, 0.048116f, -0.090659f, -0.081235f, -0.021840f,
+ -0.017360f, 0.031063f, -0.031737f, -0.023439f, -0.037725f, 0.021954f,
+ 0.055858f, 0.230970f, -0.056466f, 0.119780f,
+};
+
+static const float av1_partition_breakout_nn_bias_8_layer1[1] = {
+ 1.27784479f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_8 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_8_layer0,
+ av1_partition_breakout_nn_weights_8_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_8_layer0,
+ av1_partition_breakout_nn_bias_8_layer1,
+ },
+};
+#undef FEATURE_SIZE
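+
+// Sketch (hypothetical helper, not upstream code): each breakout net above
+// emits one raw logit; reading it as the probability that partition search can
+// stop early is a logistic transform. Requires <math.h> for expf().
+#if 0
+static float nn_logit_to_prob_sketch(float logit) {
+  return 1.0f / (1.0f + expf(-logit));  // sigmoid
+}
+#endif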
+
+#define FEATURE_SIZE 9 // Input layer size
+#define NUM_NODES 32 // Hidden layer size
+#define LABEL_SIZE 3 // Output layer size
+
+static const float av1_rect_partition_nn_weights_8_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ 0.22151f, 0.99424f, 0.23415f, -1.13841f, -0.11277f, 0.09530f, 0.14769f,
+ -1.18895f, -0.96640f, -0.21421f, -0.13974f, 0.03236f, 0.15777f, -0.03176f,
+ 0.02729f, -0.37344f, -0.01727f, -0.05469f, 0.19402f, -3.45508f, 0.90106f,
+ -2.91557f, 0.19379f, 0.14356f, -0.13291f, 0.05734f, -0.03032f, -0.13060f,
+ 0.35744f, 1.31630f, -1.54493f, -0.20749f, -0.24413f, -0.04524f, -0.12400f,
+ 1.08305f, -0.21596f, 0.76244f, 1.10616f, -1.71706f, 0.05768f, 0.10966f,
+ 0.00949f, -0.12680f, 0.00699f, -0.11522f, -0.38566f, 0.34283f, -0.35266f,
+ -0.40643f, -0.22462f, 0.32300f, -0.39737f, -0.20587f, -0.16096f, 1.07543f,
+ 0.30314f, -1.35659f, -0.38212f, 0.45857f, 0.76615f, 0.16819f, -1.24459f,
+ 0.39677f, 0.87436f, -2.33757f, 1.27471f, 0.27488f, 0.01019f, -0.01221f,
+ -0.07461f, -0.14577f, -0.01231f, -0.64426f, -1.02733f, -1.96242f, 0.95143f,
+ -0.06777f, -1.13868f, 0.01354f, -0.75590f, -0.78222f, -0.07453f, 0.61788f,
+ 0.56899f, 1.17144f, 0.70899f, 0.48568f, 0.11266f, 0.81579f, -0.03929f,
+ 0.01088f, 0.33599f, -0.22401f, -0.49654f, -0.02598f, 0.04509f, -0.08217f,
+ -0.30687f, 0.19851f, -2.96860f, -2.30698f, 0.01848f, 0.11801f, 0.06614f,
+ 0.01673f, -0.11002f, -0.08168f, 0.09204f, -0.06379f, 0.27972f, -0.31716f,
+ -0.00566f, -0.13651f, -0.37276f, 0.01511f, -0.23697f, 0.21696f, -0.19480f,
+ 0.60758f, -0.43506f, -0.02247f, -1.45073f, 0.84442f, -0.94018f, 0.32550f,
+ 0.03985f, -0.06581f, 0.21665f, 0.79472f, -2.41080f, 0.04788f, -0.09492f,
+ -0.10677f, 0.07250f, 0.14329f, -0.37319f, 0.53043f, -0.49108f, 0.25792f,
+ -0.36569f, -0.28669f, -0.18416f, -0.52385f, -1.17081f, -1.32153f, -1.13403f,
+ -0.26196f, 0.93379f, 0.72115f, 0.54464f, 0.27642f, 0.04757f, 2.01629f,
+ 1.55787f, -0.11665f, 1.00722f, -0.24352f, 0.53308f, 0.57719f, 0.39344f,
+ 0.19174f, 0.06339f, -0.02530f, 0.07724f, -0.32416f, -0.26992f, -0.35887f,
+ -0.35285f, -0.33379f, -0.37475f, -0.77335f, 1.70027f, -1.52153f, -0.26503f,
+ 0.97552f, -2.96705f, -0.91220f, -0.11827f, 0.00406f, -0.14514f, 0.18417f,
+ -0.20874f, 0.27293f, -0.34072f, -0.34838f, -0.19054f, -0.29806f, -0.27960f,
+ -0.19293f, -0.18275f, -0.05902f, 0.58625f, -0.05470f, -0.48814f, -0.45382f,
+ -0.05959f, 2.01250f, -0.30014f, 0.69546f, -1.24180f, 1.34923f, 0.20337f,
+ 0.16850f, 0.07187f, 0.72630f, -0.15380f, -2.40973f, -2.73561f, -1.71375f,
+ -1.61695f, 0.50052f, 0.09730f, 0.00579f, 0.06133f, -0.06512f, -0.61439f,
+ -1.16173f, -0.58716f, 1.60438f, 0.23242f, 0.91847f, 0.49041f, -0.16277f,
+ -0.02574f, -0.64593f, 1.17028f, 0.46852f, 0.14926f, 0.73853f, -0.78521f,
+ 0.05959f, -0.35590f, 0.02039f, 0.10812f, -0.28650f, 1.34038f, -0.72188f,
+ 0.62385f, -0.35271f, -0.39599f, 0.41543f, 0.53124f, -0.23510f, -0.15480f,
+ -0.05066f, -0.33529f, 0.05238f, -0.35311f, -0.26983f, -0.39764f, 0.01085f,
+ 0.26593f, -0.18411f, -0.29945f, 0.50090f, -0.03397f, 0.78562f, -0.33068f,
+ 1.21308f, -2.23273f, -0.33366f, -0.15164f, -1.13270f, 0.17394f, 0.65567f,
+ 0.76496f, 0.44325f, 0.01368f, -0.33619f, -0.64256f, 0.64478f, 0.84553f,
+ 1.74183f, 0.22563f, -0.14550f, -0.16258f, 0.03010f, 0.49922f, 0.64575f,
+ -0.29187f, -0.10348f, -1.43619f, -0.56540f, -0.14779f, 0.04616f, 0.87411f,
+ -1.08228f,
+};
+
+static const float av1_rect_partition_nn_bias_8_layer0[NUM_NODES] = {
+ 0.33919f, -0.03003f, 0.79073f, -0.18508f, 0.00668f, -0.12017f, 0.35362f,
+ -0.51642f, 0.06536f, 0.41668f, -0.06509f, 0.94606f, -0.15385f, 0.14936f,
+ 1.46274f, -0.06961f, 2.82537f, -1.95576f, -0.09457f, 0.02042f, -0.07480f,
+ -0.55083f, 0.26170f, 4.39883f, 0.33999f, -0.10502f, 0.70884f, -0.06992f,
+ -0.22638f, 1.40940f, -0.09309f, 0.05828f,
+};
+
+static const float av1_rect_partition_nn_weights_8_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.09209f, 0.26236f, 0.62136f, 0.76324f, -1.14678f, 0.42289f, -0.08895f,
+ -0.97267f, 2.05958f, 0.00843f, 0.35335f, 1.12096f, -0.11679f, 0.07350f,
+ -1.23231f, -0.61990f, 1.51379f, -1.99450f, 0.22441f, 2.41974f, -0.30488f,
+ -0.37869f, 0.47168f, -3.70132f, 0.00061f, 0.19432f, 0.11512f, 0.26200f,
+ -0.35285f, 0.37985f, 0.90571f, 0.27344f, 0.74840f, -0.17965f, -2.51433f,
+ 0.59235f, 1.16670f, -0.53446f, 0.67897f, 0.04505f, -0.86874f, 0.45361f,
+ -0.35033f, 1.21283f, 0.31426f, -0.20841f, 0.56757f, 0.45909f, -1.23683f,
+ 0.09835f, -0.17214f, -0.96323f, 0.01138f, -0.50233f, 0.30104f, 2.01814f,
+ 1.15821f, -0.11947f, 0.74574f, -0.30714f, -0.39646f, -1.30086f, -0.88541f,
+ -0.12259f, -0.54977f, 0.30069f, 1.84299f, -0.95141f, -0.65887f, -0.25888f,
+ -0.63265f, 1.29531f, -0.56672f, 0.10837f, -0.21297f, -2.19131f, 0.01156f,
+ 0.51912f, 0.46704f, 0.42810f, -0.59271f, 0.98469f, -0.17914f, -1.91163f,
+ -0.32807f, 0.48199f, -0.99525f, 1.67108f, -0.87631f, -0.60258f, -0.78731f,
+ -0.32877f, 0.44237f, 0.01087f, 0.07489f, -0.28224f,
+};
+
+static const float av1_rect_partition_nn_bias_8_layer1[LABEL_SIZE] = {
+ 1.70665f,
+ -0.77954f,
+ -0.92709f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_8 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_8_layer0,
+ av1_rect_partition_nn_weights_8_layer1 },
+ { av1_rect_partition_nn_bias_8_layer0, av1_rect_partition_nn_bias_8_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_16_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.18480f, -0.05410f, -0.18957f, 0.15451f, -0.38649f, -0.26162f, -0.22727f,
+ -0.38555f, -0.36738f, 0.74384f, -1.85999f, 0.98491f, -0.72119f, 1.77321f,
+ 0.39983f, 0.96314f, 0.23695f, 0.30200f, 0.30629f, -0.47617f, -1.43320f,
+ -1.81730f, 0.36554f, -0.07142f, -1.27242f, -1.27697f, 0.00110f, -0.32179f,
+ 0.27460f, 0.45428f, 0.15308f, -0.73906f, -0.28577f, -0.01238f, -0.16958f,
+ -0.85390f, 1.05484f, -1.62812f, 0.77632f, -0.27327f, -0.32527f, 0.32726f,
+ 1.73255f, 0.53763f, 0.59121f, -0.39068f, -0.32451f, -0.31869f, 0.17777f,
+ 0.07519f, -0.18066f, -0.11250f, -0.14616f, -0.16882f, -0.04099f, -0.67959f,
+ 0.39674f, -0.08596f, 0.18587f, -2.04097f, -1.73993f, 1.57212f, 1.42410f,
+ -1.36762f, -0.41485f, -1.12103f, 0.56959f, 0.11500f, 0.48945f, -0.13585f,
+ 1.22125f, 0.67071f, -1.11812f, -0.20660f, -0.52856f, 0.70663f, 0.74382f,
+ 0.61114f, -0.11454f, 1.14687f, 0.80322f, -0.45965f, -0.44466f, -0.05830f,
+ 0.13206f, -0.53750f, -0.11324f, -0.37971f, -0.13491f, -0.21268f, 1.93407f,
+ 1.34433f, 2.49427f, 2.91955f, 1.71730f, 0.03295f, 0.03587f, -0.14550f,
+ 0.08189f, -0.38655f, -0.35432f, -0.62706f, -0.01849f, -0.57882f, -0.60438f,
+ -1.01334f, -0.57302f, 0.22592f, 0.05916f, -0.05305f, -0.89824f, -0.52969f,
+ -0.24542f, 0.27029f, -0.40924f, -0.82452f, -0.60665f, -5.03025f, 0.83302f,
+ 1.83695f, 2.19716f, 2.31001f, 0.03657f, 0.00063f, -0.04379f, 0.05835f,
+ -0.08623f, 0.20557f, -0.17791f, 0.07874f, -0.25456f, -0.19513f, -0.27753f,
+ -0.31982f, 0.00245f, -0.33183f, 0.26059f, -0.22165f, 0.37582f, -0.30411f,
+ -0.22639f, -0.14739f, -0.20201f, -0.37507f, -1.30653f, 0.49570f, 1.03673f,
+ 0.66139f, 0.44941f, -0.44461f, -0.50376f, -0.49664f, 0.18608f, -0.26175f,
+ 0.14844f, 0.78715f, -0.70344f, -0.87624f, -0.98535f, -0.35346f, 0.37094f,
+ -0.43135f, -0.22571f, 3.46263f, 3.13580f, -1.33203f, -0.15247f, -0.15866f,
+ -0.11214f, 0.12211f, 0.03964f, -1.87597f, -4.81597f, -4.80195f, -4.98096f,
+ -5.62336f, -0.05337f, -0.00943f, 0.00792f, 0.02742f, 1.05679f, 2.41455f,
+ 0.85382f, 1.42504f, 0.58096f, 0.21443f, 1.02694f, 1.06746f, 1.20242f,
+ 0.60767f, 1.98667f, -0.80879f, -0.63495f, 1.95508f, 0.23952f, -0.15019f,
+ -0.16097f, 0.30155f, -3.42407f, -1.34998f, 9.07689f, -2.22559f, 2.22562f,
+ -0.03348f, -0.05229f, 0.05931f, 0.03042f, -0.18068f, -0.05732f, -0.33010f,
+ -0.32279f, -0.26607f, -0.02723f, -0.04067f, 0.08700f, -0.16366f, -0.24935f,
+ -0.69124f, 0.58508f, 0.50654f, 0.04492f, 1.38340f, -1.51487f, 1.72889f,
+ -1.95618f, -3.65013f, -1.38525f, -3.05516f, -2.40448f, 2.47467f, 0.03784f,
+ 0.08052f, -0.01971f, -0.08918f, -0.84997f, -0.55302f, -1.07861f, -0.62626f,
+ 0.61751f, -0.11012f, -0.24185f, -0.39201f, -1.85390f, -0.31261f, -0.11927f,
+ 0.15671f, -0.23450f, -0.14916f, -0.31715f, -0.19350f, 0.01795f, -0.11533f,
+ -0.05799f, -0.03142f, 0.20218f, -0.39499f, -0.33859f, -0.13201f, -0.19527f,
+ -0.28459f, -0.20346f, 0.89457f, -2.22103f, -2.37455f, -2.00221f, 2.44553f,
+ 0.33915f, 0.50047f, -0.34625f, -0.19667f, -0.56333f, -0.84328f, 1.25767f,
+ -1.70297f, 1.00482f, -0.00103f, -1.40813f, 0.21311f, 0.39230f, -0.07302f,
+ -3.49100f, 1.60675f, -2.90692f, 0.11022f, 0.13507f, -0.13308f, 0.15201f,
+ -0.05573f,
+};
+
+static const float av1_rect_partition_nn_bias_16_layer0[NUM_NODES] = {
+ -0.16783f, -0.16023f, 0.52215f, -0.04109f, 2.00122f, -0.11633f, 0.25535f,
+ 1.80638f, 1.69273f, -0.25998f, -6.83550f, -0.79682f, -1.03466f, 1.42721f,
+ 0.00000f, -0.00000f, -0.11665f, -0.12047f, -1.01497f, 7.27181f, -0.78548f,
+ -1.39335f, -5.42248f, -0.10388f, 0.07634f, 2.81012f, -0.57429f, -0.15629f,
+ -0.12044f, 1.65478f, -0.75153f, 1.18441f,
+};
+
+static const float av1_rect_partition_nn_weights_16_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ -0.26407f, 0.06322f, 0.87932f, 0.17772f, 0.71686f, -0.12283f, 0.08454f,
+ 0.20098f, -0.31763f, -0.33178f, -4.59535f, -0.04367f, 0.17099f, 3.80486f,
+ 0.16750f, 0.29218f, 0.57234f, -0.96550f, -0.10599f, -4.91130f, -0.14658f,
+ 0.95803f, -4.13925f, 0.24567f, 0.25708f, 1.60547f, -1.03251f, -0.31053f,
+ -0.05659f, -0.94121f, -0.68926f, -0.24738f, -0.38019f, 0.98950f, 0.13689f,
+ 0.24504f, 0.49623f, 0.19980f, 0.38349f, 0.37481f, 0.54540f, -0.02198f,
+ 3.43385f, 1.02543f, -0.40921f, -3.07235f, 0.02996f, 0.00323f, -0.35414f,
+ 0.71099f, 1.39334f, 2.43741f, -1.11007f, -0.22739f, -4.21757f, 0.11905f,
+ 0.00353f, -1.69637f, 0.45944f, -0.19884f, 0.03624f, 0.25729f, 0.23659f,
+ -2.08405f, 0.08573f, -0.53393f, -1.28103f, -0.53970f, -0.65465f, 0.31821f,
+ -0.09884f, -0.69026f, -0.37284f, 0.04622f, 1.32973f, -0.15414f, 0.19138f,
+ -0.67927f, -0.17658f, 0.36008f, -0.51832f, 0.09887f, -1.94414f, 2.95227f,
+ 1.76937f, -0.26687f, 8.50976f, 0.26247f, 0.60262f, -0.27910f, 0.30061f,
+ -0.05117f, 0.16018f, 0.71195f, 0.57871f, 1.57794f,
+};
+
+static const float av1_rect_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+ 2.68750f,
+ -1.31894f,
+ -1.36768f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_16_layer0,
+ av1_rect_partition_nn_weights_16_layer1 },
+ { av1_rect_partition_nn_bias_16_layer0, av1_rect_partition_nn_bias_16_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_32_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.54654f, -0.43537f, -0.10620f, -0.48051f, -0.43543f, -0.22737f, -0.15429f,
+ -0.09858f, -0.09438f, 0.37306f, 0.23934f, -1.86375f, -1.18307f, -0.32995f,
+ -0.09745f, 0.05431f, -0.13799f, 0.14734f, -0.33219f, 0.18057f, -0.23792f,
+ -0.28126f, 0.02977f, -0.07431f, 0.07860f, 0.00067f, -0.01927f, 1.01841f,
+ -0.57739f, 0.08412f, -1.33843f, -1.05563f, -0.28693f, -0.39425f, -0.69572f,
+ -0.16703f, 0.02808f, 0.11994f, -0.26267f, 0.19706f, -0.29707f, -0.25305f,
+ -0.07050f, -0.02704f, -0.31528f, -0.42301f, 0.22496f, -0.37001f, -0.23319f,
+ -0.11139f, -0.30513f, 0.04213f, -0.12550f, 0.02504f, 0.33245f, 0.01102f,
+ -0.35950f, -0.05949f, -0.19590f, -0.27457f, -0.28339f, -0.15676f, -0.21538f,
+ 0.65066f, 0.28443f, -1.24943f, -3.00246f, -1.01897f, 0.09304f, 0.70052f,
+ -0.12877f, 0.21120f, -0.37476f, 0.23261f, -0.28401f, 0.09837f, 0.00020f,
+ -0.12106f, -0.32354f, -0.02472f, -0.19772f, 1.01886f, 0.16596f, -0.06532f,
+ 1.72938f, 1.57754f, 0.55963f, 0.33246f, -0.20023f, 0.30715f, 0.08629f,
+ 0.18945f, -0.45988f, -1.22610f, -0.05152f, -0.48859f, -1.02104f, -0.27315f,
+ -0.57698f, 0.04157f, -0.92428f, -1.31268f, 1.78210f, 0.10291f, 1.55042f,
+ -1.26793f, 1.39042f, -1.43729f, 0.25600f, 5.21263f, 5.31955f, 5.19316f,
+ 5.43430f, 0.00294f, -0.00970f, -0.02333f, 0.00250f, 1.17672f, 6.27544f,
+ 4.95973f, 3.54009f, 4.51269f, 0.30750f, 0.78780f, -0.44741f, -0.76442f,
+ 0.75050f, 0.58799f, 0.03400f, -2.09859f, 1.67313f, 0.12503f, 0.28609f,
+ 1.15809f, 2.46530f, -0.04898f, 0.23072f, -0.12635f, -0.82097f, -0.63827f,
+ 2.16779f, 1.77132f, 0.15434f, -1.06427f, 0.06206f, -0.87732f, -0.61897f,
+ -0.44593f, -0.77131f, -0.15979f, -0.02282f, -0.74381f, 0.66052f, -0.22992f,
+ 1.74638f, 1.29199f, -0.55464f, 0.98316f, 0.06665f, 0.50254f, -0.66292f,
+ 0.17113f, -0.32633f, -1.85803f, -0.92759f, 4.44965f, 1.33057f, 0.02135f,
+ -0.27446f, -0.26018f, -0.12613f, -0.14470f, -0.23355f, -0.09717f, -0.24123f,
+ -0.05535f, -0.19146f, -0.36222f, -0.30458f, -0.40323f, 0.21779f, 0.14248f,
+ -0.48630f, 0.18840f, 0.11040f, 0.17287f, -0.51880f, 1.12466f, -0.38888f,
+ -0.16421f, -0.31784f, -0.36112f, -0.25386f, -0.01636f, 0.10029f, -0.26881f,
+ -0.17051f, -0.30903f, -0.08573f, -0.28774f, -0.01173f, -0.09706f, -0.23089f,
+ -0.12922f, -0.17463f, -0.12433f, -0.23074f, 0.15220f, 1.29826f, 0.23788f,
+ 0.04189f, 2.66416f, 0.48815f, -0.06803f, 0.96742f, 1.27165f, -0.70348f,
+ -0.09941f, -0.42948f, -0.20243f, -0.02364f, -0.26689f, -0.40629f, -0.68217f,
+ -0.48073f, 2.43657f, -2.60191f, -1.82837f, 0.50440f, 0.71829f, 0.76491f,
+ 0.28293f, 0.20568f, 0.92642f, -0.02496f, 1.43637f, -0.24474f, -1.21030f,
+ 0.54084f, 1.05130f, 1.29572f, 0.03750f, -0.36894f, 0.74548f, -1.33857f,
+ -0.84858f, 1.35230f, 0.80175f, 0.66136f, 1.06473f, 0.18701f, 1.42413f,
+ 0.04661f, -0.07820f, 0.64990f, -0.43595f, 1.18304f, -0.11437f, -0.06365f,
+ 0.03558f, 0.78260f, -1.74890f, 1.56217f, -1.23424f, 4.59193f, -3.35072f,
+ 0.01180f, -0.18296f, -0.20870f, 0.04510f, 1.52595f, -1.37402f, -0.33123f,
+ -0.85957f, 0.80598f, 0.03743f, 0.02354f, 0.37707f, 1.62095f, -0.29627f,
+ -0.31778f, -0.45789f, -0.14906f, 0.25315f, -0.10817f, -0.32610f, -0.40890f,
+ 0.33984f,
+};
+
+static const float av1_rect_partition_nn_bias_32_layer0[NUM_NODES] = {
+ -0.17482f, 0.39042f, 0.00000f, 1.69677f, 0.08792f, -0.09301f, 0.13809f,
+ 4.84061f, 0.00000f, 0.40515f, 0.46246f, 0.20644f, -5.77478f, -1.54510f,
+ 0.05660f, -0.32013f, 0.23649f, 0.03778f, -2.53710f, -0.27869f, 0.45623f,
+ -0.04155f, -0.18445f, -0.73405f, -0.50243f, 2.23191f, 1.93272f, -1.07032f,
+ -0.27602f, -1.98063f, 0.20816f, -0.01315f,
+};
+
+static const float av1_rect_partition_nn_weights_32_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.02827f, 1.02560f, -0.07137f, -0.31911f, 0.11365f, 0.13684f, -0.07816f,
+ -5.23036f, -0.34340f, 0.84526f, -1.51845f, 0.07017f, -8.12570f, 6.24061f,
+ 0.35739f, -0.09937f, -0.30978f, 0.22032f, 0.74968f, -0.34557f, 0.45547f,
+ -0.16512f, 0.07118f, 1.66415f, 0.41320f, -1.81533f, -1.96004f, 1.04666f,
+ 0.84049f, 4.31009f, 0.68850f, 0.26322f, -0.24634f, -1.25889f, 0.31952f,
+ 0.63632f, 0.05801f, -0.10664f, -0.21992f, 2.44386f, 0.19526f, -0.09838f,
+ 1.53049f, -0.26630f, 3.54126f, -3.40574f, 0.72730f, 0.04557f, 0.92652f,
+ 0.15522f, 2.35895f, -0.13347f, 0.56907f, 0.15352f, 0.01823f, -0.73939f,
+ 0.43104f, 1.90321f, 0.31267f, -0.51972f, 0.50094f, -3.98372f, -3.41518f,
+ -0.48183f, 0.26661f, 0.64146f, 0.14500f, -0.01695f, 0.16653f, -0.37846f,
+ 0.08412f, 2.69714f, -0.20258f, -0.75786f, 0.11201f, 0.61878f, 4.22231f,
+ -3.55330f, -1.14137f, -0.37722f, -0.28000f, -0.72581f, -2.62827f, -0.19448f,
+ -0.59398f, -0.30136f, -0.17725f, -0.69630f, -0.41132f, 0.12208f, 2.11441f,
+ -1.08794f, -1.41694f, 0.02620f, 2.18792f, 0.04271f,
+};
+
+static const float av1_rect_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+ 2.47332f,
+ -1.65756f,
+ -0.81573f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_32_layer0,
+ av1_rect_partition_nn_weights_32_layer1 },
+ { av1_rect_partition_nn_bias_32_layer0, av1_rect_partition_nn_bias_32_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_64_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ 0.08972f, 4.09095f, -0.31398f, -2.43631f, -0.74767f, 1.42471f, 1.60926f,
+ 1.44721f, 1.88259f, 2.35375f, 1.88299f, 2.01109f, 0.98679f, 2.24131f,
+ 0.06279f, -0.08315f, 0.32107f, 0.91334f, -0.36569f, 5.55049f, 5.44943f,
+ 5.20471f, 5.39099f, -0.01943f, -0.00284f, 0.02203f, -0.01309f, 1.41917f,
+ 6.68460f, -6.15986f, 6.41341f, -3.20630f, -0.00567f, -0.00038f, 0.05960f,
+ 0.04308f, 0.95366f, 3.48535f, 2.98266f, 4.11784f, 3.44255f, 0.61630f,
+ 0.71405f, 0.63945f, -0.00713f, 0.39193f, 1.91621f, 3.32755f, 0.71674f,
+ -0.11647f, 2.07090f, 2.64191f, 0.07949f, -0.05023f, 0.99935f, 0.83145f,
+ 0.75898f, -0.98764f, -0.58731f, 1.21734f, -0.08076f, -3.26780f, 1.66278f,
+ 0.04189f, -0.33177f, -1.58648f, 1.00883f, -0.56132f, -2.34877f, 0.67056f,
+ -2.32297f, -0.91641f, -1.02909f, 4.19781f, 3.87484f, 4.32778f, -1.97171f,
+ -0.24734f, 0.00822f, 0.05892f, 0.12697f, -3.62915f, -2.93127f, 7.94856f,
+ -3.29311f, 3.26001f, -0.02231f, 0.02741f, 0.05919f, 0.08190f, -1.49344f,
+ -0.64475f, -0.24627f, 4.03324f, -1.14799f, -0.18465f, -0.17829f, 0.10394f,
+ 0.08580f, -5.74721f, 4.42467f, 3.63964f, 3.00258f, -1.22744f, -0.29408f,
+ 0.00767f, 0.12305f, 0.05249f, -0.17166f, -0.20120f, -0.32941f, -0.31901f,
+ 0.04628f, -0.35249f, -0.18272f, 0.03956f, -0.19329f, -0.33564f, 0.09856f,
+ -0.00173f, -0.31751f, -0.05702f, -0.20558f, -0.31464f, -0.02488f, -0.00729f,
+ -0.35854f, -0.14762f, -0.34897f, -0.12746f, 0.04011f, -0.24918f, -0.53516f,
+ -0.28440f, -0.36789f, -1.34889f, -9.10044f, -9.19238f, 4.48042f, 6.54429f,
+ -0.00226f, 0.00430f, 0.00321f, 0.00442f, 0.87551f, -0.16224f, -0.22832f,
+ -0.60640f, -0.28738f, 0.18062f, 0.22008f, -0.47406f, 0.80302f, 0.12149f,
+ 1.49530f, 1.05069f, -2.02985f, -0.92833f, 0.25616f, 0.12852f, 3.51840f,
+ 0.25226f, -2.63283f, -4.04386f, 8.46300f, -2.93408f, 0.44069f, 0.08276f,
+ 0.34482f, -0.22615f, 0.28666f, 3.02962f, -1.20055f, -1.04832f, -0.97632f,
+ -0.99530f, 1.44196f, 1.68550f, 0.49360f, 1.08155f, -0.26059f, -0.02876f,
+ -0.27492f, -0.06205f, -0.09496f, -0.12314f, -0.30228f, -0.07453f, -0.38857f,
+ 1.17443f, 2.41497f, 1.90537f, 2.37716f, 2.91495f, -0.44455f, -0.51176f,
+ 0.48195f, 0.53032f, 0.23696f, -1.06211f, 1.47459f, -0.89029f, 0.29521f,
+ 0.66291f, -0.42653f, 1.82308f, -1.30372f, -0.36192f, -3.40388f, -1.61476f,
+ -2.29745f, -0.66886f, -2.08252f, -0.54552f, -4.06849f, 0.02948f, 0.27297f,
+ -4.81472f, 4.60404f, -0.11053f, 0.14765f, 0.02826f, -0.14688f, -0.07066f,
+ -0.01224f, 1.20377f, 7.02725f, -6.02627f, 6.87255f, -3.14257f, 0.01074f,
+ 0.02397f, -0.02359f, 0.01901f, 0.14956f, -1.67671f, 2.26714f, 2.57043f,
+ -0.45888f, -1.60265f, -2.11475f, -2.74029f, -2.74658f, -0.35630f, -2.63013f,
+ -2.14814f, -0.67266f, -1.56850f, 0.57137f, -1.14428f, -0.34265f, -0.12521f,
+ 0.01220f, -0.74906f, -0.19270f, 0.68110f, -0.24737f, -0.70568f, -1.64826f,
+ -0.35847f, -0.15984f, -1.17932f, -8.72306f, -8.72834f, 3.93701f, 6.17812f,
+ -0.03191f, -0.00104f, 0.01402f, -0.00046f, -0.94517f, 1.51266f, -0.56318f,
+ 0.72260f, -0.09253f, -0.09069f, -2.16695f, -0.23653f, 0.24418f, 2.21148f,
+ -1.47954f, -1.01439f, 0.31536f, 0.77238f, -0.85083f, -0.15758f, -0.50886f,
+ 0.09101f,
+};
+
+static const float av1_rect_partition_nn_bias_64_layer0[NUM_NODES] = {
+ 0.91706f, -1.31328f, -5.16196f, 1.13191f, -0.98044f, -1.61122f, 1.03039f,
+ -0.98537f, -4.45568f, -4.34802f, -0.92116f, 0.66836f, -0.10752f, -0.13065f,
+ -0.35567f, -0.35693f, 1.74941f, 1.17379f, -3.45555f, 5.66321f, -0.24917f,
+ -1.11940f, -0.73656f, -0.19299f, -0.04181f, 1.11010f, -2.97859f, -0.16774f,
+ 0.59835f, -0.31269f, -0.30585f, -1.66212f,
+};
+
+static const float av1_rect_partition_nn_weights_64_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.58963f, 4.20320f, -8.62465f, -6.54014f, 5.41108f, 2.33581f, -0.10354f,
+ -1.17753f, -3.45909f, -2.24722f, 2.20881f, 3.21971f, -0.09087f, -0.21624f,
+ 0.16529f, -8.40985f, -1.60205f, -1.41538f, 4.41826f, -4.63069f, -0.27742f,
+ 4.08710f, 0.26439f, -1.46028f, 0.51234f, 6.25212f, -3.35650f, -1.21348f,
+ 1.37201f, 8.89151f, 0.28859f, -0.97328f, -0.36196f, -2.71701f, 4.54196f,
+ -0.62476f, -2.43814f, -1.34209f, 0.12850f, 1.73859f, 3.09809f, -4.42434f,
+ -1.82552f, -3.66420f, -0.31535f, 0.00968f, -0.02019f, 9.66824f, 0.58835f,
+ 1.50425f, 2.84487f, 2.55522f, 0.01409f, -2.27594f, -0.31800f, 0.91076f,
+ -0.66808f, 0.33120f, -0.12460f, 0.64457f, -0.36416f, -10.30843f, 1.51013f,
+ 2.06861f, -0.20989f, -0.87119f, 3.68642f, 7.33662f, -2.88037f, -0.52414f,
+ -0.35036f, -0.45947f, -0.07406f, 6.46346f, -0.16031f, 0.27071f, 0.38845f,
+ -0.21940f, 0.08583f, -1.39526f, 0.50554f, 0.45279f, -6.61856f, 1.84069f,
+ -0.19149f, -1.77235f, 0.75136f, 1.11797f, 0.32677f, -7.10427f, 3.82908f,
+ 1.04238f, -0.91435f, 1.93317f, -1.84946f, -0.48909f,
+};
+
+static const float av1_rect_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+ 0.32215f,
+ -0.57522f,
+ 0.25314f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_64_layer0,
+ av1_rect_partition_nn_weights_64_layer1 },
+ { av1_rect_partition_nn_bias_64_layer0, av1_rect_partition_nn_bias_64_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_128_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.70901f, -3.03481f, 3.30604f, -1.28803f, -0.08610f, -0.33320f, -0.30716f,
+ 0.25100f, 0.14323f, -0.98422f, -0.89084f, -0.24508f, -1.10785f, -0.82524f,
+ 0.11766f, -0.42777f, 1.08965f, 4.35125f, -1.19388f, 4.22042f, 4.96306f,
+ 6.32406f, 3.29899f, -0.90768f, 0.05203f, 0.38467f, 1.74257f, -0.19918f,
+ -0.11335f, 0.00140f, -0.42303f, -0.04419f, 0.03583f, -0.05441f, -0.19586f,
+ 0.01484f, -1.19964f, 0.25497f, 3.04502f, 0.05446f, -0.23253f, 0.00266f,
+ 0.07117f, -2.78986f, -4.62953f, 1.45331f, 0.43923f, 0.92298f, -0.47736f,
+ 1.49165f, 0.45942f, -1.99787f, 3.33510f, 0.17234f, 0.04024f, -1.42780f,
+ 0.23566f, -0.90970f, 1.18041f, -1.45865f, 2.30878f, -1.28507f, 1.87290f,
+ 1.91186f, 4.74826f, -3.70735f, 4.49808f, -4.72275f, -0.02696f, -0.02642f,
+ -0.06093f, -0.01121f, -0.70683f, 2.69737f, -1.88563f, 2.48637f, 1.10922f,
+ 0.74624f, 0.40308f, 2.06396f, 1.39289f, 0.00909f, -2.05271f, -1.53539f,
+ -1.38323f, 0.83303f, -0.32250f, 0.51172f, 3.91249f, 1.66373f, 1.13184f,
+ -2.22874f, -1.13448f, -0.11185f, 0.19387f, 0.36770f, -0.58933f, 0.22789f,
+ 1.17307f, 0.77461f, 0.20817f, 0.33417f, 0.54037f, 0.32961f, -0.18456f,
+ -9.78171f, -0.17216f, -3.44703f, -2.42158f, 0.51946f, 4.35949f, -0.73335f,
+ -1.61515f, -0.29622f, -0.37617f, -0.42316f, 0.74922f, 1.44386f, 3.92704f,
+ -3.76274f, 4.19775f, -3.86958f, 0.00074f, -0.02418f, -0.12944f, 0.05857f,
+ -0.85507f, 5.42546f, 5.40338f, 5.54347f, 5.59791f, -0.01611f, 0.01618f,
+ -0.01654f, -0.00270f, -0.39608f, -0.40410f, -0.24551f, 0.09124f, -0.34413f,
+ -0.11504f, 0.12793f, -0.31523f, 0.09148f, -0.08567f, -0.05140f, -0.13310f,
+ -0.81200f, 0.06882f, -0.52537f, -12.74048f, -0.45395f, -4.04775f, -1.84887f,
+ -1.02573f, 0.32788f, 1.06828f, -1.25503f, -0.42693f, 2.01413f, -2.29103f,
+ 0.62271f, 1.11764f, -1.83113f, -1.32325f, -1.65651f, -2.87826f, 1.46910f,
+ 0.60885f, 0.16079f, 0.00171f, -0.25658f, -0.25465f, -0.14149f, 0.19497f,
+ -0.07866f, -0.37080f, -0.05778f, -0.08870f, -0.20491f, 0.84521f, -0.18214f,
+ -1.38441f, -1.08932f, -1.76627f, 0.73172f, 0.05967f, 1.28057f, 3.42722f,
+ 1.69287f, 0.77169f, 0.44528f, 1.85513f, 0.07840f, 1.31252f, 2.89948f,
+ 1.49489f, 0.15281f, 0.54708f, -1.14185f, -2.51063f, 0.36618f, -0.55322f,
+ 0.96671f, 1.59470f, 1.38252f, 1.99697f, 0.03266f, -0.23200f, -0.01127f,
+ -0.18918f, -0.37598f, -0.03119f, -0.36039f, -0.21192f, -0.11565f, -4.22635f,
+ 1.41252f, 0.56608f, -0.08867f, 3.11924f, -0.54597f, -0.12504f, -0.05289f,
+ -0.28665f, -0.58297f, -1.18362f, -0.76201f, -1.22011f, -0.58756f, 0.14740f,
+ 1.43971f, 0.98381f, -0.02998f, -0.40678f, -0.23047f, -0.12979f, 0.04003f,
+ -0.22081f, -0.09294f, -0.15955f, -0.10379f, -0.10192f, -1.51316f, 2.39482f,
+ -1.69975f, 3.58976f, -0.91032f, -0.03498f, 0.48982f, -0.13418f, 0.76256f,
+ 1.61003f, -2.01676f, -1.24430f, -3.25763f, 1.12314f, 2.00740f, 0.04613f,
+ -0.14746f, -0.57374f, 3.44511f, -0.56767f, -4.08432f, -2.04894f, 2.35951f,
+ -0.00458f, 0.18512f, 0.09916f, -0.04084f, -1.56207f, 1.38034f, 4.17302f,
+ -1.47326f, -2.03530f, -0.00210f, 0.27469f, -0.17423f, 0.86860f, 2.76195f,
+ 2.43269f, -3.57331f, 2.08715f, -1.44171f, -0.17389f, 2.26157f, -0.07852f,
+ 2.02519f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer0[NUM_NODES] = {
+ 2.53427f, 1.66678f, -0.84914f, -0.15070f, -1.74769f, 0.45218f, -0.26067f,
+ 2.05916f, 0.08978f, 5.30984f, 2.66243f, -1.62740f, 0.70018f, 1.96403f,
+ -4.97152f, -0.05425f, -3.84474f, -1.28006f, 3.47490f, -0.08373f, 0.00225f,
+ -1.40692f, -0.27569f, -0.30253f, 0.77377f, -0.67636f, -0.26379f, 1.82348f,
+ 0.66120f, 0.61119f, -1.42293f, 0.32676f,
+};
+
+static const float av1_rect_partition_nn_weights_128_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 1.53453f, -0.23707f, 7.88368f, 0.33340f, 0.97523f, 1.38538f, -0.16746f,
+ 4.42070f, 3.18678f, -5.03545f, -2.27029f, -3.75719f, -0.26850f, -4.93432f,
+ -8.75673f, 0.27398f, -5.77882f, -0.91616f, -2.62725f, -0.23961f, 0.31249f,
+ 3.32134f, 0.25375f, -0.00394f, 2.30213f, -0.14183f, 0.14544f, -1.42830f,
+ 1.31101f, 3.99389f, -0.00017f, -2.90184f, -2.11444f, 2.16734f, -3.05133f,
+ 0.39206f, 4.61489f, -2.88181f, -0.47745f, 2.86649f, -1.20621f, 3.70550f,
+ 1.58029f, -4.58731f, -2.29350f, -0.76930f, 5.19135f, -0.22521f, -5.08782f,
+ 2.17316f, 1.30563f, 0.16777f, -2.17767f, -2.09904f, 1.37001f, 0.25091f,
+ -1.76743f, 1.57940f, 0.30544f, -2.39895f, -0.08532f, -1.77122f, 1.84010f,
+ -0.88449f, 0.79299f, -1.35368f, -4.54110f, 0.02244f, -5.11580f, 1.60883f,
+ 0.29352f, -6.47042f, -1.81426f, 1.24013f, 0.90980f, 7.93977f, 2.12555f,
+ 5.24720f, 4.19508f, 0.21499f, 11.06045f, -0.74752f, 0.89396f, 0.26422f,
+ 1.72332f, -1.25113f, -1.71136f, 0.13676f, -0.07867f, -0.96929f, 0.19911f,
+ 3.58233f, -0.76470f, -2.24162f, -2.87465f, 3.18736f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer1[LABEL_SIZE] = {
+ 1.09014f,
+ -0.53317f,
+ -0.55668f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_128_layer0,
+ av1_rect_partition_nn_weights_128_layer1 },
+ { av1_rect_partition_nn_bias_128_layer0,
+ av1_rect_partition_nn_bias_128_layer1 }
+};
+#undef FEATURE_SIZE
+#undef NUM_NODES
+#undef LABEL_SIZE
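+
+// Sketch (hypothetical helper, not upstream code): the rect-partition nets
+// above emit three raw logits; a max-subtracted softmax turns them into a
+// probability distribution over the candidate rectangular partition decisions.
+// Requires <math.h> for expf().
+#if 0
+static void nn_softmax_sketch(const float *logits, float *probs, int n) {
+  float max_logit = logits[0];
+  for (int i = 1; i < n; ++i)
+    if (logits[i] > max_logit) max_logit = logits[i];
+  float sum = 0.0f;
+  for (int i = 0; i < n; ++i) {
+    probs[i] = expf(logits[i] - max_logit);  // exponent <= 0: no overflow
+    sum += probs[i];
+  }
+  for (int i = 0; i < n; ++i) probs[i] /= sum;
+}
+#endif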
+
+// Below are the models used for simple_motion_search_based_split.
+// Thresholds:
+// The first index is aggressiveness, the second is frame resolution, and the
+// third is bsize. The logit-to-probability mapping and the gating logic are
+// sketched after the two threshold tables below.
+static const float av1_simple_motion_search_split_thresh[4][3][5] = {
+ // Aggressiveness = 0
+ {
+ // lowres
+ {
+ 1.40402595879f, // p = 0.8028197
+ 4.72845183649f, // p = 0.99123732
+ 1.86517797783f, // p = 0.86589934
+ 1.58715223005f, // p = 0.83021506
+ 7.22695596987f, // p = 0.9992738
+ },
+ // midres
+ {
+ 5.839480f, // p = 0.997098
+ 1.877167f, // p = 0.867285
+ 3.073499f, // p = 0.955783
+ 1.405601f, // p = 0.803071
+ 2.555636f, // p = 0.927951
+ },
+ // hdres
+ {
+ 5.839480f, // p = 0.997098
+ 1.877167f, // p = 0.867285
+ 3.073499f, // p = 0.955783
+ 1.405601f, // p = 0.803071
+ 2.555636f, // p = 0.927951
+ },
+ },
+ // Aggressiveness = 1
+ {
+ // Lowres
+ {
+ 100.0000f, // p = 1.000000
+ 4.952535f, // p = 0.992984
+ 1.720880f, // p = 0.848242
+ 1.426233f, // p = 0.806314
+ 1.491905f, // p = 0.816364
+ },
+ // Midres
+ {
+ 100.0000f, // p = 1.000000
+ 3.137263f, // p = 0.958404
+ 2.703262f, // p = 0.937219
+ 1.877166f, // p = 0.867285
+ 2.221149f, // p = 0.902133
+ },
+ // Hdres
+ {
+ 4.417680f, // p = 0.988082
+ 3.086898f, // p = 0.956349
+ 3.966704f, // p = 0.981416
+ 1.532565f, // p = 0.822381
+ 3.449975f, // p = 0.969230
+ },
+ },
+ // Aggressiveness = 2
+ {
+ // lowres
+ {
+ 100.000000f, // p = 0.998048
+ 1.484020f, // p = 0.815179
+ 1.866781f, // p = 0.866085
+ 1.706711f, // p = 0.846409
+ 2.080369f, // p = 0.888980
+ },
+ // midres
+ {
+ 100.000000f, // p = 0.0
+ 3.265763f, // p = 0.963235428881
+ 2.024598f, // p = 0.883355591569
+ 1.846446f, // p = 0.863709256976
+ 2.240962f, // p = 0.903868036126
+ },
+ // hdres
+ {
+ 3.133026f, // p = 0.958234684141
+ 2.940954f, // p = 0.949834204693
+ 2.484544f, // p = 0.923051170045
+ 1.702972f, // p = 0.845922460525
+ 1.655562f, // p = 0.839641385729
+ },
+ },
+ // Aggressiveness = 3
+ {
+ // lowres
+ { 100.000000f, 1.41409519484f, 0.606066095487f, 0.0993410805635f,
+ 0.762099214988f },
+ // midres
+ { 100.000000f, 0.702207995397f, 0.503550081119f, 0.0403228785199f,
+ 0.557298794638f },
+ // hdres
+ { 1.21895384144f, 1.26798450469f, 0.872537808115f, 0.975869438148f,
+ 1.86572095242f },
+ },
+};
+
+static const float av1_simple_motion_search_no_split_thresh[4][3][5] = {
+ // Aggressiveness = 0
+ {
+ // lowres
+ {
+ -100.0f, // p = 0.0
+ -100.0f, // p = 0.0
+ -100.0f, // p = 0.0
+ -100.0f, // p = 0.0
+ -100.0f, // p = 0.0
+ },
+ // midres
+ {
+ -3.38168078f, // p = 0.032872917
+ -4.08610739f, // p = 0.016526795
+ -1.78302370f, // p = 0.15270848
+ -100.000000f, // p = 0.0
+ -100.000000f, // p = 0.0
+ },
+ // hdres
+ {
+ -100.000000f, // p = 0.0
+ -100.000000f, // p = 0.0
+ -2.98718897f, // p = 0.048008
+ -100.000000f, // p = 0.0
+ -3.33229488f, // p = 0.03447975
+ },
+ },
+ // Aggressiveness = 1
+ {
+ // Lowres
+ {
+ -100.0000f, // p = 0.0
+ -4.893793f, // p = 0.007437
+ -3.387766f, // p = 0.032680
+ -2.982806f, // p = 0.048209
+ -2.330372f, // p = 0.088639
+ },
+ // Midres
+ {
+ -100.0000f, // p = 0.000000
+ -6.131853f, // p = 0.002168
+ -2.346579f, // p = 0.087338
+ -2.712849f, // p = 0.062219
+ -3.195430f, // p = 0.039338
+ },
+ // Hdres
+ {
+ -3.491416f, // p = 0.029557
+ -2.192853f, // p = 0.100394
+ -3.620180f, // p = 0.026079
+ -2.030855f, // p = 0.116001
+ -2.797586f, // p = 0.057455
+ },
+ },
+ // Aggressiveness = 2
+ {
+ // lowres
+ {
+ -100.0000f, // p = 0.0
+ -3.617350f, // p = 0.026151
+ -5.902503f, // p = 0.002725
+ -4.677840f, // p = 0.009213
+ -2.168378f, // p = 0.102626
+ },
+ // midres
+ {
+ -100.0000f, // p = 0.0
+ -3.204195f, // p = 0.0390081679555
+ -2.354128f, // p = 0.0867382128969
+ -2.523326f, // p = 0.0742390077132
+ -3.112328f, // p = 0.0426016085803
+ },
+ // hdres
+ {
+ -5.047760f, // p = 0.00638270448225
+ -3.414994f, // p = 0.0318301469487
+ -5.628090f, // p = 0.00358255438917
+ -2.122691f, // p = 0.10691083145
+ -1.972387f, // p = 0.122132728355
+ },
+ },
+ // Aggressiveness = 3
+ {
+ // lowres
+ { -100.000000f, -2.04766486133f, -1.00442099188f, -1.15077982642f,
+ -1.0830321897f },
+ // midres
+ { -100.000000f, -0.985686808303f, -0.757739584866f, -0.890120107569f,
+ -0.228236297886f },
+ // hdres
+ { -1.03535679263f, -1.57431743203f, -0.564851540156f, -0.35442301663f,
+ -1.36741555171f },
+ },
+};
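+
+// The probabilities annotated above are the logistic transform of the
+// thresholds, p = 1 / (1 + exp(-thresh)); e.g. 1 / (1 + exp(-1.404026))
+// ~= 0.8028. Entries clamped to +/-100.0f effectively disable a rule (a few of
+// their annotations do not follow this mapping). A sketch of how the two
+// tables would gate the decision (hypothetical helper; the real control flow
+// lives in the partition search code):
+#if 0
+static void sms_split_gating_sketch(float logit, float split_thresh,
+                                    float no_split_thresh, int *force_split,
+                                    int *prune_split) {
+  *force_split = logit > split_thresh;     // confident enough to always split
+  *prune_split = logit < no_split_thresh;  // confident enough to never split
+}
+#endif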
+
+static const float av1_simple_motion_search_split_mean_128[17] = {
+ 14.119120f, 14.087010f, 12.016185f, 11.966075f, 12.042454f, 11.994805f,
+ 12.152105f, 12.100394f, 12.178377f, 12.128937f, 4.779944f, 0.714786f,
+ 3.535450f, 3.566207f, 0.835913f, 3.315452f, 3.302908f,
+};
+
+static const float av1_simple_motion_search_split_std_128[17] = {
+ 1.832420f, 1.835338f, 2.019207f, 2.020793f, 2.008731f, 2.008403f,
+ 1.900999f, 1.907081f, 1.908915f, 1.913122f, 2.109345f, 0.451517f,
+ 1.407097f, 1.372501f, 0.370355f, 1.321495f, 1.319665f,
+};
+
+static const float av1_simple_motion_search_split_mean_64[17] = {
+ 12.363721f, 12.314348f, 10.404341f, 10.333541f, 10.405775f, 10.336996f,
+ 10.402246f, 10.330084f, 10.405584f, 10.334330f, 4.554232f, 0.896393f,
+ 2.819613f, 2.855845f, 0.926296f, 2.808782f, 2.798229f,
+};
+
+static const float av1_simple_motion_search_split_std_64[17] = {
+ 1.878920f, 1.882255f, 1.950167f, 1.953289f, 1.913869f, 1.914781f,
+ 1.920096f, 1.924454f, 1.880200f, 1.882499f, 2.050922f, 0.304750f,
+ 1.144391f, 1.125088f, 0.261289f, 1.145059f, 1.131215f,
+};
+
+static const float av1_simple_motion_search_split_mean_32[17] = {
+ 10.750278f, 10.679627f, 8.745625f, 8.644149f, 8.757436f, 8.656657f,
+ 8.759780f, 8.656299f, 8.772563f, 8.669839f, 4.208026f, 0.958573f,
+ 2.308769f, 2.347375f, 0.961685f, 2.323464f, 2.296322f,
+};
+
+static const float av1_simple_motion_search_split_std_32[17] = {
+ 1.879269f, 1.883531f, 1.935828f, 1.935677f, 1.915823f, 1.914773f,
+ 1.909733f, 1.910315f, 1.890451f, 1.890032f, 1.913318f, 0.199276f,
+ 0.988825f, 0.972115f, 0.191956f, 0.977131f, 0.951418f,
+};
+
+static const float av1_simple_motion_search_split_mean_16[17] = {
+ 9.076768f, 8.974986f, 7.078364f, 6.926072f, 7.088739f, 6.936111f,
+ 7.096697f, 6.942841f, 7.114978f, 6.961046f, 3.865480f, 0.982632f,
+ 1.886023f, 1.912892f, 0.981492f, 1.926059f, 1.891233f,
+};
+
+static const float av1_simple_motion_search_split_std_16[17] = {
+ 1.922965f, 1.925609f, 1.851980f, 1.847558f, 1.848410f, 1.843990f,
+ 1.843931f, 1.839582f, 1.840304f, 1.836144f, 1.760042f, 0.130639f,
+ 0.841086f, 0.833523f, 0.134780f, 0.840790f, 0.831309f,
+};
+
+static const float av1_simple_motion_search_split_mean_8[17] = {
+ 7.120238f, 6.957731f, 5.176309f, 4.889594f, 5.178396f, 4.886607f,
+ 5.195322f, 4.905566f, 5.198845f, 4.904745f, 3.648933f, 0.993198f,
+ 1.496831f, 1.520804f, 0.991864f, 1.489763f, 1.460761f,
+};
+
+static const float av1_simple_motion_search_split_std_8[17] = {
+ 1.698498f, 1.696000f, 1.629605f, 1.614641f, 1.632476f, 1.618831f,
+ 1.618352f, 1.603742f, 1.623089f, 1.609674f, 1.668587f, 0.082193f,
+ 0.759407f, 0.759684f, 0.089830f, 0.742797f, 0.730632f,
+};
+
+static const float *const av1_simple_motion_search_split_mean[5] = {
+ av1_simple_motion_search_split_mean_128,
+ av1_simple_motion_search_split_mean_64,
+ av1_simple_motion_search_split_mean_32,
+ av1_simple_motion_search_split_mean_16,
+ av1_simple_motion_search_split_mean_8,
+};
+
+static const float *const av1_simple_motion_search_split_std[5] = {
+ av1_simple_motion_search_split_std_128, av1_simple_motion_search_split_std_64,
+ av1_simple_motion_search_split_std_32, av1_simple_motion_search_split_std_16,
+ av1_simple_motion_search_split_std_8,
+};
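+
+// The tables above hold per-feature normalization constants: before
+// inference, each 17-dimensional feature vector is presumably standardized
+// as (f - mean) / std using the entries for the current block size. A
+// minimal sketch under that assumption (helper name and buffers are
+// illustrative, not library API):
+static inline void sms_normalize_features(const float *features,
+                                          const float *mean, const float *std,
+                                          int num_features, float *out) {
+  for (int i = 0; i < num_features; ++i) {
+    out[i] = (features[i] - mean[i]) / std[i];  // z-score per feature
+  }
+}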
+
+#define NUM_HIDDEN_LAYERS_128 1
+#define NUM_FEATURES_128 17
+#define NUM_LAYER_0_UNITS_128 20
+#define NUM_LOGITS_128 1
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_128[] = {
+ 0.24095f, -0.397761f, -0.388619f, -0.0629548f, -0.44577f, 0.688212f,
+ -0.20889f, -1.08227f, -0.0313894f, -0.615505f, -0.401839f, 0.40233f,
+ -0.171305f, 0.439803f, 1.58527f, -0.968535f, -1.29255f, 1.14846f,
+ 0.885777f, 0.116412f, -0.225704f, 0.316506f, 0.793951f, -0.63591f,
+ 0.097789f, -0.327027f, -0.778396f, -0.231667f, -0.9622f, 1.0044f,
+ 0.32594f, 0.179768f, -0.115529f, -0.499395f, -1.14727f, -1.26111f,
+ 0.269818f, -0.0882028f, -0.349107f, 0.100901f, 0.0249506f, 0.528929f,
+ 0.113961f, 0.929794f, 0.242494f, -0.122828f, -0.0477379f, 0.170659f,
+ 0.0500187f, 0.28859f, 0.78783f, 0.482412f, 0.795298f, 0.179517f,
+ 0.453911f, -0.298029f, -0.903332f, 0.510615f, 0.691994f, 0.433383f,
+ -0.140802f, -1.11635f, -0.547326f, 1.11318f, 0.71905f, 0.978538f,
+ 0.097444f, -0.0386012f, 0.713599f, 0.465164f, 0.391278f, -0.472864f,
+ 0.230224f, -0.279508f, 0.558192f, -0.468625f, 0.55995f, -0.57507f,
+ -1.39947f, -0.755819f, -1.04512f, -0.411552f, -0.830444f, -0.106571f,
+ -0.0972184f, 0.251842f, 0.269955f, 0.230492f, -0.290581f, -0.484799f,
+ 0.0151041f, 0.171047f, 0.829999f, -0.384581f, 0.220301f, -0.121687f,
+ 1.88848f, -0.482809f, -0.48185f, 1.34482f, -0.716438f, -0.284482f,
+ -1.78592f, -1.29333f, 0.886867f, 0.80106f, 0.456415f, 0.649095f,
+ 0.231093f, 0.361562f, 0.290018f, 0.128009f, -0.196343f, 0.0607802f,
+ 0.576761f, -0.0413836f, 0.0300984f, -0.318998f, 0.204434f, -0.712524f,
+ 0.833394f, -0.81168f, 0.765488f, -0.720973f, 1.12866f, -0.838694f,
+ 1.295f, -0.159127f, 1.05404f, 0.736519f, 0.248662f, 0.229233f,
+ 0.0434302f, 0.0551856f, 0.197862f, 0.354823f, -0.32429f, -0.227353f,
+ -0.132198f, -0.438118f, -0.210401f, -0.81046f, 0.653555f, 0.826737f,
+ 0.154235f, 0.228945f, 0.123089f, 0.614964f, -0.0940471f, -0.00676807f,
+ 0.24996f, 0.949233f, 0.746526f, -0.044474f, 0.386414f, 0.503221f,
+ 0.155133f, -0.698848f, -0.735356f, -0.255091f, 0.413235f, -0.335295f,
+ -0.145757f, 0.326299f, -0.602629f, -0.844474f, -0.346722f, -0.42598f,
+ -0.491016f, -0.447732f, -0.965366f, -0.0242841f, 0.836606f, -0.104877f,
+ 1.23236f, 0.683986f, 0.787005f, -0.0253437f, 1.2145f, 1.29554f,
+ -1.24302f, -0.229495f, 0.439415f, 0.885087f, -0.408704f, -0.119299f,
+ -0.0960972f, 0.60148f, 0.683271f, -0.057129f, -0.180295f, -0.264815f,
+ -0.363184f, 0.638271f, 0.631083f, -0.252899f, -0.164364f, -1.31274f,
+ 0.354408f, 0.0429172f, 0.371154f, -1.0978f, 0.0433642f, -0.467394f,
+ -0.706572f, 1.57198f, -0.0701271f, 1.93149f, -0.446267f, 1.4519f,
+ -1.29567f, 0.309978f, -0.878062f, 0.891494f, 0.364005f, -0.209611f,
+ -0.125927f, 0.184097f, 0.0629695f, -0.43375f, -0.0980562f, 1.08547f,
+ 0.578312f, 0.16566f, -0.198852f, -0.241854f, -0.523934f, -0.206037f,
+ -0.867721f, 1.00041f, 1.09848f, -2.12562f, -0.19992f, -0.186128f,
+ -0.03507f, 0.0484884f, 0.160856f, 0.10802f, -0.805141f, -1.06902f,
+ 0.290363f, 0.0222096f, -0.849266f, 0.112932f, 0.148682f, -0.0457585f,
+ 1.139f, 1.79141f, 0.194122f, -0.342508f, -0.403572f, 0.133678f,
+ 0.217553f, -0.263759f, 0.18441f, 0.254529f, 0.0471115f, 0.733178f,
+ -0.416205f, 0.441447f, -0.443335f, 0.725005f, -0.78946f, 0.71301f,
+ -0.644969f, 1.5445f, 0.365277f, -0.455775f, -0.365066f, 0.4742f,
+ -0.381714f, -0.545794f, -0.0464861f, -0.222768f, -0.0106466f, -0.069743f,
+ 0.0335566f, 0.378348f, -0.249663f, 0.922286f, 0.125711f, -0.894619f,
+ 0.444682f, 0.447893f, -1.98936f, -1.41978f, 0.0406667f, -0.199928f,
+ -0.199786f, 0.463481f, 0.334931f, -0.396222f, -0.0732259f, 0.796684f,
+ -0.140817f, -0.26878f, 0.194642f, 0.895784f, -0.369976f, -2.26981f,
+ -0.0791776f, -0.0492268f, 0.6715f, 0.281805f, 0.0156664f, -0.779785f,
+ 0.17743f, 0.188786f, -0.588077f, -0.359153f, 0.258319f, 0.881688f,
+ 0.846894f, 1.00292f, 0.838134f, 0.680632f, 0.273098f, -0.329261f,
+ 0.217757f, -0.506726f, -0.336523f, -0.695875f, -0.252006f, 0.751216f,
+ 0.334409f, -0.0151467f, 0.0885474f, 0.0973114f, -0.248754f, -0.263716f,
+ 0.369906f, -0.213749f, -0.0355395f, -0.137799f, 2.43233f, -0.944233f,
+ -0.745167f, 0.318558f, 0.316608f, 0.568678f
+};
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_128[] = {
+ 0.821344f, 1.11542f, -1.24172f, 1.03642f, 1.13511f,
+ 1.16414f, -0.278655f, -1.35558f, -1.26788f, -1.63189f,
+ -0.323271f, 1.21319f, -0.888415f, 0.987145f, -1.16767f,
+ 0.255833f, -0.1392f, 1.43265f, -1.54952f, 1.65159f
+};
+
+static const float av1_simple_motion_search_split_logits_kernel_128[] = {
+ 0.3565753f, 0.5490161f, -1.015597f, 0.565366f, 0.751604f,
+ 0.922747f, -1.931846f, 1.759353f, -0.7362949f, 0.5707034f,
+ -1.092127f, 0.936767f, 2.034499f, 2.08148f, 0.9509507f,
+ -1.342504f, -0.834566f, 0.618184f, 0.844113f, 1.182693f
+};
+
+static const float av1_simple_motion_search_split_logits_bias_128[] = {
+ 1.819351f
+};
+
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_128 = {
+ NUM_FEATURES_128,
+ NUM_LOGITS_128,
+ NUM_HIDDEN_LAYERS_128,
+ {
+ NUM_LAYER_0_UNITS_128,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_kernel_128,
+ av1_simple_motion_search_split_logits_kernel_128,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_bias_128,
+ av1_simple_motion_search_split_logits_bias_128,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_128
+#undef NUM_FEATURES_128
+#undef NUM_LAYER_0_UNITS_128
+#undef NUM_LOGITS_128
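+
+// Each NN_CONFIG in this file bundles one small fully connected model, in
+// initializer order: feature count, logit count, hidden-layer count,
+// per-layer unit counts, then the kernel and bias arrays per layer. libaom
+// evaluates these with av1_nn_predict(); the sketch below is an illustrative
+// stand-in for the one-hidden-layer case, assuming row-major kernels
+// (one row of weights per output node), a ReLU hidden activation, and raw
+// (linear) logits. The helper name is an assumption.
+static inline void sms_nn_forward_1layer(
+    const float *input, int num_in, const float *hidden_kernel,
+    const float *hidden_bias, int num_hidden, const float *logits_kernel,
+    const float *logits_bias, int num_logits, float *logits) {
+  float hidden[64];  // enough for every model in this file (<= 32 units)
+  for (int h = 0; h < num_hidden; ++h) {
+    float acc = hidden_bias[h];
+    for (int i = 0; i < num_in; ++i)
+      acc += hidden_kernel[h * num_in + i] * input[i];
+    hidden[h] = acc > 0.0f ? acc : 0.0f;  // ReLU
+  }
+  for (int o = 0; o < num_logits; ++o) {
+    float acc = logits_bias[o];
+    for (int h = 0; h < num_hidden; ++h)
+      acc += logits_kernel[o * num_hidden + h] * hidden[h];
+    logits[o] = acc;
+  }
+}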
+
+#define NUM_HIDDEN_LAYERS_64 1
+#define NUM_FEATURES_64 17
+#define NUM_LAYER_0_UNITS_64 24
+#define NUM_LOGITS_64 1
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_64[] = {
+ -1.40663f, -0.851503f, -0.0613111f, 0.741591f, 0.302754f,
+ 0.184001f, 0.0474853f, 0.371096f, 0.0541624f, 0.381508f,
+ 0.355427f, 0.0428822f, 0.154916f, -0.00490099f, 0.025484f,
+ 0.0208921f, 0.140596f, -0.292525f, -0.459067f, -0.081393f,
+ 0.109824f, -0.290183f, 0.720236f, 0.385835f, -0.150643f,
+ -0.078518f, 0.0979819f, -0.102135f, 0.137152f, -0.0786457f,
+ 0.0171441f, 0.991338f, -0.546583f, -1.0714f, -0.0842851f,
+ 0.244072f, 0.427379f, 0.146775f, -0.921613f, -0.912093f,
+ 0.393566f, -0.232375f, 0.19963f, 0.312355f, 0.55659f,
+ -0.104714f, -0.137563f, 0.0985237f, 0.0788307f, -0.225514f,
+ 0.0228832f, -0.288733f, -0.00737685f, -0.711657f, -0.256796f,
+ 0.0869605f, 0.583977f, 0.384306f, 1.46692f, -0.741126f,
+ -0.21105f, -0.276604f, -0.0151463f, -0.0227997f, -0.0403232f,
+ 0.044122f, 0.0185784f, -0.0451951f, 0.00489513f, -0.387131f,
+ 0.0966724f, -0.599174f, -0.00243351f, -0.21439f, 0.302043f,
+ 0.130334f, -0.191251f, 0.863261f, -1.50112f, 0.00901057f,
+ 0.000324294f, -0.0572545f, 0.0117685f, -0.0734682f, -0.0570435f,
+ -0.126253f, 1.2313f, -0.328267f, 0.211788f, -0.175438f,
+ -0.0419298f, 0.166447f, -0.178739f, -0.326221f, -0.0439188f,
+ 1.01182f, -0.390678f, -0.426343f, 0.0944665f, -0.225042f,
+ -0.183344f, 0.0500763f, -0.377393f, -0.673401f, -0.436907f,
+ -0.00366876f, -0.363412f, 0.195194f, 0.250248f, -0.397193f,
+ -0.0917222f, -0.0221579f, 1.7693f, -0.0694484f, -0.0410764f,
+ -0.134571f, -0.159992f, -0.170359f, -0.249333f, -0.128056f,
+ -0.617054f, -0.808701f, -0.540642f, 0.396391f, 0.147787f,
+ 0.346916f, 0.709852f, 0.116064f, 0.0509731f, 0.073713f,
+ -0.365082f, -1.09287f, -0.618214f, 0.20545f, 0.126161f,
+ -0.140012f, 0.62592f, 0.316326f, -0.392765f, -0.15934f,
+ 0.337617f, -0.41669f, -0.295225f, 0.0602025f, -0.0150657f,
+ -0.319629f, 0.783729f, -0.0661199f, -0.362657f, 0.390042f,
+ -0.043614f, -0.0414596f, 0.121155f, -0.309775f, -0.284761f,
+ -0.243932f, 0.279855f, -0.266823f, 0.734824f, -0.164028f,
+ 0.261776f, -0.105585f, 0.10733f, -0.180469f, 1.18875f,
+ -1.12836f, -0.173008f, 0.150221f, 0.111598f, 0.148306f,
+ -1.2833f, -1.06346f, 0.233546f, 0.16432f, 0.00142378f,
+ 0.340574f, -0.0140885f, 0.634761f, -0.122096f, 0.821487f,
+ 0.421424f, -0.0256687f, -0.035503f, -0.0453547f, -0.0215179f,
+ -0.0671277f, -0.0486862f, -0.962761f, -0.208383f, 0.109573f,
+ -0.210668f, -0.176485f, 0.421279f, 0.41605f, 0.342084f,
+ 0.619364f, 0.103718f, -0.00341643f, 0.00266677f, 0.249089f,
+ -0.22848f, -0.0368968f, 1.12092f, -0.64912f, -0.456579f,
+ 0.477823f, 0.418345f, 1.41515f, 0.0936279f, 0.886155f,
+ -0.785656f, -0.217109f, -0.561829f, -0.286435f, -0.884068f,
+ -0.148839f, -0.282848f, 0.0683745f, 0.0962815f, -0.111975f,
+ 0.0509158f, -0.211274f, 0.744909f, -0.8982f, 0.315232f,
+ -0.78624f, 0.598387f, -0.530952f, 0.677357f, 0.0371339f,
+ 0.99209f, -0.681899f, -0.291416f, -0.224822f, -0.26049f,
+ -0.0436525f, -0.380004f, -0.27187f, 0.534779f, 0.717939f,
+ 0.418197f, -0.152539f, -0.0684039f, -0.186308f, -0.0653121f,
+ 0.194145f, -0.196367f, 0.256997f, -0.726269f, -0.307672f,
+ -0.153362f, 0.450827f, 0.708842f, -0.0667079f, 0.555564f,
+ 0.0486892f, 0.0715072f, -0.7211f, -0.849797f, 0.0650271f,
+ 1.2747f, -0.646738f, -0.53042f, 0.182197f, 0.928203f,
+ 0.180621f, -0.00640791f, -0.171416f, 0.092688f, -0.391275f,
+ -0.0650657f, 0.0843773f, 0.170824f, 0.378085f, 0.0596657f,
+ 0.844398f, -1.3083f, -1.27828f, -0.199179f, 0.557855f,
+ 0.241479f, 0.385804f, 0.169533f, -0.0028072f, 0.0538041f,
+ 0.00136234f, 0.0130481f, 0.0349449f, -0.0366494f, -0.000474055f,
+ 0.437956f, 0.286724f, -0.298187f, 0.461967f, 0.43065f,
+ -0.0877194f, -0.19133f, 0.379121f, -0.687751f, -1.64077f,
+ -0.375191f, -0.336836f, -0.323904f, -0.101859f, 0.0126672f,
+ -0.346332f, 0.112303f, -0.863336f, 0.155538f, 0.366509f,
+ -0.0976829f, 0.635278f, -0.681967f, -0.527729f, 0.591839f,
+ 0.366678f, 0.189981f, 0.0208007f, -0.565809f, 0.70183f,
+ -0.282844f, -0.327485f, 0.347243f, -1.13014f, -0.373378f,
+ -0.514978f, 0.662994f, -0.144931f, 0.1402f, -0.820049f,
+ 0.711498f, 0.681156f, 1.06515f, -0.423409f, -0.0392664f,
+ 0.0675396f, -0.0508602f, 0.0431443f, 0.0212639f, -0.0279887f,
+ -0.62611f, -0.202064f, 0.701934f, 1.28452f, -0.00858481f,
+ -0.517249f, 0.0615832f, -0.260215f, 0.0949119f, -0.28423f,
+ -0.39573f, -0.0574246f, -0.318658f, 0.0601775f, -0.0629386f,
+ -0.134208f, 0.111686f, -0.23355f, 0.078667f, 0.741023f,
+ 0.828523f, -0.345067f, -0.315135f, -0.0957154f, 0.522825f,
+ -0.190057f, -0.473789f, -0.390489f, 0.200677f, -0.0271802f,
+ 0.110336f, 0.493302f, 0.663126f, 0.570148f, -0.380042f,
+ -0.437349f, -0.660884f, 0.301908f, 0.0644179f, 0.172494f,
+ 0.461917f, 0.330938f, -0.140041f, -0.0430205f, -1.51003f,
+ -0.410984f, -0.182161f, 0.0235313f, -0.364849f, 0.154183f,
+ -0.592465f, 0.272701f, 0.192389f, -0.0497777f, -0.924467f,
+ -0.179513f, -0.592217f, 0.436363f, -0.0716164f, 0.189094f,
+ -0.574697f, -0.304303f, 0.326441f, -0.0865553f, 0.735948f,
+ 0.266912f, 0.435824f, -0.123322f
+};
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_64[] = {
+ -1.19333f, 1.01834f, -1.10844f, 0.0454873f, -1.45506f, 0.580864f,
+ -0.040979f, -0.505681f, -1.15072f, 0.692697f, -0.520812f, -0.479384f,
+ 0.529652f, 0.507252f, -1.08619f, 0.0586375f, 0.0929614f, -0.46753f,
+ -0.701857f, -0.362933f, -0.291983f, -0.133933f, -0.0131351f, -0.267582f
+};
+
+static const float av1_simple_motion_search_split_logits_kernel_64[] = {
+ -3.32501f, 0.43082f, -1.060692f, 1.328908f, 0.8892894f, 0.6488833f,
+ -1.096516f, -0.664786f, -1.301339f, 0.508805f, -2.128406f, -0.757304f,
+ 0.383839f, 0.694763f, -0.591725f, 0.770385f, 1.021594f, 0.589181f,
+ -0.76238f, 1.488826f, 0.709135f, -0.575738f, 0.26421759f, -0.2484219f
+};
+
+static const float av1_simple_motion_search_split_logits_bias_64[] = {
+ 0.699037f
+};
+
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_64 = {
+ NUM_FEATURES_64,
+ NUM_LOGITS_64,
+ NUM_HIDDEN_LAYERS_64,
+ {
+ NUM_LAYER_0_UNITS_64,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_kernel_64,
+ av1_simple_motion_search_split_logits_kernel_64,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_bias_64,
+ av1_simple_motion_search_split_logits_bias_64,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_64
+#undef NUM_FEATURES_64
+#undef NUM_LAYER_0_UNITS_64
+#undef NUM_LOGITS_64
+
+#define NUM_HIDDEN_LAYERS_32 1
+#define NUM_FEATURES_32 17
+#define NUM_LAYER_0_UNITS_32 20
+#define NUM_LOGITS_32 1
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_32[] = {
+ -0.980626f, -0.946611f, 0.103761f, 0.408899f, 0.498149f,
+ 0.0490161f, 0.253279f, 0.332029f, 0.00367441f, 0.364401f,
+ -0.236433f, 0.0592119f, -0.0978848f, 0.159733f, -0.018052f,
+ -1.10726f, 1.16167f, -0.244982f, -0.147819f, -0.147095f,
+ 0.111404f, -0.349502f, 0.441178f, 0.0984191f, -0.135537f,
+ -0.0423312f, 0.0123079f, 0.358012f, -0.266796f, 0.0125811f,
+ 0.196563f, 0.337093f, -1.07266f, -1.25134f, 0.57337f,
+ -0.521717f, 0.259824f, 0.537383f, -0.463688f, -0.336128f,
+ 0.373385f, 0.483443f, -0.229293f, -0.33373f, -0.656021f,
+ 0.768647f, 0.179279f, 0.315415f, 0.187749f, 1.07839f,
+ 0.0626629f, -0.230299f, 0.662606f, -0.414154f, 0.459334f,
+ -0.6312f, 0.427704f, -0.249849f, 0.701056f, -0.707969f,
+ 0.057401f, 0.620434f, 0.665748f, -0.501356f, -0.230685f,
+ 0.0722371f, -0.0988625f, -0.114035f, -0.653799f, 0.571353f,
+ 0.268276f, 1.13251f, -1.0695f, -0.225607f, -0.984355f,
+ -0.42213f, 0.300422f, 1.21492f, -0.139931f, -0.000726004f,
+ 0.045964f, -0.0817352f, -0.0278813f, -0.0102341f, -0.0144087f,
+ -0.475882f, 1.20682f, -0.359919f, 0.277189f, -0.166401f,
+ 0.599211f, -0.129872f, 0.574211f, -0.247573f, 0.824405f,
+ -1.53329f, -0.202151f, -0.328698f, -0.516322f, -0.281416f,
+ -0.383651f, -0.252862f, -0.43185f, 0.456802f, -0.430055f,
+ -0.55245f, -0.6884f, -0.541456f, -0.281376f, 1.10425f,
+ -0.140706f, 1.59816f, -0.0343895f, -0.00920039f, -0.0307667f,
+ 0.0560132f, -0.0340302f, -0.10848f, 0.0593314f, -0.951795f,
+ 0.876831f, -1.00548f, -0.566244f, 0.430061f, 1.10109f,
+ -0.634212f, -0.0755369f, -0.108953f, 1.03191f, 0.109036f,
+ -0.0415309f, 0.0681162f, -0.0611775f, -0.0231938f, 0.0973158f,
+ -0.0558169f, -0.823484f, -0.918509f, 0.16756f, 0.27087f,
+ 0.286074f, 0.174069f, 0.1304f, 0.386074f, 0.433953f,
+ 0.0291467f, -1.74087f, 0.0296094f, -0.00793714f, -0.13041f,
+ 0.00990992f, -0.0137848f, -0.0742606f, -0.251029f, -0.645316f,
+ 0.640029f, 0.550607f, 0.470097f, 0.549451f, -0.285723f,
+ -0.164759f, -0.128166f, -0.391496f, -0.80287f, 0.0769472f,
+ 1.34391f, 0.0215005f, 0.0669497f, 0.131919f, 0.291674f,
+ 0.0952889f, -0.677953f, -0.364054f, 0.144823f, 0.246198f,
+ -0.12393f, 0.363661f, 0.215091f, -0.239658f, 0.18491f,
+ 0.118703f, 0.0064156f, 1.38619f, -1.3845f, 0.0567323f,
+ 1.20812f, -0.720374f, -1.92158f, -1.48657f, 0.335601f,
+ 0.409379f, 0.373618f, 0.231274f, 0.292194f, 0.368619f,
+ 0.2398f, 0.473579f, 0.83402f, -0.0133751f, -0.00344358f,
+ 2.20688e-05f, 0.00836757f, 0.00405377f, 0.0110539f, -0.260154f,
+ 0.192112f, -0.666986f, 0.302875f, -0.113302f, 0.17882f,
+ -0.221493f, 0.146161f, -0.448697f, 0.584187f, 0.122109f,
+ 0.989981f, -1.14706f, -0.734042f, 0.0638213f, 0.213357f,
+ 0.068543f, -0.808558f, 0.404741f, 0.808313f, 1.57523f,
+ -0.113448f, 0.254102f, -0.350065f, -0.615f, 0.0753549f,
+ -0.540936f, -0.0250732f, -0.225681f, -0.161384f, 0.0128342f,
+ -0.0933368f, -0.286904f, 0.130133f, -0.874747f, 0.392585f,
+ -0.493135f, 0.169708f, 0.0909804f, 1.89921f, -0.469954f,
+ 0.65165f, -0.953401f, -0.21595f, -0.37479f, 0.0451146f,
+ 0.0234621f, -0.0596903f, -0.0682308f, -0.0830426f, 0.130011f,
+ -0.409141f, 0.0627038f, -0.581148f, -0.513922f, 0.631676f,
+ 0.0637034f, 0.0539081f, 0.0638872f, 0.515863f, -0.0123463f,
+ 0.177238f, 0.279506f, -0.930345f, 1.23726f, 0.202851f,
+ 0.708792f, -0.445086f, -0.0267075f, -0.913822f, -0.0714978f,
+ -0.281107f, -0.0770565f, -0.23086f, -0.165893f, -0.319683f,
+ 0.216235f, -0.490999f, 2.04841f, -0.0524071f, -0.239043f,
+ -0.0526375f, 0.023002f, -0.132685f, -0.155354f, -0.186503f,
+ -0.904296f, 0.166478f, 0.063268f, -0.302842f, -0.27179f,
+ -0.428299f, 0.50193f, 0.480717f, -0.864275f, 0.317096f,
+ 0.40698f, 0.0286107f, 0.189432f, -0.0374374f, 0.0671728f,
+ 0.203681f, -0.457959f, -0.155776f, 0.340948f, 0.542841f,
+ 0.342675f, -0.000952399f, 0.470957f, 0.744418f, -1.11763f,
+ -0.658812f, -0.044832f, 0.0688237f, -0.357766f, 0.428662f,
+ -0.087152f, -0.291903f, 0.373244f, -0.587853f, 0.415895f,
+ -0.535694f, 0.621785f, -0.143648f, 0.0451373f, 0.00068827f,
+ 1.84432f, -1.26239f, -0.432087f, -0.152307f, 0.0293551f,
+ 0.184744f, -0.0173156f, -0.00572154f, -0.0305062f, -0.0900071f
+};
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_32[] = {
+ 0.160011f, 0.903856f, -0.13738f, 0.358221f, -0.0906044f,
+ -0.606558f, -0.0215651f, -0.03377f, -1.67017f, -0.144554f,
+ -0.201482f, -0.87719f, 0.639815f, -0.51976f, -0.309922f,
+ -1.33421f, 0.721328f, -0.889354f, -1.7158f, -0.285963f
+};
+
+static const float av1_simple_motion_search_split_logits_kernel_32[] = {
+ -0.2745374f, 0.333548f, -0.2437388f, 0.288009f, 0.55635f,
+ 0.4560176f, 0.2970518f, 0.391192f, 1.311854f, -0.231219f,
+ -0.2968651f, -1.819984f, 0.2775824f, 0.28929857f, 0.419126f,
+ -0.32868411f, -0.916399f, -0.1921077f, -0.617489f, 0.637953f
+};
+
+static const float av1_simple_motion_search_split_logits_bias_32[] = {
+ 0.208473f
+};
+
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_32 = {
+ NUM_FEATURES_32,
+ NUM_LOGITS_32,
+ NUM_HIDDEN_LAYERS_32,
+ {
+ NUM_LAYER_0_UNITS_32,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_kernel_32,
+ av1_simple_motion_search_split_logits_kernel_32,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_bias_32,
+ av1_simple_motion_search_split_logits_bias_32,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_32
+#undef NUM_FEATURES_32
+#undef NUM_LAYER_0_UNITS_32
+#undef NUM_LOGITS_32
+
+#define NUM_HIDDEN_LAYERS_16 1
+#define NUM_FEATURES_16 17
+#define NUM_LAYER_0_UNITS_16 20
+#define NUM_LOGITS_16 1
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_16[] = {
+ 0.0136957f, 0.182135f, -0.583394f, 0.0556956f, 0.211152f,
+ 0.168234f, -0.694203f, -0.678216f, 0.289943f, 1.00014f,
+ -0.0427784f, -0.0427538f, -0.0276009f, -0.00133608f, 0.0901944f,
+ 0.0674892f, 0.104068f, -0.308582f, -0.43596f, 0.855997f,
+ -0.223414f, 0.0390026f, 0.366492f, 0.216065f, -0.386863f,
+ -0.148823f, -0.297022f, 0.0529546f, -0.202885f, 1.26471f,
+ -0.861163f, -0.0949431f, 0.573627f, -0.00277083f, -0.616063f,
+ -0.626927f, 0.371583f, -0.411743f, 0.173387f, -0.209734f,
+ 0.293697f, -0.260714f, 0.442728f, -0.594486f, 1.38987f,
+ 0.208025f, -0.0433776f, 0.01173f, 0.921766f, -0.168379f,
+ 0.000697326f, 0.209967f, -0.304577f, 0.149551f, -0.196658f,
+ 0.389251f, -0.449106f, -0.456329f, 0.669073f, -0.163806f,
+ 0.083348f, -0.0783998f, 0.0678355f, 0.0510435f, 0.103964f,
+ 0.104537f, -0.778093f, -1.0641f, -0.626102f, -2.02131f,
+ 0.159591f, 0.254161f, -0.000362642f, 0.289859f, 0.192713f,
+ 0.139801f, -0.0251327f, 0.164002f, 1.22892f, -0.0852193f,
+ 0.0769487f, 0.0296408f, -0.0418688f, 0.0936023f, 0.0448523f,
+ 0.674015f, -0.0732944f, 0.313575f, -0.593432f, 0.642067f,
+ -1.06063f, 0.468223f, -0.769085f, -0.173798f, -0.175663f,
+ 0.692808f, 0.00753295f, -0.123327f, -0.0234937f, -0.0923153f,
+ 0.0216917f, -0.0690157f, -0.397488f, 0.426628f, 0.264475f,
+ 0.342074f, -0.139817f, 0.215915f, 0.422544f, -0.321102f,
+ 0.0355587f, 0.460193f, 0.0315326f, 0.080556f, -0.0256533f,
+ -0.0857874f, -0.488283f, -0.299653f, -0.245987f, 0.104383f,
+ 0.203731f, 0.328734f, 0.668104f, -0.586909f, -0.501335f,
+ -0.661292f, -0.359811f, 0.00951363f, 0.816315f, -0.0124104f,
+ 0.0545827f, 0.089863f, 0.0125486f, 0.043609f, -0.0259544f,
+ 0.0123911f, 0.12557f, -0.539875f, -0.0556721f, 0.16532f,
+ 0.265834f, -0.384171f, 0.646496f, 0.366147f, -0.111272f,
+ 0.262096f, -0.0845724f, 0.382724f, 0.165783f, 0.1025f,
+ 0.392988f, 0.290525f, 0.038659f, 0.540269f, -0.485586f,
+ -0.273065f, -0.154052f, -0.0896895f, -0.35394f, 0.193214f,
+ -0.423728f, 0.654576f, -0.373321f, 0.814914f, 0.026278f,
+ -0.0328304f, -0.220913f, -0.0442121f, 0.487545f, -0.509537f,
+ -0.777581f, -1.23886f, 0.223482f, 0.206009f, 0.20391f,
+ 0.194628f, 0.226762f, 0.171609f, -0.219037f, 0.557892f,
+ -0.312011f, 1.27709f, 0.064013f, 0.105384f, 0.0493933f,
+ 0.074059f, -0.0100078f, -0.0176888f, -0.440005f, 0.302922f,
+ -0.197456f, 0.296128f, -0.326647f, 0.305323f, -0.30696f,
+ 0.201951f, -0.15874f, -0.793042f, 0.0197254f, 0.0569867f,
+ -0.0295468f, -0.0215012f, 0.025855f, -0.0196102f, 0.215558f,
+ -0.253069f, 0.298469f, 0.261269f, 0.435305f, 0.0120354f,
+ -0.384789f, -0.2772f, 0.0366613f, -0.494994f, 0.149072f,
+ 1.32981f, -0.427717f, 0.43938f, -0.16375f, -0.444342f,
+ 0.548214f, 0.127955f, -1.24387f, 0.0863676f, 0.175071f,
+ 0.172673f, -0.0906204f, 0.444454f, -0.546669f, 0.215857f,
+ -0.100621f, 0.200699f, -0.0985915f, 0.134706f, -0.256396f,
+ 0.393427f, 0.119606f, -0.214278f, -0.0183637f, 0.194266f,
+ -0.238025f, 0.182203f, 0.599718f, 0.846933f, 0.0607852f,
+ -0.183434f, -0.723743f, -0.72414f, -0.124701f, 0.0227527f,
+ -0.0664636f, -0.0385867f, -0.0257377f, -0.149054f, 0.12077f,
+ 0.678029f, -0.624456f, 0.189644f, -0.518604f, 0.134397f,
+ -0.189777f, -0.309376f, -0.00377086f, 0.701132f, -0.170915f,
+ 0.00736111f, -0.121906f, 0.329136f, 0.165514f, 0.0328356f,
+ 0.171275f, 0.248619f, 0.247704f, -0.449933f, 0.0841684f,
+ 0.136982f, 0.122703f, -0.0169439f, -0.0726496f, 0.302648f,
+ -0.128556f, 0.0667425f, -0.289717f, -0.207532f, -1.20269f,
+ -0.68892f, 0.045259f, 0.0973945f, 0.0988314f, -0.944748f,
+ -0.180401f, 0.134331f, 0.033834f, 0.109023f, 0.265723f,
+ 0.38063f, -0.106518f, -0.0686953f, 0.3744f, -1.0957f,
+ 0.0302782f, 0.0515164f, 0.00188222f, 0.0014413f, -0.0404425f,
+ 0.0124618f, -0.0828645f, 0.506166f, -0.776352f, -0.405138f,
+ -0.123887f, 0.0732116f, 0.379928f, 0.604524f, -0.492317f,
+ 0.439191f, 0.0744193f, 0.389101f, 0.0604518f, 0.0943165f,
+ 0.0339942f, 0.0917975f, 0.0161988f, 0.512227f, 0.538021f,
+ -0.411495f, 0.307281f, 0.33746f, -0.218639f, 0.265742f,
+ 0.39738f, -0.12442f, 0.125236f, -0.0845223f, -0.150396f,
+ 0.0334878f, -0.00391915f, 0.0406864f, -0.0487059f, 0.0377073f
+};
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_16[] = {
+ 0.0535976f, -0.0130279f, 0.150146f, -0.511132f, -0.357698f,
+ 0.6719f, -1.27877f, -0.0208048f, 0.0961914f, 0.263603f,
+ 0.704574f, -1.48998f, 0.728063f, 0.941829f, -0.199981f,
+ 0.797802f, -0.29816f, -0.60894f, -0.116624f, -1.16723f
+};
+
+static const float av1_simple_motion_search_split_logits_kernel_16[] = {
+ 0.343153f, -0.2110482f, -0.487199f, 0.3274144f, -2.1975f,
+ -0.6051438f, 0.1901127f, 0.4741924f, -0.24029f, -0.185018f,
+ -0.652635f, 2.57714f, -0.31033031f, -0.307222f, 0.329035f,
+ -0.430181f, 0.3429f, 0.742292f, 0.3269808f, 0.4142165f
+};
+
+static const float av1_simple_motion_search_split_logits_bias_16[] = {
+ -0.783658f
+};
+
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_16 = {
+ NUM_FEATURES_16,
+ NUM_LOGITS_16,
+ NUM_HIDDEN_LAYERS_16,
+ {
+ NUM_LAYER_0_UNITS_16,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_kernel_16,
+ av1_simple_motion_search_split_logits_kernel_16,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_bias_16,
+ av1_simple_motion_search_split_logits_bias_16,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_16
+#undef NUM_FEATURES_16
+#undef NUM_LAYER_0_UNITS_16
+#undef NUM_LOGITS_16
+
+#define NUM_HIDDEN_LAYERS_8 1
+#define NUM_FEATURES_8 17
+#define NUM_LAYER_0_UNITS_8 20
+#define NUM_LOGITS_8 1
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_8[] = {
+ 0.079443f, -1.04068f, 0.336819f, -0.20901f, 0.796251f,
+ 0.181066f, 0.0118876f, -0.207145f, 0.250671f, -0.402119f,
+ -0.0847227f, 1.88683f, 0.303469f, 0.0718458f, 0.0338589f,
+ 0.158896f, 0.0540238f, -0.385426f, 0.955925f, 0.424506f,
+ 0.492584f, -0.795058f, -0.248667f, -0.905349f, -0.316989f,
+ 0.545471f, 0.63762f, -0.232613f, -0.238947f, -0.395338f,
+ -0.322673f, -0.0761563f, -0.125357f, 0.0694415f, -0.371599f,
+ 0.358387f, -0.486841f, 0.403863f, -0.0295666f, 0.283074f,
+ -0.424396f, 0.156318f, -0.685355f, 0.6663f, 0.337949f,
+ 0.273198f, 0.517448f, 0.458911f, 0.157252f, 0.692096f,
+ 0.64965f, -0.23987f, -1.08431f, -0.252475f, -0.332614f,
+ -0.712291f, -0.380973f, 0.460545f, 0.48936f, 0.337601f,
+ 0.489223f, 1.65336f, -0.223585f, 0.17367f, -0.235057f,
+ -0.456773f, 0.327877f, -0.221192f, -0.940151f, -1.06616f,
+ 0.687084f, -0.109973f, 0.106636f, 0.445895f, 0.163432f,
+ 0.378306f, 0.201902f, 0.176811f, 0.693082f, 1.62156f,
+ -0.178346f, 0.455175f, 1.61943f, 0.231376f, 0.0890932f,
+ -0.889693f, -1.03298f, 0.778196f, -0.0289539f, 0.137848f,
+ 0.18707f, 0.171889f, 0.119157f, 0.24893f, -0.313628f,
+ 0.00250735f, -0.0758209f, 0.272974f, -0.229825f, 2.47926f,
+ -0.0354665f, 0.175366f, 0.0411555f, -1.52149f, -0.0258663f,
+ 0.253027f, -0.0520839f, -0.0189782f, 0.362387f, -0.371154f,
+ 0.622929f, 0.0447056f, 0.242529f, -0.168391f, 0.308935f,
+ -0.117294f, 2.16307f, 0.0673638f, 0.080771f, -0.460779f,
+ -0.940176f, 0.473266f, -0.0125302f, 0.475145f, -0.218187f,
+ 0.43258f, -0.0380196f, 0.413607f, -0.110856f, -1.52076f,
+ 0.0896812f, 0.246636f, -0.0612008f, 0.189583f, 0.0106902f,
+ -0.158403f, -0.629377f, -0.0634279f, -0.0864584f, -0.226568f,
+ -0.286234f, -0.0721132f, -0.43702f, 0.113702f, 0.433372f,
+ 0.743396f, 0.14312f, 0.29914f, 0.801188f, 0.7609f,
+ 0.385046f, 0.480314f, 0.171119f, -1.59058f, -1.18853f,
+ 0.150676f, 0.408123f, -0.00677924f, 0.398145f, 0.0914611f,
+ 0.176945f, 0.0677457f, 0.316478f, 0.998219f, -0.22618f,
+ 0.0756793f, -0.0156674f, 0.105716f, 0.0496245f, -0.0827133f,
+ -0.423119f, -0.161033f, 0.212962f, -0.234453f, 0.743366f,
+ 1.04108f, 0.0597604f, -0.285993f, -0.114829f, -0.557364f,
+ -0.840051f, 0.326509f, -0.192508f, -0.141769f, 0.370626f,
+ -0.126353f, 0.00672923f, 0.493623f, -0.852076f, 0.466798f,
+ -0.226436f, 0.259268f, -0.452662f, 0.0721126f, 0.0198245f,
+ 0.2048f, 0.02506f, 0.316194f, 0.814651f, 1.01288f,
+ -0.569607f, -0.0838994f, 1.37146f, -0.613135f, 0.441761f,
+ -0.643901f, 0.364269f, -0.147177f, 0.338001f, -0.332376f,
+ 0.518875f, -0.628964f, -0.291889f, -0.050736f, 0.108047f,
+ 1.05673f, 0.0479492f, 0.466756f, -0.0867334f, -0.0355575f,
+ 0.57626f, -0.227583f, -0.146421f, 0.0990489f, 0.117351f,
+ -0.103858f, -0.0336936f, 0.0201903f, -0.0766383f, -0.010211f,
+ 0.0400779f, 0.0725462f, 0.137142f, 0.478261f, 0.287869f,
+ 0.0882359f, -0.739754f, -0.853521f, -0.43703f, 0.316856f,
+ 0.27593f, 0.312149f, 0.175575f, 0.441839f, 0.264325f,
+ 0.0148051f, -0.005559f, 0.373176f, 0.933701f, -0.0197615f,
+ 0.0219723f, -0.0559883f, -0.103456f, -0.0323009f, 0.0773202f,
+ -0.390838f, 0.855488f, -0.596525f, -0.249093f, 0.124262f,
+ 0.220172f, 0.0552478f, 1.04041f, -0.960992f, -0.495255f,
+ -0.211612f, 0.350007f, -0.238998f, -0.0265068f, 0.384686f,
+ -0.0815808f, -0.0570019f, 0.123903f, -0.485114f, -0.00282573f,
+ -0.0649603f, 0.163719f, -0.469479f, -0.439713f, 0.0602562f,
+ -0.527993f, -0.111458f, 2.48686f, -0.180723f, 0.0553895f,
+ 0.0560679f, -0.0978928f, -0.216063f, 0.089457f, -1.5602f,
+ -1.62332f, -0.147388f, 0.736155f, 0.440409f, 0.243519f,
+ 0.0622638f, 0.522932f, 0.109686f, 0.422849f, 0.510589f,
+ 1.01116f, 0.174019f, 0.0191171f, -0.0717751f, -0.0068308f,
+ 0.172932f, -0.834888f, -0.635788f, 0.32012f, 0.298656f,
+ 0.274309f, -0.155456f, 0.1755f, -0.175171f, 0.343498f,
+ -0.122832f, -0.107696f, 0.279924f, -0.797633f, -0.344658f,
+ 0.162669f, 0.389092f, 0.644479f, -0.635216f, -0.181868f,
+ 0.0579244f, -0.0568976f, 0.433003f, -0.591067f, 0.71013f,
+ -0.165515f, 0.225725f, -0.358156f, 0.0541944f, 1.95485f,
+ -0.315223f, 0.61537f, -0.0401568f, 0.22811f, 0.271147f
+};
+
+static const float av1_simple_motion_search_split_hiddenlayer_0_bias_8[] = {
+ 1.63441f, -0.616459f, -0.437775f, -0.71669f, 1.56616f, 2.28109f, 1.64054f,
+ -1.51476f, 0.0274108f, 0.935156f, -0.966329f, 0.906069f, 1.19954f, -1.25867f,
+ -1.7376f, -0.594211f, 0.322242f, 0.438631f, -1.01682f, 1.30032f
+};
+
+static const float av1_simple_motion_search_split_logits_kernel_8[] = {
+ -0.463187f, 0.2936127f, 0.16762f, -0.1663271f, -0.292418f,
+ -0.421457f, -0.378265f, 1.053049f, 0.32432879f, -0.49775575f,
+ 0.427357f, -0.239251f, -0.1631546f, 0.335468f, 0.255371f,
+ 0.276901f, -0.665683f, -0.7021493f, 0.381513f, -0.1339761f
+};
+
+static const float av1_simple_motion_search_split_logits_bias_8[] = {
+ -1.739754f
+};
+
+static const NN_CONFIG av1_simple_motion_search_split_nn_config_8 = {
+ NUM_FEATURES_8,
+ NUM_LOGITS_8,
+ NUM_HIDDEN_LAYERS_8,
+ {
+ NUM_LAYER_0_UNITS_8,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_kernel_8,
+ av1_simple_motion_search_split_logits_kernel_8,
+ },
+ {
+ av1_simple_motion_search_split_hiddenlayer_0_bias_8,
+ av1_simple_motion_search_split_logits_bias_8,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_8
+#undef NUM_FEATURES_8
+#undef NUM_LAYER_0_UNITS_8
+#undef NUM_LOGITS_8
+
+static const NN_CONFIG *const av1_simple_motion_search_split_nn_config[5] = {
+ &av1_simple_motion_search_split_nn_config_128,
+ &av1_simple_motion_search_split_nn_config_64,
+ &av1_simple_motion_search_split_nn_config_32,
+ &av1_simple_motion_search_split_nn_config_16,
+ &av1_simple_motion_search_split_nn_config_8,
+};
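+
+// The five configs are indexed by square block size, largest first
+// (128, 64, 32, 16, 8), in step with the per-size mean/std tables above.
+// A minimal illustrative lookup (helper name is an assumption):
+static inline int sms_split_model_idx(int block_width) {
+  switch (block_width) {
+    case 128: return 0;
+    case 64: return 1;
+    case 32: return 2;
+    case 16: return 3;
+    case 8: return 4;
+    default: return -1;  // no split model for this size
+  }
+}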
+
+// Model based on simple_motion_search for pruning rectangular partitions.
+// Thresholds. The first index level is aggressiveness, the second is frame
+// resolution, and the third is bsize.
+static const float av1_simple_motion_search_prune_rect_thresh[4][3][5] = {
+ // Aggressiveness = 0
+ {
+ // Lowres
+ { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+ 0.000961189195907f, 0.0f },
+ // Midres
+ { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+ 0.000961189195907f, 0.0f },
+ // Hdres
+ { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+ 0.000961189195907f, 0.0f },
+ },
+ // Aggressiveness = 1
+ {
+ // Lowres
+ {
+ 0.000000f,
+ 0.116076f,
+ 0.049759f,
+ 0.057747f,
+ 0.006001f,
+ },
+ // Midres
+ {
+ 0.000000f,
+ 0.017380f,
+ 0.026077f,
+ 0.078111f,
+ 0.064477f,
+ },
+ // Hdres
+ {
+ 0.002994f,
+ 0.103093f,
+ 0.076408f,
+ 0.010456f,
+ 0.187211f,
+ },
+ },
+ // Aggressiveness = 2
+ {
+ // Lowres
+ {
+ 0.000000f,
+ 0.003111f,
+ 0.144294f,
+ 0.144884f,
+ 0.069924f,
+ },
+ // Midres
+ {
+ 0.000000f,
+ 0.013696f,
+ 0.055203f,
+ 0.152271f,
+ 0.078886f,
+ },
+ // Hdres
+ {
+ 0.030577f,
+ 0.082486f,
+ 0.040690f,
+ 0.140924f,
+ 0.067608f,
+ },
+ },
+ // Aggressiveness = 3
+ {
+ // Lowres
+ { 0.0f, 0.352338114654f, 0.171190796972f, 0.322629318068f,
+ 0.287219697095f },
+ // Midres
+ { 0.0f, 0.30938393361f, 0.271772875141f, 0.240627957104f,
+ 0.178833795641f },
+ // Hdres
+ { 0.285731215187f, 0.37521798723f, 0.142380566244f, 0.338288917819f,
+ 0.21329309279f },
+ },
+};
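+
+// Unlike the logit-valued split thresholds earlier in this file, these
+// entries are already probabilities, indexed as
+// thresh[aggressiveness][resolution][bsize]. A plausible use, given that
+// libaom provides av1_nn_softmax(), is to convert the prune-rect logits to
+// probabilities and prune a candidate whose probability falls below the
+// looked-up threshold. An illustrative numerically stable softmax (expf()
+// comes from <math.h>, included above; the helper name is an assumption):
+static inline void sms_softmax(const float *logits, int n, float *probs) {
+  float max_logit = logits[0];
+  for (int i = 1; i < n; ++i) {
+    if (logits[i] > max_logit) max_logit = logits[i];
+  }
+  float sum = 0.0f;
+  for (int i = 0; i < n; ++i) {
+    probs[i] = expf(logits[i] - max_logit);  // shift exponents for stability
+    sum += probs[i];
+  }
+  for (int i = 0; i < n; ++i) probs[i] /= sum;
+}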
+
+// Mean and std
+static const float av1_simple_motion_search_prune_rect_mean_128[25] = {
+ 13.292176f, 13.231236f, 11.098058f, 11.049944f, 10.481336f,
+ 10.431587f, 10.789337f, 10.732787f, 10.233817f, 10.173738f,
+ 12.214045f, 12.157505f, 11.863353f, 11.802220f, 12.204053f,
+ 12.152315f, 11.517566f, 11.465651f, 5.383040f, 0.757934f,
+ 4.012611f, 4.052191f, 0.853365f, 3.954503f, 3.944135f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_128[25] = {
+ 2.589217f, 2.559396f, 2.268402f, 2.282274f, 3.341234f, 3.341994f, 3.033007f,
+ 3.041550f, 3.786247f, 3.784053f, 2.523459f, 2.511275f, 3.349364f, 3.340481f,
+ 2.390149f, 2.384226f, 3.599467f, 3.587460f, 2.319911f, 0.428335f, 1.241087f,
+ 1.208679f, 0.353742f, 1.228122f, 1.211777f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_64[25] = {
+ 11.439831f, 11.382639f, 9.647134f, 9.578121f, 9.146770f,
+ 9.084122f, 8.559063f, 8.499496f, 8.095865f, 8.041795f,
+ 10.547537f, 10.486240f, 9.362147f, 9.308391f, 10.548071f,
+ 10.484358f, 10.002225f, 9.944480f, 4.964504f, 0.897164f,
+ 3.306144f, 3.351039f, 0.928582f, 3.319739f, 3.287726f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_64[25] = {
+ 2.033404f, 2.050657f, 2.064671f, 2.081519f, 2.916312f, 2.914649f, 3.628949f,
+ 3.618760f, 4.011421f, 3.996068f, 2.087127f, 2.103106f, 3.885277f, 3.876166f,
+ 2.035599f, 2.052976f, 3.052501f, 3.050985f, 2.232998f, 0.303745f, 1.111161f,
+ 1.081292f, 0.257521f, 1.112510f, 1.089404f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_32[25] = {
+ 9.862349f, 9.793658f, 8.043962f, 7.954083f, 8.058867f, 7.966165f, 8.046844f,
+ 7.956817f, 8.061414f, 7.967906f, 8.966450f, 8.890165f, 8.968315f, 8.891513f,
+ 8.953573f, 8.877070f, 8.974275f, 8.895363f, 4.387239f, 0.954143f, 2.701000f,
+ 2.751266f, 0.963302f, 2.716584f, 2.709725f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_32[25] = {
+ 1.971555f, 1.985517f, 1.935986f, 1.944743f, 1.924122f, 1.932169f, 1.943151f,
+ 1.950612f, 1.931156f, 1.938242f, 1.987803f, 1.997670f, 2.000859f, 2.009913f,
+ 1.938270f, 1.949277f, 1.922999f, 1.933145f, 1.991504f, 0.209175f, 0.973824f,
+ 0.952221f, 0.188018f, 0.985295f, 0.946228f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_16[25] = {
+ 8.391692f, 8.303431f, 6.590342f, 6.459725f, 6.460719f, 6.333274f, 6.592615f,
+ 6.461661f, 6.464787f, 6.337191f, 7.499753f, 7.395166f, 7.503220f, 7.398344f,
+ 7.498312f, 7.395039f, 7.353743f, 7.253139f, 3.874267f, 0.979701f, 2.087404f,
+ 2.131698f, 0.981005f, 2.110868f, 2.106539f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_16[25] = {
+ 1.865867f, 1.870012f, 1.773885f, 1.770447f, 1.972922f, 1.961361f, 1.777224f,
+ 1.772864f, 1.974519f, 1.962281f, 1.831632f, 1.831837f, 1.837595f, 1.837008f,
+ 1.822791f, 1.822053f, 2.074991f, 2.067200f, 1.676261f, 0.141022f, 0.840297f,
+ 0.829935f, 0.136507f, 0.828972f, 0.808563f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_8[25] = {
+ 6.997798f, 6.867032f, 5.134819f, 4.883330f, 5.134804f, 4.879707f, 5.140518f,
+ 4.886751f, 5.142186f, 4.885262f, 6.069946f, 5.896944f, 6.080442f, 5.906130f,
+ 6.077539f, 5.905929f, 6.083087f, 5.909298f, 3.552709f, 0.990654f, 1.497349f,
+ 1.531762f, 0.989606f, 1.496581f, 1.484139f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_8[25] = {
+ 1.727562f, 1.725050f, 1.633396f, 1.618773f, 1.633586f, 1.620657f, 1.620798f,
+ 1.604892f, 1.621570f, 1.607439f, 1.691024f, 1.684225f, 1.676065f, 1.668442f,
+ 1.680016f, 1.672452f, 1.677775f, 1.671586f, 1.451902f, 0.096223f, 0.751190f,
+ 0.754040f, 0.101419f, 0.738239f, 0.729455f,
+};
+
+static const float *const av1_simple_motion_search_prune_rect_mean[5] = {
+ av1_simple_motion_search_prune_rect_mean_128,
+ av1_simple_motion_search_prune_rect_mean_64,
+ av1_simple_motion_search_prune_rect_mean_32,
+ av1_simple_motion_search_prune_rect_mean_16,
+ av1_simple_motion_search_prune_rect_mean_8,
+};
+
+static const float *const av1_simple_motion_search_prune_rect_std[5] = {
+ av1_simple_motion_search_prune_rect_std_128,
+ av1_simple_motion_search_prune_rect_std_64,
+ av1_simple_motion_search_prune_rect_std_32,
+ av1_simple_motion_search_prune_rect_std_16,
+ av1_simple_motion_search_prune_rect_std_8,
+};
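+
+// These 25-feature mean/std tables presumably play the same normalization
+// role for the prune-rect models that the 17-feature tables above play for
+// the split models; e.g. (illustrative, reusing the sketch helper defined
+// earlier):
+//
+//   sms_normalize_features(features,
+//                          av1_simple_motion_search_prune_rect_mean[idx],
+//                          av1_simple_motion_search_prune_rect_std[idx],
+//                          25, normalized);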
+
+#define NUM_HIDDEN_LAYERS_128 1
+#define NUM_FEATURES_128 25
+#define NUM_LAYER_0_UNITS_128 8
+#define NUM_LOGITS_128 4
+
+static const float av1_simple_motion_search_prune_rect_logits_kernel_128[] = {
+ -0.129103f, 0.457758f, -0.489986f, 0.65462f, -0.184312f, 3.81202f,
+ -0.444407f, -0.64198f, -0.575008f, 0.0311711f, 0.525243f, -20.892f,
+ 1.08811f, -65.0976f, -12.3973f, -1.38278f, -0.264233f, 0.241636f,
+ -10.6925f, -0.725414f, -18.8987f, -40.2284f, -16.08f, 0.995331f,
+ 1.47614f, -0.964864f, 0.405506f, 0.140449f, 0.459534f, -1.9093f,
+ 0.398452f, 0.696949f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_128[] = {
+ 1.22789f, -1.34527f, 0.759048f, 0.315086f,
+ 1.0834f, -1.58019f, -0.465158f, 1.20716f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_128[] = {
+ -0.668677f, 0.58694f, -0.417094f, 0.754735f, -0.7859f,
+ 0.377479f, -0.0415929f, -0.0140585f, -0.730001f, 0.747528f,
+ -0.135247f, 0.406505f, -0.234184f, 0.956362f, -0.637555f,
+ 0.791884f, 0.0303722f, 1.04424f, -0.727859f, -0.274321f,
+ -0.122986f, 0.066312f, -0.00559175f, -0.239643f, -0.0188767f,
+ -0.102787f, -0.262967f, 0.071882f, -0.283398f, 0.111607f,
+ -0.425826f, 0.02699f, 0.108873f, -0.180558f, -0.0794057f,
+ 0.29665f, -0.0252969f, -0.0266213f, -0.277462f, -0.361973f,
+ 0.512552f, 0.395011f, -0.225876f, 0.301924f, 0.136954f,
+ 0.507259f, 1.23425f, 0.0137135f, 0.662572f, 0.591583f,
+ 0.101564f, 0.416805f, -0.645081f, -0.179086f, -0.36747f,
+ -0.332213f, 0.095177f, 0.220739f, -0.153256f, 0.706155f,
+ 0.161701f, 0.696815f, -1.21531f, -0.115059f, 0.486764f,
+ -0.396093f, 0.784883f, 0.535357f, -0.278021f, 0.143496f,
+ -0.44931f, -0.144543f, 0.319326f, 0.0190167f, -0.206295f,
+ 0.373995f, -0.247897f, -0.608095f, -0.41796f, -0.137129f,
+ -0.709562f, 0.678273f, 0.537607f, 0.557474f, 0.453308f,
+ 0.21405f, -0.0466495f, 0.519139f, -0.168832f, 0.902911f,
+ 0.681131f, -0.139876f, -0.2052f, -0.393271f, 0.262222f,
+ -0.246246f, -0.213993f, 0.646619f, 0.0496181f, -0.00354157f,
+ 0.822927f, 0.0939522f, 0.180738f, 0.118355f, 0.120456f,
+ -0.0472214f, -0.144958f, 0.173405f, -0.886644f, -0.0949769f,
+ -0.813518f, -0.3947f, -0.128021f, 0.356196f, 0.469169f,
+ -0.413702f, 1.04242f, 0.428853f, -0.387293f, 0.0850877f,
+ 0.279409f, -0.142276f, 0.0579376f, 0.211112f, 0.0703013f,
+ -1.9274f, -0.729147f, 0.534193f, 0.773586f, 0.922864f,
+ 0.642881f, 1.15127f, 0.621032f, 0.933942f, 1.01837f,
+ -0.660282f, -0.40059f, -1.11279f, -0.77088f, -0.43349f,
+ 0.202361f, -0.0840912f, 0.0935707f, 0.056333f, -0.0779369f,
+ 0.0173447f, -0.0104756f, 0.0115005f, -0.0195593f, 0.03592f,
+ -0.343454f, -0.618048f, 0.258172f, -0.412322f, -0.0463746f,
+ -0.0413654f, -0.0400194f, 0.615981f, -0.452094f, 0.644555f,
+ 0.0822476f, -0.359791f, -0.0904274f, 0.209427f, 0.0116338f,
+ -0.190978f, 0.890233f, 0.737769f, -1.66663f, -0.392605f,
+ 0.0785728f, -0.224553f, -0.128258f, -0.227227f, -0.0777773f,
+ 0.685976f, 0.347042f, -0.555325f, -0.249221f, 0.0919837f,
+ -0.0660016f, -0.272316f, 0.0390632f, -0.619624f, -0.0565801f,
+ 0.585026f, 0.597375f, 0.54114f, 0.593389f, 0.604391f,
+ 0.0820294f, -0.85339f, -1.40741f, -0.391675f, 0.0579205f,
+ -0.197626f, 0.130044f, -0.234488f, -0.0373991f, -0.0717973f
+};
+
+static const float av1_simple_motion_search_prune_rect_logits_bias_128[] = {
+ 1.58571f, -4.6314f, -2.00273f, 0.543699f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_128 = {
+ NUM_FEATURES_128,
+ NUM_LOGITS_128,
+ NUM_HIDDEN_LAYERS_128,
+ {
+ NUM_LAYER_0_UNITS_128,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_kernel_128,
+ av1_simple_motion_search_prune_rect_logits_kernel_128,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_bias_128,
+ av1_simple_motion_search_prune_rect_logits_bias_128,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_128
+#undef NUM_FEATURES_128
+#undef NUM_LAYER_0_UNITS_128
+#undef NUM_LOGITS_128
+
+#define NUM_HIDDEN_LAYERS_64 1
+#define NUM_FEATURES_64 25
+#define NUM_LAYER_0_UNITS_64 32
+#define NUM_LOGITS_64 10
+
+static const float av1_simple_motion_search_prune_rect_logits_kernel_64[] = {
+ 0.10424f, -0.346025f, 0.534547f, -0.385925f, 2.58341f, -0.256414f,
+ -0.232498f, 0.329823f, -0.0777376f, -0.590939f, 0.062657f, -0.628252f,
+ 0.0934588f, 2.04029f, -0.224448f, 0.371168f, -0.385348f, -0.589883f,
+ -3.73627f, -0.943144f, 0.346409f, -0.211215f, -0.351008f, 0.418807f,
+ 0.943663f, 0.173267f, 1.16585f, -0.0840888f, 0.227464f, 0.374412f,
+ 0.0422597f, -0.338868f, 0.222576f, 0.431713f, 1.12366f, 0.00753411f,
+ 0.248412f, -0.0902425f, 0.542455f, -0.665629f, -0.311245f, -0.205639f,
+ -0.447149f, -0.0502733f, -0.290186f, -0.794384f, 0.0940881f, -0.0686117f,
+ -0.0199961f, -0.587965f, 0.777096f, -0.083381f, -1.21282f, 0.652959f,
+ -1.18238f, 0.539991f, 0.352497f, -0.540076f, -0.26222f, -0.568556f,
+ 0.409102f, -0.131146f, -0.407161f, -0.188287f, -0.478657f, 0.000401932f,
+ -0.689324f, 0.351064f, -1.43704f, -0.315185f, -0.868726f, 0.376341f,
+ -0.0566277f, 0.364831f, 0.611298f, -0.495253f, -0.0193132f, 0.617978f,
+ 0.189586f, -0.236758f, -0.608246f, -0.149017f, -1.78303f, 0.143023f,
+ 0.698386f, -0.994086f, -0.673327f, 0.233868f, 0.360425f, 0.0294123f,
+ -0.248683f, -0.148392f, 0.0861829f, -0.190843f, -0.414906f, 0.607378f,
+ -0.756715f, -0.511713f, -0.321556f, 1.0078f, -1.18141f, 0.519751f,
+ 0.834629f, -0.359343f, 0.612262f, -0.0730553f, 0.262935f, 0.488276f,
+ 0.387071f, -1.44123f, 1.08269f, 0.554402f, -0.069f, 0.14113f,
+ 0.323817f, 0.824314f, -0.431417f, -0.349448f, 0.950728f, -0.587836f,
+ -0.83914f, -0.10844f, 0.26602f, 0.831933f, -0.271315f, 0.231563f,
+ 0.417049f, 0.190627f, -0.0940667f, 0.255363f, -0.0741022f, -0.0987662f,
+ -0.847522f, 0.00287554f, 0.0615741f, -0.0832218f, 0.0847148f, -0.392843f,
+ -0.938068f, -0.10621f, -0.260859f, -0.825175f, -0.401039f, 0.315213f,
+ -0.108269f, 0.288036f, -8.66166f, -0.970752f, -0.66678f, -0.593405f,
+ -0.518294f, -0.138722f, -0.454698f, -0.22969f, -0.553006f, -0.440111f,
+ 0.462661f, -0.536854f, 0.0108295f, -0.522888f, 0.00111157f, 0.229999f,
+ 0.0267768f, 0.176266f, -1.57043f, 0.0318106f, 0.257534f, -0.198583f,
+ 0.175564f, -0.251465f, -0.262441f, -1.65283f, -0.319603f, -0.875282f,
+ -0.301303f, 0.0170948f, -0.227075f, 0.0299545f, -4.98346f, 0.470046f,
+ -1.28051f, -0.213809f, -0.486585f, -0.906463f, -0.169984f, -0.333153f,
+ -0.376733f, 0.108016f, 0.486744f, -0.186936f, -0.429259f, 0.056501f,
+ -0.266545f, 0.265447f, -0.137718f, -0.490687f, -0.935668f, -0.16229f,
+ -0.696932f, 0.173157f, 0.434959f, -0.140595f, 0.345845f, -1.08013f,
+ -0.0205929f, -0.815874f, -0.179812f, 0.02767f, -0.141727f, 0.471936f,
+ -7.29453f, -1.04362f, -0.745482f, -0.28725f, -0.214997f, -0.0850651f,
+ -0.748471f, 0.161325f, -1.04387f, -0.705305f, 0.489427f, -0.765373f,
+ -0.301576f, 0.0742467f, -0.331282f, 0.0372328f, -0.90298f, -0.0608646f,
+ -2.18756f, 0.170384f, -0.258357f, 0.106287f, -0.161684f, -0.103799f,
+ -0.127774f, -0.156313f, 0.0705286f, -0.977908f, -0.281191f, -0.056757f,
+ -0.309474f, 0.050476f, -9.78198f, -2.42795f, -0.289626f, -1.07579f,
+ -0.439256f, -1.09948f, -0.564671f, 0.0913182f, -0.417216f, -1.19909f,
+ 0.287063f, 0.402315f, -0.17646f, 0.540488f, 0.00840239f, 0.397492f,
+ 0.702393f, -0.10566f, 0.655296f, -0.0443876f, 0.154918f, -0.760479f,
+ -0.0523153f, -0.366199f, -1.08212f, -0.398556f, -0.415203f, -1.10488f,
+ 0.208349f, 0.27079f, 0.101546f, -0.205752f, -13.7923f, -0.218637f,
+ -1.10077f, 0.355735f, -0.306196f, 0.627434f, -0.473101f, -0.308027f,
+ -1.12724f, 0.301597f, 0.660785f, 0.0576217f, -0.155925f, -0.56107f,
+ -0.223537f, 0.114299f, -0.53803f, -0.252674f, -2.66103f, -0.185245f,
+ -0.314673f, 0.403337f, 0.679821f, -0.69231f, 0.506264f, -0.999705f,
+ -0.549097f, 0.353745f, 0.188249f, 0.414484f, -0.615853f, 0.525681f,
+ -5.23065f, -3.05174f, 1.02074f, -0.965499f, -0.158947f, 0.0436088f,
+ -0.485824f, 0.0375094f, -1.39985f, -0.481392f, 0.485785f, -0.24874f,
+ -0.359633f, 0.668108f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_64[] = {
+ 0.0735592f, -0.045064f, -0.0114103f, 1.39246f, -0.683467f, 0.155765f,
+ -0.667652f, -0.202425f, -0.585433f, -0.146752f, -0.0812931f, 0.580642f,
+ 0.578542f, -0.831916f, 0.610063f, 0.0101856f, -0.235863f, 0.538141f,
+ -2.91334f, -1.71887f, 0.126616f, 0.582497f, -0.438879f, 0.221833f,
+ 0.850773f, -0.280886f, 0.443233f, -0.0964873f, -0.216161f, 0.34413f,
+ 0.656818f, 0.0169274f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_64[] = {
+ -0.310947f, -0.232675f, 0.0171092f, 0.0834474f, 0.373977f,
+ 0.300429f, 0.215072f, -0.454074f, 0.187565f, 0.282742f,
+ 0.562562f, -0.0419322f, 0.000978486f, -0.298267f, 0.216934f,
+ -0.388722f, -0.146866f, -0.275946f, 0.202361f, 0.225847f,
+ 1.42868f, 0.473127f, -0.145747f, -0.104986f, 0.153459f,
+ 0.69382f, 0.162266f, 0.0207715f, -0.45095f, -0.412071f,
+ -0.235109f, -0.130199f, 0.231741f, 0.460193f, 0.0378202f,
+ 0.429516f, 0.387691f, -0.272479f, 0.0723884f, -0.453914f,
+ -0.150618f, -0.10745f, -0.258615f, 0.0838312f, -0.00554958f,
+ 0.105377f, -0.0415479f, 0.13228f, 1.09044f, -0.73053f,
+ -0.422553f, -0.435842f, 0.211416f, 0.420332f, 0.0181353f,
+ -0.030891f, 0.522788f, 0.613526f, 0.374032f, 0.287986f,
+ -0.403118f, -0.287362f, -1.11523f, -0.577713f, -0.020228f,
+ 0.86465f, -0.0590579f, 0.341274f, -0.0115644f, -0.260236f,
+ 0.192123f, -0.0849825f, 0.0501709f, 0.444382f, 0.0762727f,
+ 0.0926596f, -0.101157f, -0.142787f, 0.40861f, 0.555805f,
+ -0.00614654f, -0.122846f, 0.203163f, 0.234266f, 0.409795f,
+ -0.0206245f, -0.224679f, 0.025081f, 0.518044f, -0.287186f,
+ 0.016494f, -0.0886331f, 0.236438f, -1.01032f, 0.118332f,
+ 0.364217f, 0.061438f, 0.0381303f, 0.128418f, 0.0257077f,
+ -0.975751f, -0.694894f, 0.00351914f, 0.278179f, 0.29363f,
+ 0.525576f, 0.0604849f, 0.531734f, 0.406643f, 0.812497f,
+ -0.403196f, -0.16664f, -0.620887f, -0.428194f, 0.275401f,
+ 0.432063f, -0.00378342f, 0.295758f, 0.105615f, -0.00683626f,
+ 0.00396146f, 0.00598654f, -0.0131701f, -0.0115787f, 0.00386643f,
+ -0.69686f, -0.139623f, -0.440817f, 0.0542873f, 0.217962f,
+ 0.527035f, -0.0201046f, 0.0471354f, 0.0271858f, -0.0775197f,
+ -0.309797f, 0.184879f, -0.232854f, -0.407081f, 0.706227f,
+ -0.0877534f, 0.306843f, 0.455075f, -0.333961f, 0.0759148f,
+ 0.0444791f, -0.0693626f, -0.0850289f, -0.513063f, -0.643971f,
+ -0.630279f, -0.153889f, 0.123315f, 0.00548238f, 0.170707f,
+ 0.734339f, -0.176988f, 0.322519f, 0.178365f, 0.183519f,
+ -0.698683f, -0.12043f, -0.349914f, -0.0696762f, -0.53986f,
+ -0.104738f, 1.05264f, 0.983568f, -0.109035f, 0.0113748f,
+ 0.0815189f, -0.0628812f, 0.0769389f, 0.010261f, 0.146573f,
+ -0.433194f, -0.211572f, -0.000397392f, 0.445325f, 0.145091f,
+ -0.0625902f, 0.29394f, 0.302315f, 0.0892226f, -0.209504f,
+ -0.0150374f, 0.242608f, 0.216223f, 0.366857f, 0.209829f,
+ -0.540035f, 0.117599f, -0.329315f, 0.0471133f, -0.0115449f,
+ -0.0638235f, 0.0527461f, 0.348149f, 0.360802f, 1.06624f,
+ -0.615991f, -0.341396f, 0.18972f, 0.0709888f, -0.0414466f,
+ -0.0193809f, 0.0938933f, 0.209058f, 0.575042f, 0.483608f,
+ -0.285875f, -0.115905f, -0.363637f, 0.375425f, 0.336217f,
+ 0.0336358f, -0.00265618f, -0.406854f, -0.792959f, -0.219354f,
+ 0.0331615f, 0.0298859f, -0.211446f, -0.00280773f, -0.194011f,
+ 0.262109f, 0.548076f, 0.120183f, -0.661603f, 0.241855f,
+ -0.501428f, 0.00102718f, -0.347331f, -0.58306f, 0.0977254f,
+ 0.117491f, 0.0840667f, 0.00693675f, 0.000600294f, 0.649569f,
+ -0.0553811f, -0.197198f, 0.397236f, -0.523737f, -0.564192f,
+ -0.374679f, -0.249344f, 0.00861428f, 0.00393439f, -0.0834608f,
+ 0.124389f, -0.0393049f, 0.0425391f, -0.153383f, -0.182346f,
+ 0.420953f, 0.464221f, 0.288984f, 0.570921f, -0.239965f,
+ 0.247239f, -0.083434f, 0.714418f, 0.986323f, -0.460244f,
+ -0.260993f, -0.947743f, -1.0789f, -0.0391231f, 0.612407f,
+ -0.0306767f, 0.281419f, 0.0072426f, -0.37623f, 0.188744f,
+ 0.221666f, -0.424914f, 0.29703f, 0.261715f, 0.277809f,
+ -0.0617616f, -0.000611999f, -0.0547053f, -0.0901018f, -0.347669f,
+ 0.856072f, 0.596675f, -0.467639f, -1.09324f, -0.184224f,
+ -0.56051f, -0.0144704f, 0.102894f, -0.122982f, -0.0020749f,
+ -0.0423487f, 0.0328702f, -0.0154263f, 0.0349021f, -0.00315595f,
+ 0.0254802f, -0.729191f, 0.207296f, -0.0212349f, -0.207078f,
+ 0.20636f, -0.156883f, 0.429765f, -0.42672f, 0.138775f,
+ -0.0267343f, 0.631528f, 0.300646f, -0.4793f, -0.273833f,
+ -0.0135367f, -0.530819f, -0.534881f, 0.830896f, 0.0266992f,
+ 0.473744f, 0.210334f, 0.0234739f, 0.255394f, 0.123531f,
+ -0.489341f, -0.796627f, 0.372617f, 0.190136f, 0.275342f,
+ 0.739505f, 0.402354f, 0.782806f, 0.437374f, 1.04948f,
+ -0.55963f, 0.382704f, -0.698321f, 0.0817868f, -0.440108f,
+ -0.0635004f, -0.277851f, -0.524194f, 0.286157f, -0.01097f,
+ -0.0293145f, -0.0405071f, -0.035662f, -0.012871f, -0.0516409f,
+ -0.406671f, 0.709259f, -0.525177f, 0.521123f, -0.44813f,
+ 0.48412f, -0.0546513f, 0.305253f, -0.468328f, 0.316453f,
+ -0.36307f, 0.497515f, -0.0606276f, 0.315764f, -0.422066f,
+ 0.554025f, -0.679183f, 0.616914f, 0.00283324f, -0.000643824f,
+ 0.0639999f, 0.0488285f, -0.141031f, 0.068003f, -0.0792678f,
+ -0.425307f, -0.152235f, 0.269917f, -0.352327f, 0.44792f,
+ -0.116514f, -0.465868f, 0.154287f, 0.0161028f, -0.16848f,
+ -0.255487f, 0.189832f, 0.254883f, 0.0240822f, 0.432638f,
+ -0.136564f, 0.137036f, 0.0375734f, 0.989246f, -0.126287f,
+ 0.111416f, -0.0271002f, 0.718755f, -0.0412969f, 0.00645681f,
+ 0.253811f, -0.0186998f, 0.691971f, -0.282042f, -0.0783915f,
+ 0.274592f, -0.358449f, 0.34155f, -0.186374f, -0.136907f,
+ -0.192334f, -0.251168f, -0.100874f, -0.166578f, -0.336507f,
+ 0.402373f, 0.173695f, 0.108788f, 0.00885581f, -0.310063f,
+ 1.05545f, 0.0295867f, 0.180785f, -0.173469f, -0.469924f,
+ -0.224155f, 0.665862f, -0.126546f, 0.240691f, -0.0415301f,
+ -0.598534f, 0.0012723f, -0.122297f, -0.558947f, 0.268844f,
+ 0.241193f, 0.0524422f, -0.1683f, 0.575588f, -0.139012f,
+ 0.0636691f, -0.446709f, -0.094532f, 0.883809f, -0.112981f,
+ -0.224047f, 0.0811193f, -0.140571f, -0.09683f, -0.0796143f,
+ -0.102246f, -0.863392f, -0.0755124f, 0.23125f, -0.0301361f,
+ -0.153029f, -0.172238f, -0.0286382f, -0.338495f, -0.317216f,
+ -0.146629f, -0.242264f, -0.702306f, -0.285052f, 0.0623479f,
+ 0.265735f, 0.00674475f, 0.666196f, 0.883586f, 0.278416f,
+ -0.341692f, -0.509931f, -0.156263f, 0.635885f, -0.544143f,
+ -0.572632f, -0.213285f, 0.443396f, -0.268329f, 0.0638439f,
+ -0.185397f, 0.071126f, 0.386503f, -0.402212f, -0.140784f,
+ -0.411661f, 0.049398f, -0.0672907f, -0.267034f, -0.0560875f,
+ 0.0607937f, 0.0445484f, -0.547651f, 0.574718f, 0.417189f,
+ -0.0610166f, 0.0632293f, 0.391619f, -0.00671215f, -0.136883f,
+ -0.339346f, 0.0356183f, 0.511993f, 0.178676f, 0.286998f,
+ 0.136511f, -0.00796929f, 0.203985f, 0.0423532f, -0.175196f,
+ 0.378534f, 0.770417f, 0.593778f, 0.0256067f, -0.82394f,
+ -0.500691f, -0.425725f, -0.623708f, -0.0406241f, -0.00226464f,
+ 0.0207836f, 0.30732f, -0.00784268f, 0.0065445f, -0.0991039f,
+ -0.20871f, -0.206835f, 0.281219f, 0.119361f, 0.259346f,
+ -0.102713f, 0.186488f, -0.034455f, -0.00198392f, -0.279107f,
+ -0.638993f, -0.374404f, -0.48601f, -0.262345f, 0.624532f,
+ 0.620632f, -0.227014f, 0.433579f, -0.0455096f, 1.22123f,
+ -0.429156f, 0.12396f, 0.0815152f, -0.0837355f, 0.0282623f,
+ -0.407475f, 0.787321f, -0.434974f, 0.312904f, -0.230805f,
+ 0.213042f, -0.250929f, 0.302997f, -0.354709f, 0.0504905f,
+ -0.561706f, 0.595558f, 0.374951f, 0.802969f, -0.674902f,
+ 0.33136f, 0.156606f, 0.0218968f, -0.694188f, -0.0221949f,
+ -0.00639123f, 0.0146536f, 0.0104145f, 0.021635f, -0.0499428f,
+ -0.575116f, -0.239035f, -0.0588276f, 0.599722f, 0.541932f,
+ 0.437433f, 0.716268f, 0.193207f, 0.548351f, 0.326951f,
+ -0.197124f, 0.0355353f, -0.0952009f, -0.217265f, -0.389789f,
+ 0.0528124f, -0.21334f, -0.190296f, -1.17367f, 0.108905f,
+ 0.109397f, -0.0192577f, 0.0343813f, 0.085004f, -0.0556737f,
+ -0.0411158f, -0.534989f, 0.0361896f, 0.124415f, 0.291603f,
+ -0.0311974f, -0.326726f, 0.343131f, 0.0276456f, -0.231827f,
+ -0.373894f, -0.208898f, -0.273011f, 0.061323f, -0.0910538f,
+ -0.30746f, -0.108644f, -0.190736f, 1.58048f, -0.0739711f,
+ -0.0623489f, -0.137967f, -0.0601359f, -0.133004f, -0.0857153f,
+ 0.00955987f, -0.365561f, -0.0329051f, 0.463463f, 0.14758f,
+ -0.512256f, -0.227463f, -0.26008f, -0.567777f, 0.0646234f,
+ 1.02161f, 0.66157f, -0.16733f, 0.264921f, -0.242036f,
+ 0.214622f, 0.0712054f, -0.260377f, 0.0849665f, 0.735094f,
+ 0.11001f, 0.297301f, -0.333342f, 0.066978f, -0.123625f,
+ 1.07596f, 0.401263f, 0.0800875f, -0.340862f, -0.115587f,
+ -0.32692f, -0.300842f, 0.0277397f, 0.0630788f, -0.261198f,
+ 0.428695f, -0.0544757f, -0.124511f, 0.036992f, 0.126322f,
+ 0.0317603f, 0.0820762f, 0.117277f, -1.14594f, -0.108076f,
+ -0.0258198f, -0.00337525f, -0.00512531f, 0.1274f, -0.0660535f,
+ -0.640733f, 0.197142f, 0.147278f, 0.489271f, 0.226507f,
+ -0.0668414f, 0.0946318f, 0.0994164f, -0.820516f, 0.512939f,
+ -0.305172f, -0.715187f, -0.195125f, 0.279346f, 0.462144f,
+ 0.913882f, -0.453879f, 0.0582033f, -0.462866f, 0.0538736f,
+ 0.0115737f, 0.00626993f, -0.0185185f, 0.0114601f, -0.0181164f,
+ 0.41588f, -0.0447331f, 0.611756f, 0.43385f, 0.834465f,
+ 0.122019f, -0.352983f, 0.340429f, -0.245425f, -0.365328f,
+ -0.521825f, 0.0371057f, 0.172188f, -0.387949f, 0.221054f,
+ 0.0126359f, 0.422958f, 0.584198f, -0.581498f, -0.019466f,
+ -0.0271737f, -0.0740885f, 0.00540879f, 0.186086f, -0.0324402f,
+ -0.563462f, -0.458759f, -0.425296f, -0.0118862f, -0.641508f,
+ 0.0132084f, 0.0581128f, 0.0231444f, 0.468587f, 0.258838f,
+ 0.0296665f, 0.0562801f, 0.630014f, 0.381816f, -0.269761f,
+ -0.135515f, 0.046186f, 1.07632f, -0.050616f, 0.104987f,
+ 0.29991f, 0.119316f, 0.117248f, 0.0795009f, 0.242573f,
+ 0.0416634f, -0.0577639f, -0.0974078f, 0.106255f, -0.13098f,
+ 0.0141486f, -0.00418257f, 0.144848f, -0.463934f, 0.0452591f,
+ 0.252617f, 0.205222f, -0.189843f, 0.0652245f, -0.135386f,
+ 0.0500646f, -0.200368f, -0.0142312f, -0.0286832f, -0.254355f,
+ -1.02752f, -0.73549f, 0.0364518f, 0.0416227f, -0.13185f,
+ -0.0886515f, -0.502314f, -0.102916f, 0.410911f, -0.355655f,
+ 0.400416f, -0.340217f, 0.208829f, 0.245972f, 0.149739f,
+ -0.49458f, 0.589482f, 0.550827f, 0.912709f, -0.351275f,
+ -0.128076f, -0.285172f, -0.672752f, 0.090583f, -0.245286f,
+ -0.737297f, -0.201515f, -0.025122f, -0.109854f, 0.36738f
+};
+
+static const float av1_simple_motion_search_prune_rect_logits_bias_64[] = {
+ 0.346819f, 0.442965f, -0.0216032f, 0.0229235f, -0.402797f,
+ -0.666074f, -0.455388f, -0.00353411f, -0.595511f, -0.845667f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_64 = {
+ NUM_FEATURES_64,
+ NUM_LOGITS_64,
+ NUM_HIDDEN_LAYERS_64,
+ {
+ NUM_LAYER_0_UNITS_64,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_kernel_64,
+ av1_simple_motion_search_prune_rect_logits_kernel_64,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_bias_64,
+ av1_simple_motion_search_prune_rect_logits_bias_64,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_64
+#undef NUM_FEATURES_64
+#undef NUM_LAYER_0_UNITS_64
+#undef NUM_LOGITS_64
+
+#define NUM_HIDDEN_LAYERS_32 1
+#define NUM_FEATURES_32 25
+#define NUM_LAYER_0_UNITS_32 28
+#define NUM_LOGITS_32 10
+
+static const float av1_simple_motion_search_prune_rect_logits_kernel_32[] = {
+ 0.486581f, 0.340847f, -0.109226f, 0.467224f, -0.541561f,
+ 0.0943619f, -0.429442f, -0.207442f, 0.959963f, 0.618666f,
+ -0.0636751f, 0.144508f, -0.0278289f, 0.332293f, -0.751493f,
+ 0.245438f, -0.917758f, 0.612128f, -0.32648f, 0.534618f,
+ -0.615239f, 2.71641f, 0.233759f, 0.820558f, -0.249758f,
+ -0.427783f, -0.359361f, 0.0375732f, 0.806973f, 0.352512f,
+ -0.0532192f, 0.0576861f, -0.464178f, -0.334877f, -0.697042f,
+ 0.0538218f, 0.0919659f, -0.00765812f, 0.0603847f, -0.460315f,
+ 0.37979f, -0.0867612f, -0.670683f, -0.188619f, -0.570586f,
+ 0.233418f, 0.153581f, 0.290905f, -0.624885f, -0.557842f,
+ -0.555567f, 0.463773f, -0.123909f, -0.277731f, 0.0374468f,
+ 0.409903f, 0.287638f, -0.593066f, -0.223434f, 0.154263f,
+ -0.250464f, -0.077696f, 0.229652f, -0.304174f, 0.308053f,
+ 0.33155f, -0.502825f, 0.361216f, -0.499294f, 0.00595444f,
+ -0.307201f, 0.5766f, -0.438384f, -0.093701f, -0.118586f,
+ 0.202337f, -0.486623f, 0.261552f, 0.139756f, -0.655642f,
+ -0.0627001f, -0.213053f, -0.243037f, 0.205918f, 0.0718368f,
+ 0.188041f, 0.141529f, -0.132239f, 0.425827f, -0.218353f,
+ 0.153114f, 0.33268f, 0.0226116f, 0.167394f, 0.269854f,
+ -0.457001f, 0.1973f, -0.526087f, 0.467528f, 0.290934f,
+ 1.16267f, 0.0823663f, -0.754389f, -0.83716f, 0.270157f,
+ -1.41229f, 0.148511f, -0.286832f, 0.664796f, 0.492254f,
+ 0.360567f, -0.533993f, 0.0435672f, -0.103001f, 0.220668f,
+ 0.594621f, -0.0213356f, -0.347638f, -0.694457f, 0.0759505f,
+ 0.161358f, -0.389384f, -0.0455192f, -0.61252f, -0.174173f,
+ -0.00788878f, -1.22487f, 0.332233f, -0.0457021f, -0.225918f,
+ -0.197657f, -0.115408f, -0.240589f, -2.05681f, 0.00914629f,
+ -1.92213f, 0.0268578f, -0.49076f, -0.0120123f, 0.291157f,
+ 0.267116f, -0.0775724f, 0.181115f, -0.392441f, -0.488114f,
+ -0.28842f, -0.115465f, 0.128974f, -0.0829899f, -0.14096f,
+ -0.140145f, -0.700281f, 0.0368945f, -0.437598f, 0.243485f,
+ -1.00301f, 0.332324f, 0.125014f, -0.0604481f, -0.0652028f,
+ -0.207295f, -1.0209f, -0.341525f, 0.191326f, -0.147578f,
+ 0.0878327f, 0.129827f, -0.0848319f, 0.187381f, -1.28663f,
+ 0.00537885f, -0.134277f, -0.0411126f, -0.3434f, -0.0456494f,
+ 0.37861f, 0.409095f, 0.237177f, -0.396855f, -0.205418f,
+ -1.31701f, -0.319032f, -0.123404f, -0.240005f, -0.305206f,
+ -0.0258176f, -0.26367f, -0.142396f, 0.191672f, -1.44061f,
+ 0.0554776f, -0.571839f, -0.284789f, -0.425677f, -0.0307376f,
+ 0.20275f, -0.223146f, 0.144612f, 0.0212636f, 0.0238303f,
+ -0.253802f, -0.188922f, -0.0637066f, -0.340836f, 0.124774f,
+ 0.130474f, -0.154099f, -0.0292733f, 0.158148f, -0.246989f,
+ -0.259059f, 0.220224f, 0.228449f, -0.41956f, -0.321848f,
+ -0.2396f, -0.316449f, -1.3363f, 0.0264099f, -1.46865f,
+ 0.113073f, 0.0722885f, -0.166986f, -0.164877f, 0.0360911f,
+ 0.534472f, -0.551152f, -0.328501f, 0.0781121f, -0.378112f,
+ -0.459502f, 0.28015f, -0.212302f, -0.521641f, 0.618993f,
+ -0.347709f, 0.266253f, -0.0280894f, 0.348511f, -0.0155031f,
+ -0.100693f, 0.0447673f, 0.277519f, -0.233998f, -0.0796738f,
+ -1.73644f, -0.160776f, 0.53092f, -0.180406f, 0.056447f,
+ 0.385356f, -0.262337f, -0.241479f, -0.271426f, -0.457354f,
+ -0.266788f, 0.367371f, -0.103065f, 0.47783f, -0.188327f,
+ -0.159636f, 0.00142907f, -0.409756f, 0.454889f, -0.24566f,
+ -0.0760084f, 0.286355f, 0.462102f, 0.0431695f, -0.127395f,
+ -0.200476f, -0.350557f, 0.217275f, -0.23975f, 0.255148f,
+ -0.280626f, 0.42476f, 0.157411f, 0.0358675f, -0.192591f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_32[] = {
+ 0.940498f, 0.15602f, -0.234831f, 0.0268585f, 0.144769f, 0.243081f,
+ 0.611406f, 0.366093f, 0.361868f, 0.39668f, 0.401479f, 0.369467f,
+ 0.0909503f, 0.710595f, 0.032786f, 0.525891f, -1.0232f, 0.732557f,
+ -0.064425f, 0.865222f, -0.042917f, -0.237191f, -0.527006f, -0.0172101f,
+ 0.59681f, -0.472405f, 0.0969218f, -0.250624f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_32[] = {
+ 0.355607f, 0.126701f, -0.0825159f, 0.200675f, -0.011308f,
+ -0.280057f, 0.559816f, 0.142689f, 0.0422419f, -0.151692f,
+ -0.0275637f, -0.283101f, -0.20822f, -0.200394f, 0.465427f,
+ 0.344491f, -0.525319f, -0.358813f, -0.39767f, 0.0974486f,
+ 0.00559058f, -0.00546089f, 0.0506486f, 0.114475f, -0.0436463f,
+ -0.574152f, -0.376294f, 0.16563f, -0.0967032f, 0.00579838f,
+ 0.0639909f, -0.037129f, 0.407574f, -0.231428f, 0.489326f,
+ -0.221566f, -0.270382f, -0.784628f, -0.155502f, 0.481698f,
+ -0.0296057f, 0.431855f, 0.840807f, 0.112291f, 0.773874f,
+ -0.0610936f, -0.012892f, 0.365154f, 0.0267687f, -0.0751114f,
+ 0.25043f, 0.516472f, -0.186133f, -0.12762f, -0.168804f,
+ -0.146309f, 0.139314f, -0.367113f, -0.601079f, 0.0559856f,
+ 0.176081f, 0.22397f, 0.434113f, 0.0363256f, 0.313051f,
+ 0.0143976f, 0.190076f, 0.474607f, -0.681134f, -0.0709097f,
+ -0.253289f, -0.216277f, -0.0593789f, -0.107795f, -0.194842f,
+ 0.513945f, 0.239171f, -0.720561f, 0.0136723f, -0.391147f,
+ -0.272043f, -0.164766f, 0.124248f, 0.147178f, -0.35497f,
+ 0.397725f, -0.117603f, 0.262937f, -0.331964f, 0.182418f,
+ 0.315671f, -0.0385649f, 0.488769f, -0.334568f, 0.00596018f,
+ 0.0661557f, -0.0446985f, -0.0928255f, -0.0221032f, -0.019045f,
+ -0.20881f, 0.197907f, -0.381881f, 0.0598071f, -0.0434551f,
+ 0.159283f, -0.110631f, 0.266996f, -0.0265494f, 0.135199f,
+ -0.00833162f, 0.804482f, -0.114698f, -0.15066f, -0.479553f,
+ 0.448407f, -0.344069f, -0.0280952f, -0.208211f, -0.102269f,
+ -0.679066f, -0.37476f, -0.0228875f, 0.0535049f, 0.111015f,
+ -0.18125f, -0.167584f, 0.0110497f, 0.262723f, -0.413839f,
+ -0.0611238f, 0.358499f, 0.0807514f, 0.208254f, 0.214499f,
+ 0.11137f, -0.14262f, -0.0513973f, 0.243718f, -0.373716f,
+ -0.00413366f, 0.216501f, -0.164149f, -0.064935f, -0.0840282f,
+ 0.0566148f, 0.0377686f, 0.289835f, 0.769388f, 0.891198f,
+ -0.592739f, 0.40744f, -0.153095f, 0.657311f, 0.140737f,
+ 0.28209f, 0.158344f, 0.353546f, 0.0868246f, 0.116887f,
+ 0.402004f, 0.437184f, 0.589219f, 0.760594f, -0.575419f,
+ -0.754308f, -0.709219f, -0.297814f, -0.418609f, -0.0262104f,
+ 0.0411959f, 0.0597708f, -0.143728f, -0.136642f, 0.099614f,
+ -0.257601f, -0.2404f, 0.305893f, 0.254009f, -0.0301398f,
+ -0.0653091f, -0.459002f, -0.163404f, 0.123152f, -0.0284252f,
+ -0.457272f, 0.00788622f, -0.828399f, -0.0534199f, 0.586877f,
+ 0.982728f, 0.424581f, 0.0891856f, 0.383182f, -0.122053f,
+ 0.0808408f, -0.00384914f, -0.0560201f, -0.0524772f, -0.263444f,
+ -0.239287f, -0.882777f, 0.0180592f, -0.0948711f, -0.177946f,
+ 0.0296473f, 0.096082f, 0.0455604f, -0.108608f, 0.00777951f,
+ -0.140896f, 0.117187f, -0.342467f, -0.0691604f, 0.0761611f,
+ -0.0892053f, 0.111386f, -0.167456f, 1.40616f, -0.00478793f,
+ 0.00547665f, -0.0441829f, 0.0151323f, -0.0674099f, -0.0380578f,
+ 0.16072f, 0.31882f, 0.245486f, -0.424318f, 0.101845f,
+ -0.203343f, -0.197402f, -0.163025f, -0.0771961f, -0.264435f,
+ 0.319429f, 0.250076f, 0.782726f, 0.386003f, 0.00700673f,
+ -0.375715f, 0.151453f, -0.296265f, -0.560183f, -0.00767249f,
+ -0.109593f, -0.119419f, -0.0161516f, 0.0380283f, -0.156417f,
+ 0.131708f, 0.396268f, -0.221796f, 0.232099f, 0.128852f,
+ 0.0567268f, 0.297297f, 0.173269f, 0.213411f, 0.0384426f,
+ -0.290985f, -0.0426841f, -0.488292f, -0.087101f, -0.311582f,
+ 0.83009f, -0.153163f, 0.903335f, -1.15644f, -0.0378635f,
+ -0.0552129f, -0.126362f, -0.176945f, 0.0653115f, 0.0989368f,
+ -0.333543f, -0.330586f, 0.29775f, -0.103535f, 0.210824f,
+ -0.00300509f, 0.317105f, 0.216852f, 0.479718f, 0.0485808f,
+ -0.15662f, 0.718199f, 0.327513f, 0.115169f, -0.423598f,
+ -0.456633f, -0.575814f, -0.494454f, 0.304411f, 0.0493055f,
+ -0.381171f, 0.467251f, -0.122872f, -0.167441f, 0.017253f,
+ -0.0583646f, -0.1586f, 0.214046f, -0.0284424f, -0.217112f,
+ 0.606567f, -0.107533f, 0.36615f, -0.0709227f, 0.604761f,
+ -0.244657f, -0.296651f, -0.595611f, -0.156629f, -0.693468f,
+ -0.310603f, 0.499272f, 0.282941f, 0.295043f, -0.178704f,
+ 0.281186f, 0.014329f, -0.120819f, 0.154234f, 0.0131325f,
+ -0.472231f, -0.631281f, 0.422955f, 0.711432f, -0.118025f,
+ 0.0864996f, 0.343971f, -0.301477f, -0.246638f, 0.165068f,
+ 0.218044f, 0.224236f, -0.0848522f, 0.00671216f, 0.401141f,
+ -0.218857f, -0.0298495f, -0.135725f, -0.377618f, 0.022473f,
+ 0.106955f, -0.0582005f, 0.0468484f, -0.0217442f, 0.130911f,
+ -0.0926905f, 0.383007f, -0.159353f, -0.222711f, -0.0286419f,
+ 0.372315f, -0.469095f, 0.797571f, -0.301315f, 0.239327f,
+ -0.997507f, -0.363409f, 0.353717f, 0.676686f, -0.0500028f,
+ 0.0638539f, -0.431927f, 0.243852f, 0.000884826f, -0.00166585f,
+ 0.0613292f, -0.029558f, -0.0248432f, -0.0125607f, -0.0309674f,
+ -0.743308f, 0.0409806f, 0.0921015f, 0.167816f, 0.406849f,
+ 0.095677f, 0.0308913f, 0.139956f, -0.400472f, 0.396617f,
+ 0.936517f, 0.355057f, -0.423816f, -0.232472f, -0.220188f,
+ -0.399746f, -0.409623f, -0.158797f, 0.361153f, 0.0327019f,
+ 0.0690844f, -0.032197f, 0.0248558f, 0.00438518f, 0.0222724f,
+ -0.326832f, -0.314295f, 0.156563f, 0.0562703f, 0.332694f,
+ 0.299424f, 0.228206f, 0.322038f, 0.0136098f, 0.0060297f,
+ -0.165851f, -0.306512f, 0.0796508f, -0.37158f, 0.239395f,
+ -0.349442f, 0.198515f, -0.253854f, -1.13694f, 0.0202873f,
+ -0.0504009f, -0.130528f, -0.017126f, -0.0370001f, -0.087458f,
+ -0.119952f, -0.130404f, 0.0333733f, -0.184736f, 0.182162f,
+ 0.227776f, -0.166563f, -0.156162f, 0.118215f, -0.220183f,
+ 0.00474779f, -0.107792f, 0.260493f, 0.11884f, 0.156587f,
+ 0.303936f, -0.131788f, -0.314774f, 0.310606f, 0.0935523f,
+ 0.790767f, 0.26461f, 0.0236426f, 0.0629469f, 0.0344072f,
+ -0.151513f, 0.211498f, 0.0245435f, 0.0629973f, 0.052019f,
+ -0.03308f, 0.123487f, 0.0885027f, 0.159172f, -0.0510615f,
+ 0.0298033f, -0.130515f, -0.121799f, -0.104915f, 0.208822f,
+ -0.310496f, -0.314106f, 0.303307f, -0.0196736f, 0.0420045f,
+ 0.461777f, -0.433699f, 0.00345407f, 0.703139f, -0.655637f,
+ -0.210767f, -0.201278f, 0.163694f, -0.236534f, 0.300877f,
+ 0.0769982f, -0.282453f, 0.149721f, -0.0303466f, -0.191473f,
+ -0.406056f, -0.213472f, 0.1619f, -0.245953f, 0.00544399f,
+ -0.121434f, 0.193012f, -0.307165f, 1.45431f, -0.161468f,
+ -0.12444f, -0.146129f, -0.0528212f, -0.0925165f, -0.134528f,
+ -0.479475f, 0.315525f, 0.133845f, 0.382158f, -0.0799693f,
+ -0.151041f, 0.255772f, 0.409536f, -0.240663f, -0.323741f,
+ -0.205876f, 0.03699f, -0.217541f, 0.108511f, 0.640628f,
+ 0.705993f, -0.423899f, -0.78314f, -0.100733f, -0.00859087f,
+ 0.0251879f, 0.0458335f, 0.00210128f, -0.047576f, -0.0560518f,
+ -1.23869f, -0.829914f, 0.0346551f, 0.350505f, 0.193688f,
+ 0.459154f, 0.137898f, 0.503818f, 0.260867f, 0.649539f,
+ 0.0150802f, 0.0239274f, -0.276069f, -0.0621478f, -0.193106f,
+ -0.0375665f, -0.654529f, 0.189493f, 0.446625f, -0.0208265f,
+ 0.019838f, -0.0201955f, 0.00180428f, -0.0110678f, -0.0172414f,
+ 0.0276489f, -0.252882f, -0.0351807f, -0.0518874f, 0.279098f,
+ -0.245122f, 0.101287f, -0.114202f, -0.0812187f, 0.572429f,
+ -0.0821731f, 0.564183f, 0.0222552f, 0.190111f, -0.0417497f,
+ -0.00385925f, -0.182995f, -0.240482f, -0.291572f, -0.0450444f,
+ 0.0962974f, -0.165973f, -0.0954637f, -0.163841f, -0.833405f,
+ -1.31541f, -0.336473f, -0.0920702f, 0.816105f, 0.393377f,
+ 0.0340241f, -0.0844545f, 0.61729f, -0.17596f, 0.241149f,
+ -0.42825f, -0.59091f, -0.290702f, 0.0796465f, 0.0982819f,
+ 0.466934f, 0.261666f, 0.0373333f, 0.332509f, -0.0266694f,
+ -0.0476951f, -0.00642167f, -0.0132542f, -0.000320841f, 0.00475532f,
+ 0.000502778f, 0.296534f, -0.13297f, -0.113082f, -0.327923f,
+ 0.35901f, -0.302246f, 0.189799f, -0.37994f, 0.16107f,
+ -0.20414f, 0.548575f, -0.460821f, 0.591878f, -0.213113f,
+ -0.169373f, -0.07332f, 0.228841f, 0.682302f, -0.0665316f,
+ -0.142456f, -0.0873117f, 0.00607451f, 0.0376443f, 0.0536673f,
+ -0.0109536f, -0.400279f, 0.550058f, 0.820871f, -0.666373f,
+ -0.471962f, -0.315925f, -0.313142f, 0.952742f, 0.473928f,
+ -0.119006f, 0.153241f, -0.0383078f, 0.631869f, -0.343423f,
+ -0.233473f, -0.218195f, -0.077688f, -0.728291f, 0.0382408f,
+ -0.00662886f, -0.0419666f, 0.0309776f, -0.0281592f, 0.0154229f,
+ -0.198534f, 0.0206324f, 0.0152272f, -0.235067f, 0.0330486f,
+ 0.139198f, -0.0612118f, 0.133154f, -0.258675f, 0.0900275f,
+ -0.127771f, 0.157322f, -0.00767807f, -0.329258f, 0.327458f,
+ 0.0528581f, -0.181125f, 0.409995f, -0.162979f, -0.0193475f,
+ 0.186009f, 0.0519501f, 0.651877f, -0.37821f, -1.10341f,
+ -0.189776f, -0.0922788f, 0.460256f, 0.168011f, 0.440295f,
+ 0.478135f, 0.374573f, 0.384048f, 0.116953f, 0.68886f,
+ -0.427727f, -0.36676f, -0.500013f, -0.228685f, -0.218859f,
+ 0.208396f, -0.0173765f, -0.0680241f, -0.00538013f, -0.0674409f,
+ -0.092764f, 0.0295707f, -0.0462887f, -0.00636006f, 0.0334169f
+};
+
+static const float av1_simple_motion_search_prune_rect_logits_bias_32[] = {
+ 0.176459f, 0.154405f, 0.281821f, 0.375264f, -0.882863f,
+ -0.240261f, -1.17075f, -0.280216f, -0.743836f, -0.317511f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_32 = {
+ NUM_FEATURES_32,
+ NUM_LOGITS_32,
+ NUM_HIDDEN_LAYERS_32,
+ {
+ NUM_LAYER_0_UNITS_32,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_kernel_32,
+ av1_simple_motion_search_prune_rect_logits_kernel_32,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_bias_32,
+ av1_simple_motion_search_prune_rect_logits_bias_32,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_32
+#undef NUM_FEATURES_32
+#undef NUM_LAYER_0_UNITS_32
+#undef NUM_LOGITS_32
+
+#define NUM_HIDDEN_LAYERS_16 1
+#define NUM_FEATURES_16 25
+#define NUM_LAYER_0_UNITS_16 32
+#define NUM_LOGITS_16 10
+
+static const float av1_simple_motion_search_prune_rect_logits_kernel_16[] = {
+ -0.520913f, 0.395611f, 0.0369091f, -0.318591f, -0.463252f,
+ 0.134992f, -0.43154f, -0.0739112f, -0.118817f, 0.476373f,
+ -0.281406f, 0.3413f, 0.456255f, 0.33307f, 0.2942f,
+ 0.1317f, 0.498113f, 1.95406f, -0.165726f, -0.219306f,
+ -0.302656f, -1.31157f, -0.433662f, 0.151716f, -0.214817f,
+ 0.504523f, -0.710049f, 0.359616f, -0.412695f, -0.103193f,
+ 0.341912f, 0.351378f, -0.181486f, 0.573862f, -0.0396254f,
+ -0.17855f, -0.276163f, 0.0367465f, -0.353905f, -0.204689f,
+ 0.309581f, -0.0439686f, -0.147855f, 0.152745f, 0.290871f,
+ 0.131049f, -0.27808f, -0.142997f, 0.207843f, -1.23074f,
+ -0.267714f, -0.336923f, 0.313781f, -0.61488f, -0.161984f,
+ 0.238059f, -0.0879942f, -0.085543f, -0.260156f, -0.13614f,
+ -0.242196f, 0.201216f, -0.248691f, 0.0936671f, -0.350522f,
+ -0.35002f, -0.156583f, -0.00579001f, 0.300578f, -0.341269f,
+ -0.290712f, 0.354802f, -0.31629f, 0.509107f, -0.236953f,
+ -0.0923519f, 0.544509f, -0.280991f, -0.017437f, -0.202721f,
+ -0.116388f, -0.7191f, 0.324586f, 0.254249f, 0.125505f,
+ 0.00658697f, -0.333322f, -0.126537f, -0.140004f, -0.0241202f,
+ -0.172466f, 0.210035f, -0.270833f, 0.0579044f, 0.0950352f,
+ -0.120382f, 0.063292f, -0.394925f, 0.482165f, 0.147753f,
+ 0.331465f, -0.187444f, 0.1083f, 0.414028f, 0.279238f,
+ -0.486889f, -0.674349f, -0.313656f, -0.131186f, -0.100662f,
+ 0.238191f, -1.19083f, -0.30667f, -2.4324f, 0.235311f,
+ 0.108605f, 1.67197f, 0.476157f, 0.30055f, 0.0839538f,
+ 0.408469f, -0.473517f, 0.560283f, -0.0188136f, 0.273824f,
+ -0.43707f, -0.0346978f, -0.438315f, -0.0196275f, -0.0567921f,
+ -0.220166f, 0.216175f, -0.0180461f, 0.0116429f, -0.0096949f,
+ -0.32613f, 0.176829f, -0.243563f, -0.240972f, -0.621819f,
+ -0.00619648f, -0.145525f, 0.124324f, -0.0306925f, 0.172208f,
+ -2.04631f, -0.200087f, -0.594135f, -0.352303f, -0.309826f,
+ 0.0922786f, -0.698371f, -0.0366823f, 0.0244036f, 0.338775f,
+ -0.115947f, 0.144971f, -0.0607037f, -0.762412f, 0.0125584f,
+ -0.262427f, -0.0830273f, -0.291252f, -0.176059f, -0.203983f,
+ 0.0871455f, -0.0894925f, 0.0426263f, -0.060001f, -0.542355f,
+ -0.407837f, -0.0419273f, 0.226608f, -0.114844f, 0.158733f,
+ -0.187237f, 0.113163f, -1.86337f, -0.367544f, -0.547048f,
+ -0.24192f, -0.226764f, 0.090912f, 0.819604f, 0.433766f,
+ -0.841657f, 0.446987f, -0.622761f, -0.0296385f, -0.130176f,
+ -0.0518136f, -0.640326f, -0.330107f, -0.137832f, -0.0119033f,
+ 0.39401f, 0.111331f, -0.141367f, -0.230289f, 0.171054f,
+ -0.924059f, -0.107317f, -0.347983f, 0.0261109f, 0.423002f,
+ -0.305817f, 0.247696f, 0.0436002f, 0.0305862f, -1.52448f,
+ -0.595587f, -0.155552f, -1.11949f, -0.513937f, 0.138347f,
+ -0.301487f, 0.352144f, -0.615801f, 0.0326701f, -0.215322f,
+ -0.0608176f, -0.416557f, -0.306073f, -0.441512f, -0.0569277f,
+ -0.709768f, -0.602527f, -0.311134f, 0.152471f, -0.255299f,
+ 0.354505f, 0.194464f, 0.0144251f, 0.110732f, -0.4452f,
+ -0.804814f, 0.205325f, -0.0957486f, 0.502684f, 0.09112f,
+ -0.533087f, -1.77979f, 0.556992f, -0.176157f, -0.642633f,
+ 0.11553f, -0.232561f, 0.161277f, -0.0631125f, -0.20759f,
+ 0.489253f, -0.067533f, 0.0231024f, -0.179831f, -0.272985f,
+ -0.390059f, 0.3089f, 0.185733f, -0.257065f, -0.508838f,
+ -0.550028f, 0.0665621f, -0.138288f, -0.413188f, 0.191193f,
+ -1.32969f, -0.431025f, 0.270242f, -0.340062f, 0.0817257f,
+ 0.0376051f, -0.18633f, 0.0828274f, 0.00670051f, -0.431295f,
+ -0.450316f, -0.173042f, -0.322248f, 0.370628f, 0.10019f,
+ 0.317293f, -0.266613f, 0.0752441f, -0.425656f, -0.112223f,
+ 0.557991f, -0.324368f, -0.195261f, -0.0526129f, -0.807472f,
+ -0.387466f, 0.192186f, 0.353213f, -0.120238f, 0.107686f,
+ 0.200678f, -0.75363f, 0.466857f, -0.282345f, -0.0849236f,
+ -0.0490695f, -0.00643182f, 0.123047f, -0.207805f, -0.130456f,
+ -1.09455f, 0.340973f, 0.334784f, 0.0706643f, -1.65681f,
+ -0.319952f, -0.198514f, -0.0787972f, 0.089524f, 0.0531034f,
+ -0.202705f, -0.0852339f, -0.62572f, -0.0734234f, -0.838088f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_16[] = {
+ -0.0616197f, 0.939947f, 0.521161f, 0.213886f, 0.130324f, -0.127443f,
+ -0.0538715f, 0.708746f, 0.445031f, 0.418781f, -0.114539f, 0.521941f,
+ 1.13719f, 0.606545f, -0.32193f, -0.150788f, 0.158487f, -0.224005f,
+ 0.654715f, 0.115729f, -0.286506f, -2.06223f, 0.0117697f, 0.503905f,
+ -0.102339f, 0.653256f, -0.813561f, 0.905235f, -0.417269f, -0.206265f,
+ 0.661496f, 0.95533f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_16[] = {
+ -0.203489f, 0.00686229f, -0.161414f, 0.0637276f, 0.27516f,
+ 0.512219f, 0.164205f, 0.00326062f, -0.41914f, -0.400334f,
+ 0.554419f, 0.715772f, -0.295569f, -0.703503f, 0.0137744f,
+ -0.0934259f, 0.174234f, -0.148618f, -0.0360558f, -0.0986598f,
+ -0.138502f, -0.0770713f, 0.122922f, -0.00784415f, 0.0953234f,
+ -0.255754f, -0.310967f, 0.185306f, 0.464554f, 0.147338f,
+ -0.0612304f, 0.164783f, 0.301097f, 0.161364f, -0.12723f,
+ -0.0265984f, -0.471361f, 0.0578776f, -0.362865f, 0.425789f,
+ 0.402758f, -0.190235f, 0.00549738f, -0.570908f, 1.27206f,
+ 0.048868f, -0.0097675f, 0.0708324f, 0.0456103f, 0.0149062f,
+ -0.563032f, -0.420573f, 0.107278f, 0.0938258f, 0.142712f,
+ -0.00251036f, -0.250583f, 0.522272f, 0.0113175f, 0.126751f,
+ -0.433028f, -0.035542f, -0.536686f, -0.0668722f, 0.253094f,
+ 0.254007f, -0.435505f, 0.343001f, 0.0531542f, -0.361914f,
+ -0.102664f, 0.0404874f, 0.132686f, 0.0762298f, 0.0236971f,
+ -0.419454f, 0.230877f, -0.223714f, 0.037813f, 0.0818604f,
+ 0.383705f, -0.235028f, -0.0554801f, 0.429851f, 0.0845829f,
+ 0.166295f, 0.355111f, -0.421197f, 0.298949f, 0.0218224f,
+ 0.445705f, -0.392217f, -0.429578f, -0.076276f, -0.0963531f,
+ -0.631425f, -0.225977f, 8.06349e-06f, 0.0676679f, 0.0779651f,
+ 0.0706891f, 0.101377f, 0.517103f, 0.0945502f, -0.52522f,
+ -0.312022f, 0.0358089f, 0.616509f, -0.0507444f, -0.465814f,
+ -0.0326024f, 0.591298f, 0.188544f, -0.0633316f, -0.199987f,
+ 0.403118f, -0.511281f, -0.696263f, 0.112996f, 0.103875f,
+ 0.0495595f, -0.0107449f, 0.521539f, -0.0123823f, -0.0642751f,
+ 0.08548f, -0.0679207f, 0.526558f, 0.0651114f, -0.342643f,
+ -0.349934f, 0.307437f, 0.368763f, -0.194851f, -0.134117f,
+ 0.102448f, -0.0520666f, 0.0415824f, -0.175085f, 0.272685f,
+ 0.0675856f, 0.120627f, 0.391408f, -0.135249f, -0.357024f,
+ 0.019666f, -0.0622677f, 0.407427f, 0.22655f, -0.129432f,
+ -0.165327f, 0.004893f, 0.5479f, 0.0613981f, -0.479682f,
+ -0.144228f, -0.130106f, 0.206458f, -0.342086f, 0.12691f,
+ -0.113554f, 0.231164f, -0.051419f, 0.0401286f, -0.560429f,
+ -0.070609f, 0.420232f, 0.442465f, -0.237501f, -0.000293732f,
+ -1.017f, -0.210222f, 0.0157063f, 0.0488178f, 0.0734721f,
+ -0.52626f, -0.276441f, -0.521579f, 0.443532f, -0.0819051f,
+ -0.0732633f, -0.17999f, 0.258525f, -0.0374872f, 0.150115f,
+ 0.0510939f, 0.168116f, 0.473372f, 0.824489f, 0.302195f,
+ -0.348613f, 0.238569f, 0.176444f, -0.633945f, -0.0567195f,
+ -0.0305827f, -0.0551851f, 0.85822f, -0.0628099f, 0.0364294f,
+ -0.234823f, 0.179067f, 0.143208f, -0.0511014f, -0.404191f,
+ 0.428035f, 0.0235506f, 0.371991f, -0.312909f, 0.550933f,
+ -0.389265f, -0.271813f, -0.293461f, -0.583752f, 0.179991f,
+ 0.191698f, 0.659094f, 1.07941f, -0.509555f, -0.100638f,
+ 0.079988f, -0.0519107f, -0.112723f, -0.0663326f, 0.0353569f,
+ -0.795055f, -0.465999f, 0.283579f, 0.340913f, 0.152738f,
+ 0.294664f, 0.527839f, 0.187735f, 0.359461f, 0.164629f,
+ 0.107512f, 0.390402f, 0.236702f, 0.114674f, -0.525655f,
+ -0.555476f, -0.6589f, -0.266601f, -0.0946547f, 0.6306f,
+ 0.0248513f, 0.038497f, 0.432706f, -0.0715465f, 0.0410172f,
+ -0.115313f, -0.428684f, 0.136283f, 0.0913185f, 0.11277f,
+ 0.0968689f, -0.00437052f, 0.0888981f, 0.10304f, 0.02442f,
+ -0.211315f, 0.00981596f, -0.0974827f, 0.208611f, 0.140644f,
+ 0.0315567f, 0.350332f, -0.291049f, -0.0715449f, -0.352992f,
+ -0.858004f, 0.828658f, 0.439092f, 0.0151291f, 0.0503828f,
+ 0.0656112f, -0.710749f, -0.0951757f, 0.193908f, 0.00908018f,
+ 0.141486f, -0.0657711f, 0.099791f, 0.153729f, -0.419576f,
+ -0.892636f, -0.0449268f, -0.170786f, -0.156564f, 0.384511f,
+ 0.296565f, 0.0569815f, -0.103938f, 1.27479f, -0.0406475f,
+ 0.154083f, -0.186442f, 0.0282588f, 0.0312102f, -0.188994f,
+ 0.284243f, -0.564693f, 0.425525f, -0.00924596f, 0.810003f,
+ 0.233812f, -0.0180273f, 0.121082f, -0.209096f, 0.151437f,
+ 0.286921f, -0.348095f, 0.174813f, -0.413798f, 0.108994f,
+ -0.34266f, -0.0337981f, -0.459f, -0.409812f, -0.0890104f,
+ 0.0834802f, -0.00259191f, -0.105914f, -0.164207f, 0.0697689f,
+ -0.312098f, -0.00650536f, -0.486758f, -0.248486f, 0.24314f,
+ -0.0857144f, 0.0884781f, -0.65615f, -0.121744f, 0.0709335f,
+ -0.0237193f, 0.10764f, -0.0409452f, -0.0824305f, 0.42329f,
+ 0.138258f, 0.502607f, 0.228545f, 0.0687789f, 0.0361586f,
+ 0.39074f, 0.0722654f, -0.0133148f, 0.283278f, 0.0743384f,
+ 0.310292f, -0.297675f, -0.359935f, 0.521021f, -0.10082f,
+ -0.272333f, 0.0120283f, 0.138118f, -0.123711f, -0.0711386f,
+ 0.0170747f, 0.831039f, 0.0509626f, 0.790608f, -0.0863406f,
+ -0.31962f, 0.0631013f, 0.0873453f, -0.472331f, -0.0826027f,
+ -0.241722f, 0.148835f, -0.131611f, 0.000195347f, -0.0615804f,
+ -0.838663f, -0.586979f, 0.247713f, 0.362254f, 0.492727f,
+ -0.132163f, 0.0516545f, 0.477838f, -0.0395182f, 0.0124993f,
+ -0.771514f, 0.0386912f, -0.118525f, -0.346172f, -0.265905f,
+ -0.175257f, -0.406287f, 0.393837f, 0.409096f, -0.408501f,
+ -0.0207146f, 0.0487809f, 0.0636982f, 0.0276368f, 0.0878249f,
+ 0.0425889f, 0.0868633f, 0.17423f, -0.128217f, -0.477068f,
+ -0.321294f, 0.0393771f, 0.00812823f, -0.350529f, -0.129012f,
+ 0.439953f, 0.396662f, 0.410475f, -0.123129f, -0.565966f,
+ 0.0298635f, -0.614611f, -0.477514f, 0.453651f, 0.0617068f,
+ 0.0530563f, 0.0479074f, 0.213551f, 0.039034f, 0.0449095f,
+ -1.06868f, -1.2654f, -0.175482f, 0.595068f, -0.230095f,
+ 0.719838f, -0.272148f, 0.696564f, 0.0485396f, 0.468584f,
+ 0.0695439f, -0.0842122f, -0.228978f, 0.161397f, -0.000441421f,
+ -0.0297514f, -0.250599f, 0.196656f, 0.608423f, -0.0112096f,
+ 0.0236881f, -0.00167311f, 0.0040709f, 0.015495f, 0.00757698f,
+ -0.165886f, 0.359767f, -0.0214696f, 0.377208f, 0.0303547f,
+ 0.0657094f, 0.140775f, 0.21867f, -0.203922f, 0.263878f,
+ -0.0529099f, 0.202438f, -0.243226f, 0.156659f, -0.627056f,
+ -0.845036f, -0.500873f, 0.172588f, 0.402972f, -0.147734f,
+ 0.151792f, -0.075579f, 0.443519f, 0.0311335f, -0.0328222f,
+ -0.0299781f, 0.435956f, -0.0987376f, 0.288402f, 0.135902f,
+ -0.173584f, -0.186255f, 0.224524f, -0.249645f, 0.123702f,
+ -0.0846244f, 0.491317f, 0.544846f, 0.338677f, -0.258885f,
+ -0.617434f, -0.629003f, -0.347233f, 0.181262f, -0.0606015f,
+ -0.537766f, 0.215089f, -0.334527f, 0.0488534f, 0.0577997f,
+ -1.12431f, -0.932292f, -0.11559f, 0.573715f, 0.151128f,
+ 0.693818f, -0.16956f, 0.802591f, -0.231531f, 1.04318f,
+ -0.476417f, 0.293452f, -0.610136f, 0.27506f, -0.384012f,
+ 0.305366f, -0.0540464f, -0.337583f, -0.174285f, 0.157248f,
+ 0.0477345f, -0.0229535f, 0.0475766f, -0.00603319f, 0.00856119f,
+ -0.702893f, -0.0579673f, 0.183024f, -0.166222f, 0.109763f,
+ -0.148019f, -0.258873f, -0.0820157f, -0.186716f, -0.449265f,
+ -0.0534138f, 0.15732f, 0.46357f, 0.00502591f, -0.0282085f,
+ 0.152277f, -0.855199f, -0.357115f, 0.0366159f, 0.0131101f,
+ -0.0407758f, 0.0462835f, 0.146309f, -0.00276278f, -0.0591814f,
+ -0.109437f, 0.506764f, -0.044421f, 0.465907f, 0.114444f,
+ -0.241053f, -0.362649f, -0.432615f, 0.199989f, -0.00635866f,
+ -0.521886f, 0.0958924f, -0.485725f, 0.0430527f, 0.069746f,
+ 0.681091f, -0.288144f, 0.505671f, 0.0489065f, -0.0373836f,
+ 0.266079f, 0.145173f, -0.011481f, -0.225074f, -0.754501f,
+ -0.122939f, -0.294213f, 0.334738f, 0.281561f, 0.558977f,
+ -0.21551f, -0.346507f, -0.0625635f, 0.0782034f, -0.236999f,
+ -0.803783f, -0.601117f, 0.091192f, 0.636122f, -0.250626f,
+ 0.0354961f, 0.103915f, 0.508571f, 0.329911f, -0.0425999f,
+ -0.0867587f, -0.0385824f, 1.13914f, -0.0261992f, 0.00484478f,
+ 0.124603f, -0.012173f, -0.377358f, -0.243563f, 0.236094f,
+ 0.145663f, -0.132752f, 0.347497f, -0.529315f, 0.271632f,
+ -0.372805f, 0.0261836f, 0.126169f, 0.0941008f, 0.283773f,
+ 0.765701f, -0.226477f, -0.181549f, -0.306896f, 0.110165f,
+ -0.0784234f, -0.0827892f, -0.0374252f, -0.0950872f, -0.451015f,
+ -0.995793f, -0.452663f, 0.293338f, -0.380865f, 0.032683f,
+ 0.0178248f, 0.0699194f, -0.0811722f, -0.0866096f, 0.139289f,
+ 0.296604f, 0.192293f, -0.0589607f, -0.179878f, 0.00360266f,
+ -0.0905794f, 0.136744f, -0.191555f, 1.31877f, -0.0592033f,
+ -0.158766f, 0.0214746f, -0.190113f, -0.116671f, 0.0449292f,
+ -0.109533f, -0.709307f, 0.386424f, 0.40201f, 0.262211f,
+ -0.155244f, 0.233988f, -0.0166317f, 0.462665f, 0.0484462f,
+ 0.210902f, -0.352798f, 0.38698f, -0.228261f, -0.084309f,
+ -0.220751f, -0.170879f, -0.352617f, -1.24277f, 0.266004f,
+ -0.0125749f, -0.0380073f, 0.101838f, -0.0483024f, -0.0629178f,
+ -0.0695577f, -0.103439f, 0.242131f, -0.0796858f, 0.349718f,
+ -0.332045f, 0.0138352f, -0.380235f, -0.28717f, -0.176276f,
+ 0.865903f, 0.36593f, 0.243925f, -0.422289f, -0.117327f,
+ 0.21876f, 0.245393f, -0.426134f, -0.186077f, 0.0352515f,
+ -0.123742f, 0.249376f, 1.3281f, 0.0707771f, 0.071415f,
+ -0.286827f, -0.131691f, -0.270881f, -0.434378f, 0.376064f,
+ 0.35966f, 0.513374f, 0.439378f, -0.222716f, -0.5874f,
+ 0.487997f, -0.293271f, -0.184245f, -0.037256f, 0.17723f,
+ -0.438651f, 0.428184f, 0.112983f, -0.449287f, -0.0451963f,
+ 0.0854929f, 0.0735442f, -0.0148642f, -0.0586782f, -0.176455f,
+ -0.438979f, -0.127109f, 0.211478f, 0.388035f, -0.0372021f,
+ 0.220575f, 0.382144f, 0.302121f, 0.0857121f, 0.193445f,
+ -0.488858f, -0.195288f, -0.316184f, -0.314026f, -0.111956f,
+ 0.0744768f, 0.292709f, 0.30187f, -0.285506f, -0.105006f,
+ 0.0851402f, -0.082318f, 0.277518f, 0.725294f, -0.756304f,
+ 0.0155309f, -0.378542f, 0.293377f, -0.347252f, -0.338458f,
+ 0.221449f, -0.176443f, -0.131972f, 0.0129163f, -0.290649f,
+ 0.198596f, -0.0721333f, 0.620591f, 0.568736f, 0.174001f,
+ -0.205186f, -0.265606f, -0.249155f, 0.299163f, 1.11842f,
+ 0.17423f, 0.196417f, -0.014484f, 0.0735422f, 0.26329f,
+ 0.12284f, -0.750305f, -0.351337f, 0.121994f, -0.00542878f,
+ -0.295707f, -0.094124f, 0.300993f, 0.412408f, -0.170761f,
+ -0.0676329f, -0.106638f, -0.419785f, -0.43878f, 0.22421f,
+ 0.0339903f, 0.619851f, 0.0615381f, 0.514631f, 1.35424f,
+ -0.0679228f, -0.203457f, 0.131948f, -0.0041251f, -0.209054f
+};
+
+static const float av1_simple_motion_search_prune_rect_logits_bias_16[] = {
+ 0.304025f, 0.131887f, 0.259279f, -0.561564f, -0.161729f,
+ -0.208036f, 0.102206f, -0.162937f, -1.42311f, -0.708305f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_16 = {
+ NUM_FEATURES_16,
+ NUM_LOGITS_16,
+ NUM_HIDDEN_LAYERS_16,
+ {
+ NUM_LAYER_0_UNITS_16,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_kernel_16,
+ av1_simple_motion_search_prune_rect_logits_kernel_16,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_bias_16,
+ av1_simple_motion_search_prune_rect_logits_bias_16,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_16
+#undef NUM_FEATURES_16
+#undef NUM_LAYER_0_UNITS_16
+#undef NUM_LOGITS_16
+
+#define NUM_HIDDEN_LAYERS_8 1
+#define NUM_FEATURES_8 25
+#define NUM_LAYER_0_UNITS_8 32
+#define NUM_LOGITS_8 4
+
+static const float av1_simple_motion_search_prune_rect_logits_kernel_8[] = {
+ -0.266303f, -0.387676f, 0.204501f, -0.120842f, -0.0752326f, 0.0337739f,
+ 0.0243477f, -0.356748f, 0.0143051f, -0.16403f, -0.139013f, 0.175003f,
+ -0.206754f, 0.349059f, 0.181763f, 0.212768f, -0.313783f, 0.182829f,
+ 0.00205376f, -0.939525f, -0.0992424f, 0.306254f, 0.083329f, -0.133137f,
+ -0.179022f, -0.0237902f, 0.0601026f, -0.216698f, -0.551149f, 0.081711f,
+ -0.442191f, 0.0680832f, -0.0353678f, 0.237704f, 0.23155f, -0.36097f,
+ 0.123389f, -0.288927f, 0.178133f, -0.152222f, -0.235648f, -0.0495293f,
+ -0.316522f, 0.034207f, 0.0463139f, -0.817825f, 0.417443f, -0.110984f,
+ -0.402371f, 0.0341694f, -0.37383f, 0.414532f, 0.093993f, 0.0039505f,
+ 0.0803175f, -0.511859f, -0.0154802f, 0.0979595f, 0.0909049f, -0.120938f,
+ -0.577382f, -0.155041f, -0.404295f, 0.122223f, -0.084703f, 0.00415336f,
+ 0.149135f, 0.113219f, 0.124236f, -0.240905f, 0.163909f, -0.154202f,
+ -0.208917f, 0.00200158f, -0.71796f, 0.105984f, -0.131996f, -0.539603f,
+ 0.223768f, -0.0710733f, -0.346679f, -0.0745909f, 0.171032f, 0.215701f,
+ 0.218519f, 0.105981f, -0.096209f, -0.166453f, -0.468894f, -0.401578f,
+ -0.239222f, 0.111382f, 0.38747f, -0.164734f, -0.175955f, 0.336621f,
+ -0.0305501f, -0.0576765f, 0.0672671f, -0.183692f, 0.412082f, -0.262951f,
+ -0.153429f, -0.128589f, -0.530472f, 0.0936412f, -1.08296f, -0.45147f,
+ 0.0714904f, -3.96842f, 0.438125f, -0.313945f, 0.231104f, -0.00183851f,
+ -0.0192768f, -0.637531f, -0.109296f, 0.0531702f, 0.00262162f, -0.615951f,
+ -0.546241f, -0.635305f, -0.0762367f, 0.0122019f, 0.423693f, -0.129142f,
+ -0.112242f, 0.295184f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_bias_8[] = {
+ -2.16023f, -3.12831f, -0.213206f, -2.97875f, -1.83791f, -2.84713f,
+ -0.909636f, -2.05893f, 0.00525274f, -1.51672f, -3.95017f, 1.82847f,
+ -0.853224f, -3.29503f, -0.537517f, 0.923106f, -3.18665f, -1.29905f,
+ 1.64506f, -1.99848f, -2.24315f, 0.408613f, 0.503671f, -3.83393f,
+ -2.88388f, -3.52337f, 1.46818f, -1.67169f, -3.83253f, 1.52644f,
+ -0.490783f, -0.415782f
+};
+
+static const float av1_simple_motion_search_prune_rect_layer_0_kernel_8[] = {
+ -0.702198f, -0.102148f, 0.0564545f, -0.0555548f, 0.16184f,
+ 0.0950792f, 0.136974f, -0.00824146f, 0.05746f, 0.0447542f,
+ 0.145978f, 0.0855769f, -0.041449f, 0.301347f, -0.0206691f,
+ -0.0662514f, -0.0525079f, -0.0998387f, -0.0891438f, 0.110545f,
+ -0.863098f, -1.83798f, 0.238818f, 0.127797f, 0.116872f,
+ -0.270655f, -0.21057f, 0.197013f, -0.123332f, 0.137104f,
+ -0.174766f, -0.00803025f, 0.0234369f, -0.0894175f, -0.0380927f,
+ 0.00827928f, -0.134148f, 0.110575f, -0.250173f, 0.116273f,
+ 0.0197749f, 0.270391f, 0.108437f, 0.173197f, -0.0650348f,
+ 0.0884626f, 0.262792f, 0.0649228f, 0.5573f, -2.81315f,
+ -0.479801f, -1.15825f, 0.0807932f, -0.19144f, 0.404016f,
+ -0.211521f, 0.233269f, -0.391414f, 0.160381f, -0.277233f,
+ 0.426354f, 0.156839f, 0.494315f, -0.214259f, -0.0132062f,
+ 0.148628f, -0.0899568f, 0.161845f, 0.467689f, 0.229474f,
+ 0.590634f, -0.705793f, -0.0486113f, -0.439088f, 0.994566f,
+ 0.679065f, 0.777869f, -0.225291f, -0.0303006f, -0.638782f,
+ -0.0824632f, -0.128561f, -0.327603f, 0.105624f, 0.567581f,
+ -0.396135f, -0.471028f, 0.181286f, 0.274604f, 0.180169f,
+ 0.0612144f, -0.865004f, 0.0306804f, 0.142985f, -0.0914358f,
+ -0.243284f, 0.358359f, -0.443847f, -0.371978f, 0.606933f,
+ -0.900408f, -0.52076f, 0.472118f, 0.0610973f, 0.152526f,
+ -0.550379f, 0.309331f, -0.141573f, 0.203046f, -0.231485f,
+ 0.505156f, 0.393224f, 0.435487f, -0.218681f, 0.123707f,
+ -0.270383f, -0.033565f, 0.210373f, -2.33967f, 0.367434f,
+ 0.0308118f, -0.205771f, 0.546141f, 0.19837f, 0.035648f,
+ -0.467007f, -1.50995f, -0.0314176f, 0.11762f, -0.15307f,
+ 0.618257f, -0.139502f, 0.303386f, -0.00758681f, 0.228107f,
+ -0.594499f, -0.201984f, -0.239666f, 0.114878f, -0.922174f,
+ -0.530137f, -0.379366f, -0.319582f, 0.0889624f, -0.00544663f,
+ 0.316264f, -0.204262f, -0.0959358f, 0.23552f, 0.141369f,
+ -0.207129f, -1.04067f, -0.0780501f, 0.226768f, -0.246752f,
+ 0.0823105f, 0.114783f, 0.49315f, 0.0197732f, 0.705433f,
+ 0.158076f, -0.250584f, -0.157326f, -0.0439547f, -0.139047f,
+ 0.090531f, -0.38833f, 0.743143f, -1.47418f, -0.155009f,
+ 0.511466f, -0.726716f, -0.181075f, 0.450133f, -0.390204f,
+ 0.292725f, 0.00811462f, -0.347738f, 0.613381f, -0.237124f,
+ 0.750748f, -0.383123f, 0.410309f, -0.204166f, 0.667199f,
+ -0.313197f, 0.436059f, -0.607571f, 0.193681f, 0.409399f,
+ 0.631747f, -0.0454149f, 0.198232f, 0.345591f, -0.0137374f,
+ -0.307014f, -0.535515f, 0.764678f, -0.225686f, -0.451621f,
+ -2.75564f, -1.52877f, 0.0511933f, 0.905979f, 0.145029f,
+ 0.759615f, 0.130166f, 0.83827f, 0.0655081f, 1.07555f,
+ -0.529777f, 0.682967f, -0.412052f, 0.611947f, -0.83676f,
+ 0.940695f, -0.465681f, 0.51505f, -0.883659f, -0.105524f,
+ -0.0344173f, -0.0683618f, -0.00698688f, -0.139349f, 0.135741f,
+ -0.294455f, -0.377834f, -0.602084f, -1.00128f, 0.483291f,
+ 1.25327f, 0.178987f, 0.75068f, -0.520731f, -0.325517f,
+ 0.272032f, 0.144144f, -0.279453f, 0.564907f, 0.144036f,
+ 0.297448f, -0.504243f, -0.250508f, -1.26395f, 0.4816f,
+ 0.392771f, -0.389961f, -0.261585f, -0.127124f, -0.202945f,
+ -0.709716f, -0.174719f, 0.113613f, 0.477753f, -0.226659f,
+ 0.0697828f, -0.177994f, 0.300726f, -0.185504f, 0.339424f,
+ -0.316746f, 0.369693f, -0.339723f, -0.143886f, -0.0326589f,
+ -0.268761f, -0.241094f, 0.284876f, -0.0270867f, -0.207397f,
+ -1.42738f, 0.495612f, -0.0277732f, 0.199675f, 1.48638f,
+ -0.659257f, -1.28199f, 0.498702f, 0.140695f, 0.571152f,
+ 0.416368f, 0.14153f, 0.126876f, 0.521114f, -0.00150571f,
+ 0.375581f, 0.00537624f, 0.1286f, -0.332227f, 0.417663f,
+ -0.539023f, 0.217124f, -0.787111f, -0.0335266f, 1.56751f,
+ 0.0640563f, -0.158791f, 0.118195f, 0.000970493f, -0.0403852f,
+ -0.0572557f, -0.0201181f, -0.10255f, 0.63237f, 0.156662f,
+ 0.418696f, -0.274802f, -0.663923f, -0.375232f, -0.40846f,
+ 0.462092f, 1.2176f, -0.301532f, -0.779704f, -0.112876f,
+ 0.0806591f, -0.0141923f, 0.00960801f, -0.663557f, 0.0979948f,
+ -0.0575999f, -0.012847f, 0.0403853f, -0.133666f, -0.00330217f,
+ -0.931518f, -0.774599f, -0.21391f, 0.377601f, -0.183365f,
+ 0.299094f, 0.0238552f, 0.206716f, -0.18959f, 0.346013f,
+ -0.150991f, -0.192817f, -0.293962f, -0.0537604f, -0.0648171f,
+ -0.275941f, -0.144854f, -0.224092f, 2.43113f, 0.0422494f,
+ -0.047236f, -0.0262028f, 0.0282119f, -0.175553f, 0.0888502f,
+ 0.580682f, 0.951055f, -0.284441f, -0.120133f, -0.268058f,
+ -0.312083f, -0.411556f, 0.21431f, -0.28033f, 0.324851f,
+ -1.02787f, -0.936816f, -0.577628f, 0.544743f, 0.295807f,
+ 0.406157f, 0.447927f, 0.25369f, -0.811421f, -0.0424979f,
+ -0.189867f, 0.00778673f, -0.113587f, -0.116175f, -0.0542222f,
+ -1.80089f, -1.44175f, -0.35332f, 0.191314f, -0.236691f,
+ -0.0261926f, -0.502363f, 0.252278f, -0.485478f, 0.296495f,
+ 0.455612f, -0.0489631f, 0.227255f, 0.170975f, 0.473487f,
+ 0.257812f, 0.178048f, 0.2506f, 2.04637f, -0.173857f,
+ 0.0583379f, 0.00765589f, -0.025772f, -0.162666f, -0.016214f,
+ -0.607486f, -0.0808025f, 0.0551611f, -0.0772291f, 0.126421f,
+ 0.10869f, -0.0877463f, -0.111527f, -0.0775766f, 0.503886f,
+ -0.002757f, -0.0421354f, -0.247857f, 0.140827f, 0.383576f,
+ 0.228232f, -0.157877f, -0.0927911f, 0.344687f, 0.191181f,
+ 0.236533f, 0.00102869f, -0.0184502f, -1.4509f, -1.15945f,
+ -0.521978f, -0.643225f, 0.133139f, 0.0660321f, 0.0851957f,
+ 0.0303648f, 0.0296239f, 0.0455713f, 0.175647f, 0.080532f,
+ 0.0445691f, -0.257356f, -0.125602f, -0.138829f, -0.167057f,
+ -0.0992552f, -0.13944f, 0.507531f, 0.444997f, 0.221452f,
+ -0.308384f, -0.327554f, 0.13235f, 2.1487f, -1.15453f,
+ -0.280239f, -0.363582f, -0.00358745f, 0.012866f, 0.251088f,
+ 0.0676416f, 0.178492f, -0.136631f, 0.197938f, -0.078198f,
+ 0.812439f, 1.1173f, 0.712113f, 1.10124f, -0.836503f,
+ -1.22433f, -1.07894f, -1.29215f, 0.56057f, 2.23928f,
+ -0.419029f, 0.282178f, -0.0719266f, -0.172192f, 0.28034f,
+ -2.99124f, -2.01481f, 0.0688982f, 0.697466f, 0.00635555f,
+ 0.566069f, 0.047534f, 0.507755f, -0.00690707f, 0.712594f,
+ -0.191467f, 0.355733f, -0.480016f, 0.664669f, -0.390619f,
+ 0.351199f, -0.482342f, 0.325005f, 1.9089f, 0.155987f,
+ 0.17032f, 0.132729f, 0.0402649f, 0.146991f, 0.0314905f,
+ -0.775316f, -0.208892f, -0.105993f, 0.0181653f, -0.12735f,
+ 0.0897852f, 0.0470231f, 0.25807f, 0.127406f, -0.0893252f,
+ -0.279776f, 0.190844f, 0.110384f, -0.148833f, 0.025293f,
+ 0.239838f, 0.00932245f, 0.35103f, -0.128268f, -0.0536754f,
+ 0.506899f, -0.16793f, 0.0955582f, -2.01108f, 0.721433f,
+ -2.31413f, -2.08646f, 0.033315f, 0.689828f, -0.271213f,
+ 0.790425f, -0.114234f, 0.755325f, -0.211533f, 0.774544f,
+ -0.263268f, 0.795762f, -0.551455f, 0.953602f, -0.168454f,
+ 0.529055f, -0.768991f, 0.882371f, 0.29763f, -0.155017f,
+ 0.00464101f, 0.121093f, 0.948271f, 0.113138f, -0.110332f,
+ -2.0492f, -1.31322f, -0.129212f, 0.464778f, -0.181465f,
+ 0.618403f, 0.0627984f, 0.465228f, 0.165729f, 0.278277f,
+ -0.563276f, -0.358358f, -0.590638f, 0.0104993f, 0.731206f,
+ 0.752569f, 0.631615f, 0.811822f, 0.129804f, -0.0558327f,
+ 0.570081f, -0.417922f, -0.168275f, 0.0703671f, 0.269127f,
+ 0.240457f, -0.197159f, -0.00179261f, 0.220065f, 0.463511f,
+ 0.0714626f, -0.716477f, -0.441865f, -0.717028f, -0.149176f,
+ 0.452182f, 0.662699f, -0.906534f, -0.817133f, 0.237747f,
+ 0.26024f, -7.7441e-05f, 0.0934616f, 0.824641f, -0.0404494f,
+ -0.088297f, -0.157899f, 0.037408f, 0.132435f, -0.316155f,
+ -0.276785f, 0.0117868f, 0.185008f, 0.32369f, -0.465855f,
+ -0.302127f, 0.303289f, 0.338597f, -0.665408f, -0.507594f,
+ 0.526979f, 0.532091f, 0.234395f, 0.754063f, 0.116769f,
+ 0.0800309f, -0.939344f, -1.51269f, 1.4583f, 0.178444f,
+ 0.0106756f, -0.213468f, -0.00369439f, 0.071015f, -0.192798f,
+ -0.0933147f, -0.129901f, -0.368279f, -0.246564f, 0.126966f,
+ 0.478565f, -0.476246f, -0.762863f, 0.168883f, 0.536136f,
+ -0.272969f, 0.2573f, -0.161577f, 0.311428f, -0.777994f,
+ -1.29752f, 0.216046f, 0.329016f, 1.57265f, 0.168075f,
+ -0.192518f, 0.0829308f, -0.073533f, -0.0202034f, 0.114716f,
+ -0.34888f, -0.519215f, 0.190809f, 0.0138507f, 0.133635f,
+ 0.14194f, 0.410618f, -0.165106f, 0.214438f, 0.0438265f,
+ -0.8481f, -1.19182f, -1.07878f, -0.882217f, 0.45616f,
+ 0.977385f, 0.74929f, 0.918466f, 0.904704f, 0.041938f,
+ 0.0362776f, 0.0757255f, 1.14007f, 0.0516825f, -0.160068f,
+ 0.219535f, 0.638634f, -0.0284544f, -0.222849f, -0.0344915f,
+ -0.0350256f, -0.0504452f, -0.0458416f, 0.146099f, 0.0783083f,
+ 0.206579f, 0.241264f, 0.28401f, 0.0425312f, -0.802049f,
+ -0.746271f, -0.578969f, -0.078218f, 0.436176f, -0.281465f,
+ -2.5539f, 0.237868f, -0.121796f, 0.0715619f, 0.106992f,
+ -0.621862f, -0.167142f, 0.153716f, 0.0570912f, -0.06525f,
+ -0.923773f, 0.130759f, 0.0517066f, 0.0729862f, -0.873064f,
+ 0.0403328f, -0.186499f, -0.0831918f, -0.223723f, 0.144697f,
+ 0.212845f, 0.416876f, 0.361598f, 0.138229f, 0.0728777f,
+ -1.95419f, -0.00382816f, -0.0440387f, 0.433627f, 0.44781f,
+ -1.05229f, -1.54506f, 0.564827f, -0.263456f, 0.296105f,
+ -0.158055f, 0.388274f, -0.366639f, 0.212006f, -0.245619f,
+ 0.593064f, 0.088727f, 0.410632f, -0.263462f, 0.507075f,
+ -0.0974155f, 0.275268f, -0.1293f, 0.136679f, 1.98276f,
+ 0.411766f, 0.391987f, 0.34283f, -0.114077f, 0.258462f,
+ -0.302443f, 0.301138f, -0.00726621f, 0.276441f, -0.291582f,
+ 0.66498f, -0.321451f, -0.332805f, 0.0943272f, 0.572253f,
+ -0.45818f, -0.0219593f, -0.151679f, 0.402033f, -1.15502f,
+ -0.882955f, 0.772904f, 0.88126f, -0.149555f, 0.709525f,
+ 0.350116f, -0.21531f, 0.797893f, 0.0230234f, 0.0203034f,
+ 0.2744f, 1.08273f, 0.039349f, 0.503909f, -0.45892f,
+ -0.579516f, -0.344058f, 0.390628f, -0.386941f, -0.430317f,
+ -0.0807066f, 0.435906f, 0.522996f, 0.724476f, -0.74371f,
+ -0.05376f, -0.340898f, -0.962646f, -0.0278005f, 0.0981149f,
+ -0.0811161f, 0.00237994f, 0.850042f, 0.0665473f, 0.134413f
+};
+
+static const float av1_simple_motion_search_prune_rect_logits_bias_8[] = {
+ 1.63404f, -0.715866f, -1.0132f, -2.08745f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_8 = {
+ NUM_FEATURES_8,
+ NUM_LOGITS_8,
+ NUM_HIDDEN_LAYERS_8,
+ {
+ NUM_LAYER_0_UNITS_8,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_kernel_8,
+ av1_simple_motion_search_prune_rect_logits_kernel_8,
+ },
+ {
+ av1_simple_motion_search_prune_rect_layer_0_bias_8,
+ av1_simple_motion_search_prune_rect_logits_bias_8,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_8
+#undef NUM_FEATURES_8
+#undef NUM_LAYER_0_UNITS_8
+#undef NUM_LOGITS_8
+
+static const NN_CONFIG
+ *const av1_simple_motion_search_prune_rect_nn_config[5] = {
+ &av1_simple_motion_search_prune_rect_nn_config_128,
+ &av1_simple_motion_search_prune_rect_nn_config_64,
+ &av1_simple_motion_search_prune_rect_nn_config_32,
+ &av1_simple_motion_search_prune_rect_nn_config_16,
+ &av1_simple_motion_search_prune_rect_nn_config_8,
+ };
+
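+/* Usage note (editor's sketch, not part of the upstream tables): the five
+ * configs above are indexed by square block size, 128x128 at index 0 down to
+ * 8x8 at index 4. Each model takes 25 normalized features and emits one logit
+ * per rectangular-partition candidate (10 logits for the 64/32/16 models, 4
+ * for the 8x8 model). A minimal, hedged example assuming the av1_nn_predict()
+ * helper declared in "av1/encoder/ml.h"; the feature extraction and the
+ * pruning threshold are illustrative assumptions:
+ *
+ *   float features[25];  // filled by the caller with normalized features
+ *   float logits[10];
+ *   const NN_CONFIG *cfg =
+ *       av1_simple_motion_search_prune_rect_nn_config[2];  // index 2 -> 32x32
+ *   av1_nn_predict(features, cfg, 1, logits);
+ *   // Candidates whose logit falls below a caller-chosen (hypothetical)
+ *   // threshold are pruned from the rectangular partition search.
+ */
+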
+// NN model for predicting the maximum square partition level of a superblock.
+#define NUM_HIDDEN_LAYERS 1
+#define NUM_FEATURES 13
+#define NUM_LAYER_0_UNITS 48
+#define NUM_LOGITS 4
+
+static const float av1_max_part_pred_logits_kernel[] = {
+ -0.304561f, 0.0885596f, -0.988539f, 1.08147f, 0.215213f,
+ 0.202965f, -0.828457f, -0.233945f, -0.0866977f, -0.115521f,
+ 0.02079f, 0.196491f, -0.0285075f, 0.05067f, -0.00872862f,
+ 0.00281844f, -0.238954f, 0.0253801f, 0.0257775f, 0.339269f,
+ 0.176174f, -0.152545f, -0.0588704f, -1.62275f, -0.189329f,
+ 0.0808033f, 0.233844f, -4.53798f, 0.674968f, -0.0361688f,
+ -0.0754075f, 1.16129f, -0.0188879f, 0.113255f, -3.04378f,
+ 0.814728f, -0.568517f, -0.00179383f, -3.61223f, -1.67535f,
+ -2.20417f, -0.197196f, 0.0507745f, -0.0909394f, -0.0507879f,
+ -1.27999f, -0.055623f, 0.0318497f, 0.192867f, 0.138726f,
+ 0.0443392f, -0.595075f, -0.166774f, 0.0882958f, -0.348161f,
+ 0.0214428f, -0.0599275f, -0.0995385f, -0.82358f, 0.141205f,
+ -0.053232f, 0.00508296f, -1.90872f, 1.15004f, -0.194219f,
+ 0.0229019f, -0.00354318f, 0.22016f, 0.154101f, -0.159231f,
+ -0.0446647f, -0.197503f, 0.0408453f, 0.197659f, 0.797858f,
+ -0.189722f, 0.343653f, 0.124666f, -1.03083f, 0.603059f,
+ 0.101565f, 0.0932993f, 0.462484f, 0.295984f, 1.11198f,
+ 0.143709f, -0.846232f, -0.464392f, -1.06058f, -0.124889f,
+ 0.0727475f, 1.18446f, -0.100302f, 0.0641918f, -0.101622f,
+ 0.10219f, 0.130189f, 0.0915623f, -0.166904f, -1.10606f,
+ -0.16726f, -0.146152f, 0.145443f, -0.177091f, -0.0215214f,
+ 0.0158506f, -0.553294f, 0.0784749f, -0.0416628f, -0.027785f,
+ 0.280027f, 0.484898f, -0.164225f, 0.0238317f, -0.0345254f,
+ 0.0410244f, 0.131529f, 0.0239622f, -0.0749436f, -0.0224914f,
+ 0.128926f, 0.224539f, 0.413297f, 0.0638572f, 0.103308f,
+ 0.0913242f, -0.119274f, 0.0163103f, 0.113828f, 0.119809f,
+ 0.297057f, -0.124889f, -0.533108f, -0.181408f, -0.129896f,
+ 0.0221064f, -0.0773281f, -0.0386467f, 0.0342961f, 0.126575f,
+ -0.24114f, 0.0735576f, 0.0524791f, 0.246896f, -0.130674f,
+ -0.03979f, 0.173639f, 1.95193f, -0.113029f, -0.0305852f,
+ -0.00671737f, 0.157159f, -0.00102858f, -0.543688f, 0.566772f,
+ 0.124124f, -0.0294064f, -0.0699021f, -0.0704103f, -0.766097f,
+ -0.0625802f, -0.0906173f, -0.0520414f, -0.0272724f, 0.283064f,
+ 0.236213f, -0.127319f, 0.019392f, 0.170042f, -0.0214542f,
+ 0.0740938f, 0.356578f, -0.236257f, 0.269021f, 0.114759f,
+ -0.641166f, 0.136308f, -0.0386959f, -0.112024f, -0.361209f,
+ 0.686095f, 0.183906f, 0.288656f, 0.182007f, 0.337458f,
+ 0.058974f, -0.305512f, -0.841708f, -0.243779f, -0.0614058f,
+ 0.208747f, 0.448697f
+};
+
+static const float av1_max_part_pred_layer_0_bias[] = {
+ -0.776544f, -2.0022f, -0.330294f, 2.47665f, 1.90206f, -1.61571f,
+ 0.536246f, 1.00455f, 5.24561f, 1.55111f, -0.816399f, -4.88703f,
+ -1.06417f, -1.15359f, -0.145289f, 1.91831f, 0.630915f, -1.94256f,
+ -3.35239f, -1.05007f, -1.05186f, 1.36824f, -5.2878f, 1.10482f,
+ -5.00077f, -0.0445198f, 3.41427f, 2.3439f, -0.413306f, -1.88152f,
+ -2.28638f, 8.24783f, -1.91961f, -1.49324f, 1.96599f, -6.32309f,
+ -0.332426f, -0.425506f, 4.06511f, 5.84386f, 4.15747f, 1.22402f,
+ 2.8512f, 2.53027f, 0.0170272f, -1.43966f, -0.997785f, 5.43064f
+};
+
+static const float av1_max_part_pred_logits_bias[] = { -4.25432f, 0.144758f,
+ 1.96217f, 0.728905f };
+
+static const float av1_max_part_pred_layer_0_kernel[] = {
+ 0.992471f, 0.533006f, 0.143743f, -2.51788f, -0.468337f,
+ -0.201376f, -0.151834f, 0.479883f, 1.16061f, -0.278878f,
+ -0.814954f, -0.152405f, -0.0521608f, 0.797104f, -2.08912f,
+ 0.385839f, -2.22889f, -0.106858f, -0.239766f, -0.951128f,
+ -0.698753f, 0.0831051f, 1.1702f, 0.342834f, -0.0352795f,
+ -0.0847639f, -0.802086f, 0.258982f, 1.14174f, 0.645885f,
+ -1.19226f, -0.592888f, -0.343659f, 1.1912f, 1.45411f,
+ -1.22927f, 0.152858f, 0.00373585f, -1.60637f, 0.592611f,
+ 0.0857475f, -0.346147f, -0.150784f, -0.0817408f, -0.189918f,
+ -0.804952f, -1.33036f, -1.03307f, 0.0248769f, 0.16607f,
+ -2.896f, -2.1293f, 0.12293f, -0.173179f, -0.212128f,
+ -6.76221f, 0.033188f, 0.0231787f, 0.905957f, 0.0551327f,
+ -0.356276f, 0.0181795f, 0.0977523f, -0.0352873f, -0.0396386f,
+ 2.3241f, 0.0632874f, -0.11804f, -6.32521f, 0.0224659f,
+ -0.00188896f, 0.267992f, 0.272337f, 0.00936963f, 0.659969f,
+ -2.25707f, -0.0278229f, -0.0185089f, -1.14466f, 0.104827f,
+ 0.0435885f, 0.558586f, -0.00697004f, 0.0312611f, 0.540574f,
+ -0.568625f, 0.218608f, 0.378911f, -0.0289192f, -0.0734742f,
+ -1.08782f, -2.42069f, -0.0127239f, 0.0493651f, -1.15837f,
+ 0.261831f, 0.401824f, -1.04545f, 0.284173f, 0.784972f,
+ -0.511243f, -0.982599f, -0.106134f, -0.325964f, -1.44107f,
+ -1.42434f, -1.02402f, -1.52034f, 0.0737116f, 0.0462242f,
+ 0.628722f, -1.0405f, -0.113718f, 2.20573f, -4.33951f,
+ -0.0192695f, -0.0229314f, -1.89156f, 0.645942f, 0.375708f,
+ -1.97447f, -0.267014f, 0.0989443f, -0.450534f, -1.01737f,
+ -0.642416f, -0.0897288f, -2.08724f, -0.190965f, -0.279135f,
+ -0.830178f, 0.808754f, -0.139091f, 1.11004f, -0.454439f,
+ -0.479238f, -1.44001f, 0.0888059f, 0.885689f, -0.642505f,
+ -0.00773651f, -0.0265721f, -0.906346f, 1.68504f, 0.084257f,
+ -0.951101f, -8.06495f, 0.19231f, 0.16389f, -0.193678f,
+ 0.729837f, -1.98392f, -5.98513f, 3.32638f, -0.0658378f,
+ -0.0910426f, -0.666567f, -0.315339f, 0.123124f, -2.66375f,
+ -0.714852f, -0.136176f, -0.460166f, -0.567551f, -1.06193f,
+ -1.21389f, -0.83865f, 0.00280695f, -0.199519f, -0.534704f,
+ 0.419311f, -0.149008f, -3.68707f, 0.00285113f, -0.0718198f,
+ -1.41026f, -1.34155f, -0.538687f, -0.623666f, -2.56462f,
+ -0.0183333f, -0.323532f, -1.27141f, -0.0212039f, 0.198633f,
+ 0.459554f, -4.65103f, -1.01293f, -1.39512f, -0.289026f,
+ 0.208724f, -0.665226f, 1.13369f, -1.96734f, -1.45442f,
+ -3.46172f, 0.810681f, -0.603973f, 0.842764f, -3.90371f,
+ -0.394561f, -3.61363f, -2.88085f, 0.031645f, -0.23125f,
+ -2.63898f, -1.35314f, -0.46726f, 1.33145f, 1.20269f,
+ 1.38682f, -0.331637f, 0.069021f, 0.149523f, -1.24957f,
+ -0.878857f, -0.200368f, 0.465744f, 1.01365f, -0.0122221f,
+ -0.550586f, -1.12581f, -0.422132f, -0.0744868f, -2.4804f,
+ -1.07072f, -0.479006f, 0.101817f, -0.118947f, 0.341576f,
+ -1.0538f, -0.812346f, -1.13727f, -0.00939806f, 10.1571f,
+ -0.0441302f, 0.00280407f, -21.5044f, 0.0181152f, -0.0143246f,
+ 3.23462f, -1.38624f, -1.80416f, 4.89763f, -2.67364f,
+ 2.31771e-05f, 0.000393989f, 0.352204f, -0.193455f, 0.531455f,
+ 0.488757f, -0.442555f, -0.518528f, 0.431482f, -2.67727f,
+ -2.00626f, -0.39729f, -0.221494f, -0.0188888f, -0.0377649f,
+ -1.80169f, 0.0810332f, -0.0408335f, -1.28675f, -0.0353824f,
+ -0.666723f, -1.07281f, 0.252912f, -1.24547f, -1.7831f,
+ -1.14354f, -0.137662f, 0.00230182f, 0.736862f, 0.175872f,
+ -0.187556f, 0.43963f, -0.796524f, 0.056219f, -0.387874f,
+ 0.0710224f, -0.16548f, -0.100993f, 0.931481f, -3.20738f,
+ -0.0197576f, 0.266148f, -0.173909f, -0.337795f, -0.0682381f,
+ 0.176844f, 0.140286f, 1.12033f, 0.429064f, -2.24192f,
+ -1.54682f, 2.23646f, -0.0371138f, -0.0475339f, -3.21766f,
+ 0.0412858f, 0.387811f, 6.6711f, 0.140649f, 0.0559547f,
+ -0.802839f, 0.599977f, 0.64552f, -2.08103f, -0.503401f,
+ -0.0407036f, -0.0299199f, 0.0849445f, -0.111657f, -1.63462f,
+ 3.33762f, 0.0441394f, 0.0466889f, -0.951806f, 0.0723954f,
+ 0.00348661f, -1.36903f, 2.24625f, -0.0348915f, -0.0508893f,
+ -0.240891f, -0.120143f, -0.17991f, -2.09137f, 0.0150871f,
+ 0.0480333f, 1.72012f, 0.0309551f, -0.0370507f, -0.377075f,
+ 0.103916f, -0.0169255f, -0.0145395f, -4.02144f, 0.83193f,
+ -0.316502f, 6.3832f, -1.70038f, -1.97215f, -1.94501f,
+ 1.45479f, 0.711725f, -0.348496f, -0.279056f, -1.13396f,
+ -1.51744f, -0.853307f, 1.53131f, -0.0032358f, 1.41808f,
+ -1.32989f, -0.245221f, -0.161614f, -0.500845f, -0.449252f,
+ 0.0724151f, -0.116333f, -0.0946182f, -2.0945f, 0.0564572f,
+ 0.393261f, -1.06861f, -0.111458f, -0.839943f, -0.0880348f,
+ 0.0365742f, 0.415339f, -1.57494f, -0.713697f, 1.02349f,
+ -0.221371f, -0.0446281f, 1.89223f, -0.0811754f, -0.402773f,
+ -0.930987f, 0.0243194f, 0.0678332f, -0.0233014f, 0.165372f,
+ -0.44083f, -1.2404f, 0.35675f, -0.040916f, -0.0512548f,
+ -2.9071f, 0.861174f, -0.778133f, 2.14436f, -0.688427f,
+ -0.480371f, -1.69032f, 0.706687f, -0.281982f, -2.30451f,
+ 1.61541f, -0.0213638f, -0.740509f, -0.266677f, 0.0268434f,
+ -0.0116908f, -3.17595f, 0.0114825f, 0.0196997f, -0.144005f,
+ 0.0550181f, -0.851459f, -0.000285073f, -0.538441f, -0.0254868f,
+ -0.0104454f, -0.0661998f, -0.196469f, -0.346372f, -5.52892f,
+ -0.643683f, -0.622224f, -0.31463f, -0.555956f, -0.520132f,
+ -0.843166f, -2.59479f, -0.750195f, 0.00635995f, -0.338615f,
+ -0.216676f, -0.391544f, -1.62185f, -0.718471f, -0.475406f,
+ -0.782041f, -0.608824f, -1.09633f, -1.27308f, -0.560719f,
+ -0.207539f, -0.0196445f, -1.05519f, -0.575249f, -1.0642f,
+ 1.01615f, -0.873633f, -0.417953f, -0.428051f, 0.350259f,
+ -2.53833f, -2.72203f, 0.672846f, -0.503094f, -1.1374f,
+ 0.214291f, 0.013305f, 0.0112064f, 1.10532f, 0.030455f,
+ 0.0239614f, 0.628072f, 0.0539135f, -0.472441f, -0.688439f,
+ -0.32044f, -0.0234867f, -0.0158436f, -0.949314f, -0.0453161f,
+ -1.18306f, 0.626845f, -0.426925f, -0.688371f, 0.415062f,
+ 0.0640985f, -0.638387f, -2.01399f, -0.209744f, -0.762892f,
+ -0.0753296f, -0.879315f, -0.520433f, -0.111375f, 0.389742f,
+ -0.398862f, -0.643227f, -0.246396f, 0.0317051f, 1.06973f,
+ 0.413617f, 0.180506f, -0.0507897f, -0.00650435f, 0.620892f,
+ 0.046312f, 0.475032f, 0.906993f, -0.0388061f, -0.256271f,
+ -1.03323f, 0.0125266f, -0.31116f, -0.377611f, -0.0386407f,
+ -0.0232745f, -0.353644f, -2.27289f, 0.0571779f, -0.00865006f,
+ 1.65101f, 0.0175711f, 0.0184585f, 0.558458f, 0.2213f,
+ -0.285089f, 0.433445f, -0.427177f, -0.0103682f, -0.0101273f,
+ 0.214085f, -0.0459885f, 0.00761981f, 0.836381f, 0.0175293f,
+ 0.02508f, -1.51778f, 0.0143956f, -0.162589f, 0.595418f,
+ 0.21445f, -0.0335848f, -0.0136684f, -0.16686f, -0.14612f,
+ 0.0816238f, 0.499636f, 0.12458f, -2.41673f, -0.261721f,
+ -0.676805f, -1.88366f, 0.730462f, 0.69196f, -0.0288489f,
+ -2.38272f, 0.329876f, 0.014517f, -0.115145f, -3.48151f,
+ -0.00209072f, -0.0732377f, 0.820443f, -0.0118701f, 0.112145f,
+ 0.272315f, 0.137531f, -0.0200997f, -0.0397883f, -2.19458f,
+ 0.183554f, -0.639716f, 0.481605f, -0.621639f, -0.0980299f,
+ -0.710534f, -0.143105f, -6.77626f, -1.65139f, -2.37718f,
+ -0.533127f, -1.12574f, 3.34182f, -0.0758663f, 0.0334238f,
+ -9.48647f, 0.0674974f, 0.0507665f, 0.523007f, -0.0668f,
+ 0.5736f, -0.589761f, -1.1692f, -0.0236497f, -0.00828928f,
+ -0.265823f, 1.15284f, 0.307927f, -0.695308f, 0.13725f,
+ -0.20394f, -0.363965f, -0.331159f, -1.50927f, -1.20051f,
+ -0.0205825f, -0.0381859f, -0.0579876f, -1.6913f, -1.94626f,
+ 3.4214f, 3.3922f, -2.13798f, -0.679848f, -0.890735f,
+ 0.235017f, -0.253202f, -1.0571f, 1.40354f, 0.00719052f,
+ -1.54365f, -0.7289f, -1.05492f, 0.0238169f, -0.00543592f,
+ -0.0510353f, -0.175386f, -0.724207f, -0.788936f, 0.039976f,
+ 1.36966f, 0.869475f, -0.0302774f, -0.0537556f
+};
+
+static const NN_CONFIG av1_max_part_pred_nn_config = {
+ NUM_FEATURES,
+ NUM_LOGITS,
+ NUM_HIDDEN_LAYERS,
+ {
+ NUM_LAYER_0_UNITS,
+ },
+ {
+ av1_max_part_pred_layer_0_kernel,
+ av1_max_part_pred_logits_kernel,
+ },
+ {
+ av1_max_part_pred_layer_0_bias,
+ av1_max_part_pred_logits_bias,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS
+#undef NUM_FEATURES
+#undef NUM_LAYER_0_UNITS
+#undef NUM_LOGITS
+
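+/* Editor's sketch (hedged, not upstream code): the config above maps 13
+ * superblock features to 4 logits, one per candidate maximum square partition
+ * level. A minimal example assuming av1_nn_predict() and av1_nn_softmax()
+ * from "av1/encoder/ml.h"; the feature semantics and the argmax policy are
+ * illustrative assumptions:
+ *
+ *   float features[13];  // superblock statistics, filled by the caller
+ *   float logits[4], probs[4];
+ *   av1_nn_predict(features, &av1_max_part_pred_nn_config, 1, logits);
+ *   av1_nn_softmax(logits, probs, 4);
+ *   int best = 0;  // index of the most probable max partition level
+ *   for (int i = 1; i < 4; ++i)
+ *     if (probs[i] > probs[best]) best = i;
+ */
+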
+// Early termination in the second pass.
+static const float av1_simple_motion_search_term_none_mean_128[28] = {
+ 12.661922f, 12.638062f, 10.896497f, 10.865719f, 10.978963f, 10.940105f,
+ 11.012235f, 10.972760f, 11.069924f, 11.018533f, 11.773865f, 11.747426f,
+ 11.891315f, 11.858107f, 11.793916f, 11.766356f, 11.874997f, 11.840164f,
+ 5.940535f, 0.770746f, 4.292692f, 4.309581f, 0.848423f, 4.292334f,
+ 4.298179f, 8.514713f, 14.911736f, 19.825352f,
+};
+
+static const float av1_simple_motion_search_term_none_std_128[28] = {
+ 1.796731f, 1.797056f, 1.898383f, 1.900753f, 1.846624f, 1.846953f, 1.906632f,
+ 1.908089f, 1.836533f, 1.835967f, 1.840262f, 1.840671f, 1.816836f, 1.817103f,
+ 1.879846f, 1.881333f, 1.803102f, 1.802654f, 2.263402f, 0.420354f, 1.117165f,
+ 1.083779f, 0.358611f, 1.101183f, 1.084938f, 2.462638f, 1.577009f, 1.574711f,
+};
+
+static const float av1_simple_motion_search_term_none_mean_64[28] = {
+ 10.904455f, 10.853546f, 9.247903f, 9.184479f, 9.251985f, 9.186686f,
+ 9.253490f, 9.190190f, 9.270079f, 9.204357f, 10.086511f, 10.031060f,
+ 10.100875f, 10.045429f, 10.069688f, 10.013173f, 10.082980f, 10.024640f,
+ 4.888378f, 0.878113f, 3.598450f, 3.628491f, 0.925833f, 3.560971f,
+ 3.573322f, 8.807137f, 13.348477f, 18.269117f,
+};
+
+static const float av1_simple_motion_search_term_none_std_64[28] = {
+ 1.789300f, 1.787061f, 1.823519f, 1.820226f, 1.794643f, 1.788620f, 1.797194f,
+ 1.795135f, 1.777795f, 1.773634f, 1.794000f, 1.790377f, 1.772197f, 1.769692f,
+ 1.819050f, 1.817139f, 1.793577f, 1.789333f, 1.998251f, 0.327156f, 0.885748f,
+ 0.853767f, 0.262043f, 0.902435f, 0.860033f, 1.224865f, 1.603411f, 1.589296f,
+};
+
+static const float av1_simple_motion_search_term_none_mean_32[28] = {
+ 9.818970f, 9.751199f, 8.015079f, 7.927318f, 8.029113f, 7.938330f, 8.012570f,
+ 7.923719f, 8.033508f, 7.941911f, 8.933057f, 8.857422f, 8.935639f, 8.859187f,
+ 8.905495f, 8.829741f, 8.929428f, 8.851351f, 4.114069f, 0.954752f, 2.645082f,
+ 2.709703f, 0.964678f, 2.652077f, 2.673393f, 9.430499f, 11.922798f, 16.942251f,
+};
+
+static const float av1_simple_motion_search_term_none_std_32[28] = {
+ 1.737107f, 1.734327f, 1.727923f, 1.720244f, 1.721570f, 1.712775f, 1.718028f,
+ 1.710370f, 1.711612f, 1.702596f, 1.754856f, 1.748855f, 1.741871f, 1.736304f,
+ 1.722428f, 1.717380f, 1.713563f, 1.707582f, 1.761170f, 0.207847f, 0.900058f,
+ 0.862356f, 0.184593f, 0.903822f, 0.856120f, 1.529199f, 1.412085f, 1.453153f,
+};
+
+static const float av1_simple_motion_search_term_none_mean_16[28] = {
+ 8.998877f, 8.912468f, 7.085255f, 6.953476f, 7.086386f, 6.954091f, 7.088727f,
+ 6.955747f, 7.093955f, 6.960635f, 8.065050f, 7.961432f, 8.071631f, 7.967233f,
+ 8.041699f, 7.937715f, 8.046791f, 7.942183f, 3.833521f, 0.978421f, 1.901347f,
+ 1.950124f, 0.979418f, 1.928000f, 1.936727f, 9.773951f, 10.735227f, 15.949769f,
+};
+
+static const float av1_simple_motion_search_term_none_std_16[28] = {
+ 1.641193f, 1.640172f, 1.614794f, 1.608906f, 1.609571f, 1.603580f, 1.606928f,
+ 1.601246f, 1.599230f, 1.593529f, 1.633747f, 1.630219f, 1.625695f, 1.622547f,
+ 1.633827f, 1.630182f, 1.626607f, 1.622777f, 1.548838f, 0.145303f, 0.744550f,
+ 0.736552f, 0.141980f, 0.742979f, 0.736977f, 1.366255f, 1.258794f, 1.294309f,
+};
+
+static const float av1_simple_motion_search_term_none_model_128[] = {
+ -0.6106842357f, -1.0402954455f, 0.6054417656f, -0.2116623578f,
+ 0.2447714930f, 0.3782256209f, 0.5095592479f, -0.3275620904f,
+ 0.3886188013f, 0.2629499420f, -0.1979599415f, -0.5389565605f,
+ 0.1209207902f, -0.4913347466f, 0.3798542731f, -0.2812861709f,
+ -0.1049824167f, -0.1088672020f, 0.4059596517f, -0.1347896613f,
+ 0.2276868621f, 0.0506386970f, 0.0071088411f, 0.0467952100f,
+ 0.2091247458f, -0.7371964736f, 0.1368935545f, 0.3175247786f,
+ -0.5493146094f,
+};
+
+static const float av1_simple_motion_search_term_none_model_64[] = {
+ -0.4150046575f, -0.3954358561f, 0.1997997444f, 0.3395826831f,
+ 0.2827215753f, 0.3395683652f, 0.2483140395f, 0.2722216476f,
+ 0.2610308009f, 0.3724974359f, -0.0551479654f, -0.1721616359f,
+ -0.3459358629f, -0.0952524186f, -0.1428993840f, -0.0415654914f,
+ -0.3169539902f, -0.0269429900f, 0.9891530919f, -0.0125084982f,
+ 0.0972182377f, 0.0008889801f, 0.0205418050f, 0.0057237854f,
+ 0.1005222691f, -0.2851321920f, -1.5150336445f, 0.1893942436f,
+ -0.4337360901f,
+};
+
+static const float av1_simple_motion_search_term_none_model_32[] = {
+ -0.4667392852f, -0.3893302767f, 0.1603498635f, 0.2304974726f,
+ 0.1404975592f, 0.2505516225f, 0.1423053884f, 0.2189318406f,
+ 0.1379765409f, 0.2638241296f, -0.1342865463f, -0.0549054345f,
+ -0.1925223436f, -0.1142702769f, 0.0127811659f, 0.0868639997f,
+ -0.0643197251f, 0.0279496470f, 0.9904395769f, -0.0095178685f,
+ 0.1179410649f, -0.0013411972f, 0.0095060660f, 0.0195730400f,
+ 0.0779717771f, -0.2498860763f, -0.8168817125f, -0.4798397348f,
+ -0.6609679881f,
+};
+
+static const float av1_simple_motion_search_term_none_model_16[] = {
+ -0.3021081992f, -0.4620153673f, 0.0448577479f, 0.1738455035f,
+ 0.0663209177f, 0.1629614573f, 0.0555168744f, 0.1631870212f,
+ 0.0425805150f, 0.1688564954f, 0.0434083772f, -0.0046603915f,
+ -0.0271580056f, -0.0183879127f, 0.1073730471f, 0.0314201476f,
+ 0.0576891756f, 0.0119723753f, 0.9084332022f, -0.0188429077f,
+ 0.0755089811f, -0.0172550234f, 0.0037663075f, 0.0022094472f,
+ 0.0500247894f, -0.2944572004f, -0.8908521199f, -0.2555515792f,
+ -0.5396254205f,
+};
+
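+/* Editor's sketch (hedged): unlike the NN configs above, the *_model_* arrays
+ * are plain linear models with 29 entries: 28 weights for the 28 features,
+ * plus what is assumed here to be a trailing bias term. The matching mean/std
+ * tables suggest the features are z-score normalized first. Illustrative use
+ * for the 64x64 case, with the >= 0 decision rule an assumption:
+ *
+ *   const float *mean = av1_simple_motion_search_term_none_mean_64;
+ *   const float *std = av1_simple_motion_search_term_none_std_64;
+ *   const float *model = av1_simple_motion_search_term_none_model_64;
+ *   float score = model[28];  // assumed bias term
+ *   for (int i = 0; i < 28; ++i)
+ *     score += model[i] * (features[i] - mean[i]) / std[i];
+ *   const int early_terminate = (score >= 0.0f);  // assumed decision rule
+ */
+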
+#define FEATURES 31
+#define HIDDEN_NODES 32
+static const float av1_early_term_after_split_nn_weights_64_layer0[] = {
+ -0.306296f, -0.691664f, 0.335148f, -0.298465f, -0.509241f, -0.632796f,
+ -0.527979f, -0.009904f, -0.503646f, -0.494002f, -0.575101f, 0.239911f,
+ -0.413312f, -0.622825f, -0.405448f, -0.419103f, -0.505903f, -0.392550f,
+ -0.240293f, 0.121749f, -0.489777f, -0.756647f, 0.001047f, -0.016528f,
+ 0.145714f, 0.172910f, 0.086197f, 0.162882f, -0.070588f, -0.077104f,
+ 0.502730f, -0.244954f, 0.265605f, -0.323994f, 0.223397f, -1.086453f,
+ 0.391886f, 0.200343f, 0.253878f, 0.018925f, 0.201819f, -0.205136f,
+ 0.427314f, 0.041155f, 0.070484f, 0.159925f, -0.057095f, -0.146544f,
+ -0.073792f, 0.152628f, 0.003986f, -0.515965f, -0.209754f, 0.037457f,
+ 0.070622f, -0.143571f, -0.059602f, 0.111734f, 0.319674f, 0.149894f,
+ -0.219883f, 0.206678f, 0.015809f, -0.210549f, 0.130156f, -0.189502f,
+ -0.850392f, -0.156363f, -0.060354f, 0.189044f, 0.266495f, 0.151305f,
+ -0.563677f, -0.354896f, 0.300637f, 0.257568f, -0.008359f, -0.535497f,
+ -0.003127f, 0.293054f, -0.020212f, -0.157278f, 0.229972f, -0.309799f,
+ -0.329927f, -0.077140f, 0.001177f, -0.024415f, 0.134044f, -0.181587f,
+ -0.135380f, 0.230989f, -0.281451f, 0.912282f, 0.511562f, -3.900779f,
+ -0.039917f, 1.956406f, -0.357589f, 0.292998f, -0.950158f, 0.422041f,
+ 0.526572f, 0.605746f, -0.147110f, 0.256576f, 0.090010f, 0.221641f,
+ 0.029763f, 0.351592f, 0.458324f, -0.005888f, 0.010521f, -0.389326f,
+ -0.094006f, -0.171489f, -0.013153f, 0.026333f, -0.454571f, -1.932891f,
+ -0.168211f, 0.051298f, -0.258061f, -0.028936f, -0.555937f, -0.475566f,
+ -0.304046f, -0.318113f, 0.099697f, -0.217145f, 0.139433f, -0.203986f,
+ -0.164012f, 0.051527f, 0.138603f, -0.085100f, -0.082887f, -0.242955f,
+ -0.663410f, -0.535772f, -0.181665f, -0.197883f, 0.071319f, 0.135086f,
+ 0.146200f, 0.184827f, -0.199041f, 0.162570f, -0.300167f, 0.017748f,
+ -0.140111f, 0.103553f, 0.206929f, 0.193446f, 0.123141f, -1.201898f,
+ -0.052254f, -0.750121f, 0.111741f, 0.204092f, -0.166266f, 0.124008f,
+ -0.455496f, 0.306035f, 0.275903f, 0.193599f, -0.730011f, 0.126808f,
+ 0.051059f, 0.103634f, -0.044334f, 0.048889f, 0.405228f, 0.574099f,
+ 0.061167f, 0.260576f, 0.070032f, -0.038040f, 0.229183f, -0.243269f,
+ -0.130116f, -0.538563f, -0.070199f, -0.129249f, -0.205153f, -0.268530f,
+ -0.290828f, -0.233006f, 0.068712f, 0.618085f, -0.407008f, 0.686868f,
+ 0.172247f, 0.826287f, -0.002672f, 0.239825f, -0.051548f, 0.420773f,
+ 0.218747f, 0.041057f, -0.071189f, 0.286987f, -0.113915f, 0.122561f,
+ 0.013979f, -0.049046f, 0.148175f, 0.031313f, -0.248601f, 0.209488f,
+ 0.069008f, 0.072763f, 0.332475f, 0.079986f, -0.151042f, -0.205110f,
+ -0.155550f, -0.510408f, 0.330429f, 0.577729f, 0.266524f, -0.378489f,
+ 0.228204f, 0.055318f, 0.117583f, -0.588557f, -0.778201f, 0.434622f,
+ -0.227820f, 0.611642f, 0.170548f, 0.817761f, 0.006642f, -1.005794f,
+ -0.911490f, 1.633684f, -0.290664f, 0.308128f, 0.295986f, 0.243377f,
+ -0.001275f, -0.131156f, 0.275205f, -0.041865f, -0.201951f, -0.016380f,
+ 0.336604f, -0.258118f, 0.890810f, 0.441065f, -0.968006f, 0.135989f,
+ -1.447191f, 0.353426f, -0.343235f, 0.376837f, -0.071602f, -0.319639f,
+ -0.072347f, 0.547450f, -0.215380f, 0.182141f, -0.066186f, 0.033787f,
+ 0.257482f, 0.217428f, -0.130249f, 0.057525f, 0.263991f, 0.230664f,
+ -0.245113f, 0.048610f, -0.079955f, 0.251737f, -0.070368f, -0.017968f,
+ -0.151815f, 0.025945f, -0.257769f, 0.299735f, 0.077263f, -0.565526f,
+ 0.326263f, 0.096429f, 0.113414f, 0.092754f, -0.141908f, 0.172060f,
+ 0.393117f, -0.216755f, 0.331051f, -0.363369f, -0.113363f, -0.095164f,
+ -0.072784f, 0.214572f, 0.010993f, 0.209456f, 0.260381f, -0.314747f,
+ -0.422173f, -0.189963f, -0.225130f, 0.339448f, 0.153814f, 0.265616f,
+ -0.103575f, -0.123841f, -0.106236f, 0.155894f, -0.156264f, -1.361406f,
+ -0.040736f, -0.614998f, -0.468200f, -0.266505f, -0.342786f, -0.908088f,
+ 0.105758f, 0.040788f, -0.313589f, -1.359318f, 0.071329f, 0.176404f,
+ -0.476141f, 0.010108f, -0.201440f, -0.221167f, -0.197448f, -0.013927f,
+ -0.610270f, -0.607285f, 0.178070f, 0.174320f, 0.313115f, 0.026191f,
+ -0.112330f, 0.122338f, -0.367751f, 0.196794f, 0.153709f, -0.205454f,
+ -0.397471f, -1.879336f, -0.030129f, 0.143429f, -0.079832f, 0.435259f,
+ -1.729539f, 0.518301f, -0.141393f, 0.199399f, -1.914601f, 0.142865f,
+ -0.219899f, 0.508458f, 0.086365f, -0.220740f, -0.012507f, 1.263320f,
+ 0.042136f, 0.050922f, -0.329644f, -0.188198f, 0.251522f, 0.394731f,
+ -0.047866f, -0.260853f, -0.267207f, -0.248489f, 0.146474f, 0.359257f,
+ -0.427732f, -0.100652f, 0.192129f, 0.075572f, 0.916708f, 0.255747f,
+ 0.486384f, 0.127989f, -0.556449f, -0.484913f, 0.392298f, 0.045401f,
+ -0.839551f, -0.703619f, 0.069263f, -0.040720f, 0.542265f, 0.443739f,
+ 0.862552f, -0.021726f, 0.230858f, -0.261004f, -0.125697f, -0.106435f,
+ 0.002341f, 0.013904f, 0.011034f, 0.542296f, -0.284325f, 0.135736f,
+ 0.113882f, 0.040610f, -0.255485f, 0.224061f, -0.087140f, 0.127872f,
+ -0.002638f, 0.164889f, -0.335958f, -0.031166f, -0.393581f, 0.075455f,
+ 0.055995f, 0.087934f, -0.133859f, -0.342187f, 0.002492f, -0.340722f,
+ 0.058304f, 0.104165f, -0.142136f, -0.351111f, -0.158037f, -0.079924f,
+ -0.253209f, -0.092840f, -0.174646f, -0.202772f, -0.353438f, -0.031111f,
+ 0.076088f, -0.232091f, -0.070052f, 0.097595f, 0.063173f, -0.211195f,
+ 0.126478f, -0.178828f, 0.278723f, -0.070807f, -0.179783f, 0.034123f,
+ 0.035721f, -0.200431f, 0.170640f, 0.107933f, 0.226594f, -0.301499f,
+ -0.291096f, 0.228076f, -0.272951f, 0.002490f, -0.210707f, -0.128033f,
+ -0.194009f, -0.011347f, -0.256694f, -0.011841f, -0.005167f, -0.163203f,
+ -0.253796f, -0.198877f, -0.055827f, -0.882685f, -0.443471f, 0.349601f,
+ 0.749334f, -1.161845f, 0.505480f, 0.221733f, 0.210490f, -0.234984f,
+ 0.014183f, -0.510401f, 0.238692f, -0.134111f, 0.083844f, -0.478751f,
+ -0.088434f, 0.304063f, 0.150336f, -0.749682f, -0.081999f, 0.729739f,
+ 0.412508f, 0.132571f, 0.058306f, -0.047451f, -0.117435f, -0.445395f,
+ -0.005182f, -0.025757f, 0.175051f, -0.258194f, -0.150311f, -0.196533f,
+ -1.314316f, -0.428627f, 0.512451f, 0.045138f, -0.200925f, 0.081538f,
+ -0.346151f, -0.358197f, -0.422258f, -0.028542f, -0.383534f, -0.026163f,
+ -0.419858f, -0.154321f, 0.376970f, 0.094017f, 0.783520f, 0.110641f,
+ 0.077966f, -0.093064f, 0.160522f, -0.863041f, 0.086210f, 0.560764f,
+ 0.057032f, 0.159224f, 0.323068f, -0.173109f, 0.014042f, -0.126856f,
+ -0.128237f, -0.245273f, -0.317312f, -0.257597f, -0.181977f, 0.259485f,
+ -0.215834f, 0.062076f, -0.270596f, 0.271581f, -0.153486f, -0.247165f,
+ 0.079737f, -0.157049f, -0.027459f, -0.299397f, 0.136729f, -0.334192f,
+ -0.191722f, 0.145865f, -0.031324f, -0.307165f, -0.244923f, -0.228027f,
+ 0.063807f, 0.054965f, -0.005709f, -0.041977f, -0.276245f, 0.020003f,
+ 0.133323f, -0.145992f, -0.951030f, 0.414083f, -1.063323f, 0.137872f,
+ 0.104732f, -0.123728f, 0.542532f, 0.213654f, 0.542954f, 0.155619f,
+ 0.543072f, 0.399067f, 0.191402f, -0.102552f, -0.176734f, -0.136776f,
+ -0.012814f, -0.021298f, -0.802467f, -0.957481f, -0.238787f, -0.138482f,
+ 0.058331f, 0.126601f, 0.104420f, -0.148684f, 0.343218f, 0.093604f,
+ -0.055642f, -0.383918f, -0.045250f, -0.090480f, -0.155464f, 0.278299f,
+ 0.042791f, -0.029084f, -0.373861f, -0.073233f, -0.085172f, 0.186841f,
+ -0.070898f, -0.156415f, 0.112831f, -0.065931f, -0.353007f, 0.058453f,
+ -0.136982f, 0.233393f, 0.017240f, -0.018428f, 0.229104f, -0.371440f,
+ -0.262212f, 0.203075f, -0.263293f, 0.034413f, -0.299354f, 0.227269f,
+ 0.204977f, -0.118107f, -0.359832f, -0.068252f, 0.480105f, -0.214711f,
+ -0.614381f, 0.209048f, -0.456014f, -0.188819f, -0.220995f, -0.322104f,
+ -0.191457f, 0.420874f, -0.454919f, 0.023119f, 0.291700f, -0.532885f,
+ -0.032642f, 0.043271f, 0.133974f, 0.002399f, -0.179899f, -0.044158f,
+ -0.027078f, -0.350075f, 0.236766f, 0.346771f, -0.118534f, -0.421221f,
+ 0.019544f, 0.109349f, 0.141517f, 0.403561f, 0.409102f, 0.054555f,
+ -0.561751f, 0.577183f, -0.705156f, -0.231188f, -1.969772f, 0.172289f,
+ -0.048122f, 0.205671f, -0.667130f, -0.066870f, 0.202838f, -0.095538f,
+ -0.842651f, 0.254170f, 0.046256f, -0.271891f, -0.369254f, 0.492101f,
+ 0.001189f, -0.186525f, 0.188470f, -0.207072f, 0.030086f, -0.132904f,
+ 0.127001f, 0.116662f, -0.079246f, 0.227241f, -0.462178f, 0.446304f,
+ -1.660753f, 0.241832f, -0.288040f, 0.054663f, -0.435804f, 0.296782f,
+ -0.026421f, -0.115618f, 0.163416f, 0.834001f, 0.008019f, -0.014243f,
+ 0.524658f, 0.067894f, -0.253936f, -0.100657f, 1.285389f, -0.005952f,
+ 0.087134f, -0.088375f, -0.121866f, -0.171172f, 0.279463f, -0.598593f,
+ -0.727761f, 0.189831f, -0.822575f, -0.291141f, -0.012410f, -0.069999f,
+ 0.098842f, -0.218513f, 0.009494f, 0.100106f, -0.402884f, -0.299236f,
+ -0.345668f, -0.057739f, -0.213248f, -0.426661f, -0.360268f, -0.349860f,
+ -0.382177f, -0.357802f, -0.032030f, -0.110597f, -0.155442f, -0.418794f,
+ -0.012113f, -0.032962f, -0.450648f, 0.129060f, -0.135227f, -0.298593f,
+ 0.001435f, 0.278790f, -0.272945f, 0.162759f, -0.290208f, 0.058481f,
+ -0.490971f, 0.019630f, -0.210347f, 0.000520f, -0.340413f, 0.641562f,
+ 0.023104f, 0.194832f, -0.441894f, -0.253538f, -0.228332f, 0.423264f,
+ -1.094073f, -0.475657f, -0.238752f, 0.033910f, 0.440425f, 0.036320f,
+ 0.566989f, -0.065326f, -0.297939f, 0.406098f, 0.529561f, -0.113084f,
+ 0.141472f, -0.024462f, -0.179212f, 0.187801f, -0.235787f, -0.229624f,
+ 0.357791f, 0.061110f, -0.607788f, -1.713694f, -0.651041f, 1.734283f,
+ -0.334701f, 0.161687f, 0.010215f, 0.320708f, 0.169447f, 0.513558f,
+ 0.488340f, -0.619036f, -0.525441f, -1.144352f, -0.546154f, 0.669973f,
+ 0.327028f, -0.100539f, 0.012048f, -0.223013f, -0.239680f, 0.323035f,
+ 0.165950f, -0.155110f, 0.128664f, -0.157378f, -0.124490f, 0.291553f,
+ 0.055849f, -0.221664f, 0.077770f, -0.350658f, -0.181939f, 0.110230f,
+ -0.078219f, 0.007472f, -0.031620f, 0.007708f, -0.201794f, 0.017594f,
+ -0.027480f, 0.058884f, -0.369166f, -0.369770f, 0.181635f, -0.183318f,
+ -0.389184f, -0.256661f, 0.160107f, 0.037127f, -0.082573f, -0.095815f,
+ -0.322782f, 0.072528f, -0.348875f, 0.216247f, -0.161757f, -0.385502f,
+ -0.315738f, 0.020123f, -0.155609f, 0.114403f, -0.383232f, 0.629529f,
+ 0.066142f, 0.448392f, -0.389557f, -0.083315f, 0.829535f, -0.015531f,
+ -0.050728f, -0.325127f, 0.812992f, -0.196780f, 0.021060f, -0.952647f,
+ 0.006687f, -0.512715f, -0.066778f, 0.410067f, -0.116945f, -0.288283f,
+ 0.189334f, -0.083153f, 0.159980f, -0.068208f, 0.107358f, -0.154411f,
+ -0.068914f, 0.186816f, 0.032251f, 0.109242f, 0.134825f, 0.035101f,
+ -0.253175f, 0.157309f, -0.363597f, -0.138176f, -0.334141f, -0.172697f,
+ 0.045800f, -0.286057f, 0.173403f, -0.172444f, -0.117996f, -0.383848f,
+ -0.173303f, -0.258482f, -0.021404f, -0.017898f, -0.001970f, 0.003273f,
+ 0.056121f, 0.155046f, 0.044708f, -0.295609f, -0.211688f, -0.233229f,
+ -0.264980f, 0.145549f, 0.045323f, -0.027112f, 0.175638f, -0.207251f,
+ -0.055274f, 0.092706f, 0.086200f, -0.241340f, -0.147416f, 0.024510f,
+ -0.357194f, -0.181944f, -0.050104f, -0.079024f, -0.290473f, -0.169790f,
+ -0.277982f, -0.017781f, -0.004854f, -0.094132f, -0.348555f, 0.199291f,
+ -0.343989f, -0.319299f, -0.268935f, -0.021208f, 0.020938f, -0.090609f,
+ 0.006595f, -0.200790f, 0.171856f, -0.027766f, -0.032017f, -0.006745f,
+ 0.566426f, -0.096850f, 0.727633f, -0.408065f, -0.012436f, 0.005646f,
+ -0.305148f, -0.095075f, -0.391549f, -0.020378f, -0.236498f, -0.252773f,
+ -0.231385f, -0.203175f, 0.041903f, -0.373694f, 0.058239f, -0.101116f,
+ 0.183772f, 0.164523f, -0.099046f, -0.201272f, -0.394523f, -0.157517f,
+ 0.032079f, -0.381173f, -0.238496f, -0.037990f, -0.294553f, 0.141473f,
+ 0.100268f, -0.023806f, 0.004978f, 0.184916f, 0.142699f, -0.113240f,
+ -0.213364f, -0.160059f, -0.216263f, -0.406387f, -0.301140f, -0.406355f,
+ -0.113085f, -0.279699f, -0.267434f, 0.126263f, -0.260527f, -0.153904f,
+ -0.494653f, -0.355144f, 0.030549f, -0.216400f, -0.123363f, 0.189090f,
+ 0.219122f, 0.096677f, -0.202037f, -0.014489f, -0.137859f, -0.114184f,
+ -0.279423f, -0.270683f,
+};
+
+static const float av1_early_term_after_split_nn_bias_64_layer0[] = {
+ -0.491455f, 0.464538f, -0.005742f, -0.219951f, -0.073682f, 0.102027f,
+ 0.567071f, 0.441402f, 0.277521f, 0.314498f, -0.448199f, -0.065032f,
+ 0.488139f, -0.079632f, 0.000000f, 0.521555f, -0.151950f, -0.034616f,
+ 0.393438f, -0.072242f, -0.087343f, -0.571308f, 0.017372f, -0.126144f,
+ 0.372261f, -0.451537f, -0.140238f, -0.092377f, -0.074475f, -0.068879f,
+ -0.109614f, -0.164492f,
+};
+
+static const float av1_early_term_after_split_nn_weights_64_layer1[] = {
+ -0.373195f, -0.283141f, 0.416113f, 0.483659f, 0.230583f, 0.349197f,
+ -0.168582f, -0.813338f, -0.472369f, -0.173872f, 1.297845f, 0.339355f,
+ -0.828033f, 0.019617f, 0.118757f, -0.619360f, 0.282295f, -0.054116f,
+ -0.730596f, 0.068567f, -0.248707f, 0.461225f, 0.330224f, -0.287080f,
+ -0.458103f, 0.591852f, -0.008491f, 0.632119f, -0.007872f, 0.007869f,
+ -0.230698f, -0.011437f,
+};
+
+static const float av1_early_term_after_split_nn_bias_64_layer1[] = {
+ -0.55403697f,
+};
+
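+// NN_CONFIG field order, as used by the initializers below: number of input
+// features, number of outputs, number of hidden layers, per-layer hidden
+// node counts, per-layer weight arrays, and per-layer bias arrays.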
+static const NN_CONFIG av1_early_term_after_split_nnconfig_64 = {
+ FEATURES,
+ 1,
+ 1,
+ {
+ HIDDEN_NODES,
+ },
+ {
+ av1_early_term_after_split_nn_weights_64_layer0,
+ av1_early_term_after_split_nn_weights_64_layer1,
+ },
+ {
+ av1_early_term_after_split_nn_bias_64_layer0,
+ av1_early_term_after_split_nn_bias_64_layer1,
+ },
+};
+
+static const float av1_early_term_after_split_nn_weights_32_layer0[] = {
+ 0.026050f, -0.226531f, 0.308107f, -0.083744f, 0.201785f, 0.098562f,
+ 0.147595f, -0.495771f, -0.245741f, 0.201616f, -0.272070f, -0.579545f,
+ -0.127261f, -0.229588f, 0.250831f, -0.176929f, -0.031689f, 0.284718f,
+ 0.085845f, -0.285027f, 0.012304f, 0.382402f, -0.204591f, 0.272514f,
+ -0.065854f, -0.054228f, -0.231174f, -0.174504f, 0.258287f, 0.195689f,
+ 0.242530f, 0.023528f, -0.294242f, -0.272132f, 0.460180f, -0.731281f,
+ -0.208103f, 0.208204f, 0.348250f, 0.016328f, 0.043707f, -0.169551f,
+ 0.108521f, 0.226895f, -0.020471f, 0.102443f, 0.429640f, -0.252555f,
+ -0.218434f, -0.163665f, 0.175531f, 0.101588f, -0.135798f, -0.158102f,
+ 0.142565f, 0.128277f, 0.174985f, -0.100073f, 0.113967f, 0.223682f,
+ -0.145576f, -0.008443f, 0.112748f, -0.037845f, 0.076954f, -0.287137f,
+ -0.518185f, -0.106833f, 0.175359f, 0.031408f, 0.219069f, -0.294440f,
+ 0.007766f, 0.067754f, -0.049168f, -0.212368f, -0.261708f, 0.309252f,
+ 0.220859f, -0.274852f, -0.653157f, 0.083438f, -0.265386f, 0.174429f,
+ -0.116931f, -0.091594f, -0.244897f, -0.089015f, 0.274453f, 0.212890f,
+ 0.272053f, -0.425315f, -0.107726f, 0.294444f, -0.354629f, 0.104402f,
+ -0.307663f, 0.558430f, 0.140334f, -0.054831f, -0.449456f, 0.058274f,
+ -0.033768f, -0.354117f, -0.331618f, -0.411772f, 0.232064f, -0.079297f,
+ -0.638571f, 0.181823f, -0.039611f, 0.206310f, -0.659157f, -0.102930f,
+ -0.067303f, -0.176881f, -0.001038f, 0.091835f, 0.079739f, -0.121923f,
+ 0.211070f, 0.362719f, -0.154915f, -0.151876f, -0.165460f, 0.023469f,
+ -0.251036f, 0.210014f, -0.537125f, 0.156832f, -0.216987f, 0.062975f,
+ -0.198462f, 0.329123f, 0.125870f, 0.225830f, 0.086377f, -0.128773f,
+ -0.179673f, -0.074612f, 0.456645f, 0.021905f, -0.243140f, 0.059145f,
+ -0.273942f, -0.277822f, 0.154556f, -0.025459f, 0.227614f, -0.313076f,
+ 0.044705f, -0.019017f, 0.108999f, -0.020243f, -0.016373f, 0.560270f,
+ -0.064818f, 0.050880f, -0.218458f, 0.825699f, -0.534056f, -0.258253f,
+ 0.222073f, 0.013295f, 0.477870f, -0.386727f, 0.388509f, 0.004128f,
+ 0.451388f, -0.175788f, 0.264093f, -0.109812f, 0.358132f, 0.500992f,
+ -0.446933f, -0.222397f, 0.345834f, 0.370943f, -0.233115f, -0.047005f,
+ -0.111335f, -0.111586f, 0.026975f, -0.052191f, -0.111800f, -0.129782f,
+ 0.225132f, 0.102524f, 0.544557f, -0.111674f, -0.857884f, 0.133258f,
+ 0.310001f, 0.043829f, 0.104143f, 0.256493f, 0.242520f, -0.342082f,
+ 0.421447f, 0.124227f, 0.061542f, -0.090206f, 0.316681f, 0.353452f,
+ -0.918408f, -0.001903f, -0.052303f, -0.004816f, -0.446393f, -0.053038f,
+ 0.255725f, -0.126346f, 0.034095f, -0.240276f, -0.135918f, 0.095682f,
+ -0.147457f, -0.338216f, -0.200426f, 0.010265f, -0.243915f, -0.231375f,
+ -0.323924f, -0.014353f, 0.150252f, -0.264346f, 0.205303f, -0.194610f,
+ -0.282527f, 0.180555f, -0.000087f, 0.027240f, -0.000903f, -0.345877f,
+ -0.353274f, -0.311829f, 0.172985f, -0.111748f, -0.309380f, 0.108110f,
+ -0.260914f, -0.164990f, 0.183625f, -0.319692f, -0.096988f, 0.094147f,
+ -0.047062f, -0.080978f, 0.227387f, -0.000450f, -0.220159f, -0.211448f,
+ -0.020885f, -0.139646f, -0.086721f, 0.067928f, -0.033084f, -0.251996f,
+ 0.090317f, 0.086313f, -0.228420f, -0.111356f, -0.314304f, -0.223664f,
+ 0.188176f, -0.002360f, -0.029491f, -0.006000f, -0.075343f, 0.173699f,
+ -0.272800f, -0.238507f, -0.272071f, -0.015000f, -0.215305f, -0.192943f,
+ -0.038595f, 0.119537f, 0.260477f, -0.168014f, -0.172751f, 0.532861f,
+ -0.753250f, -0.017485f, -0.115541f, -0.109291f, -1.098943f, 0.418559f,
+ -0.532110f, 0.359323f, -0.254786f, 0.471316f, -0.545024f, 0.291912f,
+ -0.836939f, 0.443427f, -0.441709f, 0.168866f, -0.140372f, 0.546607f,
+ -0.315465f, 0.023328f, 0.137709f, -0.083492f, -0.049986f, -0.071302f,
+ -0.293680f, -0.105049f, 0.315317f, 0.279569f, 0.220762f, 0.088161f,
+ -0.756456f, -0.074512f, 0.958318f, -0.332924f, -0.004906f, -0.629271f,
+ 0.212050f, 0.279123f, 0.311523f, -0.599580f, 0.516150f, 0.456952f,
+ 0.020255f, 0.247290f, -0.182670f, -0.335554f, 0.021203f, 0.131081f,
+ -0.208584f, 0.112530f, -0.198980f, 0.211583f, -0.101271f, -0.206453f,
+ -0.502688f, -0.294976f, -0.187019f, -0.114473f, 0.282050f, -0.165483f,
+ 0.094953f, -0.182578f, 0.055068f, 0.135605f, -0.266941f, -0.297556f,
+ 0.199181f, 0.015979f, -0.158659f, -0.226841f, 0.171306f, 0.013438f,
+ -0.286309f, -0.071753f, -0.170300f, -0.238188f, 0.093572f, -0.026230f,
+ -0.254502f, -0.297786f, -0.063480f, -0.300799f, -0.065644f, 0.074710f,
+ 0.248576f, -0.144425f, -0.113948f, -0.247297f, 0.276682f, 0.010963f,
+ -0.737786f, 0.026347f, 0.007830f, 0.753543f, 0.371904f, 0.305614f,
+ 0.105028f, 0.073530f, -0.119137f, 0.102352f, -0.080523f, 0.176366f,
+ -0.159457f, -0.339948f, 0.360131f, -0.007051f, -0.388378f, -0.101695f,
+ 0.663041f, -0.234486f, -0.142536f, -0.099931f, 0.041478f, 0.230425f,
+ 0.005743f, 0.154060f, 0.056233f, -0.080668f, -0.009754f, -0.194356f,
+ 0.185474f, -0.296474f, 0.192700f, 0.257767f, 0.348529f, 0.458265f,
+ 0.060276f, -0.130473f, 0.139889f, 0.310073f, -0.306869f, -0.272922f,
+ -0.259862f, 0.409207f, 0.431991f, -0.100357f, -0.050415f, -0.071830f,
+ -0.239665f, 0.153399f, 0.177192f, -0.611644f, -0.176114f, -0.022694f,
+ -0.033701f, -0.345842f, 0.015660f, 0.158931f, -0.097586f, 0.222001f,
+ 0.257887f, -0.171307f, -0.222607f, -0.245508f, -0.145742f, -0.096461f,
+ -0.010895f, 0.052815f, -0.265306f, -0.081059f, 0.219162f, -0.256084f,
+ -0.372676f, 0.148977f, 0.174831f, 0.086980f, 0.108518f, 0.074011f,
+ 0.038032f, -0.070856f, -0.109407f, 0.126174f, 0.022341f, -0.249786f,
+ -0.356164f, -0.202841f, -0.087437f, -0.133740f, 0.090956f, -0.017953f,
+ -0.028353f, 0.233621f, 0.109426f, 0.232798f, -0.104950f, -0.241798f,
+ -0.018995f, -0.167954f, 0.002473f, 0.060418f, -0.232717f, -0.195980f,
+ -0.283971f, -0.371881f, 0.219728f, 0.018072f, -0.166694f, -0.083301f,
+ -0.000616f, -0.212641f, -0.173158f, 0.222739f, -0.235302f, 0.237624f,
+ 0.222232f, -0.041235f, -0.342411f, 0.121194f, 0.211291f, -0.032237f,
+ -0.249401f, -0.291668f, 0.206055f, -0.148200f, 0.011824f, -0.272728f,
+ -0.194854f, 0.367175f, -0.257243f, 0.103433f, -0.231077f, 0.236734f,
+ 0.135733f, -0.362845f, 0.197147f, 0.242782f, -0.135289f, 0.123311f,
+ 0.259420f, -0.116278f, 0.127287f, 0.236789f, -0.097438f, 0.118073f,
+ 0.112796f, -0.035949f, 0.184408f, 0.200948f, -0.008859f, 0.195989f,
+ 0.161970f, -0.295320f, -0.330389f, 0.141034f, 0.066081f, -0.707857f,
+ 0.357037f, 0.149633f, 0.679877f, 0.548674f, 0.469076f, 0.194123f,
+ -0.209872f, -0.071764f, -0.126960f, 0.199420f, 0.327116f, -0.169053f,
+ -0.429156f, 0.443429f, -0.225530f, -0.130738f, -0.028351f, 0.644393f,
+ 0.049606f, -0.243602f, -0.409920f, 0.117028f, -0.258557f, 0.073865f,
+ -0.200454f, -0.139957f, -0.031314f, 0.162325f, 0.247221f, 0.071909f,
+ -0.336276f, 0.079922f, 0.192780f, -0.148882f, 0.133192f, -0.143177f,
+ -0.121327f, 0.126221f, -0.089521f, -0.181826f, 0.149923f, -0.280682f,
+ 0.391572f, 0.108990f, -0.445494f, -0.170787f, 0.225182f, 0.223313f,
+ -0.234828f, -0.071072f, -0.072673f, -0.093686f, 0.223892f, -0.049377f,
+ 0.057976f, 0.033558f, 0.068733f, -0.283353f, 0.217877f, 0.158093f,
+ -0.276761f, -0.097049f, -0.351913f, -0.383604f, 0.002863f, -0.474510f,
+ -0.096738f, 0.256940f, 0.234203f, -0.226667f, -0.260576f, -0.183403f,
+ -0.035578f, 0.141570f, 0.078764f, -0.028086f, 0.155800f, -0.251115f,
+ -0.286703f, -0.014739f, -0.072621f, -0.311506f, -0.048639f, 0.081621f,
+ 0.043057f, 0.068136f, -0.179903f, 0.143699f, -0.002571f, 0.239012f,
+ 0.197456f, 0.035745f, -0.311927f, 0.220320f, 0.102687f, -0.294105f,
+ 0.426740f, 0.209050f, 0.211907f, 0.083453f, 0.006578f, -0.143338f,
+ 0.003157f, 0.040295f, 0.234497f, 0.035344f, -0.163909f, 0.411115f,
+ 0.289453f, -0.075357f, -0.008884f, 0.469798f, -0.033304f, -0.153293f,
+ -0.229322f, -0.004162f, 0.113363f, 0.395381f, 0.067414f, -0.188966f,
+ -0.117424f, -0.166423f, 0.066839f, 0.595641f, -0.204782f, -0.451727f,
+ 0.198509f, -0.921583f, -0.246765f, -0.153411f, 0.046491f, 0.365906f,
+ 0.376710f, -0.017355f, -0.035232f, 0.138785f, -0.163918f, -0.283449f,
+ -0.094340f, 0.192127f, 0.154815f, 0.035787f, -0.029087f, 0.115649f,
+ -0.220133f, -0.452741f, 0.311667f, 0.157666f, 0.091401f, 0.236040f,
+ -0.168523f, 0.122176f, -0.219016f, -0.214856f, 0.172824f, -0.091810f,
+ 0.031520f, -0.857420f, 0.643446f, -0.017471f, 0.206082f, -0.933517f,
+ -0.020070f, -0.065091f, -0.117680f, -1.271870f, -0.069177f, -0.149409f,
+ 0.289970f, -0.889775f, -0.044741f, 0.232647f, -0.319416f, 0.073030f,
+ 0.278549f, 0.238782f, -0.202206f, 0.272540f, 0.201412f, 0.175574f,
+ -0.127971f, -0.253164f, -0.086352f, -0.005381f, 0.114714f, 0.505169f,
+ -0.175049f, -1.534280f, -0.320666f, -2.119298f, -0.023075f, -0.021259f,
+ -0.161019f, 0.344837f, 0.361958f, -0.097050f, 0.014375f, 0.267110f,
+ 0.341442f, -0.016688f, 0.073393f, 0.131500f, 0.246331f, 0.011059f,
+ 0.033597f, 0.014779f, -0.269366f, -0.504788f, 0.048651f, 0.295682f,
+ 0.237363f, 0.227484f, -0.235814f, -0.160530f, 0.182682f, -0.172999f,
+ -0.126630f, 0.168357f, -0.078729f, 0.052805f, 0.377021f, -0.004727f,
+ 0.230415f, -0.876673f, 0.458457f, 0.099401f, -0.019616f, 0.611982f,
+ -0.231508f, -0.070894f, -0.056142f, 0.548969f, -0.376599f, -0.600428f,
+ 0.241930f, -0.592893f, 0.189371f, 0.488651f, -0.092446f, -0.272569f,
+ 0.251643f, 0.315945f, -0.301468f, 0.112961f, 0.052119f, -0.066076f,
+ -0.082249f, 0.252805f, -0.195539f, 0.150386f, -0.865534f, 0.673447f,
+ 0.030177f, -0.438528f, -1.006174f, 0.575176f, -0.271656f, 0.035835f,
+ -1.056916f, 0.495267f, -0.092428f, -0.109511f, -0.192359f, 0.166669f,
+ -0.624326f, -0.000354f, -0.089075f, 0.176279f, -0.289347f, 0.021346f,
+ 0.020375f, 0.255282f, -0.045588f, 0.173675f, 0.100957f, -0.294373f,
+ 0.049303f, -0.134132f, -0.255731f, -0.025559f, -0.307463f, -0.205100f,
+ 0.079024f, 0.101113f, 0.135742f, -0.348869f, -0.026759f, -0.134155f,
+ -0.179275f, -0.054297f, -0.054948f, 0.029351f, 0.190560f, 0.102476f,
+ -0.025785f, 0.169442f, -0.271303f, 0.200667f, 0.099063f, 0.074767f,
+ -0.326533f, 0.044426f, -0.290251f, -0.082443f, -0.164482f, -0.349412f,
+ 0.045109f, -0.157330f, 0.165935f, 0.012672f, -0.059818f, 0.399140f,
+ -0.316620f, 0.386638f, -0.285399f, -0.296777f, -0.200473f, -0.144232f,
+ 0.251851f, -0.203768f, 0.001071f, -0.179063f, 0.248952f, -0.143029f,
+ 0.010423f, -0.030293f, -0.046786f, -0.196195f, -0.016845f, 0.295023f,
+ 0.322825f, 0.133683f, 0.017388f, 0.142467f, 0.221320f, 0.004059f,
+ -0.115770f, 0.143363f, 0.137972f, -0.272584f, 0.489366f, -0.091828f,
+ -0.014703f, 0.082332f, -0.476226f, -0.202859f, 0.356094f, -0.283049f,
+ 0.218086f, 0.202015f, 0.201724f, 0.012617f, 0.050720f, 0.255695f,
+ 0.244653f, 0.111296f, -0.151450f, -0.056210f, -0.757348f, 0.441724f,
+ -0.022455f, -0.244662f, 0.296205f, -0.421883f, -0.217386f, -0.254301f,
+ 0.409105f, -0.031309f, 0.050147f, -0.337170f, -0.106620f, -0.606455f,
+ 0.308024f, 0.298144f, 0.363993f, 0.704870f, -0.047292f, 0.166901f,
+ 0.105991f, -0.536757f, -0.424031f, -0.226034f, 0.213635f, -0.526754f,
+ 0.310990f, -0.116038f, 0.007775f, 0.538330f, -0.177912f, 0.445357f,
+ -0.290365f, 0.451169f, 0.030931f, 0.033388f, 0.209905f, -0.244492f,
+ -0.097792f, -0.246042f, 0.132047f, 0.032576f, 0.115516f, 0.022890f,
+ 0.093508f, -0.071840f, 0.362948f, -0.135245f, 0.659911f, -0.321413f,
+ 0.193118f, -0.795001f, -0.218311f, 0.024862f, 0.206172f, -0.832878f,
+ -0.255670f, 0.343402f, -0.275211f, -0.898363f, -0.025172f, 0.158565f,
+ 0.171347f, -0.127518f, -0.215156f, -0.159198f, 0.250355f, -0.132452f,
+ 0.061254f, -0.097544f, -0.223246f, 0.013183f, 0.239468f, 0.259017f,
+ -0.217739f, -0.032263f, 0.123755f, -0.701777f, 0.150049f, -0.555293f,
+ 0.062430f, -0.260304f, 0.494894f, -0.168702f, -0.134829f, -0.113989f,
+ 0.150092f, -0.060248f, 0.115711f, -0.277202f, 0.499811f, 0.417116f,
+ 0.191081f, -0.376432f, -0.321092f, 0.033992f, 0.057193f, 0.127077f,
+ -0.009042f, 0.014443f, 0.142808f, -0.124349f, 0.213087f, -0.381686f,
+ 0.129726f, -0.038396f,
+};
+
+static const float av1_early_term_after_split_nn_bias_32_layer0[] = {
+ -0.107171f, 0.060848f, -0.069480f, -0.121982f, 0.037637f, -0.291839f,
+ 0.102257f, -0.065889f, -0.032452f, 0.034171f, -0.073984f, -0.005236f,
+ 0.218820f, 0.132123f, -0.089621f, -0.067679f, 0.049368f, 0.329444f,
+ -0.184729f, 0.031702f, 0.009735f, -0.039964f, -0.018024f, -0.073031f,
+ -0.030166f, -0.191037f, -0.074862f, -0.076548f, 0.076537f, 0.216609f,
+ -0.078358f, -0.007740f,
+};
+
+static const float av1_early_term_after_split_nn_weights_32_layer1[] = {
+ 0.047869f, -0.231773f, -0.185663f, 0.460676f, -0.208182f, 0.590555f,
+ -0.622627f, 0.279377f, 0.351681f, 0.633504f, 1.069884f, 0.332449f,
+ -0.457703f, -0.435817f, -0.028853f, 0.327490f, -0.282469f, -0.975792f,
+ -0.062975f, -0.147187f, 0.348340f, -1.207116f, 0.516159f, -1.509626f,
+ -0.805072f, 0.522999f, 0.143671f, 0.304246f, -0.360720f, -0.612472f,
+ 0.260045f, -0.223243f,
+};
+
+static const float av1_early_term_after_split_nn_bias_32_layer1[] = {
+ -0.07571174f,
+};
+
+static const NN_CONFIG av1_early_term_after_split_nnconfig_32 = {
+ FEATURES,
+ 1,
+ 1,
+ {
+ HIDDEN_NODES,
+ },
+ {
+ av1_early_term_after_split_nn_weights_32_layer0,
+ av1_early_term_after_split_nn_weights_32_layer1,
+ },
+ {
+ av1_early_term_after_split_nn_bias_32_layer0,
+ av1_early_term_after_split_nn_bias_32_layer1,
+ },
+};
+
+static const float av1_early_term_after_split_nn_weights_16_layer0[] = {
+ -0.113798f, 0.053357f, -0.037947f, -0.477171f, 0.276517f, -0.349252f,
+ -0.177284f, 0.189597f, 0.141744f, 0.230207f, -0.328104f, 0.074328f,
+ 0.247717f, 0.233533f, 0.145167f, 0.018029f, -0.398725f, -0.226199f,
+ -0.309724f, 0.125279f, 0.194759f, 0.025531f, 0.349714f, -0.273944f,
+ 0.186871f, 0.181735f, -0.520614f, -0.264076f, 0.308207f, 0.157438f,
+ -0.137791f, -0.054582f, 0.125879f, 0.796218f, -0.897562f, 0.885439f,
+ 0.381640f, 0.106625f, -2.027456f, 0.000874f, 0.179581f, 0.013287f,
+ -2.329439f, -0.163169f, -0.136191f, 0.320108f, -2.318779f, -0.196722f,
+ -0.295721f, 0.203658f, -0.182275f, 0.615941f, 0.015762f, 0.257181f,
+ -0.115297f, 0.295774f, -0.026144f, -0.022686f, -0.219423f, -0.042861f,
+ 0.207647f, -0.057791f, 0.201671f, -0.169569f, 0.291492f, -0.994991f,
+ 0.137473f, 0.230948f, 0.505626f, -1.065860f, 0.275225f, -0.250861f,
+ 0.519466f, -1.217242f, -0.087384f, 0.053441f, 0.030729f, -1.702304f,
+ -0.034635f, 0.010177f, -0.035422f, -0.749979f, 0.355499f, 0.408166f,
+ -0.086883f, 0.017203f, 0.195706f, -0.218056f, -0.029153f, 0.367335f,
+ -0.061732f, -0.241068f, 0.078496f, -0.370346f, -0.124223f, -0.172708f,
+ 0.037971f, 0.038875f, -0.282489f, -0.266323f, -0.210864f, 0.214714f,
+ 0.234695f, -0.045625f, 0.015357f, -0.007464f, -0.362003f, -0.113465f,
+ 0.145141f, 0.238470f, -0.202664f, -0.286587f, -0.347112f, 0.054501f,
+ -0.190290f, -0.283256f, 0.062179f, 0.041165f, -0.006935f, -0.220351f,
+ -0.088800f, 0.220924f, -0.200982f, 0.058493f, -0.225175f, 0.057175f,
+ -0.618187f, 0.761023f, -0.743774f, -0.500599f, -0.584999f, 1.545211f,
+ 0.123055f, -0.106848f, -0.353057f, 1.552187f, 0.174104f, 0.068060f,
+ -0.449859f, 1.254299f, -0.161716f, -0.060630f, -0.230721f, 0.165976f,
+ -0.101582f, -0.422415f, 0.110384f, -0.130098f, 0.104428f, 0.083518f,
+ 0.031626f, 0.083048f, 0.158877f, 0.173340f, 0.063962f, 0.427845f,
+ 0.663268f, 0.376996f, 0.146435f, -0.091329f, 0.443447f, 0.518432f,
+ -0.182777f, -0.091313f, 0.331229f, 0.532604f, -0.187001f, 0.054774f,
+ 0.298068f, 0.502295f, -0.362378f, 0.054283f, 0.292806f, 0.168901f,
+ -0.214787f, 0.025637f, 0.458009f, -0.322714f, -0.264059f, 0.140313f,
+ -0.102696f, -0.431208f, -0.134450f, -0.545415f, 0.253851f, -0.009061f,
+ -0.050681f, 0.108681f, 0.043272f, -1.073133f, 0.206410f, 0.469576f,
+ 0.291494f, -2.021244f, -0.001183f, -0.067542f, 0.364907f, -2.470543f,
+ 0.049147f, -0.018868f, 0.658500f, -2.531048f, 0.275433f, -0.034224f,
+ -0.171386f, 0.096369f, 0.728069f, 0.272332f, 0.222255f, -0.030426f,
+ 0.026994f, 0.208928f, -0.173943f, -0.227581f, -0.214798f, 0.079341f,
+ 0.032344f, -0.253575f, -0.044353f, -0.239265f, -0.055852f, -0.162582f,
+ -0.086592f, 0.066487f, 0.337353f, -0.168704f, 0.015702f, 0.022607f,
+ 0.286647f, 0.218106f, 0.193319f, -0.358714f, 0.030796f, 0.007646f,
+ -0.045617f, 0.165007f, -0.284641f, -0.291812f, 0.207544f, 0.082823f,
+ -0.141907f, -0.331336f, -0.052908f, 0.120716f, 0.202521f, 0.232782f,
+ -0.348141f, -0.017332f, 1.191126f, -0.391987f, -0.154537f, -0.206551f,
+ -2.378690f, 0.057918f, -0.328183f, 2.151556f, 0.238803f, 0.164880f,
+ -0.480039f, 1.616200f, 0.260243f, 0.083704f, -0.174461f, 1.804634f,
+ 0.194810f, 0.223837f, 0.550107f, -0.068171f, -0.293435f, -0.186770f,
+ -0.364846f, 0.127181f, 0.105556f, -0.016202f, 0.278403f, -0.344995f,
+ -0.009761f, -0.082555f, 0.046731f, -0.301452f, 0.604259f, 0.055895f,
+ 0.049862f, 0.314249f, -0.305811f, -0.112937f, 0.658787f, -0.549288f,
+ -0.307567f, -0.460650f, -0.840643f, 0.082576f, 0.373711f, 0.138318f,
+ 0.336901f, 0.284984f, -0.281400f, 0.408210f, -0.449858f, 0.461054f,
+ 0.227629f, -0.131705f, 0.301769f, -0.278540f, 0.189290f, -0.269041f,
+ 0.111350f, -0.300257f, 0.436858f, -0.265920f, -0.211938f, 0.272631f,
+ 0.206291f, 0.253273f, -0.229776f, -0.031112f, -0.171183f, -0.109676f,
+ -0.202390f, -0.068857f, 0.182125f, -0.140523f, -0.308742f, -0.045840f,
+ 0.256545f, -0.262405f, 0.225951f, -0.287463f, -0.189203f, -0.055552f,
+ -0.052448f, -0.242839f, -0.278877f, 0.140920f, -0.175755f, 0.215402f,
+ -0.248841f, -0.264080f, -0.178303f, 0.147777f, 0.049460f, -0.279877f,
+ -0.539725f, -0.004622f, 0.182874f, 0.338814f, 0.265974f, 0.249851f,
+ -0.141154f, 0.157228f, -0.090972f, 0.179444f, 0.305255f, 0.127788f,
+ 0.123270f, 0.355320f, 0.076797f, 0.263495f, 0.235965f, -0.133816f,
+ 0.243624f, 0.227062f, -0.213629f, 0.002075f, 0.061203f, -0.077820f,
+ -0.008807f, -0.247324f, -0.051464f, -0.191894f, -0.238713f, -0.389526f,
+ -0.274248f, 0.053950f, -0.225750f, -0.367097f, -0.122391f, 0.181212f,
+ -0.411824f, -0.084241f, -0.302288f, 0.077860f, -0.187443f, -0.300262f,
+ 0.083156f, -0.392461f, -0.332320f, -0.346474f, 0.140658f, -0.283656f,
+ 0.120714f, -0.056577f, -0.280968f, 0.017795f, -0.024686f, 0.073113f,
+ -0.346637f, 0.082567f, -0.036556f, -0.369730f, 0.081225f, -0.005211f,
+ 0.144886f, -0.003544f, 0.178307f, -0.366035f, -0.063887f, -0.191767f,
+ 0.105835f, -0.273978f, -0.266532f, -0.023984f, 0.039166f, 0.065848f,
+ -0.026802f, -0.268923f, 0.189659f, 0.086300f, 0.030718f, 0.216565f,
+ -0.130025f, -0.215687f, 0.146341f, -0.286438f, -0.394226f, -0.181509f,
+ -0.005612f, 0.186040f, 0.133491f, 0.032096f, -0.261609f, 0.074007f,
+ -0.042929f, -0.234479f, 0.189704f, 0.088395f, -0.003671f, -0.125055f,
+ -0.252418f, -0.086387f, 0.111197f, -0.297071f, -0.018793f, -0.031902f,
+ -0.333191f, -0.186279f, 0.039868f, 0.091419f, -0.264438f, -0.216150f,
+ -0.212550f, 0.203412f, -0.113028f, -0.197169f, -0.346771f, 0.086066f,
+ 0.091443f, -0.128507f, -0.007281f, -0.118389f, 0.003370f, -0.338661f,
+ 0.026739f, -0.063571f, -0.281567f, -0.166824f, 0.167455f, 0.216173f,
+ 0.199163f, 0.256314f, -0.222679f, 0.040282f, -0.154808f, -0.133943f,
+ -0.270163f, -0.357398f, 0.260373f, 0.176950f, -0.125162f, -0.085050f,
+ 0.226376f, -0.124585f, -0.324804f, 0.035536f, -0.133600f, 0.173450f,
+ 0.068107f, -0.337442f, 0.169629f, 0.047223f, 0.057878f, 0.055555f,
+ -0.317449f, -0.103768f, 0.080899f, -0.194759f, -1.137593f, 0.508999f,
+ 0.045372f, 1.746454f, 1.250347f, -0.342930f, -0.127821f, -0.220175f,
+ -0.417649f, -0.480595f, 0.071902f, 0.050231f, -0.562554f, -0.677866f,
+ -0.121416f, -0.247558f, -0.483876f, -0.504157f, 1.731953f, 0.572936f,
+ 0.047325f, 0.050619f, 0.112611f, -0.035393f, 0.052585f, -0.071076f,
+ -0.015798f, -0.050228f, -0.142875f, 0.189329f, 0.048833f, 0.503633f,
+ 0.249588f, 0.175492f, -0.137664f, -0.018533f, 0.288453f, -0.025644f,
+ 0.079131f, 0.195096f, -0.154039f, -0.104220f, -0.224072f, 0.095946f,
+ -0.208424f, 0.214745f, 0.056468f, 0.182603f, 0.341784f, -0.134664f,
+ -0.194050f, 0.058532f, -0.107336f, -0.087783f, -0.238795f, -0.387212f,
+ 0.049055f, -0.127417f, -0.299919f, -0.094371f, -0.011735f, -0.264753f,
+ 0.407375f, -0.462654f, -0.609488f, 0.027742f, -0.985512f, -0.109154f,
+ -0.423276f, 2.347960f, 0.129240f, 0.187610f, -0.057081f, 2.424892f,
+ 0.087666f, 0.106716f, -0.039379f, 2.764866f, 0.113309f, 0.028196f,
+ -0.582789f, 0.335385f, -0.538029f, -0.477337f, -0.114207f, 0.178829f,
+ 0.006276f, 0.123179f, 0.095101f, 0.139898f, -0.372074f, -0.111010f,
+ 0.136330f, 0.272900f, 0.126737f, -0.097808f, -0.363697f, 0.108665f,
+ -0.227749f, -0.083421f, 1.714677f, 0.451943f, 0.107931f, -0.392281f,
+ 1.615846f, 0.022307f, -0.247011f, 0.257703f, 1.039134f, 0.537789f,
+ 0.022177f, -0.271532f, 0.351350f, -0.399205f, -0.240534f, -0.315399f,
+ 0.026928f, -0.005618f, 0.053179f, -0.010277f, 0.000501f, 0.040896f,
+ -0.109160f, 0.018282f, 0.003887f, 0.199599f, 0.095349f, -0.337284f,
+ 0.169929f, -0.109409f, -0.166983f, 0.059908f, -0.226574f, -0.120114f,
+ 0.077329f, -0.333133f, -0.220936f, 0.114309f, -0.233965f, -0.281551f,
+ 0.042948f, 0.100940f, 0.116037f, -0.313122f, 0.215149f, -0.309057f,
+ -0.341052f, -0.294417f, -0.179722f, 0.010795f, 0.192053f, -0.275261f,
+ -0.033077f, 0.117348f, 0.090206f, 0.781573f, 0.602456f, -0.220296f,
+ 0.172159f, 0.758513f, 0.157910f, -0.217897f, -0.372659f, 0.031935f,
+ 0.791463f, 0.267195f, 0.931593f, -0.057349f, 0.405512f, -0.058512f,
+ -0.641663f, -0.076592f, 0.550227f, -0.024094f, 0.048218f, -0.289971f,
+ 0.180940f, 0.167533f, 0.052711f, -0.360726f, 0.019210f, -0.488879f,
+ 0.380498f, 0.151608f, -0.276895f, -0.596554f, 0.106076f, -0.245833f,
+ -0.048783f, 0.073823f, 0.098780f, 0.000211f, 0.113958f, -0.068964f,
+ -0.265533f, -0.185457f, 0.175586f, -0.163621f, -0.204919f, 0.145802f,
+ -0.163421f, 0.129576f, -0.153486f, -0.105573f, 0.067289f, -0.213120f,
+ -0.286103f, 0.249543f, -0.044970f, -0.170464f, -0.105501f, -0.094765f,
+ -0.050734f, -0.369468f, 0.180020f, -0.363328f, -0.151654f, -0.262550f,
+ -0.424503f, 0.829032f, -0.559452f, 0.506837f, 0.143823f, 0.276660f,
+ -1.808608f, -0.259517f, -0.053945f, 0.035676f, -1.842195f, -0.065960f,
+ -0.069285f, 0.462022f, -2.319453f, -0.370299f, 0.183329f, -0.146412f,
+ -0.563875f, 0.305068f, 0.480904f, 0.044319f, -0.016098f, 0.168516f,
+ 0.114874f, -0.097621f, -0.030373f, 0.177700f, 0.181591f, -0.146003f,
+ -0.330853f, -0.259200f, 0.779319f, -1.517524f, 0.178781f, 0.135451f,
+ 0.088784f, -2.076089f, 0.628717f, -0.048685f, 0.281327f, -2.341596f,
+ 0.422171f, 0.006135f, 0.367096f, -1.663118f, 0.365253f, -0.072884f,
+ -0.197620f, -0.688634f, 0.477354f, 0.395841f, -0.098505f, 0.208709f,
+ -0.027523f, 0.127119f, 0.106274f, 0.114424f, -0.122877f, -0.087245f,
+ 0.086923f, -0.527398f, -0.342062f, -0.764662f, 0.713094f, -0.626453f,
+ -0.081454f, -0.087683f, 0.885047f, 0.323440f, -0.018579f, -0.217166f,
+ 1.617984f, -0.159038f, 0.265991f, -0.390313f, 1.933182f, -0.032431f,
+ -0.057513f, -0.300841f, 0.461248f, -0.072147f, -0.287052f, -0.078056f,
+ 0.011734f, 0.044013f, 0.177174f, 0.093400f, 0.028819f, 0.193686f,
+ -0.224853f, 0.268321f, -0.075059f, 0.074526f, -0.015618f, 0.165615f,
+ -0.276780f, -0.063908f, -0.369264f, -0.171497f, -0.173624f, -0.130743f,
+ -0.224625f, -0.124980f, -0.104482f, 0.076864f, -0.009631f, -0.164682f,
+ 0.150480f, -0.111880f, -0.260425f, 0.086234f, -0.176936f, -0.136771f,
+ -0.168867f, -0.405626f, -0.288716f, -0.128950f, -0.207327f, 0.015581f,
+ -0.109061f, -0.098970f, 0.090792f, -0.109623f, 0.349851f, 0.266341f,
+ -0.088602f, -0.108071f, 0.082519f, 0.472650f, -1.838758f, 0.456694f,
+ 0.119927f, 0.461077f, -2.860022f, 0.231495f, 0.235771f, 0.256424f,
+ -1.938516f, -0.188202f, -0.000832f, -0.518206f, 0.194644f, 0.505510f,
+ 0.615657f, 0.193760f, 0.224600f, 0.265732f, -0.121553f, -0.354597f,
+ -0.242414f, -0.276639f, -0.057591f, 0.026369f, -0.261148f, -0.356155f,
+ -0.149178f, -0.353566f, -0.340835f, -0.141776f, 0.076535f, 0.221299f,
+ -0.108857f, -0.156514f, 0.050901f, 0.058541f, -0.077141f, 0.071515f,
+ -0.333283f, -0.181489f, -0.212900f, -0.224698f, -0.174693f, -0.178665f,
+ -0.143374f, -0.091811f, 0.165161f, 0.060156f, -0.086103f, -0.039031f,
+ -0.377759f, -0.370533f, 0.074431f, 0.064192f, 0.186576f, 0.447858f,
+ -0.082260f, -0.020268f, -0.123089f, -0.402017f, 0.080500f, 0.176286f,
+ 2.850013f, 0.019385f, -0.225361f, -0.235315f, 1.654694f, -0.073978f,
+ -0.341412f, -1.187575f, 2.815900f, -0.228063f, -0.174547f, 0.623825f,
+ -0.010676f, 0.157189f, 0.111879f, -0.198965f, 0.051851f, 0.158396f,
+ 0.045194f, 0.293531f, -0.246714f, -0.351493f, 0.026954f, 0.076233f,
+ 0.420367f, 0.168154f, -0.131450f, 0.134487f, -0.288851f, -0.134553f,
+ 0.014902f, 0.756381f, 0.277713f, 0.190080f, -0.020869f, 1.446672f,
+ 0.029792f, -0.025927f, 0.060640f, 0.559864f, 0.422229f, 0.198459f,
+ 0.036167f, 0.029432f, 0.001882f, 0.038480f, -0.160528f, -0.288855f,
+ -0.310886f, 0.291296f, 0.190558f, -0.182816f, -0.002252f, 0.073101f,
+ -0.172245f, -0.305980f, 0.112492f, -0.422839f, -0.295999f, -0.078160f,
+ -0.173405f, -0.032819f, 0.373774f, -0.715223f, 0.018911f, 0.131753f,
+ -0.237364f, -0.128499f, -0.228406f, 0.341619f, 0.343552f, -0.521581f,
+ -0.263790f, 0.362502f, -0.018450f, 0.054233f, 0.183068f, 0.382772f,
+ 0.188811f, -0.627287f, 0.040399f, -0.487338f, -0.192591f, 0.247426f,
+ 0.154372f, -0.483994f,
+};
+
+static const float av1_early_term_after_split_nn_bias_16_layer0[] = {
+ -0.173976f, 0.305495f, 0.250981f, -0.067127f, -0.313100f, 0.242464f,
+ 0.315196f, -0.056052f, -0.241227f, -0.253308f, -0.002697f, 0.003687f,
+ -0.124421f, -0.090383f, -0.070366f, -0.064074f, -0.056115f, 0.123313f,
+ -0.239698f, -0.182082f, -0.065296f, 0.021503f, -0.036787f, 0.311861f,
+ 0.118135f, -0.320456f, -0.110719f, 0.220692f, -0.071727f, -0.088226f,
+ -0.110874f, -0.111671f,
+};
+
+static const float av1_early_term_after_split_nn_weights_16_layer1[] = {
+ -0.338573f, 0.398159f, 0.314774f, -0.037448f, -0.271950f, -0.774991f,
+ 0.950901f, -0.225380f, -1.841906f, -0.350379f, -0.079350f, 0.383148f,
+ -0.183676f, -0.313132f, -0.340820f, -0.309401f, -1.050540f, -0.432267f,
+ -0.657195f, 0.927632f, -0.040150f, 0.578920f, 0.212301f, 0.292495f,
+ 0.563590f, -0.205735f, 0.195877f, 0.582122f, -0.217860f, 1.613379f,
+ 0.313278f, -0.555802f,
+};
+
+static const float av1_early_term_after_split_nn_bias_16_layer1[] = {
+ 0.16553f,
+};
+
+static const NN_CONFIG av1_early_term_after_split_nnconfig_16 = {
+ FEATURES,
+ 1,
+ 1,
+ {
+ HIDDEN_NODES,
+ },
+ {
+ av1_early_term_after_split_nn_weights_16_layer0,
+ av1_early_term_after_split_nn_weights_16_layer1,
+ },
+ {
+ av1_early_term_after_split_nn_bias_16_layer0,
+ av1_early_term_after_split_nn_bias_16_layer1,
+ },
+};
+
+static const float av1_early_term_after_split_nn_weights_8_layer0[] = {
+ -0.719472f, 0.305806f, 0.855829f, 0.100094f, 0.412517f, 1.254673f,
+ 1.552105f, -5.890773f, -0.089957f, -0.016736f, 1.418074f, -5.393506f,
+ -0.028214f, 0.117758f, 1.479209f, -5.299794f, 0.171585f, -0.084182f,
+ -0.162105f, 0.388577f, -0.044319f, -0.025861f, 0.251782f, -0.181462f,
+ -0.101545f, -0.079999f, -0.033014f, -0.191627f, -0.032802f, -0.053404f,
+ 0.038038f, -0.119492f, 0.049104f, -0.344384f, -0.354513f, 0.036977f,
+ 0.017513f, -0.004025f, -0.163212f, -0.261999f, 0.146575f, 0.207541f,
+ 0.130365f, -0.252127f, 0.097419f, -0.231057f, -0.309421f, 0.347866f,
+ -0.064670f, -0.283171f, -0.244193f, -0.193323f, -0.226954f, -0.276194f,
+ -0.233553f, 0.156354f, -0.184009f, 0.344289f, -0.308058f, -0.205202f,
+ -0.325068f, 0.183820f, -0.361667f, -0.069559f, -0.121834f, -0.038357f,
+ -0.210043f, -0.266129f, 0.003188f, 0.074902f, -0.328843f, 0.293679f,
+ -0.234698f, -0.428268f, -0.308772f, -0.136538f, -0.008384f, -0.078227f,
+ 0.166074f, -0.262899f, 0.102114f, -0.323420f, 0.057064f, -0.203318f,
+ -0.397413f, -0.317324f, -0.307093f, 0.020574f, -0.188627f, 0.132529f,
+ 0.118992f, -0.487387f, -0.282975f, 0.573231f, -0.266071f, 0.125140f,
+ -0.970034f, 1.424008f, -0.487366f, -0.196415f, 3.680273f, -0.008407f,
+ 0.081109f, -0.187479f, 3.876021f, 0.159168f, 0.111721f, -0.337423f,
+ 3.901760f, 0.261268f, -0.245555f, -0.187632f, -0.324298f, 0.167234f,
+ 0.170986f, -0.473055f, 0.087016f, -0.003469f, 0.051035f, 0.251794f,
+ 0.153549f, 0.217609f, -0.326870f, -0.175511f, 0.637341f, -0.694837f,
+ -0.873487f, -0.186614f, -1.089884f, -0.607316f, -0.523519f, 5.256331f,
+ 0.071414f, 0.215265f, -0.835999f, 5.735746f, 0.300101f, 0.089626f,
+ -0.450261f, 5.608051f, 0.190491f, 0.110220f, -0.595360f, -0.446324f,
+ 0.311380f, 0.268812f, -0.339656f, -0.008708f, 0.011111f, -0.027557f,
+ 0.171534f, 0.000676f, 0.227232f, 0.033993f, 0.146684f, 0.094817f,
+ -0.175381f, -0.211927f, -0.362471f, 0.168834f, 0.264149f, -0.350538f,
+ -0.463249f, -0.288105f, 0.347155f, 0.183231f, -0.229732f, -0.252202f,
+ -0.218074f, -0.008769f, -0.156103f, 0.181233f, -0.354736f, 0.263270f,
+ -0.106636f, 0.081057f, 0.060634f, -0.046887f, 0.050468f, 0.071259f,
+ 0.221287f, 0.199071f, -0.180185f, -0.406902f, -0.239351f, -0.034957f,
+ 0.369140f, 0.864600f, 0.233798f, 0.423612f, -0.468918f, 0.976987f,
+ 0.691198f, -1.597908f, 0.102926f, 0.305546f, 0.391196f, -3.909059f,
+ 0.333635f, 0.311561f, 0.738886f, -4.002001f, 0.236394f, -0.233141f,
+ 0.263342f, 0.679898f, 0.136233f, 0.254743f, -0.367571f, 0.066412f,
+ 0.001606f, -0.059542f, 0.051726f, -0.347145f, -0.045501f, -0.313847f,
+ -0.021952f, 1.386316f, -0.579139f, -1.275844f, -0.003493f, -1.716577f,
+ 0.250209f, 0.192086f, 4.177055f, 0.351835f, 0.338177f, 0.140163f,
+ 4.099592f, 0.321866f, -0.128153f, -0.360414f, 4.350767f, 0.025943f,
+ -0.116740f, -0.664107f, -0.064558f, -0.039553f, -0.208186f, -0.678774f,
+ 0.149441f, -0.019823f, 0.012759f, 0.404442f, -0.108881f, 0.067974f,
+ -0.188278f, 0.136327f, 0.109927f, -0.179270f, -0.272342f, 0.018064f,
+ -0.304216f, -0.469470f, 0.109310f, -0.326214f, 0.061909f, -0.278997f,
+ -0.352329f, -0.333770f, -0.186522f, -0.328567f, -0.206211f, -0.008804f,
+ 0.042441f, -0.126699f, -0.420399f, -0.033842f, 0.016773f, -0.273789f,
+ 0.081928f, -0.191552f, -0.179533f, -0.263070f, -0.471807f, 0.062601f,
+ -0.232576f, 0.082955f, -0.490080f, 0.073820f, -0.090384f, 0.035781f,
+ -0.158880f, -0.506793f, -0.069132f, 0.047602f, -0.349640f, -0.058389f,
+ -0.017387f, -0.194636f, -0.457227f, -0.143105f, 0.222045f, -0.548909f,
+ -0.131561f, 0.247196f, -0.207923f, 0.133056f, -0.509854f, -0.193685f,
+ -0.181327f, -0.242442f, 0.091821f, 0.114430f, -0.375233f, -0.015254f,
+ -0.336632f, -0.060279f, -0.169169f, -0.429914f, -0.036563f, -0.400560f,
+ -0.076332f, -0.186232f, -0.268491f, 0.075561f, -0.389082f, -0.077435f,
+ 0.352562f, -0.020086f, -0.338181f, -0.404629f, 0.254983f, 0.150477f,
+ -0.265903f, 0.003341f, 0.099969f, -0.211964f, -0.129372f, -0.166366f,
+ 0.327712f, -0.276234f, 0.140675f, -0.433677f, -0.163050f, -0.143578f,
+ -0.397840f, -0.422130f, -0.293835f, -0.075362f, -0.468375f, 1.021238f,
+ 1.394155f, -0.922486f, -1.350222f, 2.030201f, 0.057717f, 0.227650f,
+ -0.193179f, 0.037224f, 0.065555f, 0.020558f, -0.059205f, -0.023690f,
+ -0.008718f, 0.095976f, -0.549587f, -0.321164f, -0.243728f, 1.344381f,
+ -1.254107f, 0.294244f, -0.154737f, -0.152597f, 0.342419f, 0.301883f,
+ 0.069866f, -0.327766f, 0.209323f, -0.364913f, -0.005530f, -0.558972f,
+ 0.057684f, -0.309357f, -0.283325f, -0.278445f, -0.420115f, -0.418457f,
+ -0.391481f, -0.418460f, -0.003897f, -0.023744f, -0.312330f, -0.366213f,
+ 0.269628f, -0.274877f, -0.189988f, -0.419555f, -0.034033f, 0.192874f,
+ -0.135487f, -0.326108f, -0.039019f, 0.185029f, -0.264883f, -0.563447f,
+ -0.163532f, -0.447652f, -0.141851f, 0.001714f, -0.193184f, 0.032609f,
+ -0.112883f, 0.074599f, 0.490665f, 0.434764f, 0.021652f, -0.219618f,
+ 0.743267f, 0.147195f, -0.303479f, -0.097674f, 0.195813f, 0.704007f,
+ -1.290851f, 0.119701f, 0.224065f, 0.260246f, -0.580657f, -0.096201f,
+ -0.333214f, -0.586689f, 0.567178f, 0.157340f, -0.043184f, 0.194358f,
+ -0.026506f, -0.339894f, -0.571803f, -0.234828f, 0.147054f, -0.564178f,
+ -0.156933f, -0.366055f, -0.691687f, -0.187501f, 0.215834f, -0.346106f,
+ -0.256892f, 0.110915f, -0.337464f, -0.341474f, -0.216113f, 0.249445f,
+ -0.070175f, -0.412141f, 0.153458f, -0.081280f, 0.164669f, -0.356396f,
+ -0.294971f, -0.165121f, -0.133585f, -0.071467f, 0.295147f, -0.253233f,
+ -0.213833f, -0.343416f, -0.474344f, -0.304000f, -0.341379f, -0.331456f,
+ -0.393952f, -0.508004f, -0.569518f, -0.509864f, 0.121961f, 0.011957f,
+ 0.000498f, -0.201969f, -0.407195f, -0.414375f, -0.295846f, 0.247492f,
+ 0.124249f, -0.550804f, -0.420397f, -0.123462f, 0.333292f, -0.240230f,
+ -0.025604f, 0.337536f, -0.295006f, -0.272614f, -0.496850f, -0.278521f,
+ 0.234591f, -0.052775f, -0.014052f, -0.260078f, -0.279128f, -0.036385f,
+ 0.008714f, -0.064018f, -0.124873f, -0.334014f,
+};
+
+static const float av1_early_term_after_split_nn_bias_8_layer0[] = {
+ 1.202379f, -0.117005f, -0.135527f, -0.262255f, -0.443658f, -0.078981f,
+ 0.615653f, -0.124482f, -0.227768f, -0.227014f, -0.135898f, 0.143216f,
+ -0.225995f, 0.370877f, -0.214821f, -0.227752f,
+};
+
+static const float av1_early_term_after_split_nn_weights_8_layer1[] = {
+ 0.376594f, 0.266703f, -0.039847f, 1.680142f, -0.879939f, 0.286806f,
+ -0.378223f, -0.405295f, -0.021107f, 0.039188f, 0.259308f, 0.193091f,
+ 0.077994f, -0.269141f, 0.011180f, -0.019262f,
+};
+
+static const float av1_early_term_after_split_nn_bias_8_layer1[] = {
+ -1.29585564f,
+};
+
+static const NN_CONFIG av1_early_term_after_split_nnconfig_8 = {
+ FEATURES,
+ 1,
+ 1,
+ {
+ 16,
+ },
+ {
+ av1_early_term_after_split_nn_weights_8_layer0,
+ av1_early_term_after_split_nn_weights_8_layer1,
+ },
+ {
+ av1_early_term_after_split_nn_bias_8_layer0,
+ av1_early_term_after_split_nn_bias_8_layer1,
+ },
+};
+#undef FEATURES
+#undef HIDDEN_NODES
+
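+// Illustration only (not part of the upstream header): a minimal sketch,
+// under the assumption that the NN_CONFIG tables above describe a single
+// fully-connected hidden layer with a ReLU activation followed by one linear
+// output node, with layer-0 weights laid out node-major (each hidden node
+// owns a contiguous run of num_features weights). The encoder proper
+// evaluates these tables through its ML helpers; the function below exists
+// purely to document the assumed data layout.
+static inline float nn_forward_sketch(const float *features, int num_features,
+                                      const float *weights0,
+                                      const float *bias0, int num_hidden,
+                                      const float *weights1, float bias1) {
+  float out = bias1;
+  for (int node = 0; node < num_hidden; ++node) {
+    // Dot product of the inputs with this hidden node's weight row.
+    float val = bias0[node];
+    for (int f = 0; f < num_features; ++f)
+      val += weights0[node * num_features + f] * features[f];
+    if (val < 0.0f) val = 0.0f;   // Assumed ReLU on the hidden layer.
+    out += weights1[node] * val;  // Linear single-output layer.
+  }
+  return out;
+}
+// Example (hypothetical): scoring the 64x64 early-termination model with a
+// feature vector f of length 31:
+//   float score = nn_forward_sketch(
+//       f, 31, av1_early_term_after_split_nn_weights_64_layer0,
+//       av1_early_term_after_split_nn_bias_64_layer0, 32,
+//       av1_early_term_after_split_nn_weights_64_layer1,
+//       av1_early_term_after_split_nn_bias_64_layer1[0]);
+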
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/partition_search.c b/third_party/aom/av1/encoder/partition_search.c
new file mode 100644
index 0000000000..1c17b09ee1
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_search.c
@@ -0,0 +1,6263 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <float.h>
+
+#include "aom_dsp/txfm_common.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/nonrd_opt.h"
+#include "av1/encoder/partition_search.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/av1_ml_partition_models.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+#define COLLECT_MOTION_SEARCH_FEATURE_SB 0
+
+void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
+ part_sf->partition_search_type = SEARCH_PARTITION;
+ part_sf->less_rectangular_check_level = 0;
+ part_sf->use_square_partition_only_threshold = BLOCK_128X128;
+ part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+ part_sf->default_max_partition_size = BLOCK_LARGEST;
+ part_sf->default_min_partition_size = BLOCK_4X4;
+ part_sf->adjust_var_based_rd_partitioning = 0;
+ part_sf->max_intra_bsize = BLOCK_LARGEST;
+ // This setting only takes effect when partition_search_type is set
+ // to FIXED_PARTITION.
+ part_sf->fixed_partition_size = BLOCK_16X16;
+  // Partition search breakout thresholds.
+ part_sf->partition_search_breakout_dist_thr = 0;
+ part_sf->partition_search_breakout_rate_thr = 0;
+ part_sf->prune_ext_partition_types_search_level = 0;
+ part_sf->prune_part4_search = 0;
+ part_sf->ml_prune_partition = 0;
+ part_sf->ml_early_term_after_part_split_level = 0;
+ for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) {
+ part_sf->ml_partition_search_breakout_thresh[i] =
+ -1; // -1 means not enabled.
+ }
+ part_sf->simple_motion_search_prune_agg = SIMPLE_AGG_LVL0;
+ part_sf->simple_motion_search_split = 0;
+ part_sf->simple_motion_search_prune_rect = 0;
+ part_sf->simple_motion_search_early_term_none = 0;
+ part_sf->simple_motion_search_reduce_search_steps = 0;
+ part_sf->intra_cnn_based_part_prune_level = 0;
+ part_sf->ext_partition_eval_thresh = BLOCK_8X8;
+ part_sf->rect_partition_eval_thresh = BLOCK_128X128;
+ part_sf->ext_part_eval_based_on_cur_best = 0;
+ part_sf->prune_ext_part_using_split_info = 0;
+ part_sf->prune_rectangular_split_based_on_qidx = 0;
+ part_sf->early_term_after_none_split = 0;
+ part_sf->ml_predict_breakout_level = 0;
+ part_sf->prune_sub_8x8_partition_level = 0;
+ part_sf->simple_motion_search_rect_split = 0;
+ part_sf->reuse_prev_rd_results_for_part_ab = 0;
+ part_sf->reuse_best_prediction_for_part_ab = 0;
+ part_sf->use_best_rd_for_pruning = 0;
+ part_sf->skip_non_sq_part_based_on_none = 0;
+}
+
+// Reset speed features that work for the baseline encoding but block the
+// external partition search.
+void av1_reset_sf_for_ext_part(AV1_COMP *const cpi) {
+ cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions = 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// If the input |features| is NULL, write the tpl stats to a file for each
+// super block. Otherwise, store the tpl stats in |features|.
+// The tpl stats are computed in units of tpl_bsize_1d (16x16).
+// When writing to a text file:
+// The first row contains super block position, super block size,
+// tpl unit length, number of units in the super block.
+// The second row contains the intra prediction cost for each unit.
+// The third row contains the inter prediction cost for each unit.
+// The fourth row contains the motion-compensated dependency cost for each unit.
+static void collect_tpl_stats_sb(const AV1_COMP *const cpi,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col,
+ aom_partition_features_t *features) {
+ const AV1_COMMON *const cm = &cpi->common;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if (gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
+ return;
+ }
+
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cpi->gf_frame_index];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  // If the tpl stats have not been established, return early.
+ if (!tpl_data->ready || gf_group->max_layer_depth_allowed == 0) {
+ if (features != NULL) features->sb_features.tpl_features.available = 0;
+ return;
+ }
+
+ const int tpl_stride = tpl_frame->stride;
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+ const int mi_width =
+ AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+ const int mi_height =
+ AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
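+  // Ceiling division: a partially covered tpl unit at the right/bottom edge
+  // still counts as a full step.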
+ const int col_steps = (mi_width / step) + ((mi_width % step) > 0);
+ const int row_steps = (mi_height / step) + ((mi_height % step) > 0);
+ const int num_blocks = col_steps * row_steps;
+
+ if (features == NULL) {
+ char filename[256];
+ snprintf(filename, sizeof(filename), "%s/tpl_feature_sb%d",
+ cpi->oxcf.partition_info_path, cpi->sb_counter);
+ FILE *pfile = fopen(filename, "w");
+ fprintf(pfile, "%d,%d,%d,%d,%d\n", mi_row, mi_col, bsize,
+ tpl_data->tpl_bsize_1d, num_blocks);
+ int count = 0;
+ for (int row = 0; row < mi_height; row += step) {
+ for (int col = 0; col < mi_width; col += step) {
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+ tpl_data->tpl_stats_block_mis_log2)];
+ fprintf(pfile, "%.0f", (double)this_stats->intra_cost);
+ if (count < num_blocks - 1) fprintf(pfile, ",");
+ ++count;
+ }
+ }
+ fprintf(pfile, "\n");
+ count = 0;
+ for (int row = 0; row < mi_height; row += step) {
+ for (int col = 0; col < mi_width; col += step) {
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+ tpl_data->tpl_stats_block_mis_log2)];
+ fprintf(pfile, "%.0f", (double)this_stats->inter_cost);
+ if (count < num_blocks - 1) fprintf(pfile, ",");
+ ++count;
+ }
+ }
+ fprintf(pfile, "\n");
+ count = 0;
+ for (int row = 0; row < mi_height; row += step) {
+ for (int col = 0; col < mi_width; col += step) {
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+ tpl_data->tpl_stats_block_mis_log2)];
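+        // Fold the unit's motion-compensated dependency rate and distortion
+        // into a single cost via RDCOST.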
+ const int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ fprintf(pfile, "%.0f", (double)mc_dep_delta);
+ if (count < num_blocks - 1) fprintf(pfile, ",");
+ ++count;
+ }
+ }
+ fclose(pfile);
+ } else {
+ features->sb_features.tpl_features.available = 1;
+ features->sb_features.tpl_features.tpl_unit_length = tpl_data->tpl_bsize_1d;
+ features->sb_features.tpl_features.num_units = num_blocks;
+ int count = 0;
+ for (int row = 0; row < mi_height; row += step) {
+ for (int col = 0; col < mi_width; col += step) {
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+ tpl_data->tpl_stats_block_mis_log2)];
+ const int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ features->sb_features.tpl_features.intra_cost[count] =
+ this_stats->intra_cost;
+ features->sb_features.tpl_features.inter_cost[count] =
+ this_stats->inter_cost;
+ features->sb_features.tpl_features.mc_dep_cost[count] = mc_dep_delta;
+ ++count;
+ }
+ }
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
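+// Recursively walks the variable TX-size partition tree of an inter block,
+// updating the txfm_partition CDF (and, under CONFIG_ENTROPY_STATS, the raw
+// counts) at each split/no-split decision and refreshing the above/left
+// transform-size contexts along the way.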
+static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
+ FRAME_COUNTS *counts, TX_SIZE tx_size, int depth,
+ int blk_row, int blk_col,
+ uint8_t allow_update_cdf) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, mbmi->bsize,
+ tx_size);
+ const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+ assert(tx_size > TX_4X4);
+
+ if (depth == MAX_VARTX_DEPTH) {
+    // Max depth reached: fix the transform size here without updating any
+    // counts or CDFs.
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ return;
+ }
+
+ if (tx_size == plane_tx_size) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->txfm_partition[ctx][0];
+#endif
+ if (allow_update_cdf)
+ update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 0, 2);
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->txfm_partition[ctx][1];
+#endif
+ if (allow_update_cdf)
+ update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 1, 2);
+ ++x->txfm_search_info.txb_split_count;
+
+ if (sub_txs == TX_4X4) {
+ mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+ mbmi->tx_size = TX_4X4;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+ return;
+ }
+
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ int offsetr = row;
+ int offsetc = col;
+
+ update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr,
+ blk_col + offsetc, allow_update_cdf);
+ }
+ }
+ }
+}
+
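+// Entry point for the recursion above: tiles the plane block with the
+// largest allowed variable transform size and updates the counts for each
+// max-size unit.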
+static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x,
+ BLOCK_SIZE plane_bsize,
+ FRAME_COUNTS *td_counts,
+ uint8_t allow_update_cdf) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int mi_width = mi_size_wide[plane_bsize];
+ const int mi_height = mi_size_high[plane_bsize];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+
+ for (int idy = 0; idy < mi_height; idy += bh) {
+ for (int idx = 0; idx < mi_width; idx += bw) {
+ update_txfm_count(x, xd, td_counts, max_tx_size, 0, idy, idx,
+ allow_update_cdf);
+ }
+ }
+}
+
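+// Mirrors the tree walk in update_txfm_count(), but only records the chosen
+// transform sizes and refreshes the above/left contexts; no counts or CDFs
+// are updated.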
+static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
+ int blk_col) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (tx_size == plane_tx_size) {
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+
+ } else {
+ if (tx_size == TX_8X8) {
+ mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+ mbmi->tx_size = TX_4X4;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+ return;
+ }
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int row_end =
+ AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+ const int col_end =
+ AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+ for (int row = 0; row < row_end; row += bsh) {
+ const int offsetr = blk_row + row;
+ for (int col = 0; col < col_end; col += bsw) {
+ const int offsetc = blk_col + col;
+ set_txfm_context(xd, sub_txs, offsetr, offsetc);
+ }
+ }
+ }
+}
+
+static void tx_partition_set_contexts(const AV1_COMMON *const cm,
+ MACROBLOCKD *xd, BLOCK_SIZE plane_bsize) {
+ const int mi_width = mi_size_wide[plane_bsize];
+ const int mi_height = mi_size_high[plane_bsize];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+
+ for (int idy = 0; idy < mi_height; idy += bh) {
+ for (int idx = 0; idx < mi_width; idx += bw) {
+ set_txfm_context(xd, max_tx_size, idy, idx);
+ }
+ }
+}
+
+static void update_zeromv_cnt(const AV1_COMP *const cpi,
+ const MB_MODE_INFO *const mi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ if (mi->ref_frame[0] != LAST_FRAME || !is_inter_block(mi) ||
+ mi->segment_id > CR_SEGMENT_ID_BOOST2) {
+ return;
+ }
+ const AV1_COMMON *const cm = &cpi->common;
+ const MV mv = mi->mv[0].as_mv;
+ const int bw = mi_size_wide[bsize] >> 1;
+ const int bh = mi_size_high[bsize] >> 1;
+ const int xmis = AOMMIN((cm->mi_params.mi_cols - mi_col) >> 1, bw);
+ const int ymis = AOMMIN((cm->mi_params.mi_rows - mi_row) >> 1, bh);
+ const int block_index =
+ (mi_row >> 1) * (cm->mi_params.mi_cols >> 1) + (mi_col >> 1);
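+  // Walk the 8x8 units covered by the block: near-zero motion (|mv| < 10 in
+  // 1/8-pel units, i.e. under 1.25 pixels) increments the saturating counter,
+  // anything larger resets it.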
+ for (int y = 0; y < ymis; y++) {
+ for (int x = 0; x < xmis; x++) {
+      // consec_zero_mv is stored at 8x8-block granularity.
+ const int map_offset = block_index + y * (cm->mi_params.mi_cols >> 1) + x;
+ if (abs(mv.row) < 10 && abs(mv.col) < 10) {
+ if (cpi->consec_zero_mv[map_offset] < 255)
+ cpi->consec_zero_mv[map_offset]++;
+ } else {
+ cpi->consec_zero_mv[map_offset] = 0;
+ }
+ }
+ }
+}
+
+static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+ ThreadData *td, TokenExtra **t, RUN_TYPE dry_run,
+ BLOCK_SIZE bsize, int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO **mi_4x4 = xd->mi;
+ MB_MODE_INFO *mbmi = mi_4x4[0];
+ const int seg_skip =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+ const int mis = cm->mi_params.mi_stride;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ const int is_inter = is_inter_block(mbmi);
+
+ // Initialize tx_mode and tx_size_search_method
+ TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ set_tx_size_search_method(
+ cm, &cpi->winner_mode_params, txfm_params,
+ cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ if (!is_inter) {
+ xd->cfl.store_y = store_cfl_required(cm, xd);
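+    // Start from skip_txfm = 1; coding a non-zero coefficient in any plane
+    // clears it.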
+ mbmi->skip_txfm = 1;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ av1_encode_intra_block_plane(cpi, x, bsize, plane, dry_run,
+ cpi->optimize_seg_arr[mbmi->segment_id]);
+ }
+
+    // If there is at least one lossless segment, force the skip for intra
+    // blocks to be 0, to keep write_segment_id() from changing the
+    // segment_id.
+ if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map &&
+ cpi->enc_seg.has_lossless_segment)
+ mbmi->skip_txfm = 0;
+
+ xd->cfl.store_y = 0;
+ if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) {
+ for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) {
+ if (mbmi->palette_mode_info.palette_size[plane] > 0) {
+ if (!dry_run) {
+ av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size,
+ PALETTE_MAP, tile_data->allow_update_cdf,
+ td->counts);
+ } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+ *rate +=
+ av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP);
+ }
+ }
+ }
+ }
+
+ av1_update_intra_mb_txb_context(cpi, td, dry_run, bsize,
+ tile_data->allow_update_cdf);
+ } else {
+ int ref;
+ const int is_compound = has_second_ref(mbmi);
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const YV12_BUFFER_CONFIG *cfg =
+ get_ref_frame_yv12_buf(cm, mbmi->ref_frame[ref]);
+ assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
+ av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
+ xd->block_ref_scale_factors[ref], num_planes);
+ }
+    // The predicted samples of an inter mode (for the luma plane) cannot be
+    // reused if the nonrd_check_partition_split speed feature is enabled,
+    // since in such cases the buffer may not contain the predicted samples of
+    // the best mode.
+ const int start_plane =
+ (x->reuse_inter_pred && (!cpi->sf.rt_sf.nonrd_check_partition_split) &&
+ cm->seq_params->bit_depth == AOM_BITS_8)
+ ? 1
+ : 0;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ start_plane, av1_num_planes(cm) - 1);
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
+ assert(cpi->oxcf.motion_mode_cfg.enable_obmc);
+ av1_build_obmc_inter_predictors_sb(cm, xd);
+ }
+
+#if CONFIG_MISMATCH_DEBUG
+ if (dry_run == OUTPUT_ENABLED) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ int pixel_c, pixel_r;
+ mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
+ pd->subsampling_x, pd->subsampling_y);
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+ mismatch_record_block_pre(pd->dst.buf, pd->dst.stride,
+ cm->current_frame.order_hint, plane, pixel_c,
+ pixel_r, pd->width, pd->height,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+ }
+ }
+#else
+ (void)num_planes;
+#endif
+
+ av1_encode_sb(cpi, x, bsize, dry_run);
+ av1_tokenize_sb_vartx(cpi, td, dry_run, bsize, rate,
+ tile_data->allow_update_cdf);
+ }
+
+ if (!dry_run) {
+ if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi)) td->intrabc_used = 1;
+ if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ !xd->lossless[mbmi->segment_id] && mbmi->bsize > BLOCK_4X4 &&
+ !(is_inter && (mbmi->skip_txfm || seg_skip))) {
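+      // The transform size was signalled for this block: update the
+      // corresponding counts/CDFs (recursive txfm partition for inter,
+      // depth-coded size for intra).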
+ if (is_inter) {
+ tx_partition_count_update(cm, x, bsize, td->counts,
+ tile_data->allow_update_cdf);
+ } else {
+ if (mbmi->tx_size != max_txsize_rect_lookup[bsize])
+ ++x->txfm_search_info.txb_split_count;
+ if (block_signals_txsize(bsize)) {
+ const int tx_size_ctx = get_tx_size_context(xd);
+ const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+ const int depth = tx_size_to_depth(mbmi->tx_size, bsize);
+ const int max_depths = bsize_to_max_depth(bsize);
+
+ if (tile_data->allow_update_cdf)
+ update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
+ depth, max_depths + 1);
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth];
+#endif
+ }
+ }
+ assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi)));
+ } else {
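+      // The transform size is not signalled: derive it (lossless forces
+      // TX_4X4) and copy it to every mi unit covered by the block.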
+ int i, j;
+ TX_SIZE intra_tx_size;
+ // The new intra coding scheme requires no change of transform size
+ if (is_inter) {
+ if (xd->lossless[mbmi->segment_id]) {
+ intra_tx_size = TX_4X4;
+ } else {
+ intra_tx_size =
+ tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type);
+ }
+ } else {
+ intra_tx_size = mbmi->tx_size;
+ }
+
+ const int cols = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_width);
+ const int rows = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_height);
+ for (j = 0; j < rows; j++) {
+ for (i = 0; i < cols; i++) mi_4x4[mis * j + i]->tx_size = intra_tx_size;
+ }
+
+ if (intra_tx_size != max_txsize_rect_lookup[bsize])
+ ++x->txfm_search_info.txb_split_count;
+ }
+ }
+
+ if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ block_signals_txsize(mbmi->bsize) && is_inter &&
+ !(mbmi->skip_txfm || seg_skip) && !xd->lossless[mbmi->segment_id]) {
+ if (dry_run) tx_partition_set_contexts(cm, xd, bsize);
+ } else {
+ TX_SIZE tx_size = mbmi->tx_size;
+ // The new intra coding scheme requires no change of transform size
+ if (is_inter) {
+ if (xd->lossless[mbmi->segment_id]) {
+ tx_size = TX_4X4;
+ } else {
+ tx_size = tx_size_from_tx_mode(bsize, txfm_params->tx_mode_search_type);
+ }
+ } else {
+ tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
+ }
+ mbmi->tx_size = tx_size;
+ set_txfm_ctxs(tx_size, xd->width, xd->height,
+ (mbmi->skip_txfm || seg_skip) && is_inter_block(mbmi), xd);
+ }
+
+ if (is_inter_block(mbmi) && !xd->is_chroma_ref && is_cfl_allowed(xd)) {
+ cfl_store_block(xd, mbmi->bsize, mbmi->tx_size);
+ }
+ if (!dry_run) {
+ if (cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->svc.temporal_layer_id == 0 &&
+ cpi->sf.rt_sf.use_temporal_noise_estimate &&
+ (!cpi->ppi->use_svc ||
+ (cpi->ppi->use_svc &&
+ !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)))
+ update_zeromv_cnt(cpi, mbmi, mi_row, mi_col, bsize);
+ }
+}
+
+static void setup_block_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ AQ_MODE aq_mode, MB_MODE_INFO *mbmi) {
+ x->rdmult = cpi->rd.RDMULT;
+
+ if (aq_mode != NO_AQ) {
+ assert(mbmi != NULL);
+ if (aq_mode == VARIANCE_AQ) {
+ if (cpi->vaq_refresh) {
+ const int energy = bsize <= BLOCK_16X16
+ ? x->mb_energy
+ : av1_log_block_var(cpi, x, bsize);
+ mbmi->segment_id = energy;
+ }
+ x->rdmult = set_rdmult(cpi, x, mbmi->segment_id);
+ } else if (aq_mode == COMPLEXITY_AQ) {
+ x->rdmult = set_rdmult(cpi, x, mbmi->segment_id);
+ } else if (aq_mode == CYCLIC_REFRESH_AQ) {
+ // If segment is boosted, use rdmult for that segment.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
+ x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+ }
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->common.delta_q_info.delta_q_present_flag &&
+ !cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ x->rdmult = av1_get_cb_rdmult(cpi, x, bsize, mi_row, mi_col);
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM) {
+ av1_set_ssim_rdmult(cpi, &x->errorperbit, bsize, mi_row, mi_col,
+ &x->rdmult);
+ }
+#if CONFIG_SALIENCY_MAP
+ else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_SALIENCY_MAP) {
+ av1_set_saliency_map_vmaf_rdmult(cpi, &x->errorperbit,
+ cpi->common.seq_params->sb_size, mi_row,
+ mi_col, &x->rdmult);
+ }
+#endif
+#if CONFIG_TUNE_VMAF
+ else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING ||
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_MAX_GAIN ||
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ av1_set_vmaf_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
+ }
+#endif
+#if CONFIG_TUNE_BUTTERAUGLI
+ else if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) {
+ av1_set_butteraugli_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
+ }
+#endif
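+  // intra_sb_rdmult_modifier is a Q7 fixed-point scale factor, hence the
+  // right shift by 7.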
+ if (cpi->oxcf.mode == ALLINTRA) {
+ x->rdmult = (int)(((int64_t)x->rdmult * x->intra_sb_rdmult_modifier) >> 7);
+ }
+
+ // Check to make sure that the adjustments above have not caused the
+ // rd multiplier to be truncated to 0.
+ x->rdmult = (x->rdmult > 0) ? x->rdmult : 1;
+}
+
+void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi,
+ const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+
+ set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+ mi_row, mi_col);
+
+ set_entropy_context(xd, mi_row, mi_col, num_planes);
+ xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ // Set up destination pointers.
+ av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
+ num_planes);
+
+ // Set up limit values for MV components.
+  // MVs beyond the range do not produce new/different prediction blocks.
+ av1_set_mv_limits(&cm->mi_params, &x->mv_limits, mi_row, mi_col, mi_height,
+ mi_width, cpi->oxcf.border_in_pixels);
+
+ set_plane_n4(xd, mi_width, mi_height, num_planes);
+
+ // Set up distance of MB to edge of frame in 1/8th pel units.
+ assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+ set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
+ cm->mi_params.mi_rows, cm->mi_params.mi_cols);
+
+ // Set up source buffers.
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+
+  // Required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs().
+ xd->tile = *tile;
+}
+
+void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi;
+
+ av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+
+ // Setup segment ID.
+ mbmi = xd->mi[0];
+ mbmi->segment_id = 0;
+ if (seg->enabled) {
+    if (!cpi->vaq_refresh) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map;
+ mbmi->segment_id =
+ map ? get_segment_id(&cm->mi_params, map, bsize, mi_row, mi_col) : 0;
+ }
+ av1_init_plane_quantizers(cpi, x, mbmi->segment_id, 0);
+ }
+#ifndef NDEBUG
+ x->last_set_offsets_loc.mi_row = mi_row;
+ x->last_set_offsets_loc.mi_col = mi_col;
+ x->last_set_offsets_loc.bsize = bsize;
+#endif // NDEBUG
+}
+
+/*!\brief Hybrid intra mode search.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * \callergraph
+ * This is the top-level function for mode search on intra frames in the
+ * non-RD optimized case. Depending on the speed feature settings and the
+ * block size, it calls either the non-RD or the RD-optimized intra mode
+ * search.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the data for
+ * the current macroblock
+ * \param[in] rd_cost Struct to keep track of the RD information
+ * \param[in] bsize Current block size
+ * \param[in] ctx Structure to hold a snapshot of the coding context
+ * during the mode picking process
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+
+static AOM_INLINE void hybrid_intra_mode_search(AV1_COMP *cpi,
+ MACROBLOCK *const x,
+ RD_STATS *rd_cost,
+ BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx) {
+ int use_rdopt = 0;
+ const int hybrid_intra_pickmode = cpi->sf.rt_sf.hybrid_intra_pickmode;
+ // Use rd pick for intra mode search based on block size and variance.
+ if (hybrid_intra_pickmode && bsize < BLOCK_16X16) {
+ unsigned int var_thresh[3] = { 0, 101, 201 };
+ assert(hybrid_intra_pickmode <= 3);
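+    // Higher hybrid_intra_pickmode levels demand higher source variance
+    // before switching from the non-RD to the RD intra search.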
+ if (x->source_variance >= var_thresh[hybrid_intra_pickmode - 1])
+ use_rdopt = 1;
+ }
+
+ if (use_rdopt)
+ av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
+ else
+ av1_nonrd_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
+}
+
+// For real-time/allintra row-mt enabled multi-threaded encoding with the cost
+// update frequency set to COST_UPD_TILE/COST_UPD_OFF, the tile context is not
+// updated at the superblock level. Thus, the encoding of the top-right
+// superblock need not be complete before the tile context is updated. However,
+// when encoding a block whose right edge is also the superblock edge, intra
+// and inter mode evaluation (ref mv list population) does require the encoding
+// of the top-right superblock to be complete. So we delay the thread wait
+// until the data from the top-right superblock region is actually needed.
+static AOM_INLINE void wait_for_top_right_sb(
+ AV1EncRowMultiThreadInfo *enc_row_mt, AV1EncRowMultiThreadSync *row_mt_sync,
+ TileInfo *tile_info, BLOCK_SIZE sb_size, int sb_mi_size_log2,
+ BLOCK_SIZE bsize, int mi_row, int mi_col) {
+ const int sb_size_in_mi = mi_size_wide[sb_size];
+ const int bw_in_mi = mi_size_wide[bsize];
+ const int blk_row_in_sb = mi_row & (sb_size_in_mi - 1);
+ const int blk_col_in_sb = mi_col & (sb_size_in_mi - 1);
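+  // The block touches the top-right superblock region only if it sits in the
+  // top row of its superblock and its right edge reaches the superblock
+  // boundary.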
+ const int top_right_block_in_sb =
+ (blk_row_in_sb == 0) && (blk_col_in_sb + bw_in_mi >= sb_size_in_mi);
+
+  // Don't wait if the block is not the top-right block in the superblock.
+ if (!top_right_block_in_sb) return;
+
+ // Wait for the top-right superblock to finish encoding.
+ const int sb_row_in_tile =
+ (mi_row - tile_info->mi_row_start) >> sb_mi_size_log2;
+ const int sb_col_in_tile =
+ (mi_col - tile_info->mi_col_start) >> sb_mi_size_log2;
+
+ enc_row_mt->sync_read_ptr(row_mt_sync, sb_row_in_tile, sb_col_in_tile);
+}
+
+/*!\brief Interface for AV1 mode search for an individual coding block
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Searches prediction modes, transform, and coefficient coding modes for an
+ * individual coding block. This function is the top-level interface that
+ * directs the encoder to the proper mode search function, among these
+ * implemented for inter/intra + rd/non-rd + non-skip segment/skip segment.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during
+ * encoding
+ * \param[in] x Pointer to structure holding all the data for
+ * the current macroblock
+ * \param[in] mi_row Row coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] mi_col Column coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] rd_cost Pointer to structure holding rate and distortion
+ * stats for the current block
+ * \param[in] partition Partition mode of the parent block
+ * \param[in] bsize Current block size
+ * \param[in] ctx Pointer to structure holding coding contexts and
+ * chosen modes for the current block
+ * \param[in] best_rd Upper bound of rd cost of a valid partition
+ *
+ * \remark Nothing is returned. Instead, the chosen modes and contexts
+ * necessary for reconstruction are stored in ctx, and the rate-distortion
+ * stats are stored in rd_cost. If no valid mode leads to rd_cost <= best_rd,
+ * the status is signalled by an INT64_MAX rd_cost->rdcost.
+ */
+static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ RD_STATS *rd_cost, PARTITION_TYPE partition,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ RD_STATS best_rd) {
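+  // A negative best_rd.rdcost indicates the partition has already been
+  // pruned; mark the stats invalid and return early.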
+ if (cpi->sf.part_sf.use_best_rd_for_pruning && best_rd.rdcost < 0) {
+ ctx->rd_stats.rdcost = INT64_MAX;
+ ctx->rd_stats.skip_txfm = 0;
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
+
+ av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
+
+ if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab &&
+ ctx->rd_mode_is_ready) {
+ assert(ctx->mic.bsize == bsize);
+ assert(ctx->mic.partition == partition);
+ rd_cost->rate = ctx->rd_stats.rate;
+ rd_cost->dist = ctx->rd_stats.dist;
+ rd_cost->rdcost = ctx->rd_stats.rdcost;
+ return;
+ }
+
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+ int i;
+
+ // This is only needed for real time/allintra row-mt enabled multi-threaded
+ // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF.
+ wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync,
+ &tile_data->tile_info, cm->seq_params->sb_size,
+ cm->seq_params->mib_size_log2, bsize, mi_row, mi_col);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_pick_sb_modes_time);
+#endif
+
+ mbmi = xd->mi[0];
+ mbmi->bsize = bsize;
+ mbmi->partition = partition;
+
+#if CONFIG_RD_DEBUG
+ mbmi->mi_row = mi_row;
+ mbmi->mi_col = mi_col;
+#endif
+
+ // Sets up the tx_type_map buffer in MACROBLOCKD.
+ xd->tx_type_map = txfm_info->tx_type_map_;
+ xd->tx_type_map_stride = mi_size_wide[bsize];
+
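+  // Point the per-plane coefficient buffers at the scratch storage owned by
+  // this PICK_MODE_CONTEXT so the winning mode's coefficients survive the
+  // search.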
+ for (i = 0; i < num_planes; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ p[i].dqcoeff = ctx->dqcoeff[i];
+ p[i].eobs = ctx->eobs[i];
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ }
+
+ for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+
+ ctx->skippable = 0;
+  // Set to zero to make sure we do not use the previously encoded frame's
+  // stats.
+ mbmi->skip_txfm = 0;
+ // Reset skip mode flag.
+ mbmi->skip_mode = 0;
+
+ x->source_variance = av1_get_perpixel_variance_facade(
+ cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
+
+ // Initialize default mode evaluation params
+ set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ const int orig_rdmult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
+ // Set error per bit for current rdmult
+ av1_set_error_per_bit(&x->errorperbit, x->rdmult);
+ av1_rd_cost_update(x->rdmult, &best_rd);
+
+  // Setting best_rd.rdcost to INT64_MAX keeps the encoder from using any
+  // previous rdcost information in the following mode search.
+  // Disabling the feature may yield some coding gain at the cost of encoder
+  // slowdown.
+ if (!cpi->sf.part_sf.use_best_rd_for_pruning) {
+ av1_invalid_rd_stats(&best_rd);
+ }
+
+ // Find best coding mode & reconstruct the MB so it is available
+ // as a predictor for MBs that follow in the SB
+ if (frame_is_intra_only(cm)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_rd_pick_intra_mode_sb_time);
+#endif
+ av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd.rdcost);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_rd_pick_intra_mode_sb_time);
+#endif
+ } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_rd_pick_inter_mode_sb_time);
+#endif
+ if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
+ rd_cost, bsize, ctx, best_rd.rdcost);
+ } else {
+ av1_rd_pick_inter_mode(cpi, tile_data, x, rd_cost, bsize, ctx,
+ best_rd.rdcost);
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_rd_pick_inter_mode_sb_time);
+#endif
+ }
+
+  // Examine the resulting rate and, for AQ mode 2 (COMPLEXITY_AQ), make a
+  // segment choice.
+ if (rd_cost->rate != INT_MAX && aq_mode == COMPLEXITY_AQ &&
+ bsize >= BLOCK_16X16) {
+ av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
+ }
+
+ x->rdmult = orig_rdmult;
+
+  // TODO(jingning) The rate-distortion optimization flow needs to be
+  // refactored to provide proper exit/return handling.
+ if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX;
+
+ ctx->rd_stats.rate = rd_cost->rate;
+ ctx->rd_stats.dist = rd_cost->dist;
+ ctx->rd_stats.rdcost = rd_cost->rdcost;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_pick_sb_modes_time);
+#endif
+}
+
+static void update_stats(const AV1_COMMON *const cm, ThreadData *td) {
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ const BLOCK_SIZE bsize = mbmi->bsize;
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ const int seg_ref_active =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+
+ if (current_frame->skip_mode_info.skip_mode_flag && !seg_ref_active &&
+ is_comp_ref_allowed(bsize)) {
+ const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+#if CONFIG_ENTROPY_STATS
+ td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++;
+#endif
+ update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2);
+ }
+
+ if (!mbmi->skip_mode && !seg_ref_active) {
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+#if CONFIG_ENTROPY_STATS
+ td->counts->skip_txfm[skip_ctx][mbmi->skip_txfm]++;
+#endif
+ update_cdf(fc->skip_txfm_cdfs[skip_ctx], mbmi->skip_txfm, 2);
+ }
+
+#if CONFIG_ENTROPY_STATS
+ // delta quant applies to both intra and inter
+ const int super_block_upper_left =
+ ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) &&
+ ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0);
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ if (delta_q_info->delta_q_present_flag &&
+ (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) &&
+ super_block_upper_left) {
+ const int dq = (mbmi->current_qindex - xd->current_base_qindex) /
+ delta_q_info->delta_q_res;
+ const int absdq = abs(dq);
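+    // |dq| is coded as truncated unary: one "1" bin per step, terminated by
+    // a "0" bin when absdq < DELTA_Q_SMALL.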
+ for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) {
+ td->counts->delta_q[i][1]++;
+ }
+ if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++;
+ if (delta_q_info->delta_lf_present_flag) {
+ if (delta_q_info->delta_lf_multi) {
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ const int delta_lf = (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
+ delta_q_info->delta_lf_res;
+ const int abs_delta_lf = abs(delta_lf);
+ for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+ td->counts->delta_lf_multi[lf_id][i][1]++;
+ }
+ if (abs_delta_lf < DELTA_LF_SMALL)
+ td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++;
+ }
+ } else {
+ const int delta_lf =
+ (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
+ delta_q_info->delta_lf_res;
+ const int abs_delta_lf = abs(delta_lf);
+ for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+ td->counts->delta_lf[i][1]++;
+ }
+ if (abs_delta_lf < DELTA_LF_SMALL)
+ td->counts->delta_lf[abs_delta_lf][0]++;
+ }
+ }
+ }
+#endif
+
+ if (!is_inter_block(mbmi)) {
+ av1_sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi,
+ frame_is_intra_only(cm));
+ }
+
+ if (av1_allow_intrabc(cm)) {
+ const int is_intrabc = is_intrabc_block(mbmi);
+ update_cdf(fc->intrabc_cdf, is_intrabc, 2);
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->intrabc[is_intrabc];
+#endif // CONFIG_ENTROPY_STATS
+ if (is_intrabc) {
+ const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ const int_mv dv_ref = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
+ av1_update_mv_stats(&mbmi->mv[0].as_mv, &dv_ref.as_mv, &fc->ndvc,
+ MV_SUBPEL_NONE);
+ }
+ }
+
+ if (frame_is_intra_only(cm) || mbmi->skip_mode) return;
+
+ FRAME_COUNTS *const counts = td->counts;
+ const int inter_block = is_inter_block(mbmi);
+
+ if (!seg_ref_active) {
+#if CONFIG_ENTROPY_STATS
+ counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
+#endif
+ update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)],
+ inter_block, 2);
+    // If the segment reference feature is enabled, only a single reference
+    // frame is allowed for the segment, so exclude it from the reference
+    // frame counts used to work out probabilities.
+ if (inter_block) {
+ const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
+ if (current_frame->reference_mode == REFERENCE_MODE_SELECT) {
+ if (is_comp_ref_allowed(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ counts->comp_inter[av1_get_reference_mode_context(xd)]
+ [has_second_ref(mbmi)]++;
+#endif // CONFIG_ENTROPY_STATS
+ update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi), 2);
+ }
+ }
+
+ if (has_second_ref(mbmi)) {
+ const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
+ ? UNIDIR_COMP_REFERENCE
+ : BIDIR_COMP_REFERENCE;
+ update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type,
+ COMP_REFERENCE_TYPES);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref_type[av1_get_comp_reference_type_context(xd)]
+ [comp_ref_type]++;
+#endif // CONFIG_ENTROPY_STATS
+
+ if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+ const int bit = (ref0 == BWDREF_FRAME);
+ update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+ counts
+ ->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0][bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (!bit) {
+ const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME);
+ update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1]
+ [bit1]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (bit1) {
+ update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd),
+ ref1 == GOLDEN_FRAME, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)][2]
+ [ref1 == GOLDEN_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+ } else {
+ const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
+ update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (!bit) {
+ update_cdf(av1_get_pred_cdf_comp_ref_p1(xd), ref0 == LAST2_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1]
+ [ref0 == LAST2_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ } else {
+ update_cdf(av1_get_pred_cdf_comp_ref_p2(xd), ref0 == GOLDEN_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2]
+ [ref0 == GOLDEN_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd), ref1 == ALTREF_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0]
+ [ref1 == ALTREF_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (ref1 != ALTREF_FRAME) {
+ update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd),
+ ref1 == ALTREF2_FRAME, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1]
+ [ref1 == ALTREF2_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+ } else {
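+        // Single reference: signalled with a binary tree whose first bin
+        // separates {LAST, LAST2, LAST3, GOLDEN} from {BWDREF, ALTREF2,
+        // ALTREF}.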
+ const int bit = (ref0 >= BWDREF_FRAME);
+ update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (bit) {
+ assert(ref0 <= ALTREF_FRAME);
+ update_cdf(av1_get_pred_cdf_single_ref_p2(xd), ref0 == ALTREF_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
+ [ref0 == ALTREF_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (ref0 != ALTREF_FRAME) {
+ update_cdf(av1_get_pred_cdf_single_ref_p6(xd),
+ ref0 == ALTREF2_FRAME, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5]
+ [ref0 == ALTREF2_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ } else {
+ const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
+ update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (!bit1) {
+ update_cdf(av1_get_pred_cdf_single_ref_p4(xd), ref0 != LAST_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3]
+ [ref0 != LAST_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ } else {
+ update_cdf(av1_get_pred_cdf_single_ref_p5(xd), ref0 != LAST3_FRAME,
+ 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4]
+ [ref0 != LAST3_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+ }
+
+ if (cm->seq_params->enable_interintra_compound &&
+ is_interintra_allowed(mbmi)) {
+ const int bsize_group = size_group_lookup[bsize];
+ if (mbmi->ref_frame[1] == INTRA_FRAME) {
+#if CONFIG_ENTROPY_STATS
+ counts->interintra[bsize_group][1]++;
+#endif
+ update_cdf(fc->interintra_cdf[bsize_group], 1, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
+#endif
+ update_cdf(fc->interintra_mode_cdf[bsize_group],
+ mbmi->interintra_mode, INTERINTRA_MODES);
+ if (av1_is_wedge_used(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
+#endif
+ update_cdf(fc->wedge_interintra_cdf[bsize],
+ mbmi->use_wedge_interintra, 2);
+ if (mbmi->use_wedge_interintra) {
+#if CONFIG_ENTROPY_STATS
+ counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++;
+#endif
+ update_cdf(fc->wedge_idx_cdf[bsize], mbmi->interintra_wedge_index,
+ 16);
+ }
+ }
+ } else {
+#if CONFIG_ENTROPY_STATS
+ counts->interintra[bsize_group][0]++;
+#endif
+ update_cdf(fc->interintra_cdf[bsize_group], 0, 2);
+ }
+ }
+
+ const MOTION_MODE motion_allowed =
+ cm->features.switchable_motion_mode
+ ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+ cm->features.allow_warped_motion)
+ : SIMPLE_TRANSLATION;
+ if (mbmi->ref_frame[1] != INTRA_FRAME) {
+ if (motion_allowed == WARPED_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+ counts->motion_mode[bsize][mbmi->motion_mode]++;
+#endif
+ update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode,
+ MOTION_MODES);
+ } else if (motion_allowed == OBMC_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+ counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
+#endif
+ update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL, 2);
+ }
+ }
+
+ if (has_second_ref(mbmi)) {
+ assert(current_frame->reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode) &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION);
+
+ const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+ cm->seq_params->enable_masked_compound;
+ if (masked_compound_used) {
+ const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+#if CONFIG_ENTROPY_STATS
+ ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx];
+#endif
+ update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx],
+ mbmi->comp_group_idx, 2);
+ }
+
+ if (mbmi->comp_group_idx == 0) {
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+#if CONFIG_ENTROPY_STATS
+ ++counts->compound_index[comp_index_ctx][mbmi->compound_idx];
+#endif
+ update_cdf(fc->compound_index_cdf[comp_index_ctx], mbmi->compound_idx,
+ 2);
+ } else {
+ assert(masked_compound_used);
+ if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->compound_type[bsize][mbmi->interinter_comp.type -
+ COMPOUND_WEDGE];
+#endif
+ update_cdf(fc->compound_type_cdf[bsize],
+ mbmi->interinter_comp.type - COMPOUND_WEDGE,
+ MASKED_COMPOUND_TYPES);
+ }
+ }
+ }
+ if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+ if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+ counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++;
+#endif
+ update_cdf(fc->wedge_idx_cdf[bsize],
+ mbmi->interinter_comp.wedge_index, 16);
+ }
+ }
+ }
+ }
+
+ if (inter_block && cm->features.interp_filter == SWITCHABLE &&
+ av1_is_interp_needed(xd)) {
+ update_filter_type_cdf(xd, mbmi, cm->seq_params->enable_dual_filter);
+ }
+ if (inter_block &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const PREDICTION_MODE mode = mbmi->mode;
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+ if (has_second_ref(mbmi)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+#endif
+ update_cdf(fc->inter_compound_mode_cdf[mode_ctx],
+ INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES);
+ } else {
+ av1_update_inter_mode_stats(fc, counts, mode, mode_ctx);
+ }
+
+ const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
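+    // The DRL (dynamic reference list) index is coded as a series of binary
+    // decisions: for each candidate slot, signal whether the chosen
+    // ref_mv_idx lies beyond it.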
+ if (new_mv) {
+ const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ for (int idx = 0; idx < 2; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ const uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+ update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx, 2);
+#if CONFIG_ENTROPY_STATS
+ ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
+#endif
+ if (mbmi->ref_mv_idx == idx) break;
+ }
+ }
+ }
+
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+ const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ for (int idx = 1; idx < 3; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ const uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+ update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx - 1, 2);
+#if CONFIG_ENTROPY_STATS
+ ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
+#endif
+ if (mbmi->ref_mv_idx == idx - 1) break;
+ }
+ }
+ }
+ if (have_newmv_in_inter_mode(mbmi->mode)) {
+ const int allow_hp = cm->features.cur_frame_force_integer_mv
+ ? MV_SUBPEL_NONE
+ : cm->features.allow_high_precision_mv;
+ if (new_mv) {
+ for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+ const int_mv ref_mv = av1_get_ref_mv(x, ref);
+ av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+ allow_hp);
+ }
+ } else if (mbmi->mode == NEAREST_NEWMV || mbmi->mode == NEAR_NEWMV) {
+ const int ref = 1;
+ const int_mv ref_mv = av1_get_ref_mv(x, ref);
+ av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+ allow_hp);
+ } else if (mbmi->mode == NEW_NEARESTMV || mbmi->mode == NEW_NEARMV) {
+ const int ref = 0;
+ const int_mv ref_mv = av1_get_ref_mv(x, ref);
+ av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc,
+ allow_hp);
+ }
+ }
+ }
+}
+
+/*!\brief Reconstructs an individual coding block
+ *
+ * \ingroup partition_search
+ * Reconstructs an individual coding block by applying the chosen modes stored
+ * in ctx; it also updates the mode counts and entropy models.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during encoding
+ * \param[in] td Pointer to thread data
+ * \param[in] tp Pointer to the starting token
+ * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE
+ * \param[in] mi_col Column coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] dry_run A code indicating whether it is part of the final
+ * pass for reconstructing the superblock
+ * \param[in] bsize Current block size
+ * \param[in] partition Partition mode of the parent block
+ * \param[in] ctx Pointer to structure holding coding contexts and the
+ * chosen modes for the current block
+ * \param[in] rate Pointer to the total rate for the current block
+ *
+ * \remark Nothing is returned. Instead, reconstructions (w/o in-loop filters)
+ * will be updated in the pixel buffers in td->mb.e_mbd. Also, the chosen modes
+ * will be stored in the MB_MODE_INFO buffer td->mb.e_mbd.mi[0].
+ */
+static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+ ThreadData *td, TokenExtra **tp, int mi_row, int mi_col,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ PARTITION_TYPE partition, PICK_MODE_CONTEXT *const ctx,
+ int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int subsampling_x = cm->seq_params->subsampling_x;
+ const int subsampling_y = cm->seq_params->subsampling_y;
+
+ av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+ const int origin_mult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ mbmi->partition = partition;
+ av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+
+ if (!dry_run) {
+ set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y],
+ x->cb_offset[PLANE_TYPE_UV]);
+ assert(x->cb_offset[PLANE_TYPE_Y] <
+ (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]));
+ assert(x->cb_offset[PLANE_TYPE_UV] <
+ ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >>
+ (subsampling_x + subsampling_y)));
+ }
+
+ encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
+
+ if (!dry_run) {
+ update_cb_offsets(x, bsize, subsampling_x, subsampling_y);
+ if (bsize == cpi->common.seq_params->sb_size && mbmi->skip_txfm == 1 &&
+ cm->delta_q_info.delta_lf_present_flag) {
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
+ mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
+ mbmi->delta_lf_from_base = xd->delta_lf_from_base;
+ }
+ if (has_second_ref(mbmi)) {
+ if (mbmi->compound_idx == 0 ||
+ mbmi->interinter_comp.type == COMPOUND_AVERAGE)
+ mbmi->comp_group_idx = 0;
+ else
+ mbmi->comp_group_idx = 1;
+ }
+
+ // delta quant applies to both intra and inter
+ const int super_block_upper_left =
+ ((mi_row & (cm->seq_params->mib_size - 1)) == 0) &&
+ ((mi_col & (cm->seq_params->mib_size - 1)) == 0);
+ const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+ if (delta_q_info->delta_q_present_flag &&
+ (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) &&
+ super_block_upper_left) {
+ xd->current_base_qindex = mbmi->current_qindex;
+ if (delta_q_info->delta_lf_present_flag) {
+ if (delta_q_info->delta_lf_multi) {
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
+ }
+ } else {
+ xd->delta_lf_from_base = mbmi->delta_lf_from_base;
+ }
+ }
+ }
+
+ RD_COUNTS *rdc = &td->rd_counts;
+ if (mbmi->skip_mode) {
+ assert(!frame_is_intra_only(cm));
+ rdc->skip_mode_used_flag = 1;
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+ assert(has_second_ref(mbmi));
+ rdc->compound_ref_used_flag = 1;
+ }
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ } else {
+ const int seg_ref_active =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+ if (!seg_ref_active) {
+        // If the segment reference feature is enabled, only a single
+        // reference frame is allowed for the segment, so exclude it from the
+        // reference frame counts used to work out probabilities.
+ if (is_inter_block(mbmi)) {
+ av1_collect_neighbors_ref_counts(xd);
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+ if (has_second_ref(mbmi)) {
+ // This flag is also updated for 4x4 blocks
+ rdc->compound_ref_used_flag = 1;
+ }
+ }
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ }
+ }
+ }
+
+ if (tile_data->allow_update_cdf) update_stats(&cpi->common, td);
+
+ // Gather obmc and warped motion count to update the probability.
+ if ((cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+ cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) ||
+ (cm->features.allow_warped_motion &&
+ cpi->sf.inter_sf.prune_warped_prob_thresh > 0)) {
+ const int inter_block = is_inter_block(mbmi);
+ const int seg_ref_active =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+ if (!seg_ref_active && inter_block) {
+ const MOTION_MODE motion_allowed =
+ cm->features.switchable_motion_mode
+ ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+ cm->features.allow_warped_motion)
+ : SIMPLE_TRANSLATION;
+
+ if (mbmi->ref_frame[1] != INTRA_FRAME) {
+ if (motion_allowed >= OBMC_CAUSAL) {
+ td->rd_counts.obmc_used[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
+ }
+ if (motion_allowed == WARPED_CAUSAL) {
+ td->rd_counts.warped_used[mbmi->motion_mode == WARPED_CAUSAL]++;
+ }
+ }
+ }
+ }
+ }
+ // TODO(Ravi/Remya): Move this copy function to a better logical place
+ // This function will copy the best mode information from block
+ // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This
+ // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during
+ // bitstream preparation.
+ av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, &x->mbmi_ext,
+ av1_ref_frame_type(xd->mi[0]->ref_frame));
+ x->rdmult = origin_mult;
+}
+
+/*!\brief Reconstructs a partition (may contain multiple coding blocks)
+ *
+ * \ingroup partition_search
+ * Reconstructs a sub-partition of the superblock by applying the chosen modes
+ * and partition trees stored in pc_tree.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] td Pointer to thread data
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during encoding
+ * \param[in] tp Pointer to the starting token
+ * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE
+ * \param[in] mi_col Column coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] dry_run A code indicating whether it is part of the final
+ * pass for reconstructing the superblock
+ * \param[in] bsize Current block size
+ * \param[in] pc_tree Pointer to the PC_TREE node storing the picked
+ * partitions and mode info for the current block
+ * \param[in] rate Pointer to the total rate for the current block
+ *
+ * \remark Nothing is returned. Instead, reconstructions (w/o in-loop filters)
+ * will be updated in the pixel buffers in td->mb.e_mbd.
+ */
+static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+ int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ PC_TREE *pc_tree, int *rate) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int hbs = mi_size_wide[bsize] / 2;
+ const int is_partition_root = bsize >= BLOCK_8X8;
+ const int ctx = is_partition_root
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : -1;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+#if !CONFIG_REALTIME_ONLY
+ int quarter_step = mi_size_wide[bsize] / 4;
+ int i;
+ BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+#endif
+
+ if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+ if (subsize == BLOCK_INVALID) return;
+
+ if (!dry_run && ctx >= 0) {
+ const int has_rows = (mi_row + hbs) < mi_params->mi_rows;
+ const int has_cols = (mi_col + hbs) < mi_params->mi_cols;
+
+ if (has_rows && has_cols) {
+#if CONFIG_ENTROPY_STATS
+ td->counts->partition[ctx][partition]++;
+#endif
+
+ if (tile_data->allow_update_cdf) {
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ update_cdf(fc->partition_cdf[ctx], partition,
+ partition_cdf_length(bsize));
+ }
+ }
+ }
+
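+  // Encode each sub-block according to the picked partition type; the AB and
+  // 4-way partition types are only compiled in for non-realtime builds.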
+ switch (partition) {
+ case PARTITION_NONE:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, pc_tree->none, rate);
+ break;
+ case PARTITION_VERT:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, pc_tree->vertical[0], rate);
+ if (mi_col + hbs < mi_params->mi_cols) {
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ partition, pc_tree->vertical[1], rate);
+ }
+ break;
+ case PARTITION_HORZ:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, pc_tree->horizontal[0], rate);
+ if (mi_row + hbs < mi_params->mi_rows) {
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ partition, pc_tree->horizontal[1], rate);
+ }
+ break;
+ case PARTITION_SPLIT:
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize,
+ pc_tree->split[0], rate);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ pc_tree->split[1], rate);
+ encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ pc_tree->split[2], rate);
+ encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + hbs, dry_run,
+ subsize, pc_tree->split[3], rate);
+ break;
+
+#if !CONFIG_REALTIME_ONLY
+ case PARTITION_HORZ_A:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
+ partition, pc_tree->horizontala[0], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+ partition, pc_tree->horizontala[1], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ partition, pc_tree->horizontala[2], rate);
+ break;
+ case PARTITION_HORZ_B:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, pc_tree->horizontalb[0], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+ partition, pc_tree->horizontalb[1], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+ bsize2, partition, pc_tree->horizontalb[2], rate);
+ break;
+ case PARTITION_VERT_A:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
+ partition, pc_tree->verticala[0], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+ partition, pc_tree->verticala[1], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ partition, pc_tree->verticala[2], rate);
+
+ break;
+ case PARTITION_VERT_B:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, pc_tree->verticalb[0], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+ partition, pc_tree->verticalb[1], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+ bsize2, partition, pc_tree->verticalb[2], rate);
+ break;
+ case PARTITION_HORZ_4:
+ for (i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ int this_mi_row = mi_row + i * quarter_step;
+ if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
+
+ encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize,
+ partition, pc_tree->horizontal4[i], rate);
+ }
+ break;
+ case PARTITION_VERT_4:
+ for (i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ int this_mi_col = mi_col + i * quarter_step;
+ if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
+ encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize,
+ partition, pc_tree->vertical4[i], rate);
+ }
+ break;
+#endif
+ default: assert(0 && "Invalid partition type."); break;
+ }
+
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+}
+
+static AOM_INLINE int is_adjust_var_based_part_enabled(
+ AV1_COMMON *const cm, const PARTITION_SPEED_FEATURES *const part_sf,
+ BLOCK_SIZE bsize) {
+ if (part_sf->partition_search_type != VAR_BASED_PARTITION) return 0;
+ if (part_sf->adjust_var_based_rd_partitioning == 0 ||
+ part_sf->adjust_var_based_rd_partitioning > 2)
+ return 0;
+
+ if (bsize <= BLOCK_32X32) return 1;
+ if (part_sf->adjust_var_based_rd_partitioning == 2) {
+ const int is_larger_qindex = cm->quant_params.base_qindex > 190;
+ const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360;
+ return is_360p_or_larger && is_larger_qindex && bsize == BLOCK_64X64;
+ }
+ return 0;
+}
+
+/*!\brief AV1 block partition search (partition estimation and partial search).
+ *
+ * \ingroup partition_search
+ * Encode the block by applying pre-calculated partition patterns that are
+ * represented by coding block sizes stored in the mbmi array. Minor partition
+ * adjustments are tested and applied if they lead to lower rd costs. The
+ * partition types are limited to a basic set: none, horz, vert, and split.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] td Pointer to thread data
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during encoding
+ * \param[in] mib Array representing MB_MODE_INFO pointers for mi
+ * blocks starting from the first pixel of the current
+ * block
+ * \param[in] tp Pointer to the starting token
+ * \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE
+ * \param[in] mi_col Column coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] bsize Current block size
+ * \param[in] rate Pointer to the final rate for encoding the current
+ * block
+ * \param[in] dist Pointer to the final distortion of the current block
+ * \param[in] do_recon Whether the reconstruction function needs to be run,
+ * either for finalizing a superblock or providing a
+ * reference for future sub-partitions
+ * \param[in] pc_tree Pointer to the PC_TREE node holding the picked
+ * partitions and mode info for the current block
+ *
+ * \remark Nothing is returned. The pc_tree struct is modified to store the
+ * picked partition and modes. The rate and dist are also updated with those
+ * corresponding to the best partition found.
+ */
+void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+ MB_MODE_INFO **mib, TokenExtra **tp, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int *rate,
+ int64_t *dist, int do_recon, PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int num_planes = av1_num_planes(cm);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ const int pl = (bsize >= BLOCK_8X8)
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : 0;
+ const PARTITION_TYPE partition =
+ (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+ : PARTITION_NONE;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_STATS last_part_rdc, none_rdc, chosen_rdc, invalid_rdc;
+ BLOCK_SIZE bs_type = mib[0]->bsize;
+ int use_partition_none = 0;
+ x->try_merge_partition = 0;
+
+ if (pc_tree->none == NULL) {
+ pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+ PICK_MODE_CONTEXT *ctx_none = pc_tree->none;
+
+ if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+ // In rt mode, currently the min partition size is BLOCK_8X8.
+ assert(bsize >= cpi->sf.part_sf.default_min_partition_size);
+
+ av1_invalid_rd_stats(&last_part_rdc);
+ av1_invalid_rd_stats(&none_rdc);
+ av1_invalid_rd_stats(&chosen_rdc);
+ av1_invalid_rd_stats(&invalid_rdc);
+
+ pc_tree->partitioning = partition;
+
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
+ }
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ const int orig_rdmult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
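+  // For variance-based partitioning, re-evaluate PARTITION_NONE as a merge
+  // of the pre-computed partition before encoding it as-is.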
+ if (partition != PARTITION_NONE &&
+ is_adjust_var_based_part_enabled(cm, &cpi->sf.part_sf, bsize) &&
+ (mi_row + hbs < mi_params->mi_rows &&
+ mi_col + hbs < mi_params->mi_cols)) {
+ assert(bsize > cpi->sf.part_sf.default_min_partition_size);
+ mib[0]->bsize = bsize;
+ pc_tree->partitioning = PARTITION_NONE;
+ x->try_merge_partition = 1;
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, PARTITION_NONE,
+ bsize, ctx_none, invalid_rdc);
+
+ if (none_rdc.rate < INT_MAX) {
+ none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+ none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+ }
+
+ // Try to skip split partition evaluation based on none partition
+ // characteristics.
+ if (none_rdc.rate < INT_MAX && none_rdc.skip_txfm == 1) {
+ use_partition_none = 1;
+ }
+
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ mib[0]->bsize = bs_type;
+ pc_tree->partitioning = partition;
+ }
+
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ pc_tree->split[i]->index = i;
+ }
+ switch (partition) {
+ case PARTITION_NONE:
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_NONE, bsize, ctx_none, invalid_rdc);
+ break;
+ case PARTITION_HORZ:
+ if (use_partition_none) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ pc_tree->horizontal[i] =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->horizontal[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_HORZ, subsize, pc_tree->horizontal[0],
+ invalid_rdc);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_row + hbs < mi_params->mi_rows) {
+ RD_STATS tmp_rdc;
+ const PICK_MODE_CONTEXT *const ctx_h = pc_tree->horizontal[0];
+ av1_init_rd_stats(&tmp_rdc);
+ av1_update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
+ NULL);
+ pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
+ PARTITION_HORZ, subsize, pc_tree->horizontal[1],
+ invalid_rdc);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+ }
+ break;
+ case PARTITION_VERT:
+ if (use_partition_none) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ pc_tree->vertical[i] =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->vertical[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_VERT, subsize, pc_tree->vertical[0], invalid_rdc);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_col + hbs < mi_params->mi_cols) {
+ RD_STATS tmp_rdc;
+ const PICK_MODE_CONTEXT *const ctx_v = pc_tree->vertical[0];
+ av1_init_rd_stats(&tmp_rdc);
+ av1_update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize,
+ NULL);
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
+ PARTITION_VERT, subsize,
+ pc_tree->vertical[bsize > BLOCK_8X8], invalid_rdc);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (use_partition_none) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+
+ last_part_rdc.rate = 0;
+ last_part_rdc.dist = 0;
+ last_part_rdc.rdcost = 0;
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
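+        // Sub-block i covers quadrant (row i >> 1, col i & 1) of the parent;
+        // x_idx/y_idx are the corresponding offsets in mi units, and jj/ii
+        // index the same quadrant within the mib grid.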
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ int jj = i >> 1, ii = i & 0x01;
+ RD_STATS tmp_rdc;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+
+ av1_init_rd_stats(&tmp_rdc);
+ av1_rd_use_partition(
+ cpi, td, tile_data,
+ mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp,
+ mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
+ &tmp_rdc.dist, i != (SUB_PARTITIONS_SPLIT - 1), pc_tree->split[i]);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ }
+ break;
+ case PARTITION_VERT_A:
+ case PARTITION_VERT_B:
+ case PARTITION_HORZ_A:
+ case PARTITION_HORZ_B:
+ case PARTITION_HORZ_4:
+ case PARTITION_VERT_4:
+ assert(0 && "Cannot handle extended partition types");
+ default: assert(0); break;
+ }
+
+ if (last_part_rdc.rate < INT_MAX) {
+ last_part_rdc.rate += mode_costs->partition_cost[pl][partition];
+ last_part_rdc.rdcost =
+ RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
+ }
+
+ if ((cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION &&
+ cpi->sf.part_sf.adjust_var_based_rd_partitioning > 2) &&
+ partition != PARTITION_SPLIT && bsize > BLOCK_8X8 &&
+ (mi_row + bs < mi_params->mi_rows ||
+ mi_row + hbs == mi_params->mi_rows) &&
+ (mi_col + bs < mi_params->mi_cols ||
+ mi_col + hbs == mi_params->mi_cols)) {
+ BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ chosen_rdc.rate = 0;
+ chosen_rdc.dist = 0;
+
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ pc_tree->partitioning = PARTITION_SPLIT;
+
+ // Split partition.
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ RD_STATS tmp_rdc;
+
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ pc_tree->split[i]->partitioning = PARTITION_NONE;
+ if (pc_tree->split[i]->none == NULL)
+ pc_tree->split[i]->none =
+ av1_alloc_pmc(cpi, split_subsize, &td->shared_coeff_buf);
+ if (!pc_tree->split[i]->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+ PARTITION_SPLIT, split_subsize, pc_tree->split[i]->none,
+ invalid_rdc);
+
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&chosen_rdc);
+ break;
+ }
+
+ chosen_rdc.rate += tmp_rdc.rate;
+ chosen_rdc.dist += tmp_rdc.dist;
+
+ if (i != SUB_PARTITIONS_SPLIT - 1)
+ encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
+ OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
+
+ chosen_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+ }
+ if (chosen_rdc.rate < INT_MAX) {
+ chosen_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+ chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist);
+ }
+ }
+
+  // If last_part is better, set the partitioning to that.
+ if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
+ mib[0]->bsize = bs_type;
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition;
+
+ chosen_rdc = last_part_rdc;
+ }
+  // If none was better, set the partitioning to that.
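+  // A small bias (rdcost >> 9, roughly 0.2% of the cost) is applied in favor
+  // of PARTITION_NONE so that near-ties keep the larger block.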
+ if (none_rdc.rdcost < INT64_MAX &&
+ none_rdc.rdcost - (none_rdc.rdcost >> 9) < chosen_rdc.rdcost) {
+ mib[0]->bsize = bsize;
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+ chosen_rdc = none_rdc;
+ }
+
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ // We must have chosen a partitioning and encoding or we'll fail later on.
+ // No other opportunities for success.
+ if (bsize == cm->seq_params->sb_size)
+ assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_sb_time);
+#endif
+ if (do_recon) {
+ if (bsize == cm->seq_params->sb_size) {
+ // NOTE: To get estimate for rate due to the tokens, use:
+ // int rate_coeffs = 0;
+ // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+ // bsize, pc_tree, &rate_coeffs);
+ set_cb_offsets(x->cb_offset, 0, 0);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_sb_time);
+#endif
+
+ *rate = chosen_rdc.rate;
+ *dist = chosen_rdc.dist;
+ x->rdmult = orig_rdmult;
+}
+
+static void encode_b_nonrd(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+ ThreadData *td, TokenExtra **tp, int mi_row,
+ int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ PARTITION_TYPE partition,
+ PICK_MODE_CONTEXT *const ctx, int *rate) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing((AV1_COMP *)cpi, encode_b_nonrd_time);
+#endif
+ const AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+ const int origin_mult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ mbmi->partition = partition;
+ av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+ const int subsampling_x = cpi->common.seq_params->subsampling_x;
+ const int subsampling_y = cpi->common.seq_params->subsampling_y;
+ if (!dry_run) {
+ set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y],
+ x->cb_offset[PLANE_TYPE_UV]);
+ assert(x->cb_offset[PLANE_TYPE_Y] <
+ (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]));
+ assert(x->cb_offset[PLANE_TYPE_UV] <
+ ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >>
+ (subsampling_x + subsampling_y)));
+ }
+
+ encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
+ if (!dry_run) {
+ update_cb_offsets(x, bsize, subsampling_x, subsampling_y);
+ if (has_second_ref(mbmi)) {
+ if (mbmi->compound_idx == 0 ||
+ mbmi->interinter_comp.type == COMPOUND_AVERAGE)
+ mbmi->comp_group_idx = 0;
+ else
+ mbmi->comp_group_idx = 1;
+ mbmi->compound_idx = 1;
+ }
+ RD_COUNTS *const rdc = &td->rd_counts;
+ if (mbmi->skip_mode) {
+ assert(!frame_is_intra_only(cm));
+ rdc->skip_mode_used_flag = 1;
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+ has_second_ref(mbmi)) {
+ rdc->compound_ref_used_flag = 1;
+ }
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ } else {
+ const int seg_ref_active =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+ if (!seg_ref_active) {
+        // If the segment reference feature is enabled, only a single
+        // reference frame is allowed for the segment, so exclude it from
+        // the reference frame counts used to work out probabilities.
+ if (is_inter_block(mbmi)) {
+ av1_collect_neighbors_ref_counts(xd);
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+ has_second_ref(mbmi)) {
+ // This flag is also updated for 4x4 blocks
+ rdc->compound_ref_used_flag = 1;
+ }
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ }
+ }
+ }
+ if (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_SELECTIVELY &&
+ (mbmi->mode == NEWMV || mbmi->mode < INTRA_MODE_END)) {
+ int32_t blocks = mi_size_high[bsize] * mi_size_wide[bsize];
+ rdc->newmv_or_intra_blocks += blocks;
+ }
+ if (tile_data->allow_update_cdf) update_stats(&cpi->common, td);
+ }
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mbmi->skip_txfm &&
+ !cpi->rc.rtc_external_ratectrl && cm->seg.enabled)
+ av1_cyclic_reset_segment_skip(cpi, x, mi_row, mi_col, bsize, dry_run);
+ // TODO(Ravi/Remya): Move this copy function to a better logical place
+ // This function will copy the best mode information from block
+ // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This
+ // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during
+ // bitstream preparation.
+ av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, &x->mbmi_ext,
+ av1_ref_frame_type(xd->mi[0]->ref_frame));
+ x->rdmult = origin_mult;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing((AV1_COMP *)cpi, encode_b_nonrd_time);
+#endif
+}
+
+static int get_force_zeromv_skip_flag_for_blk(const AV1_COMP *cpi,
+ const MACROBLOCK *x,
+ BLOCK_SIZE bsize) {
+ // Force zero MV skip based on SB level decision
+ if (x->force_zeromv_skip_for_sb < 2) return x->force_zeromv_skip_for_sb;
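+  // A value of 2 means the superblock-level decision was deferred;
+  // re-evaluate at block level below using per-plane zero-mv SAD thresholds.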
+
+  // For blocks whose size equals the superblock size, the decision has
+  // already been made at the superblock level, so the zeromv-skip decision
+  // is skipped here.
+ const AV1_COMMON *const cm = &cpi->common;
+ if (bsize == cm->seq_params->sb_size) return 0;
+
+ const int num_planes = av1_num_planes(cm);
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const unsigned int thresh_exit_part_y =
+ cpi->zeromv_skip_thresh_exit_part[bsize];
+ const unsigned int thresh_exit_part_uv =
+ CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y);
+ const unsigned int thresh_exit_part[MAX_MB_PLANE] = { thresh_exit_part_y,
+ thresh_exit_part_uv,
+ thresh_exit_part_uv };
+ const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, LAST_FRAME);
+
+ struct buf_2d yv12_mb[MAX_MB_PLANE];
+ av1_setup_pred_block(xd, yv12_mb, yv12, sf, sf, num_planes);
+
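+  // Compare the source block against the co-located LAST_FRAME block (i.e.,
+  // the zero-mv prediction). Force zeromv-skip only if every plane's SAD is
+  // below its threshold.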
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const unsigned int plane_sad = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, yv12_mb[plane].buf, yv12_mb[plane].stride);
+ assert(plane < MAX_MB_PLANE);
+ if (plane_sad >= thresh_exit_part[plane]) return 0;
+ }
+ return 1;
+}
+
+/*!\brief Top level function to pick block mode for non-RD optimized case
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Searches prediction modes, transform, and coefficient coding modes for an
+ * individual coding block. This function is the top-level function that is
+ * used for non-RD optimized mode search (controlled by
+ * \c cpi->sf.rt_sf.use_nonrd_pick_mode). Depending on frame type it calls
+ * inter/skip/hybrid-intra mode search functions
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during
+ * encoding
+ * \param[in] x Pointer to structure holding all the data for
+ * the current macroblock
+ * \param[in] mi_row Row coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] mi_col Column coordinate of the block in a step size of
+ * MI_SIZE
+ * \param[in] rd_cost Pointer to structure holding rate and distortion
+ * stats for the current block
+ * \param[in] bsize Current block size
+ * \param[in] ctx Pointer to structure holding coding contexts and
+ * chosen modes for the current block
+ *
+ * \remark Nothing is returned. Instead, the chosen modes and contexts necessary
+ * for reconstruction are stored in ctx, and the rate-distortion stats are
+ * stored in rd_cost. If no valid mode leading to rd_cost <= best_rd is found,
+ * this is signalled by an INT64_MAX rd_cost->rdcost.
+ */
+static void pick_sb_modes_nonrd(AV1_COMP *const cpi, TileDataEnc *tile_data,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx) {
+ // For nonrd mode, av1_set_offsets is already called at the superblock level
+ // in encode_nonrd_sb when we determine the partitioning.
+ if (bsize != cpi->common.seq_params->sb_size ||
+ cpi->sf.rt_sf.nonrd_check_partition_split == 1) {
+ av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
+ }
+ assert(x->last_set_offsets_loc.mi_row == mi_row &&
+ x->last_set_offsets_loc.mi_col == mi_col &&
+ x->last_set_offsets_loc.bsize == bsize);
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ int i;
+
+ // This is only needed for real time/allintra row-mt enabled multi-threaded
+ // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF.
+ wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync,
+ &tile_data->tile_info, cm->seq_params->sb_size,
+ cm->seq_params->mib_size_log2, bsize, mi_row, mi_col);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, pick_sb_modes_nonrd_time);
+#endif
+ // Sets up the tx_type_map buffer in MACROBLOCKD.
+ xd->tx_type_map = txfm_info->tx_type_map_;
+ xd->tx_type_map_stride = mi_size_wide[bsize];
+ for (i = 0; i < num_planes; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ p[i].dqcoeff = ctx->dqcoeff[i];
+ p[i].eobs = ctx->eobs[i];
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ }
+ for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+
+ x->force_zeromv_skip_for_blk =
+ get_force_zeromv_skip_flag_for_blk(cpi, x, bsize);
+
+  // Source variance may already be computed at the superblock level, so there
+  // is no need to recompute it unless bsize < sb_size or source_variance has
+  // not been set yet.
+ if (!x->force_zeromv_skip_for_blk &&
+ (x->source_variance == UINT_MAX || bsize < cm->seq_params->sb_size))
+ x->source_variance = av1_get_perpixel_variance_facade(
+ cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ const int orig_rdmult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi);
+ // Set error per bit for current rdmult
+ av1_set_error_per_bit(&x->errorperbit, x->rdmult);
+ // Find best coding mode & reconstruct the MB so it is available
+ // as a predictor for MBs that follow in the SB
+ if (frame_is_intra_only(cm)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, hybrid_intra_mode_search_time);
+#endif
+ hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, hybrid_intra_mode_search_time);
+#endif
+ } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, nonrd_pick_inter_mode_sb_time);
+#endif
+ if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ RD_STATS invalid_rd;
+ av1_invalid_rd_stats(&invalid_rd);
+ // TODO(kyslov): add av1_nonrd_pick_inter_mode_sb_seg_skip
+ av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
+ rd_cost, bsize, ctx,
+ invalid_rd.rdcost);
+ } else {
+ av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx);
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, nonrd_pick_inter_mode_sb_time);
+#endif
+ }
+ if (cpi->sf.rt_sf.skip_cdef_sb) {
+    // cdef_strength is initialized to 1, which means skip_cdef, and is
+    // updated here. Check to see if skipping cdef is allowed.
+ const int allow_cdef_skipping =
+ cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad &&
+ !(x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] ||
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]);
+
+ // Find the corresponding 64x64 block. It'll be the 128x128 block if that's
+ // the block size.
+ const int mi_row_sb = mi_row - mi_row % MI_SIZE_64X64;
+ const int mi_col_sb = mi_col - mi_col % MI_SIZE_64X64;
+ MB_MODE_INFO **mi_sb =
+ cm->mi_params.mi_grid_base +
+ get_mi_grid_idx(&cm->mi_params, mi_row_sb, mi_col_sb);
+ // Do not skip if intra or new mv is picked, or color sensitivity is set.
+ // Never skip on slide/scene change.
+ if (cpi->sf.rt_sf.skip_cdef_sb >= 2) {
+ mi_sb[0]->cdef_strength =
+ mi_sb[0]->cdef_strength &&
+ (allow_cdef_skipping || x->source_variance == 0);
+ } else {
+ mi_sb[0]->cdef_strength =
+ mi_sb[0]->cdef_strength && allow_cdef_skipping &&
+ !(mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV);
+ }
+ // Store in the pickmode context.
+ ctx->mic.cdef_strength = mi_sb[0]->cdef_strength;
+ }
+ x->rdmult = orig_rdmult;
+ ctx->rd_stats.rate = rd_cost->rate;
+ ctx->rd_stats.dist = rd_cost->dist;
+ ctx->rd_stats.rdcost = rd_cost->rdcost;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, pick_sb_modes_nonrd_time);
+#endif
+}
+
+static int try_split_partition(AV1_COMP *const cpi, ThreadData *const td,
+ TileDataEnc *const tile_data,
+ TileInfo *const tile_info, TokenExtra **tp,
+ MACROBLOCK *const x, MACROBLOCKD *const xd,
+ const CommonModeInfoParams *const mi_params,
+ const int mi_row, const int mi_col,
+ const BLOCK_SIZE bsize, const int pl,
+ PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int hbs = mi_size_wide[bsize] / 2;
+ if (mi_row + mi_size_high[bsize] >= mi_params->mi_rows ||
+ mi_col + mi_size_wide[bsize] >= mi_params->mi_cols)
+ return 0;
+ if (bsize <= BLOCK_8X8 || frame_is_intra_only(cm)) return 0;
+ if (x->content_state_sb.source_sad_nonrd <= kLowSad) return 0;
+
+ // Do not try split partition when the source sad is small, or
+ // the prediction residual is small.
+ const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, LAST_FRAME);
+ const int num_planes = av1_num_planes(cm);
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+ av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf, num_planes);
+ int block_sad = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const unsigned int plane_sad = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, pd->pre[0].buf, pd->pre[0].stride);
+ block_sad += plane_sad;
+ }
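+  // Normalize the SAD accumulated over all planes by the luma pixel count to
+  // obtain an approximate per-pixel prediction error.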
+ const int blk_pix = block_size_wide[bsize] * block_size_high[bsize];
+ const int block_avg_sad = block_sad / blk_pix;
+ // TODO(chengchen): find a proper threshold. It might change according to
+ // q as well.
+ const int threshold = 25;
+ if (block_avg_sad < threshold) return 0;
+
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_STATS split_rdc, none_rdc;
+ av1_invalid_rd_stats(&split_rdc);
+ av1_invalid_rd_stats(&none_rdc);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ // Calculate rdcost for none partition
+ pc_tree->partitioning = PARTITION_NONE;
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ if (!pc_tree->none) {
+ pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->none);
+ }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
+ pc_tree->none);
+ none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+ none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+ // Calculate rdcost for split partition
+ pc_tree->partitioning = PARTITION_SPLIT;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ av1_init_rd_stats(&split_rdc);
+ split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+ if (subsize >= BLOCK_8X8) {
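+    // Each of the four sub-blocks also signals its own PARTITION_NONE token.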
+ split_rdc.rate += (mode_costs->partition_cost[pl][PARTITION_NONE] * 4);
+ }
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ if (!pc_tree->split[i]) {
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ }
+ pc_tree->split[i]->index = i;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ RD_STATS block_rdc;
+ av1_invalid_rd_stats(&block_rdc);
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
+ if (!pc_tree->split[i]->none) {
+ pc_tree->split[i]->none =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->split[i]->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->split[i]->none);
+ }
+ pc_tree->split[i]->partitioning = PARTITION_NONE;
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+ &block_rdc, subsize, pc_tree->split[i]->none);
+ split_rdc.rate += block_rdc.rate;
+ split_rdc.dist += block_rdc.dist;
+ av1_rd_cost_update(x->rdmult, &split_rdc);
+ if (none_rdc.rdcost < split_rdc.rdcost) break;
+ if (i != SUB_PARTITIONS_SPLIT - 1)
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1,
+ subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL);
+ }
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+ split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+ const int split = split_rdc.rdcost < none_rdc.rdcost;
+
+ return split;
+}
+
+// Returns whether SPLIT partitions should be evaluated.
+static bool calc_do_split_flag(const AV1_COMP *cpi, const MACROBLOCK *x,
+ const PC_TREE *pc_tree, const RD_STATS *none_rdc,
+ const CommonModeInfoParams *mi_params,
+ int mi_row, int mi_col, int hbs,
+ BLOCK_SIZE bsize, PARTITION_TYPE partition) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int is_larger_qindex = cm->quant_params.base_qindex > 100;
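+  // base_qindex lies in [0, 255]; values above 100 are treated as mid-to-high
+  // quantizers here.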
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ bool do_split =
+ (cpi->sf.rt_sf.nonrd_check_partition_merge_mode == 3)
+ ? (bsize <= BLOCK_32X32 || (is_larger_qindex && bsize <= BLOCK_64X64))
+ : true;
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN ||
+ cpi->sf.rt_sf.nonrd_check_partition_merge_mode < 2 ||
+ cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) ||
+ !none_rdc->skip_txfm)
+ return do_split;
+
+ const int use_model_yrd_large = get_model_rd_flag(cpi, xd, bsize);
+
+  // When model based skip is not used (i.e., use_model_yrd_large = 0),
+  // skip_txfm has been populated based on the Hadamard transform, and the
+  // skip_txfm flag is more reliable. Hence SPLIT evaluation is disabled at
+  // all quantizers for 8x8 and 16x16 blocks.
+  // When model based skip is used (i.e., use_model_yrd_large = 1), skip_txfm
+  // may not be reliable. Hence SPLIT evaluation is disabled only at lower
+  // quantizers for blocks >= 32x32.
+ if ((!use_model_yrd_large) || (!is_larger_qindex)) return false;
+
+  // Use residual statistics to decide if the SPLIT partition should be
+  // evaluated for 32x32 blocks. The pruning logic is skipped for larger block
+  // sizes to avoid visual artifacts.
+ if (pc_tree->none->mic.mode == NEWMV && bsize == BLOCK_32X32 && do_split) {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+ assert(subsize < BLOCK_SIZES_ALL);
+ double min_per_pixel_error = DBL_MAX;
+ double max_per_pixel_error = 0.;
+ int i;
+ for (i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ const int x_idx = (i & 1) * hbs;
+ const int y_idx = (i >> 1) * hbs;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols)) {
+ break;
+ }
+
+ // Populate the appropriate buffer pointers.
+ // Pass scale factors as NULL as the base pointer of the block would have
+ // been calculated appropriately.
+ struct buf_2d src_split_buf_2d, pred_split_buf_2d;
+ const struct buf_2d *src_none_buf_2d = &x->plane[AOM_PLANE_Y].src;
+ setup_pred_plane(&src_split_buf_2d, subsize, src_none_buf_2d->buf,
+ src_none_buf_2d->width, src_none_buf_2d->height,
+ src_none_buf_2d->stride, y_idx, x_idx, NULL, 0, 0);
+ const struct buf_2d *pred_none_buf_2d = &xd->plane[AOM_PLANE_Y].dst;
+ setup_pred_plane(&pred_split_buf_2d, subsize, pred_none_buf_2d->buf,
+ pred_none_buf_2d->width, pred_none_buf_2d->height,
+ pred_none_buf_2d->stride, y_idx, x_idx, NULL, 0, 0);
+
+ unsigned int curr_uint_mse;
+ const unsigned int curr_uint_var = cpi->ppi->fn_ptr[subsize].vf(
+ src_split_buf_2d.buf, src_split_buf_2d.stride, pred_split_buf_2d.buf,
+ pred_split_buf_2d.stride, &curr_uint_mse);
+ const double curr_per_pixel_error =
+ sqrt((double)curr_uint_var / block_size_wide[subsize] /
+ block_size_high[subsize]);
+ if (curr_per_pixel_error < min_per_pixel_error)
+ min_per_pixel_error = curr_per_pixel_error;
+ if (curr_per_pixel_error > max_per_pixel_error)
+ max_per_pixel_error = curr_per_pixel_error;
+ }
+
+ // Prune based on residual statistics only if all the sub-partitions are
+ // valid.
+ if (i == SUB_PARTITIONS_SPLIT) {
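+      // A small spread in per-pixel error across the four quadrants suggests
+      // a homogeneous block, where SPLIT is unlikely to help.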
+ if (max_per_pixel_error - min_per_pixel_error <= 1.5) do_split = false;
+ }
+ }
+
+ return do_split;
+}
+
+static void try_merge(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, MB_MODE_INFO **mib,
+ TokenExtra **tp, const int mi_row, const int mi_col,
+ const BLOCK_SIZE bsize, PC_TREE *const pc_tree,
+ const PARTITION_TYPE partition, const BLOCK_SIZE subsize,
+ const int pl) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int num_planes = av1_num_planes(cm);
+ // Only square blocks from 8x8 to 128x128 are supported
+ assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128);
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ bool do_split = false;
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_STATS split_rdc, none_rdc;
+ av1_invalid_rd_stats(&split_rdc);
+ av1_invalid_rd_stats(&none_rdc);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ pc_tree->partitioning = PARTITION_NONE;
+ if (!pc_tree->none) {
+ pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->none);
+ }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
+ pc_tree->none);
+ none_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+ none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode < 2 ||
+ none_rdc.skip_txfm != 1 || pc_tree->none->mic.mode == NEWMV) {
+ do_split = calc_do_split_flag(cpi, x, pc_tree, &none_rdc, mi_params, mi_row,
+ mi_col, hbs, bsize, partition);
+ if (do_split) {
+ av1_init_rd_stats(&split_rdc);
+ split_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ RD_STATS block_rdc;
+ av1_invalid_rd_stats(&block_rdc);
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
+ if (!pc_tree->split[i]->none) {
+ pc_tree->split[i]->none =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->split[i]->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->split[i]->none);
+ }
+ pc_tree->split[i]->partitioning = PARTITION_NONE;
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+ &block_rdc, subsize, pc_tree->split[i]->none);
+        // TODO(yunqingwang): The rate here does not include the cost of
+        // signaling the PARTITION_NONE token in the sub-blocks.
+ split_rdc.rate += block_rdc.rate;
+ split_rdc.dist += block_rdc.dist;
+
+ av1_rd_cost_update(x->rdmult, &split_rdc);
+
+ if (none_rdc.rdcost < split_rdc.rdcost) {
+ break;
+ }
+
+ if (i != SUB_PARTITIONS_SPLIT - 1)
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx,
+ 1, subsize, PARTITION_NONE, pc_tree->split[i]->none,
+ NULL);
+ }
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+ }
+ }
+
+ if (none_rdc.rdcost < split_rdc.rdcost) {
+    /* Predicted samples cannot be reused for PARTITION_NONE since the same
+     * buffer is used to store the reconstructed samples of the
+     * PARTITION_SPLIT block. */
+ if (do_split) x->reuse_inter_pred = false;
+
+ mib[0]->bsize = bsize;
+ pc_tree->partitioning = PARTITION_NONE;
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
+ pc_tree->none, NULL);
+ } else {
+ mib[0]->bsize = subsize;
+ pc_tree->partitioning = PARTITION_SPLIT;
+    /* Predicted samples cannot be reused for PARTITION_SPLIT since the same
+     * buffer is used to write the reconstructed samples. */
+ // TODO(Cherma): Store and reuse predicted samples generated by
+ // encode_b_nonrd() in DRY_RUN_NORMAL mode.
+ x->reuse_inter_pred = false;
+
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+
+ // Note: We don't reset pc_tree->split[i]->none here because it
+ // could contain results from the additional check. Instead, it is
+ // reset before we enter the nonrd_check_partition_merge_mode
+ // condition.
+ if (!pc_tree->split[i]->none) {
+ pc_tree->split[i]->none =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->split[i]->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0,
+ subsize, PARTITION_NONE, pc_tree->split[i]->none, NULL);
+ }
+ }
+}
+
+// Evaluate if the sub-partitions can be merged directly into a large partition
+// without calculating the RD cost.
+static void direct_partition_merging(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, MB_MODE_INFO **mib,
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ const PARTITION_TYPE partition =
+ (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+ : PARTITION_NONE;
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+ MB_MODE_INFO **b0 = mib;
+ MB_MODE_INFO **b1 = mib + hbs;
+ MB_MODE_INFO **b2 = mib + hbs * mi_params->mi_stride;
+ MB_MODE_INFO **b3 = mib + hbs * mi_params->mi_stride + hbs;
+
+ // Check if the following conditions are met. This can be updated
+ // later with more support added.
+ const int further_split = b0[0]->bsize < subsize || b1[0]->bsize < subsize ||
+ b2[0]->bsize < subsize || b3[0]->bsize < subsize;
+ if (further_split) return;
+
+ const int no_skip = !b0[0]->skip_txfm || !b1[0]->skip_txfm ||
+ !b2[0]->skip_txfm || !b3[0]->skip_txfm;
+ if (no_skip) return;
+
+ const int compound = (b0[0]->ref_frame[1] != b1[0]->ref_frame[1] ||
+ b0[0]->ref_frame[1] != b2[0]->ref_frame[1] ||
+ b0[0]->ref_frame[1] != b3[0]->ref_frame[1] ||
+ b0[0]->ref_frame[1] > NONE_FRAME);
+ if (compound) return;
+
+ // Intra modes aren't considered here.
+ const int different_ref = (b0[0]->ref_frame[0] != b1[0]->ref_frame[0] ||
+ b0[0]->ref_frame[0] != b2[0]->ref_frame[0] ||
+ b0[0]->ref_frame[0] != b3[0]->ref_frame[0] ||
+ b0[0]->ref_frame[0] <= INTRA_FRAME);
+ if (different_ref) return;
+
+ const int different_mode =
+ (b0[0]->mode != b1[0]->mode || b0[0]->mode != b2[0]->mode ||
+ b0[0]->mode != b3[0]->mode);
+ if (different_mode) return;
+
+ const int unsupported_mode =
+ (b0[0]->mode != NEARESTMV && b0[0]->mode != GLOBALMV);
+ if (unsupported_mode) return;
+
+ const int different_mv = (b0[0]->mv[0].as_int != b1[0]->mv[0].as_int ||
+ b0[0]->mv[0].as_int != b2[0]->mv[0].as_int ||
+ b0[0]->mv[0].as_int != b3[0]->mv[0].as_int);
+ if (different_mv) return;
+
+ const int unsupported_motion_mode =
+ (b0[0]->motion_mode != b1[0]->motion_mode ||
+ b0[0]->motion_mode != b2[0]->motion_mode ||
+ b0[0]->motion_mode != b3[0]->motion_mode ||
+ b0[0]->motion_mode != SIMPLE_TRANSLATION);
+ if (unsupported_motion_mode) return;
+
+  const int different_filter =
+      (b0[0]->interp_filters.as_int != b1[0]->interp_filters.as_int ||
+       b0[0]->interp_filters.as_int != b2[0]->interp_filters.as_int ||
+       b0[0]->interp_filters.as_int != b3[0]->interp_filters.as_int);
+  if (different_filter) return;
+
+ const int different_seg = (b0[0]->segment_id != b1[0]->segment_id ||
+ b0[0]->segment_id != b2[0]->segment_id ||
+ b0[0]->segment_id != b3[0]->segment_id);
+ if (different_seg) return;
+
+ // Evaluate the ref_mv.
+ MB_MODE_INFO **this_mi = mib;
+ BLOCK_SIZE orig_bsize = this_mi[0]->bsize;
+ const PARTITION_TYPE orig_partition = this_mi[0]->partition;
+
+ this_mi[0]->bsize = bsize;
+ this_mi[0]->partition = PARTITION_NONE;
+ this_mi[0]->skip_txfm = 1;
+
+ // TODO(yunqing): functions called below can be optimized by
+ // removing unrelated operations.
+ av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row,
+ mi_col, bsize);
+
+ const MV_REFERENCE_FRAME ref_frame = this_mi[0]->ref_frame[0];
+ int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+ int force_skip_low_temp_var = 0;
+ int skip_pred_mv = 0;
+ bool use_scaled_ref;
+
+ for (int i = 0; i < MB_MODE_COUNT; ++i) {
+ for (int j = 0; j < REF_FRAMES; ++j) {
+ frame_mv[i][j].as_int = INVALID_MV;
+ }
+ }
+ av1_copy(x->color_sensitivity, x->color_sensitivity_sb);
+ skip_pred_mv = (x->nonrd_prune_ref_frame_search > 2 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] != 2 &&
+ x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)] != 2);
+
+ find_predictors(cpi, x, ref_frame, frame_mv, yv12_mb, bsize,
+ force_skip_low_temp_var, skip_pred_mv, &use_scaled_ref);
+
+ int continue_merging = 1;
+ if (frame_mv[NEARESTMV][ref_frame].as_mv.row != b0[0]->mv[0].as_mv.row ||
+ frame_mv[NEARESTMV][ref_frame].as_mv.col != b0[0]->mv[0].as_mv.col)
+ continue_merging = 0;
+
+ if (!continue_merging) {
+ this_mi[0]->bsize = orig_bsize;
+ this_mi[0]->partition = orig_partition;
+
+ // TODO(yunqing): Store the results and restore here instead of
+ // calling find_predictors() again.
+ av1_set_offsets_without_segment_id(cpi, &tile_data->tile_info, x, mi_row,
+ mi_col, this_mi[0]->bsize);
+ find_predictors(cpi, x, ref_frame, frame_mv, yv12_mb, this_mi[0]->bsize,
+ force_skip_low_temp_var, skip_pred_mv, &use_scaled_ref);
+ } else {
+ struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame);
+ const int is_scaled = av1_is_scaled(sf);
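+    // MVs are in 1/8-pel units: a luma MV is sub-pel unless it is a multiple
+    // of 8, and with chroma subsampled by 2 an integer-pel chroma MV requires
+    // a multiple of 16.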
+ const int is_y_subpel_mv = (abs(this_mi[0]->mv[0].as_mv.row) % 8) ||
+ (abs(this_mi[0]->mv[0].as_mv.col) % 8);
+ const int is_uv_subpel_mv = (abs(this_mi[0]->mv[0].as_mv.row) % 16) ||
+ (abs(this_mi[0]->mv[0].as_mv.col) % 16);
+
+ if (cpi->ppi->use_svc || is_scaled || is_y_subpel_mv || is_uv_subpel_mv) {
+ const int num_planes = av1_num_planes(cm);
+ set_ref_ptrs(cm, xd, ref_frame, this_mi[0]->ref_frame[1]);
+ const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame);
+ av1_setup_pre_planes(xd, 0, cfg, mi_row, mi_col,
+ xd->block_ref_scale_factors[0], num_planes);
+
+ if (!cpi->ppi->use_svc && !is_scaled && !is_y_subpel_mv) {
+ assert(is_uv_subpel_mv == 1);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 1,
+ num_planes - 1);
+ } else {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ num_planes - 1);
+ }
+ }
+
+ // Copy out mbmi_ext information.
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame = x->mbmi_ext_frame;
+ av1_copy_mbmi_ext_to_mbmi_ext_frame(
+ mbmi_ext_frame, mbmi_ext, av1_ref_frame_type(this_mi[0]->ref_frame));
+
+ const BLOCK_SIZE this_subsize =
+ get_partition_subsize(bsize, this_mi[0]->partition);
+ // Update partition contexts.
+ update_ext_partition_context(xd, mi_row, mi_col, this_subsize, bsize,
+ this_mi[0]->partition);
+
+ const int num_planes = av1_num_planes(cm);
+ av1_reset_entropy_context(xd, bsize, num_planes);
+
+ // Note: use x->txfm_search_params.tx_mode_search_type instead of
+ // cm->features.tx_mode here.
+ TX_SIZE tx_size =
+ tx_size_from_tx_mode(bsize, x->txfm_search_params.tx_mode_search_type);
+ if (xd->lossless[this_mi[0]->segment_id]) tx_size = TX_4X4;
+ this_mi[0]->tx_size = tx_size;
+ memset(this_mi[0]->inter_tx_size, this_mi[0]->tx_size,
+ sizeof(this_mi[0]->inter_tx_size));
+
+ // Update txfm contexts.
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ set_txfm_ctxs(this_mi[0]->tx_size, xd->width, xd->height,
+ this_mi[0]->skip_txfm && is_inter_block(this_mi[0]), xd);
+
+ // Update mi for this partition block.
+ for (int y = 0; y < bs; y++) {
+ for (int x_idx = 0; x_idx < bs; x_idx++) {
+ this_mi[x_idx + y * mi_params->mi_stride] = this_mi[0];
+ }
+ }
+ }
+}
+
+/*!\brief AV1 block partition application (minimal RD search).
+ *
+ * \ingroup partition_search
+ * \callgraph
+ * \callergraph
+ * Encode the block by applying pre-calculated partition patterns that are
+ * represented by coding block sizes stored in the mbmi array. The only
+ * partition adjustment allowed is merging leaf split nodes if it leads to a
+ * lower rd cost. The partition types are limited to a basic set: none, horz,
+ * vert, and split. This function is only used in the real-time mode.
+ *
+ * \param[in] cpi       Top-level encoder structure
+ * \param[in] td        Pointer to thread data
+ * \param[in] tile_data Pointer to struct holding adaptive
+ *                      data/contexts/models for the tile during encoding
+ * \param[in] mib       Array representing MB_MODE_INFO pointers for mi
+ *                      blocks starting from the first pixel of the current
+ *                      block
+ * \param[in] tp        Pointer to the starting token
+ * \param[in] mi_row    Row coordinate of the block in a step size of MI_SIZE
+ * \param[in] mi_col    Column coordinate of the block in a step size of
+ *                      MI_SIZE
+ * \param[in] bsize     Current block size
+ * \param[in] pc_tree   Pointer to the PC_TREE node holding the picked
+ *                      partitions and mode info for the current block
+ *
+ * \remark Nothing is returned. The pc_tree struct is modified to store the
+ * picked partition and modes.
+ */
+void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, MB_MODE_INFO **mib,
+ TokenExtra **tp, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ // Only square blocks from 8x8 to 128x128 are supported
+ assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128);
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ PARTITION_TYPE partition = (bsize >= BLOCK_8X8)
+ ? get_partition(cm, mi_row, mi_col, bsize)
+ : PARTITION_NONE;
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+ assert(subsize <= BLOCK_LARGEST);
+ const int pl = (bsize >= BLOCK_8X8)
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : 0;
+
+ RD_STATS dummy_cost;
+ av1_invalid_rd_stats(&dummy_cost);
+
+ if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ // Initialize default mode evaluation params
+ set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+ x->reuse_inter_pred = cpi->sf.rt_sf.reuse_inter_pred_nonrd;
+
+ int change_none_to_split = 0;
+ if (partition == PARTITION_NONE &&
+ cpi->sf.rt_sf.nonrd_check_partition_split == 1) {
+ change_none_to_split =
+ try_split_partition(cpi, td, tile_data, tile_info, tp, x, xd, mi_params,
+ mi_row, mi_col, bsize, pl, pc_tree);
+ if (change_none_to_split) {
+ partition = PARTITION_SPLIT;
+ subsize = get_partition_subsize(bsize, partition);
+ assert(subsize <= BLOCK_LARGEST);
+ }
+ }
+
+ pc_tree->partitioning = partition;
+
+ switch (partition) {
+ case PARTITION_NONE:
+ if (!pc_tree->none) {
+ pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->none);
+ }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost, bsize,
+ pc_tree->none);
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize,
+ partition, pc_tree->none, NULL);
+ break;
+ case PARTITION_VERT:
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ if (!pc_tree->vertical[i]) {
+ pc_tree->vertical[i] =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->vertical[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->vertical[i]);
+ }
+ }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+ subsize, pc_tree->vertical[0]);
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
+ PARTITION_VERT, pc_tree->vertical[0], NULL);
+ if (mi_col + hbs < mi_params->mi_cols && bsize > BLOCK_8X8) {
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col + hbs,
+ &dummy_cost, subsize, pc_tree->vertical[1]);
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col + hbs, 0, subsize,
+ PARTITION_VERT, pc_tree->vertical[1], NULL);
+ }
+ break;
+ case PARTITION_HORZ:
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ if (!pc_tree->horizontal[i]) {
+ pc_tree->horizontal[i] =
+ av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!pc_tree->horizontal[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ } else {
+ av1_reset_pmc(pc_tree->horizontal[i]);
+ }
+ }
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+ subsize, pc_tree->horizontal[0]);
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
+ PARTITION_HORZ, pc_tree->horizontal[0], NULL);
+
+ if (mi_row + hbs < mi_params->mi_rows && bsize > BLOCK_8X8) {
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row + hbs, mi_col,
+ &dummy_cost, subsize, pc_tree->horizontal[1]);
+ encode_b_nonrd(cpi, tile_data, td, tp, mi_row + hbs, mi_col, 0, subsize,
+ PARTITION_HORZ, pc_tree->horizontal[1], NULL);
+ }
+ break;
+ case PARTITION_SPLIT:
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ if (!pc_tree->split[i]) {
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ }
+ pc_tree->split[i]->index = i;
+ }
+ if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode &&
+ av1_is_leaf_split_partition(cm, mi_row, mi_col, bsize) &&
+ !frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
+ try_merge(cpi, td, tile_data, mib, tp, mi_row, mi_col, bsize, pc_tree,
+ partition, subsize, pl);
+ } else {
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ int jj = i >> 1, ii = i & 0x01;
+ if ((mi_row + y_idx >= mi_params->mi_rows) ||
+ (mi_col + x_idx >= mi_params->mi_cols))
+ continue;
+ av1_nonrd_use_partition(
+ cpi, td, tile_data,
+ mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp,
+ mi_row + y_idx, mi_col + x_idx, subsize, pc_tree->split[i]);
+ }
+
+ if (!change_none_to_split) {
+ // Note: Palette, cfl are not supported.
+ if (!frame_is_intra_only(cm) && !tile_data->allow_update_cdf &&
+ cpi->sf.rt_sf.partition_direct_merging &&
+ mode_costs->partition_cost[pl][PARTITION_NONE] <
+ mode_costs->partition_cost[pl][PARTITION_SPLIT] &&
+ (mi_row + bs <= mi_params->mi_rows) &&
+ (mi_col + bs <= mi_params->mi_cols)) {
+ direct_partition_merging(cpi, td, tile_data, mib, mi_row, mi_col,
+ bsize);
+ }
+ }
+ }
+ break;
+ case PARTITION_VERT_A:
+ case PARTITION_VERT_B:
+ case PARTITION_HORZ_A:
+ case PARTITION_HORZ_B:
+ case PARTITION_HORZ_4:
+ case PARTITION_VERT_4:
+ assert(0 && "Cannot handle extended partition types");
+ default: assert(0); break;
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Try searching for an encoding for the given subblock. Returns zero if the
+// rdcost is already too high (to tell the caller not to bother searching for
+// encodings of further subblocks).
+static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp, int is_last,
+ int mi_row, int mi_col, BLOCK_SIZE subsize,
+ RD_STATS best_rdcost, RD_STATS *sum_rdc,
+ PARTITION_TYPE partition,
+ PICK_MODE_CONTEXT *this_ctx) {
+ MACROBLOCK *const x = &td->mb;
+ const int orig_mult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, subsize, NO_AQ, NULL);
+
+ av1_rd_cost_update(x->rdmult, &best_rdcost);
+
+ RD_STATS rdcost_remaining;
+ av1_rd_stats_subtraction(x->rdmult, &best_rdcost, sum_rdc, &rdcost_remaining);
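+  // The rd budget remaining for this sub-block is the best cost so far minus
+  // what earlier sub-blocks have already consumed; pick_sb_modes() can then
+  // terminate early once the budget is exceeded.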
+ RD_STATS this_rdc;
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, partition,
+ subsize, this_ctx, rdcost_remaining);
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc->rdcost = INT64_MAX;
+ } else {
+ sum_rdc->rate += this_rdc.rate;
+ sum_rdc->dist += this_rdc.dist;
+ av1_rd_cost_update(x->rdmult, sum_rdc);
+ }
+
+ if (sum_rdc->rdcost >= best_rdcost.rdcost) {
+ x->rdmult = orig_mult;
+ return 0;
+ }
+
+ if (!is_last) {
+ av1_update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
+ }
+
+ x->rdmult = orig_mult;
+ return 1;
+}
+
+// Tests an AB partition, and updates the encoder status, the pick mode
+// contexts, the best rdcost, and the best partition.
+static bool rd_test_partition3(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ PC_TREE *pc_tree, RD_STATS *best_rdc,
+ int64_t *this_rdcost,
+ PICK_MODE_CONTEXT *ctxs[SUB_PARTITIONS_AB],
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ PARTITION_TYPE partition,
+ const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB],
+ const int ab_mi_pos[SUB_PARTITIONS_AB][2],
+ const MB_MODE_INFO **mode_cache) {
+ MACROBLOCK *const x = &td->mb;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ RD_STATS sum_rdc;
+ av1_init_rd_stats(&sum_rdc);
+ sum_rdc.rate = x->mode_costs.partition_cost[pl][partition];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
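+  // Seed the rd cost with the partition signaling rate only; rate and
+  // distortion of the sub-blocks are accumulated in the loop below.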
+ // Loop over sub-partitions in AB partition type.
+ for (int i = 0; i < SUB_PARTITIONS_AB; i++) {
+ if (mode_cache && mode_cache[i]) {
+ x->use_mb_mode_cache = 1;
+ x->mb_mode_cache = mode_cache[i];
+ }
+ const int mode_search_success =
+ rd_try_subblock(cpi, td, tile_data, tp, i == SUB_PARTITIONS_AB - 1,
+ ab_mi_pos[i][0], ab_mi_pos[i][1], ab_subsize[i],
+ *best_rdc, &sum_rdc, partition, ctxs[i]);
+ x->use_mb_mode_cache = 0;
+ x->mb_mode_cache = NULL;
+ if (!mode_search_success) {
+ return false;
+ }
+ }
+
+ av1_rd_cost_update(x->rdmult, &sum_rdc);
+ *this_rdcost = sum_rdc.rdcost;
+ if (sum_rdc.rdcost >= best_rdc->rdcost) return false;
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+ *this_rdcost = sum_rdc.rdcost;
+ if (sum_rdc.rdcost >= best_rdc->rdcost) return false;
+
+ *best_rdc = sum_rdc;
+ pc_tree->partitioning = partition;
+ return true;
+}
+
+#if CONFIG_COLLECT_PARTITION_STATS
+static void init_partition_block_timing_stats(
+ PartitionTimingStats *part_timing_stats) {
+ av1_zero(*part_timing_stats);
+}
+
+static INLINE void start_partition_block_timer(
+ PartitionTimingStats *part_timing_stats, PARTITION_TYPE partition_type) {
+ assert(!part_timing_stats->timer_is_on);
+ part_timing_stats->partition_attempts[partition_type] += 1;
+ aom_usec_timer_start(&part_timing_stats->timer);
+ part_timing_stats->timer_is_on = 1;
+}
+
+static INLINE void end_partition_block_timer(
+ PartitionTimingStats *part_timing_stats, PARTITION_TYPE partition_type,
+ int64_t rdcost) {
+ if (part_timing_stats->timer_is_on) {
+ aom_usec_timer_mark(&part_timing_stats->timer);
+ const int64_t time = aom_usec_timer_elapsed(&part_timing_stats->timer);
+ part_timing_stats->partition_times[partition_type] += time;
+ part_timing_stats->partition_rdcost[partition_type] = rdcost;
+ part_timing_stats->timer_is_on = 0;
+ }
+}
+static INLINE void print_partition_timing_stats_with_rdcost(
+ const PartitionTimingStats *part_timing_stats, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, FRAME_UPDATE_TYPE frame_update_type, int frame_number,
+ const RD_STATS *best_rdc, const char *filename) {
+ FILE *f = fopen(filename, "a");
+ fprintf(f, "%d,%d,%d,%d,%d,%d,%" PRId64 ",%" PRId64 ",", bsize, frame_number,
+ frame_update_type, mi_row, mi_col, best_rdc->rate, best_rdc->dist,
+ best_rdc->rdcost);
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ fprintf(f, "%d,", part_timing_stats->partition_decisions[idx]);
+ }
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ fprintf(f, "%d,", part_timing_stats->partition_attempts[idx]);
+ }
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ fprintf(f, "%" PRId64 ",", part_timing_stats->partition_times[idx]);
+ }
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ if (part_timing_stats->partition_rdcost[idx] == INT64_MAX) {
+ fprintf(f, "%d,", -1);
+ } else {
+ fprintf(f, "%" PRId64 ",", part_timing_stats->partition_rdcost[idx]);
+ }
+ }
+ fprintf(f, "\n");
+ fclose(f);
+}
+
+static INLINE void print_partition_timing_stats(
+ const PartitionTimingStats *part_timing_stats, int intra_only,
+ int show_frame, const BLOCK_SIZE bsize, const char *filename) {
+ FILE *f = fopen(filename, "a");
+ fprintf(f, "%d,%d,%d,", bsize, show_frame, intra_only);
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ fprintf(f, "%d,", part_timing_stats->partition_decisions[idx]);
+ }
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ fprintf(f, "%d,", part_timing_stats->partition_attempts[idx]);
+ }
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ fprintf(f, "%" PRId64 ",", part_timing_stats->partition_times[idx]);
+ }
+ fprintf(f, "\n");
+ fclose(f);
+}
+
+static INLINE void accumulate_partition_timing_stats(
+ FramePartitionTimingStats *fr_part_timing_stats,
+ const PartitionTimingStats *part_timing_stats, BLOCK_SIZE bsize) {
+ const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize);
+ int *agg_attempts = fr_part_timing_stats->partition_attempts[bsize_idx];
+ int *agg_decisions = fr_part_timing_stats->partition_decisions[bsize_idx];
+ int64_t *agg_times = fr_part_timing_stats->partition_times[bsize_idx];
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ agg_attempts[idx] += part_timing_stats->partition_attempts[idx];
+ agg_decisions[idx] += part_timing_stats->partition_decisions[idx];
+ agg_times[idx] += part_timing_stats->partition_times[idx];
+ }
+}
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+// Initialize state variables of partition search used in
+// av1_rd_pick_partition().
+static void init_partition_search_state_params(
+ MACROBLOCK *x, AV1_COMP *const cpi, PartitionSearchState *part_search_state,
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams *blk_params = &part_search_state->part_blk_params;
+ const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+
+ // Initialization of block size related parameters.
+ blk_params->mi_step = mi_size_wide[bsize] / 2;
+ blk_params->mi_row = mi_row;
+ blk_params->mi_col = mi_col;
+ blk_params->mi_row_edge = mi_row + blk_params->mi_step;
+ blk_params->mi_col_edge = mi_col + blk_params->mi_step;
+ blk_params->width = block_size_wide[bsize];
+ blk_params->min_partition_size_1d =
+ block_size_wide[x->sb_enc.min_partition_size];
+ blk_params->subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ blk_params->split_bsize2 = blk_params->subsize;
+ blk_params->bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
+ blk_params->bsize = bsize;
+
+  // Check if the partition corresponds to an edge block.
+ blk_params->has_rows = (blk_params->mi_row_edge < mi_params->mi_rows);
+ blk_params->has_cols = (blk_params->mi_col_edge < mi_params->mi_cols);
+
+ // Update intra partitioning related info.
+ part_search_state->intra_part_info = &x->part_search_info;
+ // Prepare for segmentation CNN-based partitioning for intra-frame.
+ if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) {
+ part_search_state->intra_part_info->quad_tree_idx = 0;
+ part_search_state->intra_part_info->cnn_output_valid = 0;
+ }
+
+ // Set partition plane context index.
+ part_search_state->pl_ctx_idx =
+ blk_params->bsize_at_least_8x8
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : 0;
+
+ // Partition cost buffer update
+ ModeCosts *mode_costs = &x->mode_costs;
+ part_search_state->partition_cost =
+ mode_costs->partition_cost[part_search_state->pl_ctx_idx];
+
+ // Initialize HORZ and VERT win flags as true for all split partitions.
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ part_search_state->split_part_rect_win[i].rect_part_win[HORZ] = true;
+ part_search_state->split_part_rect_win[i].rect_part_win[VERT] = true;
+ }
+
+ // Initialize the rd cost.
+ av1_init_rd_stats(&part_search_state->this_rdc);
+
+ // Initialize RD costs for partition types to 0.
+ part_search_state->none_rd = 0;
+ av1_zero(part_search_state->split_rd);
+ av1_zero(part_search_state->rect_part_rd);
+
+ // Initialize SPLIT partition to be not ready.
+ av1_zero(part_search_state->is_split_ctx_is_ready);
+ // Initialize HORZ and VERT partitions to be not ready.
+ av1_zero(part_search_state->is_rect_ctx_is_ready);
+
+ // Chroma subsampling.
+ part_search_state->ss_x = x->e_mbd.plane[1].subsampling_x;
+ part_search_state->ss_y = x->e_mbd.plane[1].subsampling_y;
+
+ // Initialize partition search flags to defaults.
+ part_search_state->terminate_partition_search = 0;
+ part_search_state->do_square_split = blk_params->bsize_at_least_8x8;
+ part_search_state->do_rectangular_split =
+ cpi->oxcf.part_cfg.enable_rect_partitions &&
+ blk_params->bsize_at_least_8x8;
+ av1_zero(part_search_state->prune_rect_part);
+
+ // Initialize allowed partition types for the partition block.
+ part_search_state->partition_none_allowed =
+ av1_blk_has_rows_and_cols(blk_params);
+ part_search_state->partition_rect_allowed[HORZ] =
+ part_search_state->do_rectangular_split && blk_params->has_cols &&
+ get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ),
+ part_search_state->ss_x,
+ part_search_state->ss_y) != BLOCK_INVALID;
+ part_search_state->partition_rect_allowed[VERT] =
+ part_search_state->do_rectangular_split && blk_params->has_rows &&
+ get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT),
+ part_search_state->ss_x,
+ part_search_state->ss_y) != BLOCK_INVALID;
+
+ // Reset the flag indicating whether a partition leading to an rd cost lower
+ // than the bound best_rdc has been found.
+ part_search_state->found_best_partition = false;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ init_partition_block_timing_stats(&part_search_state->part_timing_stats);
+#endif // CONFIG_COLLECT_PARTITION_STATS
+}
+
+// Override partition cost buffer for the edge blocks.
+static void set_partition_cost_for_edge_blk(
+ AV1_COMMON const *cm, PartitionSearchState *part_search_state) {
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ assert(blk_params.bsize_at_least_8x8 && part_search_state->pl_ctx_idx >= 0);
+ const aom_cdf_prob *partition_cdf =
+ cm->fc->partition_cdf[part_search_state->pl_ctx_idx];
+ const int max_cost = av1_cost_symbol(0);
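+ // av1_cost_symbol(0) corresponds to the smallest representable symbol
+ // probability and hence the maximum symbol cost; initializing all entries
+ // to it disallows partition types that cannot be signaled at this edge.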
+ for (PARTITION_TYPE i = 0; i < PARTITION_TYPES; ++i)
+ part_search_state->tmp_partition_cost[i] = max_cost;
+ if (blk_params.has_cols) {
+ // At the bottom, the two possibilities are HORZ and SPLIT.
+ aom_cdf_prob bot_cdf[2];
+ partition_gather_vert_alike(bot_cdf, partition_cdf, blk_params.bsize);
+ static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT };
+ av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost, bot_cdf,
+ bot_inv_map);
+ } else if (blk_params.has_rows) {
+ // At the right, the two possibilities are VERT and SPLIT.
+ aom_cdf_prob rhs_cdf[2];
+ partition_gather_horz_alike(rhs_cdf, partition_cdf, blk_params.bsize);
+ static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT };
+ av1_cost_tokens_from_cdf(part_search_state->tmp_partition_cost, rhs_cdf,
+ rhs_inv_map);
+ } else {
+ // At the bottom right, we always split.
+ part_search_state->tmp_partition_cost[PARTITION_SPLIT] = 0;
+ }
+ // Override the partition cost buffer.
+ part_search_state->partition_cost = part_search_state->tmp_partition_cost;
+}
+
+// Reset the partition search state flags when
+// must_find_valid_partition is equal to 1.
+static AOM_INLINE void reset_part_limitations(
+ AV1_COMP *const cpi, PartitionSearchState *part_search_state) {
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const int is_rect_part_allowed =
+ blk_params.bsize_at_least_8x8 &&
+ cpi->oxcf.part_cfg.enable_rect_partitions &&
+ (blk_params.width > blk_params.min_partition_size_1d);
+ part_search_state->do_square_split =
+ blk_params.bsize_at_least_8x8 &&
+ (blk_params.width > blk_params.min_partition_size_1d);
+ part_search_state->partition_none_allowed =
+ av1_blk_has_rows_and_cols(&blk_params) &&
+ (blk_params.width >= blk_params.min_partition_size_1d);
+ part_search_state->partition_rect_allowed[HORZ] =
+ blk_params.has_cols && is_rect_part_allowed &&
+ get_plane_block_size(
+ get_partition_subsize(blk_params.bsize, PARTITION_HORZ),
+ part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID;
+ part_search_state->partition_rect_allowed[VERT] =
+ blk_params.has_rows && is_rect_part_allowed &&
+ get_plane_block_size(
+ get_partition_subsize(blk_params.bsize, PARTITION_VERT),
+ part_search_state->ss_x, part_search_state->ss_y) != BLOCK_INVALID;
+ part_search_state->terminate_partition_search = 0;
+}
+
+ // Rectangular partition evaluation at the sub-block level.
+static void rd_pick_rect_partition(AV1_COMP *const cpi, TileDataEnc *tile_data,
+ MACROBLOCK *x,
+ PICK_MODE_CONTEXT *cur_partition_ctx,
+ PartitionSearchState *part_search_state,
+ RD_STATS *best_rdc, const int idx,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ PARTITION_TYPE partition_type) {
+ // Subtract the rd cost accumulated so far from the best rd cost to obtain
+ // the remaining rd budget for this sub-block.
+ RD_STATS best_remain_rdcost;
+ av1_rd_stats_subtraction(x->rdmult, best_rdc, &part_search_state->sum_rdc,
+ &best_remain_rdcost);
+
+ // Obtain the best mode for the partition sub-block.
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &part_search_state->this_rdc,
+ partition_type, bsize, cur_partition_ctx, best_remain_rdcost);
+ av1_rd_cost_update(x->rdmult, &part_search_state->this_rdc);
+
+ // Update the partition rd cost with the current sub-block rd.
+ if (part_search_state->this_rdc.rate == INT_MAX) {
+ part_search_state->sum_rdc.rdcost = INT64_MAX;
+ } else {
+ part_search_state->sum_rdc.rate += part_search_state->this_rdc.rate;
+ part_search_state->sum_rdc.dist += part_search_state->this_rdc.dist;
+ av1_rd_cost_update(x->rdmult, &part_search_state->sum_rdc);
+ }
+ const RECT_PART_TYPE rect_part =
+ partition_type == PARTITION_HORZ ? HORZ : VERT;
+ part_search_state->rect_part_rd[rect_part][idx] =
+ part_search_state->this_rdc.rdcost;
+}
+
+typedef int (*active_edge_info)(const AV1_COMP *cpi, int mi_col, int mi_step);
+
+// Checks if HORZ / VERT partition search is allowed.
+static AOM_INLINE int is_rect_part_allowed(
+ const AV1_COMP *cpi, const PartitionSearchState *part_search_state,
+ const active_edge_info *active_edge, RECT_PART_TYPE rect_part,
+ const int mi_pos) {
+ const PartitionBlkParams *blk_params = &part_search_state->part_blk_params;
+ const int is_part_allowed =
+ (!part_search_state->terminate_partition_search &&
+ part_search_state->partition_rect_allowed[rect_part] &&
+ !part_search_state->prune_rect_part[rect_part] &&
+ (part_search_state->do_rectangular_split ||
+ active_edge[rect_part](cpi, mi_pos, blk_params->mi_step)));
+ return is_part_allowed;
+}
+
+static void rectangular_partition_search(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree,
+ RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ RD_RECT_PART_WIN_INFO *rect_part_win_info, const RECT_PART_TYPE start_type,
+ const RECT_PART_TYPE end_type) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ RD_STATS *sum_rdc = &part_search_state->sum_rdc;
+ const int rect_partition_type[NUM_RECT_PARTS] = { PARTITION_HORZ,
+ PARTITION_VERT };
+
+ // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][0]: mi_row position of
+ // HORZ and VERT partition types.
+ // mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][1]: mi_col position of
+ // HORZ and VERT partition types.
+ const int mi_pos_rect[NUM_RECT_PARTS][SUB_PARTITIONS_RECT][2] = {
+ { { blk_params.mi_row, blk_params.mi_col },
+ { blk_params.mi_row_edge, blk_params.mi_col } },
+ { { blk_params.mi_row, blk_params.mi_col },
+ { blk_params.mi_row, blk_params.mi_col_edge } }
+ };
+
+ // Initialize active edge_type function pointer
+ // for HORZ and VERT partition types.
+ active_edge_info active_edge_type[NUM_RECT_PARTS] = { av1_active_h_edge,
+ av1_active_v_edge };
+
+ // Indicates, for HORZ and VERT partition types, whether the second
+ // sub-block lies within the frame (i.e., the block is not an edge block).
+ const int is_not_edge_block[NUM_RECT_PARTS] = { blk_params.has_rows,
+ blk_params.has_cols };
+
+ // Initialize pc tree context for HORZ and VERT partition types.
+ PICK_MODE_CONTEXT **cur_ctx[NUM_RECT_PARTS][SUB_PARTITIONS_RECT] = {
+ { &pc_tree->horizontal[0], &pc_tree->horizontal[1] },
+ { &pc_tree->vertical[0], &pc_tree->vertical[1] }
+ };
+
+ // Loop over rectangular partition types.
+ for (RECT_PART_TYPE i = start_type; i <= end_type; i++) {
+ assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+ !part_search_state->partition_rect_allowed[i]));
+
+ // Check if the HORZ / VERT partition search is to be performed.
+ if (!is_rect_part_allowed(cpi, part_search_state, active_edge_type, i,
+ mi_pos_rect[i][0][i]))
+ continue;
+
+ // Sub-partition idx.
+ int sub_part_idx = 0;
+ PARTITION_TYPE partition_type = rect_partition_type[i];
+ blk_params.subsize =
+ get_partition_subsize(blk_params.bsize, partition_type);
+ assert(blk_params.subsize <= BLOCK_LARGEST);
+ av1_init_rd_stats(sum_rdc);
+ for (int j = 0; j < SUB_PARTITIONS_RECT; j++) {
+ if (cur_ctx[i][j][0] == NULL) {
+ cur_ctx[i][j][0] =
+ av1_alloc_pmc(cpi, blk_params.subsize, &td->shared_coeff_buf);
+ if (!cur_ctx[i][j][0])
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+ }
+ sum_rdc->rate = part_search_state->partition_cost[partition_type];
+ sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, 0);
+#if CONFIG_COLLECT_PARTITION_STATS
+ PartitionTimingStats *part_timing_stats =
+ &part_search_state->part_timing_stats;
+ if (best_rdc->rdcost - sum_rdc->rdcost >= 0) {
+ start_partition_block_timer(part_timing_stats, partition_type);
+ }
+#endif
+
+ // First sub-partition evaluation in HORZ / VERT partition type.
+ rd_pick_rect_partition(
+ cpi, tile_data, x, cur_ctx[i][sub_part_idx][0], part_search_state,
+ best_rdc, 0, mi_pos_rect[i][sub_part_idx][0],
+ mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type);
+
+ // Start of second sub-partition evaluation.
+ // Evaluate the second sub-partition if the accumulated cost is still less
+ // than the best cost and the block is not at a frame edge.
+ if (sum_rdc->rdcost < best_rdc->rdcost && is_not_edge_block[i]) {
+ const MB_MODE_INFO *const mbmi = &cur_ctx[i][sub_part_idx][0]->mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ // Neither palette mode nor cfl predicted.
+ if (pmi->palette_size[PLANE_TYPE_Y] == 0 &&
+ pmi->palette_size[PLANE_TYPE_UV] == 0) {
+ if (mbmi->uv_mode != UV_CFL_PRED)
+ part_search_state->is_rect_ctx_is_ready[i] = 1;
+ }
+ av1_update_state(cpi, td, cur_ctx[i][sub_part_idx][0], blk_params.mi_row,
+ blk_params.mi_col, blk_params.subsize, DRY_RUN_NORMAL);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL,
+ blk_params.subsize, NULL);
+
+ // Second sub-partition evaluation in HORZ / VERT partition type.
+ sub_part_idx = 1;
+ rd_pick_rect_partition(
+ cpi, tile_data, x, cur_ctx[i][sub_part_idx][0], part_search_state,
+ best_rdc, 1, mi_pos_rect[i][sub_part_idx][0],
+ mi_pos_rect[i][sub_part_idx][1], blk_params.subsize, partition_type);
+ }
+ // Update HORZ / VERT best partition.
+ if (sum_rdc->rdcost < best_rdc->rdcost) {
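+ // Recompute the exact rd cost from the accumulated rate and distortion
+ // before the final comparison against the best rd cost.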
+ sum_rdc->rdcost = RDCOST(x->rdmult, sum_rdc->rate, sum_rdc->dist);
+ if (sum_rdc->rdcost < best_rdc->rdcost) {
+ *best_rdc = *sum_rdc;
+ part_search_state->found_best_partition = true;
+ pc_tree->partitioning = partition_type;
+ }
+ } else {
+ // Update HORZ / VERT win flag.
+ if (rect_part_win_info != NULL)
+ rect_part_win_info->rect_part_win[i] = false;
+ }
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (part_timing_stats->timer_is_on) {
+ end_partition_block_timer(part_timing_stats, partition_type,
+ sum_rdc->rdcost);
+ }
+#endif
+ av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col,
+ blk_params.bsize, av1_num_planes(cm));
+ }
+}
+
+// AB partition type evaluation.
+static void rd_pick_ab_part(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+ PC_TREE *pc_tree, PICK_MODE_CONTEXT *dst_ctxs[SUB_PARTITIONS_AB],
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ const BLOCK_SIZE ab_subsize[SUB_PARTITIONS_AB],
+ const int ab_mi_pos[SUB_PARTITIONS_AB][2], const PARTITION_TYPE part_type,
+ const MB_MODE_INFO **mode_cache) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+ int64_t this_rdcost = 0;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ PartitionTimingStats *part_timing_stats =
+ &part_search_state->part_timing_stats;
+ {
+ RD_STATS tmp_sum_rdc;
+ av1_init_rd_stats(&tmp_sum_rdc);
+ tmp_sum_rdc.rate = part_search_state->partition_cost[part_type];
+ tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
+ if (best_rdc->rdcost - tmp_sum_rdc.rdcost >= 0) {
+ start_partition_block_timer(part_timing_stats, part_type);
+ }
+ }
+#endif
+
+ // Test this partition and update the best partition.
+ const bool find_best_ab_part = rd_test_partition3(
+ cpi, td, tile_data, tp, pc_tree, best_rdc, &this_rdcost, dst_ctxs, mi_row,
+ mi_col, bsize, part_type, ab_subsize, ab_mi_pos, mode_cache);
+ part_search_state->found_best_partition |= find_best_ab_part;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (part_timing_stats->timer_is_on) {
+ if (!find_best_ab_part) this_rdcost = INT64_MAX;
+ end_partition_block_timer(part_timing_stats, part_type, this_rdcost);
+ }
+#endif
+ av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+
+// Set mode search context.
+static AOM_INLINE void set_mode_search_ctx(
+ PC_TREE *pc_tree, const int is_ctx_ready[NUM_AB_PARTS][2],
+ PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2]) {
+ mode_srch_ctx[HORZ_B][0] = &pc_tree->horizontal[0];
+ mode_srch_ctx[VERT_B][0] = &pc_tree->vertical[0];
+
+ if (is_ctx_ready[HORZ_A][0])
+ mode_srch_ctx[HORZ_A][0] = &pc_tree->split[0]->none;
+
+ if (is_ctx_ready[VERT_A][0])
+ mode_srch_ctx[VERT_A][0] = &pc_tree->split[0]->none;
+
+ if (is_ctx_ready[HORZ_A][1])
+ mode_srch_ctx[HORZ_A][1] = &pc_tree->split[1]->none;
+}
+
+static AOM_INLINE void copy_partition_mode_from_mode_context(
+ const MB_MODE_INFO **dst_mode, const PICK_MODE_CONTEXT *ctx) {
+ if (ctx && ctx->rd_stats.rate < INT_MAX) {
+ *dst_mode = &ctx->mic;
+ } else {
+ *dst_mode = NULL;
+ }
+}
+
+static AOM_INLINE void copy_partition_mode_from_pc_tree(
+ const MB_MODE_INFO **dst_mode, const PC_TREE *pc_tree) {
+ if (pc_tree) {
+ copy_partition_mode_from_mode_context(dst_mode, pc_tree->none);
+ } else {
+ *dst_mode = NULL;
+ }
+}
+
+static AOM_INLINE void set_mode_cache_for_partition_ab(
+ const MB_MODE_INFO **mode_cache, const PC_TREE *pc_tree,
+ AB_PART_TYPE ab_part_type) {
+ switch (ab_part_type) {
+ case HORZ_A:
+ copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]);
+ copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]);
+ copy_partition_mode_from_mode_context(&mode_cache[2],
+ pc_tree->horizontal[1]);
+ break;
+ case HORZ_B:
+ copy_partition_mode_from_mode_context(&mode_cache[0],
+ pc_tree->horizontal[0]);
+ copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]);
+ copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]);
+ break;
+ case VERT_A:
+ copy_partition_mode_from_pc_tree(&mode_cache[0], pc_tree->split[0]);
+ copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[2]);
+ copy_partition_mode_from_mode_context(&mode_cache[2],
+ pc_tree->vertical[1]);
+ break;
+ case VERT_B:
+ copy_partition_mode_from_mode_context(&mode_cache[0],
+ pc_tree->vertical[0]);
+ copy_partition_mode_from_pc_tree(&mode_cache[1], pc_tree->split[1]);
+ copy_partition_mode_from_pc_tree(&mode_cache[2], pc_tree->split[3]);
+ break;
+ default: assert(0 && "Invalid ab partition type!\n");
+ }
+}
+
+ // AB partition type search.
+static void ab_partitions_search(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+ PC_TREE *pc_tree, PartitionSearchState *part_search_state,
+ RD_STATS *best_rdc, RD_RECT_PART_WIN_INFO *rect_part_win_info,
+ int pb_source_variance, int ext_partition_allowed,
+ const AB_PART_TYPE start_type, const AB_PART_TYPE end_type) {
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+
+ if (part_search_state->terminate_partition_search) {
+ return;
+ }
+
+ int ab_partitions_allowed[NUM_AB_PARTS];
+ // Prune AB partitions
+ av1_prune_ab_partitions(cpi, x, pc_tree, pb_source_variance, best_rdc->rdcost,
+ rect_part_win_info, ext_partition_allowed,
+ part_search_state, ab_partitions_allowed);
+
+ // Flags to indicate whether the mode search is done.
+ const int is_ctx_ready[NUM_AB_PARTS][2] = {
+ { part_search_state->is_split_ctx_is_ready[0],
+ part_search_state->is_split_ctx_is_ready[1] },
+ { part_search_state->is_rect_ctx_is_ready[HORZ], 0 },
+ { part_search_state->is_split_ctx_is_ready[0], 0 },
+ { part_search_state->is_rect_ctx_is_ready[VERT], 0 }
+ };
+
+ // Current partition context.
+ PICK_MODE_CONTEXT **cur_part_ctxs[NUM_AB_PARTS] = { pc_tree->horizontala,
+ pc_tree->horizontalb,
+ pc_tree->verticala,
+ pc_tree->verticalb };
+
+ // Context of already evaluated partition types.
+ PICK_MODE_CONTEXT **mode_srch_ctx[NUM_AB_PARTS][2];
+ // Set context of already evaluated partition types.
+ set_mode_search_ctx(pc_tree, is_ctx_ready, mode_srch_ctx);
+
+ // Array of sub-partition size of AB partition types.
+ const BLOCK_SIZE ab_subsize[NUM_AB_PARTS][SUB_PARTITIONS_AB] = {
+ { blk_params.split_bsize2, blk_params.split_bsize2,
+ get_partition_subsize(bsize, PARTITION_HORZ_A) },
+ { get_partition_subsize(bsize, PARTITION_HORZ_B), blk_params.split_bsize2,
+ blk_params.split_bsize2 },
+ { blk_params.split_bsize2, blk_params.split_bsize2,
+ get_partition_subsize(bsize, PARTITION_VERT_A) },
+ { get_partition_subsize(bsize, PARTITION_VERT_B), blk_params.split_bsize2,
+ blk_params.split_bsize2 }
+ };
+
+ // Array of mi_row, mi_col positions corresponding to each sub-partition of
+ // the AB partition types.
+ const int ab_mi_pos[NUM_AB_PARTS][SUB_PARTITIONS_AB][2] = {
+ { { mi_row, mi_col },
+ { mi_row, blk_params.mi_col_edge },
+ { blk_params.mi_row_edge, mi_col } },
+ { { mi_row, mi_col },
+ { blk_params.mi_row_edge, mi_col },
+ { blk_params.mi_row_edge, blk_params.mi_col_edge } },
+ { { mi_row, mi_col },
+ { blk_params.mi_row_edge, mi_col },
+ { mi_row, blk_params.mi_col_edge } },
+ { { mi_row, mi_col },
+ { mi_row, blk_params.mi_col_edge },
+ { blk_params.mi_row_edge, blk_params.mi_col_edge } }
+ };
+
+ // Loop over AB partition types.
+ for (AB_PART_TYPE ab_part_type = start_type; ab_part_type <= end_type;
+ ab_part_type++) {
+ const PARTITION_TYPE part_type = ab_part_type + PARTITION_HORZ_A;
+
+ // Check if the AB partition search is to be performed.
+ if (!ab_partitions_allowed[ab_part_type]) {
+ continue;
+ }
+
+ blk_params.subsize = get_partition_subsize(bsize, part_type);
+ for (int i = 0; i < SUB_PARTITIONS_AB; i++) {
+ // Set AB partition context.
+ cur_part_ctxs[ab_part_type][i] = av1_alloc_pmc(
+ cpi, ab_subsize[ab_part_type][i], &td->shared_coeff_buf);
+ if (!cur_part_ctxs[ab_part_type][i])
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ // Set mode as not ready.
+ cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0;
+ }
+
+ if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab) {
+ // We can copy directly the mode search results if we have already
+ // searched the current block and the contexts match.
+ if (is_ctx_ready[ab_part_type][0]) {
+ av1_copy_tree_context(cur_part_ctxs[ab_part_type][0],
+ mode_srch_ctx[ab_part_type][0][0]);
+ cur_part_ctxs[ab_part_type][0]->mic.partition = part_type;
+ cur_part_ctxs[ab_part_type][0]->rd_mode_is_ready = 1;
+ if (is_ctx_ready[ab_part_type][1]) {
+ av1_copy_tree_context(cur_part_ctxs[ab_part_type][1],
+ mode_srch_ctx[ab_part_type][1][0]);
+ cur_part_ctxs[ab_part_type][1]->mic.partition = part_type;
+ cur_part_ctxs[ab_part_type][1]->rd_mode_is_ready = 1;
+ }
+ }
+ }
+
+ // Even if the contexts don't match, we can still speed up by reusing the
+ // previous prediction mode.
+ const MB_MODE_INFO *mode_cache[3] = { NULL, NULL, NULL };
+ if (cpi->sf.part_sf.reuse_best_prediction_for_part_ab) {
+ set_mode_cache_for_partition_ab(mode_cache, pc_tree, ab_part_type);
+ }
+
+ // Evaluation of AB partition type.
+ rd_pick_ab_part(cpi, td, tile_data, tp, x, x_ctx, pc_tree,
+ cur_part_ctxs[ab_part_type], part_search_state, best_rdc,
+ ab_subsize[ab_part_type], ab_mi_pos[ab_part_type],
+ part_type, mode_cache);
+ }
+}
+
+// Set mi positions for HORZ4 / VERT4 sub-block partitions.
+static void set_mi_pos_partition4(const int inc_step[NUM_PART4_TYPES],
+ int mi_pos[SUB_PARTITIONS_PART4][2],
+ const int mi_row, const int mi_col) {
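+ // Only one of inc_step[HORZ4] / inc_step[VERT4] is nonzero, so the four
+ // sub-blocks advance along a single axis. E.g., for PARTITION_HORZ_4 on
+ // BLOCK_16X16 the caller sets inc_step = { 1, 0 } in mi units, giving rows
+ // mi_row .. mi_row + 3 at a fixed mi_col.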
+ for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; i++) {
+ mi_pos[i][0] = mi_row + i * inc_step[HORZ4];
+ mi_pos[i][1] = mi_col + i * inc_step[VERT4];
+ }
+}
+
+// Set context and RD cost for HORZ4 / VERT4 partition types.
+static void set_4_part_ctx_and_rdcost(
+ MACROBLOCK *x, const AV1_COMP *const cpi, ThreadData *td,
+ PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4],
+ PartitionSearchState *part_search_state, PARTITION_TYPE partition_type,
+ BLOCK_SIZE bsize) {
+ // Initialize sum_rdc RD cost structure.
+ av1_init_rd_stats(&part_search_state->sum_rdc);
+ const int subsize = get_partition_subsize(bsize, partition_type);
+ part_search_state->sum_rdc.rate =
+ part_search_state->partition_cost[partition_type];
+ part_search_state->sum_rdc.rdcost =
+ RDCOST(x->rdmult, part_search_state->sum_rdc.rate, 0);
+ for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ cur_part_ctx[i] = av1_alloc_pmc(cpi, subsize, &td->shared_coeff_buf);
+ if (!cur_part_ctx[i])
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ }
+}
+
+// Partition search of HORZ4 / VERT4 partition types.
+static void rd_pick_4partition(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TokenExtra **tp, MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+ PC_TREE *pc_tree, PICK_MODE_CONTEXT *cur_part_ctx[SUB_PARTITIONS_PART4],
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ const int inc_step[NUM_PART4_TYPES], PARTITION_TYPE partition_type) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ // Frame boundaries in mi units, used to validate the HORZ4 / VERT4
+ // sub-block positions.
+ int mi_pos_check[NUM_PART4_TYPES] = { cm->mi_params.mi_rows,
+ cm->mi_params.mi_cols };
+ const PART4_TYPES part4_idx = (partition_type != PARTITION_HORZ_4);
+ int mi_pos[SUB_PARTITIONS_PART4][2];
+
+ blk_params.subsize = get_partition_subsize(blk_params.bsize, partition_type);
+ // Set partition context and RD cost.
+ set_4_part_ctx_and_rdcost(x, cpi, td, cur_part_ctx, part_search_state,
+ partition_type, blk_params.bsize);
+ // Set mi positions for sub-block sizes.
+ set_mi_pos_partition4(inc_step, mi_pos, blk_params.mi_row, blk_params.mi_col);
+#if CONFIG_COLLECT_PARTITION_STATS
+ PartitionTimingStats *part_timing_stats =
+ &part_search_state->part_timing_stats;
+ if (best_rdc->rdcost - part_search_state->sum_rdc.rdcost >= 0) {
+ start_partition_block_timer(part_timing_stats, partition_type);
+ }
+#endif
+ // Loop over sub-block partitions.
+ for (PART4_TYPES i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ if (i > 0 && mi_pos[i][part4_idx] >= mi_pos_check[part4_idx]) break;
+
+ // Sub-block evaluation of Horz4 / Vert4 partition type.
+ cur_part_ctx[i]->rd_mode_is_ready = 0;
+ if (!rd_try_subblock(
+ cpi, td, tile_data, tp, (i == SUB_PARTITIONS_PART4 - 1),
+ mi_pos[i][0], mi_pos[i][1], blk_params.subsize, *best_rdc,
+ &part_search_state->sum_rdc, partition_type, cur_part_ctx[i])) {
+ av1_invalid_rd_stats(&part_search_state->sum_rdc);
+ break;
+ }
+ }
+
+ // Calculate the total cost and update the best partition.
+ av1_rd_cost_update(x->rdmult, &part_search_state->sum_rdc);
+ if (part_search_state->sum_rdc.rdcost < best_rdc->rdcost) {
+ *best_rdc = part_search_state->sum_rdc;
+ part_search_state->found_best_partition = true;
+ pc_tree->partitioning = partition_type;
+ }
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (part_timing_stats->timer_is_on) {
+ end_partition_block_timer(part_timing_stats, partition_type,
+ part_search_state->sum_rdc.rdcost);
+ }
+#endif
+ av1_restore_context(x, x_ctx, blk_params.mi_row, blk_params.mi_col,
+ blk_params.bsize, av1_num_planes(cm));
+}
+
+// Do not evaluate extended partitions if NONE partition is skippable.
+static INLINE int prune_ext_part_none_skippable(
+ PICK_MODE_CONTEXT *part_none, int must_find_valid_partition,
+ int skip_non_sq_part_based_on_none, BLOCK_SIZE bsize) {
+ if ((skip_non_sq_part_based_on_none >= 1) && (part_none != NULL)) {
+ if (part_none->skippable && !must_find_valid_partition &&
+ bsize >= BLOCK_16X16) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+ // Decide whether AB partition search is allowed.
+static int allow_ab_partition_search(PartitionSearchState *part_search_state,
+ PARTITION_SPEED_FEATURES *part_sf,
+ PARTITION_TYPE curr_best_part,
+ int must_find_valid_partition,
+ int prune_ext_part_state,
+ int64_t best_rdcost) {
+ const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+
+ // Do not prune if there is no valid partition
+ if (best_rdcost == INT64_MAX) return 1;
+
+ // Determine the bsize threshold above which AB partitions are evaluated.
+ BLOCK_SIZE ab_bsize_thresh = part_sf->ext_partition_eval_thresh;
+ if (part_sf->ext_part_eval_based_on_cur_best && !must_find_valid_partition &&
+ !(curr_best_part == PARTITION_HORZ || curr_best_part == PARTITION_VERT))
+ ab_bsize_thresh = BLOCK_128X128;
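+ // BLOCK_128X128 is the largest block size, so the "bsize > ab_bsize_thresh"
+ // check below can never pass, effectively disabling AB partitions.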
+
+ // AB partitions are only allowed for square block sizes BLOCK_16X16 or
+ // higher, so ab_bsize_thresh must be large enough to exclude BLOCK_4X4 and
+ // BLOCK_8X8.
+ assert(ab_bsize_thresh >= BLOCK_8X8);
+
+ int ab_partition_allowed =
+ part_search_state->do_rectangular_split && bsize > ab_bsize_thresh &&
+ av1_blk_has_rows_and_cols(&blk_params) && !prune_ext_part_state;
+
+ return ab_partition_allowed;
+}
+
+// Prune 4-way partitions based on the number of horz/vert wins
+// in the current block and sub-blocks in PARTITION_SPLIT.
+static void prune_4_partition_using_split_info(
+ AV1_COMP *const cpi, MACROBLOCK *x, PartitionSearchState *part_search_state,
+ int part4_search_allowed[NUM_PART4_TYPES]) {
+ PART4_TYPES cur_part[NUM_PART4_TYPES] = { HORZ4, VERT4 };
+ // Count of child blocks in which HORZ or VERT partition has won
+ int num_child_rect_win[NUM_RECT_PARTS] = { 0, 0 };
+ // Prune HORZ4/VERT4 partitions based on the number of HORZ/VERT winners
+ // among the split partitions.
+ // Conservative pruning for high quantizers.
+ const int num_win_thresh = AOMMIN(3 * (MAXQ - x->qindex) / MAXQ + 1, 3);
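+ // E.g., with MAXQ = 255: qindex = 255 gives a threshold of 1 (prune unless
+ // at least one sub-block picked the rectangular type), while qindex = 0
+ // gives min(4, 3) = 3.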
+
+ for (RECT_PART_TYPE i = HORZ; i < NUM_RECT_PARTS; i++) {
+ if (!(cpi->sf.part_sf.prune_ext_part_using_split_info &&
+ part4_search_allowed[cur_part[i]]))
+ continue;
+ // Loop over the PARTITION_SPLIT sub-blocks and gather their
+ // rectangular-partition winner info.
+ for (int idx = 0; idx < SUB_PARTITIONS_SPLIT; idx++)
+ num_child_rect_win[i] +=
+ (part_search_state->split_part_rect_win[idx].rect_part_win[i]) ? 1
+ : 0;
+ if (num_child_rect_win[i] < num_win_thresh) {
+ part4_search_allowed[cur_part[i]] = 0;
+ }
+ }
+}
+
+// Prune 4-way partition search.
+static void prune_4_way_partition_search(
+ AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree,
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ int pb_source_variance, int prune_ext_part_state,
+ int part4_search_allowed[NUM_PART4_TYPES]) {
+ const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+
+ // Do not prune if there is no valid partition
+ if (best_rdc->rdcost == INT64_MAX) return;
+
+ // Determine the bsize threshold above which 4-way partitions are evaluated.
+ BLOCK_SIZE part4_bsize_thresh = cpi->sf.part_sf.ext_partition_eval_thresh;
+ if (cpi->sf.part_sf.ext_part_eval_based_on_cur_best &&
+ !x->must_find_valid_partition && pc_tree->partitioning == PARTITION_NONE)
+ part4_bsize_thresh = BLOCK_128X128;
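+ // As with AB partitions, a threshold of BLOCK_128X128 means the
+ // "bsize > part4_bsize_thresh" check below can never pass.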
+
+ // 4-way partitions are only allowed for BLOCK_16X16, BLOCK_32X32, and
+ // BLOCK_64X64, so part4_bsize_thresh must be large enough to exclude
+ // BLOCK_4X4 and BLOCK_8X8.
+ assert(part4_bsize_thresh >= BLOCK_8X8);
+
+ bool partition4_allowed =
+ part_search_state->do_rectangular_split && bsize > part4_bsize_thresh &&
+ av1_blk_has_rows_and_cols(&blk_params) && !prune_ext_part_state;
+
+ // Disable 4-way partition search when the block width is smaller than the
+ // minimum partition width shifted left by the prune_part4_search level.
+ if (blk_params.width < (blk_params.min_partition_size_1d
+ << cpi->sf.part_sf.prune_part4_search)) {
+ part4_search_allowed[HORZ4] = 0;
+ part4_search_allowed[VERT4] = 0;
+ return;
+ }
+
+ PARTITION_TYPE cur_part[NUM_PART4_TYPES] = { PARTITION_HORZ_4,
+ PARTITION_VERT_4 };
+ const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+ // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
+ // PARTITION_VERT_4 for this block. Restrict it further here: 1:4
+ // partitions must be enabled, and since 128x32 / 32x128 blocks are not
+ // allowed, bsize must not be BLOCK_128X128.
+ partition4_allowed &=
+ part_cfg->enable_1to4_partitions && bsize != BLOCK_128X128;
+
+ for (PART4_TYPES i = HORZ4; i < NUM_PART4_TYPES; i++) {
+ part4_search_allowed[i] =
+ partition4_allowed && part_search_state->partition_rect_allowed[i] &&
+ get_plane_block_size(get_partition_subsize(bsize, cur_part[i]),
+ part_search_state->ss_x,
+ part_search_state->ss_y) != BLOCK_INVALID;
+ }
+ // Prune out 4-way partitions based on the current best partition.
+ if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 2) {
+ part4_search_allowed[HORZ4] &= (pc_tree->partitioning == PARTITION_HORZ ||
+ pc_tree->partitioning == PARTITION_HORZ_A ||
+ pc_tree->partitioning == PARTITION_HORZ_B ||
+ pc_tree->partitioning == PARTITION_SPLIT ||
+ pc_tree->partitioning == PARTITION_NONE);
+ part4_search_allowed[VERT4] &= (pc_tree->partitioning == PARTITION_VERT ||
+ pc_tree->partitioning == PARTITION_VERT_A ||
+ pc_tree->partitioning == PARTITION_VERT_B ||
+ pc_tree->partitioning == PARTITION_SPLIT ||
+ pc_tree->partitioning == PARTITION_NONE);
+ }
+
+ // Prune out some 4-way partitions using a DNN that takes the rd costs of
+ // sub-blocks from the basic partition types.
+ if (cpi->sf.part_sf.ml_prune_partition && partition4_allowed &&
+ part_search_state->partition_rect_allowed[HORZ] &&
+ part_search_state->partition_rect_allowed[VERT]) {
+ av1_ml_prune_4_partition(cpi, x, pc_tree->partitioning, best_rdc->rdcost,
+ part_search_state, part4_search_allowed,
+ pb_source_variance);
+ }
+
+ // Prune out 4-way partitions based on the number of HORZ/VERT wins in the
+ // current block and its PARTITION_SPLIT sub-blocks.
+ prune_4_partition_using_split_info(cpi, x, part_search_state,
+ part4_search_allowed);
+}
+
+// Set params needed for PARTITION_NONE search.
+static void set_none_partition_params(const AV1_COMP *const cpi, ThreadData *td,
+ MACROBLOCK *x, PC_TREE *pc_tree,
+ PartitionSearchState *part_search_state,
+ RD_STATS *best_remain_rdcost,
+ RD_STATS *best_rdc, int *pt_cost) {
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ RD_STATS partition_rdcost;
+ // Set PARTITION_NONE context.
+ if (pc_tree->none == NULL)
+ pc_tree->none = av1_alloc_pmc(cpi, blk_params.bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+
+ // Set PARTITION_NONE type cost.
+ if (part_search_state->partition_none_allowed) {
+ if (blk_params.bsize_at_least_8x8) {
+ *pt_cost = part_search_state->partition_cost[PARTITION_NONE] < INT_MAX
+ ? part_search_state->partition_cost[PARTITION_NONE]
+ : 0;
+ }
+
+ // Initialize the RD stats structure.
+ av1_init_rd_stats(&partition_rdcost);
+ partition_rdcost.rate = *pt_cost;
+ av1_rd_cost_update(x->rdmult, &partition_rdcost);
+ av1_rd_stats_subtraction(x->rdmult, best_rdc, &partition_rdcost,
+ best_remain_rdcost);
+ }
+}
+
+// Skip other partitions based on PARTITION_NONE rd cost.
+static void prune_partitions_after_none(AV1_COMP *const cpi, MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ PICK_MODE_CONTEXT *ctx_none,
+ PartitionSearchState *part_search_state,
+ RD_STATS *best_rdc,
+ unsigned int *pb_source_variance) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ RD_STATS *this_rdc = &part_search_state->this_rdc;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+ assert(bsize < BLOCK_SIZES_ALL);
+
+ if (!frame_is_intra_only(cm) &&
+ (part_search_state->do_square_split ||
+ part_search_state->do_rectangular_split) &&
+ !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
+ const int use_ml_based_breakout =
+ bsize <= cpi->sf.part_sf.use_square_partition_only_threshold &&
+ bsize > BLOCK_4X4 && cpi->sf.part_sf.ml_predict_breakout_level >= 1;
+ if (use_ml_based_breakout) {
+ av1_ml_predict_breakout(cpi, x, this_rdc, *pb_source_variance, xd->bd,
+ part_search_state);
+ }
+
+ // Adjust dist breakout threshold according to the partition size.
+ const int64_t dist_breakout_thr =
+ cpi->sf.part_sf.partition_search_breakout_dist_thr >>
+ ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
+ (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
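+ // E.g., with MAX_SB_SIZE_LOG2 = 7 the shift is 0 for BLOCK_128X128 and 2
+ // for BLOCK_64X64, so smaller partitions use proportionally smaller
+ // distortion breakout thresholds.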
+ const int rate_breakout_thr =
+ cpi->sf.part_sf.partition_search_breakout_rate_thr *
+ num_pels_log2_lookup[bsize];
+ // If all y, u, v transform blocks in this partition are skippable,
+ // and the dist & rate are within the thresholds, the partition
+ // search is terminated for the current branch of the partition search
+ // tree. The dist & rate thresholds are set to 0 at speed 0 to
+ // disable the early termination at that speed.
+ if (best_rdc->dist < dist_breakout_thr &&
+ best_rdc->rate < rate_breakout_thr) {
+ part_search_state->do_square_split = 0;
+ part_search_state->do_rectangular_split = 0;
+ }
+ }
+
+ // Early termination: using simple_motion_search features and the rate,
+ // distortion, and rd cost of PARTITION_NONE, a DNN decides whether to
+ // terminate the partition search early at PARTITION_NONE.
+ if (cpi->sf.part_sf.simple_motion_search_early_term_none && cm->show_frame &&
+ !frame_is_intra_only(cm) && bsize >= BLOCK_16X16 &&
+ av1_blk_has_rows_and_cols(&blk_params) && this_rdc->rdcost < INT64_MAX &&
+ this_rdc->rdcost >= 0 && this_rdc->rate < INT_MAX &&
+ this_rdc->rate >= 0 &&
+ (part_search_state->do_square_split ||
+ part_search_state->do_rectangular_split)) {
+ av1_simple_motion_search_early_term_none(cpi, x, sms_tree, this_rdc,
+ part_search_state);
+ }
+}
+
+// Decide early termination and rectangular partition pruning
+// based on PARTITION_NONE and PARTITION_SPLIT costs.
+static void prune_partitions_after_split(
+ AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ int64_t part_none_rd, int64_t part_split_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+ assert(bsize < BLOCK_SIZES_ALL);
+
+ // Early termination: using the rd costs of PARTITION_NONE and subblocks
+ // from PARTITION_SPLIT to determine an early breakout.
+ if (cpi->sf.part_sf.ml_early_term_after_part_split_level &&
+ !frame_is_intra_only(cm) &&
+ !part_search_state->terminate_partition_search &&
+ part_search_state->do_rectangular_split &&
+ (part_search_state->partition_rect_allowed[HORZ] ||
+ part_search_state->partition_rect_allowed[VERT])) {
+ av1_ml_early_term_after_split(
+ cpi, x, sms_tree, best_rdc->rdcost, part_none_rd, part_split_rd,
+ part_search_state->split_rd, part_search_state);
+ }
+
+ // Use the rd costs of PARTITION_NONE and subblocks from PARTITION_SPLIT
+ // to prune out rectangular partitions in some directions.
+ if (!cpi->sf.part_sf.ml_early_term_after_part_split_level &&
+ cpi->sf.part_sf.ml_prune_partition && !frame_is_intra_only(cm) &&
+ (part_search_state->partition_rect_allowed[HORZ] ||
+ part_search_state->partition_rect_allowed[VERT]) &&
+ !(part_search_state->prune_rect_part[HORZ] ||
+ part_search_state->prune_rect_part[VERT]) &&
+ !part_search_state->terminate_partition_search) {
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, av1_num_planes(cm),
+ bsize);
+ av1_ml_prune_rect_partition(cpi, x, best_rdc->rdcost,
+ part_search_state->none_rd,
+ part_search_state->split_rd, part_search_state);
+ }
+}
+
+ // Returns true if either the left or the top neighbor block is larger than
+ // the current block; false otherwise.
+static AOM_INLINE bool is_neighbor_blk_larger_than_cur_blk(
+ const MACROBLOCKD *xd, BLOCK_SIZE bsize) {
+ const int cur_blk_area = (block_size_high[bsize] * block_size_wide[bsize]);
+ if (xd->left_available) {
+ const BLOCK_SIZE left_bsize = xd->left_mbmi->bsize;
+ if (block_size_high[left_bsize] * block_size_wide[left_bsize] >
+ cur_blk_area)
+ return true;
+ }
+
+ if (xd->up_available) {
+ const BLOCK_SIZE above_bsize = xd->above_mbmi->bsize;
+ if (block_size_high[above_bsize] * block_size_wide[above_bsize] >
+ cur_blk_area)
+ return true;
+ }
+ return false;
+}
+
+static AOM_INLINE void prune_rect_part_using_none_pred_mode(
+ const MACROBLOCKD *xd, PartitionSearchState *part_state,
+ PREDICTION_MODE mode, BLOCK_SIZE bsize) {
+ if (mode == DC_PRED || mode == SMOOTH_PRED) {
+ // If the prediction mode of the NONE partition is either DC_PRED or
+ // SMOOTH_PRED, the current block likely has low variation. In this case,
+ // HORZ and VERT partitions are pruned if at least one of the left and top
+ // neighbor blocks is larger than the current block.
+ if (is_neighbor_blk_larger_than_cur_blk(xd, bsize)) {
+ part_state->prune_rect_part[HORZ] = 1;
+ part_state->prune_rect_part[VERT] = 1;
+ }
+ } else if (mode == D67_PRED || mode == V_PRED || mode == D113_PRED) {
+ // If the prediction mode chosen by NONE partition is close to 90 degrees,
+ // it implies a dominant vertical pattern, and the chance of choosing a
+ // vertical rectangular partition is high. Hence, horizontal partition is
+ // pruned in these cases.
+ part_state->prune_rect_part[HORZ] = 1;
+ } else if (mode == D157_PRED || mode == H_PRED || mode == D203_PRED) {
+ // If the prediction mode chosen by NONE partition is close to 180 degrees,
+ // it implies a dominant horizontal pattern, and the chance of choosing a
+ // horizontal rectangular partition is high. Hence, vertical partition is
+ // pruned in these cases.
+ part_state->prune_rect_part[VERT] = 1;
+ }
+}
+
+// PARTITION_NONE search.
+static void none_partition_search(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, MACROBLOCK *x,
+ PC_TREE *pc_tree, SIMPLE_MOTION_DATA_TREE *sms_tree,
+ RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ unsigned int *pb_source_variance, int64_t *none_rd, int64_t *part_none_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ RD_STATS *this_rdc = &part_search_state->this_rdc;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+ assert(bsize < BLOCK_SIZES_ALL);
+
+ if (part_search_state->terminate_partition_search ||
+ !part_search_state->partition_none_allowed)
+ return;
+
+ int pt_cost = 0;
+ RD_STATS best_remain_rdcost;
+ av1_invalid_rd_stats(&best_remain_rdcost);
+
+ // Set PARTITION_NONE context and cost.
+ set_none_partition_params(cpi, td, x, pc_tree, part_search_state,
+ &best_remain_rdcost, best_rdc, &pt_cost);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ // Timer start for partition None.
+ PartitionTimingStats *part_timing_stats =
+ &part_search_state->part_timing_stats;
+ if (best_remain_rdcost.rdcost >= 0) {
+ start_partition_block_timer(part_timing_stats, PARTITION_NONE);
+ }
+#endif
+ // PARTITION_NONE evaluation and cost update.
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc, PARTITION_NONE,
+ bsize, pc_tree->none, best_remain_rdcost);
+
+ av1_rd_cost_update(x->rdmult, this_rdc);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ // Timer end for partition None.
+ if (part_timing_stats->timer_is_on) {
+ RD_STATS tmp_rdc;
+ av1_init_rd_stats(&tmp_rdc);
+ if (this_rdc->rate != INT_MAX) {
+ tmp_rdc.rate = this_rdc->rate;
+ tmp_rdc.dist = this_rdc->dist;
+ tmp_rdc.rdcost = this_rdc->rdcost;
+ if (blk_params.bsize_at_least_8x8) {
+ tmp_rdc.rate += pt_cost;
+ tmp_rdc.rdcost = RDCOST(x->rdmult, tmp_rdc.rate, tmp_rdc.dist);
+ }
+ }
+ end_partition_block_timer(part_timing_stats, PARTITION_NONE,
+ tmp_rdc.rdcost);
+ }
+#endif
+ *pb_source_variance = x->source_variance;
+ if (none_rd) *none_rd = this_rdc->rdcost;
+ part_search_state->none_rd = this_rdc->rdcost;
+ if (this_rdc->rate != INT_MAX) {
+ // Record picked ref frame to prune ref frames for other partition types.
+ if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions) {
+ const int ref_type = av1_ref_frame_type(pc_tree->none->mic.ref_frame);
+ av1_update_picked_ref_frames_mask(
+ x, ref_type, bsize, cm->seq_params->mib_size, mi_row, mi_col);
+ }
+
+ // Calculate the total cost and update the best partition.
+ if (blk_params.bsize_at_least_8x8) {
+ this_rdc->rate += pt_cost;
+ this_rdc->rdcost = RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist);
+ }
+ *part_none_rd = this_rdc->rdcost;
+ if (this_rdc->rdcost < best_rdc->rdcost) {
+ *best_rdc = *this_rdc;
+ part_search_state->found_best_partition = true;
+ if (blk_params.bsize_at_least_8x8) {
+ pc_tree->partitioning = PARTITION_NONE;
+ }
+
+ // Disable split and rectangular partition search
+ // based on PARTITION_NONE cost.
+ prune_partitions_after_none(cpi, x, sms_tree, pc_tree->none,
+ part_search_state, best_rdc,
+ pb_source_variance);
+ }
+
+ if (cpi->sf.part_sf.prune_rect_part_using_none_pred_mode)
+ prune_rect_part_using_none_pred_mode(&x->e_mbd, part_search_state,
+ pc_tree->none->mic.mode, bsize);
+ }
+ av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+
+// PARTITION_SPLIT search.
+static void split_partition_search(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TokenExtra **tp, MACROBLOCK *x, PC_TREE *pc_tree,
+ SIMPLE_MOTION_DATA_TREE *sms_tree, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+ PartitionSearchState *part_search_state, RD_STATS *best_rdc,
+ SB_MULTI_PASS_MODE multi_pass_mode, int64_t *part_split_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+ assert(bsize < BLOCK_SIZES_ALL);
+ RD_STATS sum_rdc = part_search_state->sum_rdc;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+ // Check if partition split is allowed.
+ if (part_search_state->terminate_partition_search ||
+ !part_search_state->do_square_split)
+ return;
+
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ if (pc_tree->split[i] == NULL)
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ pc_tree->split[i]->index = i;
+ }
+
+ // Initialization of this partition RD stats.
+ av1_init_rd_stats(&sum_rdc);
+ sum_rdc.rate = part_search_state->partition_cost[PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+ int idx;
+#if CONFIG_COLLECT_PARTITION_STATS
+ PartitionTimingStats *part_timing_stats =
+ &part_search_state->part_timing_stats;
+ if (best_rdc->rdcost - sum_rdc.rdcost >= 0) {
+ start_partition_block_timer(part_timing_stats, PARTITION_SPLIT);
+ }
+#endif
+ // Recursive partition search on 4 sub-blocks.
+ for (idx = 0; idx < SUB_PARTITIONS_SPLIT && sum_rdc.rdcost < best_rdc->rdcost;
+ ++idx) {
+ const int x_idx = (idx & 1) * blk_params.mi_step;
+ const int y_idx = (idx >> 1) * blk_params.mi_step;
+
+ if (mi_row + y_idx >= mi_params->mi_rows ||
+ mi_col + x_idx >= mi_params->mi_cols)
+ continue;
+
+ pc_tree->split[idx]->index = idx;
+ int64_t *p_split_rd = &part_search_state->split_rd[idx];
+ RD_STATS best_remain_rdcost;
+ av1_rd_stats_subtraction(x->rdmult, best_rdc, &sum_rdc,
+ &best_remain_rdcost);
+
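+ // The implicit quadtree numbering assigns children 4 * k + 1 .. 4 * k + 4
+ // to the node with index k, as reflected in the quad_tree_idx update below.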
+ int curr_quad_tree_idx = 0;
+ if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
+ curr_quad_tree_idx = part_search_state->intra_part_info->quad_tree_idx;
+ part_search_state->intra_part_info->quad_tree_idx =
+ 4 * curr_quad_tree_idx + idx + 1;
+ }
+ // Split partition evaluation at the corresponding idx.
+ // If the RD cost exceeds the best cost, do not evaluate the remaining
+ // split sub-partitions.
+ SIMPLE_MOTION_DATA_TREE *const sms_tree_split =
+ (sms_tree == NULL) ? NULL : sms_tree->split[idx];
+ if (!av1_rd_pick_partition(
+ cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+ &part_search_state->this_rdc, best_remain_rdcost,
+ pc_tree->split[idx], sms_tree_split, p_split_rd, multi_pass_mode,
+ &part_search_state->split_part_rect_win[idx])) {
+ av1_invalid_rd_stats(&sum_rdc);
+ break;
+ }
+ if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) {
+ part_search_state->intra_part_info->quad_tree_idx = curr_quad_tree_idx;
+ }
+
+ sum_rdc.rate += part_search_state->this_rdc.rate;
+ sum_rdc.dist += part_search_state->this_rdc.dist;
+ av1_rd_cost_update(x->rdmult, &sum_rdc);
+
+ // Set split ctx as ready for use.
+ if (idx <= 1 && (bsize <= BLOCK_8X8 ||
+ pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
+ const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none->mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ // Neither palette mode nor cfl predicted.
+ if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+ if (mbmi->uv_mode != UV_CFL_PRED)
+ part_search_state->is_split_ctx_is_ready[idx] = 1;
+ }
+ }
+ }
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (part_timing_stats->timer_is_on) {
+ end_partition_block_timer(part_timing_stats, PARTITION_SPLIT,
+ sum_rdc.rdcost);
+ }
+#endif
+ const int reached_last_index = (idx == SUB_PARTITIONS_SPLIT);
+
+ // Calculate the total cost and update the best partition.
+ *part_split_rd = sum_rdc.rdcost;
+ if (reached_last_index && sum_rdc.rdcost < best_rdc->rdcost) {
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+ if (sum_rdc.rdcost < best_rdc->rdcost) {
+ *best_rdc = sum_rdc;
+ part_search_state->found_best_partition = true;
+ pc_tree->partitioning = PARTITION_SPLIT;
+ }
+ } else if (cpi->sf.part_sf.less_rectangular_check_level > 0) {
+ // Skip rectangular partition test when partition type none gives better
+ // rd than partition type split.
+ if (cpi->sf.part_sf.less_rectangular_check_level == 2 || idx <= 2) {
+ const int partition_none_valid = part_search_state->none_rd > 0;
+ const int partition_none_better =
+ part_search_state->none_rd < sum_rdc.rdcost;
+ part_search_state->do_rectangular_split &=
+ !(partition_none_valid && partition_none_better);
+ }
+ }
+ // Restore the context in the following cases:
+ // 1) The current block size is not larger than the maximum partition size,
+ // since a dry-run encode happens for these blocks.
+ // 2) The current block size equals the superblock size, since the final
+ // encode happens for this block.
+ if (bsize <= x->sb_enc.max_partition_size || bsize == cm->seq_params->sb_size)
+ av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
+}
+
+ // The max number of nodes in the partition tree.
+ // The number of leaf nodes is (128x128) / (4x4) = 1024.
+ // The number of all possible parent nodes is 1 + 2 + ... + 512 = 1023,
+ // so at most 1024 + 1023 = 2047 nodes are needed, within NUM_NODES = 2048.
+#define NUM_NODES 2048
+
+static void write_partition_tree(AV1_COMP *const cpi,
+ const PC_TREE *const pc_tree,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col) {
+ (void)mi_row;
+ (void)mi_col;
+ const char *path = cpi->oxcf.partition_info_path;
+ char filename[256];
+ snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path,
+ cpi->sb_counter, 0);
+ FILE *pfile = fopen(filename, "w");
+ if (pfile == NULL) return;
+ fprintf(pfile, "%d", bsize);
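+ // The file holds a single text line: the bsize, the total node count, the
+ // number of configs (always 1 here), then the partitioning of each node in
+ // BFS order.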
+
+ // Write partition types in BFS order.
+ const PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+ int q_idx = 0;
+ int last_idx = 1;
+ int num_nodes = 1;
+
+ // First traversal to count the total number of tree nodes.
+ tree_node_queue[q_idx] = pc_tree;
+ while (num_nodes > 0) {
+ const PC_TREE *node = tree_node_queue[q_idx];
+ if (node->partitioning == PARTITION_SPLIT) {
+ for (int i = 0; i < 4; ++i) {
+ tree_node_queue[last_idx] = node->split[i];
+ ++last_idx;
+ }
+ num_nodes += 4;
+ }
+ --num_nodes;
+ ++q_idx;
+ }
+ const int num_leafs = last_idx;  // Total node count, despite the name.
+ fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1);
+
+ // Write partitions for each node.
+ q_idx = 0;
+ last_idx = 1;
+ num_nodes = 1;
+ tree_node_queue[q_idx] = pc_tree;
+ while (num_nodes > 0) {
+ const PC_TREE *node = tree_node_queue[q_idx];
+ fprintf(pfile, ",%d", node->partitioning);
+ if (node->partitioning == PARTITION_SPLIT) {
+ for (int i = 0; i < 4; ++i) {
+ tree_node_queue[last_idx] = node->split[i];
+ ++last_idx;
+ }
+ num_nodes += 4;
+ }
+ --num_nodes;
+ ++q_idx;
+ }
+ fprintf(pfile, "\n");
+
+ fclose(pfile);
+}
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+static void verify_write_partition_tree(const AV1_COMP *const cpi,
+ const PC_TREE *const pc_tree,
+ const BLOCK_SIZE bsize,
+ const int config_id, const int mi_row,
+ const int mi_col) {
+ (void)mi_row;
+ (void)mi_col;
+ const char *path = cpi->oxcf.partition_info_path;
+ char filename[256];
+ snprintf(filename, sizeof(filename), "%s/verify_partition_tree_sb%d_c%d",
+ path, cpi->sb_counter, config_id);
+ FILE *pfile = fopen(filename, "w");
+ if (pfile == NULL) return;
+ fprintf(pfile, "%d", bsize);
+
+ // Write partition types in BFS order.
+ const PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+ int q_idx = 0;
+ int last_idx = 1;
+ int num_nodes = 1;
+
+ // First traversal to count the total number of tree nodes.
+ tree_node_queue[q_idx] = pc_tree;
+ while (num_nodes > 0) {
+ const PC_TREE *node = tree_node_queue[q_idx];
+ if (node != NULL && node->partitioning == PARTITION_SPLIT) {
+ for (int i = 0; i < 4; ++i) {
+ tree_node_queue[last_idx] = node->split[i];
+ ++last_idx;
+ }
+ num_nodes += 4;
+ }
+ --num_nodes;
+ ++q_idx;
+ }
+ const int num_leafs = last_idx;  // Total node count, despite the name.
+ fprintf(pfile, ",%d,%d", num_leafs, /*num_configs=*/1);
+
+ // Write partitions for each node.
+ q_idx = 0;
+ last_idx = 1;
+ num_nodes = 1;
+ tree_node_queue[q_idx] = pc_tree;
+ while (num_nodes > 0) {
+ const PC_TREE *node = tree_node_queue[q_idx];
+ if (node != NULL) { // suppress warning
+ fprintf(pfile, ",%d", node->partitioning);
+ if (node->partitioning == PARTITION_SPLIT) {
+ for (int i = 0; i < 4; ++i) {
+ tree_node_queue[last_idx] = node->split[i];
+ ++last_idx;
+ }
+ num_nodes += 4;
+ }
+ }
+ --num_nodes;
+ ++q_idx;
+ }
+ fprintf(pfile, "\n");
+
+ fclose(pfile);
+}
+
+static int read_partition_tree(AV1_COMP *const cpi, PC_TREE *const pc_tree,
+ struct aom_internal_error_info *error_info,
+ const int config_id) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const char *path = cpi->oxcf.partition_info_path;
+ char filename[256];
+ snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path,
+ cpi->sb_counter, config_id);
+ FILE *pfile = fopen(filename, "r");
+ if (pfile == NULL) {
+ aom_internal_error(cm->error, AOM_CODEC_ERROR, "Can't find input file: %s.",
+ filename);
+ }
+
+ int read_bsize;
+ int num_nodes;
+ int num_configs;
+ if (fscanf(pfile, "%d,%d,%d", &read_bsize, &num_nodes, &num_configs) != 3) {
+ fclose(pfile);
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "Failed to parse partition tree file: %s.", filename);
+ }
+ assert(read_bsize == cpi->common.seq_params->sb_size);
+ BLOCK_SIZE bsize = (BLOCK_SIZE)read_bsize;
+ assert(bsize == pc_tree->block_size);
+
+ PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+ int last_idx = 1;
+ int q_idx = 0;
+ tree_node_queue[q_idx] = pc_tree;
+ while (num_nodes > 0) {
+ int partitioning = PARTITION_NONE;
+ if (fscanf(pfile, ",%d", &partitioning) != 1) break;
+ assert(partitioning >= PARTITION_NONE &&
+ partitioning < EXT_PARTITION_TYPES);
+ PC_TREE *node = tree_node_queue[q_idx];
+ if (node != NULL) {
+ node->partitioning = partitioning;
+ bsize = node->block_size;
+ }
+ if (partitioning == PARTITION_SPLIT) {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ for (int i = 0; i < 4; ++i) {
+ if (node != NULL) { // Suppress warning
+ node->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!node->split[i])
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ node->split[i]->index = i;
+ tree_node_queue[last_idx] = node->split[i];
+ ++last_idx;
+ }
+ }
+ }
+ --num_nodes;
+ ++q_idx;
+ }
+ fclose(pfile);
+
+ return num_configs;
+}
+
+static RD_STATS rd_search_for_fixed_partition(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TokenExtra **tp, SIMPLE_MOTION_DATA_TREE *sms_tree, int mi_row, int mi_col,
+ const BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ RD_STATS best_rdc;
+ av1_invalid_rd_stats(&best_rdc);
+ int sum_subblock_rate = 0;
+ int64_t sum_subblock_dist = 0;
+ PartitionSearchState part_search_state;
+ init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col,
+ bsize);
+ // Override partition costs at the edges of the frame in the same
+ // way as in read_partition (see decodeframe.c).
+ PartitionBlkParams blk_params = part_search_state.part_blk_params;
+ if (!av1_blk_has_rows_and_cols(&blk_params))
+ set_partition_cost_for_edge_blk(cm, &part_search_state);
+
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ const int orig_rdmult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+ // Set the context.
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ assert(bsize < BLOCK_SIZES_ALL);
+ unsigned int pb_source_variance = UINT_MAX;
+ int64_t part_none_rd = INT64_MAX;
+ int64_t none_rd = INT64_MAX;
+ int inc_step[NUM_PART4_TYPES] = { 0 };
+ if (partition == PARTITION_HORZ_4) inc_step[HORZ4] = mi_size_high[bsize] / 4;
+ if (partition == PARTITION_VERT_4) inc_step[VERT4] = mi_size_wide[bsize] / 4;
+
+ switch (partition) {
+ case PARTITION_NONE:
+ none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx,
+ &part_search_state, &best_rdc, &pb_source_variance,
+ &none_rd, &part_none_rd);
+ break;
+ case PARTITION_HORZ:
+ rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx,
+ &part_search_state, &best_rdc, NULL, HORZ,
+ HORZ);
+ break;
+ case PARTITION_VERT:
+ rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx,
+ &part_search_state, &best_rdc, NULL, VERT,
+ VERT);
+ break;
+ case PARTITION_HORZ_A:
+ ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ &part_search_state, &best_rdc, NULL,
+ pb_source_variance, 1, HORZ_A, HORZ_A);
+ break;
+ case PARTITION_HORZ_B:
+ ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ &part_search_state, &best_rdc, NULL,
+ pb_source_variance, 1, HORZ_B, HORZ_B);
+ break;
+ case PARTITION_VERT_A:
+ ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ &part_search_state, &best_rdc, NULL,
+ pb_source_variance, 1, VERT_A, VERT_A);
+ break;
+ case PARTITION_VERT_B:
+ ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ &part_search_state, &best_rdc, NULL,
+ pb_source_variance, 1, VERT_B, VERT_B);
+ break;
+ case PARTITION_HORZ_4:
+ rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ pc_tree->horizontal4, &part_search_state, &best_rdc,
+ inc_step, PARTITION_HORZ_4);
+ break;
+ case PARTITION_VERT_4:
+ rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ pc_tree->vertical4, &part_search_state, &best_rdc,
+ inc_step, PARTITION_VERT_4);
+ break;
+ case PARTITION_SPLIT:
+ for (int idx = 0; idx < SUB_PARTITIONS_SPLIT; ++idx) {
+ const BLOCK_SIZE subsize =
+ get_partition_subsize(bsize, PARTITION_SPLIT);
+ assert(subsize < BLOCK_SIZES_ALL);
+ const int next_mi_row =
+ idx < 2 ? mi_row : mi_row + mi_size_high[subsize];
+ const int next_mi_col =
+ idx % 2 == 0 ? mi_col : mi_col + mi_size_wide[subsize];
+ if (next_mi_row >= cm->mi_params.mi_rows ||
+ next_mi_col >= cm->mi_params.mi_cols) {
+ continue;
+ }
+ const RD_STATS subblock_rdc = rd_search_for_fixed_partition(
+ cpi, td, tile_data, tp, sms_tree->split[idx], next_mi_row,
+ next_mi_col, subsize, pc_tree->split[idx]);
+ sum_subblock_rate += subblock_rdc.rate;
+ sum_subblock_dist += subblock_rdc.dist;
+ }
+ best_rdc.rate = sum_subblock_rate;
+ best_rdc.rate += part_search_state.partition_cost[PARTITION_SPLIT];
+ best_rdc.dist = sum_subblock_dist;
+ best_rdc.rdcost = RDCOST(x->rdmult, best_rdc.rate, best_rdc.dist);
+ break;
+ default:
+ assert(0 && "invalid partition type.");
+ aom_internal_error(cm->error, AOM_CODEC_ERROR, "Invalid partition type.");
+ }
+ // Note: it is necessary to restore context information.
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ if (bsize != cm->seq_params->sb_size) {
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ x->rdmult = orig_rdmult;
+
+ return best_rdc;
+}
+
+static void prepare_sb_features_before_search(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row,
+ int mi_col, const BLOCK_SIZE bsize, aom_partition_features_t *features) {
+ av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col,
+ bsize, features);
+ collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, features);
+}
+
+static void update_partition_stats(const RD_STATS *const this_rdcost,
+ aom_partition_stats_t *stats) {
+ stats->rate = this_rdcost->rate;
+ stats->dist = this_rdcost->dist;
+ stats->rdcost = this_rdcost->rdcost;
+}
+
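+// Builds a PC_TREE from the breadth-first list of per-node partition
+// decisions provided by the external model, allocating child nodes for
+// every PARTITION_SPLIT decision.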
+static void build_pc_tree_from_part_decision(
+ const aom_partition_decision_t *partition_decision,
+ const BLOCK_SIZE this_bsize, PC_TREE *pc_tree,
+ struct aom_internal_error_info *error_info) {
+ BLOCK_SIZE bsize = this_bsize;
+ int num_nodes = partition_decision->num_nodes;
+ PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+ int last_idx = 1;
+ int q_idx = 0;
+ tree_node_queue[q_idx] = pc_tree;
+ while (num_nodes > 0) {
+ const int partitioning = partition_decision->partition_decision[q_idx];
+ assert(partitioning >= PARTITION_NONE &&
+ partitioning < EXT_PARTITION_TYPES);
+ PC_TREE *node = tree_node_queue[q_idx];
+ if (node != NULL) {
+ node->partitioning = partitioning;
+ bsize = node->block_size;
+ }
+ if (partitioning == PARTITION_SPLIT) {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ for (int i = 0; i < 4; ++i) {
+ if (node != NULL) { // Suppress warning
+ node->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!node->split[i])
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ node->split[i]->index = i;
+ tree_node_queue[last_idx] = node->split[i];
+ ++last_idx;
+ }
+ }
+ }
+ --num_nodes;
+ ++q_idx;
+ }
+}
+
+// The ML model needs to provide the whole decision tree for the superblock.
+static bool ml_partition_search_whole_tree(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data,
+ TokenExtra **tp,
+ SIMPLE_MOTION_DATA_TREE *sms_root,
+ int mi_row, int mi_col,
+ const BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+ struct aom_internal_error_info *error_info = x->e_mbd.error_info;
+ aom_partition_features_t features;
+ prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
+ &features);
+ features.mi_row = mi_row;
+ features.mi_col = mi_col;
+ features.frame_width = cpi->frame_info.frame_width;
+ features.frame_height = cpi->frame_info.frame_height;
+ features.block_size = bsize;
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // RD mode search (dry run) for a valid partition decision from the ML model.
+ aom_partition_decision_t partition_decision;
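+ // Feedback loop with the external model: fetch a decision, evaluate its
+ // rd cost with a dry-run search, and report the stats back until the
+ // model marks a decision as final.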
+ do {
+ const bool valid_decision = av1_ext_part_get_partition_decision(
+ ext_part_controller, &partition_decision);
+ if (!valid_decision) return false;
+
+ // First, take the easy approach: require the ML model to provide
+ // partition decisions for the whole superblock.
+ td->pc_root = av1_alloc_pc_tree_node(bsize);
+ if (!td->pc_root)
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ build_pc_tree_from_part_decision(&partition_decision, bsize, td->pc_root,
+ error_info);
+
+ const RD_STATS this_rdcost = rd_search_for_fixed_partition(
+ cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root);
+ aom_partition_stats_t stats;
+ update_partition_stats(&this_rdcost, &stats);
+ av1_ext_part_send_partition_stats(ext_part_controller, &stats);
+ if (!partition_decision.is_final_decision) {
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
+ }
+ } while (!partition_decision.is_final_decision);
+
+ // Encode with the selected mode and partition.
+ set_cb_offsets(x->cb_offset, 0, 0);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ td->pc_root, NULL);
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
+
+ return true;
+}
+
+// Use a bitmask to represent the valid partition types for the current
+// block. A "1" bit means the corresponding partition type is valid.
+// The least significant bit represents "PARTITION_NONE", and the most
+// significant bit represents "PARTITION_VERT_4", following the enum
+// order of PARTITION_TYPE in "enums.h".
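+// For example, a mask of 0b0000001111 means that only PARTITION_NONE,
+// PARTITION_HORZ, PARTITION_VERT and PARTITION_SPLIT are valid choices.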
+static int get_valid_partition_types(
+ const AV1_COMP *const cpi,
+ const PartitionSearchState *const part_search_state,
+ const BLOCK_SIZE bsize) {
+ const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+ const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+ int valid_types = 0;
+ // PARTITION_NONE
+ valid_types |= (part_search_state->partition_none_allowed << 0);
+ // PARTITION_HORZ
+ valid_types |= (part_search_state->partition_rect_allowed[HORZ] << 1);
+ // PARTITION_VERT
+ valid_types |= (part_search_state->partition_rect_allowed[VERT] << 2);
+ // PARTITION_SPLIT
+ valid_types |= (part_search_state->do_square_split << 3);
+ // PARTITION_HORZ_A
+ const int ext_partition_allowed = part_search_state->do_rectangular_split &&
+ av1_blk_has_rows_and_cols(&blk_params);
+ const int horzab_partition_allowed =
+ ext_partition_allowed && part_cfg->enable_ab_partitions &&
+ part_search_state->partition_rect_allowed[HORZ];
+ valid_types |= (horzab_partition_allowed << 4);
+ // PARTITION_HORZ_B
+ valid_types |= (horzab_partition_allowed << 5);
+ // PARTITION_VERT_A
+ const int vertab_partition_allowed =
+ ext_partition_allowed && part_cfg->enable_ab_partitions &&
+ part_search_state->partition_rect_allowed[VERT];
+ valid_types |= (vertab_partition_allowed << 6);
+ // PARTITION_VERT_B
+ valid_types |= (vertab_partition_allowed << 7);
+ // PARTITION_HORZ_4
+ const int partition4_allowed = part_cfg->enable_1to4_partitions &&
+ ext_partition_allowed &&
+ bsize != BLOCK_128X128;
+ const int horz4_allowed =
+ partition4_allowed && part_search_state->partition_rect_allowed[HORZ] &&
+ get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ_4),
+ part_search_state->ss_x,
+ part_search_state->ss_y) != BLOCK_INVALID;
+ valid_types |= (horz4_allowed << 8);
+ // PARTITION_VERT_4
+ const int vert4_allowed =
+ partition4_allowed && part_search_state->partition_rect_allowed[VERT] &&
+ get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT_4),
+ part_search_state->ss_x,
+ part_search_state->ss_y) != BLOCK_INVALID;
+ valid_types |= (vert4_allowed << 9);
+
+ return valid_types;
+}
+
+static void prepare_tpl_stats_block(const AV1_COMP *const cpi,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col, int64_t *intra_cost,
+ int64_t *inter_cost, int64_t *mc_dep_cost) {
+ const AV1_COMMON *const cm = &cpi->common;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if (gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
+ return;
+ }
+
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cpi->gf_frame_index];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ // Return early if the TPL stats are not ready.
+ if (!tpl_data->ready || gf_group->max_layer_depth_allowed == 0) {
+ return;
+ }
+
+ const int tpl_stride = tpl_frame->stride;
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+ const int mi_width =
+ AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+ const int mi_height =
+ AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+
+ int64_t sum_intra_cost = 0;
+ int64_t sum_inter_cost = 0;
+ int64_t sum_mc_dep_cost = 0;
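+ // Walk the tpl stats grid covering this block; each entry spans
+ // (1 << tpl_stats_block_mis_log2) mi units in each dimension.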
+ for (int row = 0; row < mi_height; row += step) {
+ for (int col = 0; col < mi_width; col += step) {
+ TplDepStats *this_stats =
+ &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+ tpl_data->tpl_stats_block_mis_log2)];
+ sum_intra_cost += this_stats->intra_cost;
+ sum_inter_cost += this_stats->inter_cost;
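+ // mc_dep_rate/mc_dep_dist capture how much later frames depend on this
+ // block; fold them into a single rd cost using the frame-level rdmult.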
+ const int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ sum_mc_dep_cost += mc_dep_delta;
+ }
+ }
+
+ *intra_cost = sum_intra_cost;
+ *inter_cost = sum_inter_cost;
+ *mc_dep_cost = sum_mc_dep_cost;
+}
+
+static bool recursive_partition(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ SIMPLE_MOTION_DATA_TREE *sms_root,
+ PC_TREE *pc_tree, int mi_row, int mi_col,
+ const BLOCK_SIZE bsize, RD_STATS *this_rdcost) {
+ const AV1_COMMON *const cm = &cpi->common;
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) {
+ return false;
+ }
+ aom_partition_decision_t partition_decision;
+ do {
+ PartitionSearchState part_search_state;
+ // Initialization of state variables used in partition search.
+ // TODO(chengchen): check if there are hidden conditions that don't allow
+ // all possible partition types.
+ init_partition_search_state_params(x, cpi, &part_search_state, mi_row,
+ mi_col, bsize);
+ // Override partition costs at the edges of the frame in the same
+ // way as in read_partition (see decodeframe.c).
+ PartitionBlkParams blk_params = part_search_state.part_blk_params;
+ if (!av1_blk_has_rows_and_cols(&blk_params))
+ set_partition_cost_for_edge_blk(cm, &part_search_state);
+ const int orig_rdmult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+ const int valid_partition_types =
+ get_valid_partition_types(cpi, &part_search_state, bsize);
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ const int qindex = av1_get_qindex(&cm->seg, xd->mi[0]->segment_id,
+ cm->quant_params.base_qindex);
+ // RD multiplier
+ const int rdmult = x->rdmult;
+ // pyramid level
+ const int pyramid_level =
+ cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index];
+ x->rdmult = orig_rdmult;
+ // Neighbor information
+ const int has_above = !!xd->above_mbmi;
+ const int has_left = !!xd->left_mbmi;
+ const BLOCK_SIZE above_bsize =
+ has_above ? xd->above_mbmi->bsize : BLOCK_INVALID;
+ const BLOCK_SIZE left_bsize =
+ has_left ? xd->left_mbmi->bsize : BLOCK_INVALID;
+ const int above_block_width =
+ above_bsize == BLOCK_INVALID ? -1 : block_size_wide[above_bsize];
+ const int above_block_height =
+ above_bsize == BLOCK_INVALID ? -1 : block_size_high[above_bsize];
+ const int left_block_width =
+ left_bsize == BLOCK_INVALID ? -1 : block_size_wide[left_bsize];
+ const int left_block_height =
+ left_bsize == BLOCK_INVALID ? -1 : block_size_high[left_bsize];
+ // Prepare simple motion search stats as features
+ unsigned int block_sse = -1;
+ unsigned int block_var = -1;
+ unsigned int sub_block_sse[4] = { -1, -1, -1, -1 };
+ unsigned int sub_block_var[4] = { -1, -1, -1, -1 };
+ unsigned int horz_block_sse[2] = { -1, -1 };
+ unsigned int horz_block_var[2] = { -1, -1 };
+ unsigned int vert_block_sse[2] = { -1, -1 };
+ unsigned int vert_block_var[2] = { -1, -1 };
+ av1_prepare_motion_search_features_block(
+ cpi, td, tile_data, mi_row, mi_col, bsize, valid_partition_types,
+ &block_sse, &block_var, sub_block_sse, sub_block_var, horz_block_sse,
+ horz_block_var, vert_block_sse, vert_block_var);
+ // Prepare tpl stats for the current block as features
+ int64_t tpl_intra_cost = -1;
+ int64_t tpl_inter_cost = -1;
+ int64_t tpl_mc_dep_cost = -1;
+ prepare_tpl_stats_block(cpi, bsize, mi_row, mi_col, &tpl_intra_cost,
+ &tpl_inter_cost, &tpl_mc_dep_cost);
+
+ aom_partition_features_t features;
+ features.mi_row = mi_row;
+ features.mi_col = mi_col;
+ features.frame_width = cpi->frame_info.frame_width;
+ features.frame_height = cpi->frame_info.frame_height;
+ features.block_size = bsize;
+ features.valid_partition_types = valid_partition_types;
+ features.update_type = update_type;
+ features.qindex = qindex;
+ features.rdmult = rdmult;
+ features.pyramid_level = pyramid_level;
+ features.has_above_block = has_above;
+ features.above_block_width = above_block_width;
+ features.above_block_height = above_block_height;
+ features.has_left_block = has_left;
+ features.left_block_width = left_block_width;
+ features.left_block_height = left_block_height;
+ features.block_sse = block_sse;
+ features.block_var = block_var;
+ for (int i = 0; i < 4; ++i) {
+ features.sub_block_sse[i] = sub_block_sse[i];
+ features.sub_block_var[i] = sub_block_var[i];
+ }
+ for (int i = 0; i < 2; ++i) {
+ features.horz_block_sse[i] = horz_block_sse[i];
+ features.horz_block_var[i] = horz_block_var[i];
+ features.vert_block_sse[i] = vert_block_sse[i];
+ features.vert_block_var[i] = vert_block_var[i];
+ }
+ features.tpl_intra_cost = tpl_intra_cost;
+ features.tpl_inter_cost = tpl_inter_cost;
+ features.tpl_mc_dep_cost = tpl_mc_dep_cost;
+ av1_ext_part_send_features(ext_part_controller, &features);
+ const bool valid_decision = av1_ext_part_get_partition_decision(
+ ext_part_controller, &partition_decision);
+ if (!valid_decision) return false;
+ pc_tree->partitioning = partition_decision.current_decision;
+
+ av1_init_rd_stats(this_rdcost);
+ if (partition_decision.current_decision == PARTITION_SPLIT) {
+ assert(block_size_wide[bsize] >= 8 && block_size_high[bsize] >= 8);
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ RD_STATS split_rdc[SUB_PARTITIONS_SPLIT];
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ av1_init_rd_stats(&split_rdc[i]);
+ if (pc_tree->split[i] == NULL)
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ pc_tree->split[i]->index = i;
+ }
+ const int orig_rdmult_tmp = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+ // TODO(chengchen): check boundary conditions
+ // top-left
+ recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[0],
+ mi_row, mi_col, subsize, &split_rdc[0]);
+ // top-right
+ recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[1],
+ mi_row, mi_col + mi_size_wide[subsize], subsize,
+ &split_rdc[1]);
+ // bottom-left
+ recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[2],
+ mi_row + mi_size_high[subsize], mi_col, subsize,
+ &split_rdc[2]);
+ // bottom-right
+ recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[3],
+ mi_row + mi_size_high[subsize],
+ mi_col + mi_size_wide[subsize], subsize,
+ &split_rdc[3]);
+ this_rdcost->rate += part_search_state.partition_cost[PARTITION_SPLIT];
+ // Note: x->rdmult here may differ from the rdmult used inside each
+ // sub-block search, so the accumulated rd cost can be slightly skewed.
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ this_rdcost->rate += split_rdc[i].rate;
+ this_rdcost->dist += split_rdc[i].dist;
+ av1_rd_cost_update(x->rdmult, this_rdcost);
+ }
+ x->rdmult = orig_rdmult_tmp;
+ } else {
+ *this_rdcost = rd_search_for_fixed_partition(
+ cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, pc_tree);
+ }
+
+ aom_partition_stats_t stats;
+ update_partition_stats(this_rdcost, &stats);
+ av1_ext_part_send_partition_stats(ext_part_controller, &stats);
+ if (!partition_decision.is_final_decision) {
+ if (partition_decision.current_decision == PARTITION_SPLIT) {
+ for (int i = 0; i < 4; ++i) {
+ if (pc_tree->split[i] != NULL) {
+ av1_free_pc_tree_recursive(pc_tree->split[i], av1_num_planes(cm), 0,
+ 0,
+ cpi->sf.part_sf.partition_search_type);
+ pc_tree->split[i] = NULL;
+ }
+ }
+ }
+ }
+ } while (!partition_decision.is_final_decision);
+
+ return true;
+}
+
+// The ML model only needs to make decisions for the current block each time.
+static bool ml_partition_search_partial(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ SIMPLE_MOTION_DATA_TREE *sms_root,
+ int mi_row, int mi_col,
+ const BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+ aom_partition_features_t features;
+ prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
+ &features);
+ features.mi_row = mi_row;
+ features.mi_col = mi_col;
+ features.frame_width = cpi->frame_info.frame_width;
+ features.frame_height = cpi->frame_info.frame_height;
+ features.block_size = bsize;
+ av1_ext_part_send_features(ext_part_controller, &features);
+ td->pc_root = av1_alloc_pc_tree_node(bsize);
+ if (!td->pc_root)
+ aom_internal_error(x->e_mbd.error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+
+ RD_STATS rdcost;
+ const bool valid_partition =
+ recursive_partition(cpi, td, tile_data, tp, sms_root, td->pc_root, mi_row,
+ mi_col, bsize, &rdcost);
+ if (!valid_partition) {
+ return false;
+ }
+
+ // Encode with the selected mode and partition.
+ set_cb_offsets(x->cb_offset, 0, 0);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ td->pc_root, NULL);
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
+
+ return true;
+}
+
+bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row,
+ int mi_col, const BLOCK_SIZE bsize,
+ RD_STATS *best_rd_cost) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (cpi->ext_part_controller.ready) {
+ bool valid_search = true;
+ const aom_ext_part_decision_mode_t decision_mode =
+ av1_get_ext_part_decision_mode(&cpi->ext_part_controller);
+ if (decision_mode == AOM_EXT_PART_WHOLE_TREE) {
+ valid_search = ml_partition_search_whole_tree(
+ cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize);
+ } else if (decision_mode == AOM_EXT_PART_RECURSIVE) {
+ valid_search = ml_partition_search_partial(
+ cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize);
+ } else {
+ assert(0 && "Unknown decision mode.");
+ return false;
+ }
+ if (!valid_search) {
+ aom_internal_error(
+ cm->error, AOM_CODEC_ERROR,
+ "Invalid search from ML model, partition search failed");
+ }
+ return true;
+ }
+
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int best_idx = 0;
+ int64_t min_rdcost = INT64_MAX;
+ int num_configs;
+ int i = 0;
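+ // Try each partition configuration read from file in turn, tracking the
+ // one with the smallest rd cost; num_configs is known after the first
+ // read_partition_tree() call.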
+ do {
+ td->pc_root = av1_alloc_pc_tree_node(bsize);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ num_configs = read_partition_tree(cpi, td->pc_root, xd->error_info, i);
+ if (num_configs <= 0) {
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
+ aom_internal_error(xd->error_info, AOM_CODEC_ERROR, "Invalid configs.");
+ }
+ verify_write_partition_tree(cpi, td->pc_root, bsize, i, mi_row, mi_col);
+ if (i == 0) {
+ AOM_CHECK_MEM_ERROR(xd->error_info, x->rdcost,
+ aom_calloc(num_configs, sizeof(*x->rdcost)));
+ }
+ // Encode the block with the given partition tree. Get rdcost and encoding
+ // time.
+ x->rdcost[i] = rd_search_for_fixed_partition(
+ cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, td->pc_root);
+
+ if (x->rdcost[i].rdcost < min_rdcost) {
+ min_rdcost = x->rdcost[i].rdcost;
+ best_idx = i;
+ *best_rd_cost = x->rdcost[i];
+ }
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
+ ++i;
+ } while (i < num_configs);
+
+ aom_free(x->rdcost);
+ x->rdcost = NULL;
+ // Encode with the partition configuration with the smallest rdcost.
+ td->pc_root = av1_alloc_pc_tree_node(bsize);
+ if (!td->pc_root)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ read_partition_tree(cpi, td->pc_root, xd->error_info, best_idx);
+ rd_search_for_fixed_partition(cpi, td, tile_data, tp, sms_root, mi_row,
+ mi_col, bsize, td->pc_root);
+ set_cb_offsets(x->cb_offset, 0, 0);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ td->pc_root, NULL);
+ av1_free_pc_tree_recursive(td->pc_root, av1_num_planes(cm), 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ td->pc_root = NULL;
+ ++cpi->sb_counter;
+
+ return true;
+}
+#endif // CONFIG_PARTITION_SEARCH_ORDER
+
+static AOM_INLINE bool should_do_dry_run_encode_for_current_block(
+ BLOCK_SIZE sb_size, BLOCK_SIZE max_partition_size, int curr_block_index,
+ BLOCK_SIZE bsize) {
+ if (bsize > max_partition_size) return false;
+
+ // Enable the dry-run reconstruction of the 4th sub-block only if its
+ // parent block's dry-run reconstruction is skipped. If max_partition_size
+ // equals the immediate split of the superblock, avoid reconstructing the
+ // 4th sub-block, as this data is not consumed.
+ if (curr_block_index != 3) return true;
+
+ const BLOCK_SIZE sub_sb_size =
+ get_partition_subsize(sb_size, PARTITION_SPLIT);
+ return bsize == max_partition_size && sub_sb_size != max_partition_size;
+}
+
+static void log_sub_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+ double *var_min, double *var_max) {
+ // This function returns the minimum and maximum log variances of the 4x4
+ // sub-blocks in the current block.
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int is_hbd = is_cur_buf_hbd(xd);
+ const int right_overflow =
+ (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+ const int bottom_overflow =
+ (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+ const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+ const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+
+ // Initialize minimum variance to a large value and maximum variance to 0.
+ double min_var_4x4 = (double)INT_MAX;
+ double max_var_4x4 = 0.0;
+
+ for (int i = 0; i < bh; i += MI_SIZE) {
+ for (int j = 0; j < bw; j += MI_SIZE) {
+ // Calculate the 4x4 sub-block variance.
+ const int var = av1_calc_normalized_variance(
+ cpi->ppi->fn_ptr[BLOCK_4X4].vf,
+ x->plane[0].src.buf + (i * x->plane[0].src.stride) + j,
+ x->plane[0].src.stride, is_hbd);
+
+ // Record min and max for over-arching block
+ min_var_4x4 = AOMMIN(min_var_4x4, var);
+ max_var_4x4 = AOMMAX(max_var_4x4, var);
+ }
+ }
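+ // Divide by 16 (the pixel count of a 4x4 block), presumably to get a
+ // per-pixel variance, and apply log1p() so zero variance maps to zero.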
+ *var_min = log1p(min_var_4x4 / 16.0);
+ *var_max = log1p(max_var_4x4 / 16.0);
+}
+
+static AOM_INLINE void set_sms_tree_partitioning(
+ SIMPLE_MOTION_DATA_TREE *sms_tree, PARTITION_TYPE partition) {
+ if (sms_tree == NULL) return;
+ sms_tree->partitioning = partition;
+}
+
+/*!\brief AV1 block partition search (full search).
+*
+* \ingroup partition_search
+* \callgraph
+* Searches for the best partition pattern for a block based on the
+* rate-distortion cost, and returns a bool value to indicate whether a valid
+* partition pattern is found. The partition can recursively go down to the
+* smallest block size.
+*
+* \param[in] cpi Top-level encoder structure
+* \param[in] td Pointer to thread data
+* \param[in] tile_data Pointer to struct holding adaptive
+* data/contexts/models for the tile during encoding
+* \param[in] tp Pointer to the starting token
+* \param[in] mi_row Row coordinate of the block in a step size of MI_SIZE
+* \param[in] mi_col Column coordinate of the block in a step size of MI_SIZE
+* \param[in] bsize Current block size
+* \param[in] rd_cost Pointer to the final rd cost of the block
+* \param[in] best_rdc Upper bound of rd cost of a valid partition
+* \param[in] pc_tree Pointer to the PC_TREE node storing the picked
+* partitions and mode info for the current block
+* \param[in] sms_tree Pointer to struct holding simple motion search data
+* for the current block
+* \param[in] none_rd Pointer to the rd cost in the case of not splitting
+* the current block
+* \param[in] multi_pass_mode SB_SINGLE_PASS/SB_DRY_PASS/SB_WET_PASS
+* \param[in] rect_part_win_info Pointer to struct storing whether horz/vert
+* partition outperforms previously tested partitions
+*
+* \return A bool value is returned indicating if a valid partition is found.
+* The pc_tree struct is modified to store the picked partition and modes.
+* The rd_cost struct is also updated with the RD stats corresponding to the
+* best partition found.
+*/
+bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost,
+ RD_STATS best_rdc, PC_TREE *pc_tree,
+ SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd,
+ SB_MULTI_PASS_MODE multi_pass_mode,
+ RD_RECT_PART_WIN_INFO *rect_part_win_info) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ const TokenExtra *const tp_orig = *tp;
+ PartitionSearchState part_search_state;
+
+ // Initialization of state variables used in partition search.
+ init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col,
+ bsize);
+ PartitionBlkParams blk_params = part_search_state.part_blk_params;
+
+ set_sms_tree_partitioning(sms_tree, PARTITION_NONE);
+ if (best_rdc.rdcost < 0) {
+ av1_invalid_rd_stats(rd_cost);
+ return part_search_state.found_best_partition;
+ }
+ if (bsize == cm->seq_params->sb_size) x->must_find_valid_partition = 0;
+
+ // Clear the optional none_rd output before the search begins.
+ if (none_rd) *none_rd = 0;
+ (void)*tp_orig;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ // Stats at the current quad tree
+ PartitionTimingStats *part_timing_stats =
+ &part_search_state.part_timing_stats;
+ // Stats aggregated at frame level
+ FramePartitionTimingStats *fr_part_timing_stats = &cpi->partition_stats;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+ // Override partition costs at the edges of the frame in the same
+ // way as in read_partition (see decodeframe.c).
+ if (!av1_blk_has_rows_and_cols(&blk_params))
+ set_partition_cost_for_edge_blk(cm, &part_search_state);
+
+ // Disable rectangular partitions for inner blocks when the current block is
+ // forced to only use square partitions.
+ if (bsize > cpi->sf.part_sf.use_square_partition_only_threshold) {
+ part_search_state.partition_rect_allowed[HORZ] &= !blk_params.has_rows;
+ part_search_state.partition_rect_allowed[VERT] &= !blk_params.has_cols;
+ }
+
+#ifndef NDEBUG
+ // Nothing should rely on the default value of this array (which is just
+ // leftover from encoding the previous block). Set it to a fixed pattern
+ // when debugging.
+ // bit 0, 1, 2 are blk_skip of each plane
+ // bit 4, 5, 6 are initialization checking of each plane
+ memset(x->txfm_search_info.blk_skip, 0x77,
+ sizeof(x->txfm_search_info.blk_skip));
+#endif // NDEBUG
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+ // Set buffers and offsets.
+ av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+ if (cpi->oxcf.mode == ALLINTRA) {
+ if (bsize == cm->seq_params->sb_size) {
+ double var_min, var_max;
+ log_sub_block_var(cpi, x, bsize, &var_min, &var_max);
+
+ x->intra_sb_rdmult_modifier = 128;
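+ // 128 acts as the neutral scale here (presumably applied as a 7-bit
+ // fractional multiplier); it is reduced below for superblocks that mix
+ // very low and very high variance content.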
+ if ((var_min < 2.0) && (var_max > 4.0)) {
+ if ((var_max - var_min) > 8.0) {
+ x->intra_sb_rdmult_modifier -= 48;
+ } else {
+ x->intra_sb_rdmult_modifier -= (int)((var_max - var_min) * 6);
+ }
+ }
+ }
+ }
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ const int orig_rdmult = x->rdmult;
+ setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+ // Apply simple motion search for the entire super block with fixed block
+ // size, e.g., 16x16, to collect features and write to files for the
+ // external ML model.
+ // TODO(chengchen): reduce motion search. This function is similar to
+ // av1_get_max_min_partition_features().
+ if (COLLECT_MOTION_SEARCH_FEATURE_SB && !frame_is_intra_only(cm) &&
+ bsize == cm->seq_params->sb_size) {
+ av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col,
+ bsize, /*features=*/NULL);
+ collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, /*features=*/NULL);
+ }
+
+ // Update rd cost of the bound using the current multiplier.
+ av1_rd_cost_update(x->rdmult, &best_rdc);
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
+
+ // Set the context.
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_prune_partitions_time);
+#endif
+ // Pruning: before searching any partition type, using source and simple
+ // motion search results to prune out unlikely partitions.
+ av1_prune_partitions_before_search(cpi, x, sms_tree, &part_search_state);
+
+ // Pruning: eliminating partition types leading to coding block sizes outside
+ // the min and max bsize limitations set from the encoder.
+ av1_prune_partitions_by_max_min_bsize(&x->sb_enc, &part_search_state);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_prune_partitions_time);
+#endif
+
+ // Partition search
+BEGIN_PARTITION_SEARCH:
+ // If a valid partition is required, usually when the first round cannot find
+ // a valid one under the cost limit after pruning, reset the limitations on
+ // partition types and intra cnn output.
+ if (x->must_find_valid_partition) {
+ reset_part_limitations(cpi, &part_search_state);
+ av1_prune_partitions_by_max_min_bsize(&x->sb_enc, &part_search_state);
+ // Invalidate intra cnn output for key frames.
+ if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) {
+ part_search_state.intra_part_info->quad_tree_idx = 0;
+ part_search_state.intra_part_info->cnn_output_valid = 0;
+ }
+ }
+ // Partition block source pixel variance.
+ unsigned int pb_source_variance = UINT_MAX;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, none_partition_search_time);
+#endif
+
+ if (cpi->oxcf.mode == ALLINTRA) {
+ const bool bsize_at_least_16x16 = (bsize >= BLOCK_16X16);
+ const bool prune_rect_part_using_4x4_var_deviation =
+ (cpi->sf.part_sf.prune_rect_part_using_4x4_var_deviation &&
+ !x->must_find_valid_partition);
+
+ if (bsize_at_least_16x16 || prune_rect_part_using_4x4_var_deviation) {
+ double var_min, var_max;
+ log_sub_block_var(cpi, x, bsize, &var_min, &var_max);
+
+ // Further pruning, or in some cases reverse pruning, when allintra is set.
+ // This code helps visual quality, and in some cases metric quality, where
+ // the current block comprises at least one very low variance sub-block and
+ // at least one where the variance is much higher.
+ //
+ // The idea is that in such cases there is a danger of ringing and other
+ // visual artifacts bleeding from a high variance feature, such as an edge,
+ // into a very low variance region.
+ //
+ // The approach taken is to force a split to a smaller block size to try
+ // and separate out the low variance, well predicted blocks from the more
+ // complex ones, and to prevent ringing from propagating over a large
+ // region.
+ if (bsize_at_least_16x16 && (var_min < 0.272) &&
+ ((var_max - var_min) > 3.0)) {
+ part_search_state.partition_none_allowed = 0;
+ part_search_state.terminate_partition_search = 0;
+ part_search_state.do_square_split = 1;
+ } else if (prune_rect_part_using_4x4_var_deviation &&
+ (var_max - var_min < 3.0)) {
+ // Prune rectangular partitions if the variance deviation of 4x4
+ // sub-blocks within the block is less than a threshold (derived
+ // empirically).
+ part_search_state.do_rectangular_split = 0;
+ }
+ }
+ }
+
+ // PARTITION_NONE search stage.
+ int64_t part_none_rd = INT64_MAX;
+ none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx,
+ &part_search_state, &best_rdc, &pb_source_variance,
+ none_rd, &part_none_rd);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, none_partition_search_time);
+#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, split_partition_search_time);
+#endif
+ // PARTITION_SPLIT search stage.
+ int64_t part_split_rd = INT64_MAX;
+ split_partition_search(cpi, td, tile_data, tp, x, pc_tree, sms_tree, &x_ctx,
+ &part_search_state, &best_rdc, multi_pass_mode,
+ &part_split_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, split_partition_search_time);
+#endif
+ // Terminate the partition search for child partitions when both the NONE
+ // and SPLIT partition rd costs are INT64_MAX.
+ if (cpi->sf.part_sf.early_term_after_none_split &&
+ part_none_rd == INT64_MAX && part_split_rd == INT64_MAX &&
+ !x->must_find_valid_partition && (bsize != cm->seq_params->sb_size)) {
+ part_search_state.terminate_partition_search = 1;
+ }
+
+ // Do not evaluate non-square partitions if NONE partition did not choose a
+ // newmv mode and is skippable.
+ if ((cpi->sf.part_sf.skip_non_sq_part_based_on_none >= 2) &&
+ (pc_tree->none != NULL)) {
+ if (x->qindex <= 200 && is_inter_mode(pc_tree->none->mic.mode) &&
+ !have_newmv_in_inter_mode(pc_tree->none->mic.mode) &&
+ pc_tree->none->skippable && !x->must_find_valid_partition &&
+ bsize >= BLOCK_16X16)
+ part_search_state.do_rectangular_split = 0;
+ }
+
+ // Prune partitions based on PARTITION_NONE and PARTITION_SPLIT.
+ prune_partitions_after_split(cpi, x, sms_tree, &part_search_state, &best_rdc,
+ part_none_rd, part_split_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rectangular_partition_search_time);
+#endif
+ // Rectangular partitions search stage.
+ rectangular_partition_search(cpi, td, tile_data, tp, x, pc_tree, &x_ctx,
+ &part_search_state, &best_rdc,
+ rect_part_win_info, HORZ, VERT);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rectangular_partition_search_time);
+#endif
+
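+ // If none of the searches above computed the source variance, compute it
+ // now; the AB and 4-way partition stages below consume it.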
+ if (pb_source_variance == UINT_MAX) {
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+ pb_source_variance = av1_get_perpixel_variance_facade(
+ cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
+ }
+
+ assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+ !part_search_state.do_rectangular_split));
+
+ const int prune_ext_part_state = prune_ext_part_none_skippable(
+ pc_tree->none, x->must_find_valid_partition,
+ cpi->sf.part_sf.skip_non_sq_part_based_on_none, bsize);
+
+ const int ab_partition_allowed = allow_ab_partition_search(
+ &part_search_state, &cpi->sf.part_sf, pc_tree->partitioning,
+ x->must_find_valid_partition, prune_ext_part_state, best_rdc.rdcost);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, ab_partitions_search_time);
+#endif
+ // AB partitions search stage.
+ ab_partitions_search(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ &part_search_state, &best_rdc, rect_part_win_info,
+ pb_source_variance, ab_partition_allowed, HORZ_A,
+ VERT_B);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, ab_partitions_search_time);
+#endif
+
+ // 4-way partitions search stage.
+ int part4_search_allowed[NUM_PART4_TYPES] = { 1, 1 };
+ // Prune 4-way partition search.
+ prune_4_way_partition_search(cpi, x, pc_tree, &part_search_state, &best_rdc,
+ pb_source_variance, prune_ext_part_state,
+ part4_search_allowed);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_pick_4partition_time);
+#endif
+ // PARTITION_HORZ_4
+ assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+ !part4_search_allowed[HORZ4]));
+ if (!part_search_state.terminate_partition_search &&
+ part4_search_allowed[HORZ4]) {
+ const int inc_step[NUM_PART4_TYPES] = { mi_size_high[blk_params.bsize] / 4,
+ 0 };
+ // Evaluation of Horz4 partition type.
+ rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ pc_tree->horizontal4, &part_search_state, &best_rdc,
+ inc_step, PARTITION_HORZ_4);
+ }
+
+ // PARTITION_VERT_4
+ assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
+ !part4_search_allowed[VERT4]));
+ if (!part_search_state.terminate_partition_search &&
+ part4_search_allowed[VERT4] && blk_params.has_cols) {
+ const int inc_step[NUM_PART4_TYPES] = { 0, mi_size_wide[blk_params.bsize] /
+ 4 };
+ // Evaluation of Vert4 partition type.
+ rd_pick_4partition(cpi, td, tile_data, tp, x, &x_ctx, pc_tree,
+ pc_tree->vertical4, &part_search_state, &best_rdc,
+ inc_step, PARTITION_VERT_4);
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_pick_4partition_time);
+#endif
+
+ if (bsize == cm->seq_params->sb_size &&
+ !part_search_state.found_best_partition) {
+ // Did not find a valid partition, go back and search again, with less
+ // constraint on which partition types to search.
+ x->must_find_valid_partition = 1;
+#if CONFIG_COLLECT_PARTITION_STATS
+ fr_part_timing_stats->partition_redo += 1;
+#endif // CONFIG_COLLECT_PARTITION_STATS
+ goto BEGIN_PARTITION_SEARCH;
+ }
+
+ // Store the final rd cost
+ *rd_cost = best_rdc;
+
+ // Also record the best partition in simple motion data tree because it is
+ // necessary for the related speed features.
+ set_sms_tree_partitioning(sms_tree, pc_tree->partitioning);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) {
+ part_timing_stats->partition_decisions[pc_tree->partitioning] += 1;
+ }
+
+ // If CONFIG_COLLECT_PARTITION_STATS is 1, then print out the stats for each
+ // prediction block.
+ print_partition_timing_stats_with_rdcost(
+ part_timing_stats, mi_row, mi_col, bsize,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index],
+ cm->current_frame.frame_number, &best_rdc, "part_timing.csv");
+ const bool print_timing_stats = false;
+ if (print_timing_stats) {
+ print_partition_timing_stats(part_timing_stats, cm->show_frame,
+ frame_is_intra_only(cm), bsize,
+ "part_timing_data.csv");
+ }
+ // If CONFIG_COLLECT_PARTITION_STATS is 2, then we print out the stats for
+ // the whole clip. So we need to pass the information upstream to the encoder.
+ accumulate_partition_timing_stats(fr_part_timing_stats, part_timing_stats,
+ bsize);
+#endif // CONFIG_COLLECT_PARTITION_STATS
+
+ // Reset the PC_TREE deallocation flag.
+ int pc_tree_dealloc = 0;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_sb_time);
+#endif
+ if (part_search_state.found_best_partition) {
+ if (bsize == cm->seq_params->sb_size) {
+ // Encode the superblock.
+ const int emit_output = multi_pass_mode != SB_DRY_PASS;
+ const RUN_TYPE run_type = emit_output ? OUTPUT_ENABLED : DRY_RUN_NORMAL;
+
+ // Write partition tree to file. Not used by default.
+ if (COLLECT_MOTION_SEARCH_FEATURE_SB) {
+ write_partition_tree(cpi, pc_tree, bsize, mi_row, mi_col);
+ ++cpi->sb_counter;
+ }
+
+ set_cb_offsets(x->cb_offset, 0, 0);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize,
+ pc_tree, NULL);
+ assert(pc_tree == td->pc_root);
+ // Dealloc the whole PC_TREE after a superblock is done.
+ av1_free_pc_tree_recursive(pc_tree, num_planes, 0, 0,
+ cpi->sf.part_sf.partition_search_type);
+ pc_tree = NULL;
+ td->pc_root = NULL;
+ pc_tree_dealloc = 1;
+ } else if (should_do_dry_run_encode_for_current_block(
+ cm->seq_params->sb_size, x->sb_enc.max_partition_size,
+ pc_tree->index, bsize)) {
+ // Encode the smaller blocks in DRY_RUN mode.
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_sb_time);
+#endif
+
+ // If the tree still exists (non-superblock), deallocate most nodes, keeping
+ // only the nodes for the best partition and PARTITION_NONE.
+ if (pc_tree_dealloc == 0)
+ av1_free_pc_tree_recursive(pc_tree, num_planes, 1, 1,
+ cpi->sf.part_sf.partition_search_type);
+
+ if (bsize == cm->seq_params->sb_size) {
+ assert(best_rdc.rate < INT_MAX);
+ assert(best_rdc.dist < INT64_MAX);
+ } else {
+ assert(tp_orig == *tp);
+ }
+
+ // Restore the rd multiplier.
+ x->rdmult = orig_rdmult;
+ return part_search_state.found_best_partition;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef COLLECT_MOTION_SEARCH_FEATURE_SB
+
+#if CONFIG_RT_ML_PARTITIONING
+#define FEATURES 6
+#define LABELS 2
+static int ml_predict_var_partitioning(AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const NN_CONFIG *nn_config = NULL;
+ const float *means = NULL;
+ const float *vars = NULL;
+ switch (bsize) {
+ case BLOCK_64X64:
+ nn_config = &av1_var_part_nnconfig_64;
+ means = av1_var_part_means_64;
+ vars = av1_var_part_vars_64;
+ break;
+ case BLOCK_32X32:
+ nn_config = &av1_var_part_nnconfig_32;
+ means = av1_var_part_means_32;
+ vars = av1_var_part_vars_32;
+ break;
+ case BLOCK_16X16:
+ nn_config = &av1_var_part_nnconfig_16;
+ means = av1_var_part_means_16;
+ vars = av1_var_part_vars_16;
+ break;
+ case BLOCK_8X8:
+ default: assert(0 && "Unexpected block size."); return -1;
+ }
+
+ if (!nn_config) return -1;
+
+ {
+ const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f;
+ float features[FEATURES] = { 0.0f };
+ const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
+ cm->seq_params->bit_depth);
+ int feature_idx = 0;
+ float score[LABELS];
+
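+ // Each feature is standardized (z-score) with precomputed means and
+ // variances before being fed to the neural net.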
+ features[feature_idx] =
+ (log1pf((float)(dc_q * dc_q) / 256.0f) - means[feature_idx]) /
+ sqrtf(vars[feature_idx]);
+ feature_idx++;
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, 1, bsize);
+ {
+ const int bs = block_size_wide[bsize];
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ const int sb_offset_row = 4 * (mi_row & 15);
+ const int sb_offset_col = 4 * (mi_col & 15);
+ const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col;
+ const uint8_t *src = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ const int pred_stride = 64;
+ unsigned int sse;
+ int i;
+ // Variance of whole block.
+ const unsigned int var =
+ cpi->ppi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+ const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+
+ features[feature_idx] =
+ (log1pf((float)var) - means[feature_idx]) / sqrtf(vars[feature_idx]);
+ feature_idx++;
+ for (i = 0; i < 4; ++i) {
+ const int x_idx = (i & 1) * bs / 2;
+ const int y_idx = (i >> 1) * bs / 2;
+ const int src_offset = y_idx * src_stride + x_idx;
+ const int pred_offset = y_idx * pred_stride + x_idx;
+ // Variance of quarter block.
+ const unsigned int sub_var =
+ cpi->ppi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+ pred + pred_offset, pred_stride, &sse);
+ const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+ features[feature_idx] =
+ (var_ratio - means[feature_idx]) / sqrtf(vars[feature_idx]);
+ feature_idx++;
+ }
+ }
+ // for (int i = 0; i<FEATURES; i++)
+ // printf("F_%d, %f; ", i, features[i]);
+ assert(feature_idx == FEATURES);
+ av1_nn_predict(features, nn_config, 1, score);
+ // printf("Score %f, thr %f ", (float)score[0], thresh);
+ if (score[0] > thresh) return PARTITION_SPLIT;
+ if (score[0] < -thresh) return PARTITION_NONE;
+ return -1;
+ }
+}
+#undef FEATURES
+#undef LABELS
+
+// Uncomment for collecting data for ML-based partitioning
+// #define _COLLECT_GROUND_TRUTH_
+
+#ifdef _COLLECT_GROUND_TRUTH_
+static int store_partition_data(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, PARTITION_TYPE part) {
+ AV1_COMMON *const cm = &cpi->common;
+ char fname[128];
+ switch (bsize) {
+ case BLOCK_64X64: sprintf(fname, "data_64x64.txt"); break;
+ case BLOCK_32X32: sprintf(fname, "data_32x32.txt"); break;
+ case BLOCK_16X16: sprintf(fname, "data_16x16.txt"); break;
+ case BLOCK_8X8: sprintf(fname, "data_8x8.txt"); break;
+ default: assert(0 && "Unexpected block size."); return -1;
+ }
+
+ float features[6]; // DC_Q, VAR, VAR_RATIO-0..3
+
+ FILE *f = fopen(fname, "a");
+
+ {
+ const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
+ cm->seq_params->bit_depth);
+ int feature_idx = 0;
+
+ features[feature_idx++] = log1pf((float)(dc_q * dc_q) / 256.0f);
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, 1, bsize);
+ {
+ const int bs = block_size_wide[bsize];
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ const int sb_offset_row = 4 * (mi_row & 15);
+ const int sb_offset_col = 4 * (mi_col & 15);
+ const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col;
+ const uint8_t *src = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ const int pred_stride = 64;
+ unsigned int sse;
+ int i;
+ // Variance of whole block.
+ /*
+ if (bs == 8)
+ {
+ int r, c;
+ printf("%d %d\n", mi_row, mi_col);
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ printf("%3d ",
+ src[r * src_stride + c] - pred[64 * r + c]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+ }
+ */
+ const unsigned int var =
+ cpi->ppi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+ const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+
+ features[feature_idx++] = log1pf((float)var);
+
+ fprintf(f, "%f,%f,", features[0], features[1]);
+ for (i = 0; i < 4; ++i) {
+ const int x_idx = (i & 1) * bs / 2;
+ const int y_idx = (i >> 1) * bs / 2;
+ const int src_offset = y_idx * src_stride + x_idx;
+ const int pred_offset = y_idx * pred_stride + x_idx;
+ // Variance of quarter block.
+ const unsigned int sub_var =
+ cpi->ppi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+ pred + pred_offset, pred_stride, &sse);
+ const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+ features[feature_idx++] = var_ratio;
+ fprintf(f, "%f,", var_ratio);
+ }
+
+ fprintf(f, "%d\n", part == PARTITION_NONE ? 0 : 1);
+ }
+
+ fclose(f);
+ return -1;
+ }
+}
+#endif
+
+static void duplicate_mode_info_in_sb(AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const int block_width =
+ AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+ const int block_height =
+ AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+ const int mi_stride = xd->mi_stride;
+ MB_MODE_INFO *const src_mi = xd->mi[0];
+ int i, j;
+
+ for (j = 0; j < block_height; ++j)
+ for (i = 0; i < block_width; ++i) xd->mi[j * mi_stride + i] = src_mi;
+}
+
+static INLINE void copy_mbmi_ext_frame_to_mbmi_ext(
+ MB_MODE_INFO_EXT *const mbmi_ext,
+ const MB_MODE_INFO_EXT_FRAME *mbmi_ext_best, uint8_t ref_frame_type) {
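+ // Note: the index inside sizeof() below is never evaluated; it only names
+ // one row of the 2D array so that sizeof() yields the full row size.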
+ memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack,
+ sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE]));
+ memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight,
+ sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE]));
+ mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context;
+ mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count;
+ memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs,
+ sizeof(mbmi_ext->global_mvs));
+}
+
+static void fill_mode_info_sb(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int hbs = mi_size_wide[bsize] >> 1;
+ PARTITION_TYPE partition = pc_tree->partitioning;
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+ assert(bsize >= BLOCK_8X8);
+
+ if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
+ return;
+
+ switch (partition) {
+ case PARTITION_NONE:
+ set_mode_info_offsets(&cm->mi_params, &cpi->mbmi_ext_info, x, xd, mi_row,
+ mi_col);
+ *(xd->mi[0]) = pc_tree->none->mic;
+ copy_mbmi_ext_frame_to_mbmi_ext(
+ &x->mbmi_ext, &pc_tree->none->mbmi_ext_best, LAST_FRAME);
+ duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+ break;
+ case PARTITION_SPLIT: {
+ fill_mode_info_sb(cpi, x, mi_row, mi_col, subsize, pc_tree->split[0]);
+ fill_mode_info_sb(cpi, x, mi_row, mi_col + hbs, subsize,
+ pc_tree->split[1]);
+ fill_mode_info_sb(cpi, x, mi_row + hbs, mi_col, subsize,
+ pc_tree->split[2]);
+ fill_mode_info_sb(cpi, x, mi_row + hbs, mi_col + hbs, subsize,
+ pc_tree->split[3]);
+ break;
+ }
+ default: break;
+ }
+}
+
+void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RD_STATS *rd_cost, int do_recon, int64_t best_rd,
+ PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int hbs = mi_size_wide[bsize] >> 1;
+ TokenExtra *tp_orig = *tp;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ RD_STATS this_rdc, best_rdc;
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ int do_split = bsize > BLOCK_8X8;
+ // Override skipping rectangular partition operations for edge blocks
+ const int force_horz_split = (mi_row + 2 * hbs > cm->mi_params.mi_rows);
+ const int force_vert_split = (mi_col + 2 * hbs > cm->mi_params.mi_cols);
+
+ int partition_none_allowed = !force_horz_split && !force_vert_split;
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]); // Square partition only
+ assert(cm->seq_params->sb_size == BLOCK_64X64); // Small SB so far
+
+ (void)*tp_orig;
+
+ av1_invalid_rd_stats(&best_rdc);
+ best_rdc.rdcost = best_rd;
+#ifndef _COLLECT_GROUND_TRUTH_
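+ // Let the variance-based ML model prune the search: a confident
+ // PARTITION_NONE prediction disables the split search, and a confident
+ // PARTITION_SPLIT prediction disables the none search.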
+ if (partition_none_allowed && do_split) {
+ const int ml_predicted_partition =
+ ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col);
+ if (ml_predicted_partition == PARTITION_NONE) do_split = 0;
+ if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0;
+ }
+#endif
+
+ xd->above_txfm_context =
+ cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ av1_save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+ // PARTITION_NONE
+ if (partition_none_allowed) {
+ pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
+ if (!pc_tree->none)
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PICK_MODE_CONTEXT");
+ PICK_MODE_CONTEXT *ctx = pc_tree->none;
+
+// Set the "#if 0" below to 1 to use the RDO-based pick mode instead.
+#if 0
+ RD_STATS dummy;
+ av1_invalid_rd_stats(&dummy);
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+ PARTITION_NONE, bsize, ctx, dummy);
+#else
+ pick_sb_modes_nonrd(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize,
+ ctx);
+#endif
+ if (this_rdc.rate != INT_MAX) {
+ const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+
+ this_rdc.rate += mode_costs->partition_cost[pl][PARTITION_NONE];
+ this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = this_rdc;
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+ }
+ }
+ }
+
+ // PARTITION_SPLIT
+ if (do_split) {
+ RD_STATS sum_rdc;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+ av1_init_rd_stats(&sum_rdc);
+
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+ if (!pc_tree->split[i])
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate PC_TREE");
+ pc_tree->split[i]->index = i;
+ }
+
+ int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ sum_rdc.rate += mode_costs->partition_cost[pl][PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+ for (int i = 0;
+ i < SUB_PARTITIONS_SPLIT && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
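+ // Sub-block i is at (row, col) = (i >> 1, i & 1) in half-block units.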
+ const int x_idx = (i & 1) * hbs;
+ const int y_idx = (i >> 1) * hbs;
+
+ if (mi_row + y_idx >= cm->mi_params.mi_rows ||
+ mi_col + x_idx >= cm->mi_params.mi_cols)
+ continue;
+ av1_nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
+ mi_col + x_idx, subsize, &this_rdc, i < 3,
+ best_rdc.rdcost - sum_rdc.rdcost,
+ pc_tree->split[i]);
+
+ if (this_rdc.rate == INT_MAX) {
+ av1_invalid_rd_stats(&sum_rdc);
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ }
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_SPLIT;
+ }
+ }
+
+#ifdef _COLLECT_GROUND_TRUTH_
+ store_partition_data(cpi, x, bsize, mi_row, mi_col, pc_tree->partitioning);
+#endif
+
+ *rd_cost = best_rdc;
+
+ av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+ if (best_rdc.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
+
+ // update mode info array
+ fill_mode_info_sb(cpi, x, mi_row, mi_col, bsize, pc_tree);
+
+ if (do_recon) {
+ if (bsize == cm->seq_params->sb_size) {
+ // NOTE: To get estimate for rate due to the tokens, use:
+ // int rate_coeffs = 0;
+ // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+ // bsize, pc_tree, &rate_coeffs);
+ set_cb_offsets(x->cb_offset, 0, 0);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+
+ if (bsize == BLOCK_64X64 && do_recon) {
+ assert(best_rdc.rate < INT_MAX);
+ assert(best_rdc.dist < INT64_MAX);
+ } else {
+ assert(tp_orig == *tp);
+ }
+}
+#endif // CONFIG_RT_ML_PARTITIONING
diff --git a/third_party/aom/av1/encoder/partition_search.h b/third_party/aom/av1/encoder/partition_search.h
new file mode 100644
index 0000000000..1b5d71b7da
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_search.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_SEARCH_H_
+#define AOM_AV1_ENCODER_PARTITION_SEARCH_H_
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/tokenize.h"
+
+void av1_set_offsets_without_segment_id(const AV1_COMP *const cpi,
+ const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize);
+void av1_set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+void av1_rd_use_partition(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+ MB_MODE_INFO **mib, TokenExtra **tp, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int *rate,
+ int64_t *dist, int do_recon, PC_TREE *pc_tree);
+void av1_nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, MB_MODE_INFO **mib,
+ TokenExtra **tp, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, PC_TREE *pc_tree);
+#if CONFIG_RT_ML_PARTITIONING
+void av1_nonrd_pick_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RD_STATS *rd_cost, int do_recon, int64_t best_rd,
+ PC_TREE *pc_tree);
+#endif
+void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf);
+void av1_reset_sf_for_ext_part(AV1_COMP *const cpi);
+
+#if CONFIG_PARTITION_SEARCH_ORDER
+bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp,
+ SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ RD_STATS *best_rd_cost);
+#endif
+
+bool av1_rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TokenExtra **tp, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, RD_STATS *rd_cost,
+ RD_STATS best_rdc, PC_TREE *pc_tree,
+ SIMPLE_MOTION_DATA_TREE *sms_tree, int64_t *none_rd,
+ SB_MULTI_PASS_MODE multi_pass_mode,
+ RD_RECT_PART_WIN_INFO *rect_part_win_info);
+
+static AOM_INLINE void set_cb_offsets(uint16_t *cb_offset,
+ const uint16_t cb_offset_y,
+ const uint16_t cb_offset_uv) {
+ cb_offset[PLANE_TYPE_Y] = cb_offset_y;
+ cb_offset[PLANE_TYPE_UV] = cb_offset_uv;
+}
+
+static AOM_INLINE void update_cb_offsets(MACROBLOCK *x, const BLOCK_SIZE bsize,
+ const int subsampling_x,
+ const int subsampling_y) {
+ x->cb_offset[PLANE_TYPE_Y] += block_size_wide[bsize] * block_size_high[bsize];
+ if (x->e_mbd.is_chroma_ref) {
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, subsampling_x, subsampling_y);
+ assert(plane_bsize != BLOCK_INVALID);
+ x->cb_offset[PLANE_TYPE_UV] +=
+ block_size_wide[plane_bsize] * block_size_high[plane_bsize];
+ }
+}
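+
+/* Illustrative sketch, not part of the upstream source: how the two helpers
+ * above combine for a BLOCK_16X16 coding block under 4:2:0 subsampling
+ * (subsampling_x == subsampling_y == 1), where get_plane_block_size() maps
+ * the chroma planes to BLOCK_8X8:
+ *
+ *   set_cb_offsets(x->cb_offset, 0, 0);      // start of the superblock
+ *   update_cb_offsets(x, BLOCK_16X16, 1, 1);
+ *   // x->cb_offset[PLANE_TYPE_Y]  == 16 * 16 == 256
+ *   // x->cb_offset[PLANE_TYPE_UV] ==  8 *  8 ==  64 (when is_chroma_ref)
+ */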
+
+#endif // AOM_AV1_ENCODER_PARTITION_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/partition_strategy.c b/third_party/aom/av1/encoder/partition_strategy.c
new file mode 100644
index 0000000000..ce06313579
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_strategy.c
@@ -0,0 +1,2573 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <float.h>
+
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/thirdpass.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/reconinter.h"
+
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/cnn.h"
+#include "av1/encoder/partition_model_weights.h"
+#include "av1/encoder/partition_cnn_weights.h"
+#endif
+#include "av1/encoder/encoder.h"
+
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/partition_search.h"
+#include "av1/encoder/rdopt.h"
+
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE void simple_motion_search_prune_part_features(
+ AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+ int mi_row, int mi_col, BLOCK_SIZE bsize, float *features,
+ int features_to_get);
+
+static bool ext_ml_model_decision_before_none(
+ AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT],
+ int *partition_none_allowed, int *partition_horz_allowed,
+ int *partition_vert_allowed, int *do_rectangular_split,
+ int *do_square_split);
+
+static bool ext_ml_model_decision_before_none_part2(
+ AV1_COMP *cpi,
+ const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART],
+ int *prune_horz, int *prune_vert);
+
+static bool ext_ml_model_decision_after_none(
+ ExtPartController *const ext_part_controller, const int is_intra_frame,
+ const float *const features_after_none, int *do_square_split,
+ int *do_rectangular_split);
+
+static bool ext_ml_model_decision_after_none_part2(
+ AV1_COMP *const cpi, const float *const features_terminate,
+ int *terminate_partition_search);
+
+static bool ext_ml_model_decision_after_split(
+ AV1_COMP *const cpi, const float *const features_terminate,
+ int *terminate_partition_search);
+
+static bool ext_ml_model_decision_after_split_part2(
+ ExtPartController *const ext_part_controller, const int is_intra_frame,
+ const float *const features_prune, int *prune_rect_part_horz,
+ int *prune_rect_part_vert);
+
+static bool ext_ml_model_decision_after_rect(
+ ExtPartController *const ext_part_controller, const int is_intra_frame,
+ const float *const features_after_rect, int *horza_partition_allowed,
+ int *horzb_partition_allowed, int *verta_partition_allowed,
+ int *vertb_partition_allowed);
+
+static bool ext_ml_model_decision_after_part_ab(
+ AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx,
+ int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+ int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed,
+ int *const partition_vert4_allowed, unsigned int pb_source_variance,
+ int mi_row, int mi_col);
+
+static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) {
+ switch (bsize) {
+ case BLOCK_128X128: return 0;
+ case BLOCK_64X64: return 1;
+ case BLOCK_32X32: return 2;
+ case BLOCK_16X16: return 3;
+ case BLOCK_8X8: return 4;
+ default: assert(0 && "Invalid bsize"); return -1;
+ }
+}
+
+static char *get_feature_file_name(int id) {
+ static char *feature_file_names[] = {
+ "feature_before_partition_none",
+ "feature_before_partition_none_prune_rect",
+ "feature_after_partition_none_prune",
+ "feature_after_partition_none_terminate",
+ "feature_after_partition_split_terminate",
+ "feature_after_partition_split_prune_rect",
+ "feature_after_partition_rect",
+ "feature_after_partition_ab",
+ };
+
+ return feature_file_names[id];
+}
+
+static void write_features_to_file(const char *const path,
+ const bool is_test_mode,
+ const float *features,
+ const int feature_size, const int id,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col) {
+ if (!WRITE_FEATURE_TO_FILE && !is_test_mode) return;
+
+ char filename[256];
+ snprintf(filename, sizeof(filename), "%s/%s", path,
+ get_feature_file_name(id));
+ FILE *pfile = fopen(filename, "a");
+ if (pfile == NULL) return;
+ if (!is_test_mode) {
+ fprintf(pfile, "%d,%d,%d,%d,%d\n", id, (int)bsize, mi_row, mi_col,
+ feature_size);
+ }
+ for (int i = 0; i < feature_size; ++i) {
+ fprintf(pfile, "%.6f", features[i]);
+ if (i < feature_size - 1) fprintf(pfile, ",");
+ }
+ fprintf(pfile, "\n");
+ fclose(pfile);
+}
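+
+/* Illustrative sketch, not part of the upstream source: with
+ * WRITE_FEATURE_TO_FILE enabled and is_test_mode == false, each call appends
+ * an "id,bsize,mi_row,mi_col,feature_size" header row followed by the
+ * feature row, e.g. for a hypothetical 4-feature call on a 64X64 block:
+ *
+ *   0,12,0,0,4
+ *   3.421002,0.000000,7.102311,1.250000
+ *
+ * In test mode the header row is skipped, so the external model reads a bare
+ * CSV of floats.
+ */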
+
+// TODO(chiyotsai@google.com): This is very much a work in progress. We still
+// need to do the following:
+// -- add support for hdres
+// -- add support for pruning rectangular partitions
+// -- use reconstructed pixels instead of source pixels for padding
+// -- use chroma pixels in addition to luma pixels
+void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
+ int quad_tree_idx,
+ int intra_cnn_based_part_prune_level,
+ PartitionSearchState *part_state) {
+ assert(cm->seq_params->sb_size >= BLOCK_64X64 &&
+ "Invalid sb_size for intra_cnn!");
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ const int bsize_idx = convert_bsize_to_idx(bsize);
+
+ if (bsize == BLOCK_128X128) {
+ return;
+ }
+
+ PartitionSearchInfo *part_info = &x->part_search_info;
+
+ // Precompute the CNN part and cache the result in MACROBLOCK
+ if (bsize == BLOCK_64X64 && !part_info->cnn_output_valid) {
+ const CNN_CONFIG *cnn_config = &av1_intra_mode_cnn_partition_cnn_config;
+
+ // Prepare the output
+ const CNN_THREAD_DATA thread_data = { .num_workers = 1, .workers = NULL };
+ const int num_outputs = 4;
+ const int output_dims[4] = { 1, 2, 4, 8 };
+ const int out_chs[4] = { CNN_BRANCH_0_OUT_CH, CNN_BRANCH_1_OUT_CH,
+ CNN_BRANCH_2_OUT_CH, CNN_BRANCH_3_OUT_CH };
+ float *output_buffer[CNN_TOT_OUT_CH];
+
+ float **cur_output_buf = output_buffer;
+ float *curr_buf_ptr = part_info->cnn_buffer;
+ for (int output_idx = 0; output_idx < num_outputs; output_idx++) {
+ const int num_chs = out_chs[output_idx];
+ const int ch_size = output_dims[output_idx] * output_dims[output_idx];
+ for (int ch = 0; ch < num_chs; ch++) {
+ cur_output_buf[ch] = curr_buf_ptr;
+ curr_buf_ptr += ch_size;
+ }
+ cur_output_buf += num_chs;
+ }
+
+ CNN_MULTI_OUT output = {
+ .num_outputs = 4,
+ .output_channels = out_chs,
+ .output_strides = output_dims,
+ .output_buffer = output_buffer,
+ };
+
+ // Prepare the input
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int bit_depth = xd->bd;
+ const int dc_q =
+ av1_dc_quant_QTX(x->qindex, 0, bit_depth) >> (bit_depth - 8);
+ part_info->log_q = log1pf((float)(dc_q * dc_q) / 256.0f);
+ part_info->log_q =
+ (part_info->log_q - av1_intra_mode_cnn_partition_mean[0]) /
+ av1_intra_mode_cnn_partition_std[0];
+
+ const int width = 65, height = 65,
+ stride = x->plane[AOM_PLANE_Y].src.stride;
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *image[1] = {
+ CONVERT_TO_SHORTPTR(x->plane[AOM_PLANE_Y].src.buf) - stride - 1
+ };
+
+ if (!av1_cnn_predict_img_multi_out_highbd(image, width, height, stride,
+ cnn_config, &thread_data,
+ bit_depth, &output)) {
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating CNN data");
+ return;
+ }
+ } else {
+ uint8_t *image[1] = { x->plane[AOM_PLANE_Y].src.buf - stride - 1 };
+
+ if (!av1_cnn_predict_img_multi_out(image, width, height, stride,
+ cnn_config, &thread_data, &output)) {
+ aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating CNN data");
+ return;
+ }
+ }
+
+ part_info->cnn_output_valid = 1;
+ }
+
+ if (!part_info->cnn_output_valid) {
+ return;
+ }
+
+ const NN_CONFIG *dnn_configs[5] = {
+ NULL,
+ &av1_intra_mode_cnn_partition_branch_0_dnn_config,
+ &av1_intra_mode_cnn_partition_branch_1_dnn_config,
+ &av1_intra_mode_cnn_partition_branch_2_dnn_config,
+ &av1_intra_mode_cnn_partition_branch_3_dnn_config,
+ };
+
+ const NN_CONFIG *dnn_config = dnn_configs[bsize_idx];
+
+ float dnn_features[100];
+ float logits[4] = { 0.0f };
+
+ const float *branch_0 = part_info->cnn_buffer;
+ const float *branch_1 = branch_0 + CNN_BRANCH_0_OUT_SIZE;
+ const float *branch_2 = branch_1 + CNN_BRANCH_1_OUT_SIZE;
+ const float *branch_3 = branch_2 + CNN_BRANCH_2_OUT_SIZE;
+
+ if (bsize == BLOCK_64X64) {
+ int f_idx = 0;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_0_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_0[ch_idx];
+ }
+
+ const int spa_stride = 2 * 2;
+ for (int lin_idx = 0; lin_idx < spa_stride; lin_idx++) {
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_1[lin_idx + ch_idx * spa_stride];
+ }
+ }
+ dnn_features[f_idx++] = part_info->log_q;
+ } else if (bsize == BLOCK_32X32) {
+ int f_idx = 0;
+ for (int idx = 0; idx < CNN_BRANCH_0_OUT_CH; idx++) {
+ dnn_features[f_idx++] = branch_0[idx];
+ }
+
+ const int curr_lin_idx = quad_to_linear_1[quad_tree_idx - 1];
+ const int spa_stride = 2 * 2;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_1[curr_lin_idx + ch_idx * spa_stride];
+ }
+ dnn_features[f_idx++] = part_info->log_q;
+ } else if (bsize == BLOCK_16X16) {
+ int f_idx = 0;
+ const int prev_quad_idx = (quad_tree_idx - 1) / 4;
+ const int prev_lin_idx = quad_to_linear_1[prev_quad_idx - 1];
+ const int prev_spa_stride = 2 * 2;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_1[prev_lin_idx + ch_idx * prev_spa_stride];
+ }
+
+ const int curr_lin_idx = quad_to_linear_2[quad_tree_idx - 5];
+ const int spa_stride = 4 * 4;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_2[curr_lin_idx + ch_idx * spa_stride];
+ }
+ dnn_features[f_idx++] = part_info->log_q;
+ } else if (bsize == BLOCK_8X8) {
+ int f_idx = 0;
+ const int prev_quad_idx = (quad_tree_idx - 1) / 4;
+ const int prev_lin_idx = quad_to_linear_2[prev_quad_idx - 5];
+ const int prev_spa_stride = 4 * 4;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_2[prev_lin_idx + ch_idx * prev_spa_stride];
+ }
+
+ const int curr_lin_idx = quad_to_linear_3[quad_tree_idx - 21];
+ const int spa_stride = 8 * 8;
+ for (int ch_idx = 0; ch_idx < CNN_BRANCH_3_OUT_CH; ch_idx++) {
+ dnn_features[f_idx++] = branch_3[curr_lin_idx + ch_idx * spa_stride];
+ }
+ dnn_features[f_idx++] = part_info->log_q;
+ } else {
+ assert(0 && "Invalid bsize in intra_cnn partition");
+ }
+
+ // Make decision
+ av1_nn_predict(dnn_features, dnn_config, 1, logits);
+
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+ float split_only_thresh = 100.0f, no_split_thresh = -100.0f;
+ if (is_720p_or_larger) {
+ split_only_thresh =
+ av1_intra_mode_cnn_partition_split_thresh_hdres[bsize_idx];
+ no_split_thresh =
+ av1_intra_mode_cnn_partition_no_split_thresh_hdres[bsize_idx];
+ } else if (is_480p_or_larger) {
+ split_only_thresh =
+ av1_intra_mode_cnn_partition_split_thresh_midres[bsize_idx];
+ no_split_thresh =
+ av1_intra_mode_cnn_partition_no_split_thresh_midres[bsize_idx];
+ } else {
+ split_only_thresh =
+ av1_intra_mode_cnn_partition_split_thresh_lowres[bsize_idx];
+ no_split_thresh =
+ av1_intra_mode_cnn_partition_no_split_thresh_lowres[bsize_idx];
+ }
+
+ if (logits[0] > split_only_thresh) {
+    // As screen content tends to choose larger partitions, do not prune
+    // PARTITION_NONE when intra_cnn_based_part_prune_level=1.
+ if (intra_cnn_based_part_prune_level != 1) {
+ part_state->partition_none_allowed = 0;
+ }
+ part_state->do_square_split = 1;
+ av1_disable_rect_partitions(part_state);
+ }
+
+ if (logits[0] < no_split_thresh) {
+ av1_disable_square_split_partition(part_state);
+ }
+}
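+
+/* Illustrative sketch, not part of the upstream source: the layout of
+ * part_info->cnn_buffer assembled above. Branch b contributes out_chs[b]
+ * channels of output_dims[b] * output_dims[b] floats, stored channel after
+ * channel, so the base pointers used by the DNN stage are:
+ *
+ *   branch_0 = cnn_buffer;                              // 1x1 x CH0 floats
+ *   branch_1 = branch_0 + 1 * 1 * CNN_BRANCH_0_OUT_CH;  // 2x2 x CH1 floats
+ *   branch_2 = branch_1 + 2 * 2 * CNN_BRANCH_1_OUT_CH;  // 4x4 x CH2 floats
+ *   branch_3 = branch_2 + 4 * 4 * CNN_BRANCH_2_OUT_CH;  // 8x8 x CH3 floats
+ *
+ * which assumes CNN_BRANCH_k_OUT_SIZE == dims_k * dims_k * out_chs_k.
+ */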
+
+static INLINE int get_simple_motion_search_prune_agg(int qindex,
+ int prune_level,
+ int is_rect_part) {
+ assert(prune_level < TOTAL_AGG_LVLS);
+ if (prune_level == NO_PRUNING) {
+ return -1;
+ }
+
+ // Aggressiveness value for SIMPLE_MOTION_SEARCH_PRUNE_LEVEL except
+ // QIDX_BASED_AGG_LVL
+ const int sms_prune_agg_levels[TOTAL_SIMPLE_AGG_LVLS] = { 0, 1, 2, 3 };
+ if (prune_level < TOTAL_SIMPLE_AGG_LVLS) {
+ return sms_prune_agg_levels[prune_level];
+ }
+
+ // Map the QIDX_BASED_AGG_LVL to corresponding aggressiveness value.
+ // Aggressive pruning for lower quantizers in non-boosted frames to prune
+ // rectangular partitions.
+ const int qband = is_rect_part ? (qindex <= 90 ? 1 : 0) : 0;
+ const int sms_prune_agg_qindex_based[2] = { 1, 2 };
+ return sms_prune_agg_qindex_based[qband];
+}
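+
+/* Illustrative sketch, not part of the upstream source: how the mapping
+ * above behaves, assuming QIDX_BASED_AGG_LVL is the one level beyond
+ * TOTAL_SIMPLE_AGG_LVLS:
+ *
+ *   get_simple_motion_search_prune_agg(q,   NO_PRUNING,         rect) -> -1
+ *   get_simple_motion_search_prune_agg(q,   2,                  rect) ->  2
+ *   get_simple_motion_search_prune_agg(80,  QIDX_BASED_AGG_LVL, 1)    ->  2
+ *   get_simple_motion_search_prune_agg(120, QIDX_BASED_AGG_LVL, 1)    ->  1
+ *   get_simple_motion_search_prune_agg(80,  QIDX_BASED_AGG_LVL, 0)    ->  1
+ */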
+
+void av1_simple_motion_search_based_split(AV1_COMP *const cpi, MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ PartitionSearchState *part_state) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ const int bsize_idx = convert_bsize_to_idx(bsize);
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+ // res_idx is 0 for res < 480p, 1 for 480p, 2 for 720p+
+ const int res_idx = is_480p_or_larger + is_720p_or_larger;
+
+ assert(bsize_idx >= 0 && bsize_idx <= 4 &&
+ "Invalid bsize in simple_motion_search_based_split");
+
+ const float *ml_mean = av1_simple_motion_search_split_mean[bsize_idx];
+ const float *ml_std = av1_simple_motion_search_split_std[bsize_idx];
+ const NN_CONFIG *nn_config =
+ av1_simple_motion_search_split_nn_config[bsize_idx];
+
+ const int agg = get_simple_motion_search_prune_agg(
+ x->qindex, cpi->sf.part_sf.simple_motion_search_prune_agg, 0);
+ if (agg < 0) {
+ return;
+ }
+
+ const float split_only_thresh =
+ av1_simple_motion_search_split_thresh[agg][res_idx][bsize_idx];
+ const float no_split_thresh =
+ av1_simple_motion_search_no_split_thresh[agg][res_idx][bsize_idx];
+
+ float features[FEATURE_SIZE_SMS_SPLIT] = { 0.0f };
+ simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
+ bsize, features,
+ FEATURE_SMS_SPLIT_MODEL_FLAG);
+
+ // Write features to file
+ write_features_to_file(cpi->oxcf.partition_info_path,
+ cpi->ext_part_controller.test_mode, features,
+ FEATURE_SIZE_SMS_SPLIT, 0, bsize, mi_row, mi_col);
+
+  // Note: the features are intentionally left unnormalized here, to keep
+  // them consistent with all features collected and passed to the external
+  // model.
+ if (ext_ml_model_decision_before_none(
+ cpi, features, &part_state->partition_none_allowed,
+ &part_state->partition_rect_allowed[HORZ],
+ &part_state->partition_rect_allowed[VERT],
+ &part_state->do_rectangular_split, &part_state->do_square_split)) {
+ return;
+ }
+
+ for (int idx = 0; idx < FEATURE_SIZE_SMS_SPLIT; idx++) {
+ features[idx] = (features[idx] - ml_mean[idx]) / ml_std[idx];
+ }
+
+ float score = 0.0f;
+
+ av1_nn_predict(features, nn_config, 1, &score);
+
+ if (score > split_only_thresh) {
+ av1_set_square_split_only(part_state);
+ }
+
+ if (cpi->sf.part_sf.simple_motion_search_split >= 2 &&
+ score < no_split_thresh) {
+ av1_disable_square_split_partition(part_state);
+ }
+
+ // If the score is very low, prune rectangular split since it is unlikely to
+ // occur.
+ if (cpi->sf.part_sf.simple_motion_search_rect_split) {
+ const float scale = res_idx >= 2 ? 3.0f : 2.0f;
+ const float rect_split_thresh =
+ scale * av1_simple_motion_search_no_split_thresh
+ [cpi->sf.part_sf.simple_motion_search_rect_split][res_idx]
+ [bsize_idx];
+ if (score < rect_split_thresh) {
+ part_state->do_rectangular_split = 0;
+ }
+ }
+}
+
+// Given a list of ref frames in refs, performs simple_motion_search on each
+// of the refs and returns the ref with the smallest sse. Returns -1 if none
+// of the refs in the list are available. Also stores the best sse and var in
+// best_sse and best_var, respectively. If save_mv is 0, the start_mvs stored
+// in sms_tree are left untouched; if save_mv is 1, they are updated in
+// sms_tree and propagated down to its subtrees.
+static int simple_motion_search_get_best_ref(
+ AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+ int mi_row, int mi_col, BLOCK_SIZE bsize, const int *const refs,
+ int num_refs, int use_subpixel, int save_mv, unsigned int *best_sse,
+ unsigned int *best_var) {
+ const AV1_COMMON *const cm = &cpi->common;
+ int best_ref = -1;
+
+ if (mi_col >= cm->mi_params.mi_cols || mi_row >= cm->mi_params.mi_rows) {
+ // If the whole block is outside of the image, set the var and sse to 0.
+ *best_var = 0;
+ *best_sse = 0;
+
+ return best_ref;
+ }
+
+  // Otherwise, loop through the reference frames and find the one with the
+  // minimum SSE.
+ const int num_planes = 1;
+
+ *best_sse = INT_MAX;
+
+ for (int ref_idx = 0; ref_idx < num_refs; ref_idx++) {
+ const int ref = refs[ref_idx];
+
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref]) {
+ const FULLPEL_MV *start_mvs = sms_tree->start_mvs;
+ unsigned int curr_sse = 0, curr_var = 0;
+ const int_mv best_mv = av1_simple_motion_search_sse_var(
+ cpi, x, mi_row, mi_col, bsize, ref, start_mvs[ref], num_planes,
+ use_subpixel, &curr_sse, &curr_var);
+ if (curr_sse < *best_sse) {
+ *best_sse = curr_sse;
+ *best_var = curr_var;
+ best_ref = ref;
+ }
+
+ if (save_mv) {
+ sms_tree->start_mvs[ref].row = best_mv.as_mv.row / 8;
+ sms_tree->start_mvs[ref].col = best_mv.as_mv.col / 8;
+
+ if (bsize >= BLOCK_8X8) {
+ for (int r_idx = 0; r_idx < SUB_PARTITIONS_SPLIT; r_idx++) {
+ // Propagate the new motion vectors to a lower level
+ SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[r_idx];
+ sub_tree->start_mvs[ref] = sms_tree->start_mvs[ref];
+ }
+ }
+ }
+ }
+ }
+
+ return best_ref;
+}
+
+// Collects features using simple_motion_search and stores them in features. The
+// features are also cached in SIMPLE_MOTION_DATA_TREE. By default, the features
+// collected are the sse and var from the subblocks flagged by features_to_get.
+// Furthermore, if features is not NULL, then 7 more features are appended to
+// the end of features:
+// - log(1.0 + dc_q ** 2)
+// - whether an above macroblock exists
+// - width of above macroblock
+// - height of above macroblock
+// - whether a left macroblock exists
+// - width of left macroblock
+// - height of left macroblock
+static AOM_INLINE void simple_motion_search_prune_part_features(
+ AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+ int mi_row, int mi_col, BLOCK_SIZE bsize, float *features,
+ int features_to_get) {
+ const int w_mi = mi_size_wide[bsize];
+ const int h_mi = mi_size_high[bsize];
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+ assert(bsize >= BLOCK_8X8);
+ assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] ||
+ cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
+
+ // Setting up motion search
+ const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME
+ : LAST_FRAME };
+ const int num_refs = 1;
+ const int use_subpixel = 1;
+
+  // Do the whole block first to update the mv.
+ if (!sms_tree->sms_none_valid && features_to_get & FEATURE_SMS_NONE_FLAG) {
+ simple_motion_search_get_best_ref(cpi, x, sms_tree, mi_row, mi_col, bsize,
+ ref_list, num_refs, use_subpixel, 1,
+ &sms_tree->sms_none_feat[0],
+ &sms_tree->sms_none_feat[1]);
+ sms_tree->sms_none_valid = 1;
+ }
+
+ // Split subblocks
+ if (features_to_get & FEATURE_SMS_SPLIT_FLAG) {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ for (int r_idx = 0; r_idx < SUB_PARTITIONS_SPLIT; r_idx++) {
+ const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2;
+ const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2;
+ SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[r_idx];
+
+ if (!sub_tree->sms_none_valid) {
+ simple_motion_search_get_best_ref(
+ cpi, x, sub_tree, sub_mi_row, sub_mi_col, subsize, ref_list,
+ num_refs, use_subpixel, 1, &sub_tree->sms_none_feat[0],
+ &sub_tree->sms_none_feat[1]);
+ sub_tree->sms_none_valid = 1;
+ }
+ }
+ }
+
+ // Rectangular subblocks
+ if (!sms_tree->sms_rect_valid && features_to_get & FEATURE_SMS_RECT_FLAG) {
+ // Horz subblock
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+ for (int r_idx = 0; r_idx < SUB_PARTITIONS_RECT; r_idx++) {
+ const int sub_mi_col = mi_col + 0;
+ const int sub_mi_row = mi_row + r_idx * h_mi / 2;
+
+ simple_motion_search_get_best_ref(
+ cpi, x, sms_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+ use_subpixel, 0, &sms_tree->sms_rect_feat[2 * r_idx],
+ &sms_tree->sms_rect_feat[2 * r_idx + 1]);
+ }
+
+ // Vert subblock
+ subsize = get_partition_subsize(bsize, PARTITION_VERT);
+ for (int r_idx = 0; r_idx < SUB_PARTITIONS_RECT; r_idx++) {
+ const int sub_mi_col = mi_col + r_idx * w_mi / 2;
+ const int sub_mi_row = mi_row + 0;
+
+ simple_motion_search_get_best_ref(
+ cpi, x, sms_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+ use_subpixel, 0, &sms_tree->sms_rect_feat[4 + 2 * r_idx],
+ &sms_tree->sms_rect_feat[4 + 2 * r_idx + 1]);
+ }
+ sms_tree->sms_rect_valid = 1;
+ }
+
+ if (!features) return;
+
+ int f_idx = 0;
+ if (features_to_get & FEATURE_SMS_NONE_FLAG) {
+ for (int sub_idx = 0; sub_idx < 2; sub_idx++) {
+ features[f_idx++] = log1pf((float)sms_tree->sms_none_feat[sub_idx]);
+ }
+ }
+
+ if (features_to_get & FEATURE_SMS_SPLIT_FLAG) {
+ for (int sub_idx = 0; sub_idx < SUB_PARTITIONS_SPLIT; sub_idx++) {
+ SIMPLE_MOTION_DATA_TREE *sub_tree = sms_tree->split[sub_idx];
+ features[f_idx++] = log1pf((float)sub_tree->sms_none_feat[0]);
+ features[f_idx++] = log1pf((float)sub_tree->sms_none_feat[1]);
+ }
+ }
+
+ if (features_to_get & FEATURE_SMS_RECT_FLAG) {
+ for (int sub_idx = 0; sub_idx < 8; sub_idx++) {
+ features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[sub_idx]);
+ }
+ }
+
+ const MACROBLOCKD *xd = &x->e_mbd;
+ set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+ // Q_INDEX
+ const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+ features[f_idx++] = log1pf((float)(dc_q * dc_q) / 256.0f);
+
+ // Neighbor stuff
+ const int has_above = !!xd->above_mbmi;
+ const int has_left = !!xd->left_mbmi;
+ const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->bsize : bsize;
+ const BLOCK_SIZE left_bsize = has_left ? xd->left_mbmi->bsize : bsize;
+ features[f_idx++] = (float)has_above;
+ features[f_idx++] = (float)mi_size_wide_log2[above_bsize];
+ features[f_idx++] = (float)mi_size_high_log2[above_bsize];
+ features[f_idx++] = (float)has_left;
+ features[f_idx++] = (float)mi_size_wide_log2[left_bsize];
+ features[f_idx++] = (float)mi_size_high_log2[left_bsize];
+}
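+
+/* Illustrative sketch, not part of the upstream source: the feature layout
+ * produced above when features_to_get enables all three feature groups
+ * (none, split, and rect), matching the comment before the function:
+ *
+ *   f[0..1]    log1p of the whole-block sse and var
+ *   f[2..9]    log1p of the sse and var of the 4 split subblocks
+ *   f[10..17]  log1p of the sse and var of the 2 horz + 2 vert subblocks
+ *   f[18]      log1p(dc_q^2 / 256)
+ *   f[19..21]  has_above, log2 width, log2 height of the above block
+ *   f[22..24]  has_left,  log2 width, log2 height of the left block
+ *
+ * i.e. 25 features in total, which presumably matches
+ * FEATURE_SIZE_SMS_PRUNE_PART.
+ */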
+
+void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ PartitionSearchState *part_state) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ const int bsize_idx = convert_bsize_to_idx(bsize);
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  // res_idx is 0 for lowres, 1 for 480p, 2 for 720p+
+ const int res_idx = is_480p_or_larger + is_720p_or_larger;
+
+ // Get model parameters
+ const NN_CONFIG *nn_config =
+ av1_simple_motion_search_prune_rect_nn_config[bsize_idx];
+ const float *ml_mean = av1_simple_motion_search_prune_rect_mean[bsize_idx],
+ *ml_std = av1_simple_motion_search_prune_rect_std[bsize_idx];
+
+ const int agg = get_simple_motion_search_prune_agg(
+ x->qindex, cpi->sf.part_sf.simple_motion_search_prune_agg, 1);
+ if (agg < 0) {
+ return;
+ }
+
+ const float prune_thresh =
+ av1_simple_motion_search_prune_rect_thresh[agg][res_idx][bsize_idx];
+
+ // If there is no valid threshold, return immediately.
+ if (!nn_config || prune_thresh == 0.0f) {
+ return;
+ }
+
+ // Get features
+ float features[FEATURE_SIZE_SMS_PRUNE_PART] = { 0.0f };
+ simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
+ bsize, features,
+ FEATURE_SMS_PRUNE_PART_FLAG);
+
+  // Note: the features are intentionally left unnormalized here, to keep
+  // them consistent with all features collected and passed to the external
+  // model.
+ if (cpi->sf.part_sf.simple_motion_search_prune_rect &&
+ !frame_is_intra_only(cm) &&
+ (part_state->partition_rect_allowed[HORZ] ||
+ part_state->partition_rect_allowed[VERT]) &&
+ bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) {
+ // Write features to file
+ write_features_to_file(
+ cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode,
+ features, FEATURE_SIZE_SMS_PRUNE_PART, 1, bsize, mi_row, mi_col);
+
+ if (ext_ml_model_decision_before_none_part2(
+ cpi, features, &part_state->prune_rect_part[HORZ],
+ &part_state->prune_rect_part[VERT])) {
+ return;
+ }
+ }
+
+ for (int f_idx = 0; f_idx < FEATURE_SIZE_SMS_PRUNE_PART; f_idx++) {
+ features[f_idx] = (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
+ }
+
+ // Get probabilities
+ float scores[EXT_PARTITION_TYPES] = { 0.0f },
+ probs[EXT_PARTITION_TYPES] = { 0.0f };
+ const int num_classes = (bsize == BLOCK_128X128 || bsize == BLOCK_8X8)
+ ? PARTITION_TYPES
+ : EXT_PARTITION_TYPES;
+
+ av1_nn_predict(features, nn_config, 1, scores);
+
+ av1_nn_softmax(scores, probs, num_classes);
+
+ // Determine if we should prune rectangular partitions.
+ if (probs[PARTITION_HORZ] <= prune_thresh) {
+ part_state->prune_rect_part[HORZ] = 1;
+ }
+ if (probs[PARTITION_VERT] <= prune_thresh) {
+ part_state->prune_rect_part[VERT] = 1;
+ }
+}
+
+// Early-terminates the partition search after PARTITION_NONE, using
+// simple_motion_search features and the rate, distortion, and rdcost of
+// PARTITION_NONE. This is only called when:
+// - The frame is a show frame
+// - The frame is not intra only
+// - The current bsize is > BLOCK_8X8
+// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
+void av1_simple_motion_search_early_term_none(
+ AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
+ const RD_STATS *none_rdc, PartitionSearchState *part_state) {
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ float features[FEATURE_SIZE_SMS_TERM_NONE] = { 0.0f };
+ simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
+ bsize, features,
+ FEATURE_SMS_PRUNE_PART_FLAG);
+ int f_idx = FEATURE_SIZE_SMS_PRUNE_PART;
+
+ features[f_idx++] = log1pf((float)none_rdc->rate);
+ features[f_idx++] = log1pf((float)none_rdc->dist);
+ features[f_idx++] = log1pf((float)none_rdc->rdcost);
+
+ assert(f_idx == FEATURE_SIZE_SMS_TERM_NONE);
+
+ const float *ml_mean = NULL;
+ const float *ml_std = NULL;
+ const float *ml_model = NULL;
+
+ if (bsize == BLOCK_128X128) {
+ ml_mean = av1_simple_motion_search_term_none_mean_128;
+ ml_std = av1_simple_motion_search_term_none_std_128;
+ ml_model = av1_simple_motion_search_term_none_model_128;
+ } else if (bsize == BLOCK_64X64) {
+ ml_mean = av1_simple_motion_search_term_none_mean_64;
+ ml_std = av1_simple_motion_search_term_none_std_64;
+ ml_model = av1_simple_motion_search_term_none_model_64;
+ } else if (bsize == BLOCK_32X32) {
+ ml_mean = av1_simple_motion_search_term_none_mean_32;
+ ml_std = av1_simple_motion_search_term_none_std_32;
+ ml_model = av1_simple_motion_search_term_none_model_32;
+ } else if (bsize == BLOCK_16X16) {
+ ml_mean = av1_simple_motion_search_term_none_mean_16;
+ ml_std = av1_simple_motion_search_term_none_std_16;
+ ml_model = av1_simple_motion_search_term_none_model_16;
+ } else {
+ assert(0 && "Unexpected block size in simple_motion_term_none");
+ }
+
+ // Write features to file
+ write_features_to_file(cpi->oxcf.partition_info_path,
+ cpi->ext_part_controller.test_mode, features,
+ FEATURE_SIZE_SMS_TERM_NONE, 3, bsize, mi_row, mi_col);
+
+ if (ext_ml_model_decision_after_none_part2(
+ cpi, features, &part_state->terminate_partition_search)) {
+ return;
+ }
+
+ if (ml_model) {
+ float score = 0.0f;
+ for (f_idx = 0; f_idx < FEATURE_SIZE_SMS_TERM_NONE; f_idx++) {
+ score +=
+ ml_model[f_idx] * (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
+ }
+ score += ml_model[FEATURE_SIZE_SMS_TERM_NONE];
+
+ if (score >= 0.0f) {
+ part_state->terminate_partition_search = 1;
+ }
+ }
+}
+
+void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ float *features) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+
+  // Currently this only supports 128X128 SB size; it may be extended to
+  // 64X64 later.
+ assert(sb_size == BLOCK_128X128);
+
+ int f_idx = 0;
+
+ const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+ const float log_q_sq = log1pf((float)(dc_q * dc_q) / 256.0f);
+
+ // Perform full-pixel single motion search in Y plane of 16x16 mbs in the sb
+ float sum_mv_row_sq = 0;
+ float sum_mv_row = 0;
+ float min_abs_mv_row = FLT_MAX;
+ float max_abs_mv_row = 0;
+
+ float sum_mv_col_sq = 0;
+ float sum_mv_col = 0;
+ float min_abs_mv_col = FLT_MAX;
+ float max_abs_mv_col = 0;
+
+ float sum_log_sse_sq = 0;
+ float sum_log_sse = 0;
+ float min_log_sse = FLT_MAX;
+ float max_log_sse = 0;
+
+ const BLOCK_SIZE mb_size = BLOCK_16X16;
+ const int mb_rows = block_size_high[sb_size] / block_size_high[mb_size];
+ const int mb_cols = block_size_wide[sb_size] / block_size_wide[mb_size];
+ const int mb_in_mi_size_high_log2 = mi_size_high_log2[mb_size];
+ const int mb_in_mi_size_wide_log2 = mi_size_wide_log2[mb_size];
+
+ for (int mb_row = 0; mb_row < mb_rows; mb_row++)
+ for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
+ const int this_mi_row = mi_row + (mb_row << mb_in_mi_size_high_log2);
+ const int this_mi_col = mi_col + (mb_col << mb_in_mi_size_wide_log2);
+ unsigned int sse = 0;
+ unsigned int var = 0;
+ const FULLPEL_MV start_mv = kZeroFullMv;
+ const MV_REFERENCE_FRAME ref =
+ cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+ const int_mv best_mv = av1_simple_motion_search_sse_var(
+ cpi, x, this_mi_row, this_mi_col, mb_size, ref, start_mv, 1, 0, &sse,
+ &var);
+
+ const float mv_row = (float)(best_mv.as_mv.row / 8);
+ const float mv_col = (float)(best_mv.as_mv.col / 8);
+ const float log_sse = log1pf((float)sse);
+ const float abs_mv_row = fabsf(mv_row);
+ const float abs_mv_col = fabsf(mv_col);
+
+ sum_mv_row_sq += mv_row * mv_row;
+ sum_mv_row += mv_row;
+ sum_mv_col_sq += mv_col * mv_col;
+ sum_mv_col += mv_col;
+
+ if (abs_mv_row < min_abs_mv_row) min_abs_mv_row = abs_mv_row;
+ if (abs_mv_row > max_abs_mv_row) max_abs_mv_row = abs_mv_row;
+ if (abs_mv_col < min_abs_mv_col) min_abs_mv_col = abs_mv_col;
+ if (abs_mv_col > max_abs_mv_col) max_abs_mv_col = abs_mv_col;
+
+ sum_log_sse_sq += log_sse * log_sse;
+ sum_log_sse += log_sse;
+ if (log_sse < min_log_sse) min_log_sse = log_sse;
+ if (log_sse > max_log_sse) max_log_sse = log_sse;
+ }
+ const int blks = mb_rows * mb_cols;
+ const float avg_mv_row = sum_mv_row / (float)blks;
+ const float var_mv_row =
+ sum_mv_row_sq / (float)blks - avg_mv_row * avg_mv_row;
+
+ const float avg_mv_col = sum_mv_col / (float)blks;
+ const float var_mv_col =
+ sum_mv_col_sq / (float)blks - avg_mv_col * avg_mv_col;
+
+ const float avg_log_sse = sum_log_sse / (float)blks;
+ const float var_log_sse =
+ sum_log_sse_sq / (float)blks - avg_log_sse * avg_log_sse;
+
+ features[f_idx++] = avg_log_sse;
+ features[f_idx++] = avg_mv_col;
+ features[f_idx++] = avg_mv_row;
+ features[f_idx++] = log_q_sq;
+ features[f_idx++] = max_abs_mv_col;
+ features[f_idx++] = max_abs_mv_row;
+ features[f_idx++] = max_log_sse;
+ features[f_idx++] = min_abs_mv_col;
+ features[f_idx++] = min_abs_mv_row;
+ features[f_idx++] = min_log_sse;
+ features[f_idx++] = var_log_sse;
+ features[f_idx++] = var_mv_col;
+ features[f_idx++] = var_mv_row;
+
+ assert(f_idx == FEATURE_SIZE_MAX_MIN_PART_PRED);
+}
+
+// Convert result index to block size.
+// result idx       block size
+//     0            BLOCK_16X16
+//     1            BLOCK_32X32
+//     2            BLOCK_64X64
+//     3            BLOCK_128X128
+static BLOCK_SIZE get_block_size(int idx) {
+ return (BLOCK_SIZE)((idx + 2) * 3);
+}
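+
+/* Illustrative check, not part of the upstream source: the arithmetic above
+ * relies on the square sizes sitting 3 apart in the AV1 BLOCK_SIZE enum
+ * (BLOCK_16X16 == 6, BLOCK_32X32 == 9, BLOCK_64X64 == 12,
+ * BLOCK_128X128 == 15), so for example:
+ *
+ *   get_block_size(0) == (0 + 2) * 3 == 6  == BLOCK_16X16
+ *   get_block_size(3) == (3 + 2) * 3 == 15 == BLOCK_128X128
+ */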
+
+BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ const float *features) {
+ float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f };
+ const NN_CONFIG *nn_config = &av1_max_part_pred_nn_config;
+
+ assert(cpi->sf.part_sf.auto_max_partition_based_on_simple_motion !=
+ NOT_IN_USE);
+
+ av1_nn_predict(features, nn_config, 1, scores);
+
+ int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1;
+ if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion ==
+ DIRECT_PRED) {
+ result = 0;
+ float max_score = scores[0];
+ for (int i = 1; i < MAX_NUM_CLASSES_MAX_MIN_PART_PRED; ++i) {
+ if (scores[i] > max_score) {
+ max_score = scores[i];
+ result = i;
+ }
+ }
+ return get_block_size(result);
+ }
+
+ float probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f };
+ av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED);
+
+ if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion ==
+ RELAXED_PRED) {
+ for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0;
+ --result) {
+ if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) {
+ probs[result] += probs[result + 1];
+ }
+ if (probs[result] > 0.2) break;
+ }
+ } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion ==
+ ADAPT_PRED) {
+ const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size;
+    // TODO(debargha): x->source_variance is unavailable at this point, so
+    // compute it here. The redundant recomputation later can be removed.
+ const unsigned int source_variance = av1_get_perpixel_variance_facade(
+ cpi, &x->e_mbd, &x->plane[0].src, sb_size, AOM_PLANE_Y);
+ if (source_variance > 16) {
+ const double thresh = source_variance < 128 ? 0.05 : 0.1;
+ for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0;
+ --result) {
+ if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) {
+ probs[result] += probs[result + 1];
+ }
+ if (probs[result] > thresh) break;
+ }
+ }
+ }
+
+ return get_block_size(result);
+}
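+
+/* Illustrative sketch, not part of the upstream source: in RELAXED_PRED the
+ * loop above turns probs[] into a suffix sum while scanning from the largest
+ * class down, stopping at the first class whose cumulative probability
+ * exceeds 0.2. E.g. with hypothetical probs { 0.50, 0.25, 0.15, 0.10 }:
+ *
+ *   result 3: 0.10                <= 0.2, keep scanning
+ *   result 2: 0.15 + 0.10 == 0.25 >  0.2, stop
+ *
+ * giving get_block_size(2) == BLOCK_64X64, a deliberately larger-leaning
+ * choice than DIRECT_PRED, which would pick the argmax class 0 here.
+ */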
+
+// Get the minimum partition block width and height (in log scale) under a
+// SIMPLE_MOTION_DATA_TREE.
+static AOM_INLINE void get_min_bsize(const SIMPLE_MOTION_DATA_TREE *sms_tree,
+ int *min_bw, int *min_bh) {
+ if (!sms_tree) return;
+
+ const BLOCK_SIZE bsize = sms_tree->block_size;
+ if (bsize == BLOCK_4X4) {
+ *min_bw = 0;
+ *min_bh = 0;
+ return;
+ }
+
+ PARTITION_TYPE part_type = sms_tree->partitioning;
+ if (part_type == PARTITION_INVALID) return;
+
+ if (part_type == PARTITION_SPLIT) {
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ get_min_bsize(sms_tree->split[i], min_bw, min_bh);
+ }
+ } else {
+ if (part_type == PARTITION_HORZ_A || part_type == PARTITION_HORZ_B ||
+ part_type == PARTITION_VERT_A || part_type == PARTITION_VERT_B)
+ part_type = PARTITION_SPLIT;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, part_type);
+ if (subsize != BLOCK_INVALID) {
+ *min_bw = AOMMIN(*min_bw, mi_size_wide_log2[subsize]);
+ *min_bh = AOMMIN(*min_bh, mi_size_high_log2[subsize]);
+ }
+ }
+}
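+
+/* Illustrative sketch, not part of the upstream source: for a 64X64 node
+ * partitioned with PARTITION_SPLIT whose four 32X32 children all chose
+ * PARTITION_HORZ, the recursion above takes the HORZ subsize BLOCK_32X16,
+ * so (in 4-pel mi units):
+ *
+ *   int min_bw = MAX_SB_SIZE_LOG2, min_bh = MAX_SB_SIZE_LOG2;
+ *   get_min_bsize(sms_root, &min_bw, &min_bh);
+ *   // min_bw == mi_size_wide_log2[BLOCK_32X16] == 3 (8 mi wide)
+ *   // min_bh == mi_size_high_log2[BLOCK_32X16] == 2 (4 mi high)
+ */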
+
+static INLINE void add_rd_feature(int64_t rd, int64_t best_rd, float *features,
+ int *feature_idx) {
+ const int rd_valid = rd > 0 && rd < INT64_MAX;
+ const float rd_ratio = rd_valid ? (float)rd / best_rd : 1.0f;
+ features[(*feature_idx)++] = (float)rd_valid;
+ features[(*feature_idx)++] = rd_ratio;
+}
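+
+/* Illustrative sketch, not part of the upstream source: each call appends a
+ * validity flag and the rd / best_rd ratio (defaulting to 1.0 when rd is
+ * invalid), so with best_rd == 10000:
+ *
+ *   add_rd_feature(5000, 10000, features, &f_idx);      // appends 1.0, 0.5
+ *   add_rd_feature(INT64_MAX, 10000, features, &f_idx); // appends 0.0, 1.0
+ */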
+
+#define FEATURES 31
+void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
+ SIMPLE_MOTION_DATA_TREE *const sms_tree,
+ int64_t best_rd, int64_t part_none_rd,
+ int64_t part_split_rd,
+ int64_t *split_block_rd,
+ PartitionSearchState *part_state) {
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ if (best_rd <= 0 || best_rd == INT64_MAX ||
+ part_state->terminate_partition_search)
+ return;
+
+ const AV1_COMMON *const cm = &cpi->common;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+ const NN_CONFIG *nn_config = NULL;
+ float thresh = -1e6;
+ switch (bsize) {
+ case BLOCK_128X128: break;
+ case BLOCK_64X64:
+ nn_config = &av1_early_term_after_split_nnconfig_64;
+ thresh = is_480p_or_larger ? -2.0f : -1.2f;
+ break;
+ case BLOCK_32X32:
+ nn_config = &av1_early_term_after_split_nnconfig_32;
+ thresh = is_480p_or_larger ? -2.6f : -2.3f;
+ break;
+ case BLOCK_16X16:
+ nn_config = &av1_early_term_after_split_nnconfig_16;
+ thresh = is_480p_or_larger ? -2.0f : -2.4f;
+ break;
+ case BLOCK_8X8:
+ nn_config = &av1_early_term_after_split_nnconfig_8;
+ thresh = is_480p_or_larger ? -1.0f : -1.4f;
+ break;
+ case BLOCK_4X4: break;
+ default:
+ assert(0 && "Invalid block size in av1_ml_early_term_after_split().");
+ break;
+ }
+ if (!nn_config) return;
+
+ // Use more conservative threshold for level 1.
+ if (cpi->sf.part_sf.ml_early_term_after_part_split_level < 2) thresh -= 0.3f;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+ const int bs = block_size_wide[bsize];
+ int f_idx = 0;
+ float features[FEATURES] = { 0.0f };
+
+ features[f_idx++] = log1pf((float)dc_q / 4.0f);
+ features[f_idx++] = log1pf((float)best_rd / bs / bs / 1024.0f);
+
+ add_rd_feature(part_none_rd, best_rd, features, &f_idx);
+ add_rd_feature(part_split_rd, best_rd, features, &f_idx);
+
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ add_rd_feature(split_block_rd[i], best_rd, features, &f_idx);
+ int min_bw = MAX_SB_SIZE_LOG2;
+ int min_bh = MAX_SB_SIZE_LOG2;
+ get_min_bsize(sms_tree->split[i], &min_bw, &min_bh);
+ features[f_idx++] = (float)min_bw;
+ features[f_idx++] = (float)min_bh;
+ }
+
+ simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
+ bsize, NULL,
+ FEATURE_SMS_PRUNE_PART_FLAG);
+
+ features[f_idx++] = log1pf((float)sms_tree->sms_none_feat[1]);
+
+ features[f_idx++] = log1pf((float)sms_tree->split[0]->sms_none_feat[1]);
+ features[f_idx++] = log1pf((float)sms_tree->split[1]->sms_none_feat[1]);
+ features[f_idx++] = log1pf((float)sms_tree->split[2]->sms_none_feat[1]);
+ features[f_idx++] = log1pf((float)sms_tree->split[3]->sms_none_feat[1]);
+
+ features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[1]);
+ features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[3]);
+ features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[5]);
+ features[f_idx++] = log1pf((float)sms_tree->sms_rect_feat[7]);
+
+ assert(f_idx == FEATURES);
+
+ // Write features to file
+ write_features_to_file(cpi->oxcf.partition_info_path,
+ cpi->ext_part_controller.test_mode, features, FEATURES,
+ 4, bsize, mi_row, mi_col);
+
+ if (ext_ml_model_decision_after_split(
+ cpi, features, &part_state->terminate_partition_search)) {
+ return;
+ }
+
+ float score = 0.0f;
+ av1_nn_predict(features, nn_config, 1, &score);
+  // The score is an indicator of confidence that we should NOT terminate.
+ if (score < thresh) {
+ part_state->terminate_partition_search = 1;
+ }
+}
+#undef FEATURES
+
+void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x,
+ int64_t best_rd, int64_t none_rd,
+ const int64_t *split_rd,
+ PartitionSearchState *part_state) {
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
+ best_rd = AOMMAX(best_rd, 1);
+ const NN_CONFIG *nn_config = NULL;
+ const float prob_thresholds[5] = { 0.01f, 0.01f, 0.004f, 0.002f, 0.002f };
+ float cur_thresh = 0.0f;
+ switch (bsize) {
+ case BLOCK_8X8:
+ nn_config = &av1_rect_partition_nnconfig_8;
+ cur_thresh = prob_thresholds[0];
+ break;
+ case BLOCK_16X16:
+ nn_config = &av1_rect_partition_nnconfig_16;
+ cur_thresh = prob_thresholds[1];
+ break;
+ case BLOCK_32X32:
+ nn_config = &av1_rect_partition_nnconfig_32;
+ cur_thresh = prob_thresholds[2];
+ break;
+ case BLOCK_64X64:
+ nn_config = &av1_rect_partition_nnconfig_64;
+ cur_thresh = prob_thresholds[3];
+ break;
+ case BLOCK_128X128:
+ nn_config = &av1_rect_partition_nnconfig_128;
+ cur_thresh = prob_thresholds[4];
+ break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config) return;
+
+ // 1. Compute input features
+ float features[9];
+
+ // RD cost ratios
+ for (int i = 0; i < 5; i++) features[i] = 1.0f;
+ if (none_rd > 0 && none_rd < 1000000000)
+ features[0] = (float)none_rd / (float)best_rd;
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++) {
+ if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+ features[1 + i] = (float)split_rd[i] / (float)best_rd;
+ }
+
+ // Variance ratios
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ int whole_block_variance;
+ whole_block_variance = av1_get_perpixel_variance_facade(
+ cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y);
+ whole_block_variance = AOMMAX(whole_block_variance, 1);
+
+ int split_variance[SUB_PARTITIONS_SPLIT];
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ struct buf_2d buf;
+ buf.stride = x->plane[0].src.stride;
+ const int bw = block_size_wide[bsize];
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ const int x_idx = (i & 1) * bw / 2;
+ const int y_idx = (i >> 1) * bw / 2;
+ buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride;
+ split_variance[i] =
+ av1_get_perpixel_variance_facade(cpi, xd, &buf, subsize, AOM_PLANE_Y);
+ }
+
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++)
+ features[5 + i] = (float)split_variance[i] / (float)whole_block_variance;
+
+ // Write features to file
+ write_features_to_file(cpi->oxcf.partition_info_path,
+ cpi->ext_part_controller.test_mode, features,
+ /*feature_size=*/9, 5, bsize, mi_row, mi_col);
+
+ if (ext_ml_model_decision_after_split_part2(
+ &cpi->ext_part_controller, frame_is_intra_only(&cpi->common),
+ features, &part_state->prune_rect_part[HORZ],
+ &part_state->prune_rect_part[VERT])) {
+ return;
+ }
+
+ // 2. Do the prediction and prune 0-2 partitions based on their probabilities
+ float raw_scores[3] = { 0.0f };
+ av1_nn_predict(features, nn_config, 1, raw_scores);
+ float probs[3] = { 0.0f };
+ av1_nn_softmax(raw_scores, probs, 3);
+
+  // probs[0] is the probability that both rectangular partitions are worse
+  // than the current best_rd.
+ if (probs[1] <= cur_thresh) part_state->prune_rect_part[HORZ] = 1;
+ if (probs[2] <= cur_thresh) part_state->prune_rect_part[VERT] = 1;
+}
+
+// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
+// considered.
+void av1_ml_prune_ab_partition(AV1_COMP *const cpi, int part_ctx, int var_ctx,
+ int64_t best_rd,
+ PartitionSearchState *part_state,
+ int *ab_partitions_allowed) {
+ const PartitionBlkParams blk_params = part_state->part_blk_params;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+
+ if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
+ const NN_CONFIG *nn_config = NULL;
+ switch (bsize) {
+ case BLOCK_8X8: nn_config = NULL; break;
+ case BLOCK_16X16: nn_config = &av1_ab_partition_nnconfig_16; break;
+ case BLOCK_32X32: nn_config = &av1_ab_partition_nnconfig_32; break;
+ case BLOCK_64X64: nn_config = &av1_ab_partition_nnconfig_64; break;
+ case BLOCK_128X128: nn_config = &av1_ab_partition_nnconfig_128; break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config) return;
+
+ // Generate features.
+ float features[10];
+ int feature_index = 0;
+ features[feature_index++] = (float)part_ctx;
+ features[feature_index++] = (float)var_ctx;
+ const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+ int sub_block_rdcost[8] = { 0 };
+ int rd_index = 0;
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ const int64_t *horz_rd = part_state->rect_part_rd[HORZ];
+ if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)horz_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ const int64_t *vert_rd = part_state->rect_part_rd[VERT];
+ if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)vert_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ const int64_t *split_rd = part_state->split_rd;
+ if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)split_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < 8; ++i) {
+ // Ratio between the sub-block RD and the whole-block RD.
+ float rd_ratio = 1.0f;
+ if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+ rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+ features[feature_index++] = rd_ratio;
+ }
+ assert(feature_index == 10);
+
+ // Write features to file
+ if (!frame_is_intra_only(&cpi->common)) {
+ write_features_to_file(cpi->oxcf.partition_info_path,
+ cpi->ext_part_controller.test_mode, features,
+ /*feature_size=*/10, 6, bsize, mi_row, mi_col);
+ }
+
+ if (ext_ml_model_decision_after_rect(
+ &cpi->ext_part_controller, frame_is_intra_only(&cpi->common),
+ features, &ab_partitions_allowed[HORZ_A],
+ &ab_partitions_allowed[HORZ_B], &ab_partitions_allowed[VERT_A],
+ &ab_partitions_allowed[VERT_B])) {
+ return;
+ }
+
+ // Calculate scores using the NN model.
+ float score[16] = { 0.0f };
+ av1_nn_predict(features, nn_config, 1, score);
+ int int_score[16];
+ int max_score = -1000;
+ for (int i = 0; i < 16; ++i) {
+ int_score[i] = (int)(100 * score[i]);
+ max_score = AOMMAX(int_score[i], max_score);
+ }
+
+ // Make decisions based on the model scores.
+ int thresh = max_score;
+ switch (bsize) {
+ case BLOCK_16X16: thresh -= 150; break;
+ case BLOCK_32X32: thresh -= 100; break;
+ default: break;
+ }
+ av1_zero_array(ab_partitions_allowed, NUM_AB_PARTS);
+ for (int i = 0; i < 16; ++i) {
+ if (int_score[i] >= thresh) {
+ if ((i >> 0) & 1) ab_partitions_allowed[HORZ_A] = 1;
+ if ((i >> 1) & 1) ab_partitions_allowed[HORZ_B] = 1;
+ if ((i >> 2) & 1) ab_partitions_allowed[VERT_A] = 1;
+ if ((i >> 3) & 1) ab_partitions_allowed[VERT_B] = 1;
+ }
+ }
+}
+
+#define FEATURES 18
+#define LABELS 4
+// Use a ML model to predict if horz4 and vert4 should be considered.
+void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+ int part_ctx, int64_t best_rd,
+ PartitionSearchState *part_state,
+ int *part4_allowed,
+ unsigned int pb_source_variance) {
+ const PartitionBlkParams blk_params = part_state->part_blk_params;
+ const int mi_row = blk_params.mi_row;
+ const int mi_col = blk_params.mi_col;
+ const BLOCK_SIZE bsize = blk_params.bsize;
+
+ int64_t(*rect_part_rd)[SUB_PARTITIONS_RECT] = part_state->rect_part_rd;
+ int64_t *split_rd = part_state->split_rd;
+ if (ext_ml_model_decision_after_part_ab(
+ cpi, x, bsize, part_ctx, best_rd, rect_part_rd, split_rd,
+ &part4_allowed[HORZ4], &part4_allowed[VERT4], pb_source_variance,
+ mi_row, mi_col))
+ return;
+
+ if (best_rd >= 1000000000) return;
+ int64_t *horz_rd = rect_part_rd[HORZ4];
+ int64_t *vert_rd = rect_part_rd[VERT4];
+ const NN_CONFIG *nn_config = NULL;
+ // 4-way partitions are only allowed for these three square block sizes.
+ switch (bsize) {
+ case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break;
+ case BLOCK_32X32: nn_config = &av1_4_partition_nnconfig_32; break;
+ case BLOCK_64X64: nn_config = &av1_4_partition_nnconfig_64; break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config) return;
+
+ // Generate features.
+ float features[FEATURES];
+ int feature_index = 0;
+ features[feature_index++] = (float)part_ctx;
+ features[feature_index++] = (float)get_unsigned_bits(pb_source_variance);
+
+ const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+ int sub_block_rdcost[8] = { 0 };
+ int rd_index = 0;
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)horz_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)vert_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)split_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < 8; ++i) {
+ // Ratio between the sub-block RD and the whole-block RD.
+ float rd_ratio = 1.0f;
+ if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+ rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+ features[feature_index++] = rd_ratio;
+ }
+
+ // Get variance of the 1:4 and 4:1 sub-blocks.
+ unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+ unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+ {
+ BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
+ BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+
+ assert(horz_4_bs != BLOCK_INVALID);
+ assert(vert_4_bs != BLOCK_INVALID);
+
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
+ av1_num_planes(&cpi->common), bsize);
+ const int src_stride = x->plane[0].src.stride;
+ uint8_t *src = x->plane[0].src.buf;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+
+ struct buf_2d horz_4_src, vert_4_src;
+ horz_4_src.stride = src_stride;
+ vert_4_src.stride = src_stride;
+
+ for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride;
+ vert_4_src.buf = src + i * block_size_wide[vert_4_bs];
+
+ horz_4_source_var[i] = av1_get_perpixel_variance_facade(
+ cpi, xd, &horz_4_src, horz_4_bs, AOM_PLANE_Y);
+ vert_4_source_var[i] = av1_get_perpixel_variance_facade(
+ cpi, xd, &vert_4_src, vert_4_bs, AOM_PLANE_Y);
+ }
+ }
+
+ const float denom = (float)(pb_source_variance + 1);
+ const float low_b = 0.1f;
+ const float high_b = 10.0f;
+ for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ // Ratio between the 4:1 sub-block variance and the whole-block variance.
+ float var_ratio = (float)(horz_4_source_var[i] + 1) / denom;
+ if (var_ratio < low_b) var_ratio = low_b;
+ if (var_ratio > high_b) var_ratio = high_b;
+ features[feature_index++] = var_ratio;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    // Ratio between the 1:4 sub-block variance and the whole-block variance.
+ float var_ratio = (float)(vert_4_source_var[i] + 1) / denom;
+ if (var_ratio < low_b) var_ratio = low_b;
+ if (var_ratio > high_b) var_ratio = high_b;
+ features[feature_index++] = var_ratio;
+ }
+ assert(feature_index == FEATURES);
+
+ // Write features to file
+ if (!frame_is_intra_only(&cpi->common)) {
+ write_features_to_file(cpi->oxcf.partition_info_path,
+ cpi->ext_part_controller.test_mode, features,
+ FEATURES, 7, bsize, mi_row, mi_col);
+ }
+
+ // Calculate scores using the NN model.
+ float score[LABELS] = { 0.0f };
+ av1_nn_predict(features, nn_config, 1, score);
+ int int_score[LABELS];
+ int max_score = -1000;
+ for (int i = 0; i < LABELS; ++i) {
+ int_score[i] = (int)(100 * score[i]);
+ max_score = AOMMAX(int_score[i], max_score);
+ }
+
+ // Make decisions based on the model scores.
+ int thresh = max_score;
+ switch (bsize) {
+ case BLOCK_16X16: thresh -= 500; break;
+ case BLOCK_32X32: thresh -= 500; break;
+ case BLOCK_64X64: thresh -= 200; break;
+ default: break;
+ }
+ av1_zero_array(part4_allowed, NUM_PART4_TYPES);
+ for (int i = 0; i < LABELS; ++i) {
+ if (int_score[i] >= thresh) {
+ if ((i >> 0) & 1) part4_allowed[HORZ4] = 1;
+ if ((i >> 1) & 1) part4_allowed[VERT4] = 1;
+ }
+ }
+}
+#undef FEATURES
+#undef LABELS
+
+#define FEATURES 4
+void av1_ml_predict_breakout(AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const RD_STATS *const rd_stats,
+ unsigned int pb_source_variance, int bit_depth,
+ PartitionSearchState *part_state) {
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ const NN_CONFIG *nn_config = NULL;
+ int thresh = 0;
+ switch (bsize) {
+ case BLOCK_8X8:
+ nn_config = &av1_partition_breakout_nnconfig_8;
+ thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[0];
+ break;
+ case BLOCK_16X16:
+ nn_config = &av1_partition_breakout_nnconfig_16;
+ thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[1];
+ break;
+ case BLOCK_32X32:
+ nn_config = &av1_partition_breakout_nnconfig_32;
+ thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[2];
+ break;
+ case BLOCK_64X64:
+ nn_config = &av1_partition_breakout_nnconfig_64;
+ thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[3];
+ break;
+ case BLOCK_128X128:
+ nn_config = &av1_partition_breakout_nnconfig_128;
+ thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[4];
+ break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config || thresh < 0) return;
+
+ const float ml_predict_breakout_thresh_scale[3] = { 1.15f, 1.05f, 1.0f };
+ thresh = (int)((float)thresh *
+ ml_predict_breakout_thresh_scale
+ [cpi->sf.part_sf.ml_predict_breakout_level - 1]);
+
+ // Generate feature values.
+ float features[FEATURES];
+ int feature_index = 0;
+
+ const int num_pels_log2 = num_pels_log2_lookup[bsize];
+ float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX);
+ rate_f = ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) *
+ rate_f;
+ features[feature_index++] = rate_f;
+
+ const float dist_f =
+ (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2);
+ features[feature_index++] = dist_f;
+
+ features[feature_index++] = (float)pb_source_variance;
+
+ const int dc_q = (int)x->plane[0].dequant_QTX[0] >> (bit_depth - 8);
+ features[feature_index++] = (float)(dc_q * dc_q) / 256.0f;
+ assert(feature_index == FEATURES);
+
+ // Write features to file
+ write_features_to_file(cpi->oxcf.partition_info_path,
+ cpi->ext_part_controller.test_mode, features, FEATURES,
+ 2, bsize, mi_row, mi_col);
+
+ if (ext_ml_model_decision_after_none(&cpi->ext_part_controller,
+ frame_is_intra_only(&cpi->common),
+ features, &part_state->do_square_split,
+ &part_state->do_rectangular_split)) {
+ return;
+ }
+
+ // Calculate score using the NN model.
+ float score = 0.0f;
+ av1_nn_predict(features, nn_config, 1, &score);
+
+ // Make decision.
+ if ((int)(score * 100) >= thresh) {
+ part_state->do_square_split = 0;
+ part_state->do_rectangular_split = 0;
+ }
+}
+#undef FEATURES
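+
+/* Illustrative sketch, not part of the upstream source: the rate feature in
+ * av1_ml_predict_breakout() folds the Lagrange multiplier into the rate:
+ *
+ *   rate_f = rate * rdmult / (128 * 512 * (1 << num_pels_log2))
+ *
+ * For a hypothetical BLOCK_16X16 (num_pels_log2 == 8) with rate == 1200 and
+ * x->rdmult == 65536 this gives 1200 * 65536 / (65536 * 256) ~= 4.69, i.e.
+ * roughly an rd-weighted rate per pixel on a small, bounded scale.
+ */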
+
+void av1_prune_partitions_before_search(AV1_COMP *const cpi,
+ MACROBLOCK *const x,
+ SIMPLE_MOTION_DATA_TREE *const sms_tree,
+ PartitionSearchState *part_state) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+
+ if (cpi->third_pass_ctx) {
+ int mi_row = blk_params->mi_row;
+ int mi_col = blk_params->mi_col;
+ double ratio_h, ratio_w;
+ av1_get_third_pass_ratio(cpi->third_pass_ctx, 0, cm->height, cm->width,
+ &ratio_h, &ratio_w);
+ THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+ cpi->third_pass_ctx, 0, mi_row, mi_col, ratio_h, ratio_w);
+ BLOCK_SIZE third_pass_bsize =
+ av1_get_third_pass_adjusted_blk_size(this_mi, ratio_h, ratio_w);
+    // Check the actual partition of this block in the second pass.
+ PARTITION_TYPE third_pass_part =
+ av1_third_pass_get_sb_part_type(cpi->third_pass_ctx, this_mi);
+
+ int is_edge = (mi_row + mi_size_high[bsize] >= cm->mi_params.mi_rows) ||
+ (mi_col + mi_size_wide[bsize] >= cm->mi_params.mi_cols);
+
+ if (!is_edge && block_size_wide[bsize] >= 16) {
+      // If the second pass used a rectangular partition, do not search for a
+      // rectangular partition in the other direction.
+ if (third_pass_part != PARTITION_NONE) {
+ if (third_pass_part == PARTITION_HORZ ||
+ third_pass_part == PARTITION_HORZ_4 ||
+ third_pass_part == PARTITION_HORZ_A ||
+ third_pass_part == PARTITION_HORZ_B) {
+ part_state->partition_rect_allowed[VERT] = 0;
+ } else if (third_pass_part == PARTITION_VERT ||
+ third_pass_part == PARTITION_VERT_4 ||
+ third_pass_part == PARTITION_VERT_A ||
+ third_pass_part == PARTITION_VERT_B) {
+ part_state->partition_rect_allowed[HORZ] = 0;
+ }
+ }
+
+ int minSize = AOMMIN(block_size_wide[third_pass_bsize],
+ block_size_high[third_pass_bsize]);
+ int maxSize = AOMMAX(block_size_wide[third_pass_bsize],
+ block_size_high[third_pass_bsize]);
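+      // For example, if the second pass chose a 64x32 partition, minSize is
+      // 32 and maxSize is 64: a 4x4 block (< minSize / 4) terminates the
+      // search, an 8x8 block (< minSize / 2) either terminates or only keeps
+      // the current size, a 128x128 block (> maxSize) is forced to split,
+      // and widths from 32 to 64 are searched without pruning.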
+ if (block_size_wide[bsize] < minSize / 4) {
+ // Current partition is too small, just terminate
+ part_state->terminate_partition_search = 1;
+ return;
+ } else if (block_size_wide[bsize] < minSize / 2) {
+ if (third_pass_part != PARTITION_NONE) {
+          // Current partition is very small, and the second pass used a
+          // rectangular partition. Terminate the search here.
+ part_state->terminate_partition_search = 1;
+ return;
+ } else {
+          // Partition is small, but still worth checking; only disable
+          // further splits.
+ // TODO(any): check why this is not covered by the termination for <
+ // minSize/4.
+ av1_disable_square_split_partition(part_state);
+ av1_disable_rect_partitions(part_state);
+ return;
+ }
+ } else if (block_size_wide[bsize] > maxSize) {
+ // Partition is larger than in the second pass. Only allow split.
+ av1_set_square_split_only(part_state);
+ return;
+ } else if (block_size_wide[bsize] >= minSize &&
+ block_size_wide[bsize] <= maxSize) {
+ // Partition is within a range where it is very likely to find a good
+ // choice, so do not prune anything.
+ return;
+ }
+ }
+ }
+
+ // Prune rectangular partitions for larger blocks.
+ if (bsize > cpi->sf.part_sf.rect_partition_eval_thresh) {
+ part_state->do_rectangular_split = 0;
+ part_state->partition_rect_allowed[HORZ] = 0;
+ part_state->partition_rect_allowed[VERT] = 0;
+ }
+
+  // Prune rectangular, AB and 4-way partitions based on qindex and block
+  // size.
+ if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx == 1) {
+ if (bsize == BLOCK_8X8 && x->qindex < 35)
+ av1_disable_rect_partitions(part_state);
+
+ } else if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx == 2) {
+    // Enumeration distance between two successive square block sizes.
+ const int sqr_bsize_step = BLOCK_32X32 - BLOCK_16X16;
+ int max_bsize =
+ BLOCK_32X32 - (x->qindex * 3 / QINDEX_RANGE) * sqr_bsize_step;
+ max_bsize = AOMMAX(max_bsize, BLOCK_4X4);
+ const BLOCK_SIZE max_prune_bsize =
+ (BLOCK_SIZE)AOMMIN(max_bsize, BLOCK_32X32);
+
+    // Prune rectangular partitions:
+    // qidx 0 to 85: prune bsize below BLOCK_32X32
+    // qidx 86 to 170: prune bsize below BLOCK_16X16
+    // qidx 171 to 255: prune bsize below BLOCK_8X8
+ if (bsize < max_prune_bsize) {
+ av1_disable_rect_partitions(part_state);
+ }
+ }
+
+ if (cpi->sf.part_sf.prune_sub_8x8_partition_level && (bsize == BLOCK_8X8)) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ int prune_sub_8x8;
+ if (cpi->sf.part_sf.prune_sub_8x8_partition_level == 2) {
+ prune_sub_8x8 = 1;
+ } else {
+ assert(cpi->sf.part_sf.prune_sub_8x8_partition_level == 1);
+ // Prune if both neighbors are available and either is > BLOCK_8X8
+ prune_sub_8x8 = xd->left_available && xd->up_available &&
+ (xd->left_mbmi->bsize > BLOCK_8X8 ||
+ xd->above_mbmi->bsize > BLOCK_8X8);
+ }
+ if (prune_sub_8x8) {
+ av1_disable_all_splits(part_state);
+ }
+ }
+
+  // A CNN-based speed feature that prunes either the split or all non-split
+  // partitions in intra frame coding.
+ const int try_intra_cnn_based_part_prune =
+ frame_is_intra_only(cm) &&
+ cpi->sf.part_sf.intra_cnn_based_part_prune_level &&
+ cm->seq_params->sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 &&
+ blk_params->bsize_at_least_8x8 &&
+ av1_is_whole_blk_in_frame(blk_params, mi_params);
+
+ if (try_intra_cnn_based_part_prune) {
+ av1_intra_mode_cnn_partition(
+ &cpi->common, x, x->part_search_info.quad_tree_idx,
+ cpi->sf.part_sf.intra_cnn_based_part_prune_level, part_state);
+ }
+
+  // Use simple motion search to prune out split or non-split partitions. This
+  // must be done before searching PARTITION_SPLIT so that the initial MVs are
+  // propagated to the smaller block sizes.
+ const int try_split_only =
+ cpi->sf.part_sf.simple_motion_search_split &&
+ part_state->do_square_split && blk_params->bsize_at_least_8x8 &&
+ av1_is_whole_blk_in_frame(blk_params, mi_params) &&
+ !frame_is_intra_only(cm) && !av1_superres_scaled(cm);
+
+ if (try_split_only) {
+ av1_simple_motion_search_based_split(cpi, x, sms_tree, part_state);
+ }
+
+  // Use simple motion search to prune out rectangular partitions in one
+  // direction. The results are stored in prune_horz and prune_vert in order
+  // to bypass future related pruning checks if a pruning decision has been
+  // made.
+
+ // We want to search at least one partition mode, so don't prune if NONE and
+ // SPLIT are disabled.
+ const int non_rect_part_allowed =
+ part_state->do_square_split || part_state->partition_none_allowed;
+ // Only run the model if the partitions are not already pruned.
+ const int rect_part_allowed = part_state->do_rectangular_split &&
+ ((part_state->partition_rect_allowed[HORZ] &&
+ !part_state->prune_rect_part[HORZ]) ||
+ (part_state->partition_rect_allowed[VERT] &&
+ !part_state->prune_rect_part[VERT]));
+
+ const int try_prune_rect = cpi->sf.part_sf.simple_motion_search_prune_rect &&
+ !frame_is_intra_only(cm) &&
+ non_rect_part_allowed && rect_part_allowed &&
+ !av1_superres_scaled(cm);
+
+ if (try_prune_rect) {
+ av1_simple_motion_search_prune_rect(cpi, x, sms_tree, part_state);
+ }
+}
+
+#ifndef NDEBUG
+static AOM_INLINE int is_bsize_square(BLOCK_SIZE bsize) {
+ return block_size_wide[bsize] == block_size_high[bsize];
+}
+#endif // NDEBUG
+
+void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc,
+ PartitionSearchState *part_state) {
+ assert(is_bsize_square(sb_enc->max_partition_size));
+ assert(is_bsize_square(sb_enc->min_partition_size));
+ assert(sb_enc->min_partition_size <= sb_enc->max_partition_size);
+ const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+ const BLOCK_SIZE bsize = blk_params->bsize;
+ assert(is_bsize_square(bsize));
+ const int max_partition_size_1d = block_size_wide[sb_enc->max_partition_size];
+ const int min_partition_size_1d = block_size_wide[sb_enc->min_partition_size];
+ const int bsize_1d = block_size_wide[bsize];
+ assert(min_partition_size_1d <= max_partition_size_1d);
+ const int is_le_min_sq_part = bsize_1d <= min_partition_size_1d;
+ const int is_gt_max_sq_part = bsize_1d > max_partition_size_1d;
+ if (is_gt_max_sq_part) {
+ // If current block size is larger than max, only allow split.
+ av1_set_square_split_only(part_state);
+ } else if (is_le_min_sq_part) {
+    // If the current block size is less than or equal to min, allow only
+    // PARTITION_NONE when the whole block fits in the frame; otherwise allow
+    // only split.
+ av1_disable_rect_partitions(part_state);
+
+    // Only disable square split when the current block is not at the picture
+    // boundary; otherwise, inherit the square split flag from the logic above.
+ if (av1_blk_has_rows_and_cols(blk_params)) {
+ part_state->do_square_split = 0;
+ }
+ part_state->partition_none_allowed = !(part_state->do_square_split);
+ }
+}
+
+// Decide whether to evaluate the AB partition specified by rect_part, based
+// on the results of the SPLIT and HORZ/VERT partition searches.
+static int evaluate_ab_partition_based_on_split(
+ const PC_TREE *pc_tree, PARTITION_TYPE rect_part,
+ const RD_RECT_PART_WIN_INFO *rect_part_win_info, int qindex, int split_idx1,
+ int split_idx2) {
+ int num_win = 0;
+ // Threshold for number of winners
+ // Conservative pruning for high quantizers
+ const int num_win_thresh = AOMMIN(3 * (2 * (MAXQ - qindex) / MAXQ), 3);
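+  // With integer arithmetic this evaluates to 3 for qindex below MAXQ / 2
+  // (all three sub-partitions must have "won") and to 0 for higher qindex,
+  // where the AB partition is always evaluated.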
+ int sub_part_win =
+ (rect_part_win_info == NULL) ? (pc_tree->partitioning == rect_part)
+ : (rect_part == PARTITION_HORZ) ? rect_part_win_info->rect_part_win[HORZ]
+ : rect_part_win_info->rect_part_win[VERT];
+ num_win += (sub_part_win) ? 1 : 0;
+ if (pc_tree->split[split_idx1]) {
+ num_win +=
+ (pc_tree->split[split_idx1]->partitioning == PARTITION_NONE) ? 1 : 0;
+ } else {
+ num_win += 1;
+ }
+ if (pc_tree->split[split_idx2]) {
+ num_win +=
+ (pc_tree->split[split_idx2]->partitioning == PARTITION_NONE) ? 1 : 0;
+ } else {
+ num_win += 1;
+ }
+ if (num_win < num_win_thresh) {
+ return 0;
+ }
+ return 1;
+}
+
+void av1_prune_ab_partitions(AV1_COMP *cpi, const MACROBLOCK *x,
+ const PC_TREE *pc_tree, int pb_source_variance,
+ int64_t best_rdcost,
+ const RD_RECT_PART_WIN_INFO *rect_part_win_info,
+ bool ext_partition_allowed,
+ PartitionSearchState *part_state,
+ int *ab_partitions_allowed) {
+ int64_t *horz_rd = part_state->rect_part_rd[HORZ];
+ int64_t *vert_rd = part_state->rect_part_rd[VERT];
+ int64_t *split_rd = part_state->split_rd;
+ const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+ // The standard AB partitions are allowed initially if ext-partition-types are
+ // allowed.
+ int horzab_partition_allowed = ext_partition_allowed &&
+ part_cfg->enable_ab_partitions &&
+ part_state->partition_rect_allowed[HORZ];
+ int vertab_partition_allowed = ext_partition_allowed &&
+ part_cfg->enable_ab_partitions &&
+ part_state->partition_rect_allowed[VERT];
+
+  // Pruning: pruning out AB partitions in one main direction based on the
+  // current best partition and source variance.
+ if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+ if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 1) {
+ // TODO(debargha,huisu@google.com): may need to tune the threshold for
+ // pb_source_variance.
+ horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+ (pc_tree->partitioning == PARTITION_NONE &&
+ pb_source_variance < 32) ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+ (pc_tree->partitioning == PARTITION_NONE &&
+ pb_source_variance < 32) ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ } else {
+ horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ }
+ horz_rd[0] = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0);
+ horz_rd[1] = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0);
+ vert_rd[0] = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0);
+ vert_rd[1] = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0);
+ split_rd[0] = (split_rd[0] < INT64_MAX ? split_rd[0] : 0);
+ split_rd[1] = (split_rd[1] < INT64_MAX ? split_rd[1] : 0);
+ split_rd[2] = (split_rd[2] < INT64_MAX ? split_rd[2] : 0);
+ split_rd[3] = (split_rd[3] < INT64_MAX ? split_rd[3] : 0);
+ }
+
+ // Pruning: pruning out horz_a or horz_b if the combined rdcost of its
+ // subblocks estimated from previous partitions is much higher than the best
+ // rd so far.
+ ab_partitions_allowed[HORZ_A] = horzab_partition_allowed;
+ ab_partitions_allowed[HORZ_B] = horzab_partition_allowed;
+ if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+ const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1];
+ const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3];
+ switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+ case 1:
+ ab_partitions_allowed[HORZ_A] &= (horz_a_rd / 16 * 14 < best_rdcost);
+ ab_partitions_allowed[HORZ_B] &= (horz_b_rd / 16 * 14 < best_rdcost);
+ break;
+ case 2:
+ default:
+ ab_partitions_allowed[HORZ_A] &= (horz_a_rd / 16 * 15 < best_rdcost);
+ ab_partitions_allowed[HORZ_B] &= (horz_b_rd / 16 * 15 < best_rdcost);
+ break;
+ }
+ }
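+  // In other words, HORZ_A / HORZ_B stay enabled only while 14/16 (87.5%) of
+  // the estimated rdcost at level 1, or 15/16 (93.75%) at level 2, is still
+  // below the best rdcost found so far; the same thresholds apply to the
+  // vertical AB partitions below.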
+
+ // Pruning: pruning out vert_a or vert_b if the combined rdcost of its
+ // subblocks estimated from previous partitions is much higher than the best
+ // rd so far.
+ ab_partitions_allowed[VERT_A] = vertab_partition_allowed;
+ ab_partitions_allowed[VERT_B] = vertab_partition_allowed;
+ if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+ const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2];
+ const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3];
+ switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
+ case 1:
+ ab_partitions_allowed[VERT_A] &= (vert_a_rd / 16 * 14 < best_rdcost);
+ ab_partitions_allowed[VERT_B] &= (vert_b_rd / 16 * 14 < best_rdcost);
+ break;
+ case 2:
+ default:
+ ab_partitions_allowed[VERT_A] &= (vert_a_rd / 16 * 15 < best_rdcost);
+ ab_partitions_allowed[VERT_B] &= (vert_b_rd / 16 * 15 < best_rdcost);
+ break;
+ }
+ }
+
+ // Pruning: pruning out some ab partitions using a DNN taking rd costs of
+ // sub-blocks from previous basic partition types.
+ if (cpi->sf.part_sf.ml_prune_partition && ext_partition_allowed &&
+ part_state->partition_rect_allowed[HORZ] &&
+ part_state->partition_rect_allowed[VERT]) {
+ // TODO(huisu@google.com): x->source_variance may not be the current
+ // block's variance. The correct one to use is pb_source_variance. Need to
+ // re-train the model to fix it.
+ av1_ml_prune_ab_partition(cpi, pc_tree->partitioning,
+ get_unsigned_bits(x->source_variance),
+ best_rdcost, part_state, ab_partitions_allowed);
+ }
+
+ // Pruning: pruning AB partitions based on the number of horz/vert wins
+ // in the current block and sub-blocks in PARTITION_SPLIT.
+ if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 &&
+ ab_partitions_allowed[HORZ_A]) {
+ ab_partitions_allowed[HORZ_A] &= evaluate_ab_partition_based_on_split(
+ pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 0, 1);
+ }
+ if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 &&
+ ab_partitions_allowed[HORZ_B]) {
+ ab_partitions_allowed[HORZ_B] &= evaluate_ab_partition_based_on_split(
+ pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 2, 3);
+ }
+ if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 &&
+ ab_partitions_allowed[VERT_A]) {
+ ab_partitions_allowed[VERT_A] &= evaluate_ab_partition_based_on_split(
+ pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 0, 2);
+ }
+ if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 &&
+ ab_partitions_allowed[VERT_B]) {
+ ab_partitions_allowed[VERT_B] &= evaluate_ab_partition_based_on_split(
+ pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 1, 3);
+ }
+}
+
+// Prepare features for the external model: specifically, the features
+// available after the AB partitions have been searched.
+static void prepare_features_after_part_ab(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int part_ctx, int64_t best_rd,
+ int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+ int64_t split_rd[SUB_PARTITIONS_SPLIT], unsigned int pb_source_variance,
+ int mi_row, int mi_col, aom_partition_features_t *const features) {
+ int64_t *horz_rd = rect_part_rd[HORZ];
+ int64_t *vert_rd = rect_part_rd[VERT];
+
+ // Generate features.
+ int feature_index = 0;
+ features->after_part_ab.f[feature_index++] = (float)part_ctx;
+ features->after_part_ab.f[feature_index++] =
+ (float)get_unsigned_bits(pb_source_variance);
+
+ const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+ int sub_block_rdcost[8] = { 0 };
+ int rd_index = 0;
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)horz_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+ if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)vert_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+ if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)split_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < 8; ++i) {
+ // Ratio between the sub-block RD and the whole-block RD.
+ float rd_ratio = 1.0f;
+ if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+ rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+ features->after_part_ab.f[feature_index++] = rd_ratio;
+ }
+
+ // 4-way partitions are only allowed for these three square block sizes.
+ assert(bsize == BLOCK_16X16 || bsize == BLOCK_32X32 || bsize == BLOCK_64X64);
+
+ // Get variance of the 1:4 and 4:1 sub-blocks.
+ unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+ unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+ {
+ BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
+ BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+
+ assert(horz_4_bs != BLOCK_INVALID);
+ assert(vert_4_bs != BLOCK_INVALID);
+
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
+ av1_num_planes(&cpi->common), bsize);
+ const int src_stride = x->plane[0].src.stride;
+ uint8_t *src = x->plane[0].src.buf;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+
+ struct buf_2d horz_4_src, vert_4_src;
+ horz_4_src.stride = src_stride;
+ vert_4_src.stride = src_stride;
+
+ for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride;
+ vert_4_src.buf = src + i * block_size_wide[vert_4_bs];
+
+ horz_4_source_var[i] = av1_get_perpixel_variance_facade(
+ cpi, xd, &horz_4_src, horz_4_bs, AOM_PLANE_Y);
+ vert_4_source_var[i] = av1_get_perpixel_variance_facade(
+ cpi, xd, &vert_4_src, vert_4_bs, AOM_PLANE_Y);
+ }
+ }
+
+ const float denom = (float)(pb_source_variance + 1);
+ const float low_b = 0.1f;
+ const float high_b = 10.0f;
+ for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+ // Ratio between the 4:1 sub-block variance and the whole-block variance.
+ float var_ratio = (float)(horz_4_source_var[i] + 1) / denom;
+ if (var_ratio < low_b) var_ratio = low_b;
+ if (var_ratio > high_b) var_ratio = high_b;
+ features->after_part_ab.f[feature_index++] = var_ratio;
+ }
+ for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    // Ratio between the 1:4 sub-block variance and the whole-block variance.
+ float var_ratio = (float)(vert_4_source_var[i] + 1) / denom;
+ if (var_ratio < low_b) var_ratio = low_b;
+ if (var_ratio > high_b) var_ratio = high_b;
+ features->after_part_ab.f[feature_index++] = var_ratio;
+ }
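+  // Feature layout: 1 (partition context) + 1 (variance bits) + 8 (sub-block
+  // rd ratios) + 4 (horz4 var ratios) + 4 (vert4 var ratios) == 18 features.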
+ assert(feature_index == 18);
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions before partition none. Specifically, these parameters:
+// partition_none_allowed
+// partition_horz_allowed
+// partition_vert_allowed
+// do_rectangular_split
+// do_square_split
+static bool ext_ml_model_decision_before_none(
+ AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT],
+ int *partition_none_allowed, int *partition_horz_allowed,
+ int *partition_vert_allowed, int *do_rectangular_split,
+ int *do_square_split) {
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+ if (!ext_part_controller->ready) return false;
+
+ // Setup features.
+ aom_partition_features_t features;
+ features.id = AOM_EXT_PART_FEATURE_BEFORE_NONE;
+ for (int i = 0; i < FEATURE_SIZE_SMS_SPLIT; ++i) {
+ features.before_part_none.f[i] = features_from_motion[i];
+ }
+
+ // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // Get partition decisions from the external model.
+ aom_partition_decision_t decision;
+ const bool valid_decision =
+ av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+ if (!valid_decision) return false;
+
+ // Populate decisions
+ *partition_none_allowed = decision.partition_none_allowed;
+ *partition_horz_allowed = decision.partition_rect_allowed[HORZ];
+ *partition_vert_allowed = decision.partition_rect_allowed[VERT];
+ *do_rectangular_split = decision.do_rectangular_split;
+ *do_square_split = decision.do_square_split;
+
+ return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions before partition none. Specifically, these parameters:
+// prune_horz
+// prune_vert
+static bool ext_ml_model_decision_before_none_part2(
+ AV1_COMP *cpi,
+ const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART],
+ int *prune_horz, int *prune_vert) {
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+ if (!ext_part_controller->ready) return false;
+
+ // Setup features.
+ aom_partition_features_t features;
+ features.id = AOM_EXT_PART_FEATURE_BEFORE_NONE_PART2;
+ for (int i = 0; i < FEATURE_SIZE_SMS_PRUNE_PART; ++i) {
+ features.before_part_none.f_part2[i] = features_from_motion[i];
+ }
+
+ // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // Get partition decisions from the external model.
+ aom_partition_decision_t decision;
+ const bool valid_decision =
+ av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+ if (!valid_decision) return false;
+
+ // Populate decisions
+ *prune_horz = decision.prune_rect_part[HORZ];
+ *prune_vert = decision.prune_rect_part[VERT];
+
+ return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after none partition. Specifically, these parameters:
+// do_square_split
+// do_rectangular_split
+bool ext_ml_model_decision_after_none(
+ ExtPartController *const ext_part_controller, const int is_intra_frame,
+ const float *const features_after_none, int *do_square_split,
+ int *do_rectangular_split) {
+ if (!ext_part_controller->ready || is_intra_frame) return false;
+
+ // Setup features.
+ aom_partition_features_t features;
+ features.id = AOM_EXT_PART_FEATURE_AFTER_NONE;
+ for (int i = 0; i < 4; ++i) {
+ features.after_part_none.f[i] = features_after_none[i];
+ }
+
+ // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // Get partition decisions from the external model.
+ aom_partition_decision_t decision;
+ const bool valid_decision =
+ av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+ if (!valid_decision) return false;
+
+ // Populate decisions
+ *do_square_split = decision.do_square_split;
+ *do_rectangular_split = decision.do_rectangular_split;
+
+ return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after none partition. Specifically, these parameters:
+// terminate_partition_search
+bool ext_ml_model_decision_after_none_part2(
+ AV1_COMP *const cpi, const float *const features_terminate,
+ int *terminate_partition_search) {
+ AV1_COMMON *const cm = &cpi->common;
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+ if (!ext_part_controller->ready || frame_is_intra_only(cm)) return false;
+
+ // Setup features.
+ aom_partition_features_t features;
+ features.id = AOM_EXT_PART_FEATURE_AFTER_NONE_PART2;
+ for (int i = 0; i < FEATURE_SIZE_SMS_TERM_NONE; ++i) {
+ features.after_part_none.f_terminate[i] = features_terminate[i];
+ }
+
+ // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // Get partition decisions from the external model.
+ aom_partition_decision_t decision;
+ const bool valid_decision =
+ av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+ if (!valid_decision) return false;
+
+ // Populate decisions
+ *terminate_partition_search = decision.terminate_partition_search;
+
+ return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after split partition. Specifically, these parameters:
+// terminate_partition_search
+bool ext_ml_model_decision_after_split(AV1_COMP *const cpi,
+ const float *const features_terminate,
+ int *terminate_partition_search) {
+ const AV1_COMMON *const cm = &cpi->common;
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+ if (frame_is_intra_only(cm) || !cpi->ext_part_controller.ready) {
+ return false;
+ }
+
+ // Setup features.
+ aom_partition_features_t features;
+ features.id = AOM_EXT_PART_FEATURE_AFTER_SPLIT;
+ for (int i = 0; i < 31; ++i) {
+ features.after_part_split.f_terminate[i] = features_terminate[i];
+ }
+
+ // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // Get partition decisions from the external model.
+ aom_partition_decision_t decision;
+ const bool valid_decision =
+ av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+ if (!valid_decision) return false;
+
+ // Populate decisions
+ *terminate_partition_search = decision.terminate_partition_search;
+
+ return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after split partition. Specifically, these parameters:
+// prune_rect_part[HORZ]
+// prune_rect_part[VERT]
+bool ext_ml_model_decision_after_split_part2(
+ ExtPartController *const ext_part_controller, const int is_intra_frame,
+ const float *const features_prune, int *prune_rect_part_horz,
+ int *prune_rect_part_vert) {
+ if (is_intra_frame || !ext_part_controller->ready) {
+ return false;
+ }
+
+ // Setup features.
+ aom_partition_features_t features;
+ features.id = AOM_EXT_PART_FEATURE_AFTER_SPLIT_PART2;
+ for (int i = 0; i < 9; ++i) {
+ features.after_part_split.f_prune_rect[i] = features_prune[i];
+ }
+
+ // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // Get partition decisions from the external model.
+ aom_partition_decision_t decision;
+ const bool valid_decision =
+ av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+ if (!valid_decision) return false;
+
+ // Populate decisions
+ *prune_rect_part_horz = decision.prune_rect_part[0];
+ *prune_rect_part_vert = decision.prune_rect_part[1];
+
+ return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after rectangular partition. Specifically, these parameters:
+// horza_partition_allowed
+// horzb_partition_allowed
+// verta_partition_allowed
+// vertb_partition_allowed
+static bool ext_ml_model_decision_after_rect(
+ ExtPartController *const ext_part_controller, const int is_intra_frame,
+ const float *const features_after_rect, int *horza_partition_allowed,
+ int *horzb_partition_allowed, int *verta_partition_allowed,
+ int *vertb_partition_allowed) {
+ if (is_intra_frame || !ext_part_controller->ready) return false;
+
+ // Setup features.
+ aom_partition_features_t features;
+ features.id = AOM_EXT_PART_FEATURE_AFTER_RECT;
+ for (int i = 0; i < 10; ++i) {
+ features.after_part_rect.f[i] = features_after_rect[i];
+ }
+
+ // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // Get partition decisions from the external model.
+ aom_partition_decision_t decision;
+ const bool valid_decision =
+ av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+ if (!valid_decision) return false;
+
+ // Populate decisions
+ *horza_partition_allowed = decision.horza_partition_allowed;
+ *horzb_partition_allowed = decision.horzb_partition_allowed;
+ *verta_partition_allowed = decision.verta_partition_allowed;
+ *vertb_partition_allowed = decision.vertb_partition_allowed;
+
+ return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after AB partition. Specifically, these parameters:
+// partition_vert4_allowed
+// partition_horz4_allowed
+static bool ext_ml_model_decision_after_part_ab(
+ AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx,
+ int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+ int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed,
+ int *const partition_vert4_allowed, unsigned int pb_source_variance,
+ int mi_row, int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+
+ if (!frame_is_intra_only(cm) && ext_part_controller->ready) {
+ // Setup features.
+ aom_partition_features_t features;
+ features.id = AOM_EXT_PART_FEATURE_AFTER_AB;
+ prepare_features_after_part_ab(cpi, x, bsize, part_ctx, best_rd,
+ rect_part_rd, split_rd, pb_source_variance,
+ mi_row, mi_col, &features);
+
+ // Send necessary features to the external model.
+ av1_ext_part_send_features(ext_part_controller, &features);
+
+ // Get partition decisions from the external model.
+ aom_partition_decision_t decision;
+ const bool valid_decision =
+ av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+ if (!valid_decision) return false;
+
+ // Populate decisions
+ *partition_horz4_allowed = decision.partition_horz4_allowed;
+ *partition_vert4_allowed = decision.partition_vert4_allowed;
+
+ return true;
+ }
+
+ return false;
+}
+
+// This function resembles av1_setup_sms_tree() in context_tree.c, but with a
+// different function signature.
+static SIMPLE_MOTION_DATA_TREE *setup_sms_tree(
+ AV1_COMP *const cpi, SIMPLE_MOTION_DATA_TREE *sms_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int stat_generation_stage = is_stat_generation_stage(cpi);
+ const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128;
+ const int tree_nodes =
+ av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+ int sms_tree_index = 0;
+ SIMPLE_MOTION_DATA_TREE *this_sms;
+ int square_index = 1;
+ int nodes;
+ this_sms = &sms_tree[0];
+
+ if (!stat_generation_stage) {
+ const int leaf_factor = is_sb_size_128 ? 4 : 1;
+ const int leaf_nodes = 256 * leaf_factor;
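+    // For a 128x128 superblock this gives 1024 leaf nodes and the loop below
+    // adds 256 + 64 + 16 + 4 + 1 internal nodes, 1365 in total; a 64x64
+    // superblock ends up with 256 + 85 == 341 nodes.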
+
+ // Sets up all the leaf nodes in the tree.
+ for (sms_tree_index = 0; sms_tree_index < leaf_nodes; ++sms_tree_index) {
+ SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index];
+ tree->block_size = square[0];
+ }
+
+    // Each node has 4 child nodes; fill each block_size level of the tree
+    // from the leaves up to the root.
+ for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+ for (int i = 0; i < nodes; ++i) {
+ SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index];
+ tree->block_size = square[square_index];
+ for (int j = 0; j < 4; j++) tree->split[j] = this_sms++;
+ ++sms_tree_index;
+ }
+ ++square_index;
+ }
+ } else {
+ // Allocation for firstpass/LAP stage
+ // TODO(Mufaddal): refactor square_index to use a common block_size macro
+ // from firstpass.c
+ SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index];
+ square_index = 2;
+ tree->block_size = square[square_index];
+ }
+
+ // Set up the root node for the largest superblock size
+ return &sms_tree[tree_nodes - 1];
+}
+
+static void write_motion_feature_to_file(
+ const char *const path, const int sb_counter, const unsigned int *block_sse,
+ const unsigned int *block_var, const int num_blocks, const BLOCK_SIZE bsize,
+ const BLOCK_SIZE fixed_block_size, const int mi_row, const int mi_col) {
+ char filename[256];
+ snprintf(filename, sizeof(filename), "%s/motion_search_feature_sb%d", path,
+ sb_counter);
+  FILE *pfile = fopen(filename, "w");
+  if (pfile == NULL) return;
+ fprintf(pfile, "%d,%d,%d,%d,%d\n", mi_row, mi_col, bsize,
+ block_size_wide[fixed_block_size], num_blocks);
+ for (int i = 0; i < num_blocks; ++i) {
+ fprintf(pfile, "%d", block_sse[i]);
+ if (i < num_blocks - 1) fprintf(pfile, ",");
+ }
+ fprintf(pfile, "\n");
+ for (int i = 0; i < num_blocks; ++i) {
+ fprintf(pfile, "%d", block_var[i]);
+ if (i < num_blocks - 1) fprintf(pfile, ",");
+ }
+ fprintf(pfile, "\n");
+ fclose(pfile);
+}
+
+void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data,
+ const int mi_row, const int mi_col,
+ const BLOCK_SIZE bsize,
+ aom_partition_features_t *features) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (frame_is_intra_only(cm)) return;
+
+ MACROBLOCK *const x = &td->mb;
+ const BLOCK_SIZE fixed_block_size = BLOCK_16X16;
+ const int col_step = mi_size_wide[fixed_block_size];
+ const int row_step = mi_size_high[fixed_block_size];
+ SIMPLE_MOTION_DATA_TREE *sms_tree = NULL;
+ const int stat_generation_stage = is_stat_generation_stage(cpi);
+ const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128;
+ const int tree_nodes =
+ av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+ CHECK_MEM_ERROR(cm, sms_tree, aom_calloc(tree_nodes, sizeof(*sms_tree)));
+ SIMPLE_MOTION_DATA_TREE *sms_root = setup_sms_tree(cpi, sms_tree);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, bsize);
+ av1_init_simple_motion_search_mvs_for_sb(cpi, NULL, x, sms_root, mi_row,
+ mi_col);
+ av1_reset_simple_motion_tree_partition(sms_root, bsize);
+ const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME
+ : LAST_FRAME };
+ const int mi_width =
+ AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+ const int mi_height =
+ AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+ const int col_steps = (mi_width / col_step) + ((mi_width % col_step) > 0);
+ const int row_steps = (mi_height / row_step) + ((mi_height % row_step) > 0);
+ const int num_blocks = col_steps * row_steps;
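+  // E.g. a full 64x64 superblock spans 16x16 mi units; with BLOCK_16X16
+  // feature blocks (4 mi units each) this yields col_steps == row_steps == 4
+  // and num_blocks == 16.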
+ unsigned int *block_sse = aom_calloc(num_blocks, sizeof(*block_sse));
+ unsigned int *block_var = aom_calloc(num_blocks, sizeof(*block_var));
+ if (!(block_sse && block_var)) {
+ aom_free(sms_tree);
+ aom_free(block_sse);
+ aom_free(block_var);
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating block_sse & block_var");
+ }
+ int idx = 0;
+
+ for (int row = mi_row;
+ row < AOMMIN(mi_row + mi_size_high[bsize], cm->mi_params.mi_rows);
+ row += row_step) {
+ for (int col = mi_col;
+ col < AOMMIN(mi_col + mi_size_wide[bsize], cm->mi_params.mi_cols);
+ col += col_step) {
+ simple_motion_search_get_best_ref(
+ cpi, x, sms_root, row, col, fixed_block_size, ref_list,
+ /*num_refs=*/1, /*use_subpixel=*/1,
+ /*save_mv=*/1, &block_sse[idx], &block_var[idx]);
+ ++idx;
+ }
+ }
+ if (features == NULL) {
+ write_motion_feature_to_file(cpi->oxcf.partition_info_path, cpi->sb_counter,
+ block_sse, block_var, idx, bsize,
+ fixed_block_size, mi_row, mi_col);
+ } else {
+ features->sb_features.motion_features.unit_length =
+ block_size_wide[fixed_block_size];
+ features->sb_features.motion_features.num_units = idx;
+ for (int i = 0; i < idx; ++i) {
+ features->sb_features.motion_features.block_sse[i] = block_sse[i];
+ features->sb_features.motion_features.block_var[i] = block_var[i];
+ }
+ }
+
+ aom_free(block_sse);
+ aom_free(block_var);
+ aom_free(sms_tree);
+}
+
+void av1_prepare_motion_search_features_block(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ const int mi_row, const int mi_col, const BLOCK_SIZE bsize,
+ const int valid_partition_types, unsigned int *block_sse,
+ unsigned int *block_var, unsigned int sub_block_sse[4],
+ unsigned int sub_block_var[4], unsigned int horz_block_sse[2],
+ unsigned int horz_block_var[2], unsigned int vert_block_sse[2],
+ unsigned int vert_block_var[2]) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (frame_is_intra_only(cm)) return;
+ MACROBLOCK *const x = &td->mb;
+ SIMPLE_MOTION_DATA_TREE *sms_tree = NULL;
+ const int stat_generation_stage = is_stat_generation_stage(cpi);
+ const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128;
+ const int tree_nodes =
+ av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+ CHECK_MEM_ERROR(cm, sms_tree, aom_calloc(tree_nodes, sizeof(*sms_tree)));
+ SIMPLE_MOTION_DATA_TREE *sms_root = setup_sms_tree(cpi, sms_tree);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, bsize);
+ av1_reset_simple_motion_tree_partition(sms_root, bsize);
+ const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME
+ : LAST_FRAME };
+ const int sub_mi_width = mi_size_wide[bsize] / 2;
+ const int sub_mi_height = sub_mi_width;
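+  // The input bsize is assumed to be square, so the sub-block height in mi
+  // units equals the sub-block width.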
+ simple_motion_search_get_best_ref(
+ cpi, x, sms_root, mi_row, mi_col, bsize, ref_list, /*num_refs=*/1,
+ /*use_subpixel=*/1, /*save_mv=*/1, block_sse, block_var);
+ // Split to 4 sub blocks.
+ if (valid_partition_types & (1 << PARTITION_SPLIT)) {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ for (int i = 0; i < 4; ++i) {
+ const int row = mi_row + (i >> 1) * sub_mi_height;
+ const int col = mi_col + (i & 1) * sub_mi_width;
+ simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize,
+ ref_list, /*num_refs=*/1,
+ /*use_subpixel=*/1, /*save_mv=*/1,
+ &sub_block_sse[i], &sub_block_var[i]);
+ }
+ }
+ // Horizontal split
+ if (valid_partition_types & (1 << PARTITION_HORZ)) {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+ for (int i = 0; i < 2; ++i) {
+ const int row = mi_row + (i & 1) * sub_mi_height;
+ const int col = mi_col;
+ simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize,
+ ref_list, /*num_refs=*/1,
+ /*use_subpixel=*/1, /*save_mv=*/1,
+ &horz_block_sse[i], &horz_block_var[i]);
+ }
+ }
+ // Vertical split
+ if (valid_partition_types & (1 << PARTITION_VERT)) {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT);
+ for (int i = 0; i < 2; ++i) {
+ const int row = mi_row;
+ const int col = mi_col + (i & 1) * sub_mi_width;
+ simple_motion_search_get_best_ref(cpi, x, sms_root, row, col, subsize,
+ ref_list, /*num_refs=*/1,
+ /*use_subpixel=*/1, /*save_mv=*/1,
+ &vert_block_sse[i], &vert_block_var[i]);
+ }
+ }
+
+ aom_free(sms_tree);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE void init_simple_motion_search_mvs(
+ SIMPLE_MOTION_DATA_TREE *sms_tree, const FULLPEL_MV *start_mvs) {
+ memcpy(sms_tree->start_mvs, start_mvs, sizeof(sms_tree->start_mvs));
+ av1_zero(sms_tree->sms_none_feat);
+ av1_zero(sms_tree->sms_rect_feat);
+ av1_zero(sms_tree->sms_none_valid);
+ av1_zero(sms_tree->sms_rect_valid);
+
+ if (sms_tree->block_size >= BLOCK_8X8) {
+ init_simple_motion_search_mvs(sms_tree->split[0], start_mvs);
+ init_simple_motion_search_mvs(sms_tree->split[1], start_mvs);
+ init_simple_motion_search_mvs(sms_tree->split[2], start_mvs);
+ init_simple_motion_search_mvs(sms_tree->split[3], start_mvs);
+ }
+}
+
+void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi,
+ const TileInfo *tile_info,
+ MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_root,
+ int mi_row, int mi_col) {
+ // Use the NEARESTMV of the sb as the start mv
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ FULLPEL_MV ref_mvs[REF_FRAMES];
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ av1_zero(ref_mvs);
+ // If tile_info is NULL, assume that the offsets have already been set.
+ if (tile_info) {
+ av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col,
+ sb_size);
+ }
+
+ MB_MODE_INFO_EXT mbmi_ext;
+ const int ref_frame =
+ cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+ av1_find_mv_refs(cm, xd, xd->mi[0], ref_frame, mbmi_ext.ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext.global_mvs,
+ mbmi_ext.mode_context);
+ if (mbmi_ext.ref_mv_count[ref_frame] > 0) {
+ ref_mvs[ref_frame] =
+ get_fullmv_from_mv(&xd->ref_mv_stack[ref_frame][0].this_mv.as_mv);
+ } else {
+ ref_mvs[ref_frame] =
+ get_fullmv_from_mv(&mbmi_ext.global_mvs[ref_frame].as_mv);
+ }
+
+ init_simple_motion_search_mvs(sms_root, ref_mvs);
+}
diff --git a/third_party/aom/av1/encoder/partition_strategy.h b/third_party/aom/av1/encoder/partition_strategy.h
new file mode 100644
index 0000000000..84683f5fd4
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_strategy.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
+#define AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encoder.h"
+
+void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
+ int label_idx,
+ int intra_cnn_based_part_prune_level,
+ PartitionSearchState *part_state);
+
+// Performs a simple_motion_search with a single reference frame and extracts
+// the variance of the residue. The features are then used to determine
+// whether we want to go straight to splitting without trying PARTITION_NONE.
+void av1_simple_motion_search_based_split(AV1_COMP *const cpi, MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ PartitionSearchState *part_state);
+
+// Performs a simple_motion_search with two reference frames and extracts
+// the variance of the residue. The features are then used to determine
+// whether we want to prune some partitions.
+void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ PartitionSearchState *part_state);
+
+#if !CONFIG_REALTIME_ONLY
+// Early terminates PARTITION_NONE using simple_motion_search features and the
+// rate, distortion, and rdcost of PARTITION_NONE. This is only called when:
+// - The frame is a show frame
+// - The frame is not intra only
+// - The current bsize is > BLOCK_8X8
+// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
+void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_tree,
+ const RD_STATS *none_rdc,
+ PartitionSearchState *part_state);
+
+// Get the features for selecting the max and min partition size. Currently
+// this performs simple_motion_search on 16X16 subblocks of the current
+// superblock, and then extracts the SSE and motion vector statistics as
+// features.
+void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ float *features);
+
+// Predict the maximum BLOCK_SIZE to be used to encode the current superblock.
+BLOCK_SIZE av1_predict_max_partition(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ const float *features);
+
+// Attempts an early termination after PARTITION_SPLIT.
+void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
+ SIMPLE_MOTION_DATA_TREE *const sms_tree,
+ int64_t best_rd, int64_t part_none_rd,
+ int64_t part_split_rd,
+ int64_t *split_block_rd,
+ PartitionSearchState *part_state);
+
+// Use the rdcost ratio and source var ratio to prune PARTITION_HORZ and
+// PARTITION_VERT.
+// TODO(chiyotsai@google.com): Currently this model does not use q value and has
+// no information about rectangular partitions. Preliminary experiments suggest
+// that we can get better performance by adding in q_index and rectangular
+// sse/var from SMS. We should retrain and tune this model later.
+void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x,
+ int64_t best_rd, int64_t none_rd,
+ const int64_t *split_rd,
+ PartitionSearchState *part_state);
+
+// Use an ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
+// considered.
+void av1_ml_prune_ab_partition(AV1_COMP *const cpi, int part_ctx, int var_ctx,
+ int64_t best_rd,
+ PartitionSearchState *part_state,
+ int *ab_partitions_allowed);
+
+// Use an ML model to predict if horz4 and vert4 should be considered.
+void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+ int part_ctx, int64_t best_rd,
+ PartitionSearchState *part_state,
+ int *part4_allowed,
+ unsigned int pb_source_variance);
+
+// ML-based partition search breakout after PARTITION_NONE.
+void av1_ml_predict_breakout(AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const RD_STATS *const rd_stats,
+ unsigned int pb_source_variance, int bit_depth,
+ PartitionSearchState *part_state);
+
+// The first round of partition pruning, done before any partition has been
+// tested. The decisions are written into part_state and passed back to the
+// partition search function.
+void av1_prune_partitions_before_search(AV1_COMP *const cpi,
+ MACROBLOCK *const x,
+ SIMPLE_MOTION_DATA_TREE *const sms_tree,
+ PartitionSearchState *part_state);
+
+// Prune out partitions that lead to coding block sizes outside the min and max
+// bsizes set by the encoder. Max and min square partition levels are defined as
+// the partition nodes that the recursive function rd_pick_partition() can
+// reach. To implement this: only PARTITION_NONE is allowed if the current
+// node is at or below min_partition_size, and only PARTITION_SPLIT is allowed
+// if the current node exceeds max_partition_size.
+void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc,
+ PartitionSearchState *part_state);
+
+// Prune out AB partitions based on rd decisions made from testing the
+// basic partitions.
+void av1_prune_ab_partitions(AV1_COMP *cpi, const MACROBLOCK *x,
+ const PC_TREE *pc_tree, int pb_source_variance,
+ int64_t best_rdcost,
+ const RD_RECT_PART_WIN_INFO *rect_part_win_info,
+ bool ext_partition_allowed,
+ PartitionSearchState *part_state,
+ int *ab_partitions_allowed);
+
+void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data,
+ const int mi_row, const int mi_col,
+ const BLOCK_SIZE bsize,
+ aom_partition_features_t *features);
+void av1_prepare_motion_search_features_block(
+ AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ const int mi_row, const int mi_col, const BLOCK_SIZE bsize,
+ const int valid_partition_types, unsigned int *block_sse,
+ unsigned int *block_var, unsigned int sub_block_sse[4],
+ unsigned int sub_block_var[4], unsigned int horz_block_sse[2],
+ unsigned int horz_block_var[2], unsigned int vert_block_sse[2],
+ unsigned int vert_block_var[2]);
+#endif // !CONFIG_REALTIME_ONLY
+
+// A simplified version of set_offsets meant to be used for
+// simple_motion_search.
+static INLINE void set_offsets_for_motion_search(const AV1_COMP *const cpi,
+ MACROBLOCK *const x,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+
+ set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+ mi_row, mi_col);
+
+ // Set up destination pointers.
+ av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
+ num_planes);
+
+  // Set up limit values for MV components.
+  // MVs beyond this range do not produce a new or different prediction block.
+ av1_set_mv_limits(mi_params, &x->mv_limits, mi_row, mi_col, mi_height,
+ mi_width, cpi->oxcf.border_in_pixels);
+
+ set_plane_n4(xd, mi_width, mi_height, num_planes);
+
+ xd->mi_row = mi_row;
+ xd->mi_col = mi_col;
+
+ // Set up distance of MB to edge of frame in 1/8th pel units.
+ assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+ xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+ xd->mb_to_bottom_edge =
+ GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
+ xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
+ xd->mb_to_right_edge =
+ GET_MV_SUBPEL((mi_params->mi_cols - mi_width - mi_col) * MI_SIZE);
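+  // Distances are in 1/8th-pel units: e.g. a block at mi_row == 4 has
+  // mb_to_top_edge == -GET_MV_SUBPEL(4 * MI_SIZE) == -(16 * 8) == -128.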
+
+ // Set up source buffers.
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+}
+
+void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi,
+ const TileInfo *tile_info,
+ MACROBLOCK *x,
+ SIMPLE_MOTION_DATA_TREE *sms_root,
+ int mi_row, int mi_col);
+
+static INLINE int is_full_sb(const CommonModeInfoParams *const mi_params,
+ int mi_row, int mi_col, BLOCK_SIZE sb_size) {
+ const int sb_mi_wide = mi_size_wide[sb_size];
+ const int sb_mi_high = mi_size_high[sb_size];
+
+ return (mi_row + sb_mi_high) <= mi_params->mi_rows &&
+ (mi_col + sb_mi_wide) <= mi_params->mi_cols;
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Do not use this criterion for screen content videos, since screen content
+// videos can often find good predictors and the largest block size is likely
+// to be used.
+static INLINE int use_auto_max_partition(const AV1_COMP *const cpi,
+ BLOCK_SIZE sb_size, int mi_row,
+ int mi_col) {
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const AV1_COMMON *const cm = &cpi->common;
+ return !frame_is_intra_only(cm) && !cpi->use_screen_content_tools &&
+ cpi->sf.part_sf.auto_max_partition_based_on_simple_motion !=
+ NOT_IN_USE &&
+ sb_size == BLOCK_128X128 &&
+ is_full_sb(&cm->mi_params, mi_row, mi_col, sb_size) &&
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] !=
+ OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] !=
+ INTNL_OVERLAY_UPDATE;
+}
+
+static BLOCK_SIZE dim_to_size(int dim) {
+ switch (dim) {
+ case 4: return BLOCK_4X4;
+ case 8: return BLOCK_8X8;
+ case 16: return BLOCK_16X16;
+ case 32: return BLOCK_32X32;
+ case 64: return BLOCK_64X64;
+ case 128: return BLOCK_128X128;
+ default: assert(0); return 0;
+ }
+}
+
+static AOM_INLINE void set_max_min_partition_size(SuperBlockEnc *sb_enc,
+ AV1_COMP *cpi, MACROBLOCK *x,
+ const SPEED_FEATURES *sf,
+ BLOCK_SIZE sb_size,
+ int mi_row, int mi_col) {
+ const AV1_COMMON *cm = &cpi->common;
+
+ sb_enc->max_partition_size =
+ AOMMIN(sf->part_sf.default_max_partition_size,
+ dim_to_size(cpi->oxcf.part_cfg.max_partition_size));
+ sb_enc->min_partition_size =
+ AOMMAX(sf->part_sf.default_min_partition_size,
+ dim_to_size(cpi->oxcf.part_cfg.min_partition_size));
+ sb_enc->max_partition_size =
+ AOMMIN(sb_enc->max_partition_size, cm->seq_params->sb_size);
+ sb_enc->min_partition_size =
+ AOMMIN(sb_enc->min_partition_size, cm->seq_params->sb_size);
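+  // E.g. if the speed-feature default is BLOCK_128X128 but the encoder config
+  // caps max_partition_size at 64, dim_to_size(64) == BLOCK_64X64 wins the
+  // AOMMIN above; both bounds are then further capped by the sequence
+  // superblock size.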
+
+ if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) {
+ float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f };
+
+ av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features);
+ sb_enc->max_partition_size =
+ AOMMAX(AOMMIN(av1_predict_max_partition(cpi, x, features),
+ sb_enc->max_partition_size),
+ sb_enc->min_partition_size);
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+#endif // AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
diff --git a/third_party/aom/av1/encoder/pass2_strategy.c b/third_party/aom/av1/encoder/pass2_strategy.c
new file mode 100644
index 0000000000..a9442ffc1a
--- /dev/null
+++ b/third_party/aom/av1/encoder/pass2_strategy.c
@@ -0,0 +1,4488 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\defgroup gf_group_algo Golden Frame Group
+ * \ingroup high_level_algo
+ * Algorithms regarding determining the length of GF groups and defining GF
+ * group structures.
+ * @{
+ */
+/*! @} - end defgroup gf_group_algo */
+
+#include <assert.h>
+#include <stdint.h>
+
+#include "aom_mem/aom_mem.h"
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "av1/common/av1_common_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rc_utils.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/thirdpass.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/encode_strategy.h"
+
+#define DEFAULT_KF_BOOST 2300
+#define DEFAULT_GF_BOOST 2000
+#define GROUP_ADAPTIVE_MAXQ 1
+
+static void init_gf_stats(GF_GROUP_STATS *gf_stats);
+static int define_gf_group_pass3(AV1_COMP *cpi, EncodeFrameParams *frame_params,
+ int is_final_pass);
+
+// Calculate an active area of the image that discounts formatting
+// bars and partially discounts other zero-energy areas.
+#define MIN_ACTIVE_AREA 0.5
+#define MAX_ACTIVE_AREA 1.0
+static double calculate_active_area(const FRAME_INFO *frame_info,
+ const FIRSTPASS_STATS *this_frame) {
+ const double active_pct =
+ 1.0 -
+ ((this_frame->intra_skip_pct / 2) +
+ ((this_frame->inactive_zone_rows * 2) / (double)frame_info->mb_rows));
+ return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
+}
+
+// Calculate a modified error used in distributing bits between easier and
+// harder frames.
+#define ACT_AREA_CORRECTION 0.5
+static double calculate_modified_err_new(const FRAME_INFO *frame_info,
+ const FIRSTPASS_STATS *total_stats,
+ const FIRSTPASS_STATS *this_stats,
+ int vbrbias, double modified_error_min,
+ double modified_error_max) {
+ if (total_stats == NULL) {
+ return 0;
+ }
+ const double av_weight = total_stats->weight / total_stats->count;
+ const double av_err =
+ (total_stats->coded_error * av_weight) / total_stats->count;
+ double modified_error =
+ av_err * pow(this_stats->coded_error * this_stats->weight /
+ DOUBLE_DIVIDE_CHECK(av_err),
+ vbrbias / 100.0);
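+  // I.e. modified_error = av_err * (err * weight / av_err) ^ (vbrbias / 100):
+  // with vbrbias == 100 each frame keeps its own weighted coded error, while
+  // smaller vbrbias values pull every frame's error toward the average.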
+
+  // Correction for active area. Frames with a reduced active area
+  // (e.g. due to formatting bars) have a higher error per MB for the
+  // remaining active MBs. The correction here assumes that coding
+  // 0.5N blocks of complexity 2X is a little easier than coding N
+  // blocks of complexity X.
+ modified_error *=
+ pow(calculate_active_area(frame_info, this_stats), ACT_AREA_CORRECTION);
+
+ return fclamp(modified_error, modified_error_min, modified_error_max);
+}
+
+static double calculate_modified_err(const FRAME_INFO *frame_info,
+ const TWO_PASS *twopass,
+ const AV1EncoderConfig *oxcf,
+ const FIRSTPASS_STATS *this_frame) {
+ const FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats;
+ return calculate_modified_err_new(
+ frame_info, total_stats, this_frame, oxcf->rc_cfg.vbrbias,
+ twopass->modified_error_min, twopass->modified_error_max);
+}
+
+// Resets the first pass stats read position to the given location.
+static void reset_fpf_position(TWO_PASS_FRAME *p_frame,
+ const FIRSTPASS_STATS *position) {
+ p_frame->stats_in = position;
+}
+
+static int input_stats(TWO_PASS *p, TWO_PASS_FRAME *p_frame,
+ FIRSTPASS_STATS *fps) {
+ if (p_frame->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF;
+
+ *fps = *p_frame->stats_in;
+ ++p_frame->stats_in;
+ return 1;
+}
+
+static int input_stats_lap(TWO_PASS *p, TWO_PASS_FRAME *p_frame,
+ FIRSTPASS_STATS *fps) {
+ if (p_frame->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF;
+
+ *fps = *p_frame->stats_in;
+  /* Move old stats[0] out to make room for the next frame's stats */
+ memmove(p->frame_stats_arr[0], p->frame_stats_arr[1],
+ (p->stats_buf_ctx->stats_in_end - p_frame->stats_in - 1) *
+ sizeof(FIRSTPASS_STATS));
+ p->stats_buf_ctx->stats_in_end--;
+ return 1;
+}
+
+// Read frame stats at an offset from the current position.
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p,
+ const TWO_PASS_FRAME *p_frame,
+ int offset) {
+ if ((offset >= 0 &&
+ p_frame->stats_in + offset >= p->stats_buf_ctx->stats_in_end) ||
+ (offset < 0 &&
+ p_frame->stats_in + offset < p->stats_buf_ctx->stats_in_start)) {
+ return NULL;
+ }
+
+ return &p_frame->stats_in[offset];
+}
+
+// This function returns the maximum target rate per frame.
+static int frame_max_bits(const RATE_CONTROL *rc,
+ const AV1EncoderConfig *oxcf) {
+ int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth *
+ (int64_t)oxcf->rc_cfg.vbrmax_section) /
+ 100;
+ if (max_bits < 0)
+ max_bits = 0;
+ else if (max_bits > rc->max_frame_bandwidth)
+ max_bits = rc->max_frame_bandwidth;
+
+ return (int)max_bits;
+}
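+
+// Worked example (illustrative numbers): with avg_frame_bandwidth ==
+// 100000 bits and vbrmax_section == 150, the per-frame cap is 150000 bits,
+// further clamped to rc->max_frame_bandwidth.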
+
+static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { 0.65, 0.70, 0.75,
+ 0.80, 0.85, 0.90,
+ 0.95, 0.95, 0.95 };
+#define ERR_DIVISOR 96.0
+static double calc_correction_factor(double err_per_mb, int q) {
+ const double error_term = err_per_mb / ERR_DIVISOR;
+ const int index = q >> 5;
+ // Adjustment to power term based on qindex
+ const double power_term =
+ q_pow_term[index] +
+ (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0);
+ assert(error_term >= 0.0);
+ return fclamp(pow(error_term, power_term), 0.05, 5.0);
+}
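+
+// Worked example (illustrative numbers): for q == 48, index == 1 and
+// q % 32 == 16, so the interpolated power term is
+//   0.70 + ((0.75 - 0.70) * 16) / 32.0 == 0.725,
+// halfway between the table entries of the two neighboring qindex bands.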
+
+// Based on recent history, adjust expectations of bits per macroblock.
+static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) {
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+
+ double damp_fac = AOMMAX(5.0, rate_err_tol / 10.0);
+ double rate_err_factor = 1.0;
+ const double adj_limit = AOMMAX(0.2, (double)(100 - rate_err_tol) / 200.0);
+ const double min_fac = 1.0 - adj_limit;
+ const double max_fac = 1.0 + adj_limit;
+
+ if (cpi->third_pass_ctx && cpi->third_pass_ctx->frame_info_count > 0) {
+ int64_t actual_bits = 0;
+ int64_t target_bits = 0;
+ double factor = 0.0;
+ int count = 0;
+ for (int i = 0; i < cpi->third_pass_ctx->frame_info_count; i++) {
+ actual_bits += cpi->third_pass_ctx->frame_info[i].actual_bits;
+ target_bits += cpi->third_pass_ctx->frame_info[i].bits_allocated;
+ factor += cpi->third_pass_ctx->frame_info[i].bpm_factor;
+ count++;
+ }
+
+ if (count == 0) {
+ factor = 1.0;
+ } else {
+ factor /= (double)count;
+ }
+
+ factor *= (double)actual_bits / DOUBLE_DIVIDE_CHECK((double)target_bits);
+
+ if ((twopass->bpm_factor <= 1 && factor < twopass->bpm_factor) ||
+ (twopass->bpm_factor >= 1 && factor > twopass->bpm_factor)) {
+ twopass->bpm_factor = factor;
+ twopass->bpm_factor =
+ AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor));
+ }
+ }
+
+ int err_estimate = p_rc->rate_error_estimate;
+ int64_t bits_left = twopass->bits_left;
+ int64_t total_actual_bits = p_rc->total_actual_bits;
+ int64_t bits_off_target = p_rc->vbr_bits_off_target;
+ double rolling_arf_group_actual_bits =
+ (double)twopass->rolling_arf_group_actual_bits;
+ double rolling_arf_group_target_bits =
+ (double)twopass->rolling_arf_group_target_bits;
+
+#if CONFIG_FPMT_TEST
+ const int is_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 ? 1 : 0;
+ const int simulate_parallel_frame =
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE
+ ? is_parallel_frame
+ : 0;
+ total_actual_bits = simulate_parallel_frame ? p_rc->temp_total_actual_bits
+ : p_rc->total_actual_bits;
+ bits_off_target = simulate_parallel_frame ? p_rc->temp_vbr_bits_off_target
+ : p_rc->vbr_bits_off_target;
+ bits_left =
+ simulate_parallel_frame ? p_rc->temp_bits_left : twopass->bits_left;
+ rolling_arf_group_target_bits =
+ (double)(simulate_parallel_frame
+ ? p_rc->temp_rolling_arf_group_target_bits
+ : twopass->rolling_arf_group_target_bits);
+ rolling_arf_group_actual_bits =
+ (double)(simulate_parallel_frame
+ ? p_rc->temp_rolling_arf_group_actual_bits
+ : twopass->rolling_arf_group_actual_bits);
+ err_estimate = simulate_parallel_frame ? p_rc->temp_rate_error_estimate
+ : p_rc->rate_error_estimate;
+#endif
+
+ if (p_rc->bits_off_target && total_actual_bits > 0) {
+ if (cpi->ppi->lap_enabled) {
+ rate_err_factor = rolling_arf_group_actual_bits /
+ DOUBLE_DIVIDE_CHECK(rolling_arf_group_target_bits);
+ } else {
+ rate_err_factor = 1.0 - ((double)(bits_off_target) /
+ AOMMAX(total_actual_bits, bits_left));
+ }
+
+ // The adjustment is damped if this is one pass with lookahead processing
+ // (as there are only ever a few frames of data) and for all but the first
+ // GOP in normal two-pass coding.
+ if ((twopass->bpm_factor != 1.0) || cpi->ppi->lap_enabled) {
+ rate_err_factor = 1.0 + ((rate_err_factor - 1.0) / damp_fac);
+ }
+ rate_err_factor = AOMMAX(min_fac, AOMMIN(max_fac, rate_err_factor));
+ }
+
+ // Is the rate control trending in the right direction? Only make
+ // an adjustment if things are getting worse.
+ if ((rate_err_factor < 1.0 && err_estimate >= 0) ||
+ (rate_err_factor > 1.0 && err_estimate <= 0)) {
+ twopass->bpm_factor *= rate_err_factor;
+ if (rate_err_tol >= 100) {
+ twopass->bpm_factor =
+ AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor));
+ } else {
+ twopass->bpm_factor = AOMMAX(0.1, AOMMIN(10.0, twopass->bpm_factor));
+ }
+ }
+}
+
+static int qbpm_enumerator(int rate_err_tol) {
+ return 1200000 + ((300000 * AOMMIN(75, AOMMAX(rate_err_tol - 25, 0))) / 75);
+}
+
+// Similar to the find_qindex_by_rate() function in ratectrl.c, but includes
+// calculation of a correction_factor.
+static int find_qindex_by_rate_with_correction(
+ int desired_bits_per_mb, aom_bit_depth_t bit_depth, double error_per_mb,
+ double group_weight_factor, int rate_err_tol, int best_qindex,
+ int worst_qindex) {
+ assert(best_qindex <= worst_qindex);
+ int low = best_qindex;
+ int high = worst_qindex;
+
+ while (low < high) {
+ const int mid = (low + high) >> 1;
+ const double mid_factor = calc_correction_factor(error_per_mb, mid);
+ const double q = av1_convert_qindex_to_q(mid, bit_depth);
+ const int enumerator = qbpm_enumerator(rate_err_tol);
+ const int mid_bits_per_mb =
+ (int)((enumerator * mid_factor * group_weight_factor) / q);
+
+ if (mid_bits_per_mb > desired_bits_per_mb) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
+ }
+ return low;
+}
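+
+// Note: the binary search relies on the estimated bits per mb being
+// non-increasing in qindex (the enumerator / q term dominates), so the
+// loop converges on the lowest qindex whose estimate no longer exceeds
+// desired_bits_per_mb.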
+
+/*!\brief Choose a target maximum Q for a group of frames
+ *
+ * \ingroup rate_control
+ *
+ * This function is used to estimate a suitable maximum Q for a
+ * group of frames. Initially it is called to get a crude estimate
+ * for the whole clip. It is then called for each ARF/GF group to get
+ * a revised estimate for that group.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] av_frame_err The average per frame coded error score
+ * for frames making up this section/group.
+ * \param[in] inactive_zone Used to mask off / ignore part of the
+ * frame. The most common use case is where
+ * a wide format video (e.g. 16:9) is
+ * letter-boxed into a more square format.
+ * Here we want to ignore the bands at the
+ * top and bottom.
+ * \param[in] av_target_bandwidth The target bits per frame
+ *
+ * \return The maximum Q for frames in the group.
+ */
+static int get_twopass_worst_quality(AV1_COMP *cpi, const double av_frame_err,
+ double inactive_zone,
+ int av_target_bandwidth) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+ inactive_zone = fclamp(inactive_zone, 0.0, 0.9999);
+
+ if (av_target_bandwidth <= 0) {
+ return rc->worst_quality; // Highest value allowed
+ } else {
+ const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.mi_params.MBs;
+ const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
+ const double av_err_per_mb = av_frame_err / (1.0 - inactive_zone);
+ const int target_norm_bits_per_mb =
+ (int)((uint64_t)av_target_bandwidth << BPER_MB_NORMBITS) / active_mbs;
+ int rate_err_tol = AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct);
+
+ // Update bpm correction factor based on previous GOP rate error.
+ twopass_update_bpm_factor(cpi, rate_err_tol);
+
+ // Try and pick a max Q that will be high enough to encode the
+ // content at the given rate.
+ int q = find_qindex_by_rate_with_correction(
+ target_norm_bits_per_mb, cpi->common.seq_params->bit_depth,
+ av_err_per_mb, cpi->ppi->twopass.bpm_factor, rate_err_tol,
+ rc->best_quality, rc->worst_quality);
+
+ // Restriction on active max q for constrained quality mode.
+ if (rc_cfg->mode == AOM_CQ) q = AOMMAX(q, rc_cfg->cq_level);
+ return q;
+ }
+}
+
+#define INTRA_PART 0.005
+#define DEFAULT_DECAY_LIMIT 0.75
+#define LOW_SR_DIFF_TRHESH 0.01
+#define NCOUNT_FRAME_II_THRESH 5.0
+#define LOW_CODED_ERR_PER_MB 0.01
+
+/* This function considers how the quality of prediction may be deteriorating
+ * with distance. It compares the coded error for the last frame and the
+ * second reference frame (usually two frames old) and also applies a factor
+ * based on the extent of INTRA coding.
+ *
+ * The decay factor is then used to reduce the contribution of frames further
+ * from the alt-ref or golden frame to the boost calculation for that
+ * alt-ref or golden frame.
+ */
+static double get_sr_decay_rate(const FIRSTPASS_STATS *frame) {
+ double sr_diff = (frame->sr_coded_error - frame->coded_error);
+ double sr_decay = 1.0;
+ double modified_pct_inter;
+ double modified_pcnt_intra;
+
+ modified_pct_inter = frame->pcnt_inter;
+ if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
+ ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+ (double)NCOUNT_FRAME_II_THRESH)) {
+ modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
+ }
+ modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
+ if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
+ double sr_diff_part = ((sr_diff * 0.25) / frame->intra_error);
+ sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra);
+ }
+ return AOMMAX(sr_decay, DEFAULT_DECAY_LIMIT);
+}
+
+// This function gives an estimate of how badly we believe the prediction
+// quality is decaying from frame to frame.
+static double get_zero_motion_factor(const FIRSTPASS_STATS *frame) {
+ const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion;
+ double sr_decay = get_sr_decay_rate(frame);
+ return AOMMIN(sr_decay, zero_motion_pct);
+}
+
+#define DEFAULT_ZM_FACTOR 0.5
+static double get_prediction_decay_rate(const FIRSTPASS_STATS *frame_stats) {
+ const double sr_decay_rate = get_sr_decay_rate(frame_stats);
+ double zero_motion_factor =
+ DEFAULT_ZM_FACTOR * (frame_stats->pcnt_inter - frame_stats->pcnt_motion);
+
+ // Clamp value to the range 0.0 to 1.0.
+ // This should happen anyway if input values are sensibly clamped, but is
+ // checked here just in case.
+ if (zero_motion_factor > 1.0)
+ zero_motion_factor = 1.0;
+ else if (zero_motion_factor < 0.0)
+ zero_motion_factor = 0.0;
+
+ return AOMMAX(zero_motion_factor,
+ (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
+}
+
+// Function to test for a condition where a complex transition is followed
+// by a static section, for example in slide shows where there is a fade
+// between slides. This is to help with more optimal kf and gf positioning.
+static int detect_transition_to_still(const FIRSTPASS_INFO *firstpass_info,
+ int next_stats_index,
+ const int min_gf_interval,
+ const int frame_interval,
+ const int still_interval,
+ const double loop_decay_rate,
+ const double last_decay_rate) {
+ // Break clause to detect very still sections after motion
+ // For example a static image after a fade or other transition
+ // instead of a clean scene cut.
+ if (frame_interval > min_gf_interval && loop_decay_rate >= 0.999 &&
+ last_decay_rate < 0.9) {
+ int stats_left =
+ av1_firstpass_info_future_count(firstpass_info, next_stats_index);
+ if (stats_left >= still_interval) {
+ int j;
+ // Look ahead a few frames to see if the static condition persists...
+ for (j = 0; j < still_interval; ++j) {
+ const FIRSTPASS_STATS *stats =
+ av1_firstpass_info_peek(firstpass_info, next_stats_index + j);
+ if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
+ }
+ // Only if it does do we signal a transition to still.
+ return j == still_interval;
+ }
+ }
+ return 0;
+}
+
+// This function detects a flash through the high relative pcnt_second_ref
+// score in the frame following a flash frame. The offset passed in should
+// reflect this.
+static int detect_flash(const TWO_PASS *twopass,
+ const TWO_PASS_FRAME *twopass_frame, const int offset) {
+ const FIRSTPASS_STATS *const next_frame =
+ read_frame_stats(twopass, twopass_frame, offset);
+
+ // What we are looking for here is a situation where there is a
+ // brief break in prediction (such as a flash) but subsequent frames
+ // are reasonably well predicted by an earlier (pre flash) frame.
+ // The recovery after a flash is indicated by a high pcnt_second_ref
+ // compared to pcnt_inter.
+ return next_frame != NULL &&
+ next_frame->pcnt_second_ref > next_frame->pcnt_inter &&
+ next_frame->pcnt_second_ref >= 0.5;
+}
+
+// Update the motion-related elements of the GF/ARF boost calculation.
+static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
+ GF_GROUP_STATS *gf_stats, double f_w,
+ double f_h) {
+ const double pct = stats->pcnt_motion;
+
+ // Accumulate Motion In/Out of frame stats.
+ gf_stats->this_frame_mv_in_out = stats->mv_in_out_count * pct;
+ gf_stats->mv_in_out_accumulator += gf_stats->this_frame_mv_in_out;
+ gf_stats->abs_mv_in_out_accumulator += fabs(gf_stats->this_frame_mv_in_out);
+
+ // Accumulate a measure of how uniform (or conversely how random) the motion
+ // field is (a ratio of abs(mv) / mv).
+ if (pct > 0.05) {
+ const double mvr_ratio =
+ fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr));
+ const double mvc_ratio =
+ fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
+
+ gf_stats->mv_ratio_accumulator +=
+ pct *
+ (mvr_ratio < stats->mvr_abs * f_h ? mvr_ratio : stats->mvr_abs * f_h);
+ gf_stats->mv_ratio_accumulator +=
+ pct *
+ (mvc_ratio < stats->mvc_abs * f_w ? mvc_ratio : stats->mvc_abs * f_w);
+ }
+}
+
+static void accumulate_this_frame_stats(const FIRSTPASS_STATS *stats,
+ const double mod_frame_err,
+ GF_GROUP_STATS *gf_stats) {
+ gf_stats->gf_group_err += mod_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_stats->gf_group_raw_error += stats->coded_error;
+#endif
+ gf_stats->gf_group_skip_pct += stats->intra_skip_pct;
+ gf_stats->gf_group_inactive_zone_rows += stats->inactive_zone_rows;
+}
+
+static void accumulate_next_frame_stats(const FIRSTPASS_STATS *stats,
+ const int flash_detected,
+ const int frames_since_key,
+ const int cur_idx,
+ GF_GROUP_STATS *gf_stats, int f_w,
+ int f_h) {
+ accumulate_frame_motion_stats(stats, gf_stats, f_w, f_h);
+ // Sum up the metric values of the current gf group.
+ gf_stats->avg_sr_coded_error += stats->sr_coded_error;
+ gf_stats->avg_pcnt_second_ref += stats->pcnt_second_ref;
+ gf_stats->avg_new_mv_count += stats->new_mv_count;
+ gf_stats->avg_wavelet_energy += stats->frame_avg_wavelet_energy;
+ if (fabs(stats->raw_error_stdev) > 0.000001) {
+ gf_stats->non_zero_stdev_count++;
+ gf_stats->avg_raw_err_stdev += stats->raw_error_stdev;
+ }
+
+ // Accumulate the effect of prediction quality decay
+ if (!flash_detected) {
+ gf_stats->last_loop_decay_rate = gf_stats->loop_decay_rate;
+ gf_stats->loop_decay_rate = get_prediction_decay_rate(stats);
+
+ gf_stats->decay_accumulator =
+ gf_stats->decay_accumulator * gf_stats->loop_decay_rate;
+
+ // Monitor for static sections.
+ if ((frames_since_key + cur_idx - 1) > 1) {
+ gf_stats->zero_motion_accumulator = AOMMIN(
+ gf_stats->zero_motion_accumulator, get_zero_motion_factor(stats));
+ }
+ }
+}
+
+static void average_gf_stats(const int total_frame, GF_GROUP_STATS *gf_stats) {
+ if (total_frame) {
+ gf_stats->avg_sr_coded_error /= total_frame;
+ gf_stats->avg_pcnt_second_ref /= total_frame;
+ gf_stats->avg_new_mv_count /= total_frame;
+ gf_stats->avg_wavelet_energy /= total_frame;
+ }
+
+ if (gf_stats->non_zero_stdev_count)
+ gf_stats->avg_raw_err_stdev /= gf_stats->non_zero_stdev_count;
+}
+
+#define BOOST_FACTOR 12.5
+static double baseline_err_per_mb(const FRAME_INFO *frame_info) {
+ unsigned int screen_area = frame_info->frame_height * frame_info->frame_width;
+
+ // Use a different error per mb factor for calculating boost for
+ // different formats.
+ if (screen_area <= 640 * 360) {
+ return 500.0;
+ } else {
+ return 1000.0;
+ }
+}
+
+static double calc_frame_boost(const PRIMARY_RATE_CONTROL *p_rc,
+ const FRAME_INFO *frame_info,
+ const FIRSTPASS_STATS *this_frame,
+ double this_frame_mv_in_out, double max_boost) {
+ double frame_boost;
+ const double lq = av1_convert_qindex_to_q(p_rc->avg_frame_qindex[INTER_FRAME],
+ frame_info->bit_depth);
+ const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5);
+ const double active_area = calculate_active_area(frame_info, this_frame);
+
+ // Underlying boost factor is based on inter error ratio.
+ frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * active_area,
+ this_frame->intra_error * active_area) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+ frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
+
+ // Increase boost for frames where new data is coming into the frame
+ // (e.g. zoom out). Slightly reduce boost if there is a net balance of
+ // motion out of the frame (zoom in). The range for this_frame_mv_in_out
+ // is -1.0 to +1.0.
+ if (this_frame_mv_in_out > 0.0)
+ frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+ // In the extreme case the boost is halved.
+ else
+ frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
+
+ return AOMMIN(frame_boost, max_boost * boost_q_correction);
+}
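+
+// Worked example (illustrative numbers): with this_frame_mv_in_out == +1.0
+// (all motion into the frame) the boost becomes frame_boost * 3.0, while
+// with -1.0 (all motion out of the frame) it becomes frame_boost * 0.5,
+// matching the "boost is halved" note above, before the max_boost cap is
+// applied.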
+
+static double calc_kf_frame_boost(const PRIMARY_RATE_CONTROL *p_rc,
+ const FRAME_INFO *frame_info,
+ const FIRSTPASS_STATS *this_frame,
+ double *sr_accumulator, double max_boost) {
+ double frame_boost;
+ const double lq = av1_convert_qindex_to_q(p_rc->avg_frame_qindex[INTER_FRAME],
+ frame_info->bit_depth);
+ const double boost_q_correction = AOMMIN((0.50 + (lq * 0.015)), 2.00);
+ const double active_area = calculate_active_area(frame_info, this_frame);
+
+ // Underlying boost factor is based on inter error ratio.
+ frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * active_area,
+ this_frame->intra_error * active_area) /
+ DOUBLE_DIVIDE_CHECK(
+ (this_frame->coded_error + *sr_accumulator) * active_area);
+
+ // Update the accumulator for second ref error difference.
+ // This is intended to give an indication of how much the coded error is
+ // increasing over time.
+ *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error);
+ *sr_accumulator = AOMMAX(0.0, *sr_accumulator);
+
+ // Q correction and scaling
+ // The 40.0 value here is an experimentally derived baseline minimum.
+ // This value is in line with the minimum per frame boost in the alt_ref
+ // boost calculation.
+ frame_boost = ((frame_boost + 40.0) * boost_q_correction);
+
+ return AOMMIN(frame_boost, max_boost * boost_q_correction);
+}
+
+static int get_projected_gfu_boost(const PRIMARY_RATE_CONTROL *p_rc,
+ int gfu_boost, int frames_to_project,
+ int num_stats_used_for_gfu_boost) {
+ /*
+ * If frames_to_project is equal to num_stats_used_for_gfu_boost,
+ * it means that gfu_boost was calculated over frames_to_project to
+ * begin with (i.e. all required stats were available), hence return
+ * the original boost.
+ */
+ if (num_stats_used_for_gfu_boost >= frames_to_project) return gfu_boost;
+
+ double min_boost_factor = sqrt(p_rc->baseline_gf_interval);
+ // Get the current tpl factor (number of frames = frames_to_project).
+ double tpl_factor = av1_get_gfu_boost_projection_factor(
+ min_boost_factor, MAX_GFUBOOST_FACTOR, frames_to_project);
+ // Get the tpl factor when number of frames = num_stats_used_for_prior_boost.
+ double tpl_factor_num_stats = av1_get_gfu_boost_projection_factor(
+ min_boost_factor, MAX_GFUBOOST_FACTOR, num_stats_used_for_gfu_boost);
+ int projected_gfu_boost =
+ (int)rint((tpl_factor * gfu_boost) / tpl_factor_num_stats);
+ return projected_gfu_boost;
+}
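+
+// Example (illustrative numbers): if gfu_boost was accumulated from only 8
+// stats but the group spans 16 frames, the boost is rescaled by the ratio
+// of the projection factors for 16 and 8 frames, extrapolating the
+// observed per-frame boost to the frames whose stats were unavailable.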
+
+#define GF_MAX_BOOST 90.0
+#define GF_MIN_BOOST 50
+#define MIN_DECAY_FACTOR 0.01
+int av1_calc_arf_boost(const TWO_PASS *twopass,
+ const TWO_PASS_FRAME *twopass_frame,
+ const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+ int offset, int f_frames, int b_frames,
+ int *num_fpstats_used, int *num_fpstats_required,
+ int project_gfu_boost) {
+ int i;
+ GF_GROUP_STATS gf_stats;
+ init_gf_stats(&gf_stats);
+ double boost_score = (double)NORMAL_BOOST;
+ int arf_boost;
+ int flash_detected = 0;
+ if (num_fpstats_used) *num_fpstats_used = 0;
+
+ // Search forward from the proposed arf/next gf position.
+ for (i = 0; i < f_frames; ++i) {
+ const FIRSTPASS_STATS *this_frame =
+ read_frame_stats(twopass, twopass_frame, i + offset);
+ if (this_frame == NULL) break;
+
+ // Update the motion-related elements of the boost calculation.
+ accumulate_frame_motion_stats(this_frame, &gf_stats,
+ frame_info->frame_width,
+ frame_info->frame_height);
+
+ // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash(twopass, twopass_frame, i + offset) ||
+ detect_flash(twopass, twopass_frame, i + offset + 1);
+
+ // Accumulate the effect of prediction quality decay.
+ if (!flash_detected) {
+ gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame);
+ gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
+ ? MIN_DECAY_FACTOR
+ : gf_stats.decay_accumulator;
+ }
+
+ boost_score +=
+ gf_stats.decay_accumulator *
+ calc_frame_boost(p_rc, frame_info, this_frame,
+ gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
+ if (num_fpstats_used) (*num_fpstats_used)++;
+ }
+
+ arf_boost = (int)boost_score;
+
+ // Reset for backward looking loop.
+ boost_score = 0.0;
+ init_gf_stats(&gf_stats);
+ // Search backward towards last gf position.
+ for (i = -1; i >= -b_frames; --i) {
+ const FIRSTPASS_STATS *this_frame =
+ read_frame_stats(twopass, twopass_frame, i + offset);
+ if (this_frame == NULL) break;
+
+ // Update the motion-related elements of the boost calculation.
+ accumulate_frame_motion_stats(this_frame, &gf_stats,
+ frame_info->frame_width,
+ frame_info->frame_height);
+
+ // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash(twopass, twopass_frame, i + offset) ||
+ detect_flash(twopass, twopass_frame, i + offset + 1);
+
+ // Cumulative effect of prediction quality decay.
+ if (!flash_detected) {
+ gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame);
+ gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
+ ? MIN_DECAY_FACTOR
+ : gf_stats.decay_accumulator;
+ }
+
+ boost_score +=
+ gf_stats.decay_accumulator *
+ calc_frame_boost(p_rc, frame_info, this_frame,
+ gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
+ if (num_fpstats_used) (*num_fpstats_used)++;
+ }
+ arf_boost += (int)boost_score;
+
+ if (project_gfu_boost) {
+ assert(num_fpstats_required != NULL);
+ assert(num_fpstats_used != NULL);
+ *num_fpstats_required = f_frames + b_frames;
+ arf_boost = get_projected_gfu_boost(p_rc, arf_boost, *num_fpstats_required,
+ *num_fpstats_used);
+ }
+
+ if (arf_boost < ((b_frames + f_frames) * GF_MIN_BOOST))
+ arf_boost = ((b_frames + f_frames) * GF_MIN_BOOST);
+
+ return arf_boost;
+}
+
+// Calculate a section intra ratio used in setting max loop filter.
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
+ const FIRSTPASS_STATS *end,
+ int section_length) {
+ const FIRSTPASS_STATS *s = begin;
+ double intra_error = 0.0;
+ double coded_error = 0.0;
+ int i = 0;
+
+ while (s < end && i < section_length) {
+ intra_error += s->intra_error;
+ coded_error += s->coded_error;
+ ++s;
+ ++i;
+ }
+
+ return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error));
+}
+
+/*!\brief Calculates the bit target for this GF/ARF group
+ *
+ * \ingroup rate_control
+ *
+ * Calculates the total bits to allocate in this GF/ARF group.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] gf_group_err Cumulative coded error score for the
+ * frames making up this group.
+ *
+ * \return The target total number of bits for this GF/ARF group.
+ */
+static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi,
+ double gf_group_err) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const TWO_PASS *const twopass = &cpi->ppi->twopass;
+ const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+ int64_t total_group_bits;
+
+ // Calculate the bits to be allocated to the group as a whole.
+ if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+ total_group_bits = (int64_t)(twopass->kf_group_bits *
+ (gf_group_err / twopass->kf_group_error_left));
+ } else {
+ total_group_bits = 0;
+ }
+
+ // Clamp odd edge cases.
+ total_group_bits = (total_group_bits < 0) ? 0
+ : (total_group_bits > twopass->kf_group_bits)
+ ? twopass->kf_group_bits
+ : total_group_bits;
+
+ // Clip based on user supplied data rate variability limit.
+ if (total_group_bits > (int64_t)max_bits * p_rc->baseline_gf_interval)
+ total_group_bits = (int64_t)max_bits * p_rc->baseline_gf_interval;
+
+ return total_group_bits;
+}
+
+// Calculate the number of bits to assign to boosted frames in a group.
+static int calculate_boost_bits(int frame_count, int boost,
+ int64_t total_group_bits) {
+ int allocation_chunks;
+
+ // return 0 for invalid inputs (could arise e.g. through rounding errors)
+ if (!boost || (total_group_bits <= 0)) return 0;
+
+ if (frame_count <= 0) return (int)(AOMMIN(total_group_bits, INT_MAX));
+
+ allocation_chunks = (frame_count * 100) + boost;
+
+ // Prevent overflow.
+ if (boost > 1023) {
+ int divisor = boost >> 10;
+ boost /= divisor;
+ allocation_chunks /= divisor;
+ }
+
+ // Calculate the number of extra bits for use in the boosted frame or frames.
+ return AOMMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks),
+ 0);
+}
+
+// Calculate the boost factor based on the number of bits assigned, i.e. the
+// inverse of calculate_boost_bits().
+static int calculate_boost_factor(int frame_count, int bits,
+ int64_t total_group_bits) {
+ return (int)(100.0 * frame_count * bits / (total_group_bits - bits));
+}
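+
+// Derivation: calculate_boost_bits() gives
+//   bits = boost * total_group_bits / (frame_count * 100 + boost),
+// and solving for boost yields
+//   boost = 100 * frame_count * bits / (total_group_bits - bits),
+// which is the expression used above.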
+
+// Reduce the number of bits assigned to keyframe or arf if necessary, to
+// prevent bitrate spikes that may break level constraints.
+// frame_type: 0: keyframe; 1: arf.
+static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi,
+ RATE_CONTROL *const rc,
+ int bits_assigned,
+ int64_t group_bits,
+ int frame_type) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const int temporal_layer_id = cm->temporal_layer_id;
+ const int spatial_layer_id = cm->spatial_layer_id;
+ for (int index = 0; index < seq_params->operating_points_cnt_minus_1 + 1;
+ ++index) {
+ if (!is_in_operating_point(seq_params->operating_point_idc[index],
+ temporal_layer_id, spatial_layer_id)) {
+ continue;
+ }
+
+ const AV1_LEVEL target_level =
+ cpi->ppi->level_params.target_seq_level_idx[index];
+ if (target_level >= SEQ_LEVELS) continue;
+
+ assert(is_valid_seq_level_idx(target_level));
+
+ const double level_bitrate_limit = av1_get_max_bitrate_for_level(
+ target_level, seq_params->tier[0], seq_params->profile);
+ const int target_bits_per_frame =
+ (int)(level_bitrate_limit / cpi->framerate);
+ if (frame_type == 0) {
+ // Maximum bits for keyframe is 8 times the target_bits_per_frame.
+ const int level_enforced_max_kf_bits = target_bits_per_frame * 8;
+ if (bits_assigned > level_enforced_max_kf_bits) {
+ const int frames = rc->frames_to_key - 1;
+ p_rc->kf_boost = calculate_boost_factor(
+ frames, level_enforced_max_kf_bits, group_bits);
+ bits_assigned =
+ calculate_boost_bits(frames, p_rc->kf_boost, group_bits);
+ }
+ } else if (frame_type == 1) {
+ // Maximum bits for arf is 4 times the target_bits_per_frame.
+ const int level_enforced_max_arf_bits = target_bits_per_frame * 4;
+ if (bits_assigned > level_enforced_max_arf_bits) {
+ p_rc->gfu_boost =
+ calculate_boost_factor(p_rc->baseline_gf_interval,
+ level_enforced_max_arf_bits, group_bits);
+ bits_assigned = calculate_boost_bits(p_rc->baseline_gf_interval,
+ p_rc->gfu_boost, group_bits);
+ }
+ } else {
+ assert(0);
+ }
+ }
+
+ return bits_assigned;
+}
+
+// Allocate bits to each frame in a GF / ARF group
+double layer_fraction[MAX_ARF_LAYERS + 1] = { 1.0, 0.70, 0.55, 0.60,
+ 0.60, 1.0, 1.0 };
+static void allocate_gf_group_bits(GF_GROUP *gf_group,
+ PRIMARY_RATE_CONTROL *const p_rc,
+ RATE_CONTROL *const rc,
+ int64_t gf_group_bits, int gf_arf_bits,
+ int key_frame, int use_arf) {
+ int64_t total_group_bits = gf_group_bits;
+ int base_frame_bits;
+ const int gf_group_size = gf_group->size;
+ int layer_frames[MAX_ARF_LAYERS + 1] = { 0 };
+
+ // For key frames the frame target rate is already set and it
+ // is also the golden frame.
+ // === [frame_index == 0] ===
+ int frame_index = !!key_frame;
+
+ // Subtract the extra bits set aside for ARF frames from the Group Total
+ if (use_arf) total_group_bits -= gf_arf_bits;
+
+ int num_frames =
+ AOMMAX(1, p_rc->baseline_gf_interval - (rc->frames_since_key == 0));
+ base_frame_bits = (int)(total_group_bits / num_frames);
+
+ // Check the number of frames in each layer in case we have a
+ // non-standard group length.
+ int max_arf_layer = gf_group->max_layer_depth - 1;
+ for (int idx = frame_index; idx < gf_group_size; ++idx) {
+ if ((gf_group->update_type[idx] == ARF_UPDATE) ||
+ (gf_group->update_type[idx] == INTNL_ARF_UPDATE)) {
+ layer_frames[gf_group->layer_depth[idx]]++;
+ }
+ }
+
+ // Allocate extra bits to each ARF layer
+ int i;
+ int layer_extra_bits[MAX_ARF_LAYERS + 1] = { 0 };
+ assert(max_arf_layer <= MAX_ARF_LAYERS);
+ for (i = 1; i <= max_arf_layer; ++i) {
+ double fraction = (i == max_arf_layer) ? 1.0 : layer_fraction[i];
+ layer_extra_bits[i] =
+ (int)((gf_arf_bits * fraction) / AOMMAX(1, layer_frames[i]));
+ gf_arf_bits -= (int)(gf_arf_bits * fraction);
+ }
+
+ // Now combine ARF layer and baseline bits to give total bits for each frame.
+ int arf_extra_bits;
+ for (int idx = frame_index; idx < gf_group_size; ++idx) {
+ switch (gf_group->update_type[idx]) {
+ case ARF_UPDATE:
+ case INTNL_ARF_UPDATE:
+ arf_extra_bits = layer_extra_bits[gf_group->layer_depth[idx]];
+ gf_group->bit_allocation[idx] = base_frame_bits + arf_extra_bits;
+ break;
+ case INTNL_OVERLAY_UPDATE:
+ case OVERLAY_UPDATE: gf_group->bit_allocation[idx] = 0; break;
+ default: gf_group->bit_allocation[idx] = base_frame_bits; break;
+ }
+ }
+
+ // Set the frame following the current GOP to a 0 bit allocation. For ARF
+ // groups, this next frame will be an overlay frame, which is the first
+ // frame of the next GOP. For a GF group, the next GOP will overwrite the
+ // rate allocation. Setting this frame to use 0 bits (out of the current
+ // GOP budget) simplifies the logic in reference frame management.
+ if (gf_group_size < MAX_STATIC_GF_GROUP_LENGTH)
+ gf_group->bit_allocation[gf_group_size] = 0;
+}
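+
+// Worked example (illustrative numbers): with max_arf_layer == 2 and
+// gf_arf_bits == 1000, layer 1 frames split 1000 * 0.70 == 700 extra bits
+// among themselves and the remaining 300 all go to layer 2 (the last layer
+// always takes fraction 1.0), on top of the base_frame_bits that every
+// coded frame receives.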
+
+// Returns true if the KF group and GF group are both almost completely static.
+static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion,
+ int is_lap_enabled) {
+ if (is_lap_enabled) {
+ /*
+ * When LAP is enabled, kf_zero_motion is not reliable, so use a strict
+ * constraint on gf_zero_motion.
+ */
+ return (gf_zero_motion >= 0.999);
+ } else {
+ return (gf_zero_motion >= 0.995) &&
+ (kf_zero_motion >= STATIC_KF_GROUP_THRESH);
+ }
+}
+
+#define ARF_ABS_ZOOM_THRESH 4.4
+static INLINE int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start,
+ int flash_detected, int active_max_gf_interval,
+ int active_min_gf_interval,
+ GF_GROUP_STATS *gf_stats) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ AV1_COMMON *const cm = &cpi->common;
+ // Motion breakout threshold for loop below depends on image size.
+ const double mv_ratio_accumulator_thresh = (cm->height + cm->width) / 4.0;
+
+ if (!flash_detected) {
+ // Break clause to detect very still sections after motion. For example,
+ // a static image after a fade or other transition.
+
+ // TODO(angiebird): This is a temporary change, we will avoid using
+ // twopass_frame.stats_in in the follow-up CL
+ int index = (int)(cpi->twopass_frame.stats_in -
+ twopass->stats_buf_ctx->stats_in_start);
+ if (detect_transition_to_still(&twopass->firstpass_info, index,
+ rc->min_gf_interval, frame_index - cur_start,
+ 5, gf_stats->loop_decay_rate,
+ gf_stats->last_loop_decay_rate)) {
+ return 1;
+ }
+ }
+
+ // Some conditions to breakout after min interval.
+ if (frame_index - cur_start >= active_min_gf_interval &&
+ // If possible, don't break very close to a kf.
+ (rc->frames_to_key - frame_index >= rc->min_gf_interval) &&
+ ((frame_index - cur_start) & 0x01) && !flash_detected &&
+ (gf_stats->mv_ratio_accumulator > mv_ratio_accumulator_thresh ||
+ gf_stats->abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH)) {
+ return 1;
+ }
+
+ // If almost totally static, we will not use the max GF length later,
+ // so we can continue for more frames.
+ if (((frame_index - cur_start) >= active_max_gf_interval + 1) &&
+ !is_almost_static(gf_stats->zero_motion_accumulator,
+ twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled)) {
+ return 1;
+ }
+ return 0;
+}
+
+static int is_shorter_gf_interval_better(
+ AV1_COMP *cpi, const EncodeFrameParams *frame_params) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ int gop_length_decision_method = cpi->sf.tpl_sf.gop_length_decision_method;
+ int shorten_gf_interval;
+
+ av1_tpl_preload_rc_estimate(cpi, frame_params);
+
+ if (gop_length_decision_method == 2) {
+ // GF group length is decided based on GF boost and tpl stats of ARFs from
+ // base layer, (base+1) layer.
+ shorten_gf_interval =
+ (p_rc->gfu_boost <
+ p_rc->num_stats_used_for_gfu_boost * GF_MIN_BOOST * 1.4) &&
+ !av1_tpl_setup_stats(cpi, 3, frame_params);
+ } else {
+ int do_complete_tpl = 1;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ int is_temporal_filter_enabled =
+ (rc->frames_since_key > 0 && gf_group->arf_index > -1);
+
+ if (gop_length_decision_method == 1) {
+ // Check if tpl stats of ARFs from base layer, (base+1) layer,
+ // (base+2) layer can decide the GF group length.
+ int gop_length_eval = av1_tpl_setup_stats(cpi, 2, frame_params);
+
+ if (gop_length_eval != 2) {
+ do_complete_tpl = 0;
+ shorten_gf_interval = !gop_length_eval;
+ }
+ }
+
+ if (do_complete_tpl) {
+ // Decide GF group length based on complete tpl stats.
+ shorten_gf_interval = !av1_tpl_setup_stats(cpi, 1, frame_params);
+ // TPL stats are reused when the ARF is temporally filtered and the GF
+ // interval is not shortened.
+ if (is_temporal_filter_enabled && !shorten_gf_interval) {
+ cpi->skip_tpl_setup_stats = 1;
+#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS
+ assert(cpi->gf_frame_index == 0);
+ av1_vbr_rc_update_q_index_list(&cpi->vbr_rc_info, &cpi->ppi->tpl_data,
+ gf_group,
+ cpi->common.seq_params->bit_depth);
+#endif // CONFIG_BITRATE_ACCURACY
+ }
+ }
+ }
+ return shorten_gf_interval;
+}
+
+#define MIN_SHRINK_LEN 6 // the minimum length of gf if we are shrinking
+#define SMOOTH_FILT_LEN 7
+#define HALF_FILT_LEN (SMOOTH_FILT_LEN / 2)
+#define WINDOW_SIZE 7
+#define HALF_WIN (WINDOW_SIZE / 2)
+// A 7-tap Gaussian smoothing filter.
+const double smooth_filt[SMOOTH_FILT_LEN] = { 0.006, 0.061, 0.242, 0.383,
+ 0.242, 0.061, 0.006 };
+
+// Smooth-filter intra_error and coded_error in the firstpass stats.
+// If stats[i].is_flash == 1, the ith element should not be used in the
+// filtering.
+static void smooth_filter_stats(const FIRSTPASS_STATS *stats, int start_idx,
+ int last_idx, double *filt_intra_err,
+ double *filt_coded_err) {
+ int i, j;
+ for (i = start_idx; i <= last_idx; i++) {
+ double total_wt = 0;
+ for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) {
+ int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx);
+ if (stats[idx].is_flash) continue;
+
+ filt_intra_err[i] +=
+ smooth_filt[j + HALF_FILT_LEN] * stats[idx].intra_error;
+ total_wt += smooth_filt[j + HALF_FILT_LEN];
+ }
+ if (total_wt > 0.01) {
+ filt_intra_err[i] /= total_wt;
+ } else {
+ filt_intra_err[i] = stats[i].intra_error;
+ }
+ }
+ for (i = start_idx; i <= last_idx; i++) {
+ double total_wt = 0;
+ for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) {
+ int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx);
+ // Coded error involves idx and idx - 1.
+ if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue;
+
+ filt_coded_err[i] +=
+ smooth_filt[j + HALF_FILT_LEN] * stats[idx].coded_error;
+ total_wt += smooth_filt[j + HALF_FILT_LEN];
+ }
+ if (total_wt > 0.01) {
+ filt_coded_err[i] /= total_wt;
+ } else {
+ filt_coded_err[i] = stats[i].coded_error;
+ }
+ }
+}
+
+// Calculate gradient
+static void get_gradient(const double *values, int start, int last,
+ double *grad) {
+ if (start == last) {
+ grad[start] = 0;
+ return;
+ }
+ for (int i = start; i <= last; i++) {
+ int prev = AOMMAX(i - 1, start);
+ int next = AOMMIN(i + 1, last);
+ grad[i] = (values[next] - values[prev]) / (next - prev);
+ }
+}
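+
+// In the interior this is a central difference, e.g. for start < i < last
+//   grad[i] = (values[i + 1] - values[i - 1]) / 2,
+// degrading to a one-sided difference at the two endpoints.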
+
+static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start,
+ int first, int last) {
+ // Identify unstable areas caused by scenecuts.
+ // Find the maximum coded-error ratio within the preceding and following
+ // neighborhoods of each frame. If only one frame yields a huge coded
+ // error, it is likely a scenecut.
+ double this_ratio, max_prev_ratio, max_next_ratio, max_prev_coded,
+ max_next_coded;
+
+ if (last - first == 0) return -1;
+
+ for (int i = first; i <= last; i++) {
+ if (stats_start[i].is_flash || (i > 0 && stats_start[i - 1].is_flash))
+ continue;
+ double temp_intra = AOMMAX(stats_start[i].intra_error, 0.01);
+ this_ratio = stats_start[i].coded_error / temp_intra;
+ // Find the max ratio in the preceding neighborhood.
+ max_prev_ratio = 0;
+ max_prev_coded = 0;
+ for (int j = AOMMAX(first, i - HALF_WIN); j < i; j++) {
+ if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash))
+ continue;
+ temp_intra = AOMMAX(stats_start[j].intra_error, 0.01);
+ double temp_ratio = stats_start[j].coded_error / temp_intra;
+ if (temp_ratio > max_prev_ratio) {
+ max_prev_ratio = temp_ratio;
+ }
+ if (stats_start[j].coded_error > max_prev_coded) {
+ max_prev_coded = stats_start[j].coded_error;
+ }
+ }
+ // Find the max ratio in the following neighborhood.
+ max_next_ratio = 0;
+ max_next_coded = 0;
+ for (int j = i + 1; j <= AOMMIN(i + HALF_WIN, last); j++) {
+ if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash))
+ continue;
+ temp_intra = AOMMAX(stats_start[j].intra_error, 0.01);
+ double temp_ratio = stats_start[j].coded_error / temp_intra;
+ if (temp_ratio > max_next_ratio) {
+ max_next_ratio = temp_ratio;
+ }
+ if (stats_start[j].coded_error > max_next_coded) {
+ max_next_coded = stats_start[j].coded_error;
+ }
+ }
+
+ if (max_prev_ratio < 0.001 && max_next_ratio < 0.001) {
+ // the ratios are very small, only check a small fixed threshold
+ if (this_ratio < 0.02) continue;
+ } else {
+ // check if this frame has a larger ratio than the neighborhood
+ double max_sr = stats_start[i].sr_coded_error;
+ if (i < last) max_sr = AOMMAX(max_sr, stats_start[i + 1].sr_coded_error);
+ double max_sr_fr_ratio =
+ max_sr / AOMMAX(stats_start[i].coded_error, 0.01);
+
+ if (max_sr_fr_ratio > 1.2) continue;
+ if (this_ratio < 2 * AOMMAX(max_prev_ratio, max_next_ratio) &&
+ stats_start[i].coded_error <
+ 2 * AOMMAX(max_prev_coded, max_next_coded)) {
+ continue;
+ }
+ }
+ return i;
+ }
+ return -1;
+}
+
+// Remove the region with index next_region.
+// The merge parameter: 0: merge with previous; 1: merge with next; 2:
+// merge with both, taking the type from the previous region if possible.
+// After removal, next_region will be the index of the next region.
+static void remove_region(int merge, REGIONS *regions, int *num_regions,
+ int *next_region) {
+ int k = *next_region;
+ assert(k < *num_regions);
+ if (*num_regions == 1) {
+ *num_regions = 0;
+ return;
+ }
+ if (k == 0) {
+ merge = 1;
+ } else if (k == *num_regions - 1) {
+ merge = 0;
+ }
+ int num_merge = (merge == 2) ? 2 : 1;
+ switch (merge) {
+ case 0:
+ regions[k - 1].last = regions[k].last;
+ *next_region = k;
+ break;
+ case 1:
+ regions[k + 1].start = regions[k].start;
+ *next_region = k + 1;
+ break;
+ case 2:
+ regions[k - 1].last = regions[k + 1].last;
+ *next_region = k;
+ break;
+ default: assert(0);
+ }
+ *num_regions -= num_merge;
+ for (k = *next_region - (merge == 1); k < *num_regions; k++) {
+ regions[k] = regions[k + num_merge];
+ }
+}
+
+// Insert a region at cur_region_idx. Both start and last should lie inside
+// the current region. After insertion, cur_region_idx will point to the
+// last region split from the original region.
+static void insert_region(int start, int last, REGION_TYPES type,
+ REGIONS *regions, int *num_regions,
+ int *cur_region_idx) {
+ int k = *cur_region_idx;
+ REGION_TYPES this_region_type = regions[k].type;
+ int this_region_last = regions[k].last;
+ int num_add = (start != regions[k].start) + (last != regions[k].last);
+ // move the following regions further to the back
+ for (int r = *num_regions - 1; r > k; r--) {
+ regions[r + num_add] = regions[r];
+ }
+ *num_regions += num_add;
+ if (start > regions[k].start) {
+ regions[k].last = start - 1;
+ k++;
+ regions[k].start = start;
+ }
+ regions[k].type = type;
+ if (last < this_region_last) {
+ regions[k].last = last;
+ k++;
+ regions[k].start = last + 1;
+ regions[k].last = this_region_last;
+ regions[k].type = this_region_type;
+ } else {
+ regions[k].last = this_region_last;
+ }
+ *cur_region_idx = k;
+}
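+
+// Example (illustrative): inserting [5, 7] as BLENDING_REGION into a
+// region spanning [3, 10] splits it into [3, 4] (old type), [5, 7]
+// (BLENDING_REGION) and [8, 10] (old type); cur_region_idx then points at
+// the [8, 10] piece and num_regions grows by 2.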
+
+// Get the average of stats inside a region.
+static void analyze_region(const FIRSTPASS_STATS *stats, int k,
+ REGIONS *regions) {
+ int i;
+ // Reset all running averages, including avg_noise_var, which is also
+ // accumulated below and would otherwise carry stale values across
+ // repeated calls for the same region.
+ regions[k].avg_cor_coeff = 0;
+ regions[k].avg_sr_fr_ratio = 0;
+ regions[k].avg_intra_err = 0;
+ regions[k].avg_coded_err = 0;
+ regions[k].avg_noise_var = 0;
+
+ int check_first_sr = (k != 0);
+
+ for (i = regions[k].start; i <= regions[k].last; i++) {
+ if (i > regions[k].start || check_first_sr) {
+ double num_frames =
+ (double)(regions[k].last - regions[k].start + check_first_sr);
+ double max_coded_error =
+ AOMMAX(stats[i].coded_error, stats[i - 1].coded_error);
+ double this_ratio =
+ stats[i].sr_coded_error / AOMMAX(max_coded_error, 0.001);
+ regions[k].avg_sr_fr_ratio += this_ratio / num_frames;
+ }
+
+ regions[k].avg_intra_err +=
+ stats[i].intra_error / (double)(regions[k].last - regions[k].start + 1);
+ regions[k].avg_coded_err +=
+ stats[i].coded_error / (double)(regions[k].last - regions[k].start + 1);
+
+ regions[k].avg_cor_coeff +=
+ AOMMAX(stats[i].cor_coeff, 0.001) /
+ (double)(regions[k].last - regions[k].start + 1);
+ regions[k].avg_noise_var +=
+ AOMMAX(stats[i].noise_var, 0.001) /
+ (double)(regions[k].last - regions[k].start + 1);
+ }
+}
+
+// Calculate the regions stats of every region.
+static void get_region_stats(const FIRSTPASS_STATS *stats, REGIONS *regions,
+ int num_regions) {
+ for (int k = 0; k < num_regions; k++) {
+ analyze_region(stats, k, regions);
+ }
+}
+
+// Find tentative stable regions
+static int find_stable_regions(const FIRSTPASS_STATS *stats,
+ const double *grad_coded, int this_start,
+ int this_last, REGIONS *regions) {
+ int i, j, k = 0;
+ regions[k].start = this_start;
+ for (i = this_start; i <= this_last; i++) {
+ // Check mean and variance of stats in a window
+ double mean_intra = 0.001, var_intra = 0.001;
+ double mean_coded = 0.001, var_coded = 0.001;
+ int count = 0;
+ for (j = -HALF_WIN; j <= HALF_WIN; j++) {
+ int idx = AOMMIN(AOMMAX(i + j, this_start), this_last);
+ if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue;
+ mean_intra += stats[idx].intra_error;
+ var_intra += stats[idx].intra_error * stats[idx].intra_error;
+ mean_coded += stats[idx].coded_error;
+ var_coded += stats[idx].coded_error * stats[idx].coded_error;
+ count++;
+ }
+
+ REGION_TYPES cur_type;
+ if (count > 0) {
+ mean_intra /= (double)count;
+ var_intra /= (double)count;
+ mean_coded /= (double)count;
+ var_coded /= (double)count;
+ int is_intra_stable = (var_intra / (mean_intra * mean_intra) < 1.03);
+ int is_coded_stable = (var_coded / (mean_coded * mean_coded) < 1.04 &&
+ fabs(grad_coded[i]) / mean_coded < 0.05) ||
+ mean_coded / mean_intra < 0.05;
+ int is_coded_small = mean_coded < 0.5 * mean_intra;
+ cur_type = (is_intra_stable && is_coded_stable && is_coded_small)
+ ? STABLE_REGION
+ : HIGH_VAR_REGION;
+ } else {
+ cur_type = HIGH_VAR_REGION;
+ }
+
+ // mark a new region if type changes
+ if (i == regions[k].start) {
+ // first frame in the region
+ regions[k].type = cur_type;
+ } else if (cur_type != regions[k].type) {
+ // Append a new region
+ regions[k].last = i - 1;
+ regions[k + 1].start = i;
+ regions[k + 1].type = cur_type;
+ k++;
+ }
+ }
+ regions[k].last = this_last;
+ return k + 1;
+}
+
+// Clean up regions that should be removed or merged.
+static void cleanup_regions(REGIONS *regions, int *num_regions) {
+ int k = 0;
+ while (k < *num_regions) {
+ if ((k > 0 && regions[k - 1].type == regions[k].type &&
+ regions[k].type != SCENECUT_REGION) ||
+ regions[k].last < regions[k].start) {
+ remove_region(0, regions, num_regions, &k);
+ } else {
+ k++;
+ }
+ }
+}
+
+// Remove regions of the given type that are shorter than length,
+// merging each with its neighboring regions.
+static void remove_short_regions(REGIONS *regions, int *num_regions,
+ REGION_TYPES type, int length) {
+ int k = 0;
+ while (k < *num_regions && (*num_regions) > 1) {
+ if ((regions[k].last - regions[k].start + 1 < length &&
+ regions[k].type == type)) {
+ // merge current region with the previous and next regions
+ remove_region(2, regions, num_regions, &k);
+ } else {
+ k++;
+ }
+ }
+ cleanup_regions(regions, num_regions);
+}
+
+static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats,
+ REGIONS *regions, int *num_regions) {
+ int i, j, k;
+ // Remove regions that are too short. Likely noise.
+ remove_short_regions(regions, num_regions, STABLE_REGION, HALF_WIN);
+ remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+
+ get_region_stats(stats, regions, *num_regions);
+
+ // Adjust region boundaries. The thresholds are empirically obtained, but
+ // overall the performance is not very sensitive to small changes to them.
+ for (k = 0; k < *num_regions; k++) {
+ if (regions[k].type == STABLE_REGION) continue;
+ if (k > 0) {
+ // Adjust previous boundary.
+ // First find the average intra/coded error in the previous
+ // neighborhood.
+ double avg_intra_err = 0;
+ const int starti = AOMMAX(regions[k - 1].last - WINDOW_SIZE + 1,
+ regions[k - 1].start + 1);
+ const int lasti = regions[k - 1].last;
+ int counti = 0;
+ for (i = starti; i <= lasti; i++) {
+ avg_intra_err += stats[i].intra_error;
+ counti++;
+ }
+ if (counti > 0) {
+ avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001);
+ int count_coded = 0, count_grad = 0;
+ for (j = lasti + 1; j <= regions[k].last; j++) {
+ const int intra_close =
+ fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1;
+ const int coded_small = stats[j].coded_error / avg_intra_err < 0.1;
+ const int coeff_close = stats[j].cor_coeff > 0.995;
+ if (!coeff_close || !coded_small) count_coded--;
+ if (intra_close && count_coded >= 0 && count_grad >= 0) {
+ // this frame probably belongs to the previous stable region
+ regions[k - 1].last = j;
+ regions[k].start = j + 1;
+ } else {
+ break;
+ }
+ }
+ }
+ } // if k > 0
+ if (k < *num_regions - 1) {
+ // Adjust next boundary.
+ // First find the average intra/coded error in the next neighborhood.
+ double avg_intra_err = 0;
+ const int starti = regions[k + 1].start;
+ const int lasti = AOMMIN(regions[k + 1].last - 1,
+ regions[k + 1].start + WINDOW_SIZE - 1);
+ int counti = 0;
+ for (i = starti; i <= lasti; i++) {
+ avg_intra_err += stats[i].intra_error;
+ counti++;
+ }
+ if (counti > 0) {
+ avg_intra_err = AOMMAX(avg_intra_err / (double)counti, 0.001);
+ // At the boundary, coded error is large, but the frame is still stable.
+ int count_coded = 1, count_grad = 1;
+ for (j = starti - 1; j >= regions[k].start; j--) {
+ const int intra_close =
+ fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1;
+ const int coded_small =
+ stats[j + 1].coded_error / avg_intra_err < 0.1;
+ const int coeff_close = stats[j].cor_coeff > 0.995;
+ if (!coeff_close || !coded_small) count_coded--;
+ if (intra_close && count_coded >= 0 && count_grad >= 0) {
+ // this frame probably belongs to the next stable region
+ regions[k + 1].start = j;
+ regions[k].last = j - 1;
+ } else {
+ break;
+ }
+ }
+ }
+ } // if k < *num_regions - 1
+ } // end of loop over all regions
+
+ cleanup_regions(regions, num_regions);
+ remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+ get_region_stats(stats, regions, *num_regions);
+
+ // If a stable region has higher error than its neighboring high-variance
+ // regions, or a lower average correlation, it should be merged with them.
+ k = 0;
+ while (k < *num_regions && (*num_regions) > 1) {
+ if (regions[k].type == STABLE_REGION &&
+ (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE &&
+ ((k > 0 && // previous regions
+ (regions[k].avg_coded_err > regions[k - 1].avg_coded_err * 1.01 ||
+ regions[k].avg_cor_coeff < regions[k - 1].avg_cor_coeff * 0.999)) &&
+ (k < *num_regions - 1 && // next region
+ (regions[k].avg_coded_err > regions[k + 1].avg_coded_err * 1.01 ||
+ regions[k].avg_cor_coeff < regions[k + 1].avg_cor_coeff * 0.999)))) {
+ // merge current region with the previous and next regions
+ remove_region(2, regions, num_regions, &k);
+ analyze_region(stats, k - 1, regions);
+ } else if (regions[k].type == HIGH_VAR_REGION &&
+ (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE &&
+ ((k > 0 && // previous regions
+ (regions[k].avg_coded_err <
+ regions[k - 1].avg_coded_err * 0.99 ||
+ regions[k].avg_cor_coeff >
+ regions[k - 1].avg_cor_coeff * 1.001)) &&
+ (k < *num_regions - 1 && // next region
+ (regions[k].avg_coded_err <
+ regions[k + 1].avg_coded_err * 0.99 ||
+ regions[k].avg_cor_coeff >
+ regions[k + 1].avg_cor_coeff * 1.001)))) {
+ // merge current region with the previous and next regions
+ remove_region(2, regions, num_regions, &k);
+ analyze_region(stats, k - 1, regions);
+ } else {
+ k++;
+ }
+ }
+
+ remove_short_regions(regions, num_regions, STABLE_REGION, WINDOW_SIZE);
+ remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
+}
+
+// Identify blending regions.
+static void find_blending_regions(const FIRSTPASS_STATS *stats,
+ REGIONS *regions, int *num_regions) {
+ int i, k = 0;
+ // Blending regions exhibit large content change and therefore a large,
+ // consistent change in intra error.
+ int count_stable = 0;
+ while (k < *num_regions) {
+ if (regions[k].type == STABLE_REGION) {
+ k++;
+ count_stable++;
+ continue;
+ }
+ int dir = 0;
+ int start = 0, last;
+ for (i = regions[k].start; i <= regions[k].last; i++) {
+ // First mark the regions that have a consistent, large change in intra error.
+ if (k == 0 && i == regions[k].start) continue;
+ if (stats[i].is_flash || (i > 0 && stats[i - 1].is_flash)) continue;
+ double grad = stats[i].intra_error - stats[i - 1].intra_error;
+ int large_change = fabs(grad) / AOMMAX(stats[i].intra_error, 0.01) > 0.05;
+ int this_dir = 0;
+ if (large_change) {
+ this_dir = (grad > 0) ? 1 : -1;
+ }
+ // the current trend continues
+ if (dir == this_dir) continue;
+ if (dir != 0) {
+ // Mark the end of a new large change group and add it
+ last = i - 1;
+ insert_region(start, last, BLENDING_REGION, regions, num_regions, &k);
+ }
+ dir = this_dir;
+ if (k == 0 && i == regions[k].start + 1) {
+ start = i - 1;
+ } else {
+ start = i;
+ }
+ }
+ if (dir != 0) {
+ last = regions[k].last;
+ insert_region(start, last, BLENDING_REGION, regions, num_regions, &k);
+ }
+ k++;
+ }
+
+ // If the blending region has very low correlation, mark it as high
+ // variance since we probably cannot benefit from it anyway.
+ get_region_stats(stats, regions, *num_regions);
+ for (k = 0; k < *num_regions; k++) {
+ if (regions[k].type != BLENDING_REGION) continue;
+ if (regions[k].last == regions[k].start || regions[k].avg_cor_coeff < 0.6 ||
+ count_stable == 0)
+ regions[k].type = HIGH_VAR_REGION;
+ }
+ get_region_stats(stats, regions, *num_regions);
+
+ // It is possible for blending to result in a "dip" in intra error (first
+ // decrease then increase). Therefore we need to find the dip and combine the
+ // two regions.
+ k = 1;
+ while (k < *num_regions) {
+ if (k < *num_regions - 1 && regions[k].type == HIGH_VAR_REGION) {
+ // Check if this short high variance region is actually in the middle of
+ // a blending region.
+ if (regions[k - 1].type == BLENDING_REGION &&
+ regions[k + 1].type == BLENDING_REGION &&
+ regions[k].last - regions[k].start < 3) {
+ int prev_dir = (stats[regions[k - 1].last].intra_error -
+ stats[regions[k - 1].last - 1].intra_error) > 0
+ ? 1
+ : -1;
+ int next_dir = (stats[regions[k + 1].last].intra_error -
+ stats[regions[k + 1].last - 1].intra_error) > 0
+ ? 1
+ : -1;
+ if (prev_dir < 0 && next_dir > 0) {
+ // This is possibly the middle region of a blend. Check the ratios.
+ double ratio_thres = AOMMIN(regions[k - 1].avg_sr_fr_ratio,
+ regions[k + 1].avg_sr_fr_ratio) *
+ 0.95;
+ if (regions[k].avg_sr_fr_ratio > ratio_thres) {
+ regions[k].type = BLENDING_REGION;
+ remove_region(2, regions, num_regions, &k);
+ analyze_region(stats, k - 1, regions);
+ continue;
+ }
+ }
+ }
+ }
+ // Check if we have a pair of consecutive blending regions.
+ if (regions[k - 1].type == BLENDING_REGION &&
+ regions[k].type == BLENDING_REGION) {
+ int prev_dir = (stats[regions[k - 1].last].intra_error -
+ stats[regions[k - 1].last - 1].intra_error) > 0
+ ? 1
+ : -1;
+ int next_dir = (stats[regions[k].last].intra_error -
+ stats[regions[k].last - 1].intra_error) > 0
+ ? 1
+ : -1;
+
+ // if both are too short, no need to check
+ int total_length = regions[k].last - regions[k - 1].start + 1;
+ if (total_length < 4) {
+ regions[k - 1].type = HIGH_VAR_REGION;
+ k++;
+ continue;
+ }
+
+ int to_merge = 0;
+ if (prev_dir < 0 && next_dir > 0) {
+ // In this case we check the last frame in the previous region.
+ double prev_length =
+ (double)(regions[k - 1].last - regions[k - 1].start + 1);
+ double last_ratio, ratio_thres;
+ if (prev_length < 2.01) {
+ // if the previous region is very short
+ double max_coded_error =
+ AOMMAX(stats[regions[k - 1].last].coded_error,
+ stats[regions[k - 1].last - 1].coded_error);
+ last_ratio = stats[regions[k - 1].last].sr_coded_error /
+ AOMMAX(max_coded_error, 0.001);
+ ratio_thres = regions[k].avg_sr_fr_ratio * 0.95;
+ } else {
+ double max_coded_error =
+ AOMMAX(stats[regions[k - 1].last].coded_error,
+ stats[regions[k - 1].last - 1].coded_error);
+ last_ratio = stats[regions[k - 1].last].sr_coded_error /
+ AOMMAX(max_coded_error, 0.001);
+ double prev_ratio =
+ (regions[k - 1].avg_sr_fr_ratio * prev_length - last_ratio) /
+ (prev_length - 1.0);
+ ratio_thres = AOMMIN(prev_ratio, regions[k].avg_sr_fr_ratio) * 0.95;
+ }
+ if (last_ratio > ratio_thres) {
+ to_merge = 1;
+ }
+ }
+
+ if (to_merge) {
+ remove_region(0, regions, num_regions, &k);
+ analyze_region(stats, k - 1, regions);
+ continue;
+ } else {
+ // These are possibly two separate blending regions. Mark the boundary
+ // frame as HIGH_VAR_REGION to separate the two.
+ int prev_k = k - 1;
+ insert_region(regions[prev_k].last, regions[prev_k].last,
+ HIGH_VAR_REGION, regions, num_regions, &prev_k);
+ analyze_region(stats, prev_k, regions);
+ k = prev_k + 1;
+ analyze_region(stats, k, regions);
+ }
+ }
+ k++;
+ }
+ cleanup_regions(regions, num_regions);
+}
+
+// Clean up the blending decisions. Remove blending regions that are too
+// short. Also, if a very short high var region lies between a blending and
+// a stable region, just merge it with one of them.
+static void cleanup_blendings(REGIONS *regions, int *num_regions) {
+ int k = 0;
+ while (k < *num_regions && *num_regions > 1) {
+ int is_short_blending = regions[k].type == BLENDING_REGION &&
+ regions[k].last - regions[k].start + 1 < 5;
+ int is_short_hv = regions[k].type == HIGH_VAR_REGION &&
+ regions[k].last - regions[k].start + 1 < 5;
+ int has_stable_neighbor =
+ ((k > 0 && regions[k - 1].type == STABLE_REGION) ||
+ (k < *num_regions - 1 && regions[k + 1].type == STABLE_REGION));
+ int has_blend_neighbor =
+ ((k > 0 && regions[k - 1].type == BLENDING_REGION) ||
+ (k < *num_regions - 1 && regions[k + 1].type == BLENDING_REGION));
+ int total_neighbors = (k > 0) + (k < *num_regions - 1);
+
+ if (is_short_blending ||
+ (is_short_hv &&
+ has_stable_neighbor + has_blend_neighbor >= total_neighbors)) {
+ // Remove this region. Try to determine whether to combine it with the
+ // previous or next region.
+ int merge;
+ double prev_diff =
+ (k > 0)
+ ? fabs(regions[k].avg_cor_coeff - regions[k - 1].avg_cor_coeff)
+ : 1;
+ double next_diff =
+ (k < *num_regions - 1)
+ ? fabs(regions[k].avg_cor_coeff - regions[k + 1].avg_cor_coeff)
+ : 1;
+ // merge == 0 means to merge with previous, 1 means to merge with next
+ merge = prev_diff > next_diff;
+ remove_region(merge, regions, num_regions, &k);
+ } else {
+ k++;
+ }
+ }
+ cleanup_regions(regions, num_regions);
+}
+
+static void free_firstpass_stats_buffers(REGIONS *temp_regions,
+ double *filt_intra_err,
+ double *filt_coded_err,
+ double *grad_coded) {
+ aom_free(temp_regions);
+ aom_free(filt_intra_err);
+ aom_free(filt_coded_err);
+ aom_free(grad_coded);
+}
+
+// Identify stable and unstable regions from first pass stats.
+// stats_start points to the first frame to analyze.
+// |offset| is the offset from the current frame to the frame stats_start is
+// pointing to.
+// Returns 0 on success, -1 on memory allocation failure.
+static int identify_regions(const FIRSTPASS_STATS *const stats_start,
+ int total_frames, int offset, REGIONS *regions,
+ int *total_regions) {
+ int k;
+ if (total_frames <= 1) return 0;
+
+ // store the initial decisions
+ REGIONS *temp_regions =
+ (REGIONS *)aom_malloc(total_frames * sizeof(temp_regions[0]));
+ // buffers for filtered stats
+ double *filt_intra_err =
+ (double *)aom_calloc(total_frames, sizeof(*filt_intra_err));
+ double *filt_coded_err =
+ (double *)aom_calloc(total_frames, sizeof(*filt_coded_err));
+ double *grad_coded = (double *)aom_calloc(total_frames, sizeof(*grad_coded));
+ if (!(temp_regions && filt_intra_err && filt_coded_err && grad_coded)) {
+ free_firstpass_stats_buffers(temp_regions, filt_intra_err, filt_coded_err,
+ grad_coded);
+ return -1;
+ }
+ av1_zero_array(temp_regions, total_frames);
+
+ int cur_region = 0, this_start = 0, this_last;
+
+ int next_scenecut = -1;
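+  // Process the clip one scenecut-delimited segment at a time: classify the
+  // regions inside the segment, then append a single-frame SCENECUT_REGION
+  // and continue from the frame after the cut.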
+ do {
+ // first get the obvious scenecuts
+ next_scenecut =
+ find_next_scenecut(stats_start, this_start, total_frames - 1);
+ this_last = (next_scenecut >= 0) ? (next_scenecut - 1) : total_frames - 1;
+
+ // low-pass filter the needed stats
+ smooth_filter_stats(stats_start, this_start, this_last, filt_intra_err,
+ filt_coded_err);
+ get_gradient(filt_coded_err, this_start, this_last, grad_coded);
+
+ // find tentative stable regions and unstable regions
+ int num_regions = find_stable_regions(stats_start, grad_coded, this_start,
+ this_last, temp_regions);
+
+ adjust_unstable_region_bounds(stats_start, temp_regions, &num_regions);
+
+ get_region_stats(stats_start, temp_regions, num_regions);
+
+ // Try to identify blending regions in the unstable regions
+ find_blending_regions(stats_start, temp_regions, &num_regions);
+ cleanup_blendings(temp_regions, &num_regions);
+
+ // The flash points should all be considered high variance points
+ k = 0;
+ while (k < num_regions) {
+ if (temp_regions[k].type != STABLE_REGION) {
+ k++;
+ continue;
+ }
+ int start = temp_regions[k].start;
+ int last = temp_regions[k].last;
+ for (int i = start; i <= last; i++) {
+ if (stats_start[i].is_flash) {
+ insert_region(i, i, HIGH_VAR_REGION, temp_regions, &num_regions, &k);
+ }
+ }
+ k++;
+ }
+ cleanup_regions(temp_regions, &num_regions);
+
+ // copy the regions in the scenecut group
+ for (k = 0; k < num_regions; k++) {
+ if (temp_regions[k].last < temp_regions[k].start &&
+ k == num_regions - 1) {
+ num_regions--;
+ break;
+ }
+ regions[k + cur_region] = temp_regions[k];
+ }
+ cur_region += num_regions;
+
+ // add the scenecut region
+ if (next_scenecut > -1) {
+ // add the scenecut region, and find the next scenecut
+ regions[cur_region].type = SCENECUT_REGION;
+ regions[cur_region].start = next_scenecut;
+ regions[cur_region].last = next_scenecut;
+ cur_region++;
+ this_start = next_scenecut + 1;
+ }
+ } while (next_scenecut >= 0);
+
+ *total_regions = cur_region;
+ get_region_stats(stats_start, regions, *total_regions);
+
+ for (k = 0; k < *total_regions; k++) {
+ // If scenecuts are very minor, mark them as high variance.
+ if (regions[k].type != SCENECUT_REGION ||
+ regions[k].avg_cor_coeff *
+ (1 - stats_start[regions[k].start].noise_var /
+ regions[k].avg_intra_err) <
+ 0.8) {
+ continue;
+ }
+ regions[k].type = HIGH_VAR_REGION;
+ }
+ cleanup_regions(regions, total_regions);
+ get_region_stats(stats_start, regions, *total_regions);
+
+ for (k = 0; k < *total_regions; k++) {
+ regions[k].start += offset;
+ regions[k].last += offset;
+ }
+
+ free_firstpass_stats_buffers(temp_regions, filt_intra_err, filt_coded_err,
+ grad_coded);
+ return 0;
+}
+
+static int find_regions_index(const REGIONS *regions, int num_regions,
+ int frame_idx) {
+ for (int k = 0; k < num_regions; k++) {
+ if (regions[k].start <= frame_idx && regions[k].last >= frame_idx) {
+ return k;
+ }
+ }
+ return -1;
+}
+
+/*!\brief Determine the length of future GF groups.
+ *
+ * \ingroup gf_group_algo
+ * This function decides the gf group lengths of future frames in a batch.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] max_gop_length Maximum length of the GF group
+ * \param[in] max_intervals Maximum number of intervals to decide
+ *
+ * \remark Nothing is returned. Instead, cpi->ppi->rc.gf_intervals is
+ * changed to store the decided GF group lengths.
+ */
+static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length,
+ int max_intervals) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FIRSTPASS_STATS next_frame;
+ const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+ const FIRSTPASS_STATS *const stats = start_pos - (rc->frames_since_key == 0);
+
+ const int f_w = cpi->common.width;
+ const int f_h = cpi->common.height;
+ int i;
+
+ int flash_detected;
+
+ av1_zero(next_frame);
+
+ if (has_no_stats_stage(cpi)) {
+ for (i = 0; i < MAX_NUM_GF_INTERVALS; i++) {
+ p_rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length);
+ }
+ p_rc->cur_gf_index = 0;
+ rc->intervals_till_gf_calculate_due = MAX_NUM_GF_INTERVALS;
+ return;
+ }
+
+ // TODO(urvang): Try logic to vary min and max interval based on q.
+ const int active_min_gf_interval = rc->min_gf_interval;
+ const int active_max_gf_interval =
+ AOMMIN(rc->max_gf_interval, max_gop_length);
+ const int min_shrink_int = AOMMAX(MIN_SHRINK_LEN, active_min_gf_interval);
+
+ i = (rc->frames_since_key == 0);
+ max_intervals = cpi->ppi->lap_enabled ? 1 : max_intervals;
+ int count_cuts = 1;
+  // If cpi->ppi->gf_state.arf_gf_boost_lst is 0, we are starting with a KF
+  // or GF.
+ int cur_start = -1 + !cpi->ppi->gf_state.arf_gf_boost_lst, cur_last;
+ int cut_pos[MAX_NUM_GF_INTERVALS + 1] = { -1 };
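+  // cut_pos[n] holds the last frame of the n-th decided GF group; the
+  // leading -1 entry lets interval lengths be taken as simple differences.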
+ int cut_here;
+ GF_GROUP_STATS gf_stats;
+ init_gf_stats(&gf_stats);
+ while (count_cuts < max_intervals + 1) {
+ // reaches next key frame, break here
+ if (i >= rc->frames_to_key) {
+ cut_here = 2;
+ } else if (i - cur_start >= rc->static_scene_max_gf_interval) {
+ // reached maximum len, but nothing special yet (almost static)
+ // let's look at the next interval
+ cut_here = 1;
+ } else if (EOF == input_stats(twopass, &cpi->twopass_frame, &next_frame)) {
+ // reaches last frame, break
+ cut_here = 2;
+ } else {
+ // Test for the case where there is a brief flash but the prediction
+ // quality back to an earlier frame is then restored.
+ flash_detected = detect_flash(twopass, &cpi->twopass_frame, 0);
+ // TODO(bohanli): remove redundant accumulations here, or unify
+ // this and the ones in define_gf_group
+ accumulate_next_frame_stats(&next_frame, flash_detected,
+ rc->frames_since_key, i, &gf_stats, f_w, f_h);
+
+ cut_here = detect_gf_cut(cpi, i, cur_start, flash_detected,
+ active_max_gf_interval, active_min_gf_interval,
+ &gf_stats);
+ }
+ if (cut_here) {
+ cur_last = i - 1; // the current last frame in the gf group
+ int ori_last = cur_last;
+ // The region frame idx does not start from the same frame as cur_start
+ // and cur_last. Need to offset them.
+ int offset = rc->frames_since_key - p_rc->regions_offset;
+ REGIONS *regions = p_rc->regions;
+ int num_regions = p_rc->num_regions;
+
+ int scenecut_idx = -1;
+ // only try shrinking if interval smaller than active_max_gf_interval
+ if (cur_last - cur_start <= active_max_gf_interval &&
+ cur_last > cur_start) {
+ // find the region indices of where the first and last frame belong.
+ int k_start =
+ find_regions_index(regions, num_regions, cur_start + offset);
+ int k_last =
+ find_regions_index(regions, num_regions, cur_last + offset);
+ if (cur_start + offset == 0) k_start = 0;
+
+ // See if we have a scenecut in between
+ for (int r = k_start + 1; r <= k_last; r++) {
+ if (regions[r].type == SCENECUT_REGION &&
+ regions[r].last - offset - cur_start > active_min_gf_interval) {
+ scenecut_idx = r;
+ break;
+ }
+ }
+
+ // if the found scenecut is very close to the end, ignore it.
+        if (scenecut_idx >= 0 &&
+            regions[num_regions - 1].last - regions[scenecut_idx].last < 4) {
+ scenecut_idx = -1;
+ }
+
+ if (scenecut_idx != -1) {
+ // If we have a scenecut, then stop at it.
+ // TODO(bohanli): add logic here to stop before the scenecut and for
+ // the next gop start from the scenecut with GF
+ int is_minor_sc =
+ (regions[scenecut_idx].avg_cor_coeff *
+ (1 - stats[regions[scenecut_idx].start - offset].noise_var /
+ regions[scenecut_idx].avg_intra_err) >
+ 0.6);
+ cur_last = regions[scenecut_idx].last - offset - !is_minor_sc;
+ } else {
+ int is_last_analysed = (k_last == num_regions - 1) &&
+ (cur_last + offset == regions[k_last].last);
+ int not_enough_regions =
+ k_last - k_start <=
+ 1 + (regions[k_start].type == SCENECUT_REGION);
+ // if we are very close to the end, then do not shrink since it may
+ // introduce intervals that are too short
+ if (!(is_last_analysed && not_enough_regions)) {
+ const double arf_length_factor = 0.1;
+ double best_score = 0;
+ int best_j = -1;
+ const int first_frame = regions[0].start - offset;
+ const int last_frame = regions[num_regions - 1].last - offset;
+ // score of how much the arf helps the whole GOP
+ double base_score = 0.0;
+            // Accumulate base_score up to the minimum GOP length. Each step
+            // applies base = (base + 1) * cor_coeff, so poorly correlated
+            // frames shrink the accumulated score.
+ for (int j = cur_start + 1; j < cur_start + min_shrink_int; j++) {
+ if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break;
+ base_score = (base_score + 1.0) * stats[j].cor_coeff;
+ }
+ int met_blending = 0; // Whether we have met blending areas before
+            int last_blending = 0;  // Whether the previous frame is blending
+ for (int j = cur_start + min_shrink_int; j <= cur_last; j++) {
+ if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break;
+ base_score = (base_score + 1.0) * stats[j].cor_coeff;
+ int this_reg =
+ find_regions_index(regions, num_regions, j + offset);
+ if (this_reg < 0) continue;
+ // A GOP should include at most 1 blending region.
+ if (regions[this_reg].type == BLENDING_REGION) {
+ last_blending = 1;
+ if (met_blending) {
+ break;
+ } else {
+ base_score = 0;
+ continue;
+ }
+ } else {
+ if (last_blending) met_blending = 1;
+ last_blending = 0;
+ }
+
+ // Add the factor of how good the neighborhood is for this
+ // candidate arf.
+ double this_score = arf_length_factor * base_score;
+ double temp_accu_coeff = 1.0;
+ // following frames
+ int count_f = 0;
+ for (int n = j + 1; n <= j + 3 && n <= last_frame; n++) {
+ if (stats + n >= twopass->stats_buf_ctx->stats_in_end) break;
+ temp_accu_coeff *= stats[n].cor_coeff;
+ this_score +=
+ temp_accu_coeff *
+ sqrt(AOMMAX(0.5,
+ 1 - stats[n].noise_var /
+ AOMMAX(stats[n].intra_error, 0.001)));
+ count_f++;
+ }
+ // preceding frames
+ temp_accu_coeff = 1.0;
+ for (int n = j; n > j - 3 * 2 + count_f && n > first_frame; n--) {
+ if (stats + n < twopass->stats_buf_ctx->stats_in_start) break;
+ temp_accu_coeff *= stats[n].cor_coeff;
+ this_score +=
+ temp_accu_coeff *
+ sqrt(AOMMAX(0.5,
+ 1 - stats[n].noise_var /
+ AOMMAX(stats[n].intra_error, 0.001)));
+ }
+
+ if (this_score > best_score) {
+ best_score = this_score;
+ best_j = j;
+ }
+ }
+
+ // For blending areas, move one more frame in case we missed the
+ // first blending frame.
+ int best_reg =
+ find_regions_index(regions, num_regions, best_j + offset);
+ if (best_reg < num_regions - 1 && best_reg > 0) {
+ if (regions[best_reg - 1].type == BLENDING_REGION &&
+ regions[best_reg + 1].type == BLENDING_REGION) {
+ if (best_j + offset == regions[best_reg].start &&
+ best_j + offset < regions[best_reg].last) {
+ best_j += 1;
+ } else if (best_j + offset == regions[best_reg].last &&
+ best_j + offset > regions[best_reg].start) {
+ best_j -= 1;
+ }
+ }
+ }
+
+ if (cur_last - best_j < 2) best_j = cur_last;
+ if (best_j > 0 && best_score > 0.1) cur_last = best_j;
+            // if we cannot find anything, just cut at the original place.
+ }
+ }
+ }
+ cut_pos[count_cuts] = cur_last;
+ count_cuts++;
+
+ // reset pointers to the shrunken location
+ cpi->twopass_frame.stats_in = start_pos + cur_last;
+ cur_start = cur_last;
+ int cur_region_idx =
+ find_regions_index(regions, num_regions, cur_start + 1 + offset);
+ if (cur_region_idx >= 0)
+ if (regions[cur_region_idx].type == SCENECUT_REGION) cur_start++;
+
+ i = cur_last;
+
+ if (cut_here > 1 && cur_last == ori_last) break;
+
+ // reset accumulators
+ init_gf_stats(&gf_stats);
+ }
+ ++i;
+ }
+
+ // save intervals
+ rc->intervals_till_gf_calculate_due = count_cuts - 1;
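+  // e.g. cut_pos = { -1, 15, 31 } yields two gf_intervals of 16 frames each.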
+ for (int n = 1; n < count_cuts; n++) {
+ p_rc->gf_intervals[n - 1] = cut_pos[n] - cut_pos[n - 1];
+ }
+ p_rc->cur_gf_index = 0;
+ cpi->twopass_frame.stats_in = start_pos;
+}
+
+static void correct_frames_to_key(AV1_COMP *cpi) {
+ int lookahead_size =
+ (int)av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
+ if (lookahead_size <
+ av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage)) {
+ assert(
+ IMPLIES(cpi->oxcf.pass != AOM_RC_ONE_PASS && cpi->ppi->frames_left > 0,
+ lookahead_size == cpi->ppi->frames_left));
+ cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, lookahead_size);
+ } else if (cpi->ppi->frames_left > 0) {
+ // Correct frames to key based on limit
+ cpi->rc.frames_to_key =
+ AOMMIN(cpi->rc.frames_to_key, cpi->ppi->frames_left);
+ }
+}
+
+/*!\brief Define a GF group in one pass mode when no look ahead stats are
+ * available.
+ *
+ * \ingroup gf_group_algo
+ * This function defines the structure of a GF group, along with various
+ * parameters regarding bit-allocation and quality setup in the special
+ * case of one pass encoding where no lookahead stats are available.
+ *
+ * \param[in] cpi Top-level encoder structure
+ *
+ * \remark Nothing is returned. Instead, cpi->ppi->gf_group is changed.
+ */
+static void define_gf_group_pass0(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const GFConfig *const gf_cfg = &oxcf->gf_cfg;
+ int target;
+
+ if (oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
+ av1_cyclic_refresh_set_golden_update(cpi);
+ } else {
+ p_rc->baseline_gf_interval = p_rc->gf_intervals[p_rc->cur_gf_index];
+ rc->intervals_till_gf_calculate_due--;
+ p_rc->cur_gf_index++;
+ }
+
+ // correct frames_to_key when lookahead queue is flushing
+ correct_frames_to_key(cpi);
+
+ if (p_rc->baseline_gf_interval > rc->frames_to_key)
+ p_rc->baseline_gf_interval = rc->frames_to_key;
+
+ p_rc->gfu_boost = DEFAULT_GF_BOOST;
+ p_rc->constrained_gf_group =
+ (p_rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0;
+
+ gf_group->max_layer_depth_allowed = oxcf->gf_cfg.gf_max_pyr_height;
+
+  // Rare case when the look-ahead is shorter than the target GOP length; an
+  // ARF frame cannot be generated.
+ if (p_rc->baseline_gf_interval > gf_cfg->lag_in_frames ||
+ !is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) ||
+ p_rc->baseline_gf_interval < rc->min_gf_interval)
+ gf_group->max_layer_depth_allowed = 0;
+
+ // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+ av1_gop_setup_structure(cpi);
+
+ // Allocate bits to each of the frames in the GF group.
+ // TODO(sarahparker) Extend this to work with pyramid structure.
+ for (int cur_index = 0; cur_index < gf_group->size; ++cur_index) {
+ const FRAME_UPDATE_TYPE cur_update_type = gf_group->update_type[cur_index];
+ if (oxcf->rc_cfg.mode == AOM_CBR) {
+ if (cur_update_type == KF_UPDATE) {
+ target = av1_calc_iframe_target_size_one_pass_cbr(cpi);
+ } else {
+ target = av1_calc_pframe_target_size_one_pass_cbr(cpi, cur_update_type);
+ }
+ } else {
+ if (cur_update_type == KF_UPDATE) {
+ target = av1_calc_iframe_target_size_one_pass_vbr(cpi);
+ } else {
+ target = av1_calc_pframe_target_size_one_pass_vbr(cpi, cur_update_type);
+ }
+ }
+ gf_group->bit_allocation[cur_index] = target;
+ }
+}
+
+static INLINE void set_baseline_gf_interval(PRIMARY_RATE_CONTROL *p_rc,
+ int arf_position) {
+ p_rc->baseline_gf_interval = arf_position;
+}
+
+// initialize GF_GROUP_STATS
+static void init_gf_stats(GF_GROUP_STATS *gf_stats) {
+ gf_stats->gf_group_err = 0.0;
+ gf_stats->gf_group_raw_error = 0.0;
+ gf_stats->gf_group_skip_pct = 0.0;
+ gf_stats->gf_group_inactive_zone_rows = 0.0;
+
+ gf_stats->mv_ratio_accumulator = 0.0;
+ gf_stats->decay_accumulator = 1.0;
+ gf_stats->zero_motion_accumulator = 1.0;
+ gf_stats->loop_decay_rate = 1.0;
+ gf_stats->last_loop_decay_rate = 1.0;
+ gf_stats->this_frame_mv_in_out = 0.0;
+ gf_stats->mv_in_out_accumulator = 0.0;
+ gf_stats->abs_mv_in_out_accumulator = 0.0;
+
+ gf_stats->avg_sr_coded_error = 0.0;
+ gf_stats->avg_pcnt_second_ref = 0.0;
+ gf_stats->avg_new_mv_count = 0.0;
+ gf_stats->avg_wavelet_energy = 0.0;
+ gf_stats->avg_raw_err_stdev = 0.0;
+ gf_stats->non_zero_stdev_count = 0;
+}
+
+static void accumulate_gop_stats(AV1_COMP *cpi, int is_intra_only, int f_w,
+ int f_h, FIRSTPASS_STATS *next_frame,
+ const FIRSTPASS_STATS *start_pos,
+ GF_GROUP_STATS *gf_stats, int *idx) {
+ int i, flash_detected;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ RATE_CONTROL *const rc = &cpi->rc;
+ FRAME_INFO *frame_info = &cpi->frame_info;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ init_gf_stats(gf_stats);
+ av1_zero(*next_frame);
+
+ // If this is a key frame or the overlay from a previous arf then
+ // the error score / cost of this frame has already been accounted for.
+ i = is_intra_only;
+ // get the determined gf group length from p_rc->gf_intervals
+ while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) {
+ // read in the next frame
+ if (EOF == input_stats(twopass, &cpi->twopass_frame, next_frame)) break;
+ // Accumulate error score of frames in this gf group.
+ double mod_frame_err =
+ calculate_modified_err(frame_info, twopass, oxcf, next_frame);
+ // accumulate stats for this frame
+ accumulate_this_frame_stats(next_frame, mod_frame_err, gf_stats);
+ ++i;
+ }
+
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+
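+  // Second scan over the same frames: accumulate the inter-frame statistics
+  // (flash detection, motion, decay) used to evaluate the group.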
+ i = is_intra_only;
+ input_stats(twopass, &cpi->twopass_frame, next_frame);
+ while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) {
+ // read in the next frame
+ if (EOF == input_stats(twopass, &cpi->twopass_frame, next_frame)) break;
+
+ // Test for the case where there is a brief flash but the prediction
+ // quality back to an earlier frame is then restored.
+ flash_detected = detect_flash(twopass, &cpi->twopass_frame, 0);
+
+ // accumulate stats for next frame
+ accumulate_next_frame_stats(next_frame, flash_detected,
+ rc->frames_since_key, i, gf_stats, f_w, f_h);
+
+ ++i;
+ }
+
+ i = p_rc->gf_intervals[p_rc->cur_gf_index];
+ average_gf_stats(i, gf_stats);
+
+ *idx = i;
+}
+
+static void update_gop_length(RATE_CONTROL *rc, PRIMARY_RATE_CONTROL *p_rc,
+ int idx, int is_final_pass) {
+ if (is_final_pass) {
+ rc->intervals_till_gf_calculate_due--;
+ p_rc->cur_gf_index++;
+ }
+
+ // Was the group length constrained by the requirement for a new KF?
+ p_rc->constrained_gf_group = (idx >= rc->frames_to_key) ? 1 : 0;
+
+ set_baseline_gf_interval(p_rc, idx);
+ rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+}
+
+#define MAX_GF_BOOST 5400
+#define REDUCE_GF_LENGTH_THRESH 4
+#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
+#define REDUCE_GF_LENGTH_BY 1
+static void set_gop_bits_boost(AV1_COMP *cpi, int i, int is_intra_only,
+ int is_final_pass, int use_alt_ref,
+ int alt_offset, const FIRSTPASS_STATS *start_pos,
+ GF_GROUP_STATS *gf_stats) {
+ // Should we use the alternate reference frame.
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ FRAME_INFO *frame_info = &cpi->frame_info;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+ int ext_len = i - is_intra_only;
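+  // ext_len excludes the key frame or overlay frame, whose cost has already
+  // been accounted for.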
+ if (use_alt_ref) {
+ const int forward_frames = (rc->frames_to_key - i >= ext_len)
+ ? ext_len
+ : AOMMAX(0, rc->frames_to_key - i);
+
+ // Calculate the boost for alt ref.
+ p_rc->gfu_boost = av1_calc_arf_boost(
+ twopass, &cpi->twopass_frame, p_rc, frame_info, alt_offset,
+ forward_frames, ext_len, &p_rc->num_stats_used_for_gfu_boost,
+ &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled);
+ } else {
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+ p_rc->gfu_boost = AOMMIN(
+ MAX_GF_BOOST,
+ av1_calc_arf_boost(
+ twopass, &cpi->twopass_frame, p_rc, frame_info, alt_offset, ext_len,
+ 0, &p_rc->num_stats_used_for_gfu_boost,
+ &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled));
+ }
+
+#define LAST_ALR_BOOST_FACTOR 0.2f
+ p_rc->arf_boost_factor = 1.0;
+ if (use_alt_ref && !is_lossless_requested(rc_cfg)) {
+ // Reduce the boost of altref in the last gf group
+ if (rc->frames_to_key - ext_len == REDUCE_GF_LENGTH_BY ||
+ rc->frames_to_key - ext_len == 0) {
+ p_rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
+ }
+ }
+
+ // Reset the file position.
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+ if (cpi->ppi->lap_enabled) {
+ // Since we don't have enough stats to know the actual error of the
+    // gf group, we assume the error of each frame to be equal to 1 and set
+ // the error of the group as baseline_gf_interval.
+ gf_stats->gf_group_err = p_rc->baseline_gf_interval;
+ }
+ // Calculate the bits to be allocated to the gf/arf group as a whole
+ p_rc->gf_group_bits =
+ calculate_total_gf_group_bits(cpi, gf_stats->gf_group_err);
+
+#if GROUP_ADAPTIVE_MAXQ
+ // Calculate an estimate of the maxq needed for the group.
+ // We are more aggressive about correcting for sections
+ // where there could be significant overshoot than for easier
+ // sections where we do not wish to risk creating an overshoot
+ // of the allocated bit budget.
+ if ((rc_cfg->mode != AOM_Q) && (p_rc->baseline_gf_interval > 1) &&
+ is_final_pass) {
+ const int vbr_group_bits_per_frame =
+ (int)(p_rc->gf_group_bits / p_rc->baseline_gf_interval);
+ const double group_av_err =
+ gf_stats->gf_group_raw_error / p_rc->baseline_gf_interval;
+ const double group_av_skip_pct =
+ gf_stats->gf_group_skip_pct / p_rc->baseline_gf_interval;
+ const double group_av_inactive_zone =
+ ((gf_stats->gf_group_inactive_zone_rows * 2) /
+ (p_rc->baseline_gf_interval * (double)cm->mi_params.mb_rows));
+
+ int tmp_q;
+ tmp_q = get_twopass_worst_quality(
+ cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
+ vbr_group_bits_per_frame);
+ rc->active_worst_quality = AOMMAX(tmp_q, rc->active_worst_quality >> 1);
+ }
+#endif
+
+ // Adjust KF group bits and error remaining.
+ if (is_final_pass) twopass->kf_group_error_left -= gf_stats->gf_group_err;
+
+ // Reset the file position.
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ if (rc->frames_since_key != 0) {
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_pos, twopass->stats_buf_ctx->stats_in_end,
+ p_rc->baseline_gf_interval);
+ }
+
+ av1_gop_bit_allocation(cpi, rc, gf_group, rc->frames_since_key == 0,
+ use_alt_ref, p_rc->gf_group_bits);
+
+ // TODO(jingning): Generalize this condition.
+ if (is_final_pass) {
+ cpi->ppi->gf_state.arf_gf_boost_lst = use_alt_ref;
+
+ // Reset rolling actual and target bits counters for ARF groups.
+ twopass->rolling_arf_group_target_bits = 1;
+ twopass->rolling_arf_group_actual_bits = 1;
+ }
+#if CONFIG_BITRATE_ACCURACY
+ if (is_final_pass) {
+ av1_vbr_rc_set_gop_bit_budget(&cpi->vbr_rc_info,
+ p_rc->baseline_gf_interval);
+ }
+#endif
+}
+
+/*!\brief Define a GF group.
+ *
+ * \ingroup gf_group_algo
+ * This function defines the structure of a GF group, along with various
+ * parameters regarding bit-allocation and quality setup.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] frame_params Structure with frame parameters
+ * \param[in]    is_final_pass   Non-zero if this is the final pass for the
+ *                               GF group, rather than a trial
+ *
+ * \remark Nothing is returned. Instead, cpi->ppi->gf_group is changed.
+ */
+static void define_gf_group(AV1_COMP *cpi, EncodeFrameParams *frame_params,
+ int is_final_pass) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FIRSTPASS_STATS next_frame;
+ const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ const GFConfig *const gf_cfg = &oxcf->gf_cfg;
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+ const int f_w = cm->width;
+ const int f_h = cm->height;
+ int i;
+ const int is_intra_only = rc->frames_since_key == 0;
+
+ cpi->ppi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1);
+
+ // Reset the GF group data structures unless this is a key
+ // frame in which case it will already have been done.
+ if (!is_intra_only) {
+ av1_zero(cpi->ppi->gf_group);
+ cpi->gf_frame_index = 0;
+ }
+
+ if (has_no_stats_stage(cpi)) {
+ define_gf_group_pass0(cpi);
+ return;
+ }
+
+ if (cpi->third_pass_ctx && oxcf->pass == AOM_RC_THIRD_PASS) {
+ int ret = define_gf_group_pass3(cpi, frame_params, is_final_pass);
+ if (ret == 0) return;
+
+ av1_free_thirdpass_ctx(cpi->third_pass_ctx);
+ cpi->third_pass_ctx = NULL;
+ }
+
+ // correct frames_to_key when lookahead queue is emptying
+ if (cpi->ppi->lap_enabled) {
+ correct_frames_to_key(cpi);
+ }
+
+ GF_GROUP_STATS gf_stats;
+ accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame, start_pos,
+ &gf_stats, &i);
+
+ const int can_disable_arf = !gf_cfg->gf_min_pyr_height;
+
+ // If this is a key frame or the overlay from a previous arf then
+ // the error score / cost of this frame has already been accounted for.
+ const int active_min_gf_interval = rc->min_gf_interval;
+
+ // Disable internal ARFs for "still" gf groups.
+ // zero_motion_accumulator: minimum percentage of (0,0) motion;
+ // avg_sr_coded_error: average of the SSE per pixel of each frame;
+ // avg_raw_err_stdev: average of the standard deviation of (0,0)
+ // motion error per block of each frame.
+ const int can_disable_internal_arfs = gf_cfg->gf_min_pyr_height <= 1;
+ if (can_disable_internal_arfs &&
+ gf_stats.zero_motion_accumulator > MIN_ZERO_MOTION &&
+ gf_stats.avg_sr_coded_error < MAX_SR_CODED_ERROR &&
+ gf_stats.avg_raw_err_stdev < MAX_RAW_ERR_VAR) {
+ cpi->ppi->internal_altref_allowed = 0;
+ }
+
+ int use_alt_ref;
+ if (can_disable_arf) {
+ use_alt_ref =
+ !is_almost_static(gf_stats.zero_motion_accumulator,
+ twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled) &&
+ p_rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) &&
+ (i >= MIN_GF_INTERVAL);
+ } else {
+ use_alt_ref = p_rc->use_arf_in_this_kf_group &&
+ (i < gf_cfg->lag_in_frames) && (i > 2);
+ }
+ if (use_alt_ref) {
+ gf_group->max_layer_depth_allowed = gf_cfg->gf_max_pyr_height;
+ } else {
+ gf_group->max_layer_depth_allowed = 0;
+ }
+
+ int alt_offset = 0;
+ // The length reduction strategy is tweaked for certain cases, and doesn't
+ // work well for certain other cases.
+ const int allow_gf_length_reduction =
+ ((rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 128) ||
+ !cpi->ppi->internal_altref_allowed) &&
+ !is_lossless_requested(rc_cfg);
+
+ if (allow_gf_length_reduction && use_alt_ref) {
+    // adjust the length of this gf group if one of the following conditions
+    // is met:
+ // 1: only one overlay frame left and this gf is too long
+ // 2: next gf group is too short to have arf compared to the current gf
+
+ // maximum length of next gf group
+ const int next_gf_len = rc->frames_to_key - i;
+ const int single_overlay_left =
+ next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH;
+    // the next gf is probably going to have an ARF but it will be shorter
+    // than this gf
+ const int unbalanced_gf =
+ i > REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+ next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+ next_gf_len + 1 >= rc->min_gf_interval;
+
+ if (single_overlay_left || unbalanced_gf) {
+ const int roll_back = REDUCE_GF_LENGTH_BY;
+ // Reduce length only if active_min_gf_interval will be respected later.
+ if (i - roll_back >= active_min_gf_interval + 1) {
+ alt_offset = -roll_back;
+ i -= roll_back;
+ if (is_final_pass) rc->intervals_till_gf_calculate_due = 0;
+ p_rc->gf_intervals[p_rc->cur_gf_index] -= roll_back;
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+ accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame,
+ start_pos, &gf_stats, &i);
+ }
+ }
+ }
+
+ update_gop_length(rc, p_rc, i, is_final_pass);
+
+ // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+ av1_gop_setup_structure(cpi);
+
+ set_gop_bits_boost(cpi, i, is_intra_only, is_final_pass, use_alt_ref,
+ alt_offset, start_pos, &gf_stats);
+
+ frame_params->frame_type =
+ rc->frames_since_key == 0 ? KEY_FRAME : INTER_FRAME;
+ frame_params->show_frame =
+ !(gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE);
+}
+
+/*!\brief Define a GF group for the third pass.
+ *
+ * \ingroup gf_group_algo
+ * This function defines the structure of a GF group for the third pass, along
+ * with various parameters regarding bit-allocation and quality setup based on
+ * the two-pass bitstream.
+ * Much of the function still uses the strategies from the second pass and
+ * relies on first pass statistics. It is expected that over time these
+ * portions will be replaced with strategies specific to the third pass.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] frame_params Structure with frame parameters
+ * \param[in]    is_final_pass  Non-zero if this is the final pass for the
+ *                              GF group, rather than a trial
+ *
+ * \return 0: Success;
+ *          -1: There are conflicts between the bitstream and the current
+ *              config. The values in cpi->ppi->gf_group are also changed.
+ */
+static int define_gf_group_pass3(AV1_COMP *cpi, EncodeFrameParams *frame_params,
+ int is_final_pass) {
+ if (!cpi->third_pass_ctx) return -1;
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ FIRSTPASS_STATS next_frame;
+ const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ const GFConfig *const gf_cfg = &oxcf->gf_cfg;
+ const int f_w = cm->width;
+ const int f_h = cm->height;
+ int i;
+ const int is_intra_only = rc->frames_since_key == 0;
+
+ cpi->ppi->internal_altref_allowed = (gf_cfg->gf_max_pyr_height > 1);
+
+ // Reset the GF group data structures unless this is a key
+ // frame in which case it will already have been done.
+ if (!is_intra_only) {
+ av1_zero(cpi->ppi->gf_group);
+ cpi->gf_frame_index = 0;
+ }
+
+ GF_GROUP_STATS gf_stats;
+ accumulate_gop_stats(cpi, is_intra_only, f_w, f_h, &next_frame, start_pos,
+ &gf_stats, &i);
+
+ const int can_disable_arf = !gf_cfg->gf_min_pyr_height;
+
+ // TODO(any): set cpi->ppi->internal_altref_allowed accordingly;
+
+ int use_alt_ref = av1_check_use_arf(cpi->third_pass_ctx);
+ if (use_alt_ref == 0 && !can_disable_arf) return -1;
+ if (use_alt_ref) {
+ gf_group->max_layer_depth_allowed = gf_cfg->gf_max_pyr_height;
+ } else {
+ gf_group->max_layer_depth_allowed = 0;
+ }
+
+ update_gop_length(rc, p_rc, i, is_final_pass);
+
+ // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+ av1_gop_setup_structure(cpi);
+
+ set_gop_bits_boost(cpi, i, is_intra_only, is_final_pass, use_alt_ref, 0,
+ start_pos, &gf_stats);
+
+ frame_params->frame_type = cpi->third_pass_ctx->frame_info[0].frame_type;
+ frame_params->show_frame = cpi->third_pass_ctx->frame_info[0].is_show_frame;
+ return 0;
+}
+
+// #define FIXED_ARF_BITS
+#ifdef FIXED_ARF_BITS
+#define ARF_BITS_FRACTION 0.75
+#endif
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+ GF_GROUP *gf_group, int is_key_frame, int use_arf,
+ int64_t gf_group_bits) {
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ // Calculate the extra bits to be used for boosted frame(s)
+#ifdef FIXED_ARF_BITS
+ int gf_arf_bits = (int)(ARF_BITS_FRACTION * gf_group_bits);
+#else
+ int gf_arf_bits = calculate_boost_bits(
+ p_rc->baseline_gf_interval - (rc->frames_since_key == 0), p_rc->gfu_boost,
+ gf_group_bits);
+#endif
+
+ gf_arf_bits = adjust_boost_bits_for_target_level(cpi, rc, gf_arf_bits,
+ gf_group_bits, 1);
+
+ // Allocate bits to each of the frames in the GF group.
+ allocate_gf_group_bits(gf_group, p_rc, rc, gf_group_bits, gf_arf_bits,
+ is_key_frame, use_arf);
+}
+
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks (discounting neutral blocks in this
+// way helps catch scene cuts in clips with very flat areas or letterbox
+// format clips with image padding).
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+// In such a case even if the frame is not a scene cut coding a key frame
+// may be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 1.9
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+// For real scene cuts we expect an improvement in the intra / inter error
+// ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
+#define KF_II_MAX 128.0
+// Intra / Inter threshold very low
+#define VERY_LOW_II 1.5
+// For clean slide transitions we expect a sharp single-frame spike in error.
+#define ERROR_SPIKE 5.0
+
+// Slide show transition detection.
+// Tests for the case where there is very low error on either side of the
+// current frame but much higher just for this frame. This can help detect
+// key frames in slide shows even where the slides are pictures of different
+// sizes.
+// Also requires that intra and inter errors are very similar to help eliminate
+// harmful false positives.
+// It will not help if the transition is a fade or other multi-frame effect.
+static int slide_transition(const FIRSTPASS_STATS *this_frame,
+ const FIRSTPASS_STATS *last_frame,
+ const FIRSTPASS_STATS *next_frame) {
+ return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) &&
+ (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) &&
+ (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE));
+}
+
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+// We adapt the threshold based on the number of frames in this key-frame
+// group so far.
+static double get_second_ref_usage_thresh(int frame_count_so_far) {
+ const int adapt_upto = 32;
+ const double min_second_ref_usage_thresh = 0.085;
+ const double second_ref_usage_thresh_max_delta = 0.035;
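+  // The threshold ramps linearly with the frame count: 0.085 at frame 0 up
+  // to 0.12 at frame 31 and beyond.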
+ if (frame_count_so_far >= adapt_upto) {
+ return min_second_ref_usage_thresh + second_ref_usage_thresh_max_delta;
+ }
+ return min_second_ref_usage_thresh +
+ ((double)frame_count_so_far / (adapt_upto - 1)) *
+ second_ref_usage_thresh_max_delta;
+}
+
+static int test_candidate_kf(const FIRSTPASS_INFO *firstpass_info,
+ int this_stats_index, int frame_count_so_far,
+ enum aom_rc_mode rc_mode, int scenecut_mode,
+ int num_mbs) {
+ const FIRSTPASS_STATS *last_stats =
+ av1_firstpass_info_peek(firstpass_info, this_stats_index - 1);
+ const FIRSTPASS_STATS *this_stats =
+ av1_firstpass_info_peek(firstpass_info, this_stats_index);
+ const FIRSTPASS_STATS *next_stats =
+ av1_firstpass_info_peek(firstpass_info, this_stats_index + 1);
+ if (last_stats == NULL || this_stats == NULL || next_stats == NULL) {
+ return 0;
+ }
+
+ int is_viable_kf = 0;
+ double pcnt_intra = 1.0 - this_stats->pcnt_inter;
+ double modified_pcnt_inter =
+ this_stats->pcnt_inter - this_stats->pcnt_neutral;
+ const double second_ref_usage_thresh =
+ get_second_ref_usage_thresh(frame_count_so_far);
+ int frames_to_test_after_candidate_key = SCENE_CUT_KEY_TEST_INTERVAL;
+ int count_for_tolerable_prediction = 3;
+
+ // We do "-1" because the candidate key is not counted.
+ int stats_after_this_stats =
+ av1_firstpass_info_future_count(firstpass_info, this_stats_index) - 1;
+
+ if (scenecut_mode == ENABLE_SCENECUT_MODE_1) {
+ if (stats_after_this_stats < 3) {
+ return 0;
+ } else {
+ frames_to_test_after_candidate_key = 3;
+ count_for_tolerable_prediction = 1;
+ }
+ }
+ // Make sure we have enough stats after the candidate key.
+ frames_to_test_after_candidate_key =
+ AOMMIN(frames_to_test_after_candidate_key, stats_after_this_stats);
+
+ // Does the frame satisfy the primary criteria of a key frame?
+ // See above for an explanation of the test criteria.
+ // If so, then examine how well it predicts subsequent frames.
+ if (IMPLIES(rc_mode == AOM_Q, frame_count_so_far >= 3) &&
+ (this_stats->pcnt_second_ref < second_ref_usage_thresh) &&
+ (next_stats->pcnt_second_ref < second_ref_usage_thresh) &&
+ ((this_stats->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+ slide_transition(this_stats, last_stats, next_stats) ||
+ ((pcnt_intra > MIN_INTRA_LEVEL) &&
+ (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
+ ((this_stats->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_stats->coded_error)) <
+ KF_II_ERR_THRESHOLD) &&
+ ((fabs(last_stats->coded_error - this_stats->coded_error) /
+ DOUBLE_DIVIDE_CHECK(this_stats->coded_error) >
+ ERR_CHANGE_THRESHOLD) ||
+ (fabs(last_stats->intra_error - this_stats->intra_error) /
+ DOUBLE_DIVIDE_CHECK(this_stats->intra_error) >
+ ERR_CHANGE_THRESHOLD) ||
+ ((next_stats->intra_error /
+ DOUBLE_DIVIDE_CHECK(next_stats->coded_error)) >
+ II_IMPROVEMENT_THRESHOLD))))) {
+ int i;
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double decay_accumulator = 1.0;
+
+ // Examine how well the key frame predicts subsequent frames.
+ for (i = 1; i <= frames_to_test_after_candidate_key; ++i) {
+ // Get the next frame details
+ const FIRSTPASS_STATS *local_next_frame =
+ av1_firstpass_info_peek(firstpass_info, this_stats_index + i);
+ double next_iiratio =
+ (BOOST_FACTOR * local_next_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(local_next_frame->coded_error));
+
+ if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX;
+
+ // Cumulative effect of decay in prediction quality.
+ if (local_next_frame->pcnt_inter > 0.85)
+ decay_accumulator *= local_next_frame->pcnt_inter;
+ else
+ decay_accumulator *= (0.85 + local_next_frame->pcnt_inter) / 2.0;
+
+ // Keep a running total.
+ boost_score += (decay_accumulator * next_iiratio);
+
+ // Test various breakout clauses.
+ // TODO(any): Test of intra error should be normalized to an MB.
+ if ((local_next_frame->pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
+ (((local_next_frame->pcnt_inter - local_next_frame->pcnt_neutral) <
+ 0.20) &&
+ (next_iiratio < 3.0)) ||
+ ((boost_score - old_boost_score) < 3.0) ||
+ (local_next_frame->intra_error < (200.0 / (double)num_mbs))) {
+ break;
+ }
+
+ old_boost_score = boost_score;
+ }
+
+    // If there is tolerable prediction for at least the next 3 frames then
+    // break out; otherwise discard this potential key frame and move on.
+ if (boost_score > 30.0 && (i > count_for_tolerable_prediction)) {
+ is_viable_kf = 1;
+ } else {
+ is_viable_kf = 0;
+ }
+ }
+ return is_viable_kf;
+}
+
+#define FRAMES_TO_CHECK_DECAY 8
+#define KF_MIN_FRAME_BOOST 80.0
+#define KF_MAX_FRAME_BOOST 128.0
+#define MIN_KF_BOOST 600 // Minimum boost for non-static KF interval
+#define MAX_KF_BOOST 3200
+#define MIN_STATIC_KF_BOOST 5400 // Minimum boost for static KF interval
+
+static int detect_app_forced_key(AV1_COMP *cpi) {
+ int num_frames_to_app_forced_key = is_forced_keyframe_pending(
+ cpi->ppi->lookahead, cpi->ppi->lookahead->max_sz, cpi->compressor_stage);
+ return num_frames_to_app_forced_key;
+}
+
+static int get_projected_kf_boost(AV1_COMP *cpi) {
+ /*
+ * If num_stats_used_for_kf_boost >= frames_to_key, then
+ * all stats needed for prior boost calculation are available.
+   * Hence projecting the prior boost is not needed in this case.
+ */
+ if (cpi->ppi->p_rc.num_stats_used_for_kf_boost >= cpi->rc.frames_to_key)
+ return cpi->ppi->p_rc.kf_boost;
+
+ // Get the current tpl factor (number of frames = frames_to_key).
+ double tpl_factor = av1_get_kf_boost_projection_factor(cpi->rc.frames_to_key);
+ // Get the tpl factor when number of frames = num_stats_used_for_kf_boost.
+ double tpl_factor_num_stats = av1_get_kf_boost_projection_factor(
+ cpi->ppi->p_rc.num_stats_used_for_kf_boost);
+ int projected_kf_boost =
+ (int)rint((tpl_factor * cpi->ppi->p_rc.kf_boost) / tpl_factor_num_stats);
+ return projected_kf_boost;
+}
+
+/*!\brief Determine the location of the next key frame
+ *
+ * \ingroup gf_group_algo
+ * This function decides the placement of the next key frame when a
+ * scenecut is detected or the maximum key frame distance is reached.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] firstpass_info struct for firstpass info
+ * \param[in] num_frames_to_detect_scenecut Maximum lookahead frames.
+ * \param[in]    search_start_idx   The start index for the key frame search.
+ *                                  Set it to one if we already know the
+ *                                  current frame is a key frame. Otherwise,
+ *                                  set it to zero.
+ *
+ * \return       Number of frames to the next key frame, including the
+ *               current frame.
+ */
+static int define_kf_interval(AV1_COMP *cpi,
+ const FIRSTPASS_INFO *firstpass_info,
+ int num_frames_to_detect_scenecut,
+ int search_start_idx) {
+ const TWO_PASS *const twopass = &cpi->ppi->twopass;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
+ double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+ double decay_accumulator = 1.0;
+ int i = 0, j;
+ int frames_to_key = search_start_idx;
+ int frames_since_key = rc->frames_since_key + 1;
+ int scenecut_detected = 0;
+
+ int num_frames_to_next_key = detect_app_forced_key(cpi);
+
+ if (num_frames_to_detect_scenecut == 0) {
+ if (num_frames_to_next_key != -1)
+ return num_frames_to_next_key;
+ else
+ return rc->frames_to_key;
+ }
+
+ if (num_frames_to_next_key != -1)
+ num_frames_to_detect_scenecut =
+ AOMMIN(num_frames_to_detect_scenecut, num_frames_to_next_key);
+
+ // Initialize the decay rates for the recent frames to check
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
+
+ i = 0;
+ const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.mi_params.MBs;
+ const int future_stats_count =
+ av1_firstpass_info_future_count(firstpass_info, 0);
+ while (frames_to_key < future_stats_count &&
+ frames_to_key < num_frames_to_detect_scenecut) {
+ // Provided that we are not at the end of the file...
+ if ((cpi->ppi->p_rc.enable_scenecut_detection > 0) && kf_cfg->auto_key &&
+ frames_to_key + 1 < future_stats_count) {
+ double loop_decay_rate;
+
+ // Check for a scene cut.
+ if (frames_since_key >= kf_cfg->key_freq_min) {
+ scenecut_detected = test_candidate_kf(
+ &twopass->firstpass_info, frames_to_key, frames_since_key,
+ oxcf->rc_cfg.mode, cpi->ppi->p_rc.enable_scenecut_detection,
+ num_mbs);
+ if (scenecut_detected) {
+ break;
+ }
+ }
+
+ // How fast is the prediction quality decaying?
+ const FIRSTPASS_STATS *next_stats =
+ av1_firstpass_info_peek(firstpass_info, frames_to_key + 1);
+ loop_decay_rate = get_prediction_decay_rate(next_stats);
+
+      // Here we are interested in decay over the recent past, rather than,
+      // as elsewhere, decay in prediction quality since the last GF or KF.
+ recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
+ decay_accumulator = 1.0;
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+ decay_accumulator *= recent_loop_decay[j];
+
+ // Special check for transition or high motion followed by a
+ // static scene.
+ if (frames_since_key >= kf_cfg->key_freq_min) {
+ scenecut_detected = detect_transition_to_still(
+ firstpass_info, frames_to_key + 1, rc->min_gf_interval, i,
+ kf_cfg->key_freq_max - i, loop_decay_rate, decay_accumulator);
+ if (scenecut_detected) {
+          // In the case of a transition followed by a static scene, the key
+          // frame could be a good predictor for the following frames, so we
+          // do not use an arf.
+ p_rc->use_arf_in_this_kf_group = 0;
+ break;
+ }
+ }
+
+ // Step on to the next frame.
+ ++frames_to_key;
+ ++frames_since_key;
+
+ // If we don't have a real key frame within the next two
+ // key_freq_max intervals then break out of the loop.
+ if (frames_to_key >= 2 * kf_cfg->key_freq_max) {
+ break;
+ }
+ } else {
+ ++frames_to_key;
+ ++frames_since_key;
+ }
+ ++i;
+ }
+ if (cpi->ppi->lap_enabled && !scenecut_detected)
+ frames_to_key = num_frames_to_next_key;
+
+ return frames_to_key;
+}
+
+static double get_kf_group_avg_error(TWO_PASS *twopass,
+ TWO_PASS_FRAME *twopass_frame,
+ const FIRSTPASS_STATS *first_frame,
+ const FIRSTPASS_STATS *start_position,
+ int frames_to_key) {
+ FIRSTPASS_STATS cur_frame = *first_frame;
+ int num_frames, i;
+ double kf_group_avg_error = 0.0;
+
+ reset_fpf_position(twopass_frame, start_position);
+
+ for (i = 0; i < frames_to_key; ++i) {
+ kf_group_avg_error += cur_frame.coded_error;
+ if (EOF == input_stats(twopass, twopass_frame, &cur_frame)) break;
+ }
+ num_frames = i + 1;
+ num_frames = AOMMIN(num_frames, frames_to_key);
+ kf_group_avg_error = kf_group_avg_error / num_frames;
+
+ return (kf_group_avg_error);
+}
+
+static int64_t get_kf_group_bits(AV1_COMP *cpi, double kf_group_err,
+ double kf_group_avg_error) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ int64_t kf_group_bits;
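+  // Under LAP only the average bandwidth is known, so budget an average-sized
+  // share per frame; with full two-pass stats, apportion bits_left by this
+  // group's share of the remaining modified error.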
+ if (cpi->ppi->lap_enabled) {
+ kf_group_bits = (int64_t)rc->frames_to_key * rc->avg_frame_bandwidth;
+ if (cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap) {
+ double vbr_corpus_complexity_lap =
+ cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap / 10.0;
+      /* Scale the group's bit budget by its average complexity relative to
+         the assumed corpus complexity. */
+ kf_group_bits = (int64_t)(
+ kf_group_bits * (kf_group_avg_error / vbr_corpus_complexity_lap));
+ }
+ } else {
+ kf_group_bits = (int64_t)(twopass->bits_left *
+ (kf_group_err / twopass->modified_error_left));
+ }
+
+ return kf_group_bits;
+}
+
+static int calc_avg_stats(AV1_COMP *cpi, FIRSTPASS_STATS *avg_frame_stat) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FIRSTPASS_STATS cur_frame;
+ av1_zero(cur_frame);
+ int num_frames = 0;
+ // Accumulate total stat using available number of stats.
+ for (num_frames = 0; num_frames < (rc->frames_to_key - 1); ++num_frames) {
+ if (EOF == input_stats(twopass, &cpi->twopass_frame, &cur_frame)) break;
+ av1_accumulate_stats(avg_frame_stat, &cur_frame);
+ }
+
+ if (num_frames < 2) {
+ return num_frames;
+ }
+ // Average the total stat
+ avg_frame_stat->weight = avg_frame_stat->weight / num_frames;
+ avg_frame_stat->intra_error = avg_frame_stat->intra_error / num_frames;
+ avg_frame_stat->frame_avg_wavelet_energy =
+ avg_frame_stat->frame_avg_wavelet_energy / num_frames;
+ avg_frame_stat->coded_error = avg_frame_stat->coded_error / num_frames;
+ avg_frame_stat->sr_coded_error = avg_frame_stat->sr_coded_error / num_frames;
+ avg_frame_stat->pcnt_inter = avg_frame_stat->pcnt_inter / num_frames;
+ avg_frame_stat->pcnt_motion = avg_frame_stat->pcnt_motion / num_frames;
+ avg_frame_stat->pcnt_second_ref =
+ avg_frame_stat->pcnt_second_ref / num_frames;
+ avg_frame_stat->pcnt_neutral = avg_frame_stat->pcnt_neutral / num_frames;
+ avg_frame_stat->intra_skip_pct = avg_frame_stat->intra_skip_pct / num_frames;
+ avg_frame_stat->inactive_zone_rows =
+ avg_frame_stat->inactive_zone_rows / num_frames;
+ avg_frame_stat->inactive_zone_cols =
+ avg_frame_stat->inactive_zone_cols / num_frames;
+ avg_frame_stat->MVr = avg_frame_stat->MVr / num_frames;
+ avg_frame_stat->mvr_abs = avg_frame_stat->mvr_abs / num_frames;
+ avg_frame_stat->MVc = avg_frame_stat->MVc / num_frames;
+ avg_frame_stat->mvc_abs = avg_frame_stat->mvc_abs / num_frames;
+ avg_frame_stat->MVrv = avg_frame_stat->MVrv / num_frames;
+ avg_frame_stat->MVcv = avg_frame_stat->MVcv / num_frames;
+ avg_frame_stat->mv_in_out_count =
+ avg_frame_stat->mv_in_out_count / num_frames;
+ avg_frame_stat->new_mv_count = avg_frame_stat->new_mv_count / num_frames;
+ avg_frame_stat->count = avg_frame_stat->count / num_frames;
+ avg_frame_stat->duration = avg_frame_stat->duration / num_frames;
+
+ return num_frames;
+}
+
+static double get_kf_boost_score(AV1_COMP *cpi, double kf_raw_err,
+ double *zero_motion_accumulator,
+ double *sr_accumulator, int use_avg_stat) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FRAME_INFO *const frame_info = &cpi->frame_info;
+ FIRSTPASS_STATS frame_stat;
+ av1_zero(frame_stat);
+ int i = 0, num_stat_used = 0;
+ double boost_score = 0.0;
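+  // In AOM_Q mode the per-frame boost cap scales with the KF interval,
+  // clamped to [KF_MIN_FRAME_BOOST, KF_MAX_FRAME_BOOST].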
+ const double kf_max_boost =
+ cpi->oxcf.rc_cfg.mode == AOM_Q
+ ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST),
+ KF_MAX_FRAME_BOOST)
+ : KF_MAX_FRAME_BOOST;
+
+ // Calculate the average using available number of stats.
+ if (use_avg_stat) num_stat_used = calc_avg_stats(cpi, &frame_stat);
+
+ for (i = num_stat_used; i < (rc->frames_to_key - 1); ++i) {
+ if (!use_avg_stat &&
+ EOF == input_stats(twopass, &cpi->twopass_frame, &frame_stat))
+ break;
+
+ // Monitor for static sections.
+    // For the first frame in the kf group, the second ref indicator is
+    // invalid.
+ if (i > 0) {
+ *zero_motion_accumulator =
+ AOMMIN(*zero_motion_accumulator, get_zero_motion_factor(&frame_stat));
+ } else {
+ *zero_motion_accumulator = frame_stat.pcnt_inter - frame_stat.pcnt_motion;
+ }
+
+ // Not all frames in the group are necessarily used in calculating boost.
+ if ((*sr_accumulator < (kf_raw_err * 1.50)) &&
+ (i <= rc->max_gf_interval * 2)) {
+ double frame_boost;
+ double zm_factor;
+
+ // Factor 0.75-1.25 based on how much of frame is static.
+ zm_factor = (0.75 + (*zero_motion_accumulator / 2.0));
+
+ if (i < 2) *sr_accumulator = 0.0;
+ frame_boost =
+ calc_kf_frame_boost(&cpi->ppi->p_rc, frame_info, &frame_stat,
+ sr_accumulator, kf_max_boost);
+ boost_score += frame_boost * zm_factor;
+ }
+ }
+ return boost_score;
+}
+
+/*!\brief Interval (in seconds) to clip the key-frame distance to under LAP.
+ */
+#define MAX_KF_BITS_INTERVAL_SINGLE_PASS 5
+
+/*!\brief Determine the next key frame group
+ *
+ * \ingroup gf_group_algo
+ * This function decides the placement of the next key frame, and
+ * calculates the bit allocation of the KF group and the keyframe itself.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] this_frame Pointer to first pass stats
+ */
+static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ FRAME_INFO *const frame_info = &cpi->frame_info;
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
+ const FIRSTPASS_STATS first_frame = *this_frame;
+ FIRSTPASS_STATS next_frame;
+ const FIRSTPASS_INFO *firstpass_info = &twopass->firstpass_info;
+ av1_zero(next_frame);
+
+ rc->frames_since_key = 0;
+ // Use arfs if possible.
+ p_rc->use_arf_in_this_kf_group = is_altref_enabled(
+ oxcf->gf_cfg.lag_in_frames, oxcf->gf_cfg.enable_auto_arf);
+
+ // Reset the GF group data structures.
+ av1_zero(*gf_group);
+ cpi->gf_frame_index = 0;
+
+ // KF is always a GF so clear frames till next gf counter.
+ rc->frames_till_gf_update_due = 0;
+
+ if (has_no_stats_stage(cpi)) {
+ int num_frames_to_app_forced_key = detect_app_forced_key(cpi);
+ p_rc->this_key_frame_forced =
+ current_frame->frame_number != 0 && rc->frames_to_key == 0;
+ if (num_frames_to_app_forced_key != -1)
+ rc->frames_to_key = num_frames_to_app_forced_key;
+ else
+ rc->frames_to_key = AOMMAX(1, kf_cfg->key_freq_max);
+ correct_frames_to_key(cpi);
+ p_rc->kf_boost = DEFAULT_KF_BOOST;
+ gf_group->update_type[0] = KF_UPDATE;
+ return;
+ }
+ int i;
+ const FIRSTPASS_STATS *const start_position = cpi->twopass_frame.stats_in;
+ int kf_bits = 0;
+ double zero_motion_accumulator = 1.0;
+ double boost_score = 0.0;
+ double kf_raw_err = 0.0;
+ double kf_mod_err = 0.0;
+ double sr_accumulator = 0.0;
+ double kf_group_avg_error = 0.0;
+ int frames_to_key, frames_to_key_clipped = INT_MAX;
+ int64_t kf_group_bits_clipped = INT64_MAX;
+
+  // Is this a key frame forced by the maximum key frame interval?
+ p_rc->this_key_frame_forced = p_rc->next_key_frame_forced;
+
+ twopass->kf_group_bits = 0; // Total bits available to kf group
+ twopass->kf_group_error_left = 0; // Group modified error score.
+
+ kf_raw_err = this_frame->intra_error;
+ kf_mod_err = calculate_modified_err(frame_info, twopass, oxcf, this_frame);
+
+ // We assume the current frame is a key frame and we are looking for the next
+ // key frame. Therefore search_start_idx = 1
+ frames_to_key = define_kf_interval(cpi, firstpass_info, kf_cfg->key_freq_max,
+ /*search_start_idx=*/1);
+
+ if (frames_to_key != -1) {
+ rc->frames_to_key = AOMMIN(kf_cfg->key_freq_max, frames_to_key);
+ } else {
+ rc->frames_to_key = kf_cfg->key_freq_max;
+ }
+
+ if (cpi->ppi->lap_enabled) correct_frames_to_key(cpi);
+
+ // If there is a max kf interval set by the user we must obey it.
+ // We already breakout of the loop above at 2x max.
+ // This code centers the extra kf if the actual natural interval
+ // is between 1x and 2x.
+ if (kf_cfg->auto_key && rc->frames_to_key > kf_cfg->key_freq_max) {
+ FIRSTPASS_STATS tmp_frame = first_frame;
+
+ rc->frames_to_key /= 2;
+
+ // Reset to the start of the group.
+ reset_fpf_position(&cpi->twopass_frame, start_position);
+ // Rescan to get the correct error data for the forced kf group.
+ for (i = 0; i < rc->frames_to_key; ++i) {
+ if (EOF == input_stats(twopass, &cpi->twopass_frame, &tmp_frame)) break;
+ }
+ p_rc->next_key_frame_forced = 1;
+ } else if ((cpi->twopass_frame.stats_in ==
+ twopass->stats_buf_ctx->stats_in_end &&
+ is_stat_consumption_stage_twopass(cpi)) ||
+ rc->frames_to_key >= kf_cfg->key_freq_max) {
+ p_rc->next_key_frame_forced = 1;
+ } else {
+ p_rc->next_key_frame_forced = 0;
+ }
+
+ double kf_group_err = 0;
+ for (i = 0; i < rc->frames_to_key; ++i) {
+ const FIRSTPASS_STATS *this_stats =
+ av1_firstpass_info_peek(&twopass->firstpass_info, i);
+ if (this_stats != NULL) {
+ // Accumulate kf group error.
+ kf_group_err += calculate_modified_err_new(
+ frame_info, &firstpass_info->total_stats, this_stats,
+ oxcf->rc_cfg.vbrbias, twopass->modified_error_min,
+ twopass->modified_error_max);
+ ++p_rc->num_stats_used_for_kf_boost;
+ }
+ }
+
+ // Calculate the number of bits that should be assigned to the kf group.
+ if ((twopass->bits_left > 0 && twopass->modified_error_left > 0.0) ||
+ (cpi->ppi->lap_enabled && oxcf->rc_cfg.mode != AOM_Q)) {
+ // Maximum number of bits for a single normal frame (not key frame).
+ const int max_bits = frame_max_bits(rc, oxcf);
+
+ // Maximum number of bits allocated to the key frame group.
+ int64_t max_grp_bits;
+
+ if (oxcf->rc_cfg.vbr_corpus_complexity_lap) {
+ kf_group_avg_error =
+ get_kf_group_avg_error(twopass, &cpi->twopass_frame, &first_frame,
+ start_position, rc->frames_to_key);
+ }
+
+ // Default allocation based on bits left and relative
+ // complexity of the section.
+ twopass->kf_group_bits =
+ get_kf_group_bits(cpi, kf_group_err, kf_group_avg_error);
+ // Clip based on maximum per frame rate defined by the user.
+ max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
+ if (twopass->kf_group_bits > max_grp_bits)
+ twopass->kf_group_bits = max_grp_bits;
+ } else {
+ twopass->kf_group_bits = 0;
+ }
+ twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits);
+
+ if (cpi->ppi->lap_enabled) {
+ // In the case of single pass based on LAP, frames to key may have an
+ // inaccurate value, and hence should be clipped to an appropriate
+ // interval.
+ frames_to_key_clipped =
+ (int)(MAX_KF_BITS_INTERVAL_SINGLE_PASS * cpi->framerate);
+
+ // Compute the bits allocated to the kf group using the clipped
+ // frames_to_key.
+ if (rc->frames_to_key > frames_to_key_clipped) {
+ kf_group_bits_clipped =
+ (int64_t)((double)twopass->kf_group_bits * frames_to_key_clipped /
+ rc->frames_to_key);
+ }
+ }
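+
+ // Worked example (hypothetical numbers, assuming
+ // MAX_KF_BITS_INTERVAL_SINGLE_PASS is expressed in seconds): at 30 fps
+ // and an interval of 4 s, frames_to_key_clipped = 120. If
+ // frames_to_key = 600 and kf_group_bits = 6,000,000, the clipped budget
+ // is 6,000,000 * 120 / 600 = 1,200,000 bits.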
+
+ // Reset the first pass file position.
+ reset_fpf_position(&cpi->twopass_frame, start_position);
+
+ // Scan through the kf group collating various stats used to determine
+ // how many bits to spend on it.
+ boost_score = get_kf_boost_score(cpi, kf_raw_err, &zero_motion_accumulator,
+ &sr_accumulator, 0);
+ reset_fpf_position(&cpi->twopass_frame, start_position);
+ // Store the zero motion percentage
+ twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_position, twopass->stats_buf_ctx->stats_in_end, rc->frames_to_key);
+
+ p_rc->kf_boost = (int)boost_score;
+
+ if (cpi->ppi->lap_enabled) {
+ if (oxcf->rc_cfg.mode == AOM_Q) {
+ p_rc->kf_boost = get_projected_kf_boost(cpi);
+ } else {
+ // TODO(any): Explore using average frame stats for AOM_Q as well.
+ boost_score = get_kf_boost_score(
+ cpi, kf_raw_err, &zero_motion_accumulator, &sr_accumulator, 1);
+ reset_fpf_position(&cpi->twopass_frame, start_position);
+ p_rc->kf_boost += (int)boost_score;
+ }
+ }
+
+ // Special case for static / slide show content, but don't apply it
+ // if the kf group is very short.
+ if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) &&
+ (rc->frames_to_key > 8)) {
+ p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_STATIC_KF_BOOST);
+ } else {
+ // Apply various clamps for min and max boost
+ p_rc->kf_boost = AOMMAX(p_rc->kf_boost, (rc->frames_to_key * 3));
+ p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_KF_BOOST);
+#ifdef STRICT_RC
+ p_rc->kf_boost = AOMMIN(p_rc->kf_boost, MAX_KF_BOOST);
+#endif
+ }
+
+ // Work out how many bits to allocate for the key frame itself.
+ // In case of LAP enabled for VBR, if the frames_to_key value is
+ // very high, we calculate the bits based on a clipped value of
+ // frames_to_key.
+ kf_bits = calculate_boost_bits(
+ AOMMIN(rc->frames_to_key, frames_to_key_clipped) - 1, p_rc->kf_boost,
+ AOMMIN(twopass->kf_group_bits, kf_group_bits_clipped));
+ // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n",
+ // p_rc->kf_boost,
+ // kf_bits, twopass->kf_zeromotion_pct);
+ kf_bits = adjust_boost_bits_for_target_level(cpi, rc, kf_bits,
+ twopass->kf_group_bits, 0);
+
+ twopass->kf_group_bits -= kf_bits;
+
+ // Save the bits to spend on the key frame.
+ gf_group->bit_allocation[0] = kf_bits;
+ gf_group->update_type[0] = KF_UPDATE;
+
+ // Note the total error score of the kf group minus the key frame itself.
+ if (cpi->ppi->lap_enabled)
+ // As we don't have enough stats to know the actual error of the group,
+ // we assume the complexity of each frame to be equal to 1, and set the
+ // error as the number of frames in the group (minus the keyframe).
+ twopass->kf_group_error_left = (double)(rc->frames_to_key - 1);
+ else
+ twopass->kf_group_error_left = kf_group_err - kf_mod_err;
+
+ // Adjust the count of total modified error left.
+ // The count of bits left is adjusted elsewhere based on real coded frame
+ // sizes.
+ twopass->modified_error_left -= kf_group_err;
+}
+
+#define ARF_STATS_OUTPUT 0
+#if ARF_STATS_OUTPUT
+unsigned int arf_count = 0;
+#endif
+
+static int get_section_target_bandwidth(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ int section_target_bandwidth;
+ const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count -
+ current_frame->frame_number);
+ if (cpi->ppi->lap_enabled)
+ section_target_bandwidth = (int)rc->avg_frame_bandwidth;
+ else
+ section_target_bandwidth = (int)(twopass->bits_left / frames_left);
+ return section_target_bandwidth;
+}
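+
+ // Example (hypothetical numbers): with bits_left = 5,000,000 and 100
+ // frames remaining, the section target is 50,000 bits per frame.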
+
+static INLINE void set_twopass_params_based_on_fp_stats(
+ AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame_ptr) {
+ if (this_frame_ptr == NULL) return;
+
+ TWO_PASS_FRAME *twopass_frame = &cpi->twopass_frame;
+ // Use log1p() of the frame's intra error as a measure of average MB
+ // energy (log of the error plus one, to avoid log(0)).
+ twopass_frame->mb_av_energy = log1p(this_frame_ptr->intra_error);
+
+ const FIRSTPASS_STATS *const total_stats =
+ cpi->ppi->twopass.stats_buf_ctx->total_stats;
+ if (is_fp_wavelet_energy_invalid(total_stats) == 0) {
+ twopass_frame->frame_avg_haar_energy =
+ log1p(this_frame_ptr->frame_avg_wavelet_energy);
+ }
+
+ // Set the frame content type flag.
+ if (this_frame_ptr->intra_skip_pct >= FC_ANIMATION_THRESH)
+ twopass_frame->fr_content_type = FC_GRAPHICS_ANIMATION;
+ else
+ twopass_frame->fr_content_type = FC_NORMAL;
+}
+
+static void process_first_pass_stats(AV1_COMP *cpi,
+ FIRSTPASS_STATS *this_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats;
+
+ if (cpi->oxcf.rc_cfg.mode != AOM_Q && current_frame->frame_number == 0 &&
+ cpi->gf_frame_index == 0 && total_stats &&
+ twopass->stats_buf_ctx->total_left_stats) {
+ if (cpi->ppi->lap_enabled) {
+ /*
+ * Accumulate total_stats using the limited number of stats available,
+ * and assign it to total_left_stats.
+ */
+ *twopass->stats_buf_ctx->total_left_stats = *total_stats;
+ }
+ // Special case code for first frame.
+ const int section_target_bandwidth = get_section_target_bandwidth(cpi);
+ const double section_length =
+ twopass->stats_buf_ctx->total_left_stats->count;
+ const double section_error =
+ twopass->stats_buf_ctx->total_left_stats->coded_error / section_length;
+ const double section_intra_skip =
+ twopass->stats_buf_ctx->total_left_stats->intra_skip_pct /
+ section_length;
+ const double section_inactive_zone =
+ (twopass->stats_buf_ctx->total_left_stats->inactive_zone_rows * 2) /
+ ((double)cm->mi_params.mb_rows * section_length);
+ const int tmp_q = get_twopass_worst_quality(
+ cpi, section_error, section_intra_skip + section_inactive_zone,
+ section_target_bandwidth);
+
+ rc->active_worst_quality = tmp_q;
+ rc->ni_av_qi = tmp_q;
+ p_rc->last_q[INTER_FRAME] = tmp_q;
+ p_rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params->bit_depth);
+ p_rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+ p_rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.rc_cfg.best_allowed_q) / 2;
+ p_rc->avg_frame_qindex[KEY_FRAME] = p_rc->last_q[KEY_FRAME];
+ }
+
+ if (cpi->twopass_frame.stats_in < twopass->stats_buf_ctx->stats_in_end) {
+ *this_frame = *cpi->twopass_frame.stats_in;
+ ++cpi->twopass_frame.stats_in;
+ }
+ set_twopass_params_based_on_fp_stats(cpi, this_frame);
+}
+
+static void setup_target_rate(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+
+ int target_rate = gf_group->bit_allocation[cpi->gf_frame_index];
+
+ if (has_no_stats_stage(cpi)) {
+ av1_rc_set_frame_target(cpi, target_rate, cpi->common.width,
+ cpi->common.height);
+ }
+
+ rc->base_frame_target = target_rate;
+}
+
+void av1_mark_flashes(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats) {
+ FIRSTPASS_STATS *this_stats = first_stats, *next_stats;
+ while (this_stats < last_stats - 1) {
+ next_stats = this_stats + 1;
+ if (next_stats->pcnt_second_ref > next_stats->pcnt_inter &&
+ next_stats->pcnt_second_ref >= 0.5) {
+ this_stats->is_flash = 1;
+ } else {
+ this_stats->is_flash = 0;
+ }
+ this_stats = next_stats;
+ }
+ // We always treat the last frame as a non-flash.
+ if (last_stats - 1 >= first_stats) {
+ (last_stats - 1)->is_flash = 0;
+ }
+}
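+
+ // A minimal sketch of the same rule for a single stats pair (hypothetical
+ // helper, not part of the encoder): frame n is marked as a flash when
+ // frame n+1 predicts better from its second (older) reference than from
+ // frame n itself.
+ //
+ // static int is_flash(const FIRSTPASS_STATS *next) {
+ //   return next->pcnt_second_ref > next->pcnt_inter &&
+ //          next->pcnt_second_ref >= 0.5;
+ // }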
+
+ // Smooth out the noise variance so that it is more stable.
+// Returns 0 on success, -1 on memory allocation failure.
+// TODO(bohanli): Use a better low-pass filter than averaging
+static int smooth_filter_noise(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats) {
+ int len = (int)(last_stats - first_stats);
+ double *smooth_noise = aom_malloc(len * sizeof(*smooth_noise));
+ if (!smooth_noise) return -1;
+
+ for (int i = 0; i < len; i++) {
+ double total_noise = 0;
+ double total_wt = 0;
+ for (int j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) {
+ int idx = AOMMIN(AOMMAX(i + j, 0), len - 1);
+ if (first_stats[idx].is_flash) continue;
+
+ total_noise += first_stats[idx].noise_var;
+ total_wt += 1.0;
+ }
+ if (total_wt > 0.01) {
+ total_noise /= total_wt;
+ } else {
+ total_noise = first_stats[i].noise_var;
+ }
+ smooth_noise[i] = total_noise;
+ }
+
+ for (int i = 0; i < len; i++) {
+ first_stats[i].noise_var = smooth_noise[i];
+ }
+
+ aom_free(smooth_noise);
+ return 0;
+}
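+
+ // In effect this is a boxcar average over a (2 * HALF_FILT_LEN + 1)-frame
+ // window, clamped at the clip boundaries; frames marked as flashes
+ // contribute neither to the sum nor to the weight, and a frame keeps its
+ // own noise_var if the whole window is flashes.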
+
+// Estimate the noise variance of each frame from the first pass stats
+void av1_estimate_noise(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats,
+ struct aom_internal_error_info *error_info) {
+ FIRSTPASS_STATS *this_stats, *next_stats;
+ double C1, C2, C3, noise;
+ for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) {
+ this_stats->noise_var = 0.0;
+ // flashes tend to have high correlation of innovations, so ignore them.
+ if (this_stats->is_flash || (this_stats - 1)->is_flash ||
+ (this_stats - 2)->is_flash)
+ continue;
+
+ C1 = (this_stats - 1)->intra_error *
+ (this_stats->intra_error - this_stats->coded_error);
+ C2 = (this_stats - 2)->intra_error *
+ ((this_stats - 1)->intra_error - (this_stats - 1)->coded_error);
+ C3 = (this_stats - 2)->intra_error *
+ (this_stats->intra_error - this_stats->sr_coded_error);
+ if (C1 <= 0 || C2 <= 0 || C3 <= 0) continue;
+ C1 = sqrt(C1);
+ C2 = sqrt(C2);
+ C3 = sqrt(C3);
+
+ noise = (this_stats - 1)->intra_error - C1 * C2 / C3;
+ noise = AOMMAX(noise, 0.01);
+ this_stats->noise_var = noise;
+ }
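+
+ // One informal reading of the estimate above: each sqrt term behaves
+ // like a cross-correlation, C1 ~ corr(n-1, n), C2 ~ corr(n-2, n-1),
+ // C3 ~ corr(n-2, n). For a noise-free signal with multiplicative
+ // frame-to-frame correlation, C1 * C2 / C3 recovers the signal energy of
+ // frame n-1, so intra_error[n-1] - C1 * C2 / C3 is left as an estimate
+ // of the noise energy.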
+
+ // Copy noise from a neighboring frame if the estimate is not trustworthy.
+ for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) {
+ if (this_stats->is_flash || (this_stats - 1)->is_flash ||
+ (this_stats - 2)->is_flash)
+ continue;
+ if (this_stats->noise_var < 1.0) {
+ int found = 0;
+ // TODO(bohanli): consider expanding to two directions at the same time
+ for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) {
+ if (next_stats->is_flash || (next_stats - 1)->is_flash ||
+ (next_stats - 2)->is_flash || next_stats->noise_var < 1.0)
+ continue;
+ found = 1;
+ this_stats->noise_var = next_stats->noise_var;
+ break;
+ }
+ if (found) continue;
+ for (next_stats = this_stats - 1; next_stats >= first_stats + 2;
+ next_stats--) {
+ if (next_stats->is_flash || (next_stats - 1)->is_flash ||
+ (next_stats - 2)->is_flash || next_stats->noise_var < 1.0)
+ continue;
+ this_stats->noise_var = next_stats->noise_var;
+ break;
+ }
+ }
+ }
+
+ // If this frame is a flash, copy the noise from a neighboring non-flash
+ // frame.
+ for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) {
+ if (this_stats->is_flash || (this_stats - 1)->is_flash ||
+ (this_stats - 2)->is_flash) {
+ int found = 0;
+ for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) {
+ if (next_stats->is_flash || (next_stats - 1)->is_flash ||
+ (next_stats - 2)->is_flash)
+ continue;
+ found = 1;
+ this_stats->noise_var = next_stats->noise_var;
+ break;
+ }
+ if (found) continue;
+ for (next_stats = this_stats - 1; next_stats >= first_stats + 2;
+ next_stats--) {
+ if (next_stats->is_flash || (next_stats - 1)->is_flash ||
+ (next_stats - 2)->is_flash)
+ continue;
+ this_stats->noise_var = next_stats->noise_var;
+ break;
+ }
+ }
+ }
+
+ // The first two frames have no estimate of their own; copy the noise
+ // from frame 2.
+ for (this_stats = first_stats;
+ this_stats < first_stats + 2 && (first_stats + 2) < last_stats;
+ this_stats++) {
+ this_stats->noise_var = (first_stats + 2)->noise_var;
+ }
+
+ if (smooth_filter_noise(first_stats, last_stats) == -1) {
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffers in smooth_filter_noise()");
+ }
+}
+
+// Estimate correlation coefficient of each frame with its previous frame.
+void av1_estimate_coeff(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats) {
+ FIRSTPASS_STATS *this_stats;
+ for (this_stats = first_stats + 1; this_stats < last_stats; this_stats++) {
+ const double C =
+ sqrt(AOMMAX((this_stats - 1)->intra_error *
+ (this_stats->intra_error - this_stats->coded_error),
+ 0.001));
+ const double cor_coeff =
+ C /
+ AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var, 0.001);
+
+ this_stats->cor_coeff =
+ cor_coeff *
+ sqrt(AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var,
+ 0.001) /
+ AOMMAX(this_stats->intra_error - this_stats->noise_var, 0.001));
+ // clip correlation coefficient.
+ this_stats->cor_coeff = AOMMIN(AOMMAX(this_stats->cor_coeff, 0), 1);
+ }
+ first_stats->cor_coeff = 1.0;
+}
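+
+ // Worked example (hypothetical numbers): with intra_error[n-1] = 100,
+ // intra_error[n] = 90, coded_error[n] = 10 and zero noise,
+ // C = sqrt(100 * 80) ~= 89.4, so cor_coeff = 89.4 / 100 = 0.894; scaling
+ // by sqrt(100 / 90) ~= 1.054 and clipping to [0, 1] gives ~0.94.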
+
+void av1_get_second_pass_params(AV1_COMP *cpi,
+ EncodeFrameParams *const frame_params,
+ unsigned int frame_flags) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ if (cpi->use_ducky_encode &&
+ cpi->ducky_encode_info.frame_info.gop_mode == DUCKY_ENCODE_GOP_MODE_RCL) {
+ frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index];
+ frame_params->show_frame =
+ !(gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE);
+ if (cpi->gf_frame_index == 0) {
+ av1_tf_info_reset(&cpi->ppi->tf_info);
+ av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group);
+ }
+ return;
+ }
+
+ const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+ int update_total_stats = 0;
+
+ if (is_stat_consumption_stage(cpi) && !cpi->twopass_frame.stats_in) return;
+
+ // Check forced key frames.
+ const int frames_to_next_forced_key = detect_app_forced_key(cpi);
+ if (frames_to_next_forced_key == 0) {
+ rc->frames_to_key = 0;
+ frame_flags &= FRAMEFLAGS_KEY;
+ } else if (frames_to_next_forced_key > 0 &&
+ frames_to_next_forced_key < rc->frames_to_key) {
+ rc->frames_to_key = frames_to_next_forced_key;
+ }
+
+ assert(cpi->twopass_frame.stats_in != NULL);
+ const int update_type = gf_group->update_type[cpi->gf_frame_index];
+ frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index];
+
+ if (cpi->gf_frame_index < gf_group->size && !(frame_flags & FRAMEFLAGS_KEY)) {
+ assert(cpi->gf_frame_index < gf_group->size);
+
+ setup_target_rate(cpi);
+
+ // If this is an ARF frame then we don't want to read the stats file or
+ // advance the input pointer as we already have what we need.
+ if (update_type == ARF_UPDATE || update_type == INTNL_ARF_UPDATE) {
+ const FIRSTPASS_STATS *const this_frame_ptr =
+ read_frame_stats(twopass, &cpi->twopass_frame,
+ gf_group->arf_src_offset[cpi->gf_frame_index]);
+ set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr);
+ return;
+ }
+ }
+
+ if (oxcf->rc_cfg.mode == AOM_Q)
+ rc->active_worst_quality = oxcf->rc_cfg.cq_level;
+
+ if (cpi->gf_frame_index == gf_group->size) {
+ if (cpi->ppi->lap_enabled && cpi->ppi->p_rc.enable_scenecut_detection) {
+ const int num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1;
+ const int frames_to_key = define_kf_interval(
+ cpi, &twopass->firstpass_info, num_frames_to_detect_scenecut,
+ /*search_start_idx=*/0);
+ if (frames_to_key != -1)
+ rc->frames_to_key = AOMMIN(rc->frames_to_key, frames_to_key);
+ }
+ }
+
+ FIRSTPASS_STATS this_frame;
+ av1_zero(this_frame);
+ // Process the first pass stats for the current frame, if available.
+ if (is_stat_consumption_stage(cpi)) {
+ if (cpi->gf_frame_index < gf_group->size || rc->frames_to_key == 0) {
+ process_first_pass_stats(cpi, &this_frame);
+ update_total_stats = 1;
+ }
+ } else {
+ rc->active_worst_quality = oxcf->rc_cfg.cq_level;
+ }
+
+ // Keyframe and section processing.
+ FIRSTPASS_STATS this_frame_copy;
+ this_frame_copy = this_frame;
+ if (rc->frames_to_key <= 0) {
+ assert(rc->frames_to_key == 0);
+ // Define next KF group and assign bits to it.
+ frame_params->frame_type = KEY_FRAME;
+ find_next_key_frame(cpi, &this_frame);
+ this_frame = this_frame_copy;
+ }
+
+ if (rc->frames_to_fwd_kf <= 0)
+ rc->frames_to_fwd_kf = oxcf->kf_cfg.fwd_kf_dist;
+
+ // Define a new GF/ARF group. (Should always enter here for key frames).
+ if (cpi->gf_frame_index == gf_group->size) {
+ av1_tf_info_reset(&cpi->ppi->tf_info);
+#if CONFIG_BITRATE_ACCURACY && !CONFIG_THREE_PASS
+ vbr_rc_reset_gop_data(&cpi->vbr_rc_info);
+#endif // CONFIG_BITRATE_ACCURACY
+ int max_gop_length =
+ (oxcf->gf_cfg.lag_in_frames >= 32)
+ ? AOMMIN(MAX_GF_INTERVAL, oxcf->gf_cfg.lag_in_frames -
+ oxcf->algo_cfg.arnr_max_frames / 2)
+ : MAX_GF_LENGTH_LAP;
+
+ // Handle forward key frame when enabled.
+ if (oxcf->kf_cfg.fwd_kf_dist > 0)
+ max_gop_length = AOMMIN(rc->frames_to_fwd_kf + 1, max_gop_length);
+
+ // Use the provided gop size in low delay setting
+ if (oxcf->gf_cfg.lag_in_frames == 0) max_gop_length = rc->max_gf_interval;
+
+ // Limit the max gop length for the last gop in 1 pass setting.
+ max_gop_length = AOMMIN(max_gop_length, rc->frames_to_key);
+
+ // Identify regions if needed.
+ // TODO(bohanli): identify regions for all stats available.
+ if (rc->frames_since_key == 0 || rc->frames_since_key == 1 ||
+ (p_rc->frames_till_regions_update - rc->frames_since_key <
+ rc->frames_to_key &&
+ p_rc->frames_till_regions_update - rc->frames_since_key <
+ max_gop_length + 1)) {
+ // How many frames we can analyze starting from this frame.
+ int rest_frames =
+ AOMMIN(rc->frames_to_key, MAX_FIRSTPASS_ANALYSIS_FRAMES);
+ rest_frames =
+ AOMMIN(rest_frames, (int)(twopass->stats_buf_ctx->stats_in_end -
+ cpi->twopass_frame.stats_in +
+ (rc->frames_since_key == 0)));
+ p_rc->frames_till_regions_update = rest_frames;
+
+ int ret;
+ if (cpi->ppi->lap_enabled) {
+ av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end);
+ av1_estimate_noise(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end,
+ cpi->common.error);
+ av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end);
+ ret = identify_regions(cpi->twopass_frame.stats_in, rest_frames,
+ (rc->frames_since_key == 0), p_rc->regions,
+ &p_rc->num_regions);
+ } else {
+ ret = identify_regions(
+ cpi->twopass_frame.stats_in - (rc->frames_since_key == 0),
+ rest_frames, 0, p_rc->regions, &p_rc->num_regions);
+ }
+ if (ret == -1) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffers in identify_regions");
+ }
+ }
+
+ int cur_region_idx =
+ find_regions_index(p_rc->regions, p_rc->num_regions,
+ rc->frames_since_key - p_rc->regions_offset);
+ if ((cur_region_idx >= 0 &&
+ p_rc->regions[cur_region_idx].type == SCENECUT_REGION) ||
+ rc->frames_since_key == 0) {
+ // If we start from a scenecut, then the last GOP's arf boost is not
+ // needed for this GOP.
+ cpi->ppi->gf_state.arf_gf_boost_lst = 0;
+ }
+
+ int need_gf_len = 1;
+ if (cpi->third_pass_ctx && oxcf->pass == AOM_RC_THIRD_PASS) {
+ // Set up the bitstream to read.
+ if (!cpi->third_pass_ctx->input_file_name && oxcf->two_pass_output) {
+ cpi->third_pass_ctx->input_file_name = oxcf->two_pass_output;
+ }
+ av1_open_second_pass_log(cpi, 1);
+ THIRD_PASS_GOP_INFO *gop_info = &cpi->third_pass_ctx->gop_info;
+ // Read in GOP information from the second pass file.
+ av1_read_second_pass_gop_info(cpi->second_pass_log_stream, gop_info,
+ cpi->common.error);
+#if CONFIG_BITRATE_ACCURACY
+ TPL_INFO *tpl_info;
+ AOM_CHECK_MEM_ERROR(cpi->common.error, tpl_info,
+ aom_malloc(sizeof(*tpl_info)));
+ av1_read_tpl_info(tpl_info, cpi->second_pass_log_stream,
+ cpi->common.error);
+ aom_free(tpl_info);
+#if CONFIG_THREE_PASS
+ // TODO(angiebird): Put this part into a func
+ cpi->vbr_rc_info.cur_gop_idx++;
+#endif // CONFIG_THREE_PASS
+#endif // CONFIG_BITRATE_ACCURACY
+ // Read in third_pass_info from the bitstream.
+ av1_set_gop_third_pass(cpi->third_pass_ctx);
+ // Read in per-frame info from second-pass encoding
+ av1_read_second_pass_per_frame_info(
+ cpi->second_pass_log_stream, cpi->third_pass_ctx->frame_info,
+ gop_info->num_frames, cpi->common.error);
+
+ p_rc->cur_gf_index = 0;
+ p_rc->gf_intervals[0] = cpi->third_pass_ctx->gop_info.gf_length;
+ need_gf_len = 0;
+ }
+
+ if (need_gf_len) {
+ // The GF group length could not be obtained from the second pass file.
+ // TODO(jingning): Resolve the redundant calls here.
+ if (rc->intervals_till_gf_calculate_due == 0 || 1) {
+ calculate_gf_length(cpi, max_gop_length, MAX_NUM_GF_INTERVALS);
+ }
+
+ if (max_gop_length > 16 && oxcf->algo_cfg.enable_tpl_model &&
+ oxcf->gf_cfg.lag_in_frames >= 32 &&
+ cpi->sf.tpl_sf.gop_length_decision_method != 3) {
+ int this_idx = rc->frames_since_key +
+ p_rc->gf_intervals[p_rc->cur_gf_index] -
+ p_rc->regions_offset - 1;
+ int this_region =
+ find_regions_index(p_rc->regions, p_rc->num_regions, this_idx);
+ int next_region =
+ find_regions_index(p_rc->regions, p_rc->num_regions, this_idx + 1);
+ // TODO(angiebird): Figure out why this_region and next_region are -1 in
+ // unit test like AltRefFramePresenceTestLarge (aomedia:3134)
+ int is_last_scenecut =
+ p_rc->gf_intervals[p_rc->cur_gf_index] >= rc->frames_to_key ||
+ (this_region != -1 &&
+ p_rc->regions[this_region].type == SCENECUT_REGION) ||
+ (next_region != -1 &&
+ p_rc->regions[next_region].type == SCENECUT_REGION);
+
+ int ori_gf_int = p_rc->gf_intervals[p_rc->cur_gf_index];
+
+ if (p_rc->gf_intervals[p_rc->cur_gf_index] > 16 &&
+ rc->min_gf_interval <= 16) {
+ // calculate_gf_length() was previously called with max_gop_length = 32,
+ // using look-ahead gf intervals.
+ define_gf_group(cpi, frame_params, 0);
+ av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group);
+ this_frame = this_frame_copy;
+
+ if (is_shorter_gf_interval_better(cpi, frame_params)) {
+ // A shorter gf interval is better.
+ // TODO(jingning): Remove redundant computations here.
+ max_gop_length = 16;
+ calculate_gf_length(cpi, max_gop_length, 1);
+ if (is_last_scenecut &&
+ (ori_gf_int - p_rc->gf_intervals[p_rc->cur_gf_index] < 4)) {
+ p_rc->gf_intervals[p_rc->cur_gf_index] = ori_gf_int;
+ }
+ }
+ }
+ }
+ }
+
+ define_gf_group(cpi, frame_params, 0);
+
+ if (gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE &&
+ rc->frames_since_key > 0)
+ process_first_pass_stats(cpi, &this_frame);
+
+ define_gf_group(cpi, frame_params, 1);
+
+ // Write GOP info if needed for the third pass. Per-frame info is written
+ // after each frame is encoded.
+ av1_write_second_pass_gop_info(cpi);
+
+ av1_tf_info_filtering(&cpi->ppi->tf_info, cpi, gf_group);
+
+ rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+ assert(cpi->gf_frame_index == 0);
+#if ARF_STATS_OUTPUT
+ {
+ FILE *fpfile;
+ fpfile = fopen("arf.stt", "a");
+ ++arf_count;
+ fprintf(fpfile, "%10d %10d %10d %10d %10d\n",
+ cpi->common.current_frame.frame_number,
+ rc->frames_till_gf_update_due, cpi->ppi->p_rc.kf_boost, arf_count,
+ p_rc->gfu_boost);
+
+ fclose(fpfile);
+ }
+#endif
+ }
+ assert(cpi->gf_frame_index < gf_group->size);
+
+ if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+ reset_fpf_position(&cpi->twopass_frame, start_pos);
+
+ const FIRSTPASS_STATS *const this_frame_ptr =
+ read_frame_stats(twopass, &cpi->twopass_frame,
+ gf_group->arf_src_offset[cpi->gf_frame_index]);
+ set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr);
+ } else {
+ // Back up this frame's stats for updating total stats during post encode.
+ cpi->twopass_frame.this_frame = update_total_stats ? start_pos : NULL;
+ }
+
+ frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index];
+ setup_target_rate(cpi);
+}
+
+void av1_init_second_pass(AV1_COMP *cpi) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FRAME_INFO *const frame_info = &cpi->frame_info;
+ double frame_rate;
+ FIRSTPASS_STATS *stats;
+
+ if (!twopass->stats_buf_ctx->stats_in_end) return;
+
+ av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end);
+ av1_estimate_noise(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end, cpi->common.error);
+ av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start,
+ twopass->stats_buf_ctx->stats_in_end);
+
+ stats = twopass->stats_buf_ctx->total_stats;
+
+ *stats = *twopass->stats_buf_ctx->stats_in_end;
+ *twopass->stats_buf_ctx->total_left_stats = *stats;
+
+ frame_rate = 10000000.0 * stats->count / stats->duration;
+ // Each frame can have a different duration, as the frame rate in the source
+ // isn't guaranteed to be constant. The frame rate prior to the first frame
+ // encoded in the second pass is a guess. However, the sum duration is not.
+ // It is calculated based on the actual durations of all frames from the
+ // first pass.
+ av1_new_framerate(cpi, frame_rate);
+ twopass->bits_left =
+ (int64_t)(stats->duration * oxcf->rc_cfg.target_bandwidth / 10000000.0);
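+
+ // Worked example (hypothetical numbers): durations are in 1/10,000,000 s
+ // units, so 300 frames with a total duration of 100,000,000 give
+ // frame_rate = 10,000,000 * 300 / 100,000,000 = 30 fps, and a target
+ // bandwidth of 1,000,000 bits/s gives bits_left = 10 s * 1,000,000 =
+ // 10,000,000 bits.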
+
+#if CONFIG_BITRATE_ACCURACY
+ av1_vbr_rc_init(&cpi->vbr_rc_info, twopass->bits_left,
+ (int)round(stats->count));
+#endif
+
+#if CONFIG_RATECTRL_LOG
+ rc_log_init(&cpi->rc_log);
+#endif
+
+ // This variable monitors how far behind the second ref update is lagging.
+ twopass->sr_update_lag = 1;
+
+ // Scan the first pass file and calculate a modified total error based upon
+ // the bias/power function used to allocate bits.
+ {
+ const double avg_error =
+ stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count);
+ const FIRSTPASS_STATS *s = cpi->twopass_frame.stats_in;
+ double modified_error_total = 0.0;
+ twopass->modified_error_min =
+ (avg_error * oxcf->rc_cfg.vbrmin_section) / 100;
+ twopass->modified_error_max =
+ (avg_error * oxcf->rc_cfg.vbrmax_section) / 100;
+ while (s < twopass->stats_buf_ctx->stats_in_end) {
+ modified_error_total +=
+ calculate_modified_err(frame_info, twopass, oxcf, s);
+ ++s;
+ }
+ twopass->modified_error_left = modified_error_total;
+ }
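+
+ // Example (hypothetical numbers): with avg_error = 100,
+ // vbrmin_section = 10 and vbrmax_section = 200, each frame's modified
+ // error is presumably clamped to [10, 200] before being accumulated.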
+
+ // Reset the vbr bits off target counters
+ cpi->ppi->p_rc.vbr_bits_off_target = 0;
+ cpi->ppi->p_rc.vbr_bits_off_target_fast = 0;
+
+ cpi->ppi->p_rc.rate_error_estimate = 0;
+
+ // Static sequence monitor variables.
+ twopass->kf_zeromotion_pct = 100;
+ twopass->last_kfgroup_zeromotion_pct = 100;
+
+ // Initialize the bits-per-macroblock estimate correction factor.
+ twopass->bpm_factor = 1.0;
+ // Initialize actual and target bits counters for ARF groups so that
+ // at the start we have a neutral bpm adjustment.
+ twopass->rolling_arf_group_target_bits = 1;
+ twopass->rolling_arf_group_actual_bits = 1;
+}
+
+void av1_init_single_pass_lap(AV1_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+
+ if (!twopass->stats_buf_ctx->stats_in_end) return;
+
+ // This variable monitors how far behind the second ref update is lagging.
+ twopass->sr_update_lag = 1;
+
+ twopass->bits_left = 0;
+ twopass->modified_error_min = 0.0;
+ twopass->modified_error_max = 0.0;
+ twopass->modified_error_left = 0.0;
+
+ // Reset the vbr bits off target counters
+ cpi->ppi->p_rc.vbr_bits_off_target = 0;
+ cpi->ppi->p_rc.vbr_bits_off_target_fast = 0;
+
+ cpi->ppi->p_rc.rate_error_estimate = 0;
+
+ // Static sequence monitor variables.
+ twopass->kf_zeromotion_pct = 100;
+ twopass->last_kfgroup_zeromotion_pct = 100;
+
+ // Initialize the bits-per-macroblock estimate correction factor.
+ twopass->bpm_factor = 1.0;
+ // Initialize actual and target bits counters for ARF groups so that
+ // at the start we have a neutral bpm adjustment.
+ twopass->rolling_arf_group_target_bits = 1;
+ twopass->rolling_arf_group_actual_bits = 1;
+}
+
+#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
+#define HIGH_UNDERSHOOT_RATIO 2
+void av1_twopass_postencode_update(AV1_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+
+ // Increment the stats_in pointer.
+ if (is_stat_consumption_stage(cpi) &&
+ !(cpi->use_ducky_encode && cpi->ducky_encode_info.frame_info.gop_mode ==
+ DUCKY_ENCODE_GOP_MODE_RCL) &&
+ (cpi->gf_frame_index < cpi->ppi->gf_group.size ||
+ rc->frames_to_key == 0)) {
+ const int update_type = cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+ if (update_type != ARF_UPDATE && update_type != INTNL_ARF_UPDATE) {
+ FIRSTPASS_STATS this_frame;
+ assert(cpi->twopass_frame.stats_in >
+ twopass->stats_buf_ctx->stats_in_start);
+ --cpi->twopass_frame.stats_in;
+ if (cpi->ppi->lap_enabled) {
+ input_stats_lap(twopass, &cpi->twopass_frame, &this_frame);
+ } else {
+ input_stats(twopass, &cpi->twopass_frame, &this_frame);
+ }
+ } else if (cpi->ppi->lap_enabled) {
+ cpi->twopass_frame.stats_in = twopass->stats_buf_ctx->stats_in_start;
+ }
+ }
+
+ // VBR correction is done through rc->vbr_bits_off_target. Based on the
+ // sign of this value, a limited % adjustment is made to the target rate
+ // of subsequent frames, to try and push it back towards 0. This method
+ // is designed to prevent extreme behaviour at the end of a clip
+ // or group of frames.
+ p_rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
+ twopass->bits_left = AOMMAX(twopass->bits_left - rc->base_frame_target, 0);
+
+ if (cpi->do_update_vbr_bits_off_target_fast) {
+ // Subtract current frame's fast_extra_bits.
+ p_rc->vbr_bits_off_target_fast -= rc->frame_level_fast_extra_bits;
+ rc->frame_level_fast_extra_bits = 0;
+ }
+
+ // Target vs actual bits for this arf group.
+ twopass->rolling_arf_group_target_bits += rc->base_frame_target;
+ twopass->rolling_arf_group_actual_bits += rc->projected_frame_size;
+
+ // Calculate the pct rc error.
+ if (p_rc->total_actual_bits) {
+ p_rc->rate_error_estimate =
+ (int)((p_rc->vbr_bits_off_target * 100) / p_rc->total_actual_bits);
+ p_rc->rate_error_estimate = clamp(p_rc->rate_error_estimate, -100, 100);
+ } else {
+ p_rc->rate_error_estimate = 0;
+ }
+
+#if CONFIG_FPMT_TEST
+ /* The variables temp_vbr_bits_off_target, temp_bits_left,
+ * temp_rolling_arf_group_target_bits, temp_rolling_arf_group_actual_bits
+ * and temp_rate_error_estimate are introduced for quality simulation
+ * purposes; they retain the values from before the parallel encode
+ * frames, and are updated based on the update flag.
+ *
+ * If there are show_existing_frames between parallel frames, the temp
+ * state is not updated, so that it is retained. */
+ const int simulate_parallel_frame =
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int show_existing_between_parallel_frames =
+ (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+
+ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+ simulate_parallel_frame) {
+ cpi->ppi->p_rc.temp_vbr_bits_off_target = p_rc->vbr_bits_off_target;
+ cpi->ppi->p_rc.temp_bits_left = twopass->bits_left;
+ cpi->ppi->p_rc.temp_rolling_arf_group_target_bits =
+ twopass->rolling_arf_group_target_bits;
+ cpi->ppi->p_rc.temp_rolling_arf_group_actual_bits =
+ twopass->rolling_arf_group_actual_bits;
+ cpi->ppi->p_rc.temp_rate_error_estimate = p_rc->rate_error_estimate;
+ }
+#endif
+ // Update the active best quality pyramid.
+ if (!rc->is_src_frame_alt_ref) {
+ const int pyramid_level =
+ cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index];
+ int i;
+ for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i) {
+ p_rc->active_best_quality[i] = cpi->common.quant_params.base_qindex;
+#if CONFIG_TUNE_VMAF
+ if (cpi->vmaf_info.original_qindex != -1 &&
+ (cpi->oxcf.tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
+ cpi->oxcf.tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) {
+ p_rc->active_best_quality[i] = cpi->vmaf_info.original_qindex;
+ }
+#endif
+ }
+ }
+
+#if 0
+ {
+ AV1_COMMON *cm = &cpi->common;
+ FILE *fpfile;
+ fpfile = fopen("details.stt", "a");
+ fprintf(fpfile,
+ "%10d %10d %10d %10" PRId64 " %10" PRId64
+ " %10d %10d %10d %10.4lf %10.4lf %10.4lf %10.4lf\n",
+ cm->current_frame.frame_number, rc->base_frame_target,
+ rc->projected_frame_size, rc->total_actual_bits,
+ rc->vbr_bits_off_target, p_rc->rate_error_estimate,
+ twopass->rolling_arf_group_target_bits,
+ twopass->rolling_arf_group_actual_bits,
+ (double)twopass->rolling_arf_group_actual_bits /
+ (double)twopass->rolling_arf_group_target_bits,
+ twopass->bpm_factor,
+ av1_convert_qindex_to_q(cpi->common.quant_params.base_qindex,
+ cm->seq_params->bit_depth),
+ av1_convert_qindex_to_q(rc->active_worst_quality,
+ cm->seq_params->bit_depth));
+ fclose(fpfile);
+ }
+#endif
+
+ if (cpi->common.current_frame.frame_type != KEY_FRAME) {
+ twopass->kf_group_bits -= rc->base_frame_target;
+ twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
+ }
+ twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
+
+ // If the rate control is drifting consider adjustment to min or maxq.
+ if ((rc_cfg->mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref) {
+ int minq_adj_limit;
+ int maxq_adj_limit;
+ minq_adj_limit =
+ (rc_cfg->mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+ maxq_adj_limit = rc->worst_quality - rc->active_worst_quality;
+
+ // Undershoot
+ if ((rc_cfg->under_shoot_pct < 100) &&
+ (p_rc->rolling_actual_bits < p_rc->rolling_target_bits)) {
+ int pct_error =
+ ((p_rc->rolling_target_bits - p_rc->rolling_actual_bits) * 100) /
+ p_rc->rolling_target_bits;
+
+ if ((pct_error >= rc_cfg->under_shoot_pct) &&
+ (p_rc->rate_error_estimate > 0)) {
+ twopass->extend_minq += 1;
+ }
+ twopass->extend_maxq -= 1;
+ // Overshoot
+ } else if ((rc_cfg->over_shoot_pct < 100) &&
+ (p_rc->rolling_actual_bits > p_rc->rolling_target_bits)) {
+ int pct_error =
+ ((p_rc->rolling_actual_bits - p_rc->rolling_target_bits) * 100) /
+ p_rc->rolling_target_bits;
+
+ pct_error = clamp(pct_error, 0, 100);
+ if ((pct_error >= rc_cfg->over_shoot_pct) &&
+ (p_rc->rate_error_estimate < 0)) {
+ twopass->extend_maxq += 1;
+ }
+ twopass->extend_minq -= 1;
+ } else {
+ // Adjustment for extreme local overshoot.
+ // Only applies when normal adjustment above is not used (e.g.
+ // when threshold is set to 100).
+ if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+ rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+ ++twopass->extend_maxq;
+ // Unwind extreme overshoot adjustment.
+ else if (p_rc->rolling_target_bits > p_rc->rolling_actual_bits)
+ --twopass->extend_maxq;
+ }
+ twopass->extend_minq =
+ clamp(twopass->extend_minq, -minq_adj_limit, minq_adj_limit);
+ twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+
+ // If there is a big and unexpected undershoot then feed the extra
+ // bits back in quickly. One situation where this may happen is if a
+ // frame is unexpectedly almost perfectly predicted by the ARF or GF
+ // but not very well predicted by the previous frame.
+ if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
+ int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
+ if (rc->projected_frame_size < fast_extra_thresh) {
+ p_rc->vbr_bits_off_target_fast +=
+ fast_extra_thresh - rc->projected_frame_size;
+ p_rc->vbr_bits_off_target_fast = AOMMIN(p_rc->vbr_bits_off_target_fast,
+ (4 * rc->avg_frame_bandwidth));
+ }
+ }
+
+#if CONFIG_FPMT_TEST
+ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+ simulate_parallel_frame) {
+ cpi->ppi->p_rc.temp_vbr_bits_off_target_fast =
+ p_rc->vbr_bits_off_target_fast;
+ cpi->ppi->p_rc.temp_extend_minq = twopass->extend_minq;
+ cpi->ppi->p_rc.temp_extend_maxq = twopass->extend_maxq;
+ }
+#endif
+ }
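+
+ // Worked example (hypothetical numbers): rolling_target_bits = 100,000
+ // and rolling_actual_bits = 70,000 give pct_error = 30. With
+ // under_shoot_pct = 25 and a positive rate_error_estimate, extend_minq
+ // is incremented (allowing a lower minimum Q, spending more bits) while
+ // extend_maxq decays by one on each such frame.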
+
+ // Update the frame probabilities obtained from parallel encode frames
+ FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs;
+#if CONFIG_FPMT_TEST
+ /* The variable temp_active_best_quality is introduced only for quality
+ * simulation purposes; it retains the value from before the parallel
+ * encode frames, and is updated based on the update flag.
+ *
+ * If there are show_existing_frames between parallel frames, the temp
+ * state is not updated, so that it is retained. */
+ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+ simulate_parallel_frame) {
+ int i;
+ const int pyramid_level =
+ cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index];
+ if (!rc->is_src_frame_alt_ref) {
+ for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i)
+ cpi->ppi->p_rc.temp_active_best_quality[i] =
+ p_rc->active_best_quality[i];
+ }
+ }
+
+ // Update the frame probabilities obtained from parallel encode frames
+ FrameProbInfo *const temp_frame_probs_simulation =
+ simulate_parallel_frame ? &cpi->ppi->temp_frame_probs_simulation
+ : frame_probs;
+ FrameProbInfo *const temp_frame_probs =
+ simulate_parallel_frame ? &cpi->ppi->temp_frame_probs : NULL;
+#endif
+ int i, j, loop;
+ // Sequentially average into temp_frame_probs_simulation, which holds the
+ // probabilities of the last frame before the parallel encode.
+ for (loop = 0; loop <= cpi->num_frame_recode; loop++) {
+ // Sequentially update tx_type_probs
+ if (cpi->do_update_frame_probs_txtype[loop] &&
+ (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ for (i = 0; i < TX_SIZES_ALL; i++) {
+ int left = 1024;
+
+ for (j = TX_TYPES - 1; j >= 0; j--) {
+ const int new_prob =
+ cpi->frame_new_probs[loop].tx_type_probs[update_type][i][j];
+#if CONFIG_FPMT_TEST
+ int prob =
+ (temp_frame_probs_simulation->tx_type_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ temp_frame_probs_simulation->tx_type_probs[update_type][i][j] = prob;
+#else
+ int prob =
+ (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ frame_probs->tx_type_probs[update_type][i][j] = prob;
+#endif
+ }
+ }
+ }
+
+ // Sequentially update obmc_probs
+ if (cpi->do_update_frame_probs_obmc[loop] &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+
+ for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+ const int new_prob =
+ cpi->frame_new_probs[loop].obmc_probs[update_type][i];
+#if CONFIG_FPMT_TEST
+ temp_frame_probs_simulation->obmc_probs[update_type][i] =
+ (temp_frame_probs_simulation->obmc_probs[update_type][i] +
+ new_prob) >>
+ 1;
+#else
+ frame_probs->obmc_probs[update_type][i] =
+ (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1;
+#endif
+ }
+ }
+
+ // Sequentially update warped_probs
+ if (cpi->do_update_frame_probs_warp[loop] &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ const int new_prob = cpi->frame_new_probs[loop].warped_probs[update_type];
+#if CONFIG_FPMT_TEST
+ temp_frame_probs_simulation->warped_probs[update_type] =
+ (temp_frame_probs_simulation->warped_probs[update_type] + new_prob) >>
+ 1;
+#else
+ frame_probs->warped_probs[update_type] =
+ (frame_probs->warped_probs[update_type] + new_prob) >> 1;
+#endif
+ }
+
+ // Sequentially update switchable_interp_probs
+ if (cpi->do_update_frame_probs_interpfilter[loop] &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ int left = 1536;
+
+ for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) {
+ const int new_prob = cpi->frame_new_probs[loop]
+ .switchable_interp_probs[update_type][i][j];
+#if CONFIG_FPMT_TEST
+ int prob = (temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+
+ temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type][i][j] = prob;
+#else
+ int prob = (frame_probs->switchable_interp_probs[update_type][i][j] +
+ new_prob) >>
+ 1;
+ left -= prob;
+ if (j == 0) prob += left;
+ frame_probs->switchable_interp_probs[update_type][i][j] = prob;
+#endif
+ }
+ }
+ }
+ }
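+
+ // Worked example for the running average above (hypothetical numbers):
+ // averaging an old probability of 512 with a new one of 256 gives
+ // (512 + 256) >> 1 = 384. The `left` counter (1024 for tx types, 1536
+ // for interp filter contexts) absorbs the rounding loss into index 0 so
+ // each probability row keeps a constant total.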
+
+#if CONFIG_FPMT_TEST
+ // Copy temp_frame_probs_simulation to temp_frame_probs when the update
+ // flag is set.
+ if (cpi->do_frame_data_update &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ simulate_parallel_frame) {
+ for (int update_type_idx = 0; update_type_idx < FRAME_UPDATE_TYPES;
+ update_type_idx++) {
+ for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+ temp_frame_probs->obmc_probs[update_type_idx][i] =
+ temp_frame_probs_simulation->obmc_probs[update_type_idx][i];
+ }
+ temp_frame_probs->warped_probs[update_type_idx] =
+ temp_frame_probs_simulation->warped_probs[update_type_idx];
+ for (i = 0; i < TX_SIZES_ALL; i++) {
+ for (j = 0; j < TX_TYPES; j++) {
+ temp_frame_probs->tx_type_probs[update_type_idx][i][j] =
+ temp_frame_probs_simulation->tx_type_probs[update_type_idx][i][j];
+ }
+ }
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ for (j = 0; j < SWITCHABLE_FILTERS; j++) {
+ temp_frame_probs->switchable_interp_probs[update_type_idx][i][j] =
+ temp_frame_probs_simulation
+ ->switchable_interp_probs[update_type_idx][i][j];
+ }
+ }
+ }
+ }
+#endif
+ // Update framerate obtained from parallel encode frames
+ if (cpi->common.show_frame &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+ cpi->framerate = cpi->new_framerate;
+#if CONFIG_FPMT_TEST
+ // For simulation purposes only.
+ int show_existing_between_parallel_frames_cndn =
+ (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+ if (cpi->common.show_frame && !show_existing_between_parallel_frames_cndn &&
+ cpi->do_frame_data_update && simulate_parallel_frame)
+ cpi->temp_framerate = cpi->framerate;
+#endif
+}
diff --git a/third_party/aom/av1/encoder/pass2_strategy.h b/third_party/aom/av1/encoder/pass2_strategy.h
new file mode 100644
index 0000000000..5987a78a23
--- /dev/null
+++ b/third_party/aom/av1/encoder/pass2_strategy.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PASS2_STRATEGY_H_
+#define AOM_AV1_ENCODER_PASS2_STRATEGY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct EncodeFrameParams;
+
+#include "av1/encoder/encoder.h"
+
+/*!
+ * \brief accumulated stats and features in a gf group
+ */
+typedef struct {
+ /*!\cond */
+ double gf_group_err;
+ double gf_group_raw_error;
+ double gf_group_skip_pct;
+ double gf_group_inactive_zone_rows;
+
+ double mv_ratio_accumulator;
+ double decay_accumulator;
+ double zero_motion_accumulator;
+ double loop_decay_rate;
+ double last_loop_decay_rate;
+ double this_frame_mv_in_out;
+ double mv_in_out_accumulator;
+ double abs_mv_in_out_accumulator;
+
+ double avg_sr_coded_error;
+ double avg_pcnt_second_ref;
+ double avg_new_mv_count;
+ double avg_wavelet_energy;
+ double avg_raw_err_stdev;
+ int non_zero_stdev_count;
+ /*!\endcond */
+} GF_GROUP_STATS;
+
+/*!
+ * \brief accumulated stats and features for a frame
+ */
+typedef struct {
+ /*!\cond */
+ double frame_err;
+ double frame_coded_error;
+ double frame_sr_coded_error;
+ /*!\endcond */
+} GF_FRAME_STATS;
+/*!\cond */
+
+void av1_init_second_pass(struct AV1_COMP *cpi);
+
+void av1_init_single_pass_lap(AV1_COMP *cpi);
+
+/*!\endcond */
+/*!\brief Main per frame entry point for second pass of two pass encode
+ *
+ *\ingroup rate_control
+ *
+ * This function is called for each frame in the second pass of a two pass
+ * encode. It checks the frame type and if a new KF or GF/ARF is due.
+ * When a KF is due it calls find_next_key_frame() to work out how long
+ * this key frame group will be and assign bits to the key frame.
+ * At the start of a new GF/ARF group it calls calculate_gf_length()
+ * and define_gf_group() which are the main functions responsible for
+ * defining the size and structure of the new GF/ARF group.
+ *
+ * \param[in] cpi Top-level encoder instance structure
+ * \param[in] frame_params Per frame encoding parameters
+ * \param[in] frame_flags Frame type and coding flags
+ *
+ * \remark No return but analyses first pass stats and assigns a target
+ * number of bits to the current frame and a target Q range.
+ */
+void av1_get_second_pass_params(struct AV1_COMP *cpi,
+ struct EncodeFrameParams *const frame_params,
+ unsigned int frame_flags);
+
+/*!\brief Adjustments to two pass and rate control after each frame.
+ *
+ *\ingroup rate_control
+ *
+ * This function is called after each frame to make adjustments to
+ * heuristics and data structures that relate to rate control.
+ *
+ * \param[in] cpi Top-level encoder instance structure
+ *
+ * \remark No return value but this function updates various rate control
+ * related data structures that for example track overshoot and
+ * undershoot.
+ */
+void av1_twopass_postencode_update(struct AV1_COMP *cpi);
+
+/*!\brief Distributes bits to frames in a group
+ *
+ *\ingroup rate_control
+ *
+ * This function decides on the allocation of bits between the different
+ * frames and types of frame in a GF/ARF group.
+ *
+ * \param[in] cpi Top-level encoder instance structure
+ * \param[in] rc Rate control data
+ * \param[in] gf_group GF/ARF group data structure
+ * \param[in] is_key_frame Indicates if the first frame in the group is
+ * also a key frame.
+ * \param[in] use_arf Are ARF frames enabled, or is this a GF-only
+ * uni-directional group.
+ * \param[in] gf_group_bits Bits available to be allocated.
+ *
+ * \remark No return but updates the rate control and group data structures
+ * to reflect the allocation of bits.
+ */
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+ GF_GROUP *gf_group, int is_key_frame, int use_arf,
+ int64_t gf_group_bits);
+
+int av1_calc_arf_boost(const TWO_PASS *twopass,
+ const TWO_PASS_FRAME *twopass_frame,
+ const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+ int offset, int f_frames, int b_frames,
+ int *num_fpstats_used, int *num_fpstats_required,
+ int project_gfu_boost);
+
+void av1_mark_flashes(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats);
+void av1_estimate_noise(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats,
+ struct aom_internal_error_info *error_info);
+void av1_estimate_coeff(FIRSTPASS_STATS *first_stats,
+ FIRSTPASS_STATS *last_stats);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PASS2_STRATEGY_H_
diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c
new file mode 100644
index 0000000000..232a2f9edb
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickcdef.c
@@ -0,0 +1,958 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/pickcdef.h"
+#include "av1/encoder/mcomp.h"
+
+// Get primary and secondary filter strength for the given strength index and
+// search method
+static INLINE void get_cdef_filter_strengths(CDEF_PICK_METHOD pick_method,
+ int *pri_strength,
+ int *sec_strength,
+ int strength_idx) {
+ const int tot_sec_filter =
+ (pick_method == CDEF_FAST_SEARCH_LVL5)
+ ? REDUCED_SEC_STRENGTHS_LVL5
+ : ((pick_method >= CDEF_FAST_SEARCH_LVL3) ? REDUCED_SEC_STRENGTHS_LVL3
+ : CDEF_SEC_STRENGTHS);
+ const int pri_idx = strength_idx / tot_sec_filter;
+ const int sec_idx = strength_idx % tot_sec_filter;
+ *pri_strength = pri_idx;
+ *sec_strength = sec_idx;
+ if (pick_method == CDEF_FULL_SEARCH) return;
+
+ switch (pick_method) {
+ case CDEF_FAST_SEARCH_LVL1:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL1);
+ *pri_strength = priconv_lvl1[pri_idx];
+ break;
+ case CDEF_FAST_SEARCH_LVL2:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL2);
+ *pri_strength = priconv_lvl2[pri_idx];
+ break;
+ case CDEF_FAST_SEARCH_LVL3:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL2);
+ assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL3);
+ *pri_strength = priconv_lvl2[pri_idx];
+ *sec_strength = secconv_lvl3[sec_idx];
+ break;
+ case CDEF_FAST_SEARCH_LVL4:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL4);
+ assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL3);
+ *pri_strength = priconv_lvl4[pri_idx];
+ *sec_strength = secconv_lvl3[sec_idx];
+ break;
+ case CDEF_FAST_SEARCH_LVL5:
+ assert(pri_idx < REDUCED_PRI_STRENGTHS_LVL4);
+ assert(sec_idx < REDUCED_SEC_STRENGTHS_LVL5);
+ *pri_strength = priconv_lvl5[pri_idx];
+ *sec_strength = secconv_lvl5[sec_idx];
+ break;
+ default: assert(0 && "Invalid CDEF search method");
+ }
+}
+
+// Store CDEF filter strength calculated from strength index for given search
+// method
+#define STORE_CDEF_FILTER_STRENGTH(cdef_strength, pick_method, strength_idx) \
+ do { \
+ get_cdef_filter_strengths((pick_method), &pri_strength, &sec_strength, \
+ (strength_idx)); \
+ cdef_strength = pri_strength * CDEF_SEC_STRENGTHS + sec_strength; \
+ } while (0)
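+
+ // Hypothetical usage sketch (the locals pri_strength and sec_strength
+ // must already be in scope, since the macro writes to them):
+ //
+ //   int pri_strength, sec_strength;
+ //   int cdef_strength;
+ //   STORE_CDEF_FILTER_STRENGTH(cdef_strength, CDEF_FAST_SEARCH_LVL1, 0);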
+
+/* Search for the best strength to add as an option, knowing we
+ already selected nb_strengths options. */
+static uint64_t search_one(int *lev, int nb_strengths,
+ uint64_t mse[][TOTAL_STRENGTHS], int sb_count,
+ CDEF_PICK_METHOD pick_method) {
+ uint64_t tot_mse[TOTAL_STRENGTHS];
+ const int total_strengths = nb_cdef_strengths[pick_method];
+ int i, j;
+ uint64_t best_tot_mse = (uint64_t)1 << 63;
+ int best_id = 0;
+ memset(tot_mse, 0, sizeof(tot_mse));
+ for (i = 0; i < sb_count; i++) {
+ int gi;
+ uint64_t best_mse = (uint64_t)1 << 63;
+ /* Find best mse among already selected options. */
+ for (gi = 0; gi < nb_strengths; gi++) {
+ if (mse[i][lev[gi]] < best_mse) {
+ best_mse = mse[i][lev[gi]];
+ }
+ }
+ /* Find best mse when adding each possible new option. */
+ for (j = 0; j < total_strengths; j++) {
+ uint64_t best = best_mse;
+ if (mse[i][j] < best) best = mse[i][j];
+ tot_mse[j] += best;
+ }
+ }
+ for (j = 0; j < total_strengths; j++) {
+ if (tot_mse[j] < best_tot_mse) {
+ best_tot_mse = tot_mse[j];
+ best_id = j;
+ }
+ }
+ lev[nb_strengths] = best_id;
+ return best_tot_mse;
+}
+
+/* Search for the best luma+chroma strength to add as an option, knowing we
+ already selected nb_strengths options. */
+static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths,
+ uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count,
+ CDEF_PICK_METHOD pick_method) {
+ uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS];
+ int i, j;
+ uint64_t best_tot_mse = (uint64_t)1 << 63;
+ int best_id0 = 0;
+ int best_id1 = 0;
+ const int total_strengths = nb_cdef_strengths[pick_method];
+ memset(tot_mse, 0, sizeof(tot_mse));
+ for (i = 0; i < sb_count; i++) {
+ int gi;
+ uint64_t best_mse = (uint64_t)1 << 63;
+ /* Find best mse among already selected options. */
+ for (gi = 0; gi < nb_strengths; gi++) {
+ uint64_t curr = mse[0][i][lev0[gi]];
+ curr += mse[1][i][lev1[gi]];
+ if (curr < best_mse) {
+ best_mse = curr;
+ }
+ }
+ /* Find best mse when adding each possible new option. */
+ for (j = 0; j < total_strengths; j++) {
+ int k;
+ for (k = 0; k < total_strengths; k++) {
+ uint64_t best = best_mse;
+ uint64_t curr = mse[0][i][j];
+ curr += mse[1][i][k];
+ if (curr < best) best = curr;
+ tot_mse[j][k] += best;
+ }
+ }
+ }
+ for (j = 0; j < total_strengths; j++) {
+ int k;
+ for (k = 0; k < total_strengths; k++) {
+ if (tot_mse[j][k] < best_tot_mse) {
+ best_tot_mse = tot_mse[j][k];
+ best_id0 = j;
+ best_id1 = k;
+ }
+ }
+ }
+ lev0[nb_strengths] = best_id0;
+ lev1[nb_strengths] = best_id1;
+ return best_tot_mse;
+}
+
+/* Search for the set of strengths that minimizes mse. */
+static uint64_t joint_strength_search(int *best_lev, int nb_strengths,
+ uint64_t mse[][TOTAL_STRENGTHS],
+ int sb_count,
+ CDEF_PICK_METHOD pick_method) {
+ uint64_t best_tot_mse;
+ int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 &&
+ pick_method <= CDEF_FAST_SEARCH_LVL5);
+ int i;
+ best_tot_mse = (uint64_t)1 << 63;
+ /* Greedy search: add one strength option at a time. */
+ for (i = 0; i < nb_strengths; i++) {
+ best_tot_mse = search_one(best_lev, i, mse, sb_count, pick_method);
+ }
+ /* Try to refine the greedy search by reconsidering each
+ already-selected option. */
+ if (!fast) {
+ for (i = 0; i < 4 * nb_strengths; i++) {
+ int j;
+ for (j = 0; j < nb_strengths - 1; j++) best_lev[j] = best_lev[j + 1];
+ best_tot_mse =
+ search_one(best_lev, nb_strengths - 1, mse, sb_count, pick_method);
+ }
+ }
+ return best_tot_mse;
+}
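+
+ // Design note: this is a greedy set-selection search. Each pass over
+ // search_one() costs O(sb_count * total_strengths), so the greedy phase
+ // is O(nb_strengths * sb_count * total_strengths); the non-fast
+ // refinement phase then re-runs it 4 * nb_strengths times, each time
+ // dropping the oldest choice and re-adding a best one.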
+
+/* Search for the set of luma+chroma strengths that minimizes mse. */
+static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1,
+ int nb_strengths,
+ uint64_t (**mse)[TOTAL_STRENGTHS],
+ int sb_count,
+ CDEF_PICK_METHOD pick_method) {
+ uint64_t best_tot_mse;
+ int i;
+ best_tot_mse = (uint64_t)1 << 63;
+ /* Greedy search: add one strength option at a time. */
+ for (i = 0; i < nb_strengths; i++) {
+ best_tot_mse =
+ search_one_dual(best_lev0, best_lev1, i, mse, sb_count, pick_method);
+ }
+ /* Try to refine the greedy search by reconsidering each
+ already-selected option. */
+ for (i = 0; i < 4 * nb_strengths; i++) {
+ int j;
+ for (j = 0; j < nb_strengths - 1; j++) {
+ best_lev0[j] = best_lev0[j + 1];
+ best_lev1[j] = best_lev1[j + 1];
+ }
+ best_tot_mse = search_one_dual(best_lev0, best_lev1, nb_strengths - 1, mse,
+ sb_count, pick_method);
+ }
+ return best_tot_mse;
+}
+
+static INLINE void init_src_params(int *src_stride, int *width, int *height,
+ int *width_log2, int *height_log2,
+ BLOCK_SIZE bsize) {
+ *src_stride = block_size_wide[bsize];
+ *width = block_size_wide[bsize];
+ *height = block_size_high[bsize];
+ *width_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize];
+ *height_log2 = MI_SIZE_LOG2 + mi_size_high_log2[bsize];
+}
+#if CONFIG_AV1_HIGHBITDEPTH
+/* Compute MSE only on the blocks we filtered. */
+static uint64_t compute_cdef_dist_highbd(void *dst, int dstride, uint16_t *src,
+ cdef_list *dlist, int cdef_count,
+ BLOCK_SIZE bsize, int coeff_shift,
+ int row, int col) {
+ assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 ||
+ bsize == BLOCK_8X8);
+ uint64_t sum = 0;
+ int bi, bx, by;
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR((uint8_t *)dst);
+ uint16_t *dst_buff = &dst16[row * dstride + col];
+ int src_stride, width, height, width_log2, height_log2;
+ init_src_params(&src_stride, &width, &height, &width_log2, &height_log2,
+ bsize);
+ for (bi = 0; bi < cdef_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ sum += aom_mse_wxh_16bit_highbd(
+ &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride,
+ &src[bi << (height_log2 + width_log2)], src_stride, width, height);
+ }
+ return sum >> 2 * coeff_shift;
+}
+#endif
+
+ // Checks whether dual or quad block processing is applicable for block
+ // widths 8 and 4 respectively.
+static INLINE int is_dual_or_quad_applicable(cdef_list *dlist, int width,
+ int cdef_count, int bi, int iter) {
+ assert(width == 8 || width == 4);
+ const int blk_offset = (width == 8) ? 1 : 3;
+ if ((iter + blk_offset) >= cdef_count) return 0;
+
+ if (dlist[bi].by == dlist[bi + blk_offset].by &&
+ dlist[bi].bx + blk_offset == dlist[bi + blk_offset].bx)
+ return 1;
+
+ return 0;
+}
+
+static uint64_t compute_cdef_dist(void *dst, int dstride, uint16_t *src,
+ cdef_list *dlist, int cdef_count,
+ BLOCK_SIZE bsize, int coeff_shift, int row,
+ int col) {
+ assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 ||
+ bsize == BLOCK_8X8);
+ uint64_t sum = 0;
+ int bi, bx, by;
+ int iter = 0;
+ int inc = 1;
+ uint8_t *dst8 = (uint8_t *)dst;
+ uint8_t *dst_buff = &dst8[row * dstride + col];
+ int src_stride, width, height, width_log2, height_log2;
+ init_src_params(&src_stride, &width, &height, &width_log2, &height_log2,
+ bsize);
+
+ const int num_blks = 16 / width;
+ for (bi = 0; bi < cdef_count; bi += inc) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ uint16_t *src_tmp = &src[bi << (height_log2 + width_log2)];
+ uint8_t *dst_tmp =
+ &dst_buff[(by << height_log2) * dstride + (bx << width_log2)];
+
+ if (is_dual_or_quad_applicable(dlist, width, cdef_count, bi, iter)) {
+ sum += aom_mse_16xh_16bit(dst_tmp, dstride, src_tmp, width, height);
+ iter += num_blks;
+ inc = num_blks;
+ } else {
+ sum += aom_mse_wxh_16bit(dst_tmp, dstride, src_tmp, src_stride, width,
+ height);
+ iter += 1;
+ inc = 1;
+ }
+ }
+
+ return sum >> 2 * coeff_shift;
+}
+
+// Fill the boundary regions of the block with CDEF_VERY_LARGE, but only where
+// the region lies outside the frame boundary.
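+//
+// A sketch of the padded CDEF input buffer (CDEF_BSTRIDE-wide rows):
+//
+//                HBORDER      hfilt_size     HBORDER
+//              +----------+----------------+----------+
+//      VBORDER | top-left |      top       | top-right|
+//              +----------+----------------+----------+
+//   vfilt_size |   left   |  frame pixels  |  right   |
+//              +----------+----------------+----------+
+//      VBORDER | bot-left |     bottom     | bot-right|
+//              +----------+----------------+----------+
+//
+// Each of the eight border regions that falls outside the frame is filled
+// with CDEF_VERY_LARGE so that the filter taps landing there are ignored.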
+static INLINE void fill_borders_for_fbs_on_frame_boundary(
+ uint16_t *inbuf, int hfilt_size, int vfilt_size,
+ bool is_fb_on_frm_left_boundary, bool is_fb_on_frm_right_boundary,
+ bool is_fb_on_frm_top_boundary, bool is_fb_on_frm_bottom_boundary) {
+ if (!is_fb_on_frm_left_boundary && !is_fb_on_frm_right_boundary &&
+ !is_fb_on_frm_top_boundary && !is_fb_on_frm_bottom_boundary)
+ return;
+ if (is_fb_on_frm_bottom_boundary) {
+ // Fill bottom region of the block
+ const int buf_offset =
+ (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE + CDEF_HBORDER;
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, hfilt_size,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_bottom_boundary || is_fb_on_frm_left_boundary) {
+ const int buf_offset = (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE;
+ // Fill bottom-left region of the block
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_bottom_boundary || is_fb_on_frm_right_boundary) {
+ const int buf_offset =
+ (vfilt_size + CDEF_VBORDER) * CDEF_BSTRIDE + hfilt_size + CDEF_HBORDER;
+ // Fill bottom-right region of the block
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_top_boundary) {
+ // Fill top region of the block
+ fill_rect(&inbuf[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hfilt_size,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_top_boundary || is_fb_on_frm_left_boundary) {
+ // Fill top-left region of the block
+ fill_rect(inbuf, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_top_boundary || is_fb_on_frm_right_boundary) {
+ const int buf_offset = hfilt_size + CDEF_HBORDER;
+ // Fill top-right region of the block
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_left_boundary) {
+ const int buf_offset = CDEF_VBORDER * CDEF_BSTRIDE;
+ // Fill left region of the block
+ fill_rect(&inbuf[buf_offset], CDEF_BSTRIDE, vfilt_size, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (is_fb_on_frm_right_boundary) {
+ const int buf_offset = CDEF_VBORDER * CDEF_BSTRIDE;
+ // Fill right region of the block
+ fill_rect(&inbuf[buf_offset + hfilt_size + CDEF_HBORDER], CDEF_BSTRIDE,
+ vfilt_size, CDEF_HBORDER, CDEF_VERY_LARGE);
+ }
+}
+
+// Calculates the number of 8x8/4x4 filter units for which the SSE can be
+// computed in a single function call after CDEF filtering.
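+// For example, with 8x8 units four horizontally adjacent entries in dlist[]
+// (same row, columns x .. x+3) can be combined into one 32x8 SSE call, a
+// pair (x, x+1) into one 16x8 call, and otherwise a single 8x8 call is made.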
+static AOM_FORCE_INLINE int get_error_calc_width_in_filt_units(
+ cdef_list *dlist, int cdef_count, int bi, int subsampling_x,
+ int subsampling_y) {
+ // TODO(Ranjit): Extend the optimization for 422
+ if (subsampling_x != subsampling_y) return 1;
+
+ // Combining more blocks seems to increase encode time due to the increase
+ // in control code.
+ if (bi + 3 < cdef_count && dlist[bi].by == dlist[bi + 3].by &&
+ dlist[bi].bx + 3 == dlist[bi + 3].bx) {
+ /* Calculate error for four 8x8/4x4 blocks using 32x8/16x4 block specific
+ * logic if y co-ordinates match and x co-ordinates are
+ * separated by 3 for first and fourth 8x8/4x4 blocks in dlist[]. */
+ return 4;
+ }
+ if (bi + 1 < cdef_count && dlist[bi].by == dlist[bi + 1].by &&
+ dlist[bi].bx + 1 == dlist[bi + 1].bx) {
+ /* Calculate error for two 8x8/4x4 blocks using 16x8/8x4 block specific
+ * logic if their y co-ordinates match and x co-ordinates are
+ * separated by 1 for first and second 8x8/4x4 blocks in dlist[]. */
+ return 2;
+ }
+ return 1;
+}
+
+// Returns the block error after CDEF filtering for a given strength
+static INLINE uint64_t get_filt_error(
+ const CdefSearchCtx *cdef_search_ctx, const struct macroblockd_plane *pd,
+ cdef_list *dlist, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit,
+ int var[CDEF_NBLOCKS][CDEF_NBLOCKS], uint16_t *in, uint8_t *ref_buffer,
+ int ref_stride, int row, int col, int pri_strength, int sec_strength,
+ int cdef_count, int pli, int coeff_shift, BLOCK_SIZE bs) {
+ uint64_t curr_sse = 0;
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bs, pd->subsampling_x, pd->subsampling_y);
+ const int bw_log2 = 3 - pd->subsampling_x;
+ const int bh_log2 = 3 - pd->subsampling_y;
+
+ // TODO(Ranjit): Extend this optimization for HBD
+ if (!cdef_search_ctx->use_highbitdepth) {
+ // If all 8x8/4x4 blocks in the CDEF block need to be filtered, calculate
+ // the error at the CDEF block level.
+ const int tot_blk_count =
+ (block_size_wide[plane_bsize] * block_size_high[plane_bsize]) >>
+ (bw_log2 + bh_log2);
+ if (cdef_count == tot_blk_count) {
+ // Calculate the offset in the buffer based on block position
+ const FULLPEL_MV this_mv = { row, col };
+ const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride);
+ if (pri_strength == 0 && sec_strength == 0) {
+ // When CDEF strength is zero, filtering is not applied. Hence
+ // error is calculated between source and unfiltered pixels
+ curr_sse =
+ aom_sse(&ref_buffer[buf_offset], ref_stride,
+ get_buf_from_fullmv(&pd->dst, &this_mv), pd->dst.stride,
+ block_size_wide[plane_bsize], block_size_high[plane_bsize]);
+ } else {
+ DECLARE_ALIGNED(32, uint8_t, tmp_dst8[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+
+ av1_cdef_filter_fb(tmp_dst8, NULL, (1 << MAX_SB_SIZE_LOG2), in,
+ cdef_search_ctx->xdec[pli],
+ cdef_search_ctx->ydec[pli], dir, dirinit, var, pli,
+ dlist, cdef_count, pri_strength,
+ sec_strength + (sec_strength == 3),
+ cdef_search_ctx->damping, coeff_shift);
+ curr_sse =
+ aom_sse(&ref_buffer[buf_offset], ref_stride, tmp_dst8,
+ (1 << MAX_SB_SIZE_LOG2), block_size_wide[plane_bsize],
+ block_size_high[plane_bsize]);
+ }
+ } else {
+ // If only some of the 8x8/4x4 blocks in the CDEF block need to be
+ // filtered, the filtering functions produce 8-bit output and the error
+ // is calculated in the 8-bit domain.
+ if (pri_strength == 0 && sec_strength == 0) {
+ int num_error_calc_filt_units = 1;
+ for (int bi = 0; bi < cdef_count; bi = bi + num_error_calc_filt_units) {
+ const uint8_t by = dlist[bi].by;
+ const uint8_t bx = dlist[bi].bx;
+ const int16_t by_pos = (by << bh_log2);
+ const int16_t bx_pos = (bx << bw_log2);
+ // Calculate the offset in the buffer based on block position
+ const FULLPEL_MV this_mv = { row + by_pos, col + bx_pos };
+ const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride);
+ num_error_calc_filt_units = get_error_calc_width_in_filt_units(
+ dlist, cdef_count, bi, pd->subsampling_x, pd->subsampling_y);
+ curr_sse += aom_sse(
+ &ref_buffer[buf_offset], ref_stride,
+ get_buf_from_fullmv(&pd->dst, &this_mv), pd->dst.stride,
+ num_error_calc_filt_units * (1 << bw_log2), (1 << bh_log2));
+ }
+ } else {
+ DECLARE_ALIGNED(32, uint8_t, tmp_dst8[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+ av1_cdef_filter_fb(tmp_dst8, NULL, (1 << MAX_SB_SIZE_LOG2), in,
+ cdef_search_ctx->xdec[pli],
+ cdef_search_ctx->ydec[pli], dir, dirinit, var, pli,
+ dlist, cdef_count, pri_strength,
+ sec_strength + (sec_strength == 3),
+ cdef_search_ctx->damping, coeff_shift);
+ int num_error_calc_filt_units = 1;
+ for (int bi = 0; bi < cdef_count; bi = bi + num_error_calc_filt_units) {
+ const uint8_t by = dlist[bi].by;
+ const uint8_t bx = dlist[bi].bx;
+ const int16_t by_pos = (by << bh_log2);
+ const int16_t bx_pos = (bx << bw_log2);
+ // Calculate the offset in the buffer based on block position
+ const FULLPEL_MV this_mv = { row + by_pos, col + bx_pos };
+ const FULLPEL_MV tmp_buf_pos = { by_pos, bx_pos };
+ const int buf_offset = get_offset_from_fullmv(&this_mv, ref_stride);
+ const int tmp_buf_offset =
+ get_offset_from_fullmv(&tmp_buf_pos, (1 << MAX_SB_SIZE_LOG2));
+ num_error_calc_filt_units = get_error_calc_width_in_filt_units(
+ dlist, cdef_count, bi, pd->subsampling_x, pd->subsampling_y);
+ curr_sse += aom_sse(
+ &ref_buffer[buf_offset], ref_stride, &tmp_dst8[tmp_buf_offset],
+ (1 << MAX_SB_SIZE_LOG2),
+ num_error_calc_filt_units * (1 << bw_log2), (1 << bh_log2));
+ }
+ }
+ }
+ } else {
+ DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+
+ av1_cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in,
+ cdef_search_ctx->xdec[pli], cdef_search_ctx->ydec[pli],
+ dir, dirinit, var, pli, dlist, cdef_count, pri_strength,
+ sec_strength + (sec_strength == 3),
+ cdef_search_ctx->damping, coeff_shift);
+ curr_sse = cdef_search_ctx->compute_cdef_dist_fn(
+ ref_buffer, ref_stride, tmp_dst, dlist, cdef_count,
+ cdef_search_ctx->bsize[pli], coeff_shift, row, col);
+ }
+ return curr_sse;
+}
+
+// Calculates MSE at the 64x64 block level.
+// Inputs:
+// cdef_search_ctx: Pointer to the structure containing parameters related to
+// CDEF search context.
+// error_info: Error info struct (currently unused; see the TODO below).
+// fbr: Row index in units of 64x64 block
+// fbc: Column index in units of 64x64 block
+// sb_count: Index of the current filtered superblock in the MSE arrays
+// Returns:
+// Nothing will be returned. Contents of cdef_search_ctx will be modified.
+void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx,
+ struct aom_internal_error_info *error_info,
+ int fbr, int fbc, int sb_count) {
+ // TODO(aomedia:3276): Pass error_info to the low-level functions as required
+ // in future to handle error propagation.
+ (void)error_info;
+ const CommonModeInfoParams *const mi_params = cdef_search_ctx->mi_params;
+ const YV12_BUFFER_CONFIG *ref = cdef_search_ctx->ref;
+ const int coeff_shift = cdef_search_ctx->coeff_shift;
+ const int *mi_wide_l2 = cdef_search_ctx->mi_wide_l2;
+ const int *mi_high_l2 = cdef_search_ctx->mi_high_l2;
+
+ // Declare and initialize the temporary buffers.
+ DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
+ cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128];
+ int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+ int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+ uint16_t *const in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER;
+ int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+ int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
+ int hb_step = 1, vb_step = 1;
+ BLOCK_SIZE bs;
+
+ const MB_MODE_INFO *const mbmi =
+ mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+ MI_SIZE_64X64 * fbc];
+
+ uint8_t *ref_buffer[MAX_MB_PLANE] = { ref->y_buffer, ref->u_buffer,
+ ref->v_buffer };
+ int ref_stride[MAX_MB_PLANE] = { ref->y_stride, ref->uv_stride,
+ ref->uv_stride };
+
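+ // For 128x128, 128x64 and 64x128 superblocks the MSE is computed over the
+ // whole superblock, so widen the 64x64-unit counts and step by two units
+ // in the corresponding direction(s).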
+ if (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64 ||
+ mbmi->bsize == BLOCK_64X128) {
+ bs = mbmi->bsize;
+ if (bs == BLOCK_128X128 || bs == BLOCK_128X64) {
+ nhb = AOMMIN(MI_SIZE_128X128, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+ hb_step = 2;
+ }
+ if (bs == BLOCK_128X128 || bs == BLOCK_64X128) {
+ nvb = AOMMIN(MI_SIZE_128X128, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
+ vb_step = 2;
+ }
+ } else {
+ bs = BLOCK_64X64;
+ }
+ // Get the number of 8x8 blocks that are not skipped; CDEF processing is
+ // applied only to such blocks.
+ const int cdef_count = av1_cdef_compute_sb_list(
+ mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, bs);
+ const bool is_fb_on_frm_left_boundary = (fbc == 0);
+ const bool is_fb_on_frm_right_boundary =
+ (fbc + hb_step == cdef_search_ctx->nhfb);
+ const bool is_fb_on_frm_top_boundary = (fbr == 0);
+ const bool is_fb_on_frm_bottom_boundary =
+ (fbr + vb_step == cdef_search_ctx->nvfb);
+ const int yoff = CDEF_VBORDER * (!is_fb_on_frm_top_boundary);
+ const int xoff = CDEF_HBORDER * (!is_fb_on_frm_left_boundary);
+ int dirinit = 0;
+ for (int pli = 0; pli < cdef_search_ctx->num_planes; pli++) {
+ /* We avoid filtering the pixels for which some of the pixels to
+ average are outside the frame. We could change the filter instead,
+ but it would add special cases for any future vectorization. */
+ const int hfilt_size = (nhb << mi_wide_l2[pli]);
+ const int vfilt_size = (nvb << mi_high_l2[pli]);
+ const int ysize =
+ vfilt_size + CDEF_VBORDER * (!is_fb_on_frm_bottom_boundary) + yoff;
+ const int xsize =
+ hfilt_size + CDEF_HBORDER * (!is_fb_on_frm_right_boundary) + xoff;
+ const int row = fbr * MI_SIZE_64X64 << mi_high_l2[pli];
+ const int col = fbc * MI_SIZE_64X64 << mi_wide_l2[pli];
+ struct macroblockd_plane pd = cdef_search_ctx->plane[pli];
+ cdef_search_ctx->copy_fn(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
+ pd.dst.buf, row - yoff, col - xoff, pd.dst.stride,
+ ysize, xsize);
+ fill_borders_for_fbs_on_frame_boundary(
+ inbuf, hfilt_size, vfilt_size, is_fb_on_frm_left_boundary,
+ is_fb_on_frm_right_boundary, is_fb_on_frm_top_boundary,
+ is_fb_on_frm_bottom_boundary);
+ for (int gi = 0; gi < cdef_search_ctx->total_strengths; gi++) {
+ int pri_strength, sec_strength;
+ get_cdef_filter_strengths(cdef_search_ctx->pick_method, &pri_strength,
+ &sec_strength, gi);
+ const uint64_t curr_mse = get_filt_error(
+ cdef_search_ctx, &pd, dlist, dir, &dirinit, var, in, ref_buffer[pli],
+ ref_stride[pli], row, col, pri_strength, sec_strength, cdef_count,
+ pli, coeff_shift, bs);
+ if (pli < 2)
+ cdef_search_ctx->mse[pli][sb_count][gi] = curr_mse;
+ else
+ cdef_search_ctx->mse[1][sb_count][gi] += curr_mse;
+ }
+ }
+ cdef_search_ctx->sb_index[sb_count] =
+ MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc;
+}
+
+// MSE calculation at frame level.
+// Inputs:
+// cdef_search_ctx: Pointer to the structure containing parameters related to
+// CDEF search context.
+// Returns:
+// Nothing will be returned. Contents of cdef_search_ctx will be modified.
+static void cdef_mse_calc_frame(CdefSearchCtx *cdef_search_ctx,
+ struct aom_internal_error_info *error_info) {
+ // Loop over each sb.
+ for (int fbr = 0; fbr < cdef_search_ctx->nvfb; ++fbr) {
+ for (int fbc = 0; fbc < cdef_search_ctx->nhfb; ++fbc) {
+ // Checks if cdef processing can be skipped for particular sb.
+ if (cdef_sb_skip(cdef_search_ctx->mi_params, fbr, fbc)) continue;
+ // Calculate mse for each sb and store the relevant sb index.
+ av1_cdef_mse_calc_block(cdef_search_ctx, error_info, fbr, fbc,
+ cdef_search_ctx->sb_count);
+ cdef_search_ctx->sb_count++;
+ }
+ }
+}
+
+// Allocates memory for members of CdefSearchCtx.
+// Inputs:
+// cm: Pointer to top level common structure (used to report allocation
+// failures).
+// cdef_search_ctx: Pointer to the structure containing parameters
+// related to CDEF search context.
+// Returns:
+// Nothing will be returned. Contents of cdef_search_ctx will be modified.
+static void cdef_alloc_data(AV1_COMMON *cm, CdefSearchCtx *cdef_search_ctx) {
+ const int nvfb = cdef_search_ctx->nvfb;
+ const int nhfb = cdef_search_ctx->nhfb;
+ CHECK_MEM_ERROR(
+ cm, cdef_search_ctx->sb_index,
+ aom_malloc(nvfb * nhfb * sizeof(cdef_search_ctx->sb_index[0])));
+ cdef_search_ctx->sb_count = 0;
+ CHECK_MEM_ERROR(cm, cdef_search_ctx->mse[0],
+ aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb));
+ CHECK_MEM_ERROR(cm, cdef_search_ctx->mse[1],
+ aom_malloc(sizeof(**cdef_search_ctx->mse) * nvfb * nhfb));
+}
+
+// Deallocates the memory allocated for members of CdefSearchCtx.
+// Inputs:
+// cdef_search_ctx: Pointer to the structure containing parameters
+// related to CDEF search context.
+// Returns:
+// Nothing will be returned.
+void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx) {
+ if (cdef_search_ctx) {
+ aom_free(cdef_search_ctx->mse[0]);
+ cdef_search_ctx->mse[0] = NULL;
+ aom_free(cdef_search_ctx->mse[1]);
+ cdef_search_ctx->mse[1] = NULL;
+ aom_free(cdef_search_ctx->sb_index);
+ cdef_search_ctx->sb_index = NULL;
+ }
+}
+
+// Initialize the parameters related to CDEF search context.
+// Inputs:
+// frame: Pointer to compressed frame buffer
+// ref: Pointer to the frame buffer holding the source frame
+// cm: Pointer to top level common structure
+// xd: Pointer to common current coding block structure
+// cdef_search_ctx: Pointer to the structure containing parameters related to
+// CDEF search context.
+// pick_method: Search method used to select CDEF parameters
+// Returns:
+// Nothing will be returned. Contents of cdef_search_ctx will be modified.
+static AOM_INLINE void cdef_params_init(const YV12_BUFFER_CONFIG *frame,
+ const YV12_BUFFER_CONFIG *ref,
+ AV1_COMMON *cm, MACROBLOCKD *xd,
+ CdefSearchCtx *cdef_search_ctx,
+ CDEF_PICK_METHOD pick_method) {
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int num_planes = av1_num_planes(cm);
+ cdef_search_ctx->mi_params = &cm->mi_params;
+ cdef_search_ctx->ref = ref;
+ cdef_search_ctx->nvfb =
+ (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ cdef_search_ctx->nhfb =
+ (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ cdef_search_ctx->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0);
+ cdef_search_ctx->damping = 3 + (cm->quant_params.base_qindex >> 6);
+ cdef_search_ctx->total_strengths = nb_cdef_strengths[pick_method];
+ cdef_search_ctx->num_planes = num_planes;
+ cdef_search_ctx->pick_method = pick_method;
+ cdef_search_ctx->sb_count = 0;
+ cdef_search_ctx->use_highbitdepth = cm->seq_params->use_highbitdepth;
+ av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0,
+ num_planes);
+ // Initialize plane wise information.
+ for (int pli = 0; pli < num_planes; pli++) {
+ cdef_search_ctx->xdec[pli] = xd->plane[pli].subsampling_x;
+ cdef_search_ctx->ydec[pli] = xd->plane[pli].subsampling_y;
+ cdef_search_ctx->bsize[pli] =
+ cdef_search_ctx->ydec[pli]
+ ? (cdef_search_ctx->xdec[pli] ? BLOCK_4X4 : BLOCK_8X4)
+ : (cdef_search_ctx->xdec[pli] ? BLOCK_4X8 : BLOCK_8X8);
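+ // e.g. 4:2:0 chroma (xdec = ydec = 1) uses BLOCK_4X4 units, 4:2:2 chroma
+ // (xdec = 1, ydec = 0) uses BLOCK_4X8, and an unsubsampled plane BLOCK_8X8.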
+ cdef_search_ctx->mi_wide_l2[pli] =
+ MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
+ cdef_search_ctx->mi_high_l2[pli] =
+ MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
+ cdef_search_ctx->plane[pli] = xd->plane[pli];
+ }
+ // Function pointer initialization.
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (cm->seq_params->use_highbitdepth) {
+ cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_highbd;
+ cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist_highbd;
+ } else {
+ cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_lowbd;
+ cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist;
+ }
+#else
+ cdef_search_ctx->copy_fn = av1_cdef_copy_sb8_16_lowbd;
+ cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist;
+#endif
+}
+
+void av1_pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef,
+ int is_screen_content) {
+ const int bd = cm->seq_params->bit_depth;
+ const int q =
+ av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, bd) >> (bd - 8);
+ CdefInfo *const cdef_info = &cm->cdef_info;
+ // Check the speed feature to avoid extra signaling.
+ if (skip_cdef) {
+ cdef_info->cdef_bits = 1;
+ cdef_info->nb_cdef_strengths = 2;
+ } else {
+ cdef_info->cdef_bits = 0;
+ cdef_info->nb_cdef_strengths = 1;
+ }
+ cdef_info->cdef_damping = 3 + (cm->quant_params.base_qindex >> 6);
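+ // e.g. a base_qindex of 128 gives a damping of 3 + (128 >> 6) = 5.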
+
+ int predicted_y_f1 = 0;
+ int predicted_y_f2 = 0;
+ int predicted_uv_f1 = 0;
+ int predicted_uv_f2 = 0;
+ if (is_screen_content) {
+ predicted_y_f1 =
+ (int)(5.88217781e-06 * q * q + 6.10391455e-03 * q + 9.95043102e-02);
+ predicted_y_f2 =
+ (int)(-7.79934857e-06 * q * q + 6.58957830e-03 * q + 8.81045025e-01);
+ predicted_uv_f1 =
+ (int)(-6.79500136e-06 * q * q + 1.02695586e-02 * q + 1.36126802e-01);
+ predicted_uv_f2 =
+ (int)(-9.99613695e-08 * q * q - 1.79361339e-05 * q + 1.17022324e+00);
+ predicted_y_f1 = clamp(predicted_y_f1, 0, 15);
+ predicted_y_f2 = clamp(predicted_y_f2, 0, 3);
+ predicted_uv_f1 = clamp(predicted_uv_f1, 0, 15);
+ predicted_uv_f2 = clamp(predicted_uv_f2, 0, 3);
+ } else {
+ if (!frame_is_intra_only(cm)) {
+ predicted_y_f1 = clamp((int)roundf(q * q * -0.0000023593946f +
+ q * 0.0068615186f + 0.02709886f),
+ 0, 15);
+ predicted_y_f2 = clamp((int)roundf(q * q * -0.00000057629734f +
+ q * 0.0013993345f + 0.03831067f),
+ 0, 3);
+ predicted_uv_f1 = clamp((int)roundf(q * q * -0.0000007095069f +
+ q * 0.0034628846f + 0.00887099f),
+ 0, 15);
+ predicted_uv_f2 = clamp((int)roundf(q * q * 0.00000023874085f +
+ q * 0.00028223585f + 0.05576307f),
+ 0, 3);
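+ // As a rough sanity check of the fit: at q = 100,
+ // predicted_y_f1 = roundf(-0.00000236 * 100 * 100 + 0.00686 * 100 +
+ // 0.0271) = roundf(0.69) = 1.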
+ } else {
+ predicted_y_f1 = clamp(
+ (int)roundf(q * q * 0.0000033731974f + q * 0.008070594f + 0.0187634f),
+ 0, 15);
+ predicted_y_f2 = clamp((int)roundf(q * q * 0.0000029167343f +
+ q * 0.0027798624f + 0.0079405f),
+ 0, 3);
+ predicted_uv_f1 = clamp((int)roundf(q * q * -0.0000130790995f +
+ q * 0.012892405f - 0.00748388f),
+ 0, 15);
+ predicted_uv_f2 = clamp((int)roundf(q * q * 0.0000032651783f +
+ q * 0.00035520183f + 0.00228092f),
+ 0, 3);
+ }
+ }
+ cdef_info->cdef_strengths[0] =
+ predicted_y_f1 * CDEF_SEC_STRENGTHS + predicted_y_f2;
+ cdef_info->cdef_uv_strengths[0] =
+ predicted_uv_f1 * CDEF_SEC_STRENGTHS + predicted_uv_f2;
+
+ // mbmi->cdef_strength is already set in the encoding stage. We don't need to
+ // set it again here.
+ if (skip_cdef) {
+ cdef_info->cdef_strengths[1] = 0;
+ cdef_info->cdef_uv_strengths[1] = 0;
+ return;
+ }
+
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ MB_MODE_INFO **mbmi = mi_params->mi_grid_base;
+ // mbmi is NULL when real-time rate control library is used.
+ if (!mbmi) return;
+ for (int r = 0; r < nvfb; ++r) {
+ for (int c = 0; c < nhfb; ++c) {
+ MB_MODE_INFO *current_mbmi = mbmi[MI_SIZE_64X64 * c];
+ current_mbmi->cdef_strength = 0;
+ }
+ mbmi += MI_SIZE_64X64 * mi_params->mi_stride;
+ }
+}
+
+void av1_cdef_search(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ CDEF_CONTROL cdef_control = cpi->oxcf.tool_cfg.cdef_control;
+
+ assert(cdef_control != CDEF_NONE);
+ if (cdef_control == CDEF_REFERENCE && cpi->ppi->rtc_ref.non_reference_frame) {
+ CdefInfo *const cdef_info = &cm->cdef_info;
+ cdef_info->nb_cdef_strengths = 1;
+ cdef_info->cdef_bits = 0;
+ cdef_info->cdef_strengths[0] = 0;
+ cdef_info->cdef_uv_strengths[0] = 0;
+ return;
+ }
+
+ // Indicates if external rate control is used (for testing).
+ const int rtc_ext_rc = cpi->rc.rtc_external_ratectrl;
+ if (rtc_ext_rc) {
+ av1_pick_cdef_from_qp(cm, 0, 0);
+ return;
+ }
+ CDEF_PICK_METHOD pick_method = cpi->sf.lpf_sf.cdef_pick_method;
+ if (pick_method == CDEF_PICK_FROM_Q) {
+ const int use_screen_content_model =
+ cm->quant_params.base_qindex >
+ AOMMAX(cpi->sf.rt_sf.screen_content_cdef_filter_qindex_thresh,
+ cpi->rc.best_quality + 5) &&
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+ av1_pick_cdef_from_qp(cm, cpi->sf.rt_sf.skip_cdef_sb,
+ use_screen_content_model);
+ return;
+ }
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int damping = 3 + (cm->quant_params.base_qindex >> 6);
+ const int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 &&
+ pick_method <= CDEF_FAST_SEARCH_LVL5);
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+
+ if (!cpi->cdef_search_ctx)
+ CHECK_MEM_ERROR(cm, cpi->cdef_search_ctx,
+ aom_malloc(sizeof(*cpi->cdef_search_ctx)));
+ CdefSearchCtx *cdef_search_ctx = cpi->cdef_search_ctx;
+
+ // Initialize parameters related to CDEF search context.
+ cdef_params_init(&cm->cur_frame->buf, cpi->source, cm, xd, cdef_search_ctx,
+ pick_method);
+ // Allocate CDEF search context buffers.
+ cdef_alloc_data(cm, cdef_search_ctx);
+ // Frame level mse calculation.
+ if (cpi->mt_info.num_workers > 1) {
+ av1_cdef_mse_calc_frame_mt(cpi);
+ } else {
+ cdef_mse_calc_frame(cdef_search_ctx, cm->error);
+ }
+
+ /* Search over different numbers of signaling bits. */
+ int nb_strength_bits = 0;
+ uint64_t best_rd = UINT64_MAX;
+ CdefInfo *const cdef_info = &cm->cdef_info;
+ int sb_count = cdef_search_ctx->sb_count;
+ uint64_t(*mse[2])[TOTAL_STRENGTHS];
+ mse[0] = cdef_search_ctx->mse[0];
+ mse[1] = cdef_search_ctx->mse[1];
+ /* Calculate the maximum number of bits required to signal CDEF strengths at
+ * block level */
+ const int total_strengths = nb_cdef_strengths[pick_method];
+ const int joint_strengths =
+ num_planes > 1 ? total_strengths * total_strengths : total_strengths;
+ const int max_signaling_bits =
+ joint_strengths == 1 ? 0 : get_msb(joint_strengths - 1) + 1;
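+ // e.g. with CDEF_FAST_SEARCH_LVL5 there are 2 strengths per plane, so two
+ // planes give 4 joint strengths and max_signaling_bits = get_msb(3) + 1 = 2.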
+ int rdmult = cpi->td.mb.rdmult;
+ for (int i = 0; i <= 3; i++) {
+ if (i > max_signaling_bits) break;
+ int best_lev0[CDEF_MAX_STRENGTHS];
+ int best_lev1[CDEF_MAX_STRENGTHS] = { 0 };
+ const int nb_strengths = 1 << i;
+ uint64_t tot_mse;
+ if (num_planes > 1) {
+ tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths,
+ mse, sb_count, pick_method);
+ } else {
+ tot_mse = joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count,
+ pick_method);
+ }
+
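+ // Rate: each filtered SB spends i bits to index one of the 2^i signaled
+ // strengths, and each strength costs CDEF_STRENGTH_BITS in the frame
+ // header (doubled when chroma strengths are coded too). The MSE is scaled
+ // by 16 to match the distortion units expected by RDCOST.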
+ const int total_bits = sb_count * i + nb_strengths * CDEF_STRENGTH_BITS *
+ (num_planes > 1 ? 2 : 1);
+ const int rate_cost = av1_cost_literal(total_bits);
+ const uint64_t dist = tot_mse * 16;
+ const uint64_t rd = RDCOST(rdmult, rate_cost, dist);
+ if (rd < best_rd) {
+ best_rd = rd;
+ nb_strength_bits = i;
+ memcpy(cdef_info->cdef_strengths, best_lev0,
+ nb_strengths * sizeof(best_lev0[0]));
+ if (num_planes > 1) {
+ memcpy(cdef_info->cdef_uv_strengths, best_lev1,
+ nb_strengths * sizeof(best_lev1[0]));
+ }
+ }
+ }
+
+ cdef_info->cdef_bits = nb_strength_bits;
+ cdef_info->nb_cdef_strengths = 1 << nb_strength_bits;
+ for (int i = 0; i < sb_count; i++) {
+ uint64_t best_mse = UINT64_MAX;
+ int best_gi = 0;
+ for (int gi = 0; gi < cdef_info->nb_cdef_strengths; gi++) {
+ uint64_t curr = mse[0][i][cdef_info->cdef_strengths[gi]];
+ if (num_planes > 1) curr += mse[1][i][cdef_info->cdef_uv_strengths[gi]];
+ if (curr < best_mse) {
+ best_gi = gi;
+ best_mse = curr;
+ }
+ }
+ mi_params->mi_grid_base[cdef_search_ctx->sb_index[i]]->cdef_strength =
+ best_gi;
+ }
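+ // For the fast search methods, the chosen values are indices into reduced
+ // strength tables; remap them to actual primary/secondary strengths before
+ // they are written to the frame header.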
+ if (fast) {
+ for (int j = 0; j < cdef_info->nb_cdef_strengths; j++) {
+ const int luma_strength = cdef_info->cdef_strengths[j];
+ const int chroma_strength = cdef_info->cdef_uv_strengths[j];
+ int pri_strength, sec_strength;
+
+ STORE_CDEF_FILTER_STRENGTH(cdef_info->cdef_strengths[j], pick_method,
+ luma_strength);
+ STORE_CDEF_FILTER_STRENGTH(cdef_info->cdef_uv_strengths[j], pick_method,
+ chroma_strength);
+ }
+ }
+
+ cdef_info->cdef_damping = damping;
+ // Deallocate CDEF search context buffers.
+ av1_cdef_dealloc_data(cdef_search_ctx);
+}
diff --git a/third_party/aom/av1/encoder/pickcdef.h b/third_party/aom/av1/encoder/pickcdef.h
new file mode 100644
index 0000000000..192e734fb0
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickcdef.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_PICKCDEF_H_
+#define AOM_AV1_ENCODER_PICKCDEF_H_
+
+#include "av1/common/cdef.h"
+#include "av1/encoder/speed_features.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\enum CDEF_CONTROL
+ * \brief This enum controls to which frames CDEF is applied.
+ */
+typedef enum {
+ CDEF_NONE = 0, /*!< Disable CDEF on all frames. */
+ CDEF_ALL = 1, /*!< Enable CDEF for all frames. */
+ CDEF_REFERENCE = 2, /*!< Disable CDEF on non-reference frames. */
+} CDEF_CONTROL;
+
+/*!\cond */
+struct MultiThreadInfo;
+
+#define REDUCED_PRI_STRENGTHS_LVL1 8
+#define REDUCED_PRI_STRENGTHS_LVL2 5
+#define REDUCED_SEC_STRENGTHS_LVL3 2
+#define REDUCED_SEC_STRENGTHS_LVL5 1
+#define REDUCED_PRI_STRENGTHS_LVL4 2
+
+#define REDUCED_TOTAL_STRENGTHS_LVL1 \
+ (REDUCED_PRI_STRENGTHS_LVL1 * CDEF_SEC_STRENGTHS)
+#define REDUCED_TOTAL_STRENGTHS_LVL2 \
+ (REDUCED_PRI_STRENGTHS_LVL2 * CDEF_SEC_STRENGTHS)
+#define REDUCED_TOTAL_STRENGTHS_LVL3 \
+ (REDUCED_PRI_STRENGTHS_LVL2 * REDUCED_SEC_STRENGTHS_LVL3)
+#define REDUCED_TOTAL_STRENGTHS_LVL4 \
+ (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL3)
+#define REDUCED_TOTAL_STRENGTHS_LVL5 \
+ (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL5)
+#define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
+
+static const int priconv_lvl1[REDUCED_PRI_STRENGTHS_LVL1] = { 0, 1, 2, 3,
+ 5, 7, 10, 13 };
+static const int priconv_lvl2[REDUCED_PRI_STRENGTHS_LVL2] = { 0, 2, 4, 8, 14 };
+static const int priconv_lvl4[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 11 };
+static const int priconv_lvl5[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 5 };
+static const int secconv_lvl3[REDUCED_SEC_STRENGTHS_LVL3] = { 0, 2 };
+static const int secconv_lvl5[REDUCED_SEC_STRENGTHS_LVL5] = { 0 };
+static const int nb_cdef_strengths[CDEF_PICK_METHODS] = {
+ TOTAL_STRENGTHS,
+ REDUCED_TOTAL_STRENGTHS_LVL1,
+ REDUCED_TOTAL_STRENGTHS_LVL2,
+ REDUCED_TOTAL_STRENGTHS_LVL3,
+ REDUCED_TOTAL_STRENGTHS_LVL4,
+ REDUCED_TOTAL_STRENGTHS_LVL5,
+ TOTAL_STRENGTHS
+};
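+// For example, the exhaustive search evaluates all TOTAL_STRENGTHS =
+// CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS strength combinations per plane,
+// while the fastest reduced level evaluates only REDUCED_TOTAL_STRENGTHS_LVL5
+// (= 2) of them.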
+
+typedef void (*copy_fn_t)(uint16_t *dst, int dstride, const uint8_t *src,
+ int src_voffset, int src_hoffset, int sstride,
+ int vsize, int hsize);
+typedef uint64_t (*compute_cdef_dist_t)(void *dst, int dstride, uint16_t *src,
+ cdef_list *dlist, int cdef_count,
+ BLOCK_SIZE bsize, int coeff_shift,
+ int row, int col);
+
+/*! \brief CDEF search context.
+ */
+typedef struct {
+ /*!
+ * Pointer to the frame buffer holding the source frame
+ */
+ const YV12_BUFFER_CONFIG *ref;
+ /*!
+ * Pointer to params related to MB_MODE_INFO arrays and related info
+ */
+ CommonModeInfoParams *mi_params;
+ /*!
+ * Info specific to each plane
+ */
+ struct macroblockd_plane plane[MAX_MB_PLANE];
+ /*!
+ * Function pointer of copy_fn
+ */
+ copy_fn_t copy_fn;
+ /*!
+ * Function pointer of compute_cdef_dist_fn
+ */
+ compute_cdef_dist_t compute_cdef_dist_fn;
+ /*!
+ * Number of strengths evaluated in CDEF filter search
+ */
+ int total_strengths;
+ /*!
+ * Bit-depth dependent shift
+ */
+ int coeff_shift;
+ /*!
+ * CDEF damping factor
+ */
+ int damping;
+ /*!
+ * Search method used to select CDEF parameters
+ */
+ int pick_method;
+ /*!
+ * Number of planes
+ */
+ int num_planes;
+ /*!
+ * Log2 of width of the MI unit in pixels. mi_wide_l2[i]
+ * indicates the width of the MI unit in pixels for the ith plane
+ */
+ int mi_wide_l2[MAX_MB_PLANE];
+ /*!
+ * Log2 of height of the MI unit in pixels. mi_high_l2[i]
+ * indicates the height of the MI unit in pixels for the ith plane
+ */
+ int mi_high_l2[MAX_MB_PLANE];
+ /*!
+ * Subsampling in x direction. xdec[i] indicates the subsampling
+ * for the ith plane
+ */
+ int xdec[MAX_MB_PLANE];
+ /*!
+ * Subsampling in y direction. ydec[i] indicates the subsampling
+ * for the ith plane
+ */
+ int ydec[MAX_MB_PLANE];
+ /*!
+ * bsize[i] indicates the block size of ith plane
+ */
+ int bsize[MAX_MB_PLANE];
+ /*!
+ * Number of 64x64 blocks in vertical direction of a frame
+ */
+ int nvfb;
+ /*!
+ * Number of 64x64 blocks in horizontal direction of a frame
+ */
+ int nhfb;
+ /*!
+ * Pointer to the mean squared error between the CDEF filtered block and the
+ * source block. mse[i][j][k] stores the MSE of the ith plane (i=0 corresponds
+ * to Y-plane, i=1 corresponds to U and V planes), jth block and kth strength
+ * index
+ */
+ uint64_t (*mse[2])[TOTAL_STRENGTHS];
+ /*!
+ * Holds the position (in units of mi's) of the cdef filtered
+ * block in raster scan order
+ */
+ int *sb_index;
+ /*!
+ * Holds the count of cdef filtered blocks
+ */
+ int sb_count;
+ /*!
+ * Indicates if 16-bit frame buffers are to be used, i.e., the content
+ * bit depth is greater than 8
+ */
+ bool use_highbitdepth;
+} CdefSearchCtx;
+
+static INLINE int sb_all_skip(const CommonModeInfoParams *const mi_params,
+ int mi_row, int mi_col) {
+ const int maxr = AOMMIN(mi_params->mi_rows - mi_row, MI_SIZE_64X64);
+ const int maxc = AOMMIN(mi_params->mi_cols - mi_col, MI_SIZE_64X64);
+ const int stride = mi_params->mi_stride;
+ MB_MODE_INFO **mbmi = mi_params->mi_grid_base + mi_row * stride + mi_col;
+ for (int r = 0; r < maxr; ++r, mbmi += stride) {
+ for (int c = 0; c < maxc; ++c) {
+ if (!mbmi[c]->skip_txfm) return 0;
+ }
+ }
+ return 1;
+}
+
+// Checks if cdef processing can be skipped for particular sb.
+// Inputs:
+// cdef_search_ctx: Pointer to the structure containing parameters related to
+// CDEF search context.
+// fbr: Row index in units of 64x64 block
+// fbc: Column index in units of 64x64 block
+// Returns:
+// 1 if CDEF processing of the superblock can be skipped, 0 otherwise.
+static INLINE int cdef_sb_skip(const CommonModeInfoParams *const mi_params,
+ int fbr, int fbc) {
+ const MB_MODE_INFO *const mbmi =
+ mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+ MI_SIZE_64X64 * fbc];
+ // No filtering if the entire filter block is skipped.
+ if (sb_all_skip(mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64))
+ return 1;
+// Skip odd-numbered 64x64 block rows when bsize is BLOCK_128X128 or
+// BLOCK_64X128, and odd-numbered columns when bsize is BLOCK_128X128 or
+// BLOCK_128X64, since CDEF filtering for such blocks is done at the
+// corresponding larger block sizes.
+ if (((fbc & 1) &&
+ (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_128X64)) ||
+ ((fbr & 1) &&
+ (mbmi->bsize == BLOCK_128X128 || mbmi->bsize == BLOCK_64X128)))
+ return 1;
+ return 0;
+}
+
+void av1_cdef_dealloc_data(CdefSearchCtx *cdef_search_ctx);
+
+void av1_cdef_mse_calc_block(CdefSearchCtx *cdef_search_ctx,
+ struct aom_internal_error_info *error_info,
+ int fbr, int fbc, int sb_count);
+/*!\endcond */
+
+/*!\brief AV1 CDEF parameter search
+ *
+ * \ingroup in_loop_cdef
+ *
+ * Searches for optimal CDEF parameters for frame
+ *
+ * \param[in,out] cpi Top level encoder structure
+ *
+ * \remark Nothing is returned. Instead, optimal CDEF parameters are stored
+ * in the \c cdef_info structure of type \ref CdefInfo inside \c cm:
+ * \arg \c cdef_bits: Bits of strength parameters
+ * \arg \c nb_cdef_strengths: Number of strength parameters
+ * \arg \c cdef_strengths: list of \c nb_cdef_strengths strength parameters
+ * for the luma plane.
+ * \arg \c cdef_uv_strengths: list of \c nb_cdef_strengths strength parameters
+ * for the chroma planes.
+ * \arg \c cdef_damping: CDEF damping factor.
+ *
+ */
+void av1_cdef_search(struct AV1_COMP *cpi);
+
+/*!\brief AV1 CDEF level from QP
+ *
+ * \ingroup in_loop_cdef
+ *
+ * Calculates CDEF levels from frame QP. Only used for speed 7+ with RT mode.
+ *
+ * \param[in,out] cm Pointer to top level common structure
+ * \param[in] skip_cdef Flag to skip CDEF filtering
+ * \param[in] is_screen_content Flag indicating screen content
+ *
+ */
+void av1_pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef,
+ int is_screen_content);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_ENCODER_PICKCDEF_H_
diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c
new file mode 100644
index 0000000000..9084d3f13a
--- /dev/null
+++ b/third_party/aom/av1/encoder/picklpf.c
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/quant_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/picklpf.h"
+
+static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc,
+ YV12_BUFFER_CONFIG *dst_bc, int plane) {
+ switch (plane) {
+ case 0: aom_yv12_copy_y(src_bc, dst_bc); break;
+ case 1: aom_yv12_copy_u(src_bc, dst_bc); break;
+ case 2: aom_yv12_copy_v(src_bc, dst_bc); break;
+ default: assert(plane >= 0 && plane <= 2); break;
+ }
+}
+
+int av1_get_max_filter_level(const AV1_COMP *cpi) {
+ if (is_stat_consumption_stage_twopass(cpi)) {
+ return cpi->ppi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
+ : MAX_LOOP_FILTER;
+ } else {
+ return MAX_LOOP_FILTER;
+ }
+}
+
+static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
+ AV1_COMP *const cpi, int filt_level,
+ int partial_frame, int plane, int dir) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ int num_workers = mt_info->num_mod_workers[MOD_LPF];
+ AV1_COMMON *const cm = &cpi->common;
+ int64_t filt_err;
+
+ assert(plane >= 0 && plane <= 2);
+ int filter_level[2] = { filt_level, filt_level };
+ if (plane == 0 && dir == 0) filter_level[1] = cm->lf.filter_level[1];
+ if (plane == 0 && dir == 1) filter_level[0] = cm->lf.filter_level[0];
+
+ // set base filters for use of av1_get_filter_level when in DELTA_LF mode
+ switch (plane) {
+ case 0:
+ cm->lf.filter_level[0] = filter_level[0];
+ cm->lf.filter_level[1] = filter_level[1];
+ break;
+ case 1: cm->lf.filter_level_u = filter_level[0]; break;
+ case 2: cm->lf.filter_level_v = filter_level[0]; break;
+ }
+
+ // lpf_opt_level = 1 : Enables dual/quad loop-filtering.
+ int lpf_opt_level = is_inter_tx_size_search_level_one(&cpi->sf.tx_sf);
+
+ av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane,
+ plane + 1, partial_frame, mt_info->workers,
+ num_workers, &mt_info->lf_row_sync, lpf_opt_level);
+
+ filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane,
+ cm->seq_params->use_highbitdepth);
+
+ // Re-instate the unfiltered frame
+ yv12_copy_plane(&cpi->last_frame_uf, &cm->cur_frame->buf, plane);
+
+ return filt_err;
+}
+
+static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ int partial_frame,
+ const int *last_frame_filter_level, int plane,
+ int dir) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int min_filter_level = 0;
+ const int max_filter_level = av1_get_max_filter_level(cpi);
+ int filt_direction = 0;
+ int64_t best_err;
+ int filt_best;
+
+ // Start the search at the previous frame filter level unless it is now out of
+ // range.
+ int lvl;
+ switch (plane) {
+ case 0:
+ switch (dir) {
+ case 2:
+ lvl = (last_frame_filter_level[0] + last_frame_filter_level[1] + 1) >>
+ 1;
+ break;
+ case 0:
+ case 1: lvl = last_frame_filter_level[dir]; break;
+ default: assert(dir >= 0 && dir <= 2); return 0;
+ }
+ break;
+ case 1: lvl = last_frame_filter_level[2]; break;
+ case 2: lvl = last_frame_filter_level[3]; break;
+ default: assert(plane >= 0 && plane <= 2); return 0;
+ }
+ int filt_mid = clamp(lvl, min_filter_level, max_filter_level);
+ int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
+ // Sum squared error at each filter level
+ int64_t ss_err[MAX_LOOP_FILTER + 1];
+
+ const int use_coarse_search = cpi->sf.lpf_sf.use_coarse_filter_level_search;
+ assert(use_coarse_search <= 1);
+ static const int min_filter_step_lookup[2] = { 0, 2 };
+ // min_filter_step_thresh determines the stopping criterion for the search:
+ // the search terminates once filter_step is no larger than
+ // min_filter_step_thresh.
+ const int min_filter_step_thresh = min_filter_step_lookup[use_coarse_search];
+
+ // Set each entry to -1
+ memset(ss_err, 0xFF, sizeof(ss_err));
+ yv12_copy_plane(&cm->cur_frame->buf, &cpi->last_frame_uf, plane);
+ best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir);
+ filt_best = filt_mid;
+ ss_err[filt_mid] = best_err;
+
+ while (filter_step > min_filter_step_thresh) {
+ const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level);
+ const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level);
+
+ // Bias against raising loop filter in favor of lowering it.
+ int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
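+ // The bias grows with the current best error, the mid filter level and the
+ // step size, so early coarse steps tolerate a larger SSE regression when
+ // moving toward a lower filter level.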
+
+ if ((is_stat_consumption_stage_twopass(cpi)) &&
+ (cpi->ppi->twopass.section_intra_rating < 20))
+ bias = (bias * cpi->ppi->twopass.section_intra_rating) / 20;
+
+ // Bias less when larger transform sizes are in use.
+ if (cm->features.tx_mode != ONLY_4X4) bias >>= 1;
+
+ if (filt_direction <= 0 && filt_low != filt_mid) {
+ // Get Low filter error score
+ if (ss_err[filt_low] < 0) {
+ ss_err[filt_low] =
+ try_filter_frame(sd, cpi, filt_low, partial_frame, plane, dir);
+ }
+ // If value is close to the best so far then bias towards a lower loop
+ // filter value.
+ if (ss_err[filt_low] < (best_err + bias)) {
+ // Was it actually better than the previous best?
+ if (ss_err[filt_low] < best_err) {
+ best_err = ss_err[filt_low];
+ }
+ filt_best = filt_low;
+ }
+ }
+
+ // Now look at filt_high
+ if (filt_direction >= 0 && filt_high != filt_mid) {
+ if (ss_err[filt_high] < 0) {
+ ss_err[filt_high] =
+ try_filter_frame(sd, cpi, filt_high, partial_frame, plane, dir);
+ }
+ // Accept the higher filter level only if it is significantly better than
+ // the previous best, i.e., it must overcome the bias against raising the
+ // filter value.
+ if (ss_err[filt_high] < (best_err - bias)) {
+ best_err = ss_err[filt_high];
+ filt_best = filt_high;
+ }
+ }
+
+ // Halve the step distance if the best filter value was the same as last time
+ if (filt_best == filt_mid) {
+ filter_step /= 2;
+ filt_direction = 0;
+ } else {
+ filt_direction = (filt_best < filt_mid) ? -1 : 1;
+ filt_mid = filt_best;
+ }
+ }
+
+ return filt_best;
+}
+
+void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ LPF_PICK_METHOD method) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ struct loopfilter *const lf = &cm->lf;
+ int disable_filter_rt_screen = 0;
+ (void)sd;
+
+ lf->sharpness_level = 0;
+
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->sf.rt_sf.skip_lf_screen)
+ disable_filter_rt_screen = av1_cyclic_refresh_disable_lf_cdef(cpi);
+
+ if (disable_filter_rt_screen ||
+ cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_NONE ||
+ (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_REFERENCE &&
+ cpi->ppi->rtc_ref.non_reference_frame)) {
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ return;
+ }
+
+ if (method == LPF_PICK_MINIMAL_LPF) {
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ } else if (method >= LPF_PICK_FROM_Q) {
+ const int min_filter_level = 0;
+ const int max_filter_level = av1_get_max_filter_level(cpi);
+ const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0,
+ seq_params->bit_depth);
+ // Based on test results for the rtc test set:
+ // 0.04590 boosted or 0.02295 non-boosted, in 18-bit fixed point.
+ const int strength_boost_q_threshold = 0;
+ int inter_frame_multiplier =
+ (q > strength_boost_q_threshold ||
+ (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+ cpi->common.width * cpi->common.height > 352 * 288))
+ ? 12034
+ : 6017;
+ // Increase the strength on base TL0 for temporal layers at low resolution,
+ // based on the frame's source_sad.
+ if (cpi->svc.number_temporal_layers > 1 &&
+ cpi->svc.temporal_layer_id == 0 &&
+ cpi->common.width * cpi->common.height <= 352 * 288 &&
+ cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ if (cpi->rc.frame_source_sad > 100000)
+ inter_frame_multiplier = inter_frame_multiplier << 1;
+ else if (cpi->rc.frame_source_sad > 50000)
+ inter_frame_multiplier = 3 * (inter_frame_multiplier >> 1);
+ }
+ // These values were determined by linear fitting the result of the
+ // searched level for 8 bit depth:
+ // Keyframes: filt_guess = q * 0.06699 - 1.60817
+ // Other frames: filt_guess = q * inter_frame_multiplier + 2.48225
+ //
+ // And high bit depth separately:
+ // filt_guess = q * 0.316206 + 3.87252
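+ // In 18-bit fixed point, 17563 / 2^18 ~= 0.06699, 421574 / 2^18 ~= 1.60817
+ // and 650707 / 2^18 ~= 2.48225; likewise 12034 / 2^18 ~= 0.04590 and
+ // 6017 / 2^18 ~= 0.02295 for the inter-frame multipliers.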
+ int filt_guess;
+ switch (seq_params->bit_depth) {
+ case AOM_BITS_8:
+ filt_guess =
+ (cm->current_frame.frame_type == KEY_FRAME)
+ ? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18)
+ : ROUND_POWER_OF_TWO(q * inter_frame_multiplier + 650707, 18);
+ break;
+ case AOM_BITS_10:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+ break;
+ case AOM_BITS_12:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+ break;
+ default:
+ assert(0 &&
+ "bit_depth should be AOM_BITS_8, AOM_BITS_10 "
+ "or AOM_BITS_12");
+ return;
+ }
+ if (seq_params->bit_depth != AOM_BITS_8 &&
+ cm->current_frame.frame_type == KEY_FRAME)
+ filt_guess -= 4;
+ // TODO(chengchen): retrain the model for Y, U, V filter levels
+ lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level);
+ lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level);
+ lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level);
+ lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level);
+ if (cpi->oxcf.algo_cfg.loopfilter_control == LOOPFILTER_SELECTIVELY &&
+ !frame_is_intra_only(cm) && !cpi->rc.high_source_sad) {
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ } else {
+ const int num4x4 = (cm->width >> 2) * (cm->height >> 2);
+ const int newmv_thresh = 7;
+ const int distance_since_key_thresh = 5;
+ if ((cpi->td.rd_counts.newmv_or_intra_blocks * 100 / num4x4) <
+ newmv_thresh &&
+ cpi->rc.frames_since_key > distance_since_key_thresh) {
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ }
+ }
+ }
+ } else {
+ int last_frame_filter_level[4] = { 0 };
+ if (!frame_is_intra_only(cm)) {
+ last_frame_filter_level[0] = cpi->ppi->filter_level[0];
+ last_frame_filter_level[1] = cpi->ppi->filter_level[1];
+ last_frame_filter_level[2] = cpi->ppi->filter_level_u;
+ last_frame_filter_level[3] = cpi->ppi->filter_level_v;
+ }
+ // The frame buffer last_frame_uf is used to store the non-loop filtered
+ // reconstructed frame in search_filter_level().
+ if (aom_realloc_frame_buffer(
+ &cpi->last_frame_uf, cm->width, cm->height,
+ seq_params->subsampling_x, seq_params->subsampling_y,
+ seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, NULL, NULL, NULL, 0, 0))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate last frame buffer");
+
+ lf->filter_level[0] = lf->filter_level[1] =
+ search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+ last_frame_filter_level, 0, 2);
+ if (method != LPF_PICK_FROM_FULL_IMAGE_NON_DUAL) {
+ lf->filter_level[0] =
+ search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+ last_frame_filter_level, 0, 0);
+ lf->filter_level[1] =
+ search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+ last_frame_filter_level, 0, 1);
+ }
+
+ if (num_planes > 1) {
+ lf->filter_level_u =
+ search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+ last_frame_filter_level, 1, 0);
+ lf->filter_level_v =
+ search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+ last_frame_filter_level, 2, 0);
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h
new file mode 100644
index 0000000000..f567937c32
--- /dev/null
+++ b/third_party/aom/av1/encoder/picklpf.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PICKLPF_H_
+#define AOM_AV1_ENCODER_PICKLPF_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+struct yv12_buffer_config;
+struct AV1_COMP;
+int av1_get_max_filter_level(const AV1_COMP *cpi);
+
+/*!\brief Algorithm for AV1 loop filter level selection.
+ *
+ * \ingroup in_loop_filter
+ * This function determines proper filter levels used for in-loop filter
+ * (deblock filter).
+ *
+ * \param[in] sd The pointer of frame buffer
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] method The method used to select filter levels
+ *
+ * \par
+ * method includes:
+ * \arg \c LPF_PICK_FROM_FULL_IMAGE: Try the full image with different values.
+ * \arg \c LPF_PICK_FROM_FULL_IMAGE_NON_DUAL: Try the full image filter search
+ * with non-dual filter only.
+ * \arg \c LPF_PICK_FROM_SUBIMAGE: Try a small portion of the image with
+ * different values.
+ * \arg \c LPF_PICK_FROM_Q: Estimate the level based on quantizer and frame type
+ * \arg \c LPF_PICK_MINIMAL_LPF: Pick 0 to disable LPF if LPF was enabled last
+ * frame
+ *
+ * \remark Nothing is returned. Instead, filter levels below are stored in the
+ * "loopfilter" structure inside "cpi":
+ * \arg \c filter_level[0]: the vertical filter level for Y plane
+ * \arg \c filter_level[1]: the horizontal filter level for Y plane
+ * \arg \c filter_level_u: the filter level for U plane
+ * \arg \c filter_level_v: the filter level for V plane
+ *
+ * \n
+ * \b Overview
+ * \par
+ * The workflow of deblock filter is shown in Fig.1. \n
+ * Boundary pixels pass through a non-flatness check, followed by a step that
+ * determines smoothness and selects proper types of filters
+ * (4-, 6-, 8-, 14-tap filter). \n
+ * If the non-flatness criterion is not satisfied, the encoder will not apply
+ * deblock filtering to these boundary pixels.
+ * \image html filter_flow.png "Fig.1. The workflow of deblock filter" width=70%
+ *
+ * \par
+ * The non-flatness is determined by the boundary pixels and thresholds as shown
+ * in Fig.2. \n
+ * Filtering is applied when \n
+ * \f$|p_0-p_1|<thr_1\f$ and \f$|q_0-q_1|<thr_1\f$ and
+ * \f$2*|p_0-q_0|+|p_1-q_1|/2<thr_2\f$ \n
+ * \image html filter_thr.png "Fig.2. Non-flatness of pixel boundary" height=40%
+ *
+ * \par
+ * Thresholds ("thr_1" and "thr_2") are determined by the filter level. \n
+ * In AV1, for each frame, we employ the four filter levels, based on these
+ * observations: \n
+ * Luma and chroma planes have different characteristics, including subsampling
+ * (different plane size), coding quality (chroma planes are better coded). \n
+ * Therefore chroma planes need less deblocking filtering than luma plane. \n
+ * In addition, content texture has different spatial characteristics: vertical
+ * and horizontal direction may need different level of filtering. \n
+ * The selection of these filter levels is described in the following section.
+ *
+ * \par
+ * \b Algorithm
+ * \par
+ * The encoder selects filter levels given the current frame buffer, and the
+ * method. \n
+ * By default, "LPF_PICK_FROM_FULL_IMAGE" is used, which should provide
+ * the most appropriate filter levels. \n
+ * For video on demand (VOD) mode, if speed setting is larger than 5,
+ * "LPF_PICK_FROM_FULL_IMAGE_NON_DUAL" is used. \n
+ * For real-time mode, if speed setting is larger than 5, "LPF_PICK_FROM_Q" is
+ * used.
+ *
+ * \par
+ * "LPF_PICK_FROM_FULL_IMAGE" method: determine filter levels sequentially
+ * by a filter level search procedure (function "search_filter_level"). \n
+ * The order is: \n
+ * First search and determine the filter level for Y plane.
+ * Let vertical filter level (filter_level[0]) and the horizontal filter level
+ * (filter_level[1]) be equal to it. \n
+ * Keep the horizontal filter level the same and search and determine the
+ * vertical filter level. \n
+ * Search and determine the horizontal filter level. \n
+ * Search and determine filter level for U plane. \n
+ * Search and determine filter level for V plane.
+ *
+ * \par
+ * Search and determine filter level is fulfilled by function
+ * "search_filter_level". \n
+ * It starts with a base filter level ("filt_mid") initialized by the
+ * corresponding last frame's filter level. \n
+ * A filter step ("filter_step") is determined as:
+ * filter_step = filt_mid < 16 ? 4 : filt_mid / 4. \n
+ * Then a modified binary search strategy is employed to find a proper
+ * filter level. \n
+ * In each iteration, set filt_low = filt_mid - filter_step,
+ * filt_high = filt_mid + filter_step. \n
+ * We now have three candidate levels, "filt_mid", "filt_low" and "filt_high".
+ * \n
+ * Deblock filtering is applied on the current frame with candidate filter
+ * levels and the sum of squared error (SSE) between source and filtered frame
+ * is computed. \n
+ * Set "filt_best" to the filter level of the smallest SSE. If "filter_best"
+ * equals to "filt_mid", halve the filter_step. Otherwise, set filt_mid =
+ * filt_best. \n
+ * Go to the next iteration until "filter_step" is 0. \n
+ * Note that in the comparison of SSEs between SSE[filt_low] and SSE[filt_mid],
+ * a "bias" is introduced so that the search slightly favors the lower filter
+ * level. \n
+ * It is based on the observation that raising the filter level tends to
+ * oversmooth the frame: even when it yields a slightly smaller SSE for the
+ * current frame, it degrades the frame as a prediction reference for future
+ * frames and leads to suboptimal performance overall. \n
+ * Function "try_filter_frame" is the reference for applying deblock filtering
+ * with a given filter level and computing the SSE.
+ *
+ * \par
+ * "LPF_PICK_FROM_FULL_IMAGE_NON_DUAL" method: almost the same as
+ * "LPF_PICK_FROM_FULL_IMAGE", \n
+ * just without separately searching for appropriate filter levels for vertical
+ * and horizontal filters.
+ *
+ * \par
+ * "LPF_PICK_FROM_Q" method: filter levels are determined by the
+ * quantization factor (q). \n
+ * For 8 bit: \n
+ * Keyframes: filt_guess = q * 0.06699 - 1.60817 \n
+ * Other frames: filt_guess = q * inter_frame_multiplier + 2.48225 \n
+ * inter_frame_multiplier = q > strength_boost_q_threshold ? 0.04590 : 0.02295 \n
+ * For 10 bit and 12 bit: \n
+ * filt_guess = q * 0.316206 + 3.87252 \n
+ * Then filter_level[0] = filter_level[1] = filter_level_u = filter_level_v =
+ * clamp(filt_guess, min_filter_level, max_filter_level) \n
+ * Where min_filter_level = 0, max_filter_level = 64 \n
+ * The equations were determined by linear fitting using filter levels
+ * generated by "LPF_PICK_FROM_FULL_IMAGE" method.
+ *
+ */
+void av1_pick_filter_level(const struct yv12_buffer_config *sd,
+ struct AV1_COMP *cpi, LPF_PICK_METHOD method);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PICKLPF_H_
diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c
new file mode 100644
index 0000000000..6429064175
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickrst.c
@@ -0,0 +1,2217 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/restoration.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/pickrst.h"
+
+// Number of Wiener iterations
+#define NUM_WIENER_ITERS 5
+
+// Penalty factor for use of dual sgr
+#define DUAL_SGR_PENALTY_MULT 0.01
+
+// Working precision for Wiener filter coefficients
+#define WIENER_TAP_SCALE_FACTOR ((int64_t)1 << 16)
+
+#define SGRPROJ_EP_GRP1_START_IDX 0
+#define SGRPROJ_EP_GRP1_END_IDX 9
+#define SGRPROJ_EP_GRP1_SEARCH_COUNT 4
+#define SGRPROJ_EP_GRP2_3_SEARCH_COUNT 2
+static const int sgproj_ep_grp1_seed[SGRPROJ_EP_GRP1_SEARCH_COUNT] = { 0, 3, 6,
+ 9 };
+static const int sgproj_ep_grp2_3[SGRPROJ_EP_GRP2_3_SEARCH_COUNT][14] = {
+ { 10, 10, 11, 11, 12, 12, 13, 13, 13, 13, -1, -1, -1, -1 },
+ { 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15 }
+};
+
+#if DEBUG_LR_COSTING
+RestorationUnitInfo lr_ref_params[RESTORE_TYPES][MAX_MB_PLANE]
+ [MAX_LR_UNITS_W * MAX_LR_UNITS_H];
+#endif // DEBUG_LR_COSTING
+
+typedef int64_t (*sse_extractor_type)(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+typedef int64_t (*sse_part_extractor_type)(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b,
+ int hstart, int width, int vstart,
+ int height);
+typedef uint64_t (*var_part_extractor_type)(const YV12_BUFFER_CONFIG *a,
+ int hstart, int width, int vstart,
+ int height);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#define NUM_EXTRACTORS (3 * (1 + 1))
+#else
+#define NUM_EXTRACTORS 3
+#endif
+static const sse_part_extractor_type sse_part_extractors[NUM_EXTRACTORS] = {
+ aom_get_y_sse_part, aom_get_u_sse_part,
+ aom_get_v_sse_part,
+#if CONFIG_AV1_HIGHBITDEPTH
+ aom_highbd_get_y_sse_part, aom_highbd_get_u_sse_part,
+ aom_highbd_get_v_sse_part,
+#endif
+};
+static const var_part_extractor_type var_part_extractors[NUM_EXTRACTORS] = {
+ aom_get_y_var, aom_get_u_var, aom_get_v_var,
+#if CONFIG_AV1_HIGHBITDEPTH
+ aom_highbd_get_y_var, aom_highbd_get_u_var, aom_highbd_get_v_var,
+#endif
+};
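+
+// The extractor tables above are laid out as { Y, U, V } for low bitdepth
+// followed by { Y, U, V } for high bitdepth, so the helpers below can index
+// them directly with (3 * highbd + plane).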
+
+static int64_t sse_restoration_unit(const RestorationTileLimits *limits,
+ const YV12_BUFFER_CONFIG *src,
+ const YV12_BUFFER_CONFIG *dst, int plane,
+ int highbd) {
+ return sse_part_extractors[3 * highbd + plane](
+ src, dst, limits->h_start, limits->h_end - limits->h_start,
+ limits->v_start, limits->v_end - limits->v_start);
+}
+
+static uint64_t var_restoration_unit(const RestorationTileLimits *limits,
+ const YV12_BUFFER_CONFIG *src, int plane,
+ int highbd) {
+ return var_part_extractors[3 * highbd + plane](
+ src, limits->h_start, limits->h_end - limits->h_start, limits->v_start,
+ limits->v_end - limits->v_start);
+}
+
+typedef struct {
+ const YV12_BUFFER_CONFIG *src;
+ YV12_BUFFER_CONFIG *dst;
+
+ const AV1_COMMON *cm;
+ const MACROBLOCK *x;
+ int plane;
+ int plane_w;
+ int plane_h;
+ RestUnitSearchInfo *rusi;
+
+ // Speed features
+ const LOOP_FILTER_SPEED_FEATURES *lpf_sf;
+
+ uint8_t *dgd_buffer;
+ int dgd_stride;
+ const uint8_t *src_buffer;
+ int src_stride;
+
+ // SSE values for each restoration mode for the current RU
+ // These are saved by each search function for use in search_switchable()
+ int64_t sse[RESTORE_SWITCHABLE_TYPES];
+
+ // This flag will be set based on the speed feature
+ // 'prune_sgr_based_on_wiener'. 0 implies no pruning and 1 implies pruning.
+ uint8_t skip_sgr_eval;
+
+ // Total rate and distortion so far for each restoration type
+ // These are initialised by reset_rsc in search_rest_type
+ int64_t total_sse[RESTORE_TYPES];
+ int64_t total_bits[RESTORE_TYPES];
+
+ // Reference parameters for delta-coding
+ //
+ // For each restoration type, we need to store the latest parameter set which
+ // has been used, so that we can properly cost up the next parameter set.
+ // Note that we have two sets of these - one for the single-restoration-mode
+ // search (ie, frame_restoration_type = RESTORE_WIENER or RESTORE_SGRPROJ)
+ // and one for the switchable mode. This is because these two cases can lead
+ // to different sets of parameters being signaled, but we don't know which
+ // we will pick for sure until the end of the search process.
+ WienerInfo ref_wiener;
+ SgrprojInfo ref_sgrproj;
+ WienerInfo switchable_ref_wiener;
+ SgrprojInfo switchable_ref_sgrproj;
+
+ // Buffers used to hold dgd-avg and src-avg data respectively during SIMD
+ // call of Wiener filter.
+ int16_t *dgd_avg;
+ int16_t *src_avg;
+} RestSearchCtxt;
+
+static AOM_INLINE void rsc_on_tile(void *priv) {
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ set_default_wiener(&rsc->ref_wiener);
+ set_default_sgrproj(&rsc->ref_sgrproj);
+ set_default_wiener(&rsc->switchable_ref_wiener);
+ set_default_sgrproj(&rsc->switchable_ref_sgrproj);
+}
+
+static AOM_INLINE void reset_rsc(RestSearchCtxt *rsc) {
+ memset(rsc->total_sse, 0, sizeof(rsc->total_sse));
+ memset(rsc->total_bits, 0, sizeof(rsc->total_bits));
+}
+
+static AOM_INLINE void init_rsc(const YV12_BUFFER_CONFIG *src,
+ const AV1_COMMON *cm, const MACROBLOCK *x,
+ const LOOP_FILTER_SPEED_FEATURES *lpf_sf,
+ int plane, RestUnitSearchInfo *rusi,
+ YV12_BUFFER_CONFIG *dst, RestSearchCtxt *rsc) {
+ rsc->src = src;
+ rsc->dst = dst;
+ rsc->cm = cm;
+ rsc->x = x;
+ rsc->plane = plane;
+ rsc->rusi = rusi;
+ rsc->lpf_sf = lpf_sf;
+
+ const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf;
+ const int is_uv = plane != AOM_PLANE_Y;
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+ assert(plane_w == src->crop_widths[is_uv]);
+ assert(plane_h == src->crop_heights[is_uv]);
+ assert(src->crop_widths[is_uv] == dgd->crop_widths[is_uv]);
+ assert(src->crop_heights[is_uv] == dgd->crop_heights[is_uv]);
+
+ rsc->plane_w = plane_w;
+ rsc->plane_h = plane_h;
+ rsc->src_buffer = src->buffers[plane];
+ rsc->src_stride = src->strides[is_uv];
+ rsc->dgd_buffer = dgd->buffers[plane];
+ rsc->dgd_stride = dgd->strides[is_uv];
+}
+
+static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
+ const RestorationTileLimits *limits,
+ const RestorationUnitInfo *rui) {
+ const AV1_COMMON *const cm = rsc->cm;
+ const int plane = rsc->plane;
+ const int is_uv = plane > 0;
+ const RestorationInfo *rsi = &cm->rst_info[plane];
+ RestorationLineBuffers rlbs;
+ const int bit_depth = cm->seq_params->bit_depth;
+ const int highbd = cm->seq_params->use_highbitdepth;
+
+ const YV12_BUFFER_CONFIG *fts = &cm->cur_frame->buf;
+  // TODO(yunqing): For now, the optimized LR filter is only used in the
+  // decoder. It can also be used in the encoder.
+ const int optimized_lr = 0;
+
+ av1_loop_restoration_filter_unit(
+ limits, rui, &rsi->boundaries, &rlbs, rsc->plane_w, rsc->plane_h,
+ is_uv && cm->seq_params->subsampling_x,
+ is_uv && cm->seq_params->subsampling_y, highbd, bit_depth,
+ fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane],
+ rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr, cm->error);
+
+ return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd);
+}
+
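+// The self-guided projection model evaluated below forms each restored
+// sample (before rounding) as
+//   v = (u << SGRPROJ_PRJ_BITS) + xq[0] * (flt0 - u) + xq[1] * (flt1 - u)
+// where u is the degraded sample scaled up by SGRPROJ_RST_BITS. The branches
+// specialize this for the cases where only one of the two filters is active.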
+int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int32_t *flt0,
+ int flt0_stride, int32_t *flt1,
+ int flt1_stride, int xq[2],
+ const sgr_params_type *params) {
+ int i, j;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15));
+ assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ }
+ } else if (params->r[0] > 0) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[0] * (flt0[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ }
+ } else if (params->r[1] > 0) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[1] * (flt1[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt1 += flt1_stride;
+ }
+ } else {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t e = (int32_t)(dat[j]) - src[j];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+
+ return err;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2],
+ const sgr_params_type *params) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ int i, j;
+ int64_t err = 0;
+ const int32_t half = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ int xq0 = xq[0];
+ int xq1 = xq[1];
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t d = dat[j];
+ const int32_t s = src[j];
+ const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS);
+ int32_t v0 = flt0[j] - u;
+ int32_t v1 = flt1[j] - u;
+ int32_t v = half;
+ v += xq0 * v0;
+ v += xq1 * v1;
+ const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s;
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ src += src_stride;
+ }
+ } else if (params->r[0] > 0 || params->r[1] > 0) {
+ int exq;
+ int32_t *flt;
+ int flt_stride;
+ if (params->r[0] > 0) {
+ exq = xq[0];
+ flt = flt0;
+ flt_stride = flt0_stride;
+ } else {
+ exq = xq[1];
+ flt = flt1;
+ flt_stride = flt1_stride;
+ }
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t d = dat[j];
+ const int32_t s = src[j];
+ const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS);
+ int32_t v = half;
+ v += exq * (flt[j] - u);
+ const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s;
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ flt += flt_stride;
+ src += src_stride;
+ }
+ } else {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t d = dat[j];
+ const int32_t s = src[j];
+ const int32_t e = d - s;
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+ return err;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int use_highbitdepth,
+ int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int *xqd,
+ const sgr_params_type *params) {
+ int xq[2];
+ av1_decode_xq(xqd, xq, params);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (use_highbitdepth) {
+ return av1_highbd_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, xq, params);
+
+ } else {
+ return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, xq, params);
+ }
+#else
+ (void)use_highbitdepth;
+ return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, xq, params);
+#endif
+}
+
+#define USE_SGRPROJ_REFINEMENT_SEARCH 1
+static int64_t finer_search_pixel_proj_error(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt0,
+ int flt0_stride, int32_t *flt1, int flt1_stride, int start_step, int *xqd,
+ const sgr_params_type *params) {
+ int64_t err = get_pixel_proj_error(
+ src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0,
+ flt0_stride, flt1, flt1_stride, xqd, params);
+ (void)start_step;
+#if USE_SGRPROJ_REFINEMENT_SEARCH
+ int64_t err2;
+ int tap_min[] = { SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MIN1 };
+ int tap_max[] = { SGRPROJ_PRJ_MAX0, SGRPROJ_PRJ_MAX1 };
+ for (int s = start_step; s >= 1; s >>= 1) {
+ for (int p = 0; p < 2; ++p) {
+ if ((params->r[0] == 0 && p == 0) || (params->r[1] == 0 && p == 1)) {
+ continue;
+ }
+ int skip = 0;
+ do {
+ if (xqd[p] - s >= tap_min[p]) {
+ xqd[p] -= s;
+ err2 =
+ get_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, use_highbitdepth, flt0,
+ flt0_stride, flt1, flt1_stride, xqd, params);
+ if (err2 > err) {
+ xqd[p] += s;
+ } else {
+ err = err2;
+ skip = 1;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ if (skip) break;
+ do {
+ if (xqd[p] + s <= tap_max[p]) {
+ xqd[p] += s;
+ err2 =
+ get_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, use_highbitdepth, flt0,
+ flt0_stride, flt1, flt1_stride, xqd, params);
+ if (err2 > err) {
+ xqd[p] -= s;
+ } else {
+ err = err2;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ }
+ }
+#endif // USE_SGRPROJ_REFINEMENT_SEARCH
+ return err;
+}
+
+static int64_t signed_rounded_divide(int64_t dividend, int64_t divisor) {
+ if (dividend < 0)
+ return (dividend - divisor / 2) / divisor;
+ else
+ return (dividend + divisor / 2) / divisor;
+}
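+// For example, signed_rounded_divide(7, 2) == 4 and
+// signed_rounded_divide(-7, 2) == -4: ties round away from zero.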
+
+static AOM_INLINE void calc_proj_params_r0_r1_c(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
+ const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
+ H[0][0] += (int64_t)f1 * f1;
+ H[1][1] += (int64_t)f2 * f2;
+ H[0][1] += (int64_t)f1 * f2;
+ C[0] += (int64_t)f1 * s;
+ C[1] += (int64_t)f2 * s;
+ }
+ }
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r0_r1_high_bd_c(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
+ const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
+ H[0][0] += (int64_t)f1 * f1;
+ H[1][1] += (int64_t)f2 * f2;
+ H[0][1] += (int64_t)f1 * f2;
+ C[0] += (int64_t)f1 * s;
+ C[1] += (int64_t)f2 * s;
+ }
+ }
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r0_c(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8,
+ int dat_stride, int32_t *flt0,
+ int flt0_stride, int64_t H[2][2],
+ int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
+ H[0][0] += (int64_t)f1 * f1;
+ C[0] += (int64_t)f1 * s;
+ }
+ }
+ H[0][0] /= size;
+ C[0] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r0_high_bd_c(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u;
+ H[0][0] += (int64_t)f1 * f1;
+ C[0] += (int64_t)f1 * s;
+ }
+ }
+ H[0][0] /= size;
+ C[0] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r1_c(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8,
+ int dat_stride, int32_t *flt1,
+ int flt1_stride, int64_t H[2][2],
+ int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
+ H[1][1] += (int64_t)f2 * f2;
+ C[1] += (int64_t)f2 * s;
+ }
+ }
+ H[1][1] /= size;
+ C[1] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r1_high_bd_c(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const int32_t s =
+ (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u;
+ H[1][1] += (int64_t)f2 * f2;
+ C[1] += (int64_t)f2 * s;
+ }
+ }
+ H[1][1] /= size;
+ C[1] /= size;
+}
+
+// The function calls one of 3 subfunctions to handle the following cases:
+// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
+// of C and H need to be computed.
+// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
+void av1_calc_proj_params_c(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride, int32_t *flt1,
+ int flt1_stride, int64_t H[2][2], int64_t C[2],
+ const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ calc_proj_params_r0_r1_c(src8, width, height, src_stride, dat8, dat_stride,
+ flt0, flt0_stride, flt1, flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ calc_proj_params_r0_c(src8, width, height, src_stride, dat8, dat_stride,
+ flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ calc_proj_params_r1_c(src8, width, height, src_stride, dat8, dat_stride,
+ flt1, flt1_stride, H, C);
+ }
+}
+
+void av1_calc_proj_params_high_bd_c(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int32_t *flt0,
+ int flt0_stride, int32_t *flt1,
+ int flt1_stride, int64_t H[2][2],
+ int64_t C[2],
+ const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ calc_proj_params_r0_r1_high_bd_c(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ calc_proj_params_r0_high_bd_c(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ calc_proj_params_r1_high_bd_c(src8, width, height, src_stride, dat8,
+ dat_stride, flt1, flt1_stride, H, C);
+ }
+}
+
+static AOM_INLINE void get_proj_subspace(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int use_highbitdepth, int32_t *flt0,
+ int flt0_stride, int32_t *flt1,
+ int flt1_stride, int *xq,
+ const sgr_params_type *params) {
+ int64_t H[2][2] = { { 0, 0 }, { 0, 0 } };
+ int64_t C[2] = { 0, 0 };
+
+ // Default values to be returned if the problem becomes ill-posed
+ xq[0] = 0;
+ xq[1] = 0;
+
+ if (!use_highbitdepth) {
+ if ((width & 0x7) == 0) {
+ av1_calc_proj_params(src8, width, height, src_stride, dat8, dat_stride,
+ flt0, flt0_stride, flt1, flt1_stride, H, C, params);
+ } else {
+ av1_calc_proj_params_c(src8, width, height, src_stride, dat8, dat_stride,
+ flt0, flt0_stride, flt1, flt1_stride, H, C,
+ params);
+ }
+ }
+#if CONFIG_AV1_HIGHBITDEPTH
+ else { // NOLINT
+ if ((width & 0x7) == 0) {
+ av1_calc_proj_params_high_bd(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C, params);
+ } else {
+ av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C, params);
+ }
+ }
+#endif
+
+ if (params->r[0] == 0) {
+ // H matrix is now only the scalar H[1][1]
+ // C vector is now only the scalar C[1]
+ const int64_t Det = H[1][1];
+ if (Det == 0) return; // ill-posed, return default values
+ xq[0] = 0;
+ xq[1] = (int)signed_rounded_divide(C[1] * (1 << SGRPROJ_PRJ_BITS), Det);
+ } else if (params->r[1] == 0) {
+ // H matrix is now only the scalar H[0][0]
+ // C vector is now only the scalar C[0]
+ const int64_t Det = H[0][0];
+ if (Det == 0) return; // ill-posed, return default values
+ xq[0] = (int)signed_rounded_divide(C[0] * (1 << SGRPROJ_PRJ_BITS), Det);
+ xq[1] = 0;
+ } else {
+ const int64_t Det = H[0][0] * H[1][1] - H[0][1] * H[1][0];
+ if (Det == 0) return; // ill-posed, return default values
+
+ // If scaling up dividend would overflow, instead scale down the divisor
+ const int64_t div1 = H[1][1] * C[0] - H[0][1] * C[1];
+ if ((div1 > 0 && INT64_MAX / (1 << SGRPROJ_PRJ_BITS) < div1) ||
+ (div1 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div1))
+ xq[0] = (int)signed_rounded_divide(div1, Det / (1 << SGRPROJ_PRJ_BITS));
+ else
+ xq[0] = (int)signed_rounded_divide(div1 * (1 << SGRPROJ_PRJ_BITS), Det);
+
+ const int64_t div2 = H[0][0] * C[1] - H[1][0] * C[0];
+ if ((div2 > 0 && INT64_MAX / (1 << SGRPROJ_PRJ_BITS) < div2) ||
+ (div2 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div2))
+ xq[1] = (int)signed_rounded_divide(div2, Det / (1 << SGRPROJ_PRJ_BITS));
+ else
+ xq[1] = (int)signed_rounded_divide(div2 * (1 << SGRPROJ_PRJ_BITS), Det);
+ }
+}
+
+static AOM_INLINE void encode_xq(int *xq, int *xqd,
+ const sgr_params_type *params) {
+ if (params->r[0] == 0) {
+ xqd[0] = 0;
+ xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xq[1], SGRPROJ_PRJ_MIN1,
+ SGRPROJ_PRJ_MAX1);
+ } else if (params->r[1] == 0) {
+ xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0);
+ xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0], SGRPROJ_PRJ_MIN1,
+ SGRPROJ_PRJ_MAX1);
+ } else {
+ xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0);
+ xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1], SGRPROJ_PRJ_MIN1,
+ SGRPROJ_PRJ_MAX1);
+ }
+}
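+// Note: the xqd values produced above are what get signaled in the
+// bitstream; av1_decode_xq() (av1/common/restoration.c) recovers xq from
+// them, inverting this mapping up to the clamping applied here.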
+
+// Apply the self-guided filter across an entire restoration unit.
+static AOM_INLINE void apply_sgr(int sgr_params_idx, const uint8_t *dat8,
+ int width, int height, int dat_stride,
+ int use_highbd, int bit_depth, int pu_width,
+ int pu_height, int32_t *flt0, int32_t *flt1,
+ int flt_stride,
+ struct aom_internal_error_info *error_info) {
+ for (int i = 0; i < height; i += pu_height) {
+ const int h = AOMMIN(pu_height, height - i);
+ int32_t *flt0_row = flt0 + i * flt_stride;
+ int32_t *flt1_row = flt1 + i * flt_stride;
+ const uint8_t *dat8_row = dat8 + i * dat_stride;
+
+ // Iterate over the stripe in blocks of width pu_width
+ for (int j = 0; j < width; j += pu_width) {
+ const int w = AOMMIN(pu_width, width - j);
+ if (av1_selfguided_restoration(
+ dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j,
+ flt_stride, sgr_params_idx, bit_depth, use_highbd) != 0) {
+ aom_internal_error(
+ error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffer in av1_selfguided_restoration");
+ }
+ }
+ }
+}
+
+static AOM_INLINE void compute_sgrproj_err(
+ const uint8_t *dat8, const int width, const int height,
+ const int dat_stride, const uint8_t *src8, const int src_stride,
+ const int use_highbitdepth, const int bit_depth, const int pu_width,
+ const int pu_height, const int ep, int32_t *flt0, int32_t *flt1,
+ const int flt_stride, int *exqd, int64_t *err,
+ struct aom_internal_error_info *error_info) {
+ int exq[2];
+ apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth,
+ pu_width, pu_height, flt0, flt1, flt_stride, error_info);
+ const sgr_params_type *const params = &av1_sgr_params[ep];
+ get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
+ use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq,
+ params);
+ encode_xq(exq, exqd, params);
+ *err = finer_search_pixel_proj_error(
+ src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0,
+ flt_stride, flt1, flt_stride, 2, exqd, params);
+}
+
+static AOM_INLINE void get_best_error(int64_t *besterr, const int64_t err,
+ const int *exqd, int *bestxqd,
+ int *bestep, const int ep) {
+ if (*besterr == -1 || err < *besterr) {
+ *bestep = ep;
+ *besterr = err;
+ bestxqd[0] = exqd[0];
+ bestxqd[1] = exqd[1];
+ }
+}
+
+static SgrprojInfo search_selfguided_restoration(
+ const uint8_t *dat8, int width, int height, int dat_stride,
+ const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth,
+ int pu_width, int pu_height, int32_t *rstbuf, int enable_sgr_ep_pruning,
+ struct aom_internal_error_info *error_info) {
+ int32_t *flt0 = rstbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ int ep, idx, bestep = 0;
+ int64_t besterr = -1;
+ int exqd[2], bestxqd[2] = { 0, 0 };
+ int flt_stride = ((width + 7) & ~7) + 8;
+ assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
+ pu_width == RESTORATION_PROC_UNIT_SIZE);
+ assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
+ pu_height == RESTORATION_PROC_UNIT_SIZE);
+ if (!enable_sgr_ep_pruning) {
+ for (ep = 0; ep < SGRPROJ_PARAMS; ep++) {
+ int64_t err;
+ compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+ use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+ flt0, flt1, flt_stride, exqd, &err, error_info);
+ get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+ }
+ } else {
+    // Evaluate the four seed ep values in the first group.
+ for (idx = 0; idx < SGRPROJ_EP_GRP1_SEARCH_COUNT; idx++) {
+ ep = sgproj_ep_grp1_seed[idx];
+ int64_t err;
+ compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+ use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+ flt0, flt1, flt_stride, exqd, &err, error_info);
+ get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+ }
+    // Evaluate the ep values immediately left and right of the best seed ep.
+ int bestep_ref = bestep;
+ for (ep = bestep_ref - 1; ep < bestep_ref + 2; ep += 2) {
+ if (ep < SGRPROJ_EP_GRP1_START_IDX || ep > SGRPROJ_EP_GRP1_END_IDX)
+ continue;
+ int64_t err;
+ compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+ use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+ flt0, flt1, flt_stride, exqd, &err, error_info);
+ get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+ }
+    // Evaluate the last two groups.
+ for (idx = 0; idx < SGRPROJ_EP_GRP2_3_SEARCH_COUNT; idx++) {
+ ep = sgproj_ep_grp2_3[idx][bestep];
+ int64_t err;
+ compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride,
+ use_highbitdepth, bit_depth, pu_width, pu_height, ep,
+ flt0, flt1, flt_stride, exqd, &err, error_info);
+ get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep);
+ }
+ }
+
+ SgrprojInfo ret;
+ ret.ep = bestep;
+ ret.xqd[0] = bestxqd[0];
+ ret.xqd[1] = bestxqd[1];
+ return ret;
+}
+
+static int count_sgrproj_bits(SgrprojInfo *sgrproj_info,
+ SgrprojInfo *ref_sgrproj_info) {
+ int bits = SGRPROJ_PARAMS_BITS;
+ const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep];
+ if (params->r[0] > 0)
+ bits += aom_count_primitive_refsubexpfin(
+ SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+ sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+ if (params->r[1] > 0)
+ bits += aom_count_primitive_refsubexpfin(
+ SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+ sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+ return bits;
+}
+
+static AOM_INLINE void search_sgrproj(
+ const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info) {
+ (void)rlbs;
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
+
+ const MACROBLOCK *const x = rsc->x;
+ const AV1_COMMON *const cm = rsc->cm;
+ const int highbd = cm->seq_params->use_highbitdepth;
+ const int bit_depth = cm->seq_params->bit_depth;
+
+ const int64_t bits_none = x->mode_costs.sgrproj_restore_cost[0];
+ // Prune evaluation of RESTORE_SGRPROJ if 'skip_sgr_eval' is set
+ if (rsc->skip_sgr_eval) {
+ rsc->total_bits[RESTORE_SGRPROJ] += bits_none;
+ rsc->total_sse[RESTORE_SGRPROJ] += rsc->sse[RESTORE_NONE];
+ rusi->best_rtype[RESTORE_SGRPROJ - 1] = RESTORE_NONE;
+ rsc->sse[RESTORE_SGRPROJ] = INT64_MAX;
+ return;
+ }
+
+ uint8_t *dgd_start =
+ rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start;
+ const uint8_t *src_start =
+ rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start;
+
+ const int is_uv = rsc->plane > 0;
+ const int ss_x = is_uv && cm->seq_params->subsampling_x;
+ const int ss_y = is_uv && cm->seq_params->subsampling_y;
+ const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
+ const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+
+ rusi->sgrproj = search_selfguided_restoration(
+ dgd_start, limits->h_end - limits->h_start,
+ limits->v_end - limits->v_start, rsc->dgd_stride, src_start,
+ rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height,
+ tmpbuf, rsc->lpf_sf->enable_sgr_ep_pruning, error_info);
+
+ RestorationUnitInfo rui;
+ rui.restoration_type = RESTORE_SGRPROJ;
+ rui.sgrproj_info = rusi->sgrproj;
+
+ rsc->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, &rui);
+
+ const int64_t bits_sgr =
+ x->mode_costs.sgrproj_restore_cost[1] +
+ (count_sgrproj_bits(&rusi->sgrproj, &rsc->ref_sgrproj)
+ << AV1_PROB_COST_SHIFT);
+ double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits_none >> 4, rsc->sse[RESTORE_NONE], bit_depth);
+ double cost_sgr = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits_sgr >> 4, rsc->sse[RESTORE_SGRPROJ], bit_depth);
+ if (rusi->sgrproj.ep < 10)
+ cost_sgr *=
+ (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level);
+
+ RestorationType rtype =
+ (cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE;
+ rusi->best_rtype[RESTORE_SGRPROJ - 1] = rtype;
+
+#if DEBUG_LR_COSTING
+ // Store ref params for later checking
+ lr_ref_params[RESTORE_SGRPROJ][rsc->plane][rest_unit_idx].sgrproj_info =
+ rsc->ref_sgrproj;
+#endif // DEBUG_LR_COSTING
+
+ rsc->total_sse[RESTORE_SGRPROJ] += rsc->sse[rtype];
+ rsc->total_bits[RESTORE_SGRPROJ] +=
+ (cost_sgr < cost_none) ? bits_sgr : bits_none;
+ if (cost_sgr < cost_none) rsc->ref_sgrproj = rusi->sgrproj;
+}
+
+static void acc_stat_one_line(const uint8_t *dgd, const uint8_t *src,
+ int dgd_stride, int h_start, int h_end,
+ uint8_t avg, const int wiener_halfwin,
+ const int wiener_win2, int32_t *M_int32,
+ int32_t *H_int32, int count) {
+ int j, k, l;
+ int16_t Y[WIENER_WIN2];
+
+ for (j = h_start; j < h_end; j++) {
+ const int16_t X = (int16_t)src[j] - (int16_t)avg;
+ int idx = 0;
+ for (k = -wiener_halfwin; k <= wiener_halfwin; k++) {
+ for (l = -wiener_halfwin; l <= wiener_halfwin; l++) {
+ Y[idx] =
+ (int16_t)dgd[(count + l) * dgd_stride + (j + k)] - (int16_t)avg;
+ idx++;
+ }
+ }
+ assert(idx == wiener_win2);
+ for (k = 0; k < wiener_win2; ++k) {
+ M_int32[k] += (int32_t)Y[k] * X;
+ for (l = k; l < wiener_win2; ++l) {
+        // H is a symmetric matrix, so we only need to fill out the upper
+        // triangle here. The caller copies it down to the lower triangle
+        // after accumulation.
+ H_int32[k * wiener_win2 + l] += (int32_t)Y[k] * Y[l];
+ }
+ }
+ }
+}
+
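+// M accumulates the cross-correlation between each Wiener window of degraded
+// samples (Y) and the corresponding source sample (X); H accumulates the
+// autocorrelation of the window. Together they form the normal equations
+// whose solution yields the Wiener filter taps.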
+void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+ int16_t *dgd_avg, int16_t *src_avg, int h_start,
+ int h_end, int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ (void)dgd_avg;
+ (void)src_avg;
+ int i, k, l;
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+ int32_t M_row[WIENER_WIN2] = { 0 };
+ int32_t H_row[WIENER_WIN2 * WIENER_WIN2] = { 0 };
+ int downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+
+ memset(M, 0, sizeof(*M) * wiener_win2);
+ memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+
+ for (i = v_start; i < v_end; i = i + downsample_factor) {
+ if (use_downsampled_wiener_stats &&
+ (v_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+ downsample_factor = v_end - i;
+ }
+
+ memset(M_row, 0, sizeof(int32_t) * WIENER_WIN2);
+ memset(H_row, 0, sizeof(int32_t) * WIENER_WIN2 * WIENER_WIN2);
+ acc_stat_one_line(dgd, src + i * src_stride, dgd_stride, h_start, h_end,
+ avg, wiener_halfwin, wiener_win2, M_row, H_row, i);
+
+ for (k = 0; k < wiener_win2; ++k) {
+ // Scale M matrix based on the downsampling factor
+ M[k] += ((int64_t)M_row[k] * downsample_factor);
+ for (l = k; l < wiener_win2; ++l) {
+        // H is a symmetric matrix, so we only need to fill out the upper
+        // triangle here. It is copied down to the lower triangle after the
+        // accumulation loops.
+        // Scale the H matrix based on the downsampling factor.
+ H[k * wiener_win2 + l] +=
+ ((int64_t)H_row[k * wiener_win2 + l] * downsample_factor);
+ }
+ }
+ }
+
+ for (k = 0; k < wiener_win2; ++k) {
+ for (l = k + 1; l < wiener_win2; ++l) {
+ H[l * wiener_win2 + k] = H[k * wiener_win2 + l];
+ }
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8,
+ const uint8_t *src8, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ aom_bit_depth_t bit_depth) {
+ int i, j, k, l;
+ int32_t Y[WIENER_WIN2];
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ uint16_t avg =
+ find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ uint8_t bit_depth_divider = 1;
+ if (bit_depth == AOM_BITS_12)
+ bit_depth_divider = 16;
+ else if (bit_depth == AOM_BITS_10)
+ bit_depth_divider = 4;
+
+ memset(M, 0, sizeof(*M) * wiener_win2);
+ memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+ for (i = v_start; i < v_end; i++) {
+ for (j = h_start; j < h_end; j++) {
+ const int32_t X = (int32_t)src[i * src_stride + j] - (int32_t)avg;
+ int idx = 0;
+ for (k = -wiener_halfwin; k <= wiener_halfwin; k++) {
+ for (l = -wiener_halfwin; l <= wiener_halfwin; l++) {
+ Y[idx] = (int32_t)dgd[(i + l) * dgd_stride + (j + k)] - (int32_t)avg;
+ idx++;
+ }
+ }
+ assert(idx == wiener_win2);
+ for (k = 0; k < wiener_win2; ++k) {
+ M[k] += (int64_t)Y[k] * X;
+ for (l = k; l < wiener_win2; ++l) {
+ // H is a symmetric matrix, so we only need to fill out the upper
+ // triangle here. We can copy it down to the lower triangle outside
+ // the (i, j) loops.
+ H[k * wiener_win2 + l] += (int64_t)Y[k] * Y[l];
+ }
+ }
+ }
+ }
+ for (k = 0; k < wiener_win2; ++k) {
+ M[k] /= bit_depth_divider;
+ H[k * wiener_win2 + k] /= bit_depth_divider;
+ for (l = k + 1; l < wiener_win2; ++l) {
+ H[k * wiener_win2 + l] /= bit_depth_divider;
+ H[l * wiener_win2 + k] = H[k * wiener_win2 + l];
+ }
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
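+// Map a tap index onto the symmetric half window: taps i and
+// (wiener_win - 1 - i) share a coefficient, so both fold to the same index.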
+static INLINE int wrap_index(int i, int wiener_win) {
+ const int wiener_halfwin1 = (wiener_win >> 1) + 1;
+ return (i >= wiener_halfwin1 ? wiener_win - 1 - i : i);
+}
+
+// Solve linear equations to find Wiener filter tap values
+// Taps are output scaled by WIENER_FILT_STEP
+static int linsolve_wiener(int n, int64_t *A, int stride, int64_t *b,
+ int64_t *x) {
+ for (int k = 0; k < n - 1; k++) {
+ // Partial pivoting: bring the row with the largest pivot to the top
+ for (int i = n - 1; i > k; i--) {
+ // If row i has a better (bigger) pivot than row (i-1), swap them
+ if (llabs(A[(i - 1) * stride + k]) < llabs(A[i * stride + k])) {
+ for (int j = 0; j < n; j++) {
+ const int64_t c = A[i * stride + j];
+ A[i * stride + j] = A[(i - 1) * stride + j];
+ A[(i - 1) * stride + j] = c;
+ }
+ const int64_t c = b[i];
+ b[i] = b[i - 1];
+ b[i - 1] = c;
+ }
+ }
+
+ // b/278065963: The multiplies
+ // c / 256 * A[k * stride + j] / cd * 256
+ // and
+ // c / 256 * b[k] / cd * 256
+ // within Gaussian elimination can cause a signed integer overflow. Rework
+ // the multiplies so that larger scaling is used without significantly
+ // impacting the overall precision.
+ //
+ // Precision guidance:
+ // scale_threshold: Pick as high as possible.
+ // For max_abs_akj >= scale_threshold scenario:
+ // scaler_A: Pick as low as possible. Needed for A[(i + 1) * stride + j].
+ // scaler_c: Pick as low as possible while maintaining scaler_c >=
+ // (1 << 7). Needed for A[(i + 1) * stride + j] and b[i + 1].
+ int64_t max_abs_akj = 0;
+ for (int j = 0; j < n; j++) {
+ const int64_t abs_akj = llabs(A[k * stride + j]);
+ if (abs_akj > max_abs_akj) max_abs_akj = abs_akj;
+ }
+ const int scale_threshold = 1 << 22;
+ const int scaler_A = max_abs_akj < scale_threshold ? 1 : (1 << 5);
+ const int scaler_c = max_abs_akj < scale_threshold ? 1 : (1 << 7);
+ const int scaler = scaler_c * scaler_A;
+
+ // Forward elimination (convert A to row-echelon form)
+ for (int i = k; i < n - 1; i++) {
+ if (A[k * stride + k] == 0) return 0;
+ const int64_t c = A[(i + 1) * stride + k] / scaler_c;
+ const int64_t cd = A[k * stride + k];
+ for (int j = 0; j < n; j++) {
+ A[(i + 1) * stride + j] -=
+ A[k * stride + j] / scaler_A * c / cd * scaler;
+ }
+ b[i + 1] -= c * b[k] / cd * scaler_c;
+ }
+ }
+ // Back-substitution
+ for (int i = n - 1; i >= 0; i--) {
+ if (A[i * stride + i] == 0) return 0;
+ int64_t c = 0;
+ for (int j = i + 1; j <= n - 1; j++) {
+ c += A[i * stride + j] * x[j] / WIENER_TAP_SCALE_FACTOR;
+ }
+ // Store filter taps x in scaled form.
+ x[i] = WIENER_TAP_SCALE_FACTOR * (b[i] - c) / A[i * stride + i];
+ }
+
+ return 1;
+}
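+// The solver above works in fixed point and returns 0 when a zero pivot
+// makes the system singular. Inputs are expected to be large accumulated
+// correlation statistics, so the precision lost to the integer divides
+// during elimination is negligible at typical operating magnitudes.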
+
+// Fix vector b, update vector a
+static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc,
+ int64_t **Hc, int32_t *a, int32_t *b) {
+ int i, j;
+ int64_t S[WIENER_WIN];
+ int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin1 = (wiener_win >> 1) + 1;
+ memset(A, 0, sizeof(A));
+ memset(B, 0, sizeof(B));
+ for (i = 0; i < wiener_win; i++) {
+ for (j = 0; j < wiener_win; ++j) {
+ const int jj = wrap_index(j, wiener_win);
+ A[jj] += Mc[i][j] * b[i] / WIENER_TAP_SCALE_FACTOR;
+ }
+ }
+
+ // b/274668506: This is the dual branch for the issue in b/272139363. The fix
+ // is similar. See comments in update_b_sep_sym() below.
+ int32_t max_b_l = 0;
+ for (int l = 0; l < wiener_win; ++l) {
+ const int32_t abs_b_l = abs(b[l]);
+ if (abs_b_l > max_b_l) max_b_l = abs_b_l;
+ }
+ const int scale_threshold = 128 * WIENER_TAP_SCALE_FACTOR;
+ const int scaler = max_b_l < scale_threshold ? 1 : 4;
+
+ for (i = 0; i < wiener_win; i++) {
+ for (j = 0; j < wiener_win; j++) {
+ int k, l;
+ for (k = 0; k < wiener_win; ++k) {
+ const int kk = wrap_index(k, wiener_win);
+ for (l = 0; l < wiener_win; ++l) {
+ const int ll = wrap_index(l, wiener_win);
+ B[ll * wiener_halfwin1 + kk] +=
+ Hc[j * wiener_win + i][k * wiener_win2 + l] * b[i] /
+ (scaler * WIENER_TAP_SCALE_FACTOR) * b[j] /
+ (WIENER_TAP_SCALE_FACTOR / scaler);
+ }
+ }
+ }
+ }
+ // Normalization enforcement in the system of equations itself
+ for (i = 0; i < wiener_halfwin1 - 1; ++i) {
+ A[i] -=
+ A[wiener_halfwin1 - 1] * 2 +
+ B[i * wiener_halfwin1 + wiener_halfwin1 - 1] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)];
+ }
+ for (i = 0; i < wiener_halfwin1 - 1; ++i) {
+ for (j = 0; j < wiener_halfwin1 - 1; ++j) {
+ B[i * wiener_halfwin1 + j] -=
+ 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] +
+ B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 +
+ (wiener_halfwin1 - 1)]);
+ }
+ }
+ if (linsolve_wiener(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) {
+ S[wiener_halfwin1 - 1] = WIENER_TAP_SCALE_FACTOR;
+ for (i = wiener_halfwin1; i < wiener_win; ++i) {
+ S[i] = S[wiener_win - 1 - i];
+ S[wiener_halfwin1 - 1] -= 2 * S[i];
+ }
+ for (i = 0; i < wiener_win; ++i) {
+ a[i] = (int32_t)CLIP(S[i], -(1 << (WIENER_FILT_BITS - 1)),
+ (1 << (WIENER_FILT_BITS - 1)) - 1);
+ }
+ }
+}
+
+// Fix vector a, update vector b
+static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc,
+ int64_t **Hc, int32_t *a, int32_t *b) {
+ int i, j;
+ int64_t S[WIENER_WIN];
+ int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin1 = (wiener_win >> 1) + 1;
+ memset(A, 0, sizeof(A));
+ memset(B, 0, sizeof(B));
+ for (i = 0; i < wiener_win; i++) {
+ const int ii = wrap_index(i, wiener_win);
+ for (j = 0; j < wiener_win; j++) {
+ A[ii] += Mc[i][j] * a[j] / WIENER_TAP_SCALE_FACTOR;
+ }
+ }
+
+ // b/272139363: The computation,
+ // Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] /
+ // WIENER_TAP_SCALE_FACTOR * a[l] / WIENER_TAP_SCALE_FACTOR;
+ // may generate a signed-integer-overflow. Conditionally scale the terms to
+ // avoid a potential overflow.
+ //
+ // Hc contains accumulated correlation statistics and it is desired to leave
+ // as much room as possible for Hc. It was experimentally observed that the
+ // primary issue manifests itself with the second, a[l], multiply. For
+ // max_a_l < WIENER_TAP_SCALE_FACTOR the first multiply with a[k] should not
+ // increase dynamic range and the second multiply should hence be safe.
+ // Thereafter a safe scale_threshold depends on the actual operational range
+ // of Hc. The largest scale_threshold is expected to depend on bit-depth
+ // (av1_compute_stats_highbd_c() scales highbd to 8-bit) and maximum
+ // restoration-unit size (256), leading up to 32-bit positive numbers in Hc.
+ // Noting that the caller, wiener_decompose_sep_sym(), initializes a[...]
+ // to a range smaller than 16 bits, the scale_threshold is set as below for
+ // convenience.
+ int32_t max_a_l = 0;
+ for (int l = 0; l < wiener_win; ++l) {
+ const int32_t abs_a_l = abs(a[l]);
+ if (abs_a_l > max_a_l) max_a_l = abs_a_l;
+ }
+ const int scale_threshold = 128 * WIENER_TAP_SCALE_FACTOR;
+ const int scaler = max_a_l < scale_threshold ? 1 : 4;
+
+ for (i = 0; i < wiener_win; i++) {
+ const int ii = wrap_index(i, wiener_win);
+ for (j = 0; j < wiener_win; j++) {
+ const int jj = wrap_index(j, wiener_win);
+ int k, l;
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ B[jj * wiener_halfwin1 + ii] +=
+ Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] /
+ (scaler * WIENER_TAP_SCALE_FACTOR) * a[l] /
+ (WIENER_TAP_SCALE_FACTOR / scaler);
+ }
+ }
+ }
+ }
+ // Normalization enforcement in the system of equations itself
+ for (i = 0; i < wiener_halfwin1 - 1; ++i) {
+ A[i] -=
+ A[wiener_halfwin1 - 1] * 2 +
+ B[i * wiener_halfwin1 + wiener_halfwin1 - 1] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)];
+ }
+ for (i = 0; i < wiener_halfwin1 - 1; ++i) {
+ for (j = 0; j < wiener_halfwin1 - 1; ++j) {
+ B[i * wiener_halfwin1 + j] -=
+ 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] +
+ B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 +
+ (wiener_halfwin1 - 1)]);
+ }
+ }
+ if (linsolve_wiener(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) {
+ S[wiener_halfwin1 - 1] = WIENER_TAP_SCALE_FACTOR;
+ for (i = wiener_halfwin1; i < wiener_win; ++i) {
+ S[i] = S[wiener_win - 1 - i];
+ S[wiener_halfwin1 - 1] -= 2 * S[i];
+ }
+ for (i = 0; i < wiener_win; ++i) {
+ b[i] = (int32_t)CLIP(S[i], -(1 << (WIENER_FILT_BITS - 1)),
+ (1 << (WIENER_FILT_BITS - 1)) - 1);
+ }
+ }
+}
+
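+// Decompose the 2D Wiener solution into separable vertical (a) and
+// horizontal (b) filters by alternating minimization: each pass holds one
+// vector fixed and solves the reduced linear system for the other via
+// update_a_sep_sym() / update_b_sep_sym().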
+static void wiener_decompose_sep_sym(int wiener_win, int64_t *M, int64_t *H,
+ int32_t *a, int32_t *b) {
+ static const int32_t init_filt[WIENER_WIN] = {
+ WIENER_FILT_TAP0_MIDV, WIENER_FILT_TAP1_MIDV, WIENER_FILT_TAP2_MIDV,
+ WIENER_FILT_TAP3_MIDV, WIENER_FILT_TAP2_MIDV, WIENER_FILT_TAP1_MIDV,
+ WIENER_FILT_TAP0_MIDV,
+ };
+ int64_t *Hc[WIENER_WIN2];
+ int64_t *Mc[WIENER_WIN];
+ int i, j, iter;
+ const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+ const int wiener_win2 = wiener_win * wiener_win;
+ for (i = 0; i < wiener_win; i++) {
+ a[i] = b[i] =
+ WIENER_TAP_SCALE_FACTOR / WIENER_FILT_STEP * init_filt[i + plane_off];
+ }
+ for (i = 0; i < wiener_win; i++) {
+ Mc[i] = M + i * wiener_win;
+ for (j = 0; j < wiener_win; j++) {
+ Hc[i * wiener_win + j] =
+ H + i * wiener_win * wiener_win2 + j * wiener_win;
+ }
+ }
+
+ iter = 1;
+ while (iter < NUM_WIENER_ITERS) {
+ update_a_sep_sym(wiener_win, Mc, Hc, a, b);
+ update_b_sep_sym(wiener_win, Mc, Hc, a, b);
+ iter++;
+ }
+}
+
+// Computes the function x'*H*x - 2*x'*M for the learned 2D filter x and
+// compares it against the identity filter; the final score is defined as the
+// difference between the two function values.
+static int64_t compute_score(int wiener_win, int64_t *M, int64_t *H,
+ InterpKernel vfilt, InterpKernel hfilt) {
+ int32_t ab[WIENER_WIN * WIENER_WIN];
+ int16_t a[WIENER_WIN], b[WIENER_WIN];
+ int64_t P = 0, Q = 0;
+ int64_t iP = 0, iQ = 0;
+ int64_t Score, iScore;
+ int i, k, l;
+ const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+ const int wiener_win2 = wiener_win * wiener_win;
+
+ a[WIENER_HALFWIN] = b[WIENER_HALFWIN] = WIENER_FILT_STEP;
+ for (i = 0; i < WIENER_HALFWIN; ++i) {
+ a[i] = a[WIENER_WIN - i - 1] = vfilt[i];
+ b[i] = b[WIENER_WIN - i - 1] = hfilt[i];
+ a[WIENER_HALFWIN] -= 2 * a[i];
+ b[WIENER_HALFWIN] -= 2 * b[i];
+ }
+ memset(ab, 0, sizeof(ab));
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l)
+ ab[k * wiener_win + l] = a[l + plane_off] * b[k + plane_off];
+ }
+ for (k = 0; k < wiener_win2; ++k) {
+ P += ab[k] * M[k] / WIENER_FILT_STEP / WIENER_FILT_STEP;
+ for (l = 0; l < wiener_win2; ++l) {
+ Q += ab[k] * H[k * wiener_win2 + l] * ab[l] / WIENER_FILT_STEP /
+ WIENER_FILT_STEP / WIENER_FILT_STEP / WIENER_FILT_STEP;
+ }
+ }
+ Score = Q - 2 * P;
+
+ iP = M[wiener_win2 >> 1];
+ iQ = H[(wiener_win2 >> 1) * wiener_win2 + (wiener_win2 >> 1)];
+ iScore = iQ - 2 * iP;
+
+ return Score - iScore;
+}
+
+static AOM_INLINE void finalize_sym_filter(int wiener_win, int32_t *f,
+ InterpKernel fi) {
+ int i;
+ const int wiener_halfwin = (wiener_win >> 1);
+
+ for (i = 0; i < wiener_halfwin; ++i) {
+ const int64_t dividend = (int64_t)f[i] * WIENER_FILT_STEP;
+ const int64_t divisor = WIENER_TAP_SCALE_FACTOR;
+ // Perform this division with proper rounding rather than truncation
+ if (dividend < 0) {
+ fi[i] = (int16_t)((dividend - (divisor / 2)) / divisor);
+ } else {
+ fi[i] = (int16_t)((dividend + (divisor / 2)) / divisor);
+ }
+ }
+ // Specialize for 7-tap filter
+ if (wiener_win == WIENER_WIN) {
+ fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV);
+ fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+ fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+ } else {
+ fi[2] = CLIP(fi[1], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+ fi[1] = CLIP(fi[0], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+ fi[0] = 0;
+ }
+ // Satisfy filter constraints
+ fi[WIENER_WIN - 1] = fi[0];
+ fi[WIENER_WIN - 2] = fi[1];
+ fi[WIENER_WIN - 3] = fi[2];
+ // The central element has an implicit +WIENER_FILT_STEP
+ fi[3] = -2 * (fi[0] + fi[1] + fi[2]);
+}
+
+static int count_wiener_bits(int wiener_win, WienerInfo *wiener_info,
+ WienerInfo *ref_wiener_info) {
+ int bits = 0;
+ if (wiener_win == WIENER_WIN)
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+ wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV,
+ wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV,
+ wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV);
+ if (wiener_win == WIENER_WIN)
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+ wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV,
+ wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV,
+ wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV);
+ return bits;
+}
+
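+// Greedy refinement around the solved Wiener taps: for each symmetric tap
+// pair (horizontal first, then vertical), try stepping by -s then +s while
+// keeping the centre tap normalized via the +/- 2 * s adjustment, halving s
+// from 4 down to 1. A move is kept only if the filtered SSE does not
+// increase.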
+static int64_t finer_search_wiener(const RestSearchCtxt *rsc,
+ const RestorationTileLimits *limits,
+ RestorationUnitInfo *rui, int wiener_win) {
+ const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+ int64_t err = try_restoration_unit(rsc, limits, rui);
+
+ if (rsc->lpf_sf->disable_wiener_coeff_refine_search) return err;
+
+ // Refinement search around the wiener filter coefficients.
+ int64_t err2;
+ int tap_min[] = { WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV,
+ WIENER_FILT_TAP2_MINV };
+ int tap_max[] = { WIENER_FILT_TAP0_MAXV, WIENER_FILT_TAP1_MAXV,
+ WIENER_FILT_TAP2_MAXV };
+
+ WienerInfo *plane_wiener = &rui->wiener_info;
+
+ // printf("err pre = %"PRId64"\n", err);
+ const int start_step = 4;
+ for (int s = start_step; s >= 1; s >>= 1) {
+ for (int p = plane_off; p < WIENER_HALFWIN; ++p) {
+ int skip = 0;
+ do {
+ if (plane_wiener->hfilter[p] - s >= tap_min[p]) {
+ plane_wiener->hfilter[p] -= s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s;
+ err2 = try_restoration_unit(rsc, limits, rui);
+ if (err2 > err) {
+ plane_wiener->hfilter[p] += s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s;
+ } else {
+ err = err2;
+ skip = 1;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ if (skip) break;
+ do {
+ if (plane_wiener->hfilter[p] + s <= tap_max[p]) {
+ plane_wiener->hfilter[p] += s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s;
+ err2 = try_restoration_unit(rsc, limits, rui);
+ if (err2 > err) {
+ plane_wiener->hfilter[p] -= s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s;
+ } else {
+ err = err2;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ }
+ for (int p = plane_off; p < WIENER_HALFWIN; ++p) {
+ int skip = 0;
+ do {
+ if (plane_wiener->vfilter[p] - s >= tap_min[p]) {
+ plane_wiener->vfilter[p] -= s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s;
+ err2 = try_restoration_unit(rsc, limits, rui);
+ if (err2 > err) {
+ plane_wiener->vfilter[p] += s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s;
+ } else {
+ err = err2;
+ skip = 1;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ if (skip) break;
+ do {
+ if (plane_wiener->vfilter[p] + s <= tap_max[p]) {
+ plane_wiener->vfilter[p] += s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s;
+ err2 = try_restoration_unit(rsc, limits, rui);
+ if (err2 > err) {
+ plane_wiener->vfilter[p] -= s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s;
+ } else {
+ err = err2;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ }
+ }
+ // printf("err post = %"PRId64"\n", err);
+ return err;
+}
+
+static AOM_INLINE void search_wiener(
+ const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info) {
+ (void)tmpbuf;
+ (void)rlbs;
+ (void)error_info;
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
+
+ const MACROBLOCK *const x = rsc->x;
+ const int64_t bits_none = x->mode_costs.wiener_restore_cost[0];
+
+  // Skip the Wiener search for low-variance content.
+ if (rsc->lpf_sf->prune_wiener_based_on_src_var) {
+ const int scale[3] = { 0, 1, 2 };
+ // Obtain the normalized Qscale
+ const int qs = av1_dc_quant_QTX(rsc->cm->quant_params.base_qindex, 0,
+ rsc->cm->seq_params->bit_depth) >>
+ 3;
+    // Derive the threshold as (normalized Qscale)^2 * scale / 16.
+ const uint64_t thresh =
+ (qs * qs * scale[rsc->lpf_sf->prune_wiener_based_on_src_var]) >> 4;
+ const int highbd = rsc->cm->seq_params->use_highbitdepth;
+ const uint64_t src_var =
+ var_restoration_unit(limits, rsc->src, rsc->plane, highbd);
+ // Do not perform Wiener search if source variance is lower than threshold
+ // or if the reconstruction error is zero
+ int prune_wiener = (src_var < thresh) || (rsc->sse[RESTORE_NONE] == 0);
+ if (prune_wiener) {
+ rsc->total_bits[RESTORE_WIENER] += bits_none;
+ rsc->total_sse[RESTORE_WIENER] += rsc->sse[RESTORE_NONE];
+ rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
+ rsc->sse[RESTORE_WIENER] = INT64_MAX;
+ if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rsc->skip_sgr_eval = 1;
+ return;
+ }
+ }
+
+ const int wiener_win =
+ (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
+
+ int reduced_wiener_win = wiener_win;
+ if (rsc->lpf_sf->reduce_wiener_window_size) {
+ reduced_wiener_win =
+ (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN_REDUCED : WIENER_WIN_CHROMA;
+ }
+
+ int64_t M[WIENER_WIN2];
+ int64_t H[WIENER_WIN2 * WIENER_WIN2];
+ int32_t vfilter[WIENER_WIN], hfilter[WIENER_WIN];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ const AV1_COMMON *const cm = rsc->cm;
+ if (cm->seq_params->use_highbitdepth) {
+    // TODO(any): Add support for the use_downsampled_wiener_stats speed
+    // feature in the HBD functions. Optimize the HBD intrinsics similarly to
+    // the LBD ones (i.e., pre-calculate the d and s buffers and avoid most of
+    // the C operations).
+ av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer,
+ rsc->src_buffer, limits->h_start, limits->h_end,
+ limits->v_start, limits->v_end, rsc->dgd_stride,
+ rsc->src_stride, M, H, cm->seq_params->bit_depth);
+ } else {
+ av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+ rsc->dgd_avg, rsc->src_avg, limits->h_start,
+ limits->h_end, limits->v_start, limits->v_end,
+ rsc->dgd_stride, rsc->src_stride, M, H,
+ rsc->lpf_sf->use_downsampled_wiener_stats);
+ }
+#else
+ av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+ rsc->dgd_avg, rsc->src_avg, limits->h_start, limits->h_end,
+ limits->v_start, limits->v_end, rsc->dgd_stride,
+ rsc->src_stride, M, H,
+ rsc->lpf_sf->use_downsampled_wiener_stats);
+#endif
+
+ wiener_decompose_sep_sym(reduced_wiener_win, M, H, vfilter, hfilter);
+
+ RestorationUnitInfo rui;
+ memset(&rui, 0, sizeof(rui));
+ rui.restoration_type = RESTORE_WIENER;
+ finalize_sym_filter(reduced_wiener_win, vfilter, rui.wiener_info.vfilter);
+ finalize_sym_filter(reduced_wiener_win, hfilter, rui.wiener_info.hfilter);
+
+  // The filter score computes the value of the function x'*A*x - x'*b for
+  // the learned filter and compares it against the same score for the
+  // identity filter. If there is no reduction in the function value, the
+  // filter is reverted to the identity filter.
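+  // (A positive score means the learned filter does not reduce the objective
+  // relative to the identity filter, so the unit falls back to RESTORE_NONE.)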
+ if (compute_score(reduced_wiener_win, M, H, rui.wiener_info.vfilter,
+ rui.wiener_info.hfilter) > 0) {
+ rsc->total_bits[RESTORE_WIENER] += bits_none;
+ rsc->total_sse[RESTORE_WIENER] += rsc->sse[RESTORE_NONE];
+ rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
+ rsc->sse[RESTORE_WIENER] = INT64_MAX;
+ if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) rsc->skip_sgr_eval = 1;
+ return;
+ }
+
+ rsc->sse[RESTORE_WIENER] =
+ finer_search_wiener(rsc, limits, &rui, reduced_wiener_win);
+ rusi->wiener = rui.wiener_info;
+
+ if (reduced_wiener_win != WIENER_WIN) {
+ assert(rui.wiener_info.vfilter[0] == 0 &&
+ rui.wiener_info.vfilter[WIENER_WIN - 1] == 0);
+ assert(rui.wiener_info.hfilter[0] == 0 &&
+ rui.wiener_info.hfilter[WIENER_WIN - 1] == 0);
+ }
+
+ const int64_t bits_wiener =
+ x->mode_costs.wiener_restore_cost[1] +
+ (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->ref_wiener)
+ << AV1_PROB_COST_SHIFT);
+
+ double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits_none >> 4, rsc->sse[RESTORE_NONE],
+ rsc->cm->seq_params->bit_depth);
+ double cost_wiener = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits_wiener >> 4, rsc->sse[RESTORE_WIENER],
+ rsc->cm->seq_params->bit_depth);
+
+ RestorationType rtype =
+ (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE;
+ rusi->best_rtype[RESTORE_WIENER - 1] = rtype;
+
+ // Set 'skip_sgr_eval' based on rdcost ratio of RESTORE_WIENER and
+ // RESTORE_NONE or based on best_rtype
+ if (rsc->lpf_sf->prune_sgr_based_on_wiener == 1) {
+ rsc->skip_sgr_eval = cost_wiener > (1.01 * cost_none);
+ } else if (rsc->lpf_sf->prune_sgr_based_on_wiener == 2) {
+ rsc->skip_sgr_eval = rusi->best_rtype[RESTORE_WIENER - 1] == RESTORE_NONE;
+ }
+
+#if DEBUG_LR_COSTING
+ // Store ref params for later checking
+ lr_ref_params[RESTORE_WIENER][rsc->plane][rest_unit_idx].wiener_info =
+ rsc->ref_wiener;
+#endif // DEBUG_LR_COSTING
+
+ rsc->total_sse[RESTORE_WIENER] += rsc->sse[rtype];
+ rsc->total_bits[RESTORE_WIENER] +=
+ (cost_wiener < cost_none) ? bits_wiener : bits_none;
+ if (cost_wiener < cost_none) rsc->ref_wiener = rusi->wiener;
+}
+
+static AOM_INLINE void search_norestore(
+ const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info) {
+ (void)rest_unit_idx;
+ (void)tmpbuf;
+ (void)rlbs;
+ (void)error_info;
+
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+
+ const int highbd = rsc->cm->seq_params->use_highbitdepth;
+ rsc->sse[RESTORE_NONE] = sse_restoration_unit(
+ limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd);
+
+ rsc->total_sse[RESTORE_NONE] += rsc->sse[RESTORE_NONE];
+}
+
+static AOM_INLINE void search_switchable(
+ const RestorationTileLimits *limits, int rest_unit_idx, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ struct aom_internal_error_info *error_info) {
+ (void)limits;
+ (void)tmpbuf;
+ (void)rlbs;
+ (void)error_info;
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
+
+ const MACROBLOCK *const x = rsc->x;
+
+ const int wiener_win =
+ (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
+
+ double best_cost = 0;
+ int64_t best_bits = 0;
+ RestorationType best_rtype = RESTORE_NONE;
+
+ for (RestorationType r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) {
+ // If this restoration mode was skipped, or could not find a solution
+ // that was better than RESTORE_NONE, then we can't select it here either.
+ //
+ // Note: It is possible for the restoration search functions to find a
+ // filter which is better than RESTORE_NONE when looking purely at SSE, but
+ // for it to be rejected overall due to its rate cost. In this case, there
+  // is a chance that it may have a lower rate cost when looking at
+ // RESTORE_SWITCHABLE, and so it might be acceptable here.
+ //
+ // Therefore we prune based on SSE, rather than on whether or not the
+ // previous search function selected this mode.
+ if (r > RESTORE_NONE) {
+ if (rsc->sse[r] > rsc->sse[RESTORE_NONE]) continue;
+ }
+
+ const int64_t sse = rsc->sse[r];
+ int64_t coeff_pcost = 0;
+ switch (r) {
+ case RESTORE_NONE: coeff_pcost = 0; break;
+ case RESTORE_WIENER:
+ coeff_pcost = count_wiener_bits(wiener_win, &rusi->wiener,
+ &rsc->switchable_ref_wiener);
+ break;
+ case RESTORE_SGRPROJ:
+ coeff_pcost =
+ count_sgrproj_bits(&rusi->sgrproj, &rsc->switchable_ref_sgrproj);
+ break;
+ default: assert(0); break;
+ }
+ const int64_t coeff_bits = coeff_pcost << AV1_PROB_COST_SHIFT;
+ const int64_t bits = x->mode_costs.switchable_restore_cost[r] + coeff_bits;
+ double cost = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits >> 4, sse, rsc->cm->seq_params->bit_depth);
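+    // Self-guided modes with ep < 10 use both filter passes, so their RD
+    // cost is inflated in proportion to dual_sgr_penalty_level to bias the
+    // search toward the cheaper single-pass modes.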
+ if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10)
+ cost *= (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level);
+ if (r == 0 || cost < best_cost) {
+ best_cost = cost;
+ best_bits = bits;
+ best_rtype = r;
+ }
+ }
+
+ rusi->best_rtype[RESTORE_SWITCHABLE - 1] = best_rtype;
+
+#if DEBUG_LR_COSTING
+ // Store ref params for later checking
+ lr_ref_params[RESTORE_SWITCHABLE][rsc->plane][rest_unit_idx].wiener_info =
+ rsc->switchable_ref_wiener;
+ lr_ref_params[RESTORE_SWITCHABLE][rsc->plane][rest_unit_idx].sgrproj_info =
+ rsc->switchable_ref_sgrproj;
+#endif // DEBUG_LR_COSTING
+
+ rsc->total_sse[RESTORE_SWITCHABLE] += rsc->sse[best_rtype];
+ rsc->total_bits[RESTORE_SWITCHABLE] += best_bits;
+ if (best_rtype == RESTORE_WIENER) rsc->switchable_ref_wiener = rusi->wiener;
+ if (best_rtype == RESTORE_SGRPROJ)
+ rsc->switchable_ref_sgrproj = rusi->sgrproj;
+}
+
+static AOM_INLINE void copy_unit_info(RestorationType frame_rtype,
+ const RestUnitSearchInfo *rusi,
+ RestorationUnitInfo *rui) {
+ assert(frame_rtype > 0);
+ rui->restoration_type = rusi->best_rtype[frame_rtype - 1];
+ if (rui->restoration_type == RESTORE_WIENER)
+ rui->wiener_info = rusi->wiener;
+ else
+ rui->sgrproj_info = rusi->sgrproj;
+}
+
+static void restoration_search(AV1_COMMON *cm, int plane, RestSearchCtxt *rsc,
+ bool *disable_lr_filter) {
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const int mib_size_log2 = cm->seq_params->mib_size_log2;
+ const CommonTileParams *tiles = &cm->tiles;
+ const int is_uv = plane > 0;
+ const int ss_y = is_uv && cm->seq_params->subsampling_y;
+ RestorationInfo *rsi = &cm->rst_info[plane];
+ const int ru_size = rsi->restoration_unit_size;
+ const int ext_size = ru_size * 3 / 2;
+
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+
+ static const rest_unit_visitor_t funs[RESTORE_TYPES] = {
+ search_norestore, search_wiener, search_sgrproj, search_switchable
+ };
+
+ const int plane_num_units = rsi->num_rest_units;
+ const RestorationType num_rtypes =
+ (plane_num_units > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES;
+
+ reset_rsc(rsc);
+
+ // Iterate over restoration units in encoding order, so that each RU gets
+ // the correct reference parameters when we cost it up. This is effectively
+ // a nested iteration over:
+ // * Each tile, order does not matter
+ // * Each superblock within that tile, in raster order
+ // * Each LR unit which is coded within that superblock, in raster order
+ for (int tile_row = 0; tile_row < tiles->rows; tile_row++) {
+ int sb_row_start = tiles->row_start_sb[tile_row];
+ int sb_row_end = tiles->row_start_sb[tile_row + 1];
+ for (int tile_col = 0; tile_col < tiles->cols; tile_col++) {
+ int sb_col_start = tiles->col_start_sb[tile_col];
+ int sb_col_end = tiles->col_start_sb[tile_col + 1];
+
+ // Reset reference parameters for delta-coding at the start of each tile
+ rsc_on_tile(rsc);
+
+ for (int sb_row = sb_row_start; sb_row < sb_row_end; sb_row++) {
+ int mi_row = sb_row << mib_size_log2;
+ for (int sb_col = sb_col_start; sb_col < sb_col_end; sb_col++) {
+ int mi_col = sb_col << mib_size_log2;
+
+ int rcol0, rcol1, rrow0, rrow1;
+ int has_lr_info = av1_loop_restoration_corners_in_sb(
+ cm, plane, mi_row, mi_col, sb_size, &rcol0, &rcol1, &rrow0,
+ &rrow1);
+
+ if (!has_lr_info) continue;
+
+ RestorationTileLimits limits;
+ for (int rrow = rrow0; rrow < rrow1; rrow++) {
+ int y0 = rrow * ru_size;
+ int remaining_h = plane_h - y0;
+ int h = (remaining_h < ext_size) ? remaining_h : ru_size;
+
+ limits.v_start = y0;
+ limits.v_end = y0 + h;
+ assert(limits.v_end <= plane_h);
+ // Offset upwards to align with the restoration processing stripe
+ const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
+ limits.v_start = AOMMAX(0, limits.v_start - voffset);
+ if (limits.v_end < plane_h) limits.v_end -= voffset;
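+                  // (Net effect: each RU row is shifted up by voffset, which
+                  // makes the first row voffset pixels shorter and lets the
+                  // last row extend to the bottom of the plane.)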
+
+ for (int rcol = rcol0; rcol < rcol1; rcol++) {
+ int x0 = rcol * ru_size;
+ int remaining_w = plane_w - x0;
+ int w = (remaining_w < ext_size) ? remaining_w : ru_size;
+
+ limits.h_start = x0;
+ limits.h_end = x0 + w;
+ assert(limits.h_end <= plane_w);
+
+ const int unit_idx = rrow * rsi->horz_units + rcol;
+
+ rsc->skip_sgr_eval = 0;
+ for (RestorationType r = RESTORE_NONE; r < num_rtypes; r++) {
+ if (disable_lr_filter[r]) continue;
+
+ funs[r](&limits, unit_idx, rsc, rsc->cm->rst_tmpbuf, NULL,
+ cm->error);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static INLINE void av1_derive_flags_for_lr_processing(
+ const LOOP_FILTER_SPEED_FEATURES *lpf_sf, bool *disable_lr_filter) {
+ const bool is_wiener_disabled = lpf_sf->disable_wiener_filter;
+ const bool is_sgr_disabled = lpf_sf->disable_sgr_filter;
+
+  // Only disable the RESTORE_NONE option if both the Wiener and the
+  // self-guided filters are disabled.
+ disable_lr_filter[RESTORE_NONE] = (is_wiener_disabled && is_sgr_disabled);
+
+ disable_lr_filter[RESTORE_WIENER] = is_wiener_disabled;
+ disable_lr_filter[RESTORE_SGRPROJ] = is_sgr_disabled;
+
+  // Enable the switchable mode only if both the Wiener and the self-guided
+  // filters are enabled.
+ disable_lr_filter[RESTORE_SWITCHABLE] =
+ (is_wiener_disabled || is_sgr_disabled);
+}
+
+#define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0
+// Allocate both decoder-side and encoder-side info structs for a single plane.
+// The unit size passed in should be the minimum size which we are going to
+// search; before each search, set_restoration_unit_size() must be called to
+// configure the actual size.
+static RestUnitSearchInfo *allocate_search_structs(AV1_COMMON *cm,
+ RestorationInfo *rsi,
+ int is_uv,
+ int min_luma_unit_size) {
+#if COUPLED_CHROMA_FROM_LUMA_RESTORATION
+  int sx = cm->seq_params->subsampling_x;
+  int sy = cm->seq_params->subsampling_y;
+  int s = is_uv ? AOMMIN(sx, sy) : 0;
+#else
+ int s = 0;
+#endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION
+ int min_unit_size = min_luma_unit_size >> s;
+
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+
+ const int max_horz_units = av1_lr_count_units(min_unit_size, plane_w);
+ const int max_vert_units = av1_lr_count_units(min_unit_size, plane_h);
+ const int max_num_units = max_horz_units * max_vert_units;
+
+ aom_free(rsi->unit_info);
+ CHECK_MEM_ERROR(cm, rsi->unit_info,
+ (RestorationUnitInfo *)aom_memalign(
+ 16, sizeof(*rsi->unit_info) * max_num_units));
+
+ RestUnitSearchInfo *rusi;
+ CHECK_MEM_ERROR(
+ cm, rusi,
+ (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * max_num_units));
+
+  // If the plane dimensions are not multiples of
+  // rsi->restoration_unit_size then some elements of the rusi array may be
+ // left uninitialised when we reach copy_unit_info(...). This is not a
+ // problem, as these elements are ignored later, but in order to quiet
+ // Valgrind's warnings we initialise the array below.
+ memset(rusi, 0, sizeof(*rusi) * max_num_units);
+
+ return rusi;
+}
+
+static void set_restoration_unit_size(AV1_COMMON *cm, RestorationInfo *rsi,
+ int is_uv, int luma_unit_size) {
+#if COUPLED_CHROMA_FROM_LUMA_RESTORATION
+  int sx = cm->seq_params->subsampling_x;
+  int sy = cm->seq_params->subsampling_y;
+  int s = is_uv ? AOMMIN(sx, sy) : 0;
+#else
+ int s = 0;
+#endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION
+ int unit_size = luma_unit_size >> s;
+
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+
+ const int horz_units = av1_lr_count_units(unit_size, plane_w);
+ const int vert_units = av1_lr_count_units(unit_size, plane_h);
+
+ rsi->restoration_unit_size = unit_size;
+ rsi->num_rest_units = horz_units * vert_units;
+ rsi->horz_units = horz_units;
+ rsi->vert_units = vert_units;
+}
+
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ const LOOP_FILTER_SPEED_FEATURES *lpf_sf = &cpi->sf.lpf_sf;
+ const int num_planes = av1_num_planes(cm);
+ const int highbd = cm->seq_params->use_highbitdepth;
+ assert(!cm->features.all_lossless);
+
+ av1_fill_lr_rates(&x->mode_costs, x->e_mbd.tile_ctx);
+
+ // Select unit size based on speed feature settings, and allocate
+ // rui structs based on this size
+ int min_lr_unit_size = cpi->sf.lpf_sf.min_lr_unit_size;
+ int max_lr_unit_size = cpi->sf.lpf_sf.max_lr_unit_size;
+
+  // The minimum allowed unit size at the syntax level is 1 superblock.
+  // Apply this constraint here so that the speed-feature code which sets
+  // cpi->sf.lpf_sf.min_lr_unit_size does not need to know the superblock
+  // size.
+ min_lr_unit_size =
+ AOMMAX(min_lr_unit_size, block_size_wide[cm->seq_params->sb_size]);
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ cpi->pick_lr_ctxt.rusi[plane] = allocate_search_structs(
+ cm, &cm->rst_info[plane], plane > 0, min_lr_unit_size);
+ }
+
+ x->rdmult = cpi->rd.RDMULT;
+
+ // Allocate the frame buffer trial_frame_rst, which is used to temporarily
+ // store the loop restored frame.
+ if (aom_realloc_frame_buffer(
+ &cpi->trial_frame_rst, cm->superres_upscaled_width,
+ cm->superres_upscaled_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
+ cm->features.byte_alignment, NULL, NULL, NULL, 0, 0))
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate trial restored frame buffer");
+
+ RestSearchCtxt rsc;
+
+  // The buffers 'src_avg' and 'dgd_avg' are used to compute the H and M
+  // buffers. They are only required by the AVX2 and NEON implementations of
+  // av1_compute_stats. The required buffer size is derived from the maximum
+  // LRU dimensions allowed for Wiener filtering, i.e., 1.5 times
+  // RESTORATION_UNITSIZE_MAX (see foreach_rest_unit_in_plane()), with the
+  // width and height rounded up to a multiple of 16 for the intrinsics.
+ rsc.dgd_avg = NULL;
+ rsc.src_avg = NULL;
+#if HAVE_AVX2 || HAVE_NEON
+  // The buffers allocated below are used during Wiener filtering in the low
+  // bitdepth path, so only allocate them when the Wiener filter is enabled
+  // and the encode is low bitdepth.
+ if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) {
+ const int buf_size = sizeof(*cpi->pick_lr_ctxt.dgd_avg) * 6 *
+ RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
+ CHECK_MEM_ERROR(cm, cpi->pick_lr_ctxt.dgd_avg,
+ (int16_t *)aom_memalign(32, buf_size));
+
+ rsc.dgd_avg = cpi->pick_lr_ctxt.dgd_avg;
+    // When the LRU width is not a multiple of 16, the 256-bit loads used by
+    // the AVX2 intrinsics can read past the valid LRU data. To silence the
+    // resulting Valgrind warnings, the buffer is zero-initialized. The
+    // overhead is negligible since this is done once per frame.
+ memset(rsc.dgd_avg, 0, buf_size);
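+    // Layout: the first 3 * RESTORATION_UNITSIZE_MAX^2 int16_t values hold
+    // dgd_avg and the second half holds src_avg; each half comfortably
+    // covers the largest LRU (1.5 * RESTORATION_UNITSIZE_MAX per side, i.e.
+    // 2.25 * RESTORATION_UNITSIZE_MAX^2 samples).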
+ rsc.src_avg =
+ rsc.dgd_avg + 3 * RESTORATION_UNITSIZE_MAX * RESTORATION_UNITSIZE_MAX;
+ // Asserts the starting address of src_avg is always 32-bytes aligned.
+ assert(!((intptr_t)rsc.src_avg % 32));
+ }
+#endif
+
+ // Initialize all planes, so that any planes we skip searching will still have
+ // valid data
+ for (int plane = 0; plane < num_planes; plane++) {
+ cm->rst_info[plane].frame_restoration_type = RESTORE_NONE;
+ }
+
+ // Decide which planes to search
+ int plane_start, plane_end;
+
+ if (lpf_sf->disable_loop_restoration_luma) {
+ plane_start = AOM_PLANE_U;
+ } else {
+ plane_start = AOM_PLANE_Y;
+ }
+
+ if (num_planes == 1 || lpf_sf->disable_loop_restoration_chroma) {
+ plane_end = AOM_PLANE_Y;
+ } else {
+ plane_end = AOM_PLANE_V;
+ }
+
+ // Derive the flags to enable/disable Loop restoration filters based on the
+ // speed features 'disable_wiener_filter' and 'disable_sgr_filter'.
+ bool disable_lr_filter[RESTORE_TYPES] = { false };
+ av1_derive_flags_for_lr_processing(lpf_sf, disable_lr_filter);
+
+ for (int plane = plane_start; plane <= plane_end; plane++) {
+ const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf;
+ const int is_uv = plane != AOM_PLANE_Y;
+ int plane_w, plane_h;
+ av1_get_upsampled_plane_size(cm, is_uv, &plane_w, &plane_h);
+ av1_extend_frame(dgd->buffers[plane], plane_w, plane_h, dgd->strides[is_uv],
+ RESTORATION_BORDER, RESTORATION_BORDER, highbd);
+ }
+
+ double best_cost = DBL_MAX;
+ int best_luma_unit_size = max_lr_unit_size;
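+  // Try unit sizes from the largest down to the smallest, halving each time,
+  // and keep the size whose overall RD cost is best.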
+ for (int luma_unit_size = max_lr_unit_size;
+ luma_unit_size >= min_lr_unit_size; luma_unit_size >>= 1) {
+ int64_t bits_this_size = 0;
+ int64_t sse_this_size = 0;
+ RestorationType best_rtype[MAX_MB_PLANE] = { RESTORE_NONE, RESTORE_NONE,
+ RESTORE_NONE };
+ for (int plane = plane_start; plane <= plane_end; ++plane) {
+ set_restoration_unit_size(cm, &cm->rst_info[plane], plane > 0,
+ luma_unit_size);
+ init_rsc(src, &cpi->common, x, lpf_sf, plane,
+ cpi->pick_lr_ctxt.rusi[plane], &cpi->trial_frame_rst, &rsc);
+
+ restoration_search(cm, plane, &rsc, disable_lr_filter);
+
+ const int plane_num_units = cm->rst_info[plane].num_rest_units;
+ const RestorationType num_rtypes =
+ (plane_num_units > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES;
+ double best_cost_this_plane = DBL_MAX;
+ for (RestorationType r = 0; r < num_rtypes; ++r) {
+ // Disable Loop restoration filter based on the flags set using speed
+ // feature 'disable_wiener_filter' and 'disable_sgr_filter'.
+ if (disable_lr_filter[r]) continue;
+
+ double cost_this_plane = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, rsc.total_bits[r] >> 4, rsc.total_sse[r],
+ cm->seq_params->bit_depth);
+
+ if (cost_this_plane < best_cost_this_plane) {
+ best_cost_this_plane = cost_this_plane;
+ best_rtype[plane] = r;
+ }
+ }
+
+ bits_this_size += rsc.total_bits[best_rtype[plane]];
+ sse_this_size += rsc.total_sse[best_rtype[plane]];
+ }
+
+ double cost_this_size = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+ x->rdmult, bits_this_size >> 4, sse_this_size,
+ cm->seq_params->bit_depth);
+
+ if (cost_this_size < best_cost) {
+ best_cost = cost_this_size;
+ best_luma_unit_size = luma_unit_size;
+ // Copy parameters out of rusi struct, before we overwrite it at
+ // the start of the next iteration
+ bool all_none = true;
+ for (int plane = plane_start; plane <= plane_end; ++plane) {
+ cm->rst_info[plane].frame_restoration_type = best_rtype[plane];
+ if (best_rtype[plane] != RESTORE_NONE) {
+ all_none = false;
+ const int plane_num_units = cm->rst_info[plane].num_rest_units;
+ for (int u = 0; u < plane_num_units; ++u) {
+ copy_unit_info(best_rtype[plane], &cpi->pick_lr_ctxt.rusi[plane][u],
+ &cm->rst_info[plane].unit_info[u]);
+ }
+ }
+ }
+ // Heuristic: If all best_rtype entries are RESTORE_NONE, this means we
+ // couldn't find any good filters at this size. So we likely won't find
+ // any good filters at a smaller size either, so skip
+ if (all_none) {
+ break;
+ }
+ } else {
+ // Heuristic: If this size is worse than the previous (larger) size, then
+ // the next size down will likely be even worse, so skip
+ break;
+ }
+ }
+
+ // Final fixup to set the correct unit size
+ // We set this for all planes, even ones we have skipped searching,
+ // so that other code does not need to care which planes were and weren't
+ // searched
+ for (int plane = 0; plane < num_planes; ++plane) {
+ set_restoration_unit_size(cm, &cm->rst_info[plane], plane > 0,
+ best_luma_unit_size);
+ }
+
+#if HAVE_AVX2 || HAVE_NEON
+ if (!cpi->sf.lpf_sf.disable_wiener_filter && !highbd) {
+ aom_free(cpi->pick_lr_ctxt.dgd_avg);
+ cpi->pick_lr_ctxt.dgd_avg = NULL;
+ }
+#endif
+ for (int plane = 0; plane < num_planes; plane++) {
+ aom_free(cpi->pick_lr_ctxt.rusi[plane]);
+ cpi->pick_lr_ctxt.rusi[plane] = NULL;
+ }
+}
diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h
new file mode 100644
index 0000000000..d1d0b0cec6
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickrst.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_PICKRST_H_
+#define AOM_AV1_ENCODER_PICKRST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+struct yv12_buffer_config;
+struct AV1_COMP;
+
+// Enable extra debugging for loop restoration costing?
+//
+// If this is set to 1, then we record not just the selected LR parameters, but
+// also the values which the search process thinks they should be delta-coded
+// against. Then, when writing out the bitstream, we verify this information,
+// to help ensure that the search code is costing things properly
+#define DEBUG_LR_COSTING 0
+
+#if DEBUG_LR_COSTING
+#define MAX_LR_UNITS_W 64
+#define MAX_LR_UNITS_H 64
+
+// Storage for reference parameters.
+//
+// The storage size is determined by:
+// * This is always written and then checked within the same frame encode pass,
+// so we do not need to buffer multiple frames of data
+// * The parameters can be different per plane within one frame
+// * The relevant set of ref parameters can differ between the search where
+// we set the frame restoration mode to RESTORE_WIENER, and the search where
+// we set it to RESTORE_SWITCHABLE.
+// So we need to store at least two sets of Wiener params and two sets of
+// SGR params, and the easiest way to do this is to index by
+// frame_restoration_type
+extern RestorationUnitInfo lr_ref_params[RESTORE_TYPES][MAX_MB_PLANE]
+ [MAX_LR_UNITS_W * MAX_LR_UNITS_H];
+#endif // DEBUG_LR_COSTING
+
+static const uint8_t g_shuffle_stats_data[16] = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+};
+
+static const uint8_t g_shuffle_stats_highbd_data[32] = {
+ 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,
+ 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,
+};
+
+static INLINE uint8_t find_average(const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int stride) {
+ uint64_t sum = 0;
+ for (int i = v_start; i < v_end; i++) {
+ for (int j = h_start; j < h_end; j++) {
+ sum += src[i * stride + j];
+ }
+ }
+ uint64_t avg = sum / ((v_end - v_start) * (h_end - h_start));
+ return (uint8_t)avg;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE uint16_t find_average_highbd(const uint16_t *src, int h_start,
+ int h_end, int v_start, int v_end,
+ int stride) {
+ uint64_t sum = 0;
+ for (int i = v_start; i < v_end; i++) {
+ for (int j = h_start; j < h_end; j++) {
+ sum += src[i * stride + j];
+ }
+ }
+ uint64_t avg = sum / ((v_end - v_start) * (h_end - h_start));
+ return (uint16_t)avg;
+}
+#endif
+
+/*!\brief Algorithm for AV1 loop restoration search and estimation.
+ *
+ * \ingroup in_loop_restoration
+ * This function determines proper restoration filter types and
+ * associated parameters for each restoration unit in a frame.
+ *
+ * \param[in] sd Source frame buffer
+ * \param[in,out] cpi Top-level encoder structure
+ *
+ * \remark Nothing is returned. Instead, chosen restoration filter
+ * types and parameters are stored per plane in the \c rst_info structure
+ * of type \ref RestorationInfo inside \c cpi->common:
+ * \arg \c rst_info[ \c 0 ]: Chosen parameters for Y plane
+ * \arg \c rst_info[ \c 1 ]: Chosen parameters for U plane if it exists
+ * \arg \c rst_info[ \c 2 ]: Chosen parameters for V plane if it exists
+ * \par
+ * The following fields in each \c rst_info[ \c p], \c p = 0, 1, 2
+ * are populated:
+ * \arg \c rst_info[ \c p ].\c frame_restoration_type
+ * \arg \c rst_info[ \c p ].\c unit_info[ \c u ],
+ * for each \c u in 0, 1, ..., \c n( \c p ) - 1,
+ * where \c n( \c p ) is the number of restoration units in plane \c p.
+ * \par
+ * The following fields in each \c rst_info[ \c p ].\c unit_info[ \c u ],
+ * \c p = 0, 1, 2 and \c u = 0, 1, ..., \c n( \c p ) - 1, of type
+ * \ref RestorationUnitInfo are populated:
+ * \arg \c rst_info[ \c p ].\c unit_info[ \c u ].\c restoration_type
+ * \arg \c rst_info[ \c p ].\c unit_info[ \c u ].\c wiener_info OR
+ * \c rst_info[ \c p ].\c unit_info[ \c u ].\c sgrproj_info OR
+ * neither, depending on
+ * \c rst_info[ \c p ].\c unit_info[ \c u ].\c restoration_type
+ *
+ */
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PICKRST_H_
diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h
new file mode 100644
index 0000000000..2e8710108b
--- /dev/null
+++ b/third_party/aom/av1/encoder/pustats.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PUSTATS_H_
+#define AOM_AV1_ENCODER_PUSTATS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+#define NUM_FEATURES_PUSTATS 8
+#define NUM_HIDDEN_LAYERS 2
+#define HIDDEN_LAYERS_0_NODES 12
+#define HIDDEN_LAYERS_1_NODES 10
+#define LOGITS_NODES 1
+
+static const float
+ av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS *
+ HIDDEN_LAYERS_0_NODES] = {
+ -0.1758f, -0.0499f, -10.0069f, -2.2838f, -0.3359f, 0.3459f, -0.3285f,
+ -0.0515f, -0.5417f, 0.2357f, -0.0575f, -69.0782f, 0.5348f, 1.4068f,
+ 0.2213f, -1.0490f, -0.0636f, 0.1654f, 1.1002f, 33.4924f, 0.4358f,
+ 1.2499f, 0.1143f, 0.0592f, -1.6335f, -0.0092f, 1.2207f, -28.4543f,
+ -0.4973f, 0.4368f, 0.2341f, -0.1623f, -3.8986f, 0.1311f, -1.8789f,
+ -3.9079f, -0.8158f, -0.8420f, 1.4295f, -2.3629f, -1.4825f, 0.6498f,
+ -5.3669f, 6.4434f, 1.8393f, -35.0678f, 3.7459f, -2.8504f, 2.0502f,
+ -0.1812f, -3.9011f, -1.0155f, 1.8375f, -1.4517f, 1.3917f, 3.8664f,
+ 0.8345f, -0.3472f, 5.7740f, -1.1196f, -0.3264f, -1.2481f, -0.9284f,
+ -4.9657f, 2.2831f, 0.7337f, 2.3176f, 0.6416f, 0.8804f, 1.9988f,
+ -1.3426f, 1.2728f, 1.2249f, -0.1551f, 5.6045f, 0.2046f, -2.1464f,
+ -2.4922f, -0.5334f, 12.1055f, 7.2467f, -0.0070f, 0.0234f, 0.0021f,
+ 0.0215f, -0.0098f, -0.0682f, -6.1494f, -0.3176f, -1.6069f, -0.2119f,
+ -1.0533f, -0.3566f, 0.5294f, -0.4335f, 0.1626f,
+ };
+
+static const float
+ av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = {
+ 10.5266f, 5.3268f, -1.0678f, 7.7411f, 8.7164f, -0.3235f,
+ 7.3028f, 9.0874f, -6.4594f, -1.0102f, -1.1146f, 10.8419f,
+ };
+
+static const float
+ av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
+ HIDDEN_LAYERS_1_NODES] = {
+ 10.5932f, 2.5192f, -0.0015f, 5.9479f, 5.2426f, -0.4091f, 5.3220f,
+ 6.0469f, 0.7200f, 3.3241f, 5.5006f, 12.8290f, -1.6396f, 0.5743f,
+ -0.8370f, 1.9956f, -4.9270f, -1.5295f, 2.1350f, -9.4415f, -0.7094f,
+ 5.1822f, 19.7287f, -3.0444f, -0.3320f, 0.0031f, -0.2709f, -0.5249f,
+ 0.3281f, -0.2240f, 0.2225f, -0.2386f, -0.4370f, -0.2438f, -0.4928f,
+ -0.2842f, -2.1772f, 9.2570f, -17.6655f, 3.5448f, -2.8394f, -1.0167f,
+ -0.5115f, -1.9260f, -0.2111f, -0.7528f, -1.2387f, -0.0401f, 5.0716f,
+ -3.3763f, -0.2898f, -0.4956f, -7.9993f, 0.1526f, -0.0242f, 0.7354f,
+ 6.0432f, 4.8043f, 7.4790f, -0.6295f, 1.7565f, 3.7197f, -2.3963f,
+ 6.8945f, 2.9717f, -3.1623f, 3.4241f, 4.4676f, -1.8154f, -2.9401f,
+ -8.5657f, -3.0240f, -1.4661f, 8.1145f, -12.7858f, 3.3624f, -1.0819f,
+ -4.2856f, 1.1801f, -0.5587f, -1.6062f, -1.1813f, -3.5882f, -0.2490f,
+ -24.9566f, -0.4140f, -0.1113f, 3.5537f, 4.4112f, 0.1367f, -1.5876f,
+ 1.6605f, 1.3903f, -0.0253f, -2.1419f, -2.2197f, -0.7659f, -0.4249f,
+ -0.0424f, 0.1486f, 0.4643f, -0.9068f, -0.3619f, -0.7624f, -0.9132f,
+ -0.4947f, -0.3527f, -0.5445f, -0.4768f, -1.7761f, -1.0686f, 0.5462f,
+ 1.3371f, 4.3116f, 0.0777f, -2.7216f, -1.8908f, 3.4989f, 7.7269f,
+ -2.7566f,
+ };
+
+static const float
+ av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = {
+ 13.2435f, -8.5477f, -0.0998f, -1.5131f, -12.0187f,
+ 6.1715f, 0.5094f, 7.6433f, -0.3992f, -1.3555f,
+ };
+
+static const float
+ av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
+ 4.3078f, -17.3497f, 0.0195f, 34.6032f, -5.0127f,
+ 5.3079f, 10.0077f, -13.129f, 0.0087f, -8.4009f,
+ };
+
+static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = {
+ 4.5103f,
+};
+
+static const NN_CONFIG av1_pustats_rate_nnconfig = {
+ NUM_FEATURES_PUSTATS, // num_inputs
+ LOGITS_NODES, // num_outputs
+ NUM_HIDDEN_LAYERS, // num_hidden_layers
+ { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes
+ {
+ av1_pustats_rate_hiddenlayer_0_kernel,
+ av1_pustats_rate_hiddenlayer_1_kernel,
+ av1_pustats_rate_logits_kernel,
+ },
+ {
+ av1_pustats_rate_hiddenlayer_0_bias,
+ av1_pustats_rate_hiddenlayer_1_bias,
+ av1_pustats_rate_logits_bias,
+ },
+};
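+// The rate model above (and the distortion model below) is a small MLP with
+// NUM_FEATURES_PUSTATS (8) inputs, hidden layers of 12 and 10 nodes, and a
+// single output logit; it is evaluated with av1_nn_predict() (see
+// "av1/encoder/ml.h").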
+
+static const float
+ av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS *
+ HIDDEN_LAYERS_0_NODES] = {
+ -0.2560f, 0.1105f, -0.8434f, -0.0132f, -8.9371f, -1.1176f, -0.3655f,
+ 0.4885f, 1.7518f, 0.4985f, 0.5582f, -0.3739f, 0.9403f, 0.3874f,
+ 0.3265f, 1.7383f, 3.1747f, 0.0285f, 3.3942f, -0.0123f, 0.5057f,
+ 0.1584f, 0.2697f, 4.6151f, 3.6251f, -0.0121f, -1.0047f, -0.0037f,
+ 0.0127f, 0.1935f, -0.5277f, -2.7144f, 0.0729f, -0.1457f, -0.0816f,
+ -0.5462f, 0.4738f, 0.3599f, -0.0564f, 0.0910f, 0.0126f, -0.0310f,
+ -2.1311f, -0.4666f, -0.0074f, -0.0765f, 0.0287f, -0.2662f, -0.0999f,
+ -0.2983f, -0.4899f, -0.2314f, 0.2873f, -0.3614f, 0.1783f, -0.1210f,
+ 0.3569f, 0.5436f, -8.0536f, -0.0044f, -1.5255f, -0.8247f, -0.4556f,
+ 1.9045f, 0.5463f, 0.1102f, -0.9293f, -0.0185f, -0.8302f, -0.4378f,
+ -0.3531f, -1.3095f, 0.6099f, 0.7977f, 4.1950f, -0.0067f, -0.2762f,
+ -0.1574f, -0.2149f, 0.6104f, -1.7053f, 0.1904f, 4.2402f, -0.2671f,
+ 0.8940f, 0.6820f, 0.2241f, -0.9459f, 1.4571f, 0.5255f, 2.3352f,
+ -0.0806f, 0.5231f, 0.3928f, 0.4146f, 2.0956f,
+ };
+
+static const float
+ av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = {
+ 1.1597f, 0.0836f, -0.7471f, -0.2439f, -0.0438f, 2.4626f,
+ 0.f, 1.1485f, 2.7085f, -4.7897f, 1.4093f, -1.657f,
+ };
+
+static const float
+ av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
+ HIDDEN_LAYERS_1_NODES] = {
+ -0.5203f, -1.3468f, 0.3865f, -0.6859f, 0.0058f, 4.0682f, 0.4807f,
+ -0.1380f, 0.6050f, 0.8958f, 0.7748f, -0.1311f, 1.7317f, 1.1265f,
+ 0.0827f, 0.1407f, -0.3605f, 0.5429f, 0.1880f, -0.1439f, 0.2837f,
+ 1.6477f, 0.0832f, 0.0593f, -1.8464f, -0.7241f, -1.0672f, -0.3546f,
+ -0.3842f, -2.3637f, 0.2514f, 0.8263f, -0.1872f, 0.5774f, -0.3610f,
+ -0.0205f, 1.3977f, -0.1083f, 0.6923f, 1.3039f, -0.2870f, 1.0622f,
+ -0.0566f, 0.2697f, -0.5429f, -0.6193f, 1.7559f, 0.3246f, 1.9159f,
+ 0.3744f, 0.0686f, 1.0191f, -0.4212f, 1.9591f, -0.0691f, -0.1085f,
+ -1.2034f, 0.0606f, 1.0116f, 0.5565f, -0.1874f, -0.7898f, 0.4796f,
+ 0.2290f, 0.4334f, -0.5817f, -0.2949f, 0.1367f, -0.2932f, -1.1265f,
+ 0.0133f, -0.5309f, -3.3191f, 0.0939f, 0.3895f, -2.5812f, -0.0066f,
+ -3.0063f, -0.2982f, 0.7309f, -0.2422f, -0.2770f, -0.7152f, 0.1700f,
+ 1.9630f, 0.1988f, 0.4194f, 0.8762f, 0.3402f, 0.1051f, -0.1598f,
+ 0.2405f, 0.0392f, 1.1256f, 1.5245f, 0.0950f, 0.2160f, -0.5023f,
+ 0.2584f, 0.2074f, 0.2218f, 0.3966f, -0.0921f, -0.2435f, -0.4560f,
+ -1.1923f, -0.3716f, -0.3286f, -1.3225f, 0.1896f, -0.3342f, -0.7888f,
+ -0.4488f, -1.7168f, 0.3341f, 0.1146f, 0.5226f, 0.2610f, -0.4574f,
+ -0.4164f,
+ };
+
+static const float
+ av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = {
+ -2.3014f, -2.4292f, 1.3317f, -3.2361f, -1.918f,
+ 2.7149f, -2.5649f, 2.7765f, 2.9617f, 2.7684f,
+ };
+
+static const float
+ av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
+ -0.6868f, -0.6715f, 0.449f, -1.293f, 0.6214f,
+ 0.9894f, -0.4342f, 0.7002f, 1.4363f, 0.6951f,
+ };
+
+static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = {
+ 2.3371f,
+};
+
+static const NN_CONFIG av1_pustats_dist_nnconfig = {
+ NUM_FEATURES_PUSTATS, // num_inputs
+ LOGITS_NODES, // num_outputs
+ NUM_HIDDEN_LAYERS, // num_hidden_layers
+ { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes
+ {
+ av1_pustats_dist_hiddenlayer_0_kernel,
+ av1_pustats_dist_hiddenlayer_1_kernel,
+ av1_pustats_dist_logits_kernel,
+ },
+ {
+ av1_pustats_dist_hiddenlayer_0_bias,
+ av1_pustats_dist_hiddenlayer_1_bias,
+ av1_pustats_dist_logits_bias,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS
+#undef HIDDEN_LAYERS_0_NODES
+#undef HIDDEN_LAYERS_1_NODES
+#undef LOGITS_NODES
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PUSTATS_H_
diff --git a/third_party/aom/av1/encoder/random.h b/third_party/aom/av1/encoder/random.h
new file mode 100644
index 0000000000..efe909b6db
--- /dev/null
+++ b/third_party/aom/av1/encoder/random.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RANDOM_H_
+#define AOM_AV1_ENCODER_RANDOM_H_
+
+#include <assert.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Advance the generator to its next state, and generate the next 32-bit output.
+// Note that the low bits of this output are comparatively low-quality, so users
+// of this function should ensure that the high bits factor through to their
+// outputs.
+static INLINE uint32_t lcg_next(uint32_t *state) {
+ *state = (uint32_t)(*state * 1103515245ULL + 12345);
+ return *state;
+}
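+// (These are the classic ANSI C LCG constants. For an LCG modulo 2^32, bit k
+// of the state has period at most 2^(k+1), which is why the note above
+// steers callers toward the high bits.)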
+
+// Generate a random number in the range [0, 32768).
+static INLINE uint32_t lcg_rand16(uint32_t *state) {
+ return (lcg_next(state) / 65536) % 32768;
+}
+
+// Generate a random number in the range [0, n).
+// This is implemented as (rand() * n) / <range of RNG> rather than
+// rand() % n, for a few reasons: this implementation is faster and less
+// biased, and if n is a power of 2, it uses the higher-quality top bits of
+// the RNG output rather than the lower-quality bottom bits.
+static INLINE uint32_t lcg_randint(uint32_t *state, uint32_t n) {
+ uint64_t v = ((uint64_t)lcg_next(state) * n) >> 32;
+ return (uint32_t)v;
+}
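+// For example, lcg_randint(&state, 6) returns a value in {0, ..., 5},
+// computed as (r * 6) >> 32 for a 32-bit draw r, so it consumes the
+// generator's high bits as recommended above.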
+
+// Generate a random number in the range [lo, hi)
+static INLINE uint32_t lcg_randrange(uint32_t *state, uint32_t lo,
+ uint32_t hi) {
+ assert(lo < hi);
+ return lo + lcg_randint(state, hi - lo);
+}
+
+// Pick k distinct numbers from the set {0, ..., n-1}
+// All possible sets of k numbers, and all possible orderings of those numbers,
+// are equally likely.
+//
+// Note: The algorithm used here uses resampling to avoid choosing repeated
+// values. This works well as long as n >> k, but can potentially lead to many
+// resampling attempts if n is equal to or only slightly larger than k.
+static INLINE void lcg_pick(int n, int k, int *out, unsigned int *seed) {
+ assert(0 <= k && k <= n);
+ for (int i = 0; i < k; i++) {
+ int v;
+
+ // Inner resampling loop
+ // We have to use a goto here because C does not have a multi-level continue
+ // statement
+ resample:
+ v = (int)lcg_randint(seed, n);
+ for (int j = 0; j < i; j++) {
+ if (v == out[j]) {
+ // Repeated v, resample
+ goto resample;
+ }
+ }
+
+ // New v, accept
+ out[i] = v;
+ }
+}
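+// Usage sketch: picking 3 distinct values from {0, ..., 9}:
+//   unsigned int seed = 1;
+//   int out[3];
+//   lcg_pick(10, 3, out, &seed);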
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RANDOM_H_
diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c
new file mode 100644
index 0000000000..df86380272
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl.c
@@ -0,0 +1,3587 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_once.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/common/common.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+#define USE_UNRESTRICTED_Q_IN_CQ_MODE 0
+
+// Max rate target for 1080P and below encodes under normal circumstances:
+// (1920 * 1080 / (16 * 16)) MBs * MAX_MB_RATE bits per MB
+#define MAX_MB_RATE 250
+#define MAXRATE_1080P 2025000
+
+#define MIN_BPB_FACTOR 0.005
+#define MAX_BPB_FACTOR 50
+
+#define SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO 0
+#define SUPERRES_QADJ_PER_DENOM_KEYFRAME 2
+#define SUPERRES_QADJ_PER_DENOM_ARFFRAME 0
+
+#define FRAME_OVERHEAD_BITS 200
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+ do { \
+ switch (bit_depth) { \
+ case AOM_BITS_8: name = name##_8; break; \
+ case AOM_BITS_10: name = name##_10; break; \
+ case AOM_BITS_12: name = name##_12; break; \
+ default: \
+ assert(0 && \
+ "bit_depth should be AOM_BITS_8, AOM_BITS_10" \
+ " or AOM_BITS_12"); \
+ name = NULL; \
+ } \
+ } while (0)
+
+// Tables relating active max Q to active min Q
+static int kf_low_motion_minq_8[QINDEX_RANGE];
+static int kf_high_motion_minq_8[QINDEX_RANGE];
+static int arfgf_low_motion_minq_8[QINDEX_RANGE];
+static int arfgf_high_motion_minq_8[QINDEX_RANGE];
+static int inter_minq_8[QINDEX_RANGE];
+static int rtc_minq_8[QINDEX_RANGE];
+
+static int kf_low_motion_minq_10[QINDEX_RANGE];
+static int kf_high_motion_minq_10[QINDEX_RANGE];
+static int arfgf_low_motion_minq_10[QINDEX_RANGE];
+static int arfgf_high_motion_minq_10[QINDEX_RANGE];
+static int inter_minq_10[QINDEX_RANGE];
+static int rtc_minq_10[QINDEX_RANGE];
+static int kf_low_motion_minq_12[QINDEX_RANGE];
+static int kf_high_motion_minq_12[QINDEX_RANGE];
+static int arfgf_low_motion_minq_12[QINDEX_RANGE];
+static int arfgf_high_motion_minq_12[QINDEX_RANGE];
+static int inter_minq_12[QINDEX_RANGE];
+static int rtc_minq_12[QINDEX_RANGE];
+
+static int gf_high = 2400;
+static int gf_low = 300;
+#ifdef STRICT_RC
+static int kf_high = 3200;
+#else
+static int kf_high = 5000;
+#endif
+static int kf_low = 400;
+
+// How many times fewer pixels there are to encode given the current scaling.
+// Temporary replacement for rcf_mult and rate_thresh_mult.
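+// For example, encoding at half width and half height gives a factor of 4.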
+static double resize_rate_factor(const FrameDimensionCfg *const frm_dim_cfg,
+ int width, int height) {
+ return (double)(frm_dim_cfg->width * frm_dim_cfg->height) / (width * height);
+}
+
+// Functions to compute the active minq lookup table entries based on a
+// formulaic approach to facilitate easier adjustment of the Q tables.
+// The formulae were derived from computing a 3rd order polynomial best
+// fit to the original data (after plotting real maxq vs minq (not q index))
+static int get_minq_index(double maxq, double x3, double x2, double x1,
+ aom_bit_depth_t bit_depth) {
+ const double minqtarget = AOMMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq);
+
+ // Special case handling to deal with the step from q2.0
+ // down to lossless mode represented by q 1.0.
+ if (minqtarget <= 2.0) return 0;
+
+ return av1_find_qindex(minqtarget, bit_depth, 0, QINDEX_RANGE - 1);
+}
+
+static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low,
+ int *arfgf_high, int *inter, int *rtc,
+ aom_bit_depth_t bit_depth) {
+ int i;
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ const double maxq = av1_convert_qindex_to_q(i, bit_depth);
+ kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth);
+ kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth);
+ arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
+ arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+ inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth);
+ rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
+ }
+}
+
+static void rc_init_minq_luts(void) {
+ init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8,
+ arfgf_low_motion_minq_8, arfgf_high_motion_minq_8,
+ inter_minq_8, rtc_minq_8, AOM_BITS_8);
+ init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10,
+ arfgf_low_motion_minq_10, arfgf_high_motion_minq_10,
+ inter_minq_10, rtc_minq_10, AOM_BITS_10);
+ init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12,
+ arfgf_low_motion_minq_12, arfgf_high_motion_minq_12,
+ inter_minq_12, rtc_minq_12, AOM_BITS_12);
+}
+
+void av1_rc_init_minq_luts(void) { aom_once(rc_init_minq_luts); }
+
+// These functions use formulaic calculations to make playing with the
+// quantizer tables easier. If necessary they can be replaced by lookup
+// tables if and when things settle down in the experimental bitstream
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) {
+ // Convert the index to a real Q value (scaled down to match old Q values)
+ switch (bit_depth) {
+ case AOM_BITS_8: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 4.0;
+ case AOM_BITS_10: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 16.0;
+ case AOM_BITS_12: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 64.0;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1.0;
+ }
+}
+
+int av1_get_bpmb_enumerator(FRAME_TYPE frame_type,
+ const int is_screen_content_type) {
+ int enumerator;
+
+ if (is_screen_content_type) {
+ enumerator = (frame_type == KEY_FRAME) ? 1000000 : 750000;
+ } else {
+ enumerator = (frame_type == KEY_FRAME) ? 2000000 : 1500000;
+ }
+
+ return enumerator;
+}
+
+static int get_init_ratio(double sse) { return (int)(300000 / sse); }
+
+int av1_rc_bits_per_mb(const AV1_COMP *cpi, FRAME_TYPE frame_type, int qindex,
+ double correction_factor, int accurate_estimate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int is_screen_content_type = cpi->is_screen_content_type;
+ const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth;
+ const double q = av1_convert_qindex_to_q(qindex, bit_depth);
+ int enumerator = av1_get_bpmb_enumerator(frame_type, is_screen_content_type);
+
+ assert(correction_factor <= MAX_BPB_FACTOR &&
+ correction_factor >= MIN_BPB_FACTOR);
+
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR && frame_type != KEY_FRAME &&
+ accurate_estimate && cpi->rec_sse != UINT64_MAX) {
+ const int mbs = cm->mi_params.MBs;
+ const double sse_sqrt =
+ (double)((int)sqrt((double)(cpi->rec_sse)) << BPER_MB_NORMBITS) /
+ (double)mbs;
+ const int ratio = (cpi->rc.bit_est_ratio == 0) ? get_init_ratio(sse_sqrt)
+ : cpi->rc.bit_est_ratio;
+    // Clamp the enumerator to reduce q fluctuations.
+ enumerator = AOMMIN(AOMMAX((int)(ratio * sse_sqrt), 20000), 170000);
+ }
+
+ // q based adjustment to baseline enumerator
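+  // (The estimate scales as correction_factor / q, so e.g. doubling q
+  // roughly halves the predicted bits per MB.)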
+ return (int)(enumerator * correction_factor / q);
+}
+
+int av1_estimate_bits_at_q(const AV1_COMP *cpi, int q,
+ double correction_factor) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+ const int mbs = cm->mi_params.MBs;
+ const int bpm =
+ (int)(av1_rc_bits_per_mb(cpi, frame_type, q, correction_factor,
+ cpi->sf.hl_sf.accurate_bit_estimate));
+ return AOMMAX(FRAME_OVERHEAD_BITS,
+ (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
+}
+
+int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target,
+ FRAME_UPDATE_TYPE frame_update_type) {
+ const RATE_CONTROL *rc = &cpi->rc;
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ const int min_frame_target =
+ AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
+ // Clip the frame target to the minimum setup value.
+ if (frame_update_type == OVERLAY_UPDATE ||
+ frame_update_type == INTNL_OVERLAY_UPDATE) {
+ // If there is an active ARF at this location use the minimum
+ // bits on this frame even if it is a constructed arf.
+    // The active maximum quantizer ensures that an appropriate
+ // number of bits will be spent if needed for constructed ARFs.
+ target = min_frame_target;
+ } else if (target < min_frame_target) {
+ target = min_frame_target;
+ }
+
+ // Clip the frame target to the maximum allowed value.
+ if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
+ if (oxcf->rc_cfg.max_inter_bitrate_pct) {
+ const int max_rate =
+ rc->avg_frame_bandwidth * oxcf->rc_cfg.max_inter_bitrate_pct / 100;
+ target = AOMMIN(target, max_rate);
+ }
+
+ return target;
+}
+
+int av1_rc_clamp_iframe_target_size(const AV1_COMP *const cpi, int64_t target) {
+ const RATE_CONTROL *rc = &cpi->rc;
+ const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+ if (rc_cfg->max_intra_bitrate_pct) {
+ const int64_t max_rate =
+ (int64_t)rc->avg_frame_bandwidth * rc_cfg->max_intra_bitrate_pct / 100;
+ target = AOMMIN(target, max_rate);
+ }
+ if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
+ return (int)target;
+}
+
+// Update the buffer level for higher temporal layers, given the encoded current
+// temporal layer.
+static void update_layer_buffer_level(SVC *svc, int encoded_frame_size,
+ bool is_screen) {
+ const int current_temporal_layer = svc->temporal_layer_id;
+ for (int i = current_temporal_layer + 1; i < svc->number_temporal_layers;
+ ++i) {
+ const int layer =
+ LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc;
+ lp_rc->bits_off_target +=
+ (int)round(lc->target_bandwidth / lc->framerate) - encoded_frame_size;
+ // Clip buffer level to maximum buffer size for the layer.
+ lp_rc->bits_off_target =
+ AOMMIN(lp_rc->bits_off_target, lp_rc->maximum_buffer_size);
+ lp_rc->buffer_level = lp_rc->bits_off_target;
+
+    // For screen-content mode: don't let the buffer level go below the
+    // threshold, given here as -lp_rc->maximum_buffer_size, to allow the
+    // buffer to come back up sooner after a slide change with big overshoot.
+ if (is_screen) {
+ lp_rc->bits_off_target =
+ AOMMAX(lp_rc->bits_off_target, -lp_rc->maximum_buffer_size);
+ lp_rc->buffer_level = lp_rc->bits_off_target;
+ }
+ }
+}
+// Update the buffer level: leaky bucket model.
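+// Each shown frame fills the bucket with avg_frame_bandwidth bits and drains
+// it by the bits actually spent; the resulting level is clipped to the
+// configured maximum buffer size.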
+static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+
+ // Non-viewable frames are a special case and are treated as pure overhead.
+ if (!cm->show_frame)
+ p_rc->bits_off_target -= encoded_frame_size;
+ else
+ p_rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
+
+ // Clip the buffer level to the maximum specified buffer size.
+ p_rc->bits_off_target =
+ AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size);
+  // For screen-content mode: don't let the buffer level go below the
+  // threshold, given here as -p_rc->maximum_buffer_size, to allow the buffer
+  // to come back up sooner after a slide change with big overshoot.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)
+ p_rc->bits_off_target =
+ AOMMAX(p_rc->bits_off_target, -p_rc->maximum_buffer_size);
+ p_rc->buffer_level = p_rc->bits_off_target;
+
+ if (cpi->ppi->use_svc)
+ update_layer_buffer_level(&cpi->svc, encoded_frame_size,
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN);
+
+#if CONFIG_FPMT_TEST
+  /* The variable temp_buffer_level is introduced for quality simulation
+   * purposes. It retains the buffer level from before the parallel encode
+   * frames and is updated based on the update flag.
+   *
+   * If there are show_existing_frames between the parallel frames, do not
+   * update it, so that the temp state is retained. */
+ int show_existing_between_parallel_frames =
+ (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+
+ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ p_rc->temp_buffer_level = p_rc->buffer_level;
+ }
+#endif
+}
+
+int av1_rc_get_default_min_gf_interval(int width, int height,
+ double framerate) {
+ // Assume we do not need any constraint lower than 4K 20 fps
+ static const double factor_safe = 3840 * 2160 * 20.0;
+ const double factor = (double)width * height * framerate;
+ const int default_interval =
+ clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL);
+
+ if (factor <= factor_safe)
+ return default_interval;
+ else
+ return AOMMAX(default_interval,
+ (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5));
+ // Note this logic makes:
+ // 4K24: 5
+ // 4K30: 6
+ // 4K60: 12
+}
+
+int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
+ int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
+ interval += (interval & 0x01); // Round to even value
+ interval = AOMMAX(MAX_GF_INTERVAL, interval);
+ return AOMMAX(interval, min_gf_interval);
+}
+
+void av1_primary_rc_init(const AV1EncoderConfig *oxcf,
+ PRIMARY_RATE_CONTROL *p_rc) {
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+ int worst_allowed_q = rc_cfg->worst_allowed_q;
+
+ int min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+ int max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+ if (min_gf_interval == 0)
+ min_gf_interval = av1_rc_get_default_min_gf_interval(
+ oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+ oxcf->input_cfg.init_framerate);
+ if (max_gf_interval == 0)
+ max_gf_interval = av1_rc_get_default_max_gf_interval(
+ oxcf->input_cfg.init_framerate, min_gf_interval);
+ p_rc->baseline_gf_interval = (min_gf_interval + max_gf_interval) / 2;
+ p_rc->this_key_frame_forced = 0;
+ p_rc->next_key_frame_forced = 0;
+ p_rc->ni_frames = 0;
+
+ p_rc->tot_q = 0.0;
+ p_rc->total_actual_bits = 0;
+ p_rc->total_target_bits = 0;
+ p_rc->buffer_level = p_rc->starting_buffer_level;
+
+ if (oxcf->target_seq_level_idx[0] < SEQ_LEVELS) {
+ worst_allowed_q = 255;
+ }
+ if (oxcf->pass == AOM_RC_ONE_PASS && rc_cfg->mode == AOM_CBR) {
+ p_rc->avg_frame_qindex[KEY_FRAME] = worst_allowed_q;
+ p_rc->avg_frame_qindex[INTER_FRAME] = worst_allowed_q;
+ } else {
+ p_rc->avg_frame_qindex[KEY_FRAME] =
+ (worst_allowed_q + rc_cfg->best_allowed_q) / 2;
+ p_rc->avg_frame_qindex[INTER_FRAME] =
+ (worst_allowed_q + rc_cfg->best_allowed_q) / 2;
+ }
+ p_rc->avg_q = av1_convert_qindex_to_q(rc_cfg->worst_allowed_q,
+ oxcf->tool_cfg.bit_depth);
+ p_rc->last_q[KEY_FRAME] = rc_cfg->best_allowed_q;
+ p_rc->last_q[INTER_FRAME] = rc_cfg->worst_allowed_q;
+
+ for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+ p_rc->rate_correction_factors[i] = 0.7;
+ }
+ p_rc->rate_correction_factors[KF_STD] = 1.0;
+ p_rc->bits_off_target = p_rc->starting_buffer_level;
+
+ p_rc->rolling_target_bits =
+ (int)(oxcf->rc_cfg.target_bandwidth / oxcf->input_cfg.init_framerate);
+ p_rc->rolling_actual_bits =
+ (int)(oxcf->rc_cfg.target_bandwidth / oxcf->input_cfg.init_framerate);
+}
+
+void av1_rc_init(const AV1EncoderConfig *oxcf, RATE_CONTROL *rc) {
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+ rc->frames_since_key = 8; // Sensible default for first frame.
+ rc->frames_to_fwd_kf = oxcf->kf_cfg.fwd_kf_dist;
+
+ rc->frames_till_gf_update_due = 0;
+ rc->ni_av_qi = rc_cfg->worst_allowed_q;
+ rc->ni_tot_qi = 0;
+
+ rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+ rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+ if (rc->min_gf_interval == 0)
+ rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+ oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+ oxcf->input_cfg.init_framerate);
+ if (rc->max_gf_interval == 0)
+ rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+ oxcf->input_cfg.init_framerate, rc->min_gf_interval);
+ rc->avg_frame_low_motion = 0;
+
+ rc->resize_state = ORIG;
+ rc->resize_avg_qp = 0;
+ rc->resize_buffer_underflow = 0;
+ rc->resize_count = 0;
+ rc->rtc_external_ratectrl = 0;
+ rc->frame_level_fast_extra_bits = 0;
+ rc->use_external_qp_one_pass = 0;
+}
+
+static bool check_buffer_below_thresh(AV1_COMP *cpi, int64_t buffer_level,
+ int drop_mark) {
+ SVC *svc = &cpi->svc;
+ if (!cpi->ppi->use_svc || cpi->svc.number_spatial_layers == 1 ||
+ cpi->svc.framedrop_mode == AOM_LAYER_DROP) {
+ return (buffer_level <= drop_mark);
+ } else {
+    // For SVC in AOM_FULL_SUPERFRAME_DROP mode: the buffer condition is
+    // checked on the current and upper spatial layers.
+ for (int i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) {
+ const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ PRIMARY_RATE_CONTROL *lrc = &lc->p_rc;
+ // Exclude check for layer whose bitrate is 0.
+ if (lc->target_bandwidth > 0) {
+ const int drop_thresh = cpi->oxcf.rc_cfg.drop_frames_water_mark;
+ const int drop_mark_layer =
+ (int)(drop_thresh * lrc->optimal_buffer_level / 100);
+ if (lrc->buffer_level <= drop_mark_layer) return true;
+ }
+ }
+ return false;
+ }
+}
+
+int av1_rc_drop_frame(AV1_COMP *cpi) {
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int64_t buffer_level =
+ simulate_parallel_frame ? p_rc->temp_buffer_level : p_rc->buffer_level;
+#else
+ int64_t buffer_level = p_rc->buffer_level;
+#endif
+ // Never drop on key frame, or for frame whose base layer is key.
+ // If drop_count_consec hits or exceeds max_consec_drop then don't drop.
+ if (cpi->common.current_frame.frame_type == KEY_FRAME ||
+ (cpi->ppi->use_svc &&
+ cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) ||
+ !oxcf->rc_cfg.drop_frames_water_mark ||
+ (rc->max_consec_drop > 0 &&
+ rc->drop_count_consec >= rc->max_consec_drop)) {
+ return 0;
+ } else {
+ SVC *svc = &cpi->svc;
+ // In the full_superframe framedrop mode for svc, if the previous spatial
+ // layer was dropped, drop the current spatial layer.
+ if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
+ svc->drop_spatial_layer[svc->spatial_layer_id - 1] &&
+ svc->framedrop_mode == AOM_FULL_SUPERFRAME_DROP)
+ return 1;
+ // -1 is passed here for drop_mark since we are checking if
+ // buffer goes below 0 (<= -1).
+ if (check_buffer_below_thresh(cpi, buffer_level, -1)) {
+ // Always drop if buffer is below 0.
+ rc->drop_count_consec++;
+ return 1;
+ } else {
+ // If buffer is below drop_mark, for now just drop every other frame
+ // (starting with the next frame) until it increases back over drop_mark.
+ const int drop_mark = (int)(oxcf->rc_cfg.drop_frames_water_mark *
+ p_rc->optimal_buffer_level / 100);
+ const bool buffer_below_thresh =
+ check_buffer_below_thresh(cpi, buffer_level, drop_mark);
+ if (!buffer_below_thresh && rc->decimation_factor > 0) {
+ --rc->decimation_factor;
+ } else if (buffer_below_thresh && rc->decimation_factor == 0) {
+ rc->decimation_factor = 1;
+ }
+ if (rc->decimation_factor > 0) {
+ if (rc->decimation_count > 0) {
+ --rc->decimation_count;
+ rc->drop_count_consec++;
+ return 1;
+ } else {
+ rc->decimation_count = rc->decimation_factor;
+ return 0;
+ }
+ } else {
+ rc->decimation_count = 0;
+ return 0;
+ }
+ }
+ }
+}
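+
+// Illustrative sketch (hypothetical helper, not part of libaom): a minimal
+// model of the decimation state machine above. With decimation_factor == 1
+// it drops every other frame; more generally it drops decimation_factor out
+// of every (decimation_factor + 1) frames.
+#if 0
+static int model_decimation_drop(int *decimation_count,
+                                 int decimation_factor) {
+  if (decimation_factor <= 0) {
+    *decimation_count = 0;
+    return 0;  // No decimation active: encode this frame.
+  }
+  if (*decimation_count > 0) {
+    --*decimation_count;  // Still inside the drop run: drop this frame.
+    return 1;
+  }
+  *decimation_count = decimation_factor;  // Re-arm, then encode this frame.
+  return 0;
+}
+#endif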
+
+static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality,
+ int width, int height) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1_COMMON *const cm = &cpi->common;
+ const SVC *const svc = &cpi->svc;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  // Flag to indicate that the previous frame overshot and that the buffer
+  // level for the current frame is low (less than ~half of optimal). For
+  // such (inter) frames, if the source_sad is non-zero, relax the
+  // max_delta_up and the clamp applied below.
+ const bool overshoot_buffer_low =
+ cpi->rc.rc_1_frame == -1 && rc->frame_source_sad > 1000 &&
+ p_rc->buffer_level < (p_rc->optimal_buffer_level >> 1) &&
+ rc->frames_since_key > 4;
+ int max_delta_down;
+ int max_delta_up = overshoot_buffer_low ? 60 : 20;
+ const int change_avg_frame_bandwidth =
+ abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) >
+ 0.1 * (rc->avg_frame_bandwidth);
+
+ // Set the maximum adjustment down for Q for this frame.
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->cyclic_refresh->apply_cyclic_refresh) {
+    // For static screen content, limit the Q drop until the start of the
+    // next refresh cycle.
+ if (cpi->is_screen_content_type &&
+ (cpi->cyclic_refresh->sb_index > cpi->cyclic_refresh->last_sb_index)) {
+ max_delta_down = AOMMIN(8, AOMMAX(1, rc->q_1_frame / 32));
+ } else {
+ max_delta_down = AOMMIN(16, AOMMAX(1, rc->q_1_frame / 8));
+ }
+ if (!cpi->ppi->use_svc && cpi->is_screen_content_type) {
+ // Link max_delta_up to max_delta_down and buffer status.
+ if (p_rc->buffer_level > p_rc->optimal_buffer_level) {
+ max_delta_up = AOMMAX(4, max_delta_down);
+ } else {
+ max_delta_up = AOMMAX(8, max_delta_down);
+ }
+ }
+ } else {
+ max_delta_down = (cpi->is_screen_content_type)
+ ? AOMMIN(8, AOMMAX(1, rc->q_1_frame / 16))
+ : AOMMIN(16, AOMMAX(1, rc->q_1_frame / 8));
+ }
+ // If resolution changes or avg_frame_bandwidth significantly changed,
+ // then set this flag to indicate change in target bits per macroblock.
+ const int change_target_bits_mb =
+ cm->prev_frame &&
+ (width != cm->prev_frame->width || height != cm->prev_frame->height ||
+ change_avg_frame_bandwidth);
+ // Apply some control/clamp to QP under certain conditions.
+  // For svc, delay the use of the clamping until after the first
+  // number_temporal_layers superframes, to make sure the rate targets have
+  // been set for each temporal layer.
+ if (!frame_is_intra_only(cm) && rc->frames_since_key > 1 &&
+ (!cpi->ppi->use_svc ||
+ svc->current_superframe > (unsigned int)svc->number_temporal_layers) &&
+ !change_target_bits_mb && !cpi->rc.rtc_external_ratectrl &&
+ (!cpi->oxcf.rc_cfg.gf_cbr_boost_pct ||
+ !(refresh_frame->alt_ref_frame || refresh_frame->golden_frame))) {
+ // If in the previous two frames we have seen both overshoot and undershoot
+ // clamp Q between the two. Check for rc->q_1/2_frame > 0 in case they have
+ // not been set due to dropped frames.
+ if (rc->rc_1_frame * rc->rc_2_frame == -1 &&
+ rc->q_1_frame != rc->q_2_frame && rc->q_1_frame > 0 &&
+ rc->q_2_frame > 0 && !overshoot_buffer_low) {
+ int qclamp = clamp(q, AOMMIN(rc->q_1_frame, rc->q_2_frame),
+ AOMMAX(rc->q_1_frame, rc->q_2_frame));
+ // If the previous frame had overshoot and the current q needs to
+ // increase above the clamped value, reduce the clamp for faster reaction
+ // to overshoot.
+ if (cpi->rc.rc_1_frame == -1 && q > qclamp && rc->frames_since_key > 10)
+ q = (q + qclamp) >> 1;
+ else
+ q = qclamp;
+ }
+    // Adjust Q based on source content change from scene detection.
+ if (cpi->sf.rt_sf.check_scene_detection && rc->prev_avg_source_sad > 0 &&
+ rc->frames_since_key > 10 && rc->frame_source_sad > 0 &&
+ !cpi->rc.rtc_external_ratectrl) {
+ const int bit_depth = cm->seq_params->bit_depth;
+ double delta =
+ (double)rc->avg_source_sad / (double)rc->prev_avg_source_sad - 1.0;
+ // Push Q downwards if content change is decreasing and buffer level
+ // is stable (at least 1/4-optimal level), so not overshooting. Do so
+ // only for high Q to avoid excess overshoot.
+ // Else reduce decrease in Q from previous frame if content change is
+ // increasing and buffer is below max (so not undershooting).
+ if (delta < 0.0 &&
+ p_rc->buffer_level > (p_rc->optimal_buffer_level >> 2) &&
+ q > (rc->worst_quality >> 1)) {
+ double q_adj_factor = 1.0 + 0.5 * tanh(4.0 * delta);
+ double q_val = av1_convert_qindex_to_q(q, bit_depth);
+ q += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ } else if (rc->q_1_frame - q > 0 && delta > 0.1 &&
+ p_rc->buffer_level < AOMMIN(p_rc->maximum_buffer_size,
+ p_rc->optimal_buffer_level << 1)) {
+ q = (3 * q + rc->q_1_frame) >> 2;
+ }
+ }
+ // Limit the decrease in Q from previous frame.
+ if (rc->q_1_frame - q > max_delta_down) q = rc->q_1_frame - max_delta_down;
+ // Limit the increase in Q from previous frame.
+ else if (q - rc->q_1_frame > max_delta_up)
+ q = rc->q_1_frame + max_delta_up;
+ }
+ // Adjustment for temporal layers.
+ if (svc->number_temporal_layers > 1 && svc->spatial_layer_id == 0 &&
+ !change_target_bits_mb && !cpi->rc.rtc_external_ratectrl &&
+ cpi->oxcf.resize_cfg.resize_mode != RESIZE_DYNAMIC) {
+ if (svc->temporal_layer_id > 0) {
+ // Constrain enhancement relative to the previous base TL0.
+ // Get base temporal layer TL0.
+ const int layer = LAYER_IDS_TO_IDX(0, 0, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ // lc->rc.avg_frame_bandwidth and lc->p_rc.last_q correspond to the
+ // last TL0 frame.
+ if (rc->avg_frame_bandwidth < lc->rc.avg_frame_bandwidth &&
+ q < lc->p_rc.last_q[INTER_FRAME] - 4)
+ q = lc->p_rc.last_q[INTER_FRAME] - 4;
+ } else if (cpi->svc.temporal_layer_id == 0 &&
+ p_rc->buffer_level > (p_rc->optimal_buffer_level >> 2) &&
+ rc->frame_source_sad < 100000) {
+ // Push base TL0 Q down if buffer is stable and frame_source_sad
+ // is below threshold.
+ int delta = (svc->number_temporal_layers == 2) ? 4 : 10;
+ q = q - delta;
+ }
+ }
+ // For non-svc (single layer): if resolution has increased push q closer
+ // to the active_worst to avoid excess overshoot.
+ if (!cpi->ppi->use_svc && cm->prev_frame &&
+ (width * height > 1.5 * cm->prev_frame->width * cm->prev_frame->height))
+ q = (q + active_worst_quality) >> 1;
+ // For single layer RPS: Bias Q based on distance of closest reference.
+ if (cpi->ppi->rtc_ref.bias_recovery_frame) {
+ const int min_dist = av1_svc_get_min_ref_dist(cpi);
+ q = q - AOMMIN(min_dist, 20);
+ }
+ return AOMMAX(AOMMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality);
+}
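+
+// Illustrative sketch (hypothetical helper, not part of libaom): the core
+// clamp applied above when the previous two frames saw one overshoot and
+// one undershoot (rc_1_frame * rc_2_frame == -1), combined with the
+// per-frame delta limits. The scene-detection adjustment that sits between
+// the two steps in adjust_q_cbr() is omitted here.
+#if 0
+static int model_cbr_q_clamp(int q, int q_prev1, int q_prev2,
+                             int max_delta_down, int max_delta_up) {
+  const int lo = q_prev1 < q_prev2 ? q_prev1 : q_prev2;
+  const int hi = q_prev1 < q_prev2 ? q_prev2 : q_prev1;
+  // Keep q within the range spanned by the previous two frame Qs.
+  if (q < lo) q = lo;
+  if (q > hi) q = hi;
+  // Then limit the per-frame change relative to the previous frame.
+  if (q_prev1 - q > max_delta_down)
+    q = q_prev1 - max_delta_down;
+  else if (q - q_prev1 > max_delta_up)
+    q = q_prev1 + max_delta_up;
+  return q;
+}
+#endif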
+
+static const RATE_FACTOR_LEVEL rate_factor_levels[FRAME_UPDATE_TYPES] = {
+ KF_STD, // KF_UPDATE
+ INTER_NORMAL, // LF_UPDATE
+ GF_ARF_STD, // GF_UPDATE
+ GF_ARF_STD, // ARF_UPDATE
+ INTER_NORMAL, // OVERLAY_UPDATE
+ INTER_NORMAL, // INTNL_OVERLAY_UPDATE
+ GF_ARF_LOW, // INTNL_ARF_UPDATE
+};
+
+static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group,
+ int gf_frame_index) {
+ const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_frame_index];
+ assert(update_type < FRAME_UPDATE_TYPES);
+ return rate_factor_levels[update_type];
+}
+
+/*!\brief Gets a rate vs Q correction factor
+ *
+ * This function returns the current value of a correction factor used to
+ * dynamically adjust the relationship between Q and the expected number
+ * of bits for the frame.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] width Frame width
+ * \param[in] height Frame height
+ *
+ * \return Returns a correction factor for the current frame
+ */
+static double get_rate_correction_factor(const AV1_COMP *cpi, int width,
+ int height) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ double rcf;
+ double rate_correction_factors_kfstd;
+ double rate_correction_factors_gfarfstd;
+ double rate_correction_factors_internormal;
+
+ rate_correction_factors_kfstd =
+ (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+ ? rc->frame_level_rate_correction_factors[KF_STD]
+ : p_rc->rate_correction_factors[KF_STD];
+ rate_correction_factors_gfarfstd =
+ (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+ ? rc->frame_level_rate_correction_factors[GF_ARF_STD]
+ : p_rc->rate_correction_factors[GF_ARF_STD];
+ rate_correction_factors_internormal =
+ (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+ ? rc->frame_level_rate_correction_factors[INTER_NORMAL]
+ : p_rc->rate_correction_factors[INTER_NORMAL];
+
+ if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+ rcf = rate_correction_factors_kfstd;
+ } else if (is_stat_consumption_stage(cpi)) {
+ const RATE_FACTOR_LEVEL rf_lvl =
+ get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ double rate_correction_factors_rflvl =
+ (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+ ? rc->frame_level_rate_correction_factors[rf_lvl]
+ : p_rc->rate_correction_factors[rf_lvl];
+ rcf = rate_correction_factors_rflvl;
+ } else {
+ if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) &&
+ !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
+ (cpi->oxcf.rc_cfg.mode != AOM_CBR ||
+ cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20))
+ rcf = rate_correction_factors_gfarfstd;
+ else
+ rcf = rate_correction_factors_internormal;
+ }
+ rcf *= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height);
+ return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+}
+
+/*!\brief Sets a rate vs Q correction factor
+ *
+ * This function updates the current value of a correction factor used to
+ * dynamically adjust the relationship between Q and the expected number
+ * of bits for the frame.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] is_encode_stage Indicates if recode loop or post-encode
+ * \param[in] factor New correction factor
+ * \param[in] width Frame width
+ * \param[in] height Frame height
+ *
+ * \remark Updates the rate correction factor for the
+ * current frame type in cpi->rc.
+ */
+static void set_rate_correction_factor(AV1_COMP *cpi, int is_encode_stage,
+ double factor, int width, int height) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ int update_default_rcf = 1;
+ // Normalize RCF to account for the size-dependent scaling factor.
+ factor /= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height);
+
+ factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+
+ if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+ p_rc->rate_correction_factors[KF_STD] = factor;
+ } else if (is_stat_consumption_stage(cpi)) {
+ const RATE_FACTOR_LEVEL rf_lvl =
+ get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ if (is_encode_stage &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ rc->frame_level_rate_correction_factors[rf_lvl] = factor;
+ update_default_rcf = 0;
+ }
+ if (update_default_rcf) p_rc->rate_correction_factors[rf_lvl] = factor;
+ } else {
+ if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) &&
+ !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
+ (cpi->oxcf.rc_cfg.mode != AOM_CBR ||
+ cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20)) {
+ p_rc->rate_correction_factors[GF_ARF_STD] = factor;
+ } else {
+ if (is_encode_stage &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+ rc->frame_level_rate_correction_factors[INTER_NORMAL] = factor;
+ update_default_rcf = 0;
+ }
+ if (update_default_rcf)
+ p_rc->rate_correction_factors[INTER_NORMAL] = factor;
+ }
+ }
+}
+
+void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int is_encode_stage,
+ int width, int height) {
+ const AV1_COMMON *const cm = &cpi->common;
+ double correction_factor = 1.0;
+ double rate_correction_factor =
+ get_rate_correction_factor(cpi, width, height);
+ double adjustment_limit;
+ int projected_size_based_on_q = 0;
+ int cyclic_refresh_active =
+ cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled;
+
+ // Do not update the rate factors for arf overlay frames.
+ if (cpi->rc.is_src_frame_alt_ref) return;
+
+ // Don't update rate correction factors here on scene changes as
+ // it is already reset in av1_encodedframe_overshoot_cbr(),
+ // but reset variables related to previous frame q and size.
+ // Note that the counter of frames since the last scene change
+ // is only valid when cyclic refresh mode is enabled and that
+ // this break out only applies to scene changes that are not
+ // recorded as INTRA only key frames.
+ if ((cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ) &&
+ (cpi->cyclic_refresh->counter_encode_maxq_scene_change == 0) &&
+ !frame_is_intra_only(cm) && !cpi->ppi->use_svc) {
+ cpi->rc.q_2_frame = cm->quant_params.base_qindex;
+ cpi->rc.q_1_frame = cm->quant_params.base_qindex;
+ cpi->rc.rc_2_frame = 0;
+ cpi->rc.rc_1_frame = 0;
+ return;
+ }
+
+ // Work out how big we would have expected the frame to be at this Q given
+ // the current correction factor.
+ // Stay in double to avoid int overflow when values are large
+ if (cyclic_refresh_active) {
+ projected_size_based_on_q =
+ av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
+ } else {
+ projected_size_based_on_q = av1_estimate_bits_at_q(
+ cpi, cm->quant_params.base_qindex, rate_correction_factor);
+ }
+ // Work out a size correction factor.
+ if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
+ correction_factor = (double)cpi->rc.projected_frame_size /
+ (double)projected_size_based_on_q;
+
+ // Clamp correction factor to prevent anything too extreme
+ correction_factor = AOMMAX(correction_factor, 0.25);
+
+ cpi->rc.q_2_frame = cpi->rc.q_1_frame;
+ cpi->rc.q_1_frame = cm->quant_params.base_qindex;
+ cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
+ if (correction_factor > 1.1)
+ cpi->rc.rc_1_frame = -1;
+ else if (correction_factor < 0.9)
+ cpi->rc.rc_1_frame = 1;
+ else
+ cpi->rc.rc_1_frame = 0;
+
+ // Decide how heavily to dampen the adjustment
+ if (correction_factor > 0.0) {
+ if (cpi->is_screen_content_type) {
+ adjustment_limit =
+ 0.25 + 0.5 * AOMMIN(0.5, fabs(log10(correction_factor)));
+ } else {
+ adjustment_limit =
+ 0.25 + 0.75 * AOMMIN(0.5, fabs(log10(correction_factor)));
+ }
+ } else {
+ adjustment_limit = 0.75;
+ }
+
+  // Adjust the delta Q and number of blocks updated in cyclic refresh
+  // based on overshoot or undershoot of the target in the current frame.
+ if (cyclic_refresh_active && cpi->rc.this_frame_target > 0) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ if (correction_factor > 1.25) {
+ cr->percent_refresh_adjustment =
+ AOMMAX(cr->percent_refresh_adjustment - 1, -5);
+ cr->rate_ratio_qdelta_adjustment =
+ AOMMAX(cr->rate_ratio_qdelta_adjustment - 0.05, -0.0);
+ } else if (correction_factor < 0.5) {
+ cr->percent_refresh_adjustment =
+ AOMMIN(cr->percent_refresh_adjustment + 1, 5);
+ cr->rate_ratio_qdelta_adjustment =
+ AOMMIN(cr->rate_ratio_qdelta_adjustment + 0.05, 0.25);
+ }
+ }
+
+ if (correction_factor > 1.01) {
+ // We are not already at the worst allowable quality
+ correction_factor = (1.0 + ((correction_factor - 1.0) * adjustment_limit));
+ rate_correction_factor = rate_correction_factor * correction_factor;
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor > MAX_BPB_FACTOR)
+ rate_correction_factor = MAX_BPB_FACTOR;
+ } else if (correction_factor < 0.99) {
+ // We are not already at the best allowable quality
+ correction_factor = 1.0 / correction_factor;
+ correction_factor = (1.0 + ((correction_factor - 1.0) * adjustment_limit));
+ correction_factor = 1.0 / correction_factor;
+
+ rate_correction_factor = rate_correction_factor * correction_factor;
+
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor < MIN_BPB_FACTOR)
+ rate_correction_factor = MIN_BPB_FACTOR;
+ }
+
+ set_rate_correction_factor(cpi, is_encode_stage, rate_correction_factor,
+ width, height);
+}
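+
+// Illustrative sketch (hypothetical helper, not part of libaom): the damped
+// multiplicative RCF update above, reduced to the non-screen path. For
+// example, with actual/projected bits = 2.0 the damping limit is
+// 0.25 + 0.75 * min(0.5, log10(2.0)) ~= 0.476, so the applied factor is
+// 1 + (2.0 - 1.0) * 0.476 ~= 1.476 rather than the raw 2.0.
+#if 0
+#include <math.h>
+static double model_damped_rcf_update(double rcf, double correction_factor,
+                                      double min_bpb, double max_bpb) {
+  if (correction_factor < 0.25) correction_factor = 0.25;
+  const double limit =
+      0.25 + 0.75 * fmin(0.5, fabs(log10(correction_factor)));
+  if (correction_factor > 1.01) {
+    // Frame used more bits than projected: raise the correction factor.
+    rcf *= 1.0 + (correction_factor - 1.0) * limit;
+  } else if (correction_factor < 0.99) {
+    // Frame used fewer bits than projected: damp the reciprocal, invert.
+    double inv = 1.0 / correction_factor;
+    inv = 1.0 + (inv - 1.0) * limit;
+    rcf /= inv;
+  }
+  if (rcf > max_bpb) rcf = max_bpb;
+  if (rcf < min_bpb) rcf = min_bpb;
+  return rcf;
+}
+#endif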
+
+// Calculate rate for the given 'q'.
+static int get_bits_per_mb(const AV1_COMP *cpi, int use_cyclic_refresh,
+ double correction_factor, int q) {
+ const AV1_COMMON *const cm = &cpi->common;
+ return use_cyclic_refresh
+ ? av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor)
+ : av1_rc_bits_per_mb(cpi, cm->current_frame.frame_type, q,
+ correction_factor,
+ cpi->sf.hl_sf.accurate_bit_estimate);
+}
+
+/*!\brief Searches for a Q index value predicted to give an average macro
+ * block rate closest to the target value.
+ *
+ * Similar to find_qindex_by_rate() function, but returns a q index with a
+ * rate just above or below the desired rate, depending on which of the two
+ * rates is closer to the desired rate.
+ * Also, respects the selected aq_mode when computing the rate.
+ *
+ * \ingroup rate_control
+ * \param[in] desired_bits_per_mb Target bits per mb
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] correction_factor Current Q to rate correction factor
+ * \param[in] best_qindex Min allowed Q value.
+ * \param[in] worst_qindex Max allowed Q value.
+ *
+ * \return Returns the q index whose estimated rate is closest to the
+ *         desired rate
+ */
+static int find_closest_qindex_by_rate(int desired_bits_per_mb,
+ const AV1_COMP *cpi,
+ double correction_factor,
+ int best_qindex, int worst_qindex) {
+ const int use_cyclic_refresh = cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->cyclic_refresh->apply_cyclic_refresh;
+
+ // Find 'qindex' based on 'desired_bits_per_mb'.
+ assert(best_qindex <= worst_qindex);
+ int low = best_qindex;
+ int high = worst_qindex;
+ while (low < high) {
+ const int mid = (low + high) >> 1;
+ const int mid_bits_per_mb =
+ get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, mid);
+ if (mid_bits_per_mb > desired_bits_per_mb) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
+ }
+ assert(low == high);
+
+ // Calculate rate difference of this q index from the desired rate.
+ const int curr_q = low;
+ const int curr_bits_per_mb =
+ get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, curr_q);
+ const int curr_bit_diff = (curr_bits_per_mb <= desired_bits_per_mb)
+ ? desired_bits_per_mb - curr_bits_per_mb
+ : INT_MAX;
+ assert((curr_bit_diff != INT_MAX && curr_bit_diff >= 0) ||
+ curr_q == worst_qindex);
+
+ // Calculate rate difference for previous q index too.
+ const int prev_q = curr_q - 1;
+ int prev_bit_diff;
+ if (curr_bit_diff == INT_MAX || curr_q == best_qindex) {
+ prev_bit_diff = INT_MAX;
+ } else {
+ const int prev_bits_per_mb =
+ get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, prev_q);
+ assert(prev_bits_per_mb > desired_bits_per_mb);
+ prev_bit_diff = prev_bits_per_mb - desired_bits_per_mb;
+ }
+
+ // Pick one of the two q indices, depending on which one has rate closer to
+ // the desired rate.
+ return (curr_bit_diff <= prev_bit_diff) ? curr_q : prev_q;
+}
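+
+// Illustrative sketch (hypothetical helper, not part of libaom): the search
+// above specialized to any monotonically non-increasing rate(q) function.
+// It returns the q index in [best_q, worst_q] whose rate is closest to the
+// target, preferring the higher q index (lower rate) on ties.
+#if 0
+#include <limits.h>
+static int model_find_closest_q(int (*rate)(int q), int target, int best_q,
+                                int worst_q) {
+  int low = best_q, high = worst_q;
+  while (low < high) {
+    const int mid = (low + high) >> 1;
+    if (rate(mid) > target)
+      low = mid + 1;  // Rate still too high: move to larger q.
+    else
+      high = mid;
+  }
+  // 'low' is now the smallest q with rate(q) <= target, or worst_q.
+  const int below = rate(low) <= target ? target - rate(low) : INT_MAX;
+  if (low == best_q || below == INT_MAX) return low;
+  const int above = rate(low - 1) - target;  // rate(low - 1) > target here.
+  return (below <= above) ? low : low - 1;
+}
+#endif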
+
+int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame,
+ int active_best_quality, int active_worst_quality,
+ int width, int height) {
+ const int MBs = av1_get_MBs(width, height);
+ const double correction_factor =
+ get_rate_correction_factor(cpi, width, height);
+ const int target_bits_per_mb =
+ (int)(((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / MBs);
+
+ int q =
+ find_closest_qindex_by_rate(target_bits_per_mb, cpi, correction_factor,
+ active_best_quality, active_worst_quality);
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR && has_no_stats_stage(cpi))
+ return adjust_q_cbr(cpi, q, active_worst_quality, width, height);
+
+ return q;
+}
+
+static int get_active_quality(int q, int gfu_boost, int low, int high,
+ int *low_motion_minq, int *high_motion_minq) {
+ if (gfu_boost > high) {
+ return low_motion_minq[q];
+ } else if (gfu_boost < low) {
+ return high_motion_minq[q];
+ } else {
+ const int gap = high - low;
+ const int offset = high - gfu_boost;
+ const int qdiff = high_motion_minq[q] - low_motion_minq[q];
+ const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+ return low_motion_minq[q] + adjustment;
+ }
+}
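+
+// Illustrative worked example (hypothetical numbers): with low = 400,
+// high = 2000, gfu_boost = 1200, low_motion_minq[q] = 20 and
+// high_motion_minq[q] = 36, the interpolation above gives
+//   adjustment = ((2000 - 1200) * (36 - 20) + 800) / 1600 = 8,
+// so the active quality is 20 + 8 = 28, i.e. halfway between the two
+// tables for a boost halfway between the two thresholds.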
+
+static int get_kf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q,
+ aom_bit_depth_t bit_depth) {
+ int *kf_low_motion_minq;
+ int *kf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq);
+ ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq);
+ return get_active_quality(q, p_rc->kf_boost, kf_low, kf_high,
+ kf_low_motion_minq, kf_high_motion_minq);
+}
+
+static int get_gf_active_quality_no_rc(int gfu_boost, int q,
+ aom_bit_depth_t bit_depth) {
+ int *arfgf_low_motion_minq;
+ int *arfgf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq);
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+ return get_active_quality(q, gfu_boost, gf_low, gf_high,
+ arfgf_low_motion_minq, arfgf_high_motion_minq);
+}
+
+static int get_gf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q,
+ aom_bit_depth_t bit_depth) {
+ return get_gf_active_quality_no_rc(p_rc->gfu_boost, q, bit_depth);
+}
+
+static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) {
+ int *arfgf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+ return arfgf_high_motion_minq[q];
+}
+
+static int calc_active_worst_quality_no_stats_vbr(const AV1_COMP *cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const unsigned int curr_frame = cpi->common.current_frame.frame_number;
+ int active_worst_quality;
+ int last_q_key_frame;
+ int last_q_inter_frame;
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ last_q_key_frame = simulate_parallel_frame ? p_rc->temp_last_q[KEY_FRAME]
+ : p_rc->last_q[KEY_FRAME];
+ last_q_inter_frame = simulate_parallel_frame ? p_rc->temp_last_q[INTER_FRAME]
+ : p_rc->last_q[INTER_FRAME];
+#else
+ last_q_key_frame = p_rc->last_q[KEY_FRAME];
+ last_q_inter_frame = p_rc->last_q[INTER_FRAME];
+#endif
+
+ if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+ active_worst_quality =
+ curr_frame == 0 ? rc->worst_quality : last_q_key_frame * 2;
+ } else {
+ if (!rc->is_src_frame_alt_ref &&
+ (refresh_frame->golden_frame || refresh_frame->bwd_ref_frame ||
+ refresh_frame->alt_ref_frame)) {
+ active_worst_quality =
+ curr_frame == 1 ? last_q_key_frame * 5 / 4 : last_q_inter_frame;
+ } else {
+ active_worst_quality =
+ curr_frame == 1 ? last_q_key_frame * 2 : last_q_inter_frame * 2;
+ }
+ }
+ return AOMMIN(active_worst_quality, rc->worst_quality);
+}
+
+// Adjust active_worst_quality level based on buffer level.
+static int calc_active_worst_quality_no_stats_cbr(const AV1_COMP *cpi) {
+ // Adjust active_worst_quality: If buffer is above the optimal/target level,
+ // bring active_worst_quality down depending on fullness of buffer.
+ // If buffer is below the optimal level, let the active_worst_quality go from
+ // ambient Q (at buffer = optimal level) to worst_quality level
+ // (at buffer = critical level).
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+ const SVC *const svc = &cpi->svc;
+ unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers;
+ // Buffer level below which we push active_worst to worst_quality.
+ int64_t critical_level = p_rc->optimal_buffer_level >> 3;
+ int64_t buff_lvl_step = 0;
+ int adjustment = 0;
+ int active_worst_quality;
+ int ambient_qp;
+ if (cm->current_frame.frame_type == KEY_FRAME) return rc->worst_quality;
+  // For ambient_qp we use the minimum of avg_frame_qindex[KEY_FRAME] and
+  // avg_frame_qindex[INTER_FRAME] for the first few frames following a key
+  // frame. These are both initialized to worst_quality and updated with a
+  // (3/4, 1/4) average in postencode_update, so for the first few frames
+  // following a key frame, the qp of that key frame is weighted into the
+  // active_worst_quality setting. For SVC the key frame should correspond
+  // to layer (0, 0), so use that layer context.
+ int avg_qindex_key = p_rc->avg_frame_qindex[KEY_FRAME];
+ if (svc->number_temporal_layers > 1) {
+ int layer = LAYER_IDS_TO_IDX(0, 0, svc->number_temporal_layers);
+ const LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ const PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
+ avg_qindex_key =
+ AOMMIN(lp_rc->avg_frame_qindex[KEY_FRAME], lp_rc->last_q[KEY_FRAME]);
+ }
+ ambient_qp = (cm->current_frame.frame_number < num_frames_weight_key)
+ ? AOMMIN(p_rc->avg_frame_qindex[INTER_FRAME], avg_qindex_key)
+ : p_rc->avg_frame_qindex[INTER_FRAME];
+ ambient_qp = AOMMIN(rc->worst_quality, ambient_qp);
+
+ if (p_rc->buffer_level > p_rc->optimal_buffer_level) {
+ // Adjust down.
+ int max_adjustment_down; // Maximum adjustment down for Q
+
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && !cpi->ppi->use_svc &&
+ (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN)) {
+ active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp);
+ max_adjustment_down = AOMMIN(4, active_worst_quality / 16);
+ } else {
+ active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4);
+ max_adjustment_down = active_worst_quality / 3;
+ }
+
+ if (max_adjustment_down) {
+ buff_lvl_step =
+ ((p_rc->maximum_buffer_size - p_rc->optimal_buffer_level) /
+ max_adjustment_down);
+ if (buff_lvl_step)
+ adjustment = (int)((p_rc->buffer_level - p_rc->optimal_buffer_level) /
+ buff_lvl_step);
+ active_worst_quality -= adjustment;
+ }
+ } else if (p_rc->buffer_level > critical_level) {
+ // Adjust up from ambient Q.
+ active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp);
+ if (critical_level) {
+ buff_lvl_step = (p_rc->optimal_buffer_level - critical_level);
+ if (buff_lvl_step) {
+ adjustment = (int)((rc->worst_quality - ambient_qp) *
+ (p_rc->optimal_buffer_level - p_rc->buffer_level) /
+ buff_lvl_step);
+ }
+ active_worst_quality += adjustment;
+ }
+ } else {
+ // Set to worst_quality if buffer is below critical level.
+ active_worst_quality = rc->worst_quality;
+ }
+ return active_worst_quality;
+}
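+
+// Illustrative sketch (hypothetical helper, not part of libaom): the three
+// buffer regimes above, reduced to the non-screen path. Above the optimal
+// level the worst quality steps down toward ambient; between optimal and
+// critical it is interpolated up toward worst_q; below critical it
+// saturates at worst_q.
+#if 0
+#include <stdint.h>
+static int model_active_worst_cbr(int64_t buffer, int64_t optimal,
+                                  int64_t maximum, int64_t critical,
+                                  int ambient_qp, int worst_q) {
+  if (buffer > optimal) {
+    int aw = worst_q < ambient_qp * 5 / 4 ? worst_q : ambient_qp * 5 / 4;
+    const int max_adj_down = aw / 3;
+    const int64_t step =
+        max_adj_down ? (maximum - optimal) / max_adj_down : 0;
+    if (step) aw -= (int)((buffer - optimal) / step);
+    return aw;
+  }
+  if (buffer > critical) {
+    int aw = worst_q < ambient_qp ? worst_q : ambient_qp;
+    const int64_t span = optimal - critical;
+    if (critical > 0 && span > 0)
+      aw += (int)((worst_q - ambient_qp) * (optimal - buffer) / span);
+    return aw;
+  }
+  return worst_q;  // Below the critical level: allow the worst quality.
+}
+#endif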
+
+// Calculate the active_best_quality level.
+static int calc_active_best_quality_no_stats_cbr(const AV1_COMP *cpi,
+ int active_worst_quality,
+ int width, int height) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ int *rtc_minq;
+ const int bit_depth = cm->seq_params->bit_depth;
+ int active_best_quality = rc->best_quality;
+ ASSIGN_MINQ_TABLE(bit_depth, rtc_minq);
+
+ if (frame_is_intra_only(cm)) {
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ if (p_rc->this_key_frame_forced) {
+ int qindex = p_rc->last_boosted_qindex;
+ double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ int delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ (last_boosted_q * 0.75), bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (current_frame->frame_number > 0) {
+ // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+ double q_val;
+ active_best_quality = get_kf_active_quality(
+ p_rc, p_rc->avg_frame_qindex[KEY_FRAME], bit_depth);
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ }
+ } else if (!rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
+ cpi->oxcf.rc_cfg.gf_cbr_boost_pct &&
+ (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ int q = active_worst_quality;
+ if (rc->frames_since_key > 1 &&
+ p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = p_rc->avg_frame_qindex[INTER_FRAME];
+ }
+ active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
+ } else {
+ // Use the lower of active_worst_quality and recent/average Q.
+ FRAME_TYPE frame_type =
+ (current_frame->frame_number > 1) ? INTER_FRAME : KEY_FRAME;
+ if (p_rc->avg_frame_qindex[frame_type] < active_worst_quality)
+ active_best_quality = rtc_minq[p_rc->avg_frame_qindex[frame_type]];
+ else
+ active_best_quality = rtc_minq[active_worst_quality];
+ }
+ return active_best_quality;
+}
+
+#if RT_PASSIVE_STRATEGY
+static int get_q_passive_strategy(const AV1_COMP *const cpi,
+ const int q_candidate, const int threshold) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ int sum = 0;
+ int count = 0;
+ int i = 1;
+ while (i < MAX_Q_HISTORY) {
+ int frame_id = current_frame->frame_number - i;
+ if (frame_id <= 0) break;
+ sum += p_rc->q_history[frame_id % MAX_Q_HISTORY];
+ ++count;
+ ++i;
+ }
+ if (count > 0) {
+ const int avg_q = sum / count;
+ if (abs(avg_q - q_candidate) <= threshold) return avg_q;
+ }
+ return q_candidate;
+}
+#endif // RT_PASSIVE_STRATEGY
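+
+// Illustrative sketch (hypothetical helper, not part of libaom): the
+// smoothing above applied to a plain history array. If the candidate q is
+// within 'threshold' of the average of recent frame Qs, the average is used
+// instead, damping frame-to-frame Q oscillation on static screen content.
+#if 0
+#include <stdlib.h>
+static int model_passive_q(const int *q_history, int history_len,
+                           int q_candidate, int threshold) {
+  if (history_len <= 0) return q_candidate;
+  int sum = 0;
+  for (int i = 0; i < history_len; ++i) sum += q_history[i];
+  const int avg_q = sum / history_len;
+  return (abs(avg_q - q_candidate) <= threshold) ? avg_q : q_candidate;
+}
+#endif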
+
+/*!\brief Picks q and q bounds given CBR rate control parameters in \c cpi->rc.
+ *
+ * Handles the special case when using:
+ * - Constant bit-rate mode: \c cpi->oxcf.rc_cfg.mode == \ref AOM_CBR, and
+ * - 1-pass encoding without LAP (look-ahead processing), so 1st pass stats are
+ * NOT available.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] width Coded frame width
+ * \param[in] height Coded frame height
+ * \param[out] bottom_index Bottom bound for q index (best quality)
+ * \param[out] top_index Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds_no_stats_cbr(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ int q;
+ int active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi);
+ int active_best_quality = calc_active_best_quality_no_stats_cbr(
+ cpi, active_worst_quality, width, height);
+ assert(has_no_stats_stage(cpi));
+ assert(cpi->oxcf.rc_cfg.mode == AOM_CBR);
+
+ // Clip the active best and worst quality values to limits
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ // Limit Q range for the adaptive loop.
+ if (current_frame->frame_type == KEY_FRAME && !p_rc->this_key_frame_forced &&
+ current_frame->frame_number != 0) {
+ int qdelta = 0;
+ qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type,
+ active_worst_quality, 2.0);
+ *top_index = active_worst_quality + qdelta;
+ *top_index = AOMMAX(*top_index, *bottom_index);
+ }
+
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
+#if RT_PASSIVE_STRATEGY
+ if (current_frame->frame_type != KEY_FRAME &&
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ q = get_q_passive_strategy(cpi, q, 50);
+ }
+#endif // RT_PASSIVE_STRATEGY
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
+static int gf_group_pyramid_level(const GF_GROUP *gf_group, int gf_index) {
+ return gf_group->layer_depth[gf_index];
+}
+
+static int get_active_cq_level(const RATE_CONTROL *rc,
+ const PRIMARY_RATE_CONTROL *p_rc,
+ const AV1EncoderConfig *const oxcf,
+ int intra_only, aom_superres_mode superres_mode,
+ int superres_denom) {
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+ static const double cq_adjust_threshold = 0.1;
+ int active_cq_level = rc_cfg->cq_level;
+ if (rc_cfg->mode == AOM_CQ || rc_cfg->mode == AOM_Q) {
+ // printf("Superres %d %d %d = %d\n", superres_denom, intra_only,
+ // rc->frames_to_key, !(intra_only && rc->frames_to_key <= 1));
+ if ((superres_mode == AOM_SUPERRES_QTHRESH ||
+ superres_mode == AOM_SUPERRES_AUTO) &&
+ superres_denom != SCALE_NUMERATOR) {
+ int mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO;
+ if (intra_only && rc->frames_to_key <= 1) {
+ mult = 0;
+ } else if (intra_only) {
+ mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME;
+ } else {
+ mult = SUPERRES_QADJ_PER_DENOM_ARFFRAME;
+ }
+ active_cq_level = AOMMAX(
+ active_cq_level - ((superres_denom - SCALE_NUMERATOR) * mult), 0);
+ }
+ }
+ if (rc_cfg->mode == AOM_CQ && p_rc->total_target_bits > 0) {
+ const double x = (double)p_rc->total_actual_bits / p_rc->total_target_bits;
+ if (x < cq_adjust_threshold) {
+ active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold);
+ }
+ }
+ return active_cq_level;
+}
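+
+// Illustrative worked example (hypothetical numbers): for a key frame with
+// cq_level = 60, superres_denom = 16 and SCALE_NUMERATOR = 8, and assuming
+// a per-denominator step (mult) of 2, the superres branch above yields
+//   active_cq_level = max(60 - (16 - 8) * 2, 0) = 44,
+// lowering the CQ level to compensate for quality lost to upscaling.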
+
+/*!\brief Picks q and q bounds given non-CBR rate control params in \c cpi->rc.
+ *
+ * Handles the special case when using:
+ * - Any rate control other than constant bit-rate mode:
+ * \c cpi->oxcf.rc_cfg.mode != \ref AOM_CBR, and
+ * - 1-pass encoding without LAP (look-ahead processing), so 1st pass stats are
+ * NOT available.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] width Coded frame width
+ * \param[in] height Coded frame height
+ * \param[out] bottom_index Bottom bound for q index (best quality)
+ * \param[out] top_index Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode;
+
+ assert(has_no_stats_stage(cpi));
+ assert(rc_mode == AOM_VBR ||
+ (!USE_UNRESTRICTED_Q_IN_CQ_MODE && rc_mode == AOM_CQ) ||
+ rc_mode == AOM_Q);
+
+ const int cq_level =
+ get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+ cpi->superres_mode, cm->superres_scale_denominator);
+ const int bit_depth = cm->seq_params->bit_depth;
+
+ int active_best_quality;
+ int active_worst_quality = calc_active_worst_quality_no_stats_vbr(cpi);
+ int q;
+ int *inter_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+
+ if (frame_is_intra_only(cm)) {
+ if (rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex =
+ av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (p_rc->this_key_frame_forced) {
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int qindex = simulate_parallel_frame ? p_rc->temp_last_boosted_qindex
+ : p_rc->last_boosted_qindex;
+#else
+ int qindex = p_rc->last_boosted_qindex;
+#endif
+ const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex = av1_compute_qdelta(
+ rc, last_boosted_q, last_boosted_q * 0.75, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else { // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+
+ active_best_quality = get_kf_active_quality(
+ p_rc, p_rc->avg_frame_qindex[KEY_FRAME], bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Convert the adjustment factor to a qindex delta on active_best_quality.
+ {
+ const double q_val =
+ av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ }
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ q = (rc->frames_since_key > 1 &&
+ p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+ ? p_rc->avg_frame_qindex[INTER_FRAME]
+ : p_rc->avg_frame_qindex[KEY_FRAME];
+    // For constrained quality don't allow Q less than the cq level.
+ if (rc_mode == AOM_CQ) {
+ if (q < cq_level) q = cq_level;
+ active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
+      // Constrained quality uses a slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16;
+ } else if (rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex =
+ (refresh_frame->alt_ref_frame)
+ ? av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth)
+ : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
+ }
+ } else {
+ if (rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0,
+ 0.70, 1.0, 0.85, 1.0 };
+ const int delta_qindex = av1_compute_qdelta(
+ rc, q_val,
+ q_val * delta_rate[current_frame->frame_number % FIXED_GF_INTERVAL],
+ bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ // Use the lower of active_worst_quality and recent/average Q.
+ active_best_quality =
+ (current_frame->frame_number > 1)
+ ? inter_minq[p_rc->avg_frame_qindex[INTER_FRAME]]
+ : inter_minq[p_rc->avg_frame_qindex[KEY_FRAME]];
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+ active_best_quality = cq_level;
+ }
+ }
+ }
+
+ // Clip the active best and worst quality values to limits
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ // Limit Q range for the adaptive loop.
+ {
+ int qdelta = 0;
+ if (current_frame->frame_type == KEY_FRAME &&
+ !p_rc->this_key_frame_forced && current_frame->frame_number != 0) {
+ qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type,
+ active_worst_quality, 2.0);
+ } else if (!rc->is_src_frame_alt_ref &&
+ (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
+ qdelta = av1_compute_qdelta_by_rate(cpi, current_frame->frame_type,
+ active_worst_quality, 1.75);
+ }
+ *top_index = active_worst_quality + qdelta;
+ *top_index = AOMMAX(*top_index, *bottom_index);
+ }
+
+ if (rc_mode == AOM_Q) {
+ q = active_best_quality;
+ // Special case code to try and match quality with forced key frames
+ } else if ((current_frame->frame_type == KEY_FRAME) &&
+ p_rc->this_key_frame_forced) {
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ q = simulate_parallel_frame ? p_rc->temp_last_boosted_qindex
+ : p_rc->last_boosted_qindex;
+#else
+ q = p_rc->last_boosted_qindex;
+#endif
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
+ }
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
+static const double arf_layer_deltas[MAX_ARF_LAYERS + 1] = { 2.50, 2.00, 1.75,
+ 1.50, 1.25, 1.15,
+ 1.0 };
+int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) {
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const RATE_FACTOR_LEVEL rf_lvl =
+ get_rate_factor_level(gf_group, cpi->gf_frame_index);
+ const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index];
+ const int arf_layer = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const double rate_factor =
+ (rf_lvl == INTER_NORMAL) ? 1.0 : arf_layer_deltas[arf_layer];
+
+ return av1_compute_qdelta_by_rate(cpi, frame_type, q, rate_factor);
+}
+
+// This unrestricted Q selection in CQ mode is useful when testing new
+// features, but may lead to Q being out of range under current RC
+// restrictions.
+#if USE_UNRESTRICTED_Q_IN_CQ_MODE
+static int rc_pick_q_and_bounds_no_stats_cq(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const int cq_level =
+      get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+                          cpi->superres_mode, cm->superres_scale_denominator);
+ const int bit_depth = cm->seq_params->bit_depth;
+ const int q = (int)av1_convert_qindex_to_q(cq_level, bit_depth);
+ (void)width;
+ (void)height;
+ assert(has_no_stats_stage(cpi));
+ assert(cpi->oxcf.rc_cfg.mode == AOM_CQ);
+
+ *top_index = q;
+ *bottom_index = q;
+
+ return q;
+}
+#endif // USE_UNRESTRICTED_Q_IN_CQ_MODE
+
+#define STATIC_MOTION_THRESH 95
+static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+ int *active_best, int *active_worst,
+ int cq_level) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ int active_best_quality;
+ int active_worst_quality = *active_worst;
+ const int bit_depth = cm->seq_params->bit_depth;
+
+ if (rc->frames_to_key <= 1 && oxcf->rc_cfg.mode == AOM_Q) {
+ // If the next frame is also a key frame or the current frame is the
+ // only frame in the sequence in AOM_Q mode, just use the cq_level
+ // as q.
+ active_best_quality = cq_level;
+ active_worst_quality = cq_level;
+ } else if (p_rc->this_key_frame_forced) {
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ double last_boosted_q;
+ int delta_qindex;
+ int qindex;
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int last_boosted_qindex = simulate_parallel_frame
+ ? p_rc->temp_last_boosted_qindex
+ : p_rc->last_boosted_qindex;
+#else
+ int last_boosted_qindex = p_rc->last_boosted_qindex;
+#endif
+ if (is_stat_consumption_stage_twopass(cpi) &&
+ cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ qindex = AOMMIN(p_rc->last_kf_qindex, last_boosted_qindex);
+ active_best_quality = qindex;
+ last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 1.25, bit_depth);
+ active_worst_quality =
+ AOMMIN(qindex + delta_qindex, active_worst_quality);
+ } else {
+ qindex = last_boosted_qindex;
+ last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 0.50, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ }
+ } else {
+ // Not forced keyframe.
+ double q_adj_factor = 1.0;
+ double q_val;
+
+ // Baseline value derived from active_worst_quality and kf boost.
+ active_best_quality =
+ get_kf_active_quality(p_rc, active_worst_quality, bit_depth);
+ if (cpi->is_screen_content_type) {
+ active_best_quality /= 2;
+ }
+
+ if (is_stat_consumption_stage_twopass(cpi) &&
+ cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
+ active_best_quality /= 3;
+ }
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Make a further adjustment based on the kf zero motion measure.
+ if (is_stat_consumption_stage_twopass(cpi))
+ q_adj_factor +=
+ 0.05 - (0.001 * (double)cpi->ppi->twopass.kf_zeromotion_pct);
+
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+
+ // Tweak active_best_quality for AOM_Q mode when superres is on, as this
+ // will be used directly as 'q' later.
+ if (oxcf->rc_cfg.mode == AOM_Q &&
+ (cpi->superres_mode == AOM_SUPERRES_QTHRESH ||
+ cpi->superres_mode == AOM_SUPERRES_AUTO) &&
+ cm->superres_scale_denominator != SCALE_NUMERATOR) {
+ active_best_quality =
+ AOMMAX(active_best_quality -
+ ((cm->superres_scale_denominator - SCALE_NUMERATOR) *
+ SUPERRES_QADJ_PER_DENOM_KEYFRAME),
+ 0);
+ }
+ }
+ *active_best = active_best_quality;
+ *active_worst = active_worst_quality;
+}
+
+static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi,
+ const int is_intrl_arf_boost,
+ int *active_worst,
+ int *active_best) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ int active_best_quality = *active_best;
+ int active_worst_quality = *active_worst;
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int extend_minq = simulate_parallel_frame ? p_rc->temp_extend_minq
+ : cpi->ppi->twopass.extend_minq;
+ int extend_maxq = simulate_parallel_frame ? p_rc->temp_extend_maxq
+ : cpi->ppi->twopass.extend_maxq;
+#endif
+ // Extension to max or min Q if undershoot or overshoot is outside
+ // the permitted range.
+ if (cpi->oxcf.rc_cfg.mode != AOM_Q) {
+ if (frame_is_intra_only(cm) ||
+ (!rc->is_src_frame_alt_ref &&
+ (refresh_frame->golden_frame || is_intrl_arf_boost ||
+ refresh_frame->alt_ref_frame))) {
+#if CONFIG_FPMT_TEST
+ active_best_quality -= extend_minq;
+ active_worst_quality += (extend_maxq / 2);
+#else
+ active_best_quality -= cpi->ppi->twopass.extend_minq / 4;
+ active_worst_quality += (cpi->ppi->twopass.extend_maxq / 2);
+#endif
+ } else {
+#if CONFIG_FPMT_TEST
+ active_best_quality -= extend_minq / 2;
+ active_worst_quality += extend_maxq;
+#else
+ active_best_quality -= cpi->ppi->twopass.extend_minq / 4;
+ active_worst_quality += cpi->ppi->twopass.extend_maxq;
+#endif
+ }
+ }
+
+#ifndef STRICT_RC
+ // Static forced key frames Q restrictions dealt with elsewhere.
+ if (!(frame_is_intra_only(cm)) || !p_rc->this_key_frame_forced ||
+ (cpi->ppi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
+ const int qdelta = av1_frame_type_qdelta(cpi, active_worst_quality);
+ active_worst_quality =
+ AOMMAX(active_worst_quality + qdelta, active_best_quality);
+ }
+#endif
+
+ // Modify active_best_quality for downscaled normal frames.
+ if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) {
+ int qdelta = av1_compute_qdelta_by_rate(cpi, cm->current_frame.frame_type,
+ active_best_quality, 2.0);
+ active_best_quality =
+ AOMMAX(active_best_quality + qdelta, rc->best_quality);
+ }
+
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *active_best = active_best_quality;
+ *active_worst = active_worst_quality;
+}
+
+/*!\brief Gets a Q value to use for the current frame
+ *
+ * Selects a Q value from a permitted range that we estimate
+ * will result in approximately the target number of bits.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] width Width of frame
+ * \param[in] height Height of frame
+ * \param[in] active_worst_quality Max Q allowed
+ * \param[in] active_best_quality Min Q allowed
+ *
+ * \return The suggested Q for this frame.
+ */
+static int get_q(const AV1_COMP *cpi, const int width, const int height,
+ const int active_worst_quality,
+ const int active_best_quality) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ int q;
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg;
+ int last_boosted_qindex = simulate_parallel_frame
+ ? p_rc->temp_last_boosted_qindex
+ : p_rc->last_boosted_qindex;
+#else
+ int last_boosted_qindex = p_rc->last_boosted_qindex;
+#endif
+
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q ||
+ (frame_is_intra_only(cm) && !p_rc->this_key_frame_forced &&
+ cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH &&
+ rc->frames_to_key > 1)) {
+ q = active_best_quality;
+ // Special case code to try and match quality with forced key frames.
+ } else if (frame_is_intra_only(cm) && p_rc->this_key_frame_forced) {
+ // If static since last kf use better of last boosted and last kf q.
+ if (cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ q = AOMMIN(p_rc->last_kf_qindex, last_boosted_qindex);
+ } else {
+ q = AOMMIN(last_boosted_qindex,
+ (active_best_quality + active_worst_quality) / 2);
+ }
+ q = clamp(q, active_best_quality, active_worst_quality);
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
+ if (q > active_worst_quality) {
+ // Special case when we are targeting the max allowed rate.
+ if (rc->this_frame_target < rc->max_frame_bandwidth) {
+ q = active_worst_quality;
+ }
+ }
+ q = AOMMAX(q, active_best_quality);
+ }
+ return q;
+}
+
+// Returns |active_best_quality| for an inter frame.
+// The |active_best_quality| depends on different rate control modes:
+// VBR, Q, CQ, CBR.
+// The returned active_best_quality may be further adjusted in
+// adjust_active_best_and_worst_quality().
+static int get_active_best_quality(const AV1_COMP *const cpi,
+ const int active_worst_quality,
+ const int cq_level, const int gf_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bit_depth = cm->seq_params->bit_depth;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode;
+ int *inter_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+ int active_best_quality = 0;
+ const int is_intrl_arf_boost =
+ gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
+ int is_leaf_frame =
+ !(gf_group->update_type[gf_index] == ARF_UPDATE ||
+ gf_group->update_type[gf_index] == GF_UPDATE || is_intrl_arf_boost);
+
+  // TODO(jingning): Consider reworking this hack that covers issues incurred
+  // in the lightfield setting.
+ if (cm->tiles.large_scale) {
+ is_leaf_frame = !(refresh_frame->golden_frame ||
+ refresh_frame->alt_ref_frame || is_intrl_arf_boost);
+ }
+ const int is_overlay_frame = rc->is_src_frame_alt_ref;
+
+ if (is_leaf_frame || is_overlay_frame) {
+ if (rc_mode == AOM_Q) return cq_level;
+
+ active_best_quality = inter_minq[active_worst_quality];
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+ active_best_quality = cq_level;
+ }
+ return active_best_quality;
+ }
+
+ // Determine active_best_quality for frames that are not leaf or overlay.
+ int q = active_worst_quality;
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ if (rc->frames_since_key > 1 &&
+ p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = p_rc->avg_frame_qindex[INTER_FRAME];
+ }
+ if (rc_mode == AOM_CQ && q < cq_level) q = cq_level;
+ active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
+  // Constrained quality uses a slightly lower active best.
+ if (rc_mode == AOM_CQ) active_best_quality = active_best_quality * 15 / 16;
+ const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+ const int boost = min_boost - active_best_quality;
+ active_best_quality = min_boost - (int)(boost * p_rc->arf_boost_factor);
+ if (!is_intrl_arf_boost) return active_best_quality;
+
+ if (rc_mode == AOM_Q || rc_mode == AOM_CQ) active_best_quality = p_rc->arf_q;
+ int this_height = gf_group_pyramid_level(gf_group, gf_index);
+ while (this_height > 1) {
+ active_best_quality = (active_best_quality + active_worst_quality + 1) / 2;
+ --this_height;
+ }
+ return active_best_quality;
+}
+
+// Returns the q_index for a single frame in the GOP.
+// This function assumes that rc_mode == AOM_Q mode.
+int av1_q_mode_get_q_index(int base_q_index, int gf_update_type,
+ int gf_pyramid_level, int arf_q) {
+ const int is_intrl_arf_boost = gf_update_type == INTNL_ARF_UPDATE;
+ int is_leaf_or_overlay_frame = gf_update_type == LF_UPDATE ||
+ gf_update_type == OVERLAY_UPDATE ||
+ gf_update_type == INTNL_OVERLAY_UPDATE;
+
+ if (is_leaf_or_overlay_frame) return base_q_index;
+
+ if (!is_intrl_arf_boost) return arf_q;
+
+ int active_best_quality = arf_q;
+ int active_worst_quality = base_q_index;
+
+ while (gf_pyramid_level > 1) {
+ active_best_quality = (active_best_quality + active_worst_quality + 1) / 2;
+ --gf_pyramid_level;
+ }
+ return active_best_quality;
+}
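+
+// Illustrative worked example (hypothetical numbers): with arf_q = 40,
+// base_q_index = 120 and an internal ARF at pyramid level 3, the loop above
+// computes
+//   level 3: (40 + 120 + 1) / 2 = 80,  level 2: (80 + 120 + 1) / 2 = 100,
+// so each additional pyramid level moves the frame's q halfway from the
+// ARF q toward the base q.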
+
+// Returns the q_index for the ARF in the GOP.
+int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth,
+ double arf_boost_factor) {
+ int active_best_quality =
+ get_gf_active_quality_no_rc(gfu_boost, base_q_index, bit_depth);
+ const int min_boost = get_gf_high_motion_quality(base_q_index, bit_depth);
+ const int boost = min_boost - active_best_quality;
+ return min_boost - (int)(boost * arf_boost_factor);
+}
+
+static int rc_pick_q_and_bounds_q_mode(const AV1_COMP *cpi, int width,
+ int height, int gf_index,
+ int *bottom_index, int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int cq_level =
+ get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+ cpi->superres_mode, cm->superres_scale_denominator);
+ int active_best_quality = 0;
+ int active_worst_quality = rc->active_worst_quality;
+ int q;
+
+ if (frame_is_intra_only(cm)) {
+ get_intra_q_and_bounds(cpi, width, height, &active_best_quality,
+ &active_worst_quality, cq_level);
+ } else {
+ // Active best quality limited by previous layer.
+ active_best_quality =
+ get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index);
+ }
+
+ if (cq_level > 0) active_best_quality = AOMMAX(1, active_best_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ *top_index = AOMMAX(*top_index, rc->best_quality);
+ *top_index = AOMMIN(*top_index, rc->worst_quality);
+
+ *bottom_index = AOMMAX(*bottom_index, rc->best_quality);
+ *bottom_index = AOMMIN(*bottom_index, rc->worst_quality);
+
+ q = active_best_quality;
+
+ q = AOMMAX(q, rc->best_quality);
+ q = AOMMIN(q, rc->worst_quality);
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+
+ return q;
+}
+
+/*!\brief Picks q and q bounds given rate control parameters in \c cpi->rc.
+ *
+ * Handles the general cases not covered by
+ * \ref rc_pick_q_and_bounds_no_stats_cbr() and
+ * \ref rc_pick_q_and_bounds_no_stats()
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] width Coded frame width
+ * \param[in] height Coded frame height
+ * \param[in] gf_index Index of this frame in the golden frame group
+ * \param[out] bottom_index Bottom bound for q index (best quality)
+ * \param[out] top_index Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ */
+static int rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+ int gf_index, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ assert(IMPLIES(has_no_stats_stage(cpi),
+ cpi->oxcf.rc_cfg.mode == AOM_Q &&
+ gf_group->update_type[gf_index] != ARF_UPDATE));
+ const int cq_level =
+ get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+ cpi->superres_mode, cm->superres_scale_denominator);
+
+ if (oxcf->rc_cfg.mode == AOM_Q) {
+ return rc_pick_q_and_bounds_q_mode(cpi, width, height, gf_index,
+ bottom_index, top_index);
+ }
+
+ int active_best_quality = 0;
+ int active_worst_quality = rc->active_worst_quality;
+ int q;
+
+ const int is_intrl_arf_boost =
+ gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
+
+ if (frame_is_intra_only(cm)) {
+ get_intra_q_and_bounds(cpi, width, height, &active_best_quality,
+ &active_worst_quality, cq_level);
+#ifdef STRICT_RC
+ active_best_quality = 0;
+#endif
+ } else {
+ // Active best quality limited by previous layer.
+ const int pyramid_level = gf_group_pyramid_level(gf_group, gf_index);
+
+ if ((pyramid_level <= 1) || (pyramid_level > MAX_ARF_LAYERS)) {
+ active_best_quality = get_active_best_quality(cpi, active_worst_quality,
+ cq_level, gf_index);
+ } else {
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int local_active_best_quality =
+ simulate_parallel_frame
+ ? p_rc->temp_active_best_quality[pyramid_level - 1]
+ : p_rc->active_best_quality[pyramid_level - 1];
+ active_best_quality = local_active_best_quality + 1;
+#else
+ active_best_quality = p_rc->active_best_quality[pyramid_level - 1] + 1;
+#endif
+
+ active_best_quality = AOMMIN(active_best_quality, active_worst_quality);
+#ifdef STRICT_RC
+ active_best_quality += (active_worst_quality - active_best_quality) / 16;
+#else
+ active_best_quality += (active_worst_quality - active_best_quality) / 2;
+#endif
+ }
+
+ // For alt_ref and GF frames (including internal arf frames) adjust the
+    // worst allowed quality as well. This ensures that even on hard
+    // sections we don't clamp the Q at the same value for arf frames and
+ // leaf (non arf) frames. This is important to the TPL model which assumes
+ // Q drops with each arf level.
+ if (!(rc->is_src_frame_alt_ref) &&
+ (refresh_frame->golden_frame || refresh_frame->alt_ref_frame ||
+ is_intrl_arf_boost)) {
+ active_worst_quality =
+ (active_best_quality + (3 * active_worst_quality) + 2) / 4;
+ }
+ }
+
+ adjust_active_best_and_worst_quality(
+ cpi, is_intrl_arf_boost, &active_worst_quality, &active_best_quality);
+ q = get_q(cpi, width, height, active_worst_quality, active_best_quality);
+
+ // Special case when we are targeting the max allowed rate.
+ if (rc->this_frame_target >= rc->max_frame_bandwidth &&
+ q > active_worst_quality) {
+ active_worst_quality = q;
+ }
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+
+ return q;
+}
+
+static void rc_compute_variance_onepass_rt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ YV12_BUFFER_CONFIG const *const unscaled_src = cpi->unscaled_source;
+ if (unscaled_src == NULL) return;
+
+ const uint8_t *src_y = unscaled_src->y_buffer;
+ const int src_ystride = unscaled_src->y_stride;
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ const uint8_t *pre_y = yv12->buffers[0];
+ const int pre_ystride = yv12->strides[0];
+
+ // TODO(yunqing): support scaled reference frames.
+ if (cpi->scaled_ref_buf[LAST_FRAME - 1]) return;
+
+ for (int i = 0; i < 2; ++i) {
+ if (unscaled_src->widths[i] != yv12->widths[i] ||
+ unscaled_src->heights[i] != yv12->heights[i]) {
+ return;
+ }
+ }
+
+ const int num_mi_cols = cm->mi_params.mi_cols;
+ const int num_mi_rows = cm->mi_params.mi_rows;
+ const BLOCK_SIZE bsize = BLOCK_64X64;
+ int num_samples = 0;
+ // sse is computed on 64x64 blocks
+ const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+ const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb;
+
+ uint64_t fsse = 0;
+ cpi->rec_sse = 0;
+
+ for (int sbi_row = 0; sbi_row < sb_rows; ++sbi_row) {
+ for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
+ unsigned int sse;
+ uint8_t src[64 * 64] = { 0 };
+ // Apply 4x4 block averaging/denoising on source frame.
+ for (int i = 0; i < 64; i += 4) {
+ for (int j = 0; j < 64; j += 4) {
+ const unsigned int avg =
+ aom_avg_4x4(src_y + i * src_ystride + j, src_ystride);
+
+ for (int m = 0; m < 4; ++m) {
+ for (int n = 0; n < 4; ++n) src[i * 64 + j + m * 64 + n] = avg;
+ }
+ }
+ }
+
+ cpi->ppi->fn_ptr[bsize].vf(src, 64, pre_y, pre_ystride, &sse);
+ fsse += sse;
+ num_samples++;
+ src_y += 64;
+ pre_y += 64;
+ }
+ src_y += (src_ystride << 6) - (sb_cols << 6);
+ pre_y += (pre_ystride << 6) - (sb_cols << 6);
+ }
+ assert(num_samples > 0);
+ // Ensure rec_sse > 0
+ if (num_samples > 0) cpi->rec_sse = fsse > 0 ? fsse : 1;
+}
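+
+// Sketch of the flow above (names as in this file): each 64x64 source block
+// is first smoothed by replacing every 4x4 sub-block with its average
+// (aom_avg_4x4), then compared against the co-located reconstructed LAST
+// pixels with the 64x64 variance function. The accumulated SSE is stored in
+// cpi->rec_sse and later feeds the bit-estimation ratio
+//   projected_frame_size * q / sqrt(rec_sse)
+// in av1_rc_postencode_update().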
+
+int av1_rc_pick_q_and_bounds(AV1_COMP *cpi, int width, int height, int gf_index,
+ int *bottom_index, int *top_index) {
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ int q;
+ // TODO(sarahparker) merge no-stats vbr and altref q computation
+ // with rc_pick_q_and_bounds().
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if ((cpi->oxcf.rc_cfg.mode != AOM_Q ||
+ gf_group->update_type[gf_index] == ARF_UPDATE) &&
+ has_no_stats_stage(cpi)) {
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
+ // TODO(yunqing): the results could be used for encoder optimization.
+ cpi->rec_sse = UINT64_MAX;
+ if (cpi->sf.hl_sf.accurate_bit_estimate &&
+ cpi->common.current_frame.frame_type != KEY_FRAME)
+ rc_compute_variance_onepass_rt(cpi);
+
+ q = rc_pick_q_and_bounds_no_stats_cbr(cpi, width, height, bottom_index,
+ top_index);
+      // Preserve a copy of the selected active worst quality.
+ cpi->rc.active_worst_quality = *top_index;
+
+#if USE_UNRESTRICTED_Q_IN_CQ_MODE
+ } else if (cpi->oxcf.rc_cfg.mode == AOM_CQ) {
+ q = rc_pick_q_and_bounds_no_stats_cq(cpi, width, height, bottom_index,
+ top_index);
+#endif // USE_UNRESTRICTED_Q_IN_CQ_MODE
+ } else {
+ q = rc_pick_q_and_bounds_no_stats(cpi, width, height, bottom_index,
+ top_index);
+ }
+ } else {
+ q = rc_pick_q_and_bounds(cpi, width, height, gf_index, bottom_index,
+ top_index);
+ }
+ if (gf_group->update_type[gf_index] == ARF_UPDATE) p_rc->arf_q = q;
+
+ return q;
+}
+
+void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit) {
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q) {
+ *frame_under_shoot_limit = 0;
+ *frame_over_shoot_limit = INT_MAX;
+ } else {
+ // For very small rate targets where the fractional adjustment
+ // may be tiny make sure there is at least a minimum range.
+ assert(cpi->sf.hl_sf.recode_tolerance <= 100);
+ const int tolerance = (int)AOMMAX(
+ 100, ((int64_t)cpi->sf.hl_sf.recode_tolerance * frame_target) / 100);
+ *frame_under_shoot_limit = AOMMAX(frame_target - tolerance, 0);
+ *frame_over_shoot_limit =
+ AOMMIN(frame_target + tolerance, cpi->rc.max_frame_bandwidth);
+ }
+}
+
+void av1_rc_set_frame_target(AV1_COMP *cpi, int target, int width, int height) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ rc->this_frame_target = target;
+
+ // Modify frame size target when down-scaled.
+ if (av1_frame_scaled(cm) && cpi->oxcf.rc_cfg.mode != AOM_CBR) {
+ rc->this_frame_target =
+ (int)(rc->this_frame_target *
+ resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height));
+ }
+
+  // Target rate per SB64 (including partial SB64s).
+ rc->sb64_target_rate =
+ (int)(((int64_t)rc->this_frame_target << 12) / (width * height));
+}
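+
+// Example (hypothetical values): for a 1280x720 frame with
+// this_frame_target = 60000 bits,
+//   sb64_target_rate = (60000 << 12) / (1280 * 720) = 266 bits per SB64,
+// since << 12 scales the per-pixel rate by the 64 * 64 = 4096 pixels
+// in each SB64.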
+
+static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
+  // This frame refreshes; subsequent frames don't unless specified by the
+  // user.
+ RATE_CONTROL *const rc = &cpi->rc;
+ rc->frames_since_golden = 0;
+}
+
+static void update_golden_frame_stats(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ // Update the Golden frame usage counts.
+ if (cpi->refresh_frame.golden_frame || rc->is_src_frame_alt_ref) {
+ rc->frames_since_golden = 0;
+ } else if (cpi->common.show_frame) {
+ rc->frames_since_golden++;
+ }
+}
+
+void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+
+ const int is_intrnl_arf =
+ gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+
+ const int qindex = cm->quant_params.base_qindex;
+
+#if RT_PASSIVE_STRATEGY
+ const int frame_number = current_frame->frame_number % MAX_Q_HISTORY;
+ p_rc->q_history[frame_number] = qindex;
+#endif // RT_PASSIVE_STRATEGY
+
+ // Update rate control heuristics
+ rc->projected_frame_size = (int)(bytes_used << 3);
+
+ // Post encode loop adjustment of Q prediction.
+ av1_rc_update_rate_correction_factors(cpi, 0, cm->width, cm->height);
+
+ // Update bit estimation ratio.
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+ cm->current_frame.frame_type != KEY_FRAME &&
+ cpi->sf.hl_sf.accurate_bit_estimate) {
+ const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex,
+ cm->seq_params->bit_depth);
+ const int this_bit_est_ratio =
+ (int)(rc->projected_frame_size * q / sqrt((double)cpi->rec_sse));
+ cpi->rc.bit_est_ratio =
+ cpi->rc.bit_est_ratio == 0
+ ? this_bit_est_ratio
+ : (7 * cpi->rc.bit_est_ratio + this_bit_est_ratio) / 8;
+ }
+
+ // Keep a record of last Q and ambient average Q.
+ if (current_frame->frame_type == KEY_FRAME) {
+ p_rc->last_q[KEY_FRAME] = qindex;
+ p_rc->avg_frame_qindex[KEY_FRAME] =
+ ROUND_POWER_OF_TWO(3 * p_rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
+ } else {
+ if ((cpi->ppi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) ||
+ cpi->rc.rtc_external_ratectrl ||
+ (!rc->is_src_frame_alt_ref &&
+ !(refresh_frame->golden_frame || is_intrnl_arf ||
+ refresh_frame->alt_ref_frame))) {
+ p_rc->last_q[INTER_FRAME] = qindex;
+ p_rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO(
+ 3 * p_rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
+ p_rc->ni_frames++;
+ p_rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params->bit_depth);
+ p_rc->avg_q = p_rc->tot_q / p_rc->ni_frames;
+ // Calculate the average Q for normal inter frames (not key or GFU
+ // frames).
+ rc->ni_tot_qi += qindex;
+ rc->ni_av_qi = rc->ni_tot_qi / p_rc->ni_frames;
+ }
+ }
+  // Keep a record of the last boosted (KF/GF/ARF) Q value.
+  // If the current frame is coded at a lower Q, then we also update it.
+  // If all MBs in this group are skipped, only update if the Q value is
+  // better than that already stored.
+  // This is used to help set quality in forced key frames to reduce popping.
+ if ((qindex < p_rc->last_boosted_qindex) ||
+ (current_frame->frame_type == KEY_FRAME) ||
+ (!p_rc->constrained_gf_group &&
+ (refresh_frame->alt_ref_frame || is_intrnl_arf ||
+ (refresh_frame->golden_frame && !rc->is_src_frame_alt_ref)))) {
+ p_rc->last_boosted_qindex = qindex;
+ }
+ if (current_frame->frame_type == KEY_FRAME) p_rc->last_kf_qindex = qindex;
+
+ update_buffer_level(cpi, rc->projected_frame_size);
+ rc->prev_avg_frame_bandwidth = rc->avg_frame_bandwidth;
+
+  // Rolling monitors of whether we are over- or under-spending, used to help
+  // regulate min and max Q in two pass.
+ if (av1_frame_scaled(cm))
+ rc->this_frame_target = (int)(rc->this_frame_target /
+ resize_rate_factor(&cpi->oxcf.frm_dim_cfg,
+ cm->width, cm->height));
+ if (current_frame->frame_type != KEY_FRAME) {
+ p_rc->rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
+ p_rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+ p_rc->rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
+ p_rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+ }
+
+ // Actual bits spent
+ p_rc->total_actual_bits += rc->projected_frame_size;
+ p_rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
+
+ if (is_altref_enabled(cpi->oxcf.gf_cfg.lag_in_frames,
+ cpi->oxcf.gf_cfg.enable_auto_arf) &&
+ refresh_frame->alt_ref_frame &&
+ (current_frame->frame_type != KEY_FRAME && !frame_is_sframe(cm)))
+ // Update the alternate reference frame stats as appropriate.
+ update_alt_ref_frame_stats(cpi);
+ else
+ // Update the Golden frame stats as appropriate.
+ update_golden_frame_stats(cpi);
+
+#if CONFIG_FPMT_TEST
+  /* The variables temp_avg_frame_qindex, temp_last_q, temp_avg_q and
+   * temp_last_boosted_qindex are introduced only for quality simulation
+   * purposes; they retain the values from before the parallel encode frames.
+   * The variables are updated based on the update flag.
+   *
+   * If show_existing_frames exist between parallel frames, do not update the
+   * temp state, so that it is retained. */
+ int show_existing_between_parallel_frames =
+ (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] ==
+ INTNL_OVERLAY_UPDATE &&
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index + 1] == 2);
+
+ if (cpi->do_frame_data_update && !show_existing_between_parallel_frames &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) {
+ for (int i = 0; i < FRAME_TYPES; i++) {
+ p_rc->temp_last_q[i] = p_rc->last_q[i];
+ }
+ p_rc->temp_avg_q = p_rc->avg_q;
+ p_rc->temp_last_boosted_qindex = p_rc->last_boosted_qindex;
+ p_rc->temp_total_actual_bits = p_rc->total_actual_bits;
+ p_rc->temp_projected_frame_size = rc->projected_frame_size;
+ for (int i = 0; i < RATE_FACTOR_LEVELS; i++)
+ p_rc->temp_rate_correction_factors[i] = p_rc->rate_correction_factors[i];
+ }
+#endif
+ if (current_frame->frame_type == KEY_FRAME) rc->frames_since_key = 0;
+ if (cpi->refresh_frame.golden_frame)
+ rc->frame_num_last_gf_refresh = current_frame->frame_number;
+ rc->prev_coded_width = cm->width;
+ rc->prev_coded_height = cm->height;
+ rc->frame_number_encoded++;
+ rc->prev_frame_is_dropped = 0;
+ rc->drop_count_consec = 0;
+ // if (current_frame->frame_number == 1 && cm->show_frame)
+ /*
+ rc->this_frame_target =
+ (int)(rc->this_frame_target / resize_rate_factor(&cpi->oxcf.frm_dim_cfg,
+ cm->width, cm->height));
+ */
+}
+
+void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) {
+ // Update buffer level with zero size, update frame counters, and return.
+ update_buffer_level(cpi, 0);
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+ cpi->rc.frames_since_key++;
+ cpi->rc.frames_to_key--;
+ }
+ cpi->rc.rc_2_frame = 0;
+ cpi->rc.rc_1_frame = 0;
+ cpi->rc.prev_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth;
+ cpi->rc.prev_coded_width = cpi->common.width;
+ cpi->rc.prev_coded_height = cpi->common.height;
+ cpi->rc.prev_frame_is_dropped = 1;
+ // On a scene/slide change for dropped frame: reset the avg_source_sad to 0,
+ // otherwise the avg_source_sad can get too large and subsequent frames
+ // may miss the scene/slide detection.
+ if (cpi->rc.high_source_sad) cpi->rc.avg_source_sad = 0;
+ if (cpi->ppi->use_svc && cpi->svc.number_spatial_layers > 1) {
+ cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = true;
+ cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = true;
+ }
+}
+
+int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
+ int best_qindex, int worst_qindex) {
+ assert(best_qindex <= worst_qindex);
+ int low = best_qindex;
+ int high = worst_qindex;
+ while (low < high) {
+ const int mid = (low + high) >> 1;
+ const double mid_q = av1_convert_qindex_to_q(mid, bit_depth);
+ if (mid_q < desired_q) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
+ }
+ assert(low == high);
+ assert(av1_convert_qindex_to_q(low, bit_depth) >= desired_q ||
+ low == worst_qindex);
+ return low;
+}
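+
+// Usage sketch (hypothetical caller): find the smallest q index whose real
+// q value is at least 4.0 for an 8-bit stream:
+//   const int qindex = av1_find_qindex(4.0, AOM_BITS_8, rc->best_quality,
+//                                      rc->worst_quality);
+// Because av1_convert_qindex_to_q() is non-decreasing in the index, a
+// binary search for the lower bound is sufficient; if no index reaches
+// desired_q, worst_qindex is returned.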
+
+int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+ aom_bit_depth_t bit_depth) {
+ const int start_index =
+ av1_find_qindex(qstart, bit_depth, rc->best_quality, rc->worst_quality);
+ const int target_index =
+ av1_find_qindex(qtarget, bit_depth, rc->best_quality, rc->worst_quality);
+ return target_index - start_index;
+}
+
+// Find q_index for the desired_bits_per_mb, within [best_qindex, worst_qindex],
+// assuming 'correction_factor' is 1.0.
+// To be precise, 'q_index' is the smallest integer for which the
+// corresponding bits per mb <= desired_bits_per_mb.
+// If no such q index is found, returns 'worst_qindex'.
+static int find_qindex_by_rate(const AV1_COMP *const cpi,
+ int desired_bits_per_mb, FRAME_TYPE frame_type,
+ int best_qindex, int worst_qindex) {
+ assert(best_qindex <= worst_qindex);
+ int low = best_qindex;
+ int high = worst_qindex;
+ while (low < high) {
+ const int mid = (low + high) >> 1;
+ const int mid_bits_per_mb =
+ av1_rc_bits_per_mb(cpi, frame_type, mid, 1.0, 0);
+ if (mid_bits_per_mb > desired_bits_per_mb) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
+ }
+ assert(low == high);
+ assert(av1_rc_bits_per_mb(cpi, frame_type, low, 1.0, 0) <=
+ desired_bits_per_mb ||
+ low == worst_qindex);
+ return low;
+}
+
+int av1_compute_qdelta_by_rate(const AV1_COMP *cpi, FRAME_TYPE frame_type,
+ int qindex, double rate_target_ratio) {
+ const RATE_CONTROL *rc = &cpi->rc;
+
+ // Look up the current projected bits per block for the base index
+ const int base_bits_per_mb =
+ av1_rc_bits_per_mb(cpi, frame_type, qindex, 1.0, 0);
+
+ // Find the target bits per mb based on the base value and given ratio.
+ const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
+
+ const int target_index = find_qindex_by_rate(
+ cpi, target_bits_per_mb, frame_type, rc->best_quality, rc->worst_quality);
+ return target_index - qindex;
+}
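+
+// Example (hypothetical values): with rate_target_ratio = 0.5 the search
+// above looks for the smallest q index producing at most half the bits per
+// mb of the base index. Since bits per mb fall as the q index rises, the
+// returned delta is positive (a higher q) for ratios below 1.0 and negative
+// for ratios above 1.0.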
+
+void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
+ RATE_CONTROL *const rc) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ // Special case code for 1 pass fixed Q mode tests
+ if ((has_no_stats_stage(cpi)) && (oxcf->rc_cfg.mode == AOM_Q)) {
+ rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+ rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+ rc->static_scene_max_gf_interval = rc->min_gf_interval + 1;
+ } else {
+ // Set Maximum gf/arf interval
+ rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+ rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+ if (rc->min_gf_interval == 0)
+ rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+ oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, cpi->framerate);
+ if (rc->max_gf_interval == 0)
+ rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+ cpi->framerate, rc->min_gf_interval);
+ /*
+ * Extended max interval for genuinely static scenes like slide shows.
+     * The number of stats available in the case of LAP is limited,
+     * hence setting it to max_gf_interval.
+ */
+ if (cpi->ppi->lap_enabled)
+ rc->static_scene_max_gf_interval = rc->max_gf_interval + 1;
+ else
+ rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
+
+ if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
+ rc->max_gf_interval = rc->static_scene_max_gf_interval;
+
+ // Clamp min to max
+ rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval);
+ }
+}
+
+void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int vbr_max_bits;
+ const int MBs = av1_get_MBs(width, height);
+
+ rc->avg_frame_bandwidth =
+ (int)round(oxcf->rc_cfg.target_bandwidth / cpi->framerate);
+ rc->min_frame_bandwidth =
+ (int)(rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmin_section / 100);
+
+ rc->min_frame_bandwidth =
+ AOMMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+
+ // A maximum bitrate for a frame is defined.
+ // The baseline for this aligns with HW implementations that
+ // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
+ // per 16x16 MB (averaged over a frame). However this limit is extended if
+  // a very high rate is given on the command line or the rate cannot
+  // be achieved because of a user-specified max q (e.g. when the user
+  // specifies lossless encoding).
+ vbr_max_bits =
+ (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->rc_cfg.vbrmax_section) /
+ 100);
+ rc->max_frame_bandwidth =
+ AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+
+ av1_rc_set_gf_interval_range(cpi, rc);
+}
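+
+// Example (hypothetical values): at 2 Mbps and 30 fps,
+//   avg_frame_bandwidth = round(2000000 / 30) = 66667 bits;
+// with vbrmax_section = 2000 (percent),
+//   vbr_max_bits = 66667 * 2000 / 100 = 1333340 bits,
+// and max_frame_bandwidth is the largest of MBs * MAX_MB_RATE,
+// MAXRATE_1080P and vbr_max_bits.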
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
+// For VBR...adjustment to the frame target based on error from previous frames
+static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+#if CONFIG_FPMT_TEST
+ const int simulate_parallel_frame =
+ cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 &&
+ cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE;
+ int64_t vbr_bits_off_target = simulate_parallel_frame
+ ? cpi->ppi->p_rc.temp_vbr_bits_off_target
+ : p_rc->vbr_bits_off_target;
+#else
+ int64_t vbr_bits_off_target = p_rc->vbr_bits_off_target;
+#endif
+ const int stats_count =
+ cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL
+ ? (int)cpi->ppi->twopass.stats_buf_ctx->total_stats->count
+ : 0;
+ const int frame_window = AOMMIN(
+ 16, (int)(stats_count - (int)cpi->common.current_frame.frame_number));
+ assert(VBR_PCT_ADJUSTMENT_LIMIT <= 100);
+ if (frame_window > 0) {
+ const int max_delta = (int)AOMMIN(
+ abs((int)(vbr_bits_off_target / frame_window)),
+ ((int64_t)(*this_frame_target) * VBR_PCT_ADJUSTMENT_LIMIT) / 100);
+
+    // vbr_bits_off_target > 0 means we have extra bits to spend.
+    // vbr_bits_off_target < 0 means we are currently overshooting.
+ *this_frame_target += (vbr_bits_off_target >= 0) ? max_delta : -max_delta;
+ }
+
+#if CONFIG_FPMT_TEST
+ int64_t vbr_bits_off_target_fast =
+ simulate_parallel_frame ? cpi->ppi->p_rc.temp_vbr_bits_off_target_fast
+ : p_rc->vbr_bits_off_target_fast;
+#endif
+ // Fast redistribution of bits arising from massive local undershoot.
+  // Don't do it for kf, arf, gf or overlay frames.
+ if (!frame_is_kf_gf_arf(cpi) &&
+#if CONFIG_FPMT_TEST
+ vbr_bits_off_target_fast &&
+#else
+ p_rc->vbr_bits_off_target_fast &&
+#endif
+ !rc->is_src_frame_alt_ref) {
+ int one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, *this_frame_target);
+ int fast_extra_bits;
+#if CONFIG_FPMT_TEST
+ fast_extra_bits = (int)AOMMIN(vbr_bits_off_target_fast, one_frame_bits);
+ fast_extra_bits =
+ (int)AOMMIN(fast_extra_bits,
+ AOMMAX(one_frame_bits / 8, vbr_bits_off_target_fast / 8));
+#else
+ fast_extra_bits =
+ (int)AOMMIN(p_rc->vbr_bits_off_target_fast, one_frame_bits);
+ fast_extra_bits = (int)AOMMIN(
+ fast_extra_bits,
+ AOMMAX(one_frame_bits / 8, p_rc->vbr_bits_off_target_fast / 8));
+#endif
+ if (fast_extra_bits > 0) {
+ // Update this_frame_target only if additional bits are available from
+ // local undershoot.
+ *this_frame_target += (int)fast_extra_bits;
+ }
+ // Store the fast_extra_bits of the frame and reduce it from
+ // vbr_bits_off_target_fast during postencode stage.
+ rc->frame_level_fast_extra_bits = fast_extra_bits;
+    // Retain the condition to update during the postencode stage since
+    // fast_extra_bits is calculated based on vbr_bits_off_target_fast.
+ cpi->do_update_vbr_bits_off_target_fast = 1;
+ }
+}
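+
+// Worked example (hypothetical values): with vbr_bits_off_target = +800000,
+// frame_window = 16 and *this_frame_target = 60000,
+//   max_delta = min(800000 / 16, 60000 * 50 / 100) = min(50000, 30000)
+// so the target grows by 30000 bits; the VBR_PCT_ADJUSTMENT_LIMIT cap keeps
+// a single frame from absorbing too much of the accumulated error.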
+
+void av1_set_target_rate(AV1_COMP *cpi, int width, int height) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target_rate = rc->base_frame_target;
+
+ // Correction to rate target based on prior over or under shoot.
+ if (cpi->oxcf.rc_cfg.mode == AOM_VBR || cpi->oxcf.rc_cfg.mode == AOM_CQ)
+ vbr_rate_correction(cpi, &target_rate);
+ av1_rc_set_frame_target(cpi, target_rate, width, height);
+}
+
+int av1_calc_pframe_target_size_one_pass_vbr(
+ const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) {
+ static const int af_ratio = 10;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ int64_t target;
+#if USE_ALTREF_FOR_ONE_PASS
+ if (frame_update_type == KF_UPDATE || frame_update_type == GF_UPDATE ||
+ frame_update_type == ARF_UPDATE) {
+ target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval *
+ af_ratio) /
+ (p_rc->baseline_gf_interval + af_ratio - 1);
+ } else {
+ target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval) /
+ (p_rc->baseline_gf_interval + af_ratio - 1);
+ }
+ if (target > INT_MAX) target = INT_MAX;
+#else
+ target = rc->avg_frame_bandwidth;
+#endif
+ return av1_rc_clamp_pframe_target_size(cpi, (int)target, frame_update_type);
+}
+
+int av1_calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
+ static const int kf_ratio = 25;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const int64_t target = (int64_t)rc->avg_frame_bandwidth * kf_ratio;
+ return av1_rc_clamp_iframe_target_size(cpi, target);
+}
+
+int av1_calc_pframe_target_size_one_pass_cbr(
+ const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) {
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+ const RateControlCfg *rc_cfg = &oxcf->rc_cfg;
+ const int64_t diff = p_rc->optimal_buffer_level - p_rc->buffer_level;
+ const int64_t one_pct_bits = 1 + p_rc->optimal_buffer_level / 100;
+ int min_frame_target =
+ AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
+ int target;
+
+ if (rc_cfg->gf_cbr_boost_pct) {
+ const int af_ratio_pct = rc_cfg->gf_cbr_boost_pct + 100;
+ if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) {
+ target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval *
+ af_ratio_pct) /
+ (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+ } else {
+ target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * 100) /
+ (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+ }
+ } else {
+ target = rc->avg_frame_bandwidth;
+ }
+ if (cpi->ppi->use_svc) {
+ // Note that for layers, avg_frame_bandwidth is the cumulative
+ // per-frame-bandwidth. For the target size of this frame, use the
+ // layer average frame size (i.e., non-cumulative per-frame-bw).
+ int layer =
+ LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id,
+ cpi->svc.number_temporal_layers);
+ const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+ target = lc->avg_frame_size;
+ min_frame_target = AOMMAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS);
+ }
+ if (diff > 0) {
+ // Lower the target bandwidth for this frame.
+ const int pct_low =
+ (int)AOMMIN(diff / one_pct_bits, rc_cfg->under_shoot_pct);
+ target -= (target * pct_low) / 200;
+ } else if (diff < 0) {
+ // Increase the target bandwidth for this frame.
+ const int pct_high =
+ (int)AOMMIN(-diff / one_pct_bits, rc_cfg->over_shoot_pct);
+ target += (target * pct_high) / 200;
+ }
+ if (rc_cfg->max_inter_bitrate_pct) {
+ const int max_rate =
+ rc->avg_frame_bandwidth * rc_cfg->max_inter_bitrate_pct / 100;
+ target = AOMMIN(target, max_rate);
+ }
+ return AOMMAX(min_frame_target, target);
+}
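+
+// Example (hypothetical values): with target = 40000 bits, a buffer 30%
+// below optimal (diff = 30 * one_pct_bits) and under_shoot_pct = 50,
+//   pct_low = min(30, 50) = 30
+//   target -= 40000 * 30 / 200 = 6000  ->  34000 bits,
+// i.e. the percentage correction is applied at half strength (divide by
+// 200 rather than 100).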
+
+int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
+ const RATE_CONTROL *rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+ int64_t target;
+ if (cpi->common.current_frame.frame_number == 0) {
+ target = ((p_rc->starting_buffer_level / 2) > INT_MAX)
+ ? INT_MAX
+ : (int)(p_rc->starting_buffer_level / 2);
+ if (cpi->svc.number_temporal_layers > 1 && target < (INT_MAX >> 2)) {
+ target = target << AOMMIN(2, (cpi->svc.number_temporal_layers - 1));
+ }
+ } else {
+ int kf_boost = 32;
+ int framerate = (int)round(cpi->framerate);
+
+ kf_boost = AOMMAX(kf_boost, (int)(2 * framerate - 16));
+ if (rc->frames_since_key < framerate / 2) {
+ kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2));
+ }
+ target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
+ }
+ return av1_rc_clamp_iframe_target_size(cpi, target);
+}
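+
+// Example (hypothetical values): at 30 fps, kf_boost starts at
+// max(32, 2 * 30 - 16) = 44, so a keyframe far from the previous key gets
+//   target = (16 + 44) * avg_frame_bandwidth / 16 = 3.75x the average;
+// at frames_since_key = 5 the boost scales down to 44 * 5 / 15 = 14,
+// giving (16 + 14) / 16 = 1.875x, before the iframe clamp is applied.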
+
+static void set_golden_update(AV1_COMP *const cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ int divisor = 10;
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
+ divisor = cpi->cyclic_refresh->percent_refresh;
+
+  // Set the minimum gf_interval for GF updates to a multiple of the refresh
+  // period, with some max limit. Depending on past encoding stats, the GF
+  // flag may be reset and the update may not occur until the next
+  // baseline_gf_interval.
+ const int gf_length_mult[2] = { 8, 4 };
+ if (divisor > 0)
+ p_rc->baseline_gf_interval =
+ AOMMIN(gf_length_mult[cpi->sf.rt_sf.gf_length_lvl] * (100 / divisor),
+ MAX_GF_INTERVAL_RT);
+ else
+ p_rc->baseline_gf_interval = FIXED_GF_INTERVAL_RT;
+ if (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 40)
+ p_rc->baseline_gf_interval = 16;
+}
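+
+// Example (hypothetical values): with CYCLIC_REFRESH_AQ and
+// percent_refresh = 10, the refresh period is 100 / 10 = 10 frames, so
+// gf_length_lvl = 0 gives baseline_gf_interval =
+// min(8 * 10, MAX_GF_INTERVAL_RT) and gf_length_lvl = 1 gives
+// min(4 * 10, MAX_GF_INTERVAL_RT).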
+
+static void set_baseline_gf_interval(AV1_COMP *cpi, FRAME_TYPE frame_type) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+
+ set_golden_update(cpi);
+
+ if (p_rc->baseline_gf_interval > rc->frames_to_key &&
+ cpi->oxcf.kf_cfg.auto_key)
+ p_rc->baseline_gf_interval = rc->frames_to_key;
+ p_rc->gfu_boost = DEFAULT_GF_BOOST_RT;
+ p_rc->constrained_gf_group =
+ (p_rc->baseline_gf_interval >= rc->frames_to_key &&
+ cpi->oxcf.kf_cfg.auto_key)
+ ? 1
+ : 0;
+ rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+ cpi->gf_frame_index = 0;
+ // SVC does not use GF as periodic boost.
+ // TODO(marpan): Find better way to disable this for SVC.
+ if (cpi->ppi->use_svc) {
+ SVC *const svc = &cpi->svc;
+ p_rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1;
+ p_rc->gfu_boost = 1;
+ p_rc->constrained_gf_group = 0;
+ rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+ for (int layer = 0;
+ layer < svc->number_spatial_layers * svc->number_temporal_layers;
+ ++layer) {
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ lc->p_rc.baseline_gf_interval = p_rc->baseline_gf_interval;
+ lc->p_rc.gfu_boost = p_rc->gfu_boost;
+ lc->p_rc.constrained_gf_group = p_rc->constrained_gf_group;
+ lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due;
+ lc->group_index = 0;
+ }
+ }
+ gf_group->size = p_rc->baseline_gf_interval;
+ gf_group->update_type[0] = (frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE;
+ gf_group->refbuf_state[cpi->gf_frame_index] =
+ (frame_type == KEY_FRAME) ? REFBUF_RESET : REFBUF_UPDATE;
+}
+
+void av1_adjust_gf_refresh_qp_one_pass_rt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ const int resize_pending = is_frame_resize_pending(cpi);
+ if (!resize_pending && !rc->high_source_sad) {
+ // Check if we should disable GF refresh (if period is up),
+    // or force a GF refresh update (if we are at least halfway through the
+    // period) based on QP. Look into adding info on segment deltaq.
+ PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+ const int avg_qp = p_rc->avg_frame_qindex[INTER_FRAME];
+ const int allow_gf_update =
+ rc->frames_till_gf_update_due <= (p_rc->baseline_gf_interval - 10);
+ int gf_update_changed = 0;
+ int thresh = 87;
+ if ((cm->current_frame.frame_number - cpi->rc.frame_num_last_gf_refresh) <
+ FIXED_GF_INTERVAL_RT &&
+ rc->frames_till_gf_update_due == 1 &&
+ cm->quant_params.base_qindex > avg_qp) {
+ // Disable GF refresh since QP is above the running average QP.
+ rtc_ref->refresh[rtc_ref->gld_idx_1layer] = 0;
+ gf_update_changed = 1;
+ cpi->refresh_frame.golden_frame = 0;
+ } else if (allow_gf_update &&
+ ((cm->quant_params.base_qindex < thresh * avg_qp / 100) ||
+ (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 20))) {
+ // Force refresh since QP is well below average QP or this is a high
+ // motion frame.
+ rtc_ref->refresh[rtc_ref->gld_idx_1layer] = 1;
+ gf_update_changed = 1;
+ cpi->refresh_frame.golden_frame = 1;
+ }
+ if (gf_update_changed) {
+ set_baseline_gf_interval(cpi, INTER_FRAME);
+ int refresh_mask = 0;
+ for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ int ref_frame_map_idx = rtc_ref->ref_idx[i];
+ refresh_mask |= rtc_ref->refresh[ref_frame_map_idx]
+ << ref_frame_map_idx;
+ }
+ cm->current_frame.refresh_frame_flags = refresh_mask;
+ }
+ }
+}
+
+/*!\brief Setup the reference prediction structure for 1 pass real-time
+ *
+ * Set the reference prediction structure for 1 layer.
+ * The current structure is to use 3 references (LAST, GOLDEN, ALTREF),
+ * where ALT_REF is always lag_alt frames behind the current frame, and
+ * GOLDEN is either updated on LAST with period baseline_gf_interval
+ * (fixed slot) or always lag_gld frames behind current (gld_fixed_slot = 0,
+ * lag_gld <= 7).
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] gf_update Flag to indicate if GF is updated
+ *
+ * \remark Nothing is returned. Instead the settings for the prediction
+ * structure are set in \c cpi->ext_flags, and the buffer slot index
+ * (for each of 7 references) and refresh flags (for each of the 8 slots)
+ * are set in \c cpi->svc.ref_idx[] and \c cpi->svc.refresh[].
+ */
+void av1_set_rtc_reference_structure_one_layer(AV1_COMP *cpi, int gf_update) {
+ AV1_COMMON *const cm = &cpi->common;
+ ExternalFlags *const ext_flags = &cpi->ext_flags;
+ RATE_CONTROL *const rc = &cpi->rc;
+ ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
+ &ext_flags->refresh_frame;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ unsigned int frame_number = (cpi->oxcf.rc_cfg.drop_frames_water_mark)
+ ? rc->frame_number_encoded
+ : cm->current_frame.frame_number;
+ unsigned int lag_alt = 4;
+ int last_idx = 0;
+ int last_idx_refresh = 0;
+ int gld_idx = 0;
+ int alt_ref_idx = 0;
+ int last2_idx = 0;
+ ext_refresh_frame_flags->update_pending = 1;
+ ext_flags->ref_frame_flags = 0;
+ ext_refresh_frame_flags->last_frame = 1;
+ ext_refresh_frame_flags->golden_frame = 0;
+ ext_refresh_frame_flags->alt_ref_frame = 0;
+ // Decide altref lag adaptively for rt
+ if (cpi->sf.rt_sf.sad_based_adp_altref_lag) {
+ lag_alt = 6;
+ const uint64_t th_frame_sad[4][3] = {
+ { 18000, 18000, 18000 }, // HDRES CPU 9
+ { 25000, 25000, 25000 }, // MIDRES CPU 9
+ { 40000, 30000, 20000 }, // HDRES CPU10
+ { 30000, 25000, 20000 } // MIDRES CPU 10
+ };
+ int th_idx = cpi->sf.rt_sf.sad_based_adp_altref_lag - 1;
+ assert(th_idx < 4);
+ if (rc->avg_source_sad > th_frame_sad[th_idx][0])
+ lag_alt = 3;
+ else if (rc->avg_source_sad > th_frame_sad[th_idx][1])
+ lag_alt = 4;
+ else if (rc->avg_source_sad > th_frame_sad[th_idx][2])
+ lag_alt = 5;
+ }
+ // This defines the reference structure for 1 layer (non-svc) RTC encoding.
+ // To avoid the internal/default reference structure for non-realtime
+ // overwriting this behavior, we use the "svc" ref parameters from the
+ // external control SET_SVC_REF_FRAME_CONFIG.
+ // TODO(marpan): rename that control and the related internal parameters
+ // to rtc_ref.
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) rtc_ref->ref_idx[i] = 7;
+ for (int i = 0; i < REF_FRAMES; ++i) rtc_ref->refresh[i] = 0;
+ // Set the reference frame flags.
+ ext_flags->ref_frame_flags ^= AOM_LAST_FLAG;
+ if (!cpi->sf.rt_sf.force_only_last_ref) {
+ ext_flags->ref_frame_flags ^= AOM_ALT_FLAG;
+ ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
+ if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1])
+ ext_flags->ref_frame_flags ^= AOM_LAST2_FLAG;
+ }
+ const int sh = 6;
+ // Moving index slot for last: 0 - (sh - 1).
+ if (frame_number > 1) last_idx = ((frame_number - 1) % sh);
+ // Moving index for refresh of last: one ahead for next frame.
+ last_idx_refresh = (frame_number % sh);
+ gld_idx = 6;
+
+ // Moving index for alt_ref, lag behind LAST by lag_alt frames.
+ if (frame_number > lag_alt) alt_ref_idx = ((frame_number - lag_alt) % sh);
+ if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) {
+ // Moving index for LAST2, lag behind LAST by 2 frames.
+ if (frame_number > 2) last2_idx = ((frame_number - 2) % sh);
+ }
+ rtc_ref->ref_idx[0] = last_idx; // LAST
+ rtc_ref->ref_idx[1] = last_idx_refresh; // LAST2 (for refresh of last).
+ if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) {
+ rtc_ref->ref_idx[1] = last2_idx; // LAST2
+ rtc_ref->ref_idx[2] = last_idx_refresh; // LAST3 (for refresh of last).
+ }
+ rtc_ref->ref_idx[3] = gld_idx; // GOLDEN
+ rtc_ref->ref_idx[6] = alt_ref_idx; // ALT_REF
+ // Refresh this slot, which will become LAST on next frame.
+ rtc_ref->refresh[last_idx_refresh] = 1;
+ // Update GOLDEN on period for fixed slot case.
+ if (gf_update && cm->current_frame.frame_type != KEY_FRAME) {
+ ext_refresh_frame_flags->golden_frame = 1;
+ rtc_ref->refresh[gld_idx] = 1;
+ }
+ rtc_ref->gld_idx_1layer = gld_idx;
+ // Set the flag to reduce the number of reference frame buffers used.
+ // This assumes that slot 7 is never used.
+ cpi->rt_reduce_num_ref_buffers = 1;
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[0] < 7);
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[1] < 7);
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[3] < 7);
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[6] < 7);
+ if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1])
+ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[2] < 7);
+}
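+
+// Worked example (hypothetical frame_number = 10, sh = 6, lag_alt = 4):
+//   LAST    slot = (10 - 1) % 6 = 3
+//   refresh slot = 10 % 6      = 4  (becomes LAST for frame 11)
+//   ALT_REF slot = (10 - 4) % 6 = 0 (the frame coded 4 frames earlier)
+//   GOLDEN  slot = 6            (fixed)
+// so LAST rotates through slots 0-5 while GOLDEN keeps a fixed slot.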
+
+/*!\brief Check for scene detection, for 1 pass real-time mode.
+ *
+ * Compute average source sad (temporal sad: between current source and
+ * previous source) over a subset of superblocks. Use this to detect big
+ * changes in content and set the \c cpi->rc.high_source_sad flag.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] frame_input Current and last input source frames
+ *
+ * \remark Nothing is returned. Instead the flag \c cpi->rc.high_source_sad
+ * is set if scene change is detected, and \c cpi->rc.avg_source_sad is updated.
+ */
+static void rc_scene_detection_onepass_rt(AV1_COMP *cpi,
+ const EncodeFrameInput *frame_input) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ YV12_BUFFER_CONFIG const *const unscaled_src = frame_input->source;
+ YV12_BUFFER_CONFIG const *const unscaled_last_src = frame_input->last_source;
+ uint8_t *src_y;
+ int src_ystride;
+ int src_width;
+ int src_height;
+ uint8_t *last_src_y;
+ int last_src_ystride;
+ int last_src_width;
+ int last_src_height;
+ int width = cm->width;
+ int height = cm->height;
+ if (cpi->svc.number_spatial_layers > 1) {
+ width = cpi->oxcf.frm_dim_cfg.width;
+ height = cpi->oxcf.frm_dim_cfg.height;
+ }
+ if (width != cm->render_width || height != cm->render_height ||
+ unscaled_src == NULL || unscaled_last_src == NULL) {
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
+ }
+ if (unscaled_src == NULL || unscaled_last_src == NULL) return;
+ src_y = unscaled_src->y_buffer;
+ src_ystride = unscaled_src->y_stride;
+ src_width = unscaled_src->y_width;
+ src_height = unscaled_src->y_height;
+ last_src_y = unscaled_last_src->y_buffer;
+ last_src_ystride = unscaled_last_src->y_stride;
+ last_src_width = unscaled_last_src->y_width;
+ last_src_height = unscaled_last_src->y_height;
+ if (src_width != last_src_width || src_height != last_src_height) {
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
+ return;
+ }
+ rc->high_source_sad = 0;
+ rc->percent_blocks_with_motion = 0;
+ rc->max_block_source_sad = 0;
+ rc->prev_avg_source_sad = rc->avg_source_sad;
+ int num_mi_cols = cm->mi_params.mi_cols;
+ int num_mi_rows = cm->mi_params.mi_rows;
+ if (cpi->svc.number_spatial_layers > 1) {
+ num_mi_cols = cpi->svc.mi_cols_full_resoln;
+ num_mi_rows = cpi->svc.mi_rows_full_resoln;
+ }
+ int num_zero_temp_sad = 0;
+ uint32_t min_thresh = 10000;
+ if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) {
+ min_thresh = cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0
+ ? 50000
+ : 100000;
+ }
+ const BLOCK_SIZE bsize = BLOCK_64X64;
+  // Loop over a sub-sample of the frame; compute the average SAD over
+  // 64x64 blocks.
+ uint64_t avg_sad = 0;
+ uint64_t tmp_sad = 0;
+ int num_samples = 0;
+ const int thresh =
+ cm->width * cm->height <= 320 * 240 && cpi->framerate < 10.0 ? 5 : 6;
+ // SAD is computed on 64x64 blocks
+ const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+ const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb;
+  uint64_t sum_sq_thresh = 10000;  // sum = sqrt(thresh / (64 * 64)) ~ 1.5
+ int num_low_var_high_sumdiff = 0;
+ int light_change = 0;
+ // Flag to check light change or not.
+ const int check_light_change = 0;
+  // TODO(marpan): There seems to be some difference along the bottom border
+  // when using the source_last_tl0 for last_source (used for temporal layers
+  // or when the previous frame is dropped).
+  // Remove this border parameter when the issue is resolved: the difference
+  // is that non-zero SAD exists along the bottom border even though the
+  // source is static.
+ const int border =
+ rc->prev_frame_is_dropped || cpi->svc.number_temporal_layers > 1;
+  // Store block-wise SAD for later use.
+ if (width == cm->render_width && height == cm->render_height) {
+ if (cpi->src_sad_blk_64x64 == NULL) {
+ CHECK_MEM_ERROR(cm, cpi->src_sad_blk_64x64,
+ (uint64_t *)aom_calloc(sb_cols * sb_rows,
+ sizeof(*cpi->src_sad_blk_64x64)));
+ }
+ }
+ // Avoid bottom and right border.
+ for (int sbi_row = 0; sbi_row < sb_rows - border; ++sbi_row) {
+ for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
+ tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
+ last_src_ystride);
+ if (cpi->src_sad_blk_64x64 != NULL)
+ cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols] = tmp_sad;
+ if (check_light_change) {
+ unsigned int sse, variance;
+ variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
+ last_src_ystride, &sse);
+ // Note: sse - variance = ((sum * sum) >> 12)
+ // Detect large lighting change.
+ if (variance < (sse >> 1) && (sse - variance) > sum_sq_thresh) {
+ num_low_var_high_sumdiff++;
+ }
+ }
+ avg_sad += tmp_sad;
+ num_samples++;
+ if (tmp_sad == 0) num_zero_temp_sad++;
+ if (tmp_sad > rc->max_block_source_sad)
+ rc->max_block_source_sad = tmp_sad;
+
+ src_y += 64;
+ last_src_y += 64;
+ }
+ src_y += (src_ystride << 6) - (sb_cols << 6);
+ last_src_y += (last_src_ystride << 6) - (sb_cols << 6);
+ }
+ if (check_light_change && num_samples > 0 &&
+ num_low_var_high_sumdiff > (num_samples >> 1))
+ light_change = 1;
+ if (num_samples > 0) avg_sad = avg_sad / num_samples;
+ // Set high_source_sad flag if we detect very high increase in avg_sad
+ // between current and previous frame value(s). Use minimum threshold
+ // for cases where there is small change from content that is completely
+ // static.
+ if (!light_change &&
+ avg_sad >
+ AOMMAX(min_thresh, (unsigned int)(rc->avg_source_sad * thresh)) &&
+ rc->frames_since_key > 1 + cpi->svc.number_spatial_layers &&
+ num_zero_temp_sad < 3 * (num_samples >> 2))
+ rc->high_source_sad = 1;
+ else
+ rc->high_source_sad = 0;
+ rc->avg_source_sad = (3 * rc->avg_source_sad + avg_sad) >> 2;
+ rc->frame_source_sad = avg_sad;
+ if (num_samples > 0)
+ rc->percent_blocks_with_motion =
+ ((num_samples - num_zero_temp_sad) * 100) / num_samples;
+  // Scene detection is only on the base spatial layer, and uses the
+  // full/original resolution. Pass the state to the upper spatial layers.
+ if (cpi->svc.number_spatial_layers > 1) {
+ SVC *svc = &cpi->svc;
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ int tl = svc->temporal_layer_id;
+ const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ lrc->high_source_sad = rc->high_source_sad;
+ lrc->frame_source_sad = rc->frame_source_sad;
+ lrc->avg_source_sad = rc->avg_source_sad;
+ lrc->percent_blocks_with_motion = rc->percent_blocks_with_motion;
+ lrc->max_block_source_sad = rc->max_block_source_sad;
+ }
+ }
+}
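+
+// Example (hypothetical values): with a running rc->avg_source_sad = 30000,
+// thresh = 6 and min_thresh = 100000, high_source_sad is raised only if the
+// new frame-average SAD exceeds max(100000, 6 * 30000) = 180000, we are far
+// enough past the last key frame, and fewer than 3/4 of the sampled blocks
+// have zero temporal SAD.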
+
+/*!\brief Set the GF baseline interval for 1 pass real-time mode.
+ *
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] frame_type frame type
+ *
+ * \return Return GF update flag, and update the \c cpi->rc with
+ * the next GF interval settings.
+ */
+static int set_gf_interval_update_onepass_rt(AV1_COMP *cpi,
+ FRAME_TYPE frame_type) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int gf_update = 0;
+ const int resize_pending = is_frame_resize_pending(cpi);
+  // GF update based on frames_till_gf_update_due; also force an update on a
+  // resize-pending frame or on scene change.
+ if ((resize_pending || rc->high_source_sad ||
+ rc->frames_till_gf_update_due == 0) &&
+ cpi->svc.temporal_layer_id == 0 && cpi->svc.spatial_layer_id == 0) {
+ set_baseline_gf_interval(cpi, frame_type);
+ gf_update = 1;
+ }
+ return gf_update;
+}
+
+static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height,
+ int prev_width, int prev_height) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ SVC *const svc = &cpi->svc;
+ int target_bits_per_frame;
+ int active_worst_quality;
+ int qindex;
+ double tot_scale_change = (double)(resize_width * resize_height) /
+ (double)(prev_width * prev_height);
+ // Disable the skip mv search for svc on resize frame.
+ svc->skip_mvsearch_last = 0;
+ svc->skip_mvsearch_gf = 0;
+ svc->skip_mvsearch_altref = 0;
+ // Reset buffer level to optimal, update target size.
+ p_rc->buffer_level = p_rc->optimal_buffer_level;
+ p_rc->bits_off_target = p_rc->optimal_buffer_level;
+ rc->this_frame_target =
+ av1_calc_pframe_target_size_one_pass_cbr(cpi, INTER_FRAME);
+ target_bits_per_frame = rc->this_frame_target;
+ if (tot_scale_change > 4.0)
+ p_rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality;
+ else if (tot_scale_change > 1.0)
+ p_rc->avg_frame_qindex[INTER_FRAME] =
+ (p_rc->avg_frame_qindex[INTER_FRAME] + rc->worst_quality) >> 1;
+ active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi);
+ qindex = av1_rc_regulate_q(cpi, target_bits_per_frame, rc->best_quality,
+ active_worst_quality, resize_width, resize_height);
+ // If resize is down, check if projected q index is close to worst_quality,
+ // and if so, reduce the rate correction factor (since likely can afford
+ // lower q for resized frame).
+ if (tot_scale_change < 1.0 && qindex > 90 * rc->worst_quality / 100)
+ p_rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+  // If resize is back up: check if the projected q index is too far above the
+  // previous index, and if so, reduce the rate correction factor (since we
+  // prefer to keep q for the resized frame at least close to the previous q).
+  // Also check if the projected qindex is close to the previous qindex; if
+  // so, increase the correction factor (to push qindex higher and avoid
+  // overshoot).
+ if (tot_scale_change >= 1.0) {
+ if (tot_scale_change < 4.0 &&
+ qindex > 130 * p_rc->last_q[INTER_FRAME] / 100)
+ p_rc->rate_correction_factors[INTER_NORMAL] *= 0.8;
+ if (qindex <= 120 * p_rc->last_q[INTER_FRAME] / 100)
+ p_rc->rate_correction_factors[INTER_NORMAL] *= 1.5;
+ }
+ if (svc->number_temporal_layers > 1) {
+ // Apply the same rate control reset to all temporal layers.
+ for (int tl = 0; tl < svc->number_temporal_layers; tl++) {
+ LAYER_CONTEXT *lc = NULL;
+ lc = &svc->layer_context[svc->spatial_layer_id *
+ svc->number_temporal_layers +
+ tl];
+ lc->rc.resize_state = rc->resize_state;
+ lc->p_rc.buffer_level = lc->p_rc.optimal_buffer_level;
+ lc->p_rc.bits_off_target = lc->p_rc.optimal_buffer_level;
+ lc->p_rc.rate_correction_factors[INTER_NORMAL] =
+ p_rc->rate_correction_factors[INTER_NORMAL];
+ lc->p_rc.avg_frame_qindex[INTER_FRAME] =
+ p_rc->avg_frame_qindex[INTER_FRAME];
+ }
+ }
+}
+
+/*!\brief Check for resize based on Q, for 1 pass real-time mode.
+ *
+ * Check if we should resize, based on the average QP over a window of past
+ * frames. Only allow resizing down to at most 1/2 scale for now. The scaling
+ * factor for each step may be 3/4 or 1/2.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Return resized width/height in \c cpi->resize_pending_params,
+ * and update some resize counters in \c rc.
+ */
+static void dynamic_resize_one_pass_cbr(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ RESIZE_ACTION resize_action = NO_RESIZE;
+ const int avg_qp_thr1 = 70;
+ const int avg_qp_thr2 = 50;
+  // Don't allow the resized frame to go below 160x90; resize in steps of 3/4.
+ const int min_width = (160 * 4) / 3;
+ const int min_height = (90 * 4) / 3;
+ int down_size_on = 1;
+ // Don't resize on key frame; reset the counters on key frame.
+ if (cm->current_frame.frame_type == KEY_FRAME) {
+ rc->resize_avg_qp = 0;
+ rc->resize_count = 0;
+ rc->resize_buffer_underflow = 0;
+ return;
+ }
+ // No resizing down if frame size is below some limit.
+ if ((cm->width * cm->height) < min_width * min_height) down_size_on = 0;
+
+ // Resize based on average buffer underflow and QP over some window.
+ // Ignore samples close to key frame, since QP is usually high after key.
+ if (cpi->rc.frames_since_key > cpi->framerate) {
+ const int window = AOMMIN(30, (int)(2 * cpi->framerate));
+ rc->resize_avg_qp += p_rc->last_q[INTER_FRAME];
+ if (cpi->ppi->p_rc.buffer_level <
+ (int)(30 * p_rc->optimal_buffer_level / 100))
+ ++rc->resize_buffer_underflow;
+ ++rc->resize_count;
+ // Check for resize action every "window" frames.
+ if (rc->resize_count >= window) {
+ int avg_qp = rc->resize_avg_qp / rc->resize_count;
+      // Resize down if the buffer level has underflowed a sufficient amount
+      // in the past window, and we are at the original or 3/4 of the original
+      // resolution. Resize back up if the average QP is low, and we are
+      // currently in a resized-down state, i.e. 1/2 or 3/4 of the original
+      // resolution. Currently, use a flag to turn the 3/4 resizing feature
+      // on/off.
+ if (rc->resize_buffer_underflow > (rc->resize_count >> 2) &&
+ down_size_on) {
+ if (rc->resize_state == THREE_QUARTER) {
+ resize_action = DOWN_ONEHALF;
+ rc->resize_state = ONE_HALF;
+ } else if (rc->resize_state == ORIG) {
+ resize_action = DOWN_THREEFOUR;
+ rc->resize_state = THREE_QUARTER;
+ }
+ } else if (rc->resize_state != ORIG &&
+ avg_qp < avg_qp_thr1 * cpi->rc.worst_quality / 100) {
+ if (rc->resize_state == THREE_QUARTER ||
+ avg_qp < avg_qp_thr2 * cpi->rc.worst_quality / 100) {
+ resize_action = UP_ORIG;
+ rc->resize_state = ORIG;
+ } else if (rc->resize_state == ONE_HALF) {
+ resize_action = UP_THREEFOUR;
+ rc->resize_state = THREE_QUARTER;
+ }
+ }
+ // Reset for next window measurement.
+ rc->resize_avg_qp = 0;
+ rc->resize_count = 0;
+ rc->resize_buffer_underflow = 0;
+ }
+ }
+  // If the decision is to resize, reset some quantities, and check if we
+  // should reduce the rate correction factor.
+ if (resize_action != NO_RESIZE) {
+ int resize_width = cpi->oxcf.frm_dim_cfg.width;
+ int resize_height = cpi->oxcf.frm_dim_cfg.height;
+ int resize_scale_num = 1;
+ int resize_scale_den = 1;
+ if (resize_action == DOWN_THREEFOUR || resize_action == UP_THREEFOUR) {
+ resize_scale_num = 3;
+ resize_scale_den = 4;
+ } else if (resize_action == DOWN_ONEHALF) {
+ resize_scale_num = 1;
+ resize_scale_den = 2;
+ }
+ resize_width = resize_width * resize_scale_num / resize_scale_den;
+ resize_height = resize_height * resize_scale_num / resize_scale_den;
+ resize_reset_rc(cpi, resize_width, resize_height, cm->width, cm->height);
+ }
+ return;
+}
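+
+// The resize decision above acts as a small state machine (sketch):
+//   down (buffer underflow): ORIG -> THREE_QUARTER -> ONE_HALF
+//   up   (low average QP):   THREE_QUARTER -> ORIG      (avg_qp < thr1)
+//                            ONE_HALF -> THREE_QUARTER  (avg_qp < thr1)
+//                            ONE_HALF -> ORIG           (avg_qp < thr2)
+// where thr1/thr2 are avg_qp_thr1/avg_qp_thr2 percent of worst_quality.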
+
+static INLINE int set_key_frame(AV1_COMP *cpi, unsigned int frame_flags) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ AV1_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+
+ // Very first frame has to be key frame.
+ if (cm->current_frame.frame_number == 0) return 1;
+ // Set key frame if forced by frame flags.
+ if (frame_flags & FRAMEFLAGS_KEY) return 1;
+ if (!cpi->ppi->use_svc) {
+ // Non-SVC
+ if (cpi->oxcf.kf_cfg.auto_key && rc->frames_to_key == 0) return 1;
+ } else {
+ // SVC
+ if (svc->spatial_layer_id == 0 &&
+ (cpi->oxcf.kf_cfg.auto_key &&
+ (cpi->oxcf.kf_cfg.key_freq_max == 0 ||
+ svc->current_superframe % cpi->oxcf.kf_cfg.key_freq_max == 0)))
+ return 1;
+ }
+
+ return 0;
+}
+
+// Set to true if this frame is a recovery frame, for 1 layer RPS,
+// and whether we should apply some boost (QP, adjust speed features, etc).
+// Recovery frame here means frame whose closest reference suddenly
+// switched from previous frame to one much further away.
+// TODO(marpan): Consider adding on/off flag to SVC_REF_FRAME_CONFIG to
+// allow more control for applications.
+static bool set_flag_rps_bias_recovery_frame(const AV1_COMP *const cpi) {
+ if (cpi->ppi->rtc_ref.set_ref_frame_config &&
+ cpi->svc.number_temporal_layers == 1 &&
+ cpi->svc.number_spatial_layers == 1 &&
+ cpi->ppi->rtc_ref.reference_was_previous_frame) {
+ int min_dist = av1_svc_get_min_ref_dist(cpi);
+ // Only consider boost for this frame if its closest reference is further
+ // than x frames away, using x = 4 for now.
+ if (min_dist != INT_MAX && min_dist > 4) return true;
+ }
+ return false;
+}
+
+void av1_get_one_pass_rt_params(AV1_COMP *cpi, FRAME_TYPE *const frame_type,
+ const EncodeFrameInput *frame_input,
+ unsigned int frame_flags) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ AV1_COMMON *const cm = &cpi->common;
+ GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ SVC *const svc = &cpi->svc;
+ ResizePendingParams *const resize_pending_params =
+ &cpi->resize_pending_params;
+ int target;
+ const int layer =
+ LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ if (cpi->ppi->use_svc) {
+ av1_update_temporal_layer_framerate(cpi);
+ av1_restore_layer_context(cpi);
+ }
+ cpi->ppi->rtc_ref.bias_recovery_frame = set_flag_rps_bias_recovery_frame(cpi);
+ // Set frame type.
+ if (set_key_frame(cpi, frame_flags)) {
+ *frame_type = KEY_FRAME;
+ p_rc->this_key_frame_forced =
+ cm->current_frame.frame_number != 0 && rc->frames_to_key == 0;
+ rc->frames_to_key = cpi->oxcf.kf_cfg.key_freq_max;
+ p_rc->kf_boost = DEFAULT_KF_BOOST_RT;
+ gf_group->update_type[cpi->gf_frame_index] = KF_UPDATE;
+ gf_group->frame_type[cpi->gf_frame_index] = KEY_FRAME;
+ gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_RESET;
+ if (cpi->ppi->use_svc) {
+ if (cm->current_frame.frame_number > 0)
+ av1_svc_reset_temporal_layers(cpi, 1);
+ svc->layer_context[layer].is_key_frame = 1;
+ }
+ rc->frame_number_encoded = 0;
+ cpi->ppi->rtc_ref.non_reference_frame = 0;
+ } else {
+ *frame_type = INTER_FRAME;
+ gf_group->update_type[cpi->gf_frame_index] = LF_UPDATE;
+ gf_group->frame_type[cpi->gf_frame_index] = INTER_FRAME;
+ gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_UPDATE;
+ if (cpi->ppi->use_svc) {
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ lc->is_key_frame =
+ svc->spatial_layer_id == 0
+ ? 0
+ : svc->layer_context[svc->temporal_layer_id].is_key_frame;
+ // If the user is setting the reference structure with
+ // set_ref_frame_config and did not set any references, set the
+ // frame type to Intra-only.
+ if (cpi->ppi->rtc_ref.set_ref_frame_config) {
+ int no_references_set = 1;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ if (cpi->ppi->rtc_ref.reference[i]) {
+ no_references_set = 0;
+ break;
+ }
+ }
+ // Set to intra_only_frame if no references are set.
+ // The stream can start decoding on INTRA_ONLY_FRAME so long as the
+ // layer with the intra_only_frame doesn't signal a reference to a slot
+ // that hasn't been set yet.
+ if (no_references_set) *frame_type = INTRA_ONLY_FRAME;
+ }
+ }
+ }
+ // Check for scene change: for SVC check on base spatial layer only.
+ if (cpi->sf.rt_sf.check_scene_detection && svc->spatial_layer_id == 0) {
+ if (rc->prev_coded_width == cm->width &&
+ rc->prev_coded_height == cm->height) {
+ rc_scene_detection_onepass_rt(cpi, frame_input);
+ } else {
+ aom_free(cpi->src_sad_blk_64x64);
+ cpi->src_sad_blk_64x64 = NULL;
+ }
+ }
+ // Check for dynamic resize, for single spatial layer for now.
+ // For temporal layers only check on base temporal layer.
+ if (cpi->oxcf.resize_cfg.resize_mode == RESIZE_DYNAMIC) {
+ if (svc->number_spatial_layers == 1 && svc->temporal_layer_id == 0)
+ dynamic_resize_one_pass_cbr(cpi);
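+    // Note: the pending dimensions below round to the nearest integer:
+    // e.g., a 1920x1080 source maps to 1440x810 at THREE_QUARTER and to
+    // 960x540 at ONE_HALF.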
+ if (rc->resize_state == THREE_QUARTER) {
+ resize_pending_params->width = (3 + cpi->oxcf.frm_dim_cfg.width * 3) >> 2;
+ resize_pending_params->height =
+ (3 + cpi->oxcf.frm_dim_cfg.height * 3) >> 2;
+ } else if (rc->resize_state == ONE_HALF) {
+ resize_pending_params->width = (1 + cpi->oxcf.frm_dim_cfg.width) >> 1;
+ resize_pending_params->height = (1 + cpi->oxcf.frm_dim_cfg.height) >> 1;
+ } else {
+ resize_pending_params->width = cpi->oxcf.frm_dim_cfg.width;
+ resize_pending_params->height = cpi->oxcf.frm_dim_cfg.height;
+ }
+ } else if (is_frame_resize_pending(cpi)) {
+ resize_reset_rc(cpi, resize_pending_params->width,
+ resize_pending_params->height, cm->width, cm->height);
+ }
+ // Set the GF interval and update flag.
+ if (!rc->rtc_external_ratectrl)
+ set_gf_interval_update_onepass_rt(cpi, *frame_type);
+ // Set target size.
+ if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
+ if (*frame_type == KEY_FRAME || *frame_type == INTRA_ONLY_FRAME) {
+ target = av1_calc_iframe_target_size_one_pass_cbr(cpi);
+ } else {
+ target = av1_calc_pframe_target_size_one_pass_cbr(
+ cpi, gf_group->update_type[cpi->gf_frame_index]);
+ }
+ } else {
+ if (*frame_type == KEY_FRAME || *frame_type == INTRA_ONLY_FRAME) {
+ target = av1_calc_iframe_target_size_one_pass_vbr(cpi);
+ } else {
+ target = av1_calc_pframe_target_size_one_pass_vbr(
+ cpi, gf_group->update_type[cpi->gf_frame_index]);
+ }
+ }
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q)
+ rc->active_worst_quality = cpi->oxcf.rc_cfg.cq_level;
+
+ av1_rc_set_frame_target(cpi, target, cm->width, cm->height);
+ rc->base_frame_target = target;
+ cm->current_frame.frame_type = *frame_type;
+  // For fixed-mode SVC: if KSVC is enabled, remove inter-layer
+  // prediction on spatial enhancement layer frames whose base layer
+  // frame is not a key frame.
+ if (cpi->ppi->use_svc && !svc->use_flexible_mode && svc->ksvc_fixed_mode &&
+ svc->number_spatial_layers > 1 &&
+ !svc->layer_context[layer].is_key_frame) {
+ ExternalFlags *const ext_flags = &cpi->ext_flags;
+ ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
+ }
+}
+
+int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) {
+ AV1_COMMON *const cm = &cpi->common;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ double rate_correction_factor =
+ cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL];
+ const int target_size = cpi->rc.avg_frame_bandwidth;
+ double new_correction_factor;
+ int target_bits_per_mb;
+ double q2;
+ int enumerator;
+ int is_screen_content = (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN);
+ *q = (3 * cpi->rc.worst_quality + *q) >> 2;
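+  // This moves q three quarters of the way toward worst_quality.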
+ // For screen content use the max-q set by the user to allow for less
+ // overshoot on slide changes.
+ if (is_screen_content) *q = cpi->rc.worst_quality;
+ cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0;
+ // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as
+ // these parameters will affect QP selection for subsequent frames. If they
+ // have settled down to a very different (low QP) state, then not adjusting
+ // them may cause next frame to select low QP and overshoot again.
+ p_rc->avg_frame_qindex[INTER_FRAME] = *q;
+ p_rc->buffer_level = p_rc->optimal_buffer_level;
+ p_rc->bits_off_target = p_rc->optimal_buffer_level;
+ // Reset rate under/over-shoot flags.
+ cpi->rc.rc_1_frame = 0;
+ cpi->rc.rc_2_frame = 0;
+ // Adjust rate correction factor.
+ target_bits_per_mb =
+ (int)(((uint64_t)target_size << BPER_MB_NORMBITS) / cm->mi_params.MBs);
+ // Reset rate correction factor: for now base it on target_bits_per_mb
+ // and qp (==max_QP). This comes from the inverse computation of
+ // av1_rc_bits_per_mb().
+ q2 = av1_convert_qindex_to_q(*q, cm->seq_params->bit_depth);
+ enumerator = av1_get_bpmb_enumerator(INTER_NORMAL, is_screen_content);
+ new_correction_factor = (double)target_bits_per_mb * q2 / enumerator;
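+  // Sketch of the inverse relation assumed here: av1_rc_bits_per_mb()
+  // computes roughly bits_per_mb = enumerator * correction_factor / q, so
+  // the factor that would have hit the target is
+  // target_bits_per_mb * q / enumerator, as computed above.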
+ if (new_correction_factor > rate_correction_factor) {
+ rate_correction_factor =
+ (new_correction_factor + rate_correction_factor) / 2.0;
+ if (rate_correction_factor > MAX_BPB_FACTOR)
+ rate_correction_factor = MAX_BPB_FACTOR;
+ cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL] =
+ rate_correction_factor;
+ }
+ // For temporal layers: reset the rate control parameters across all
+ // temporal layers.
+ if (cpi->svc.number_temporal_layers > 1) {
+ SVC *svc = &cpi->svc;
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ int sl = svc->spatial_layer_id;
+ const int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc;
+ lp_rc->avg_frame_qindex[INTER_FRAME] = *q;
+ lp_rc->buffer_level = lp_rc->optimal_buffer_level;
+ lp_rc->bits_off_target = lp_rc->optimal_buffer_level;
+ lrc->rc_1_frame = 0;
+ lrc->rc_2_frame = 0;
+ lp_rc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
+ }
+ }
+ return 1;
+}
diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h
new file mode 100644
index 0000000000..6802ad42d0
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl.h
@@ -0,0 +1,864 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RATECTRL_H_
+#define AOM_AV1_ENCODER_RATECTRL_H_
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+
+// Bits Per MB at different Q (Multiplied by 512)
+#define BPER_MB_NORMBITS 9
+
+// Use this macro to turn on/off use of alt-refs in one-pass mode.
+#define USE_ALTREF_FOR_ONE_PASS 1
+
+// Threshold used to define if a KF group is static (e.g. a slide show).
+// Essentially, this means that no frame in the group has more than 1% of MBs
+// that are not marked as coded with 0,0 motion in the first pass.
+#define STATIC_KF_GROUP_THRESH 99
+#define STATIC_KF_GROUP_FLOAT_THRESH 0.99
+
+// The maximum duration of a GF group that is static (e.g. a slide show).
+#define MAX_STATIC_GF_GROUP_LENGTH 250
+
+#define MIN_GF_INTERVAL 4
+#define MAX_GF_INTERVAL 32
+#define FIXED_GF_INTERVAL 16
+#define MAX_GF_LENGTH_LAP 16
+
+#define FIXED_GF_INTERVAL_RT 80
+#define MAX_GF_INTERVAL_RT 160
+
+#define MAX_NUM_GF_INTERVALS 15
+
+#define MAX_ARF_LAYERS 6
+// #define STRICT_RC
+
+#define DEFAULT_KF_BOOST_RT 2300
+#define DEFAULT_GF_BOOST_RT 2000
+
+// A passive rate control strategy for screen content type in real-time mode.
+// When it is turned on, the compression performance is improved by
+// 7.8% (overall_psnr), 5.0% (VMAF) on average. Some clips see gains of
+// over 20% on these metrics.
+// The downside is that it does not guarantee frame size.
+// Since RT mode has a tight restriction on buffer overflow control, we
+// turn it off by default.
+#define RT_PASSIVE_STRATEGY 0
+#define MAX_Q_HISTORY 1000
+
+typedef struct {
+ int resize_width;
+ int resize_height;
+ uint8_t superres_denom;
+} size_params_type;
+
+enum {
+ INTER_NORMAL,
+ GF_ARF_LOW,
+ GF_ARF_STD,
+ KF_STD,
+ RATE_FACTOR_LEVELS
+} UENUM1BYTE(RATE_FACTOR_LEVEL);
+
+enum {
+ KF_UPDATE,
+ LF_UPDATE,
+ GF_UPDATE,
+ ARF_UPDATE,
+ OVERLAY_UPDATE,
+ INTNL_OVERLAY_UPDATE, // Internal Overlay Frame
+ INTNL_ARF_UPDATE, // Internal Altref Frame
+ FRAME_UPDATE_TYPES
+} UENUM1BYTE(FRAME_UPDATE_TYPE);
+
+enum {
+ REFBUF_RESET, // Clear reference frame buffer
+ REFBUF_UPDATE, // Refresh reference frame buffer
+ REFBUF_STATES
+} UENUM1BYTE(REFBUF_STATE);
+
+typedef enum {
+ NO_RESIZE = 0,
+ DOWN_THREEFOUR = 1, // From orig to 3/4.
+ DOWN_ONEHALF = 2, // From orig or 3/4 to 1/2.
+ UP_THREEFOUR = -1, // From 1/2 to 3/4.
+ UP_ORIG = -2, // From 1/2 or 3/4 to orig.
+} RESIZE_ACTION;
+
+typedef enum { ORIG = 0, THREE_QUARTER = 1, ONE_HALF = 2 } RESIZE_STATE;
+
+#define MAX_FIRSTPASS_ANALYSIS_FRAMES 150
+typedef enum region_types {
+ STABLE_REGION = 0,
+ HIGH_VAR_REGION = 1,
+ SCENECUT_REGION = 2,
+ BLENDING_REGION = 3,
+} REGION_TYPES;
+
+typedef struct regions {
+ int start;
+ int last;
+ double avg_noise_var;
+ double avg_cor_coeff;
+ double avg_sr_fr_ratio;
+ double avg_intra_err;
+ double avg_coded_err;
+ REGION_TYPES type;
+} REGIONS;
+
+/*!\endcond */
+/*!
+ * \brief Rate Control parameters and status
+ */
+typedef struct {
+  // Rate targeting variables
+
+ /*!
+ * Baseline target rate for frame before adjustment for previous under or
+ * over shoot.
+ */
+ int base_frame_target;
+ /*!
+ * Target rate for frame after adjustment for previous under or over shoot.
+ */
+ int this_frame_target; // Actual frame target after rc adjustment.
+
+ /*!
+ * Projected size for current frame
+ */
+ int projected_frame_size;
+
+ /*!
+   * Bit size of transform coefficients for the current frame.
+ */
+ int coefficient_size;
+
+ /*!
+ * Super block rate target used with some adaptive quantization strategies.
+ */
+ int sb64_target_rate;
+
+ /*!
+ * Number of frames since the last ARF / GF.
+ */
+ int frames_since_golden;
+
+ /*!
+ * Number of frames till the next ARF / GF is due.
+ */
+ int frames_till_gf_update_due;
+
+ /*!
+ * Number of determined gf groups left
+ */
+ int intervals_till_gf_calculate_due;
+
+ /*!\cond */
+ int min_gf_interval;
+ int max_gf_interval;
+ int static_scene_max_gf_interval;
+ /*!\endcond */
+ /*!
+ * Frames before the next key frame
+ */
+ int frames_to_key;
+ /*!\cond */
+ int frames_since_key;
+ int frames_to_fwd_kf;
+ int is_src_frame_alt_ref;
+ int sframe_due;
+
+ int high_source_sad;
+ uint64_t avg_source_sad;
+ uint64_t prev_avg_source_sad;
+ uint64_t frame_source_sad;
+
+ int avg_frame_bandwidth; // Average frame size target for clip
+ int min_frame_bandwidth; // Minimum allocation used for any frame
+ int max_frame_bandwidth; // Maximum burst rate allowed for a frame.
+ int prev_avg_frame_bandwidth;
+
+ int ni_av_qi;
+ int ni_tot_qi;
+
+ int decimation_factor;
+ int decimation_count;
+ int prev_frame_is_dropped;
+ int drop_count_consec;
+ int max_consec_drop;
+
+ /*!
+ * Frame number for encoded frames (non-dropped).
+ * Use for setting the rtc reference structure.
+ */
+ unsigned int frame_number_encoded;
+
+ /*!\endcond */
+ /*!
+ * User specified maximum Q allowed for current frame
+ */
+ int worst_quality;
+ /*!
+ * User specified minimum Q allowed for current frame
+ */
+ int best_quality;
+
+ /*!\cond */
+
+ // rate control history for last frame(1) and the frame before(2).
+ // -1: overshoot
+ // 1: undershoot
+ // 0: not initialized.
+ int rc_1_frame;
+ int rc_2_frame;
+ int q_1_frame;
+ int q_2_frame;
+
+ /*!\endcond */
+ /*!
+ * Proposed maximum allowed Q for current frame
+ */
+ int active_worst_quality;
+
+ /*!\cond */
+ // Track amount of low motion in scene
+ int avg_frame_low_motion;
+ int cnt_zeromv;
+
+ // signals if number of blocks with motion is high
+ int percent_blocks_with_motion;
+
+ // Maximum value of source sad across all blocks of frame.
+ uint64_t max_block_source_sad;
+
+ // For dynamic resize, 1 pass cbr.
+ RESIZE_STATE resize_state;
+ int resize_avg_qp;
+ int resize_buffer_underflow;
+ int resize_count;
+
+ // Flag to disable content related qp adjustment.
+ int rtc_external_ratectrl;
+
+ // Stores fast_extra_bits of the current frame.
+ int frame_level_fast_extra_bits;
+
+ double frame_level_rate_correction_factors[RATE_FACTOR_LEVELS];
+
+ int frame_num_last_gf_refresh;
+
+ int prev_coded_width;
+ int prev_coded_height;
+
+ // The ratio used for inter frames in bit estimation.
+ // TODO(yunqing): if golden frame is treated differently (e.g. gf_cbr_boost_
+  // pct > THR), consider adding bit_est_ratio_g for golden frames.
+ int bit_est_ratio;
+
+ // Whether to use a fixed qp for the frame, bypassing internal rate control.
+ // This flag will reset to 0 after every frame.
+ int use_external_qp_one_pass;
+ /*!\endcond */
+} RATE_CONTROL;
+
+/*!
+ * \brief Primary Rate Control parameters and status
+ */
+typedef struct {
+  // Sub-GOP level rate targeting variables
+
+ /*!
+ * Target bit budget for the current GF / ARF group of frame.
+ */
+ int64_t gf_group_bits;
+
+ /*!
+ * Boost factor used to calculate the extra bits allocated to the key frame
+ */
+ int kf_boost;
+
+ /*!
+ * Boost factor used to calculate the extra bits allocated to ARFs and GFs
+ */
+ int gfu_boost;
+
+ /*!
+ * Stores the determined gf group lengths for a set of gf groups
+ */
+ int gf_intervals[MAX_NUM_GF_INTERVALS];
+
+ /*!
+ * The current group's index into gf_intervals[]
+ */
+ int cur_gf_index;
+
+ /*!\cond */
+ int num_regions;
+
+ REGIONS regions[MAX_FIRSTPASS_ANALYSIS_FRAMES];
+ int regions_offset; // offset of regions from the last keyframe
+ int frames_till_regions_update;
+
+ int baseline_gf_interval;
+
+ int constrained_gf_group;
+
+ int this_key_frame_forced;
+
+ int next_key_frame_forced;
+ /*!\endcond */
+
+ /*!
+   * Initial buffer level in ms for CBR / low delay encoding
+ */
+ int64_t starting_buffer_level;
+
+ /*!
+   * Optimum / target buffer level in ms for CBR / low delay encoding
+ */
+ int64_t optimal_buffer_level;
+
+ /*!
+   * Maximum target buffer level in ms for CBR / low delay encoding
+ */
+ int64_t maximum_buffer_size;
+
+ /*!
+ * Q index used for ALT frame
+ */
+ int arf_q;
+
+ /*!\cond */
+ float_t arf_boost_factor;
+
+ int base_layer_qp;
+
+ // Total number of stats used only for kf_boost calculation.
+ int num_stats_used_for_kf_boost;
+
+ // Total number of stats used only for gfu_boost calculation.
+ int num_stats_used_for_gfu_boost;
+
+ // Total number of stats required by gfu_boost calculation.
+ int num_stats_required_for_gfu_boost;
+
+ int enable_scenecut_detection;
+
+ int use_arf_in_this_kf_group;
+
+ int ni_frames;
+
+ double tot_q;
+ /*!\endcond */
+
+ /*!
+ * Q used for last boosted (non leaf) frame
+ */
+ int last_kf_qindex;
+
+ /*!
+ * Average of q index of previous encoded frames in a sequence.
+ */
+ int avg_frame_qindex[FRAME_TYPES];
+
+#if CONFIG_FPMT_TEST
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * active_best_quality.
+ */
+ int temp_active_best_quality[MAX_ARF_LAYERS + 1];
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * last_boosted_qindex.
+ */
+ int temp_last_boosted_qindex;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * avg_q.
+ */
+ double temp_avg_q;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * last_q.
+ */
+ int temp_last_q[FRAME_TYPES];
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * projected_frame_size.
+ */
+ int temp_projected_frame_size;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * total_actual_bits.
+ */
+ int64_t temp_total_actual_bits;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * buffer_level.
+ */
+ int64_t temp_buffer_level;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * vbr_bits_off_target.
+ */
+ int64_t temp_vbr_bits_off_target;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * vbr_bits_off_target_fast.
+ */
+ int64_t temp_vbr_bits_off_target_fast;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * rate_correction_factors.
+ */
+ double temp_rate_correction_factors[RATE_FACTOR_LEVELS];
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * rate_error_estimate.
+ */
+ int temp_rate_error_estimate;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * rolling_arf_group_target_bits.
+ */
+ int temp_rolling_arf_group_target_bits;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+   * rolling_arf_group_actual_bits.
+ */
+ int temp_rolling_arf_group_actual_bits;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+   * bits_left.
+ */
+ int64_t temp_bits_left;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * extend_minq.
+ */
+ int temp_extend_minq;
+
+ /*!
+ * Temporary variable used in simulating the delayed update of
+ * extend_maxq.
+ */
+ int temp_extend_maxq;
+
+#endif
+ /*!
+   * Proposed minimum allowed Q for different layers in a coding pyramid
+ */
+ int active_best_quality[MAX_ARF_LAYERS + 1];
+
+ /*!
+ * Q used for last boosted (non leaf) frame (GF/KF/ARF)
+ */
+ int last_boosted_qindex;
+
+ /*!
+ * Average Q value of previous inter frames
+ */
+ double avg_q;
+
+ /*!
+ * Q used on last encoded frame of the given type.
+ */
+ int last_q[FRAME_TYPES];
+
+ /*!
+ * Correction factors used to adjust the q estimate for a given target rate
+ * in the encode loop.
+ */
+ double rate_correction_factors[RATE_FACTOR_LEVELS];
+
+ /*!
+ * Current total consumed bits.
+ */
+ int64_t total_actual_bits;
+
+ /*!
+ * Current total target bits.
+ */
+ int64_t total_target_bits;
+
+ /*!
+ * Current buffer level.
+ */
+ int64_t buffer_level;
+
+ /*!
+   * Rate control error expressed as a percentage (PCT).
+ */
+ int rate_error_estimate;
+
+ /*!
+ * Error bits available from previously encoded frames.
+ */
+ int64_t vbr_bits_off_target;
+
+ /*!
+ * Error bits available from previously encoded frames undershoot.
+ */
+ int64_t vbr_bits_off_target_fast;
+
+ /*!
+ * Total bits deviated from the average frame target, from previously
+ * encoded frames.
+ */
+ int64_t bits_off_target;
+
+ /*!
+ * Rolling monitor target bits updated based on current frame target size.
+ */
+ int rolling_target_bits;
+
+ /*!
+ * Rolling monitor actual bits updated based on current frame final projected
+ * size.
+ */
+ int rolling_actual_bits;
+
+ /*!
+ * The history of qindex for each frame.
+ * Only used when RT_PASSIVE_STRATEGY = 1.
+ */
+ int q_history[MAX_Q_HISTORY];
+} PRIMARY_RATE_CONTROL;
+
+/*!\cond */
+
+struct AV1_COMP;
+struct AV1EncoderConfig;
+struct GF_GROUP;
+
+void av1_primary_rc_init(const struct AV1EncoderConfig *oxcf,
+ PRIMARY_RATE_CONTROL *p_rc);
+
+void av1_rc_init(const struct AV1EncoderConfig *oxcf, RATE_CONTROL *rc);
+
+int av1_estimate_bits_at_q(const struct AV1_COMP *cpi, int q,
+ double correction_factor);
+
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth);
+
+void av1_rc_init_minq_luts(void);
+
+int av1_rc_get_default_min_gf_interval(int width, int height, double framerate);
+// Note av1_rc_get_default_max_gf_interval() requires the min_gf_interval to
+// be passed in to ensure that the max_gf_interval returned is at least as big
+// as that.
+int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval);
+
+// At a high level, the following flow is expected to be enforced for
+// rate control:
+// First call per frame, one of:
+// av1_get_one_pass_rt_params()
+// av1_get_second_pass_params()
+// depending on the usage to set the rate control encode parameters desired.
+//
+// Then, call encode_frame_to_data_rate() to perform the
+// actual encode. This function will in turn call encode_frame()
+// one or more times, followed by:
+// av1_rc_postencode_update_drop_frame()
+//
+// The majority of rate control parameters are only expected
+// to be set in the av1_get_..._params() functions and
+// updated during the av1_rc_postencode_update...() functions.
+// The only exceptions are av1_rc_drop_frame() and
+// av1_rc_update_rate_correction_factors() functions.
+
+// Functions to set parameters for encoding before the actual
+// encode_frame_to_data_rate() function.
+struct EncodeFrameInput;
+
+// Post encode update of the rate control parameters based
+// on bytes used
+void av1_rc_postencode_update(struct AV1_COMP *cpi, uint64_t bytes_used);
+// Post encode update of the rate control parameters for dropped frames
+void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi);
+
+/*!\endcond */
+/*!\brief Updates the rate correction factor linking Q to output bits
+ *
+ * This function updates the Q rate correction factor after an encode
+ * cycle depending on whether we overshot or undershot the target rate.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] is_encode_stage Indicates if recode loop or post-encode
+ * \param[in] width Frame width
+ * \param[in] height Frame height
+ *
+ * \remark Updates the relevant rate correction factor in cpi->rc
+ */
+void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi,
+ int is_encode_stage, int width,
+ int height);
+/*!\cond */
+
+// Decide if we should drop this frame: For 1-pass CBR.
+// Changes only the decimation count in the rate control structure
+int av1_rc_drop_frame(struct AV1_COMP *cpi);
+
+// Computes frame size bounds.
+void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi,
+ int this_frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit);
+
+/*!\endcond */
+
+/*!\brief Picks q and q bounds given the rate control parameters in \c cpi->rc.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] width Coded frame width
+ * \param[in] height Coded frame height
+ * \param[in] gf_index Index of this frame in the golden frame group
+ * \param[out] bottom_index Bottom bound for q index (best quality)
+ * \param[out] top_index Top bound for q index (worst quality)
+ * \return Returns selected q index to be used for encoding this frame.
+ * Also, updates \c rc->arf_q.
+ */
+int av1_rc_pick_q_and_bounds(struct AV1_COMP *cpi, int width, int height,
+ int gf_index, int *bottom_index, int *top_index);
+
+/*!\brief Estimates q to achieve a target bits per frame
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] target_bits_per_frame Frame rate target
+ * \param[in] active_worst_quality Max Q allowed
+ * \param[in] active_best_quality Min Q allowed
+ * \param[in] width Frame width
+ * \param[in] height Frame height
+ *
+ * \return Returns a q index value
+ */
+int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame,
+ int active_best_quality, int active_worst_quality,
+ int width, int height);
+
+/*!\cond */
+// Gets the appropriate bpmb enumerator based on the frame and content type
+int av1_get_bpmb_enumerator(FRAME_TYPE frame_type,
+ const int is_screen_content_type);
+
+// Estimates bits per mb for a given qindex and correction factor.
+int av1_rc_bits_per_mb(const struct AV1_COMP *cpi, FRAME_TYPE frame_type,
+ int qindex, double correction_factor,
+ int accurate_estimate);
+
+// Clamping utilities for bitrate targets for iframes and pframes.
+int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi,
+ int64_t target);
+int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi,
+ int target, uint8_t frame_update_type);
+
+// Find q_index corresponding to desired_q, within [best_qindex, worst_qindex].
+// To be precise, 'q_index' is the smallest integer for which the corresponding
+// q >= desired_q.
+// If no such q index is found, returns 'worst_qindex'.
+int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
+ int best_qindex, int worst_qindex);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value
+int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+ aom_bit_depth_t bit_depth);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a value that should equate to the given rate ratio.
+int av1_compute_qdelta_by_rate(const struct AV1_COMP *cpi,
+ FRAME_TYPE frame_type, int qindex,
+ double rate_target_ratio);
+
+int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int q);
+
+void av1_rc_update_framerate(struct AV1_COMP *cpi, int width, int height);
+
+void av1_rc_set_gf_interval_range(const struct AV1_COMP *const cpi,
+ RATE_CONTROL *const rc);
+
+void av1_set_target_rate(struct AV1_COMP *cpi, int width, int height);
+
+int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
+
+void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target, int width,
+ int height);
+
+void av1_adjust_gf_refresh_qp_one_pass_rt(struct AV1_COMP *cpi);
+
+void av1_set_rtc_reference_structure_one_layer(struct AV1_COMP *cpi,
+ int gf_update);
+
+/*!\endcond */
+/*!\brief Calculates how many bits to use for a P frame in one pass vbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] frame_update_type Type of frame
+ *
+ * \return Returns the target number of bits for this frame.
+ */
+int av1_calc_pframe_target_size_one_pass_vbr(
+ const struct AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type);
+
+/*!\brief Calculates how many bits to use for an i frame in one pass vbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \return Returns the target number of bits for this frame.
+ */
+int av1_calc_iframe_target_size_one_pass_vbr(const struct AV1_COMP *const cpi);
+
+/*!\brief Calculates how many bits to use for a P frame in one pass cbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] frame_update_type Type of frame
+ *
+ * \return Returns the target number of bits for this frame.
+ */
+int av1_calc_pframe_target_size_one_pass_cbr(
+ const struct AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type);
+
+/*!\brief Calculates how many bits to use for an i frame in one pass cbr
+ *
+ * \ingroup rate_control
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \return Returns the target number of bits for this frame.
+ */
+int av1_calc_iframe_target_size_one_pass_cbr(const struct AV1_COMP *cpi);
+
+/*!\brief Setup the rate control parameters for 1 pass real-time mode.
+ *
+ * - Sets the frame type and target frame size.
+ * - Sets the GF update.
+ * - Checks for scene change.
+ * - Sets the reference prediction structure for 1 layer (non-SVC).
+ * - Resets and updates are done for SVC.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] frame_type Encoder frame type
+ * \param[in] frame_input Current and last input source frames
+ * \param[in] frame_flags Encoder frame flags
+ *
+ * \remark Nothing is returned. Instead the settings computed in this
+ * function are set in: \c frame_params, \c cpi->common, \c cpi->rc,
+ * \c cpi->svc.
+ */
+void av1_get_one_pass_rt_params(struct AV1_COMP *cpi,
+ FRAME_TYPE *const frame_type,
+ const struct EncodeFrameInput *frame_input,
+ unsigned int frame_flags);
+
+/*!\brief Increase q on expected encoder overshoot, for CBR mode.
+ *
+ * Handles the case when encoder is expected to create a large frame:
+ * - q is increased to value closer to \c cpi->rc.worst_quality
+ * - avg_frame_qindex is reset
+ * - buffer levels are reset
+ * - rate correction factor is adjusted
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ * \param[in] q Current q index
+ *
+ * \return q is returned, and updates are done to \c cpi->rc.
+ */
+int av1_encodedframe_overshoot_cbr(struct AV1_COMP *cpi, int *q);
+
+/*!\brief Compute the q_indices for a single frame.
+ *
+ * Intended to be used with AOM_Q mode.
+ *
+ * \param[in] base_q_index Base q index
+ * \param[in] gf_update_type GOP update type
+ * \param[in] gf_pyramid_level GOP level of the current frame
+ * \param[in] arf_q ARF q_index
+ *
+ * \return Returns the q_index for the current frame.
+ */
+int av1_q_mode_get_q_index(int base_q_index, int gf_update_type,
+ int gf_pyramid_level, int arf_q);
+
+/*!\brief Compute the q_indices for the ARF of a GOP.
+ *
+ * \param[in] base_q_index Base q index
+ * \param[in] gfu_boost GFU boost
+ * \param[in] bit_depth Bit depth
+ * \param[in] arf_boost_factor ARF boost factor
+ *
+ * \return Returns the q_index for the ARF frame.
+ */
+int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth,
+ double arf_boost_factor);
+
+#if !CONFIG_REALTIME_ONLY
+struct TplDepFrame;
+/*!\brief Compute the q_indices for the ARF of a GOP in Q mode.
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] tpl_frame Tpl Frame stats
+ *
+ * \return Returns the q_index for the ARF frame.
+ */
+int av1_get_arf_q_index_q_mode(struct AV1_COMP *cpi,
+ struct TplDepFrame *tpl_frame);
+#endif
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RATECTRL_H_
diff --git a/third_party/aom/av1/encoder/rc_utils.h b/third_party/aom/av1/encoder/rc_utils.h
new file mode 100644
index 0000000000..fe22ee5afb
--- /dev/null
+++ b/third_party/aom/av1/encoder/rc_utils.h
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RC_UTILS_H_
+#define AOM_AV1_ENCODER_RC_UTILS_H_
+
+#include "av1/encoder/encoder.h"
+#include "aom_dsp/psnr.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static AOM_INLINE void check_reset_rc_flag(AV1_COMP *cpi) {
+ RATE_CONTROL *rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ if (cpi->common.current_frame.frame_number >
+ (unsigned int)cpi->svc.number_spatial_layers) {
+ if (cpi->ppi->use_svc) {
+ av1_svc_check_reset_layer_rc_flag(cpi);
+ } else {
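+      // Reset the rc over/undershoot history and the buffer levels when the
+      // average frame bandwidth moved outside the (0.5x, 1.5x) band around
+      // its previous value.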
+ if (rc->avg_frame_bandwidth > (3 * rc->prev_avg_frame_bandwidth >> 1) ||
+ rc->avg_frame_bandwidth < (rc->prev_avg_frame_bandwidth >> 1)) {
+ rc->rc_1_frame = 0;
+ rc->rc_2_frame = 0;
+ p_rc->bits_off_target = p_rc->optimal_buffer_level;
+ p_rc->buffer_level = p_rc->optimal_buffer_level;
+ }
+ }
+ }
+}
+
+static AOM_INLINE void set_primary_rc_buffer_sizes(const AV1EncoderConfig *oxcf,
+ AV1_PRIMARY *ppi) {
+ PRIMARY_RATE_CONTROL *p_rc = &ppi->p_rc;
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+ const int64_t bandwidth = rc_cfg->target_bandwidth;
+ const int64_t starting = rc_cfg->starting_buffer_level_ms;
+ const int64_t optimal = rc_cfg->optimal_buffer_level_ms;
+ const int64_t maximum = rc_cfg->maximum_buffer_size_ms;
+
+ p_rc->starting_buffer_level = starting * bandwidth / 1000;
+ p_rc->optimal_buffer_level =
+ (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000;
+ p_rc->maximum_buffer_size =
+ (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000;
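+  // Worked example (assumed numbers): with target_bandwidth = 1,000,000 bps
+  // and optimal_buffer_level_ms = 600, the optimal level is
+  // 600 * 1000000 / 1000 = 600,000 bits; a level left at 0 falls back to
+  // bandwidth / 8, i.e. 125,000 bits here.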
+
+ // Under a configuration change, where maximum_buffer_size may change,
+ // keep buffer level clipped to the maximum allowed buffer size.
+ p_rc->bits_off_target =
+ AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size);
+ p_rc->buffer_level = AOMMIN(p_rc->buffer_level, p_rc->maximum_buffer_size);
+}
+
+static AOM_INLINE void config_target_level(AV1_COMP *const cpi,
+ AV1_LEVEL target_level, int tier) {
+ AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ SequenceHeader *const seq_params = cpi->common.seq_params;
+ TileConfig *const tile_cfg = &oxcf->tile_cfg;
+ RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+ // Adjust target bitrate to be no larger than 70% of level limit.
+ const BITSTREAM_PROFILE profile = seq_params->profile;
+ const double level_bitrate_limit =
+ av1_get_max_bitrate_for_level(target_level, tier, profile);
+ const int64_t max_bitrate = (int64_t)(level_bitrate_limit * 0.70);
+ rc_cfg->target_bandwidth = AOMMIN(rc_cfg->target_bandwidth, max_bitrate);
+ // Also need to update cpi->ppi->twopass.bits_left.
+ TWO_PASS *const twopass = &cpi->ppi->twopass;
+ FIRSTPASS_STATS *stats = twopass->stats_buf_ctx->total_stats;
+ if (stats != NULL)
+ cpi->ppi->twopass.bits_left =
+ (int64_t)(stats->duration * rc_cfg->target_bandwidth / 10000000.0);
+
+ // Adjust max over-shoot percentage.
+ rc_cfg->over_shoot_pct = 0;
+
+ // Adjust max quantizer.
+ rc_cfg->worst_allowed_q = 255;
+
+ // Adjust number of tiles and tile columns to be under level limit.
+ int max_tiles, max_tile_cols;
+ av1_get_max_tiles_for_level(target_level, &max_tiles, &max_tile_cols);
+ while (tile_cfg->tile_columns > 0 &&
+ (1 << tile_cfg->tile_columns) > max_tile_cols) {
+ --tile_cfg->tile_columns;
+ }
+ const int tile_cols = (1 << tile_cfg->tile_columns);
+ while (tile_cfg->tile_rows > 0 &&
+ tile_cols * (1 << tile_cfg->tile_rows) > max_tiles) {
+ --tile_cfg->tile_rows;
+ }
+
+ // Adjust min compression ratio.
+ const int still_picture = seq_params->still_picture;
+ const double min_cr =
+ av1_get_min_cr_for_level(target_level, tier, still_picture);
+ rc_cfg->min_cr = AOMMAX(rc_cfg->min_cr, (unsigned int)(min_cr * 100));
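+  // Note: min_cr is stored as a percentage, so e.g. a level-mandated minimum
+  // compression ratio of 2.0 becomes 200 here.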
+}
+
+#if !CONFIG_REALTIME_ONLY
+
+/*!\brief Function to test for conditions that indicate we should loop
+ * back and recode a frame.
+ *
+ * \ingroup rate_control
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] high_limit Upper rate threshold
+ * \param[in] low_limit Lower rate threshold
+ * \param[in] q Current q index
+ * \param[in] maxq Maximum allowed q index
+ * \param[in] minq Minimum allowed q index
+ *
+ * \return Indicates if a recode is required.
+ * \retval 1 Recode Required
+ * \retval 0 No Recode required
+ */
+static AOM_INLINE int recode_loop_test(AV1_COMP *cpi, int high_limit,
+ int low_limit, int q, int maxq,
+ int minq) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi);
+ int force_recode = 0;
+
+ if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+ (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE) ||
+ (frame_is_kfgfarf &&
+ (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
+ // TODO(agrange) high_limit could be greater than the scale-down threshold.
+ if ((rc->projected_frame_size > high_limit && q < maxq) ||
+ (rc->projected_frame_size < low_limit && q > minq)) {
+ force_recode = 1;
+ } else if (cpi->oxcf.rc_cfg.mode == AOM_CQ) {
+ // Deal with frame undershoot and whether or not we are
+ // below the automatically set cq level.
+ if (q > oxcf->rc_cfg.cq_level &&
+ rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) {
+ force_recode = 1;
+ }
+ }
+ }
+ return force_recode;
+}
+
+static AOM_INLINE double av1_get_gfu_boost_projection_factor(double min_factor,
+ double max_factor,
+ int frame_count) {
+ double factor = sqrt((double)frame_count);
+ factor = AOMMIN(factor, max_factor);
+ factor = AOMMAX(factor, min_factor);
+ factor = (200.0 + 10.0 * factor);
+ return factor;
+}
+
+static AOM_INLINE int get_gfu_boost_from_r0_lap(double min_factor,
+ double max_factor, double r0,
+ int frames_to_key) {
+ double factor = av1_get_gfu_boost_projection_factor(min_factor, max_factor,
+ frames_to_key);
+ const int boost = (int)rint(factor / r0);
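+  // Worked example (assumed inputs): min_factor = 4, max_factor = 10 and
+  // frames_to_key = 64 give factor = 200 + 10 * sqrt(64) = 280; with
+  // r0 = 0.1 the boost is rint(280 / 0.1) = 2800.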
+ return boost;
+}
+
+static AOM_INLINE double av1_get_kf_boost_projection_factor(int frame_count) {
+ double factor = sqrt((double)frame_count);
+ factor = AOMMIN(factor, 10.0);
+ factor = AOMMAX(factor, 4.0);
+ factor = (75.0 + 14.0 * factor);
+ return factor;
+}
+
+static AOM_INLINE int get_regulated_q_overshoot(AV1_COMP *const cpi,
+ int is_encode_stage, int q_low,
+ int q_high, int top_index,
+ int bottom_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+
+ av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width,
+ cm->height);
+
+ int q_regulated =
+ av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ AOMMAX(q_high, top_index), cm->width, cm->height);
+
+ int retries = 0;
+ while (q_regulated < q_low && retries < 10) {
+ av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width,
+ cm->height);
+ q_regulated =
+ av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ AOMMAX(q_high, top_index), cm->width, cm->height);
+ retries++;
+ }
+ return q_regulated;
+}
+
+static AOM_INLINE int get_regulated_q_undershoot(AV1_COMP *const cpi,
+ int is_encode_stage,
+ int q_high, int top_index,
+ int bottom_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+
+ av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width,
+ cm->height);
+ int q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ top_index, cm->width, cm->height);
+
+ int retries = 0;
+ while (q_regulated > q_high && retries < 10) {
+ av1_rc_update_rate_correction_factors(cpi, is_encode_stage, cm->width,
+ cm->height);
+ q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ top_index, cm->width, cm->height);
+ retries++;
+ }
+ return q_regulated;
+}
+
+/*!\brief Called after encode_with_recode_loop() has just encoded a frame.
+ * This function works out whether we undershot or overshot our bitrate
+ * target and adjusts q as appropriate. It also decides whether or not
+ * we need to recode the frame to get closer to the target rate.
+ *
+ * \ingroup rate_control
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[out] loop Should we go around the recode loop again
+ * \param[in,out] q New q index value
+ * \param[in,out] q_low Low q index limit for this loop iteration
+ * \param[in,out] q_high High q index limit for this loop iteration
+ * \param[in] top_index Max permitted new value for q index
+ * \param[in] bottom_index Min permitted new value for q index
+ * \param[in,out] undershoot_seen Have we seen undershoot on this frame
+ * \param[in,out] overshoot_seen Have we seen overshoot on this frame
+ * \param[in,out] low_cr_seen Have we previously triggered recode
+ * because the compression ratio was less
+ * than a given minimum threshold.
+ * \param[in] loop_count Loop iterations so far.
+ *
+ */
+static AOM_INLINE void recode_loop_update_q(
+ AV1_COMP *const cpi, int *const loop, int *const q, int *const q_low,
+ int *const q_high, const int top_index, const int bottom_index,
+ int *const undershoot_seen, int *const overshoot_seen,
+ int *const low_cr_seen, const int loop_count) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+ *loop = 0;
+
+ // Special case for overlay frame.
+ if (rc->is_src_frame_alt_ref &&
+ rc->projected_frame_size < rc->max_frame_bandwidth)
+ return;
+
+ const int min_cr = rc_cfg->min_cr;
+ if (min_cr > 0) {
+ const double compression_ratio =
+ av1_get_compression_ratio(cm, rc->projected_frame_size >> 3);
+ const double target_cr = min_cr / 100.0;
+ if (compression_ratio < target_cr) {
+ *low_cr_seen = 1;
+ if (*q < rc->worst_quality) {
+ const double cr_ratio = target_cr / compression_ratio;
+ const int projected_q = AOMMAX(*q + 1, (int)(*q * cr_ratio * cr_ratio));
+ *q = AOMMIN(AOMMIN(projected_q, *q + 32), rc->worst_quality);
+ *q_low = AOMMAX(*q, *q_low);
+ *q_high = AOMMAX(*q, *q_high);
+ *loop = 1;
+ }
+ }
+ if (*low_cr_seen) return;
+ }
+
+ if (cpi->ppi->level_params.keep_level_stats &&
+ !is_stat_generation_stage(cpi)) {
+ // Initialize level info. at the beginning of each sequence.
+ if (cm->current_frame.frame_type == KEY_FRAME &&
+ cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+ av1_init_level_info(cpi);
+ }
+ const AV1LevelParams *const level_params = &cpi->ppi->level_params;
+ // TODO(any): currently only checking operating point 0
+ const AV1LevelInfo *const level_info = level_params->level_info[0];
+ const DECODER_MODEL *const decoder_models = level_info->decoder_models;
+ const AV1_LEVEL target_level = level_params->target_seq_level_idx[0];
+
+ if (target_level < SEQ_LEVELS &&
+ decoder_models[target_level].status == DECODER_MODEL_OK) {
+ DECODER_MODEL_STATUS status = av1_decoder_model_try_smooth_buf(
+ cpi, rc->projected_frame_size, &decoder_models[target_level]);
+
+ if ((status == SMOOTHING_BUFFER_UNDERFLOW ||
+ status == SMOOTHING_BUFFER_OVERFLOW) &&
+ *q < rc->worst_quality) {
+ *q = AOMMIN(*q + 10, rc->worst_quality);
+ *q_low = AOMMAX(*q, *q_low);
+ *q_high = AOMMAX(*q, *q_high);
+ *loop = 1;
+ return;
+ }
+ }
+ }
+
+ if (rc_cfg->mode == AOM_Q) return;
+
+ const int last_q = *q;
+ int frame_over_shoot_limit = 0, frame_under_shoot_limit = 0;
+ av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+ &frame_under_shoot_limit,
+ &frame_over_shoot_limit);
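+  // Guard against a degenerate zero overshoot limit below.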
+ if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
+
+ if (cm->current_frame.frame_type == KEY_FRAME &&
+ p_rc->this_key_frame_forced &&
+ rc->projected_frame_size < rc->max_frame_bandwidth) {
+ int64_t kf_err;
+ const int64_t high_err_target = cpi->ambient_err;
+ const int64_t low_err_target = cpi->ambient_err >> 1;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (cm->seq_params->use_highbitdepth) {
+ kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
+ } else {
+ kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+ }
+#else
+ kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+ // Prevent possible divide by zero error below for perfect KF
+ kf_err += !kf_err;
+
+ // The key frame is not good enough or we can afford
+ // to make it better without undue risk of popping.
+ if ((kf_err > high_err_target &&
+ rc->projected_frame_size <= frame_over_shoot_limit) ||
+ (kf_err > low_err_target &&
+ rc->projected_frame_size <= frame_under_shoot_limit)) {
+ // Lower q_high
+ *q_high = AOMMAX(*q - 1, *q_low);
+
+ // Adjust Q
+ *q = (int)((*q * high_err_target) / kf_err);
+ *q = AOMMIN(*q, (*q_high + *q_low) >> 1);
+ } else if (kf_err < low_err_target &&
+ rc->projected_frame_size >= frame_under_shoot_limit) {
+ // The key frame is much better than the previous frame
+ // Raise q_low
+ *q_low = AOMMIN(*q + 1, *q_high);
+
+ // Adjust Q
+ *q = (int)((*q * low_err_target) / kf_err);
+ *q = AOMMIN(*q, (*q_high + *q_low + 1) >> 1);
+ }
+
+ // Clamp Q to upper and lower limits:
+ *q = clamp(*q, *q_low, *q_high);
+ *loop = (*q != last_q);
+ return;
+ }
+
+ if (recode_loop_test(cpi, frame_over_shoot_limit, frame_under_shoot_limit, *q,
+ AOMMAX(*q_high, top_index), bottom_index)) {
+ // Is the projected frame size out of range and are we allowed
+ // to attempt to recode.
+
+ // Frame size out of permitted range:
+ // Update correction factor & compute new Q to try...
+ // Frame is too large
+ if (rc->projected_frame_size > rc->this_frame_target) {
+ // Special case if the projected size is > the max allowed.
+ if (*q == *q_high &&
+ rc->projected_frame_size >= rc->max_frame_bandwidth) {
+ const double q_val_high_current =
+ av1_convert_qindex_to_q(*q_high, cm->seq_params->bit_depth);
+ const double q_val_high_new =
+ q_val_high_current *
+ ((double)rc->projected_frame_size / rc->max_frame_bandwidth);
+ *q_high = av1_find_qindex(q_val_high_new, cm->seq_params->bit_depth,
+ rc->best_quality, rc->worst_quality);
+ }
+
+      // Raise Qlow to at least the current value
+ *q_low = AOMMIN(*q + 1, *q_high);
+
+ if (*undershoot_seen || loop_count > 2 ||
+ (loop_count == 2 && !frame_is_intra_only(cm))) {
+ av1_rc_update_rate_correction_factors(cpi, 1, cm->width, cm->height);
+
+ *q = (*q_high + *q_low + 1) / 2;
+ } else if (loop_count == 2 && frame_is_intra_only(cm)) {
+ const int q_mid = (*q_high + *q_low + 1) / 2;
+ const int q_regulated = get_regulated_q_overshoot(
+ cpi, 1, *q_low, *q_high, top_index, bottom_index);
+ // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+ // transition between loop_count < 2 and loop_count > 2.
+ *q = (q_mid + q_regulated + 1) / 2;
+ } else {
+ *q = get_regulated_q_overshoot(cpi, 1, *q_low, *q_high, top_index,
+ bottom_index);
+ }
+
+ *overshoot_seen = 1;
+ } else {
+ // Frame is too small
+ *q_high = AOMMAX(*q - 1, *q_low);
+
+ if (*overshoot_seen || loop_count > 2 ||
+ (loop_count == 2 && !frame_is_intra_only(cm))) {
+ av1_rc_update_rate_correction_factors(cpi, 1, cm->width, cm->height);
+ *q = (*q_high + *q_low) / 2;
+ } else if (loop_count == 2 && frame_is_intra_only(cm)) {
+ const int q_mid = (*q_high + *q_low) / 2;
+ const int q_regulated = get_regulated_q_undershoot(
+ cpi, 1, *q_high, top_index, bottom_index);
+ // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+ // transition between loop_count < 2 and loop_count > 2.
+ *q = (q_mid + q_regulated) / 2;
+
+ // Special case reset for qlow for constrained quality.
+ // This should only trigger where there is very substantial
+ // undershoot on a frame and the auto cq level is above
+        // the user passed in value.
+ if (rc_cfg->mode == AOM_CQ && q_regulated < *q_low) {
+ *q_low = *q;
+ }
+ } else {
+ *q = get_regulated_q_undershoot(cpi, 1, *q_high, top_index,
+ bottom_index);
+
+ // Special case reset for qlow for constrained quality.
+ // This should only trigger where there is very substantial
+ // undershoot on a frame and the auto cq level is above
+        // the user passed in value.
+ if (rc_cfg->mode == AOM_CQ && *q < *q_low) {
+ *q_low = *q;
+ }
+ }
+
+ *undershoot_seen = 1;
+ }
+
+ // Clamp Q to upper and lower limits:
+ *q = clamp(*q, *q_low, *q_high);
+ }
+
+ *loop = (*q != last_q);
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RC_UTILS_H_
diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c
new file mode 100644
index 0000000000..c2d76e7a9a
--- /dev/null
+++ b/third_party/aom/av1/encoder/rd.c
@@ -0,0 +1,1580 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_once.h"
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/nonrd_opt.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+
+#define RD_THRESH_POW 1.25
+
+// The baseline rd thresholds for breaking out of the rd loop for
+// certain modes are assumed to be based on 8x8 blocks.
+// This table is used to correct for block size.
+// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
+static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES_ALL] = {
+ 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, 48, 48, 64, 4, 4, 8, 8, 16, 16
+};
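+// e.g., the BLOCK_4X4 entry of 2 scales the 8x8 baseline threshold by
+// 2 / 4 = 0.5, while the BLOCK_8X8 entry of 4 leaves it at 4 / 4 = 1.0.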
+
+static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA]
+ [EXT_TX_SIZES] = {
+ { 1, 1, 1, 1 }, // unused
+ { 1, 1, 0, 0 },
+ { 0, 0, 1, 0 },
+ };
+
+static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER]
+ [EXT_TX_SIZES] = {
+ { 1, 1, 1, 1 }, // unused
+ { 1, 1, 0, 0 },
+ { 0, 0, 1, 0 },
+ { 0, 1, 1, 1 },
+ };
+
+static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA,
+ EXT_TX_SETS_INTER)] = {
+ {
+ // Intra
+ EXT_TX_SET_DCTONLY,
+ EXT_TX_SET_DTT4_IDTX_1DDCT,
+ EXT_TX_SET_DTT4_IDTX,
+ },
+ {
+ // Inter
+ EXT_TX_SET_DCTONLY,
+ EXT_TX_SET_ALL16,
+ EXT_TX_SET_DTT9_IDTX_1DDCT,
+ EXT_TX_SET_DCT_IDTX,
+ },
+};
+
+void av1_fill_mode_rates(AV1_COMMON *const cm, ModeCosts *mode_costs,
+ FRAME_CONTEXT *fc) {
+ int i, j;
+
+ for (i = 0; i < PARTITION_CONTEXTS; ++i)
+ av1_cost_tokens_from_cdf(mode_costs->partition_cost[i],
+ fc->partition_cdf[i], NULL);
+
+ if (cm->current_frame.skip_mode_info.skip_mode_flag) {
+ for (i = 0; i < SKIP_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->skip_mode_cost[i],
+ fc->skip_mode_cdfs[i], NULL);
+ }
+ }
+
+ for (i = 0; i < SKIP_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->skip_txfm_cost[i],
+ fc->skip_txfm_cdfs[i], NULL);
+ }
+
+ for (i = 0; i < KF_MODE_CONTEXTS; ++i)
+ for (j = 0; j < KF_MODE_CONTEXTS; ++j)
+ av1_cost_tokens_from_cdf(mode_costs->y_mode_costs[i][j],
+ fc->kf_y_cdf[i][j], NULL);
+
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+ av1_cost_tokens_from_cdf(mode_costs->mbmode_cost[i], fc->y_mode_cdf[i],
+ NULL);
+ for (i = 0; i < CFL_ALLOWED_TYPES; ++i)
+ for (j = 0; j < INTRA_MODES; ++j)
+ av1_cost_tokens_from_cdf(mode_costs->intra_uv_mode_cost[i][j],
+ fc->uv_mode_cdf[i][j], NULL);
+
+ av1_cost_tokens_from_cdf(mode_costs->filter_intra_mode_cost,
+ fc->filter_intra_mode_cdf, NULL);
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ if (av1_filter_intra_allowed_bsize(cm, i))
+ av1_cost_tokens_from_cdf(mode_costs->filter_intra_cost[i],
+ fc->filter_intra_cdfs[i], NULL);
+ }
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ av1_cost_tokens_from_cdf(mode_costs->switchable_interp_costs[i],
+ fc->switchable_interp_cdf[i], NULL);
+
+ for (i = 0; i < PALATTE_BSIZE_CTXS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->palette_y_size_cost[i],
+ fc->palette_y_size_cdf[i], NULL);
+ av1_cost_tokens_from_cdf(mode_costs->palette_uv_size_cost[i],
+ fc->palette_uv_size_cdf[i], NULL);
+ for (j = 0; j < PALETTE_Y_MODE_CONTEXTS; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->palette_y_mode_cost[i][j],
+ fc->palette_y_mode_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < PALETTE_UV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->palette_uv_mode_cost[i],
+ fc->palette_uv_mode_cdf[i], NULL);
+ }
+
+ for (i = 0; i < PALETTE_SIZES; ++i) {
+ for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->palette_y_color_cost[i][j],
+ fc->palette_y_color_index_cdf[i][j], NULL);
+ av1_cost_tokens_from_cdf(mode_costs->palette_uv_color_cost[i][j],
+ fc->palette_uv_color_index_cdf[i][j], NULL);
+ }
+ }
+
+ int sign_cost[CFL_JOINT_SIGNS];
+ av1_cost_tokens_from_cdf(sign_cost, fc->cfl_sign_cdf, NULL);
+ for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
+ int *cost_u = mode_costs->cfl_cost[joint_sign][CFL_PRED_U];
+ int *cost_v = mode_costs->cfl_cost[joint_sign][CFL_PRED_V];
+ if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) {
+ memset(cost_u, 0, CFL_ALPHABET_SIZE * sizeof(*cost_u));
+ } else {
+ const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+ av1_cost_tokens_from_cdf(cost_u, cdf_u, NULL);
+ }
+ if (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO) {
+ memset(cost_v, 0, CFL_ALPHABET_SIZE * sizeof(*cost_v));
+ } else {
+ const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+ av1_cost_tokens_from_cdf(cost_v, cdf_v, NULL);
+ }
+ for (int u = 0; u < CFL_ALPHABET_SIZE; u++)
+ cost_u[u] += sign_cost[joint_sign];
+ }
+
+ for (i = 0; i < MAX_TX_CATS; ++i)
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+ av1_cost_tokens_from_cdf(mode_costs->tx_size_cost[i][j],
+ fc->tx_size_cdf[i][j], NULL);
+
+ for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->txfm_partition_cost[i],
+ fc->txfm_partition_cdf[i], NULL);
+ }
+
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ int s;
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+ if (use_inter_ext_tx_for_txsize[s][i]) {
+ av1_cost_tokens_from_cdf(
+ mode_costs->inter_tx_type_costs[s][i], fc->inter_ext_tx_cdf[s][i],
+ av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[1][s]]);
+ }
+ }
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ if (use_intra_ext_tx_for_txsize[s][i]) {
+ for (j = 0; j < INTRA_MODES; ++j) {
+ av1_cost_tokens_from_cdf(
+ mode_costs->intra_tx_type_costs[s][i][j],
+ fc->intra_ext_tx_cdf[s][i][j],
+ av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[0][s]]);
+ }
+ }
+ }
+ }
+ for (i = 0; i < DIRECTIONAL_MODES; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->angle_delta_cost[i],
+ fc->angle_delta_cdf[i], NULL);
+ }
+ av1_cost_tokens_from_cdf(mode_costs->intrabc_cost, fc->intrabc_cdf, NULL);
+
+ for (i = 0; i < SPATIAL_PREDICTION_PROBS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->spatial_pred_cost[i],
+ fc->seg.spatial_pred_seg_cdf[i], NULL);
+ }
+
+ for (i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->tmp_pred_cost[i], fc->seg.pred_cdf[i],
+ NULL);
+ }
+
+ if (!frame_is_intra_only(cm)) {
+ for (i = 0; i < COMP_INTER_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_inter_cost[i],
+ fc->comp_inter_cdf[i], NULL);
+ }
+
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < SINGLE_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->single_ref_cost[i][j],
+ fc->single_ref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_ref_type_cost[i],
+ fc->comp_ref_type_cdf[i], NULL);
+ }
+
+ for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) {
+ for (j = 0; j < UNIDIR_COMP_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->uni_comp_ref_cost[i][j],
+ fc->uni_comp_ref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < FWD_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_ref_cost[i][j],
+ fc->comp_ref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < BWD_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_bwdref_cost[i][j],
+ fc->comp_bwdref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->intra_inter_cost[i],
+ fc->intra_inter_cdf[i], NULL);
+ }
+
+ for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->newmv_mode_cost[i], fc->newmv_cdf[i],
+ NULL);
+ }
+
+ for (i = 0; i < GLOBALMV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->zeromv_mode_cost[i],
+ fc->zeromv_cdf[i], NULL);
+ }
+
+ for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->refmv_mode_cost[i], fc->refmv_cdf[i],
+ NULL);
+ }
+
+ for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->drl_mode_cost0[i], fc->drl_cdf[i],
+ NULL);
+ }
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ av1_cost_tokens_from_cdf(mode_costs->inter_compound_mode_cost[i],
+ fc->inter_compound_mode_cdf[i], NULL);
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i)
+ av1_cost_tokens_from_cdf(mode_costs->compound_type_cost[i],
+ fc->compound_type_cdf[i], NULL);
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ if (av1_is_wedge_used(i)) {
+ av1_cost_tokens_from_cdf(mode_costs->wedge_idx_cost[i],
+ fc->wedge_idx_cdf[i], NULL);
+ }
+ }
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->interintra_cost[i],
+ fc->interintra_cdf[i], NULL);
+ av1_cost_tokens_from_cdf(mode_costs->interintra_mode_cost[i],
+ fc->interintra_mode_cdf[i], NULL);
+ }
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->wedge_interintra_cost[i],
+ fc->wedge_interintra_cdf[i], NULL);
+ }
+ for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
+ av1_cost_tokens_from_cdf(mode_costs->motion_mode_cost[i],
+ fc->motion_mode_cdf[i], NULL);
+ }
+ for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
+ av1_cost_tokens_from_cdf(mode_costs->motion_mode_cost1[i],
+ fc->obmc_cdf[i], NULL);
+ }
+ for (i = 0; i < COMP_INDEX_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_idx_cost[i],
+ fc->compound_index_cdf[i], NULL);
+ }
+ for (i = 0; i < COMP_GROUP_IDX_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(mode_costs->comp_group_idx_cost[i],
+ fc->comp_group_idx_cdf[i], NULL);
+ }
+ }
+}
+
+void av1_fill_lr_rates(ModeCosts *mode_costs, FRAME_CONTEXT *fc) {
+ av1_cost_tokens_from_cdf(mode_costs->switchable_restore_cost,
+ fc->switchable_restore_cdf, NULL);
+ av1_cost_tokens_from_cdf(mode_costs->wiener_restore_cost,
+ fc->wiener_restore_cdf, NULL);
+ av1_cost_tokens_from_cdf(mode_costs->sgrproj_restore_cost,
+ fc->sgrproj_restore_cdf, NULL);
+}
+
+// Values are now correlated to quantizer.
+static int sad_per_bit_lut_8[QINDEX_RANGE];
+static int sad_per_bit_lut_10[QINDEX_RANGE];
+static int sad_per_bit_lut_12[QINDEX_RANGE];
+
+static void init_me_luts_bd(int *bit16lut, int range,
+ aom_bit_depth_t bit_depth) {
+ int i;
+  // Initialize the sad lut tables using a formulaic calculation for now.
+  // This is to make it easier to assess the impact of experimental changes
+  // to the quantizer tables.
+ for (i = 0; i < range; i++) {
+ const double q = av1_convert_qindex_to_q(i, bit_depth);
+ bit16lut[i] = (int)(0.0418 * q + 2.4107);
+ }
+}
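+
+// For example, for a hypothetical qindex at which av1_convert_qindex_to_q()
+// returns 40.0, the table entry above is (int)(0.0418 * 40.0 + 2.4107) =
+// (int)4.0827 = 4.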
+
+static void init_me_luts(void) {
+ init_me_luts_bd(sad_per_bit_lut_8, QINDEX_RANGE, AOM_BITS_8);
+ init_me_luts_bd(sad_per_bit_lut_10, QINDEX_RANGE, AOM_BITS_10);
+ init_me_luts_bd(sad_per_bit_lut_12, QINDEX_RANGE, AOM_BITS_12);
+}
+
+void av1_init_me_luts(void) { aom_once(init_me_luts); }
+
+static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
+ 8, 8, 4, 4, 2, 2, 1, 0 };
+
+static const int rd_layer_depth_factor[7] = {
+ 160, 160, 160, 160, 192, 208, 224
+};
+
+// Returns the default rd multiplier for inter frames for a given qindex.
+// The function here is a first-pass estimate based on data from
+// a previous Vizer run.
+static double def_inter_rd_multiplier(int qindex) {
+ return 3.2 + (0.0015 * (double)qindex);
+}
+
+// Returns the default rd multiplier for ARF/Golden frames for a given
+// qindex. The function here is a first-pass estimate based on data from
+// a previous Vizer run.
+static double def_arf_rd_multiplier(int qindex) {
+ return 3.25 + (0.0015 * (double)qindex);
+}
+
+// Returns the default rd multiplier for key frames for a given qindex.
+// The function here is a first-pass estimate based on data from
+// a previous Vizer run.
+static double def_kf_rd_multiplier(int qindex) {
+ return 3.3 + (0.0015 * (double)qindex);
+}
+
+int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth,
+ FRAME_UPDATE_TYPE update_type,
+ int qindex) {
+ const int q = av1_dc_quant_QTX(qindex, 0, bit_depth);
+ int64_t rdmult = q * q;
+ if (update_type == KF_UPDATE) {
+ double def_rd_q_mult = def_kf_rd_multiplier(q);
+ rdmult = (int64_t)((double)rdmult * def_rd_q_mult);
+ } else if ((update_type == GF_UPDATE) || (update_type == ARF_UPDATE)) {
+ double def_rd_q_mult = def_arf_rd_multiplier(q);
+ rdmult = (int64_t)((double)rdmult * def_rd_q_mult);
+ } else {
+ double def_rd_q_mult = def_inter_rd_multiplier(q);
+ rdmult = (int64_t)((double)rdmult * def_rd_q_mult);
+ }
+
+ switch (bit_depth) {
+ case AOM_BITS_8: break;
+ case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break;
+ case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+ return rdmult > 0 ? (int)AOMMIN(rdmult, INT_MAX) : 1;
+}
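+
+// Worked example for the function above, with a hypothetical 8-bit dc quant
+// step q = 32 and an inter update type:
+//   rdmult = 32 * 32 * (3.2 + 0.0015 * 32) = 1024 * 3.248 = 3325 (truncated)
+// and no bit-depth shift applies at AOM_BITS_8.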
+
+int av1_compute_rd_mult(const int qindex, const aom_bit_depth_t bit_depth,
+ const FRAME_UPDATE_TYPE update_type,
+ const int layer_depth, const int boost_index,
+ const FRAME_TYPE frame_type,
+ const int use_fixed_qp_offsets,
+ const int is_stat_consumption_stage) {
+ int64_t rdmult =
+ av1_compute_rd_mult_based_on_qindex(bit_depth, update_type, qindex);
+ if (is_stat_consumption_stage && !use_fixed_qp_offsets &&
+ (frame_type != KEY_FRAME)) {
+ // Layer depth adjustment
+ rdmult = (rdmult * rd_layer_depth_factor[layer_depth]) >> 7;
+ // ARF boost adjustment
+ rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
+ }
+ return (int)rdmult;
+}
+
+int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta) {
+ assert(beta > 0.0);
+ int q = av1_dc_quant_QTX(qindex, 0, bit_depth);
+ int newq = (int)rint(q / sqrt(beta));
+ int orig_qindex = qindex;
+ if (newq == q) {
+ return 0;
+ }
+ if (newq < q) {
+ while (qindex > 0) {
+ qindex--;
+ q = av1_dc_quant_QTX(qindex, 0, bit_depth);
+ if (newq >= q) {
+ break;
+ }
+ }
+ } else {
+ while (qindex < MAXQ) {
+ qindex++;
+ q = av1_dc_quant_QTX(qindex, 0, bit_depth);
+ if (newq <= q) {
+ break;
+ }
+ }
+ }
+ return qindex - orig_qindex;
+}
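+
+// For example, beta = 4.0 targets a quantizer step of q / sqrt(4) = q / 2,
+// so the function above walks qindex downward until av1_dc_quant_QTX()
+// first reaches q / 2 or below, and returns the resulting (negative)
+// qindex delta.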
+
+int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex,
+ int curr_qindex) {
+ curr_qindex = clamp(curr_qindex, delta_q_res, 256 - delta_q_res);
+ const int sign_deltaq_index = curr_qindex - prev_qindex >= 0 ? 1 : -1;
+ const int deltaq_deadzone = delta_q_res / 4;
+ const int qmask = ~(delta_q_res - 1);
+ int abs_deltaq_index = abs(curr_qindex - prev_qindex);
+ abs_deltaq_index = (abs_deltaq_index + deltaq_deadzone) & qmask;
+ int adjust_qindex = prev_qindex + sign_deltaq_index * abs_deltaq_index;
+ adjust_qindex = AOMMAX(adjust_qindex, MINQ + 1);
+ return adjust_qindex;
+}
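+
+// Worked example for the function above: with delta_q_res = 4,
+// prev_qindex = 100 and curr_qindex = 103, the clamp leaves 103 unchanged,
+// the deadzone is 4 / 4 = 1 and qmask = ~3, so
+// abs_deltaq_index = (3 + 1) & ~3 = 4 and the result is 100 + 4 = 104,
+// i.e. the requested delta-q is snapped to a multiple of delta_q_res.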
+
+int av1_get_adaptive_rdmult(const AV1_COMP *cpi, double beta) {
+ assert(beta > 0.0);
+ const AV1_COMMON *cm = &cpi->common;
+
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+ const int qindex_rdmult = cm->quant_params.base_qindex;
+ return (int)(av1_compute_rd_mult(
+ qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index],
+ layer_depth, boost_index, frame_type,
+ cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi)) /
+ beta);
+}
+
+static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) {
+ double q;
+ switch (bit_depth) {
+ case AOM_BITS_8: q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_8) / 4.0; break;
+ case AOM_BITS_10:
+ q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_10) / 16.0;
+ break;
+ case AOM_BITS_12:
+ q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_12) / 64.0;
+ break;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+ // TODO(debargha): Adjust the function below.
+ return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
+}
+
+void av1_set_sad_per_bit(const AV1_COMP *cpi, int *sadperbit, int qindex) {
+ switch (cpi->common.seq_params->bit_depth) {
+ case AOM_BITS_8: *sadperbit = sad_per_bit_lut_8[qindex]; break;
+ case AOM_BITS_10: *sadperbit = sad_per_bit_lut_10[qindex]; break;
+ case AOM_BITS_12: *sadperbit = sad_per_bit_lut_12[qindex]; break;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ }
+}
+
+static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd,
+ int use_nonrd_pick_mode) {
+ int i, bsize, segment_id;
+ THR_MODES mode_indices[RTC_REFS * RTC_MODES] = { 0 };
+ int num_modes_count = use_nonrd_pick_mode ? 0 : MAX_MODES;
+
+ if (use_nonrd_pick_mode) {
+ for (int r_idx = 0; r_idx < RTC_REFS; r_idx++) {
+ const MV_REFERENCE_FRAME ref = real_time_ref_combos[r_idx][0];
+ if (ref != INTRA_FRAME) {
+ for (i = 0; i < RTC_INTER_MODES; i++)
+ mode_indices[num_modes_count++] =
+ mode_idx[ref][mode_offset(inter_mode_list[i])];
+ } else {
+ for (i = 0; i < RTC_INTRA_MODES; i++)
+ mode_indices[num_modes_count++] =
+ mode_idx[ref][mode_offset(intra_mode_list[i])];
+ }
+ }
+ }
+
+ for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
+ const int qindex = clamp(
+ av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex) +
+ cm->quant_params.y_dc_delta_q,
+ 0, MAXQ);
+ const int q = compute_rd_thresh_factor(qindex, cm->seq_params->bit_depth);
+
+ for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ // Threshold here seems unnecessarily harsh but fine given actual
+ // range of values used for cpi->sf.thresh_mult[].
+ const int t = q * rd_thresh_block_size_factor[bsize];
+ const int thresh_max = INT_MAX / t;
+
+ for (i = 0; i < num_modes_count; ++i) {
+ const int mode_index = use_nonrd_pick_mode ? mode_indices[i] : i;
+ rd->threshes[segment_id][bsize][mode_index] =
+ rd->thresh_mult[mode_index] < thresh_max
+ ? rd->thresh_mult[mode_index] * t / 4
+ : INT_MAX;
+ }
+ }
+ }
+}
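+
+// For example, a mode with thresh_mult[mode] = 1000 is stored as
+// 1000 * q * rd_thresh_block_size_factor[bsize] / 4 for each segment's q,
+// or as INT_MAX when that product would overflow.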
+
+void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc,
+ const int num_planes) {
+ const int nplanes = AOMMIN(num_planes, PLANE_TYPES);
+ for (int eob_multi_size = 0; eob_multi_size < 7; ++eob_multi_size) {
+ for (int plane = 0; plane < nplanes; ++plane) {
+ LV_MAP_EOB_COST *pcost = &coeff_costs->eob_costs[eob_multi_size][plane];
+
+ for (int ctx = 0; ctx < 2; ++ctx) {
+ aom_cdf_prob *pcdf;
+ switch (eob_multi_size) {
+ case 0: pcdf = fc->eob_flag_cdf16[plane][ctx]; break;
+ case 1: pcdf = fc->eob_flag_cdf32[plane][ctx]; break;
+ case 2: pcdf = fc->eob_flag_cdf64[plane][ctx]; break;
+ case 3: pcdf = fc->eob_flag_cdf128[plane][ctx]; break;
+ case 4: pcdf = fc->eob_flag_cdf256[plane][ctx]; break;
+ case 5: pcdf = fc->eob_flag_cdf512[plane][ctx]; break;
+ case 6:
+ default: pcdf = fc->eob_flag_cdf1024[plane][ctx]; break;
+ }
+ av1_cost_tokens_from_cdf(pcost->eob_cost[ctx], pcdf, NULL);
+ }
+ }
+ }
+ for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
+ for (int plane = 0; plane < nplanes; ++plane) {
+ LV_MAP_COEFF_COST *pcost = &coeff_costs->coeff_costs[tx_size][plane];
+
+ for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->txb_skip_cost[ctx],
+ fc->txb_skip_cdf[tx_size][ctx], NULL);
+
+ for (int ctx = 0; ctx < SIG_COEF_CONTEXTS_EOB; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->base_eob_cost[ctx],
+ fc->coeff_base_eob_cdf[tx_size][plane][ctx],
+ NULL);
+ for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->base_cost[ctx],
+ fc->coeff_base_cdf[tx_size][plane][ctx], NULL);
+
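+      // The extra entries 4..7 of base_cost[] hold incremental costs:
+      // entry 4 is 0, entry 5 is the extra cost of level 1 over level 0
+      // (plus one literal bit for the sign), and entries 6 and 7 are the
+      // extra costs of levels 2 and 3 over the previous level. These are
+      // used when coefficient optimization re-costs a +/-1 level change
+      // incrementally.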
+ for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) {
+ pcost->base_cost[ctx][4] = 0;
+ pcost->base_cost[ctx][5] = pcost->base_cost[ctx][1] +
+ av1_cost_literal(1) -
+ pcost->base_cost[ctx][0];
+ pcost->base_cost[ctx][6] =
+ pcost->base_cost[ctx][2] - pcost->base_cost[ctx][1];
+ pcost->base_cost[ctx][7] =
+ pcost->base_cost[ctx][3] - pcost->base_cost[ctx][2];
+ }
+
+ for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx],
+ fc->eob_extra_cdf[tx_size][plane][ctx], NULL);
+
+ for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->dc_sign_cost[ctx],
+ fc->dc_sign_cdf[plane][ctx], NULL);
+
+ for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
+ int br_rate[BR_CDF_SIZE];
+ int prev_cost = 0;
+ int i, j;
+ av1_cost_tokens_from_cdf(
+ br_rate, fc->coeff_br_cdf[AOMMIN(tx_size, TX_32X32)][plane][ctx],
+ NULL);
+ for (i = 0; i < COEFF_BASE_RANGE; i += BR_CDF_SIZE - 1) {
+ for (j = 0; j < BR_CDF_SIZE - 1; j++) {
+ pcost->lps_cost[ctx][i + j] = prev_cost + br_rate[j];
+ }
+ prev_cost += br_rate[j];
+ }
+ pcost->lps_cost[ctx][i] = prev_cost;
+ }
+ for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
+ pcost->lps_cost[ctx][0 + COEFF_BASE_RANGE + 1] =
+ pcost->lps_cost[ctx][0];
+ for (int i = 1; i <= COEFF_BASE_RANGE; ++i) {
+ pcost->lps_cost[ctx][i + COEFF_BASE_RANGE + 1] =
+ pcost->lps_cost[ctx][i] - pcost->lps_cost[ctx][i - 1];
+ }
+ }
+ }
+ }
+}
+
+void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp,
+ MvCosts *mv_costs) {
+ // Avoid accessing 'mv_costs' when it is not allocated.
+ if (mv_costs == NULL) return;
+
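+  // Point each cost array at the middle of its allocation so that it can
+  // be indexed directly by signed mv components in [-MV_MAX, MV_MAX].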
+ mv_costs->nmv_cost[0] = &mv_costs->nmv_cost_alloc[0][MV_MAX];
+ mv_costs->nmv_cost[1] = &mv_costs->nmv_cost_alloc[1][MV_MAX];
+ mv_costs->nmv_cost_hp[0] = &mv_costs->nmv_cost_hp_alloc[0][MV_MAX];
+ mv_costs->nmv_cost_hp[1] = &mv_costs->nmv_cost_hp_alloc[1][MV_MAX];
+ if (integer_mv) {
+ mv_costs->mv_cost_stack = (int **)&mv_costs->nmv_cost;
+ av1_build_nmv_cost_table(mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack,
+ nmvc, MV_SUBPEL_NONE);
+ } else {
+ mv_costs->mv_cost_stack =
+ usehp ? mv_costs->nmv_cost_hp : mv_costs->nmv_cost;
+ av1_build_nmv_cost_table(mv_costs->nmv_joint_cost, mv_costs->mv_cost_stack,
+ nmvc, usehp);
+ }
+}
+
+void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs) {
+ dv_costs->dv_costs[0] = &dv_costs->dv_costs_alloc[0][MV_MAX];
+ dv_costs->dv_costs[1] = &dv_costs->dv_costs_alloc[1][MV_MAX];
+ av1_build_nmv_cost_table(dv_costs->joint_mv, dv_costs->dv_costs, ndvc,
+ MV_SUBPEL_NONE);
+}
+
+// Populates speed features based on codec control settings (of type
+// COST_UPDATE_TYPE) and expected speed feature settings (of type
+// INTERNAL_COST_UPDATE_TYPE) by considering the least frequent cost update.
+// The populated/updated speed features are used for cost updates in the
+// encoder.
+// WARNING: The population of the unified cost update frequency must be
+// revisited if the enums COST_UPDATE_TYPE / INTERNAL_COST_UPDATE_TYPE are
+// modified or extended.
+static INLINE void populate_unified_cost_update_freq(
+ const CostUpdateFreq cost_upd_freq, SPEED_FEATURES *const sf) {
+ INTER_MODE_SPEED_FEATURES *const inter_sf = &sf->inter_sf;
+ // Mapping of entropy cost update frequency from the encoder's codec control
+ // settings of type COST_UPDATE_TYPE to speed features of type
+ // INTERNAL_COST_UPDATE_TYPE.
+ static const INTERNAL_COST_UPDATE_TYPE
+ map_cost_upd_to_internal_cost_upd[NUM_COST_UPDATE_TYPES] = {
+ INTERNAL_COST_UPD_SB, INTERNAL_COST_UPD_SBROW, INTERNAL_COST_UPD_TILE,
+ INTERNAL_COST_UPD_OFF
+ };
+
+ inter_sf->mv_cost_upd_level =
+ AOMMIN(inter_sf->mv_cost_upd_level,
+ map_cost_upd_to_internal_cost_upd[cost_upd_freq.mv]);
+ inter_sf->coeff_cost_upd_level =
+ AOMMIN(inter_sf->coeff_cost_upd_level,
+ map_cost_upd_to_internal_cost_upd[cost_upd_freq.coeff]);
+ inter_sf->mode_cost_upd_level =
+ AOMMIN(inter_sf->mode_cost_upd_level,
+ map_cost_upd_to_internal_cost_upd[cost_upd_freq.mode]);
+ sf->intra_sf.dv_cost_upd_level =
+ AOMMIN(sf->intra_sf.dv_cost_upd_level,
+ map_cost_upd_to_internal_cost_upd[cost_upd_freq.dv]);
+}
+
+// Checks whether entropy costs should be initialized/updated at the frame
+// level.
+static INLINE int is_frame_level_cost_upd_freq_set(
+ const AV1_COMMON *const cm, const INTERNAL_COST_UPDATE_TYPE cost_upd_level,
+ const int use_nonrd_pick_mode, const int frames_since_key) {
+ const int fill_costs =
+ frame_is_intra_only(cm) ||
+ (use_nonrd_pick_mode ? frames_since_key < 2
+ : (cm->current_frame.frame_number & 0x07) == 1);
+ return ((!use_nonrd_pick_mode && cost_upd_level != INTERNAL_COST_UPD_OFF) ||
+ cost_upd_level == INTERNAL_COST_UPD_TILE || fill_costs);
+}
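+
+// Under nonrd pick mode the function above returns true on intra-only
+// frames, for the first two frames after a key frame, or when the update
+// level is INTERNAL_COST_UPD_TILE; under the rd path it returns true
+// whenever the update level is not INTERNAL_COST_UPD_OFF, and otherwise on
+// intra-only frames or when (frame_number & 0x07) == 1.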
+
+// Decide whether we want to update the mode entropy cost for the current
+// frame. The logic is currently inherited from selective_disable_cdf_rtc.
+static AOM_INLINE int should_force_mode_cost_update(const AV1_COMP *cpi) {
+ const REAL_TIME_SPEED_FEATURES *const rt_sf = &cpi->sf.rt_sf;
+ if (!rt_sf->frame_level_mode_cost_update) {
+ return false;
+ }
+
+ if (cpi->oxcf.algo_cfg.cdf_update_mode == 2) {
+ return cpi->frames_since_last_update == 1;
+ } else if (cpi->oxcf.algo_cfg.cdf_update_mode == 1) {
+ if (cpi->svc.number_spatial_layers == 1 &&
+ cpi->svc.number_temporal_layers == 1) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+
+ return frame_is_intra_only(cm) || is_frame_resize_pending(cpi) ||
+ rc->high_source_sad || rc->frames_since_key < 10 ||
+ cpi->cyclic_refresh->counter_encode_maxq_scene_change < 10 ||
+ cm->current_frame.frame_number % 8 == 0;
+ } else if (cpi->svc.number_temporal_layers > 1) {
+ return cpi->svc.temporal_layer_id != cpi->svc.number_temporal_layers - 1;
+ }
+ }
+
+ return false;
+}
+
+void av1_initialize_rd_consts(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ RD_OPT *const rd = &cpi->rd;
+ int use_nonrd_pick_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
+ int frames_since_key = cpi->rc.frames_since_key;
+
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+ const int qindex_rdmult =
+ cm->quant_params.base_qindex + cm->quant_params.y_dc_delta_q;
+ rd->RDMULT = av1_compute_rd_mult(
+ qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+#if CONFIG_RD_COMMAND
+ if (cpi->oxcf.pass == 2) {
+ const RD_COMMAND *rd_command = &cpi->rd_command;
+ if (rd_command->option_ls[rd_command->frame_index] ==
+ RD_OPTION_SET_Q_RDMULT) {
+ rd->RDMULT = rd_command->rdmult_ls[rd_command->frame_index];
+ }
+ }
+#endif // CONFIG_RD_COMMAND
+
+ av1_set_error_per_bit(&x->errorperbit, rd->RDMULT);
+
+ set_block_thresholds(cm, rd, cpi->sf.rt_sf.use_nonrd_pick_mode);
+
+ populate_unified_cost_update_freq(cpi->oxcf.cost_upd_freq, sf);
+ const INTER_MODE_SPEED_FEATURES *const inter_sf = &cpi->sf.inter_sf;
+ // Frame level mv cost update
+ if (is_frame_level_cost_upd_freq_set(cm, inter_sf->mv_cost_upd_level,
+ use_nonrd_pick_mode, frames_since_key))
+ av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv,
+ cm->features.allow_high_precision_mv, x->mv_costs);
+
+ // Frame level coefficient cost update
+ if (is_frame_level_cost_upd_freq_set(cm, inter_sf->coeff_cost_upd_level,
+ use_nonrd_pick_mode, frames_since_key))
+ av1_fill_coeff_costs(&x->coeff_costs, cm->fc, av1_num_planes(cm));
+
+ // Frame level mode cost update
+ if (should_force_mode_cost_update(cpi) ||
+ is_frame_level_cost_upd_freq_set(cm, inter_sf->mode_cost_upd_level,
+ use_nonrd_pick_mode, frames_since_key))
+ av1_fill_mode_rates(cm, &x->mode_costs, cm->fc);
+
+ // Frame level dv cost update
+ if (av1_need_dv_costs(cpi)) {
+ if (cpi->td.dv_costs_alloc == NULL) {
+ CHECK_MEM_ERROR(
+ cm, cpi->td.dv_costs_alloc,
+ (IntraBCMVCosts *)aom_malloc(sizeof(*cpi->td.dv_costs_alloc)));
+ cpi->td.mb.dv_costs = cpi->td.dv_costs_alloc;
+ }
+ av1_fill_dv_costs(&cm->fc->ndvc, x->dv_costs);
+ }
+}
+
+static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
+ // NOTE: The tables below must be of the same size.
+
+  // The functions described below are sampled at the four most significant
+  // bits of (x^2 + 8 / 256).
+
+ // Normalized rate:
+ // This table models the rate for a Laplacian source with given variance
+ // when quantized with a uniform quantizer with given stepsize. The
+ // closed form expression is:
+ // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+ // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
+ // and H(x) is the binary entropy function.
+ static const int rate_tab_q10[] = {
+ 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142,
+ 4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186,
+ 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353,
+ 2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651,
+ 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963,
+ 911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424,
+ 395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87,
+ 73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6,
+ 5, 3, 2, 1, 1, 1, 0, 0,
+ };
+ // Normalized distortion:
+ // This table models the normalized distortion for a Laplacian source
+ // with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expression is:
+ // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+ // where x = qpstep / sqrt(variance).
+ // Note the actual distortion is Dn * variance.
+ static const int dist_tab_q10[] = {
+ 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5,
+ 5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17,
+ 18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54,
+ 59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142,
+ 151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351,
+ 375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659,
+ 680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936,
+ 949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017,
+ 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
+ };
+ static const int xsq_iq_q10[] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32,
+ 40, 48, 56, 64, 72, 80, 88, 96, 112,
+ 128, 144, 160, 176, 192, 208, 224, 256, 288,
+ 320, 352, 384, 416, 448, 480, 544, 608, 672,
+ 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504,
+ 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296,
+ 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136,
+ 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328,
+ 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736,
+ 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696,
+ 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808,
+ 180192, 196576, 212960, 229344, 245728,
+ };
+ const int tmp = (xsq_q10 >> 2) + 8;
+ const int k = get_msb(tmp) - 3;
+ const int xq = (k << 3) + ((tmp >> k) & 0x7);
+ const int one_q10 = 1 << 10;
+ const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
+ const int b_q10 = one_q10 - a_q10;
+ *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
+ *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
+}
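+
+// Worked example for the interpolation above: xsq_q10 = 1024 (x^2 = 1.0 in
+// Q10) gives tmp = (1024 >> 2) + 8 = 264, k = get_msb(264) - 3 = 5 and
+// xq = (5 << 3) + ((264 >> 5) & 0x7) = 40, so the rate and distortion are
+// interpolated between table entries 40 and 41 with weight
+// a_q10 = ((1024 - 992) << 10) >> 7 = 256, i.e. one quarter of the way
+// from xsq_iq_q10[40] = 992 towards xsq_iq_q10[41] = 1120.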
+
+void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2,
+ unsigned int qstep, int *rate,
+ int64_t *dist) {
+ // This function models the rate and distortion for a Laplacian
+ // source with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expressions are in:
+ // Hang and Chen, "Source Model for transform video coder and its
+ // application - Part I: Fundamental Theory", IEEE Trans. Circ.
+ // Sys. for Video Tech., April 1997.
+ if (var == 0) {
+ *rate = 0;
+ *dist = 0;
+ } else {
+ int d_q10, r_q10;
+ static const uint32_t MAX_XSQ_Q10 = 245727;
+ const uint64_t xsq_q10_64 =
+ (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
+ const int xsq_q10 = (int)AOMMIN(xsq_q10_64, MAX_XSQ_Q10);
+ model_rd_norm(xsq_q10, &r_q10, &d_q10);
+ *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - AV1_PROB_COST_SHIFT);
+ *dist = (var * (int64_t)d_q10 + 512) >> 10;
+ }
+}
+
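+// Catmull-Rom cubic interpolation through the four samples p[0..3],
+// evaluated at x in [0, 1] between p[1] and p[2].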
+static double interp_cubic(const double *p, double x) {
+ return p[1] + 0.5 * x *
+ (p[2] - p[0] +
+ x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] +
+ x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
+}
+
+/*
+static double interp_bicubic(const double *p, int p_stride, double x,
+ double y) {
+ double q[4];
+ q[0] = interp_cubic(p, x);
+ q[1] = interp_cubic(p + p_stride, x);
+ q[2] = interp_cubic(p + 2 * p_stride, x);
+ q[3] = interp_cubic(p + 3 * p_stride, x);
+ return interp_cubic(q, y);
+}
+*/
+
+static const uint8_t bsize_curvfit_model_cat_lookup[BLOCK_SIZES_ALL] = {
+ 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 1, 1, 2, 2, 3, 3
+};
+
+static int sse_norm_curvfit_model_cat_lookup(double sse_norm) {
+ return (sse_norm > 16.0);
+}
+
+// Models distortion by sse using a logistic function on
+// l = log2(sse / q^2) as:
+// dbysse = 16 / (1 + k exp(l + c))
+static double get_dbysse_logistic(double l, double c, double k) {
+ const double A = 16.0;
+ const double dbysse = A / (1 + k * exp(l + c));
+ return dbysse;
+}
+
+// Models rate using a clamped linear function on
+// l = log2(sse / q^2) as:
+// rate = max(0, a + b * l)
+static double get_rate_clamplinear(double l, double a, double b) {
+ const double rate = a + b * l;
+ return (rate < 0 ? 0 : rate);
+}
+
+static const uint8_t bsize_surffit_model_cat_lookup[BLOCK_SIZES_ALL] = {
+ 0, 0, 0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 0, 0, 2, 2, 4, 4
+};
+
+static const double surffit_rate_params[9][4] = {
+ {
+ 638.390212,
+ 2.253108,
+ 166.585650,
+ -3.939401,
+ },
+ {
+ 5.256905,
+ 81.997240,
+ -1.321771,
+ 17.694216,
+ },
+ {
+ -74.193045,
+ 72.431868,
+ -19.033152,
+ 15.407276,
+ },
+ {
+ 416.770113,
+ 14.794188,
+ 167.686830,
+ -6.997756,
+ },
+ {
+ 378.511276,
+ 9.558376,
+ 154.658843,
+ -6.635663,
+ },
+ {
+ 277.818787,
+ 4.413180,
+ 150.317637,
+ -9.893038,
+ },
+ {
+ 142.212132,
+ 11.542038,
+ 94.393964,
+ -5.518517,
+ },
+ {
+ 219.100256,
+ 4.007421,
+ 108.932852,
+ -6.981310,
+ },
+ {
+ 222.261971,
+ 3.251049,
+ 95.972916,
+ -5.609789,
+ },
+};
+
+static const double surffit_dist_params[7] = { 1.475844, 4.328362, -5.680233,
+ -0.500994, 0.554585, 4.839478,
+ -0.695837 };
+
+static void rate_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm,
+ double *rpar) {
+ const int cat = bsize_surffit_model_cat_lookup[bsize];
+ rpar[0] = surffit_rate_params[cat][0] + surffit_rate_params[cat][1] * xm;
+ rpar[1] = surffit_rate_params[cat][2] + surffit_rate_params[cat][3] * xm;
+}
+
+static void dist_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm,
+ double *dpar) {
+ (void)bsize;
+ const double *params = surffit_dist_params;
+ dpar[0] = params[0] + params[1] / (1 + exp((xm + params[2]) * params[3]));
+ dpar[1] = params[4] + params[5] * exp(params[6] * xm);
+}
+
+void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm,
+ double yl, double *rate_f, double *distbysse_f) {
+ (void)sse_norm;
+ double rpar[2], dpar[2];
+ rate_surffit_model_params_lookup(bsize, xm, rpar);
+ dist_surffit_model_params_lookup(bsize, xm, dpar);
+
+ *rate_f = get_rate_clamplinear(yl, rpar[0], rpar[1]);
+ *distbysse_f = get_dbysse_logistic(yl, dpar[0], dpar[1]);
+}
+
+static const double interp_rgrid_curv[4][65] = {
+ {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 118.257702, 120.210658, 121.434853, 122.100487,
+ 122.377758, 122.436865, 72.290102, 96.974289, 101.652727,
+ 126.830141, 140.417377, 157.644879, 184.315291, 215.823873,
+ 262.300169, 335.919859, 420.624173, 519.185032, 619.854243,
+ 726.053595, 827.663369, 933.127475, 1037.988755, 1138.839609,
+ 1233.342933, 1333.508064, 1428.760126, 1533.396364, 1616.952052,
+ 1744.539319, 1803.413586, 1951.466618, 1994.227838, 2086.031680,
+ 2148.635443, 2239.068450, 2222.590637, 2338.859809, 2402.929011,
+ 2418.727875, 2435.342670, 2471.159469, 2523.187446, 2591.183827,
+ 2674.905840, 2774.110714, 2888.555675, 3017.997952, 3162.194773,
+ 3320.903365, 3493.880956, 3680.884773, 3881.672045, 4096.000000,
+ },
+ {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 13.087244, 15.919735, 25.930313, 24.412411,
+ 28.567417, 29.924194, 30.857010, 32.742979, 36.382570,
+ 39.210386, 42.265690, 47.378572, 57.014850, 82.740067,
+ 137.346562, 219.968084, 316.781856, 415.643773, 516.706538,
+ 614.914364, 714.303763, 815.512135, 911.210485, 1008.501528,
+ 1109.787854, 1213.772279, 1322.922561, 1414.752579, 1510.505641,
+ 1615.741888, 1697.989032, 1780.123933, 1847.453790, 1913.742309,
+ 1960.828122, 2047.500168, 2085.454095, 2129.230668, 2158.171824,
+ 2182.231724, 2217.684864, 2269.589211, 2337.264824, 2420.618694,
+ 2519.557814, 2633.989178, 2763.819779, 2908.956609, 3069.306660,
+ 3244.776927, 3435.274401, 3640.706076, 3860.978945, 4096.000000,
+ },
+ {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 4.656893, 5.123633, 5.594132, 6.162376,
+ 6.918433, 7.768444, 8.739415, 10.105862, 11.477328,
+ 13.236604, 15.421030, 19.093623, 25.801871, 46.724612,
+ 98.841054, 181.113466, 272.586364, 359.499769, 445.546343,
+ 525.944439, 605.188743, 681.793483, 756.668359, 838.486885,
+ 926.950356, 1015.482542, 1113.353926, 1204.897193, 1288.871992,
+ 1373.464145, 1455.746628, 1527.796460, 1588.475066, 1658.144771,
+ 1710.302500, 1807.563351, 1863.197608, 1927.281616, 1964.450872,
+ 2022.719898, 2100.041145, 2185.205712, 2280.993936, 2387.616216,
+ 2505.282950, 2634.204540, 2774.591385, 2926.653884, 3090.602436,
+ 3266.647443, 3454.999303, 3655.868416, 3869.465182, 4096.000000,
+ },
+ {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.337370, 0.391916, 0.468839, 0.566334,
+ 0.762564, 1.069225, 1.384361, 1.787581, 2.293948,
+ 3.251909, 4.412991, 8.050068, 11.606073, 27.668092,
+ 65.227758, 128.463938, 202.097653, 262.715851, 312.464873,
+ 355.601398, 400.609054, 447.201352, 495.761568, 552.871938,
+ 619.067625, 691.984883, 773.753288, 860.628503, 946.262808,
+ 1019.805896, 1106.061360, 1178.422145, 1244.852258, 1302.173987,
+ 1399.650266, 1548.092912, 1545.928652, 1670.817500, 1694.523823,
+ 1779.195362, 1882.155494, 1990.662097, 2108.325181, 2235.456119,
+ 2372.366287, 2519.367059, 2676.769812, 2844.885918, 3024.026754,
+ 3214.503695, 3416.628115, 3630.711389, 3857.064892, 4096.000000,
+ },
+};
+
+static const double interp_dgrid_curv[3][65] = {
+ {
+ 16.000000, 15.962891, 15.925174, 15.886888, 15.848074, 15.808770,
+ 15.769015, 15.728850, 15.688313, 15.647445, 15.606284, 15.564870,
+ 15.525918, 15.483820, 15.373330, 15.126844, 14.637442, 14.184387,
+ 13.560070, 12.880717, 12.165995, 11.378144, 10.438769, 9.130790,
+ 7.487633, 5.688649, 4.267515, 3.196300, 2.434201, 1.834064,
+ 1.369920, 1.035921, 0.775279, 0.574895, 0.427232, 0.314123,
+ 0.233236, 0.171440, 0.128188, 0.092762, 0.067569, 0.049324,
+ 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733,
+ 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848,
+ 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550,
+ 0.000348, 0.000193, 0.000085, 0.000021, 0.000000,
+ },
+ {
+ 16.000000, 15.996116, 15.984769, 15.966413, 15.941505, 15.910501,
+ 15.873856, 15.832026, 15.785466, 15.734633, 15.679981, 15.621967,
+ 15.560961, 15.460157, 15.288367, 15.052462, 14.466922, 13.921212,
+ 13.073692, 12.222005, 11.237799, 9.985848, 8.898823, 7.423519,
+ 5.995325, 4.773152, 3.744032, 2.938217, 2.294526, 1.762412,
+ 1.327145, 1.020728, 0.765535, 0.570548, 0.425833, 0.313825,
+ 0.232959, 0.171324, 0.128174, 0.092750, 0.067558, 0.049319,
+ 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733,
+ 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848,
+ 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550,
+ 0.000348, 0.000193, 0.000085, 0.000021, -0.000000,
+ },
+};
+
+void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
+ double *rate_f, double *distbysse_f) {
+ const double x_start = -15.5;
+ const double x_end = 16.5;
+ const double x_step = 0.5;
+ const double epsilon = 1e-6;
+ const int rcat = bsize_curvfit_model_cat_lookup[bsize];
+ const int dcat = sse_norm_curvfit_model_cat_lookup(sse_norm);
+
+ xqr = AOMMAX(xqr, x_start + x_step + epsilon);
+ xqr = AOMMIN(xqr, x_end - x_step - epsilon);
+ const double x = (xqr - x_start) / x_step;
+ const int xi = (int)floor(x);
+ const double xo = x - xi;
+
+ assert(xi > 0);
+
+ const double *prate = &interp_rgrid_curv[rcat][(xi - 1)];
+ *rate_f = interp_cubic(prate, xo);
+ const double *pdist = &interp_dgrid_curv[dcat][(xi - 1)];
+ *distbysse_f = interp_cubic(pdist, xo);
+}
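+
+// For example, xqr = 0.0 maps to x = (0.0 - (-15.5)) / 0.5 = 31.0, so
+// xi = 31, xo = 0.0 and the cubic interpolation reads grid entries 30..33,
+// returning entry 31 exactly.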
+
+static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) {
+ const int num_4x4_w = mi_size_wide[plane_bsize];
+ const int num_4x4_h = mi_size_high[plane_bsize];
+ const ENTROPY_CONTEXT *const above = pd->above_entropy_context;
+ const ENTROPY_CONTEXT *const left = pd->left_entropy_context;
+
+ memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+ memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+}
+
+void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) {
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ get_entropy_contexts_plane(plane_bsize, pd, t_above, t_left);
+}
+
+// Special clamping used in the encoder when calculating a prediction
+//
+// Logically, all pixel fetches used for prediction are clamped against the
+// edges of the frame. But doing this directly is slow, so instead we allocate
+// a finite border around the frame and fill it with copies of the outermost
+// pixels.
+//
+// Since this border is finite, we need to clamp the motion vector before
+// prediction in order to avoid out-of-bounds reads. At the same time, this
+// clamp must not change the prediction result.
+//
+// We can balance both of these concerns by calculating how far we would have
+// to go in each direction before the extended prediction region (the current
+// block + AOM_INTERP_EXTEND many pixels around the block) would be mapped
+// so that it touches the frame only at one row or column. This is a special
+// point because any more extreme MV will always lead to the same prediction.
+// So it is safe to clamp at that point.
+//
+// In the worst case, this requires a border of
+// max_block_width + 2*AOM_INTERP_EXTEND = 128 + 2*4 = 136 pixels
+// around the frame edges.
+static INLINE void enc_clamp_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ MV *mv) {
+ int bw = xd->width << MI_SIZE_LOG2;
+ int bh = xd->height << MI_SIZE_LOG2;
+
+ int px_to_left_edge = xd->mi_col << MI_SIZE_LOG2;
+ int px_to_right_edge = (cm->mi_params.mi_cols - xd->mi_col) << MI_SIZE_LOG2;
+ int px_to_top_edge = xd->mi_row << MI_SIZE_LOG2;
+ int px_to_bottom_edge = (cm->mi_params.mi_rows - xd->mi_row) << MI_SIZE_LOG2;
+
+ const SubpelMvLimits mv_limits = {
+ .col_min = -GET_MV_SUBPEL(px_to_left_edge + bw + AOM_INTERP_EXTEND),
+ .col_max = GET_MV_SUBPEL(px_to_right_edge + AOM_INTERP_EXTEND),
+ .row_min = -GET_MV_SUBPEL(px_to_top_edge + bh + AOM_INTERP_EXTEND),
+ .row_max = GET_MV_SUBPEL(px_to_bottom_edge + AOM_INTERP_EXTEND)
+ };
+ clamp_mv(mv, &mv_limits);
+}
+
+void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
+ int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
+ const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME };
+ const int_mv ref_mv =
+ av1_get_ref_mv_from_stack(0, ref_frames, 0, &x->mbmi_ext);
+ const int_mv ref_mv1 =
+ av1_get_ref_mv_from_stack(0, ref_frames, 1, &x->mbmi_ext);
+ MV pred_mv[MAX_MV_REF_CANDIDATES + 1];
+ int num_mv_refs = 0;
+ pred_mv[num_mv_refs++] = ref_mv.as_mv;
+ if (ref_mv.as_int != ref_mv1.as_int) {
+ pred_mv[num_mv_refs++] = ref_mv1.as_mv;
+ }
+
+ assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));
+
+ const uint8_t *const src_y_ptr = x->plane[0].src.buf;
+ int zero_seen = 0;
+ int best_sad = INT_MAX;
+ int max_mv = 0;
+ // Get the sad for each candidate reference mv.
+ for (int i = 0; i < num_mv_refs; ++i) {
+ MV *this_mv = &pred_mv[i];
+ enc_clamp_mv(&cpi->common, &x->e_mbd, this_mv);
+
+ const int fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
+ const int fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
+ max_mv = AOMMAX(max_mv, AOMMAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
+
+ if (fp_row == 0 && fp_col == 0 && zero_seen) continue;
+ zero_seen |= (fp_row == 0 && fp_col == 0);
+
+ const uint8_t *const ref_y_ptr =
+ &ref_y_buffer[ref_y_stride * fp_row + fp_col];
+ // Find sad for current vector.
+ const int this_sad = cpi->ppi->fn_ptr[block_size].sdf(
+ src_y_ptr, x->plane[0].src.stride, ref_y_ptr, ref_y_stride);
+ // Note if it is the best so far.
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ }
+ if (i == 0)
+ x->pred_mv0_sad[ref_frame] = this_sad;
+ else if (i == 1)
+ x->pred_mv1_sad[ref_frame] = this_sad;
+ }
+
+  // Record the largest candidate mv magnitude and the best sad found
+  // across the candidate reference mvs.
+ x->max_mv_context[ref_frame] = max_mv;
+ x->pred_mv_sad[ref_frame] = best_sad;
+}
+
+void av1_setup_pred_block(const MACROBLOCKD *xd,
+ struct buf_2d dst[MAX_MB_PLANE],
+ const YV12_BUFFER_CONFIG *src,
+ const struct scale_factors *scale,
+ const struct scale_factors *scale_uv,
+ const int num_planes) {
+ dst[0].buf = src->y_buffer;
+ dst[0].stride = src->y_stride;
+ dst[1].buf = src->u_buffer;
+ dst[2].buf = src->v_buffer;
+ dst[1].stride = dst[2].stride = src->uv_stride;
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ for (int i = 0; i < num_planes; ++i) {
+ setup_pred_plane(dst + i, xd->mi[0]->bsize, dst[i].buf,
+ i ? src->uv_crop_width : src->y_crop_width,
+ i ? src->uv_crop_height : src->y_crop_height,
+ dst[i].stride, mi_row, mi_col, i ? scale_uv : scale,
+ xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
+ }
+}
+
+YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi,
+ int ref_frame) {
+ assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME);
+ RefCntBuffer *const scaled_buf = cpi->scaled_ref_buf[ref_frame - 1];
+ const RefCntBuffer *const ref_buf =
+ get_ref_frame_buf(&cpi->common, ref_frame);
+ return (scaled_buf != ref_buf && scaled_buf != NULL) ? &scaled_buf->buf
+ : NULL;
+}
+
+int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd,
+ InterpFilter interp_filter, int dual_filter) {
+ if (interp_filter == SWITCHABLE) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ int inter_filter_cost = 0;
+ for (int dir = 0; dir < 2; ++dir) {
+ if (dir && !dual_filter) break;
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ const InterpFilter filter =
+ av1_extract_interp_filter(mbmi->interp_filters, dir);
+ inter_filter_cost += x->mode_costs.switchable_interp_costs[ctx][filter];
+ }
+ return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
+ } else {
+ return 0;
+ }
+}
+
+void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
+ RD_OPT *const rd = &cpi->rd;
+
+ // Set baseline threshold values.
+ av1_zero(rd->thresh_mult);
+
+ rd->thresh_mult[THR_NEARESTMV] = 300;
+ rd->thresh_mult[THR_NEARESTL2] = 300;
+ rd->thresh_mult[THR_NEARESTL3] = 300;
+ rd->thresh_mult[THR_NEARESTB] = 300;
+ rd->thresh_mult[THR_NEARESTA2] = 300;
+ rd->thresh_mult[THR_NEARESTA] = 300;
+ rd->thresh_mult[THR_NEARESTG] = 300;
+
+ rd->thresh_mult[THR_NEWMV] = 1000;
+ rd->thresh_mult[THR_NEWL2] = 1000;
+ rd->thresh_mult[THR_NEWL3] = 1000;
+ rd->thresh_mult[THR_NEWB] = 1000;
+ rd->thresh_mult[THR_NEWA2] = 1100;
+ rd->thresh_mult[THR_NEWA] = 1000;
+ rd->thresh_mult[THR_NEWG] = 1000;
+
+ rd->thresh_mult[THR_NEARMV] = 1000;
+ rd->thresh_mult[THR_NEARL2] = 1000;
+ rd->thresh_mult[THR_NEARL3] = 1000;
+ rd->thresh_mult[THR_NEARB] = 1000;
+ rd->thresh_mult[THR_NEARA2] = 1000;
+ rd->thresh_mult[THR_NEARA] = 1000;
+ rd->thresh_mult[THR_NEARG] = 1000;
+
+ rd->thresh_mult[THR_GLOBALMV] = 2200;
+ rd->thresh_mult[THR_GLOBALL2] = 2000;
+ rd->thresh_mult[THR_GLOBALL3] = 2000;
+ rd->thresh_mult[THR_GLOBALB] = 2400;
+ rd->thresh_mult[THR_GLOBALA2] = 2000;
+ rd->thresh_mult[THR_GLOBALG] = 2000;
+ rd->thresh_mult[THR_GLOBALA] = 2400;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] = 1100;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] = 800;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] = 900;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA2] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A2] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A2] = 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA2] = 1000;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] = 2000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] = 2000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] = 2000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] = 2000;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLA] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLA] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLA] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLA] = 1530;
+ rd->thresh_mult[THR_COMP_NEW_NEARLA] = 1870;
+ rd->thresh_mult[THR_COMP_NEW_NEWLA] = 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] = 2750;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2A] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2A] = 1870;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2A] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2A] = 1800;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3A] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3A] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3A] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3A] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] = 3000;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARGA] = 1320;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGA] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGA] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGA] = 2040;
+ rd->thresh_mult[THR_COMP_NEW_NEARGA] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGA] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] = 2250;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLB] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLB] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLB] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLB] = 1360;
+ rd->thresh_mult[THR_COMP_NEW_NEARLB] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWLB] = 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] = 2250;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2B] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2B] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2B] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2B] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2B] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3B] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3B] = 1870;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3B] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3B] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARGB] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGB] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGB] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGB] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARGB] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGB] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLA2] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] = 1800;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLA2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARLA2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWLA2] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA2] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2A2] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2A2] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2A2] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2A2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2A2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2A2] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] = 1440;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3A2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3A2] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A2] = 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARGA2] = 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGA2] = 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGA2] = 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGA2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARGA2] = 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGA2] = 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] = 2750;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLL2] = 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] = 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] = 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLL2] = 2640;
+ rd->thresh_mult[THR_COMP_NEW_NEARLL2] = 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEWLL2] = 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] = 3200;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLL3] = 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] = 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] = 1800;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLL3] = 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEARLL3] = 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEWLL3] = 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] = 3200;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLG] = 1760;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLG] = 2400;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLG] = 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLG] = 1760;
+ rd->thresh_mult[THR_COMP_NEW_NEARLG] = 2640;
+ rd->thresh_mult[THR_COMP_NEW_NEWLG] = 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] = 3200;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARBA] = 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWBA] = 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTBA] = 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWBA] = 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEARBA] = 1980;
+ rd->thresh_mult[THR_COMP_NEW_NEWBA] = 2640;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] = 3200;
+
+ rd->thresh_mult[THR_DC] = 1000;
+ rd->thresh_mult[THR_PAETH] = 1000;
+ rd->thresh_mult[THR_SMOOTH] = 2200;
+ rd->thresh_mult[THR_SMOOTH_V] = 2000;
+ rd->thresh_mult[THR_SMOOTH_H] = 2000;
+ rd->thresh_mult[THR_H_PRED] = 2000;
+ rd->thresh_mult[THR_V_PRED] = 1800;
+ rd->thresh_mult[THR_D135_PRED] = 2500;
+ rd->thresh_mult[THR_D203_PRED] = 2000;
+ rd->thresh_mult[THR_D157_PRED] = 2500;
+ rd->thresh_mult[THR_D67_PRED] = 2000;
+ rd->thresh_mult[THR_D113_PRED] = 2500;
+ rd->thresh_mult[THR_D45_PRED] = 2500;
+}
+
+static INLINE void update_thr_fact(int (*factor_buf)[MAX_MODES],
+ THR_MODES best_mode_index,
+ THR_MODES mode_start, THR_MODES mode_end,
+ BLOCK_SIZE min_size, BLOCK_SIZE max_size,
+ int max_rd_thresh_factor) {
+ for (THR_MODES mode = mode_start; mode < mode_end; ++mode) {
+ for (BLOCK_SIZE bs = min_size; bs <= max_size; ++bs) {
+ int *const fact = &factor_buf[bs][mode];
+ if (mode == best_mode_index) {
+ *fact -= (*fact >> RD_THRESH_LOG_DEC_FACTOR);
+ } else {
+ *fact = AOMMIN(*fact + RD_THRESH_INC, max_rd_thresh_factor);
+ }
+ }
+ }
+}
+
+void av1_update_rd_thresh_fact(
+ const AV1_COMMON *const cm, int (*factor_buf)[MAX_MODES],
+ int use_adaptive_rd_thresh, BLOCK_SIZE bsize, THR_MODES best_mode_index,
+ THR_MODES inter_mode_start, THR_MODES inter_mode_end,
+ THR_MODES intra_mode_start, THR_MODES intra_mode_end) {
+ assert(use_adaptive_rd_thresh > 0);
+ const int max_rd_thresh_factor = use_adaptive_rd_thresh * RD_THRESH_MAX_FACT;
+
+ const int bsize_is_1_to_4 = bsize > cm->seq_params->sb_size;
+ BLOCK_SIZE min_size, max_size;
+ if (bsize_is_1_to_4) {
+ // This part handles block sizes with 1:4 and 4:1 aspect ratios
+ // TODO(any): Experiment with threshold update for parent/child blocks
+ min_size = bsize;
+ max_size = bsize;
+ } else {
+ min_size = AOMMAX(bsize - 2, BLOCK_4X4);
+ max_size = AOMMIN(bsize + 2, (int)cm->seq_params->sb_size);
+ }
+
+ update_thr_fact(factor_buf, best_mode_index, inter_mode_start, inter_mode_end,
+ min_size, max_size, max_rd_thresh_factor);
+ update_thr_fact(factor_buf, best_mode_index, intra_mode_start, intra_mode_end,
+ min_size, max_size, max_rd_thresh_factor);
+}
+
+int av1_get_intra_cost_penalty(int qindex, int qdelta,
+ aom_bit_depth_t bit_depth) {
+ const int q = av1_dc_quant_QTX(qindex, qdelta, bit_depth);
+ switch (bit_depth) {
+ case AOM_BITS_8: return 20 * q;
+ case AOM_BITS_10: return 5 * q;
+ case AOM_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2);
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+}
diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h
new file mode 100644
index 0000000000..b38d9ca542
--- /dev/null
+++ b/third_party/aom/av1/encoder/rd.h
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RD_H_
+#define AOM_AV1_ENCODER_RD_H_
+
+#include <limits.h>
+
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RDDIV_BITS 7
+#define RD_EPB_SHIFT 6
+
+#define RDCOST(RM, R, D) \
+ (ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \
+ ((D) * (1 << RDDIV_BITS)))
+
+#define RDCOST_NEG_R(RM, R, D) \
+ (((D) * (1 << RDDIV_BITS)) - \
+ ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT))
+
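+// RDCOST() evaluates round(R * RM / 2^AV1_PROB_COST_SHIFT) + D * 128.
+// Since rates R are stored in units of 2^AV1_PROB_COST_SHIFT per bit, this
+// amounts to rate_in_bits * RM + D * (1 << RDDIV_BITS), i.e. the rd
+// multiplier RM acts as the Lagrangian lambda scaled up by 2^RDDIV_BITS.
+// RDCOST_NEG_R() is the same with the rate term subtracted; it is used via
+// av1_calculate_rd_cost() when a relative rate is negative.
+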
+#define RDCOST_DBL_WITH_NATIVE_BD_DIST(RM, R, D, BD) \
+ (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \
+ ((double)((D) >> (2 * (BD - 8))) * (1 << RDDIV_BITS)))
+
+#define QIDX_SKIP_THRESH 115
+
+#define MV_COST_WEIGHT 108
+#define MV_COST_WEIGHT_SUB 120
+
+// The fractional part of the rd_thresh factor is stored with 5 bits. The
+// maximum factor allowed is two, which is stored as 2 ** (5 + 1) = 64.
+#define RD_THRESH_FAC_FRAC_BITS (5)
+#define RD_THRESH_FAC_FRAC_VAL (1 << (RD_THRESH_FAC_FRAC_BITS))
+#define RD_THRESH_MAX_FACT ((RD_THRESH_FAC_FRAC_VAL) << 1)
+#define RD_THRESH_LOG_DEC_FACTOR (4)
+#define RD_THRESH_INC (1)
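+// For example, each per-mode factor starts at RD_THRESH_FAC_FRAC_VAL
+// (= 32, representing 1.0); the winning mode's factor decays by
+// factor >> RD_THRESH_LOG_DEC_FACTOR while every other factor grows by
+// RD_THRESH_INC, saturating at RD_THRESH_MAX_FACT (= 64, i.e. 2.0). The
+// scaled threshold tested in rd_less_than_thresh() is
+// thresh * factor >> 5.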
+
+// Factor to weigh the rate for switchable interp filters.
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
+// Macros for common video resolutions: width x height
+// For example, 720p represents video resolution of 1280x720 pixels.
+#define RESOLUTION_288P (352 * 288)
+#define RESOLUTION_360P (640 * 360)
+#define RESOLUTION_480P (640 * 480)
+#define RESOLUTION_720P (1280 * 720)
+#define RESOLUTION_1080P (1920 * 1080)
+#define RESOLUTION_1440P (2560 * 1440)
+#define RESOLUTION_4K (3840 * 2160)
+
+#define RTC_REFS 4
+static const MV_REFERENCE_FRAME real_time_ref_combos[RTC_REFS][2] = {
+ { LAST_FRAME, NONE_FRAME },
+ { ALTREF_FRAME, NONE_FRAME },
+ { GOLDEN_FRAME, NONE_FRAME },
+ { INTRA_FRAME, NONE_FRAME }
+};
+
+static INLINE int mode_offset(const PREDICTION_MODE mode) {
+ if (mode >= NEARESTMV) {
+ return INTER_OFFSET(mode);
+ } else {
+ switch (mode) {
+ case DC_PRED: return 0;
+ case V_PRED: return 1;
+ case H_PRED: return 2;
+ case SMOOTH_PRED: return 3;
+ default: assert(0); return -1;
+ }
+ }
+}
+
+enum {
+ // Default initialization when we are not using winner mode framework. e.g.
+ // intrabc
+ DEFAULT_EVAL = 0,
+ // Initialization for selecting winner mode
+ MODE_EVAL,
+ // Initialization for winner mode evaluation
+ WINNER_MODE_EVAL,
+ // All mode evaluation types
+ MODE_EVAL_TYPES,
+} UENUM1BYTE(MODE_EVAL_TYPE);
+
+typedef struct RD_OPT {
+ // Thresh_mult is used to set a threshold for the rd score. A higher value
+ // means that we will accept the best mode so far more often. This number
+ // is used in combination with the current block size, and thresh_freq_fact
+ // to pick a threshold.
+ int thresh_mult[MAX_MODES];
+
+ int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES];
+
+ int RDMULT;
+
+ double r0;
+} RD_OPT;
+
+static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) {
+#if CONFIG_RD_DEBUG
+ int plane;
+#endif
+ rd_stats->rate = 0;
+ rd_stats->dist = 0;
+ rd_stats->rdcost = 0;
+ rd_stats->sse = 0;
+ rd_stats->skip_txfm = 1;
+ rd_stats->zero_rate = 0;
+#if CONFIG_RD_DEBUG
+ // This may run into problems when monochrome video is
+ // encoded, as there will only be 1 plane
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ rd_stats->txb_coeff_cost[plane] = 0;
+ }
+#endif
+}
+
+static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) {
+#if CONFIG_RD_DEBUG
+ int plane;
+#endif
+ rd_stats->rate = INT_MAX;
+ rd_stats->dist = INT64_MAX;
+ rd_stats->rdcost = INT64_MAX;
+ rd_stats->sse = INT64_MAX;
+ rd_stats->skip_txfm = 0;
+ rd_stats->zero_rate = 0;
+#if CONFIG_RD_DEBUG
+ // This may run into problems when monochrome video is
+ // encoded, as there will only be 1 plane
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ rd_stats->txb_coeff_cost[plane] = INT_MAX;
+ }
+#endif
+}
+
+static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
+ const RD_STATS *rd_stats_src) {
+ if (rd_stats_dst->rate == INT_MAX || rd_stats_src->rate == INT_MAX) {
+ // If rd_stats_dst or rd_stats_src has invalid rate, we will make
+ // rd_stats_dst invalid.
+ av1_invalid_rd_stats(rd_stats_dst);
+ return;
+ }
+ rd_stats_dst->rate = (int)AOMMIN(
+ ((int64_t)rd_stats_dst->rate + (int64_t)rd_stats_src->rate), INT_MAX);
+ if (!rd_stats_dst->zero_rate)
+ rd_stats_dst->zero_rate = rd_stats_src->zero_rate;
+ rd_stats_dst->dist += rd_stats_src->dist;
+ if (rd_stats_dst->sse < INT64_MAX && rd_stats_src->sse < INT64_MAX) {
+ rd_stats_dst->sse += rd_stats_src->sse;
+ }
+ rd_stats_dst->skip_txfm &= rd_stats_src->skip_txfm;
+#if CONFIG_RD_DEBUG
+ // This may run into problems when monochrome video is
+ // encoded, as there will only be 1 plane
+ for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane];
+ }
+#endif
+}
+
+static INLINE void av1_accumulate_rd_stats(RD_STATS *rd_stats, int64_t dist,
+ int rate, int skip_txfm, int64_t sse,
+ int zero_rate) {
+ assert(rd_stats->rate != INT_MAX && rate != INT_MAX);
+ rd_stats->rate += rate;
+ if (!rd_stats->zero_rate) rd_stats->zero_rate = zero_rate;
+ rd_stats->dist += dist;
+ rd_stats->skip_txfm &= skip_txfm;
+ rd_stats->sse += sse;
+}
+
+static INLINE int64_t av1_calculate_rd_cost(int mult, int rate, int64_t dist) {
+ assert(mult >= 0);
+ if (rate >= 0) {
+ return RDCOST(mult, rate, dist);
+ }
+ return RDCOST_NEG_R(mult, -rate, dist);
+}
+
+static INLINE void av1_rd_cost_update(int mult, RD_STATS *rd_cost) {
+ if (rd_cost->rate < INT_MAX && rd_cost->dist < INT64_MAX &&
+ rd_cost->rdcost < INT64_MAX) {
+ rd_cost->rdcost = av1_calculate_rd_cost(mult, rd_cost->rate, rd_cost->dist);
+ } else {
+ av1_invalid_rd_stats(rd_cost);
+ }
+}
+
+static INLINE void av1_rd_stats_subtraction(int mult,
+ const RD_STATS *const left,
+ const RD_STATS *const right,
+ RD_STATS *result) {
+ if (left->rate == INT_MAX || right->rate == INT_MAX ||
+ left->dist == INT64_MAX || right->dist == INT64_MAX ||
+ left->rdcost == INT64_MAX || right->rdcost == INT64_MAX) {
+ av1_invalid_rd_stats(result);
+ } else {
+ result->rate = left->rate - right->rate;
+ result->dist = left->dist - right->dist;
+ result->rdcost = av1_calculate_rd_cost(mult, result->rate, result->dist);
+ }
+}
+
+struct TileInfo;
+struct TileDataEnc;
+struct AV1_COMP;
+struct macroblock;
+
+/*!\brief Compute rdmult based on q index and frame update type
+ *
+ * \param[in] bit_depth bit depth
+ * \param[in] update_type frame update type
+ * \param[in] qindex q index
+ *
+ * \return rdmult
+ */
+int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth,
+ FRAME_UPDATE_TYPE update_type,
+ int qindex);
+
+int av1_compute_rd_mult(const int qindex, const aom_bit_depth_t bit_depth,
+ const FRAME_UPDATE_TYPE update_type,
+ const int layer_depth, const int boost_index,
+ const FRAME_TYPE frame_type,
+ const int use_fixed_qp_offsets,
+ const int is_stat_consumption_stage);
+
+void av1_initialize_rd_consts(struct AV1_COMP *cpi);
+
+// Sets the multiplier to convert mv cost to l1 error during motion search.
+void av1_set_sad_per_bit(const struct AV1_COMP *cpi, int *sadperbit,
+ int qindex);
+
+void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2,
+                                  unsigned int qstep, int *rate,
+                                  int64_t *dist);
+
+void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
+ double *rate_f, double *distbysse_f);
+void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm,
+ double yl, double *rate_f, double *distbysse_f);
+
+int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd,
+ InterpFilter interp_filter, int dual_filter);
+
+YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi,
+ int ref_frame);
+
+void av1_init_me_luts(void);
+
+void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx);
+
+void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]);
+
+void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi);
+
+void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
+ int (*fact)[MAX_MODES], int rd_thresh,
+ BLOCK_SIZE bsize, THR_MODES best_mode_index,
+ THR_MODES inter_mode_start,
+ THR_MODES inter_mode_end,
+ THR_MODES intra_mode_start,
+ THR_MODES intra_mode_end);
+
+static INLINE void reset_thresh_freq_fact(MACROBLOCK *const x) {
+ for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ for (int j = 0; j < MAX_MODES; ++j) {
+ x->thresh_freq_fact[i][j] = RD_THRESH_FAC_FRAC_VAL;
+ }
+ }
+}
+
+static INLINE int rd_less_than_thresh(int64_t best_rd, int64_t thresh,
+ int thresh_fact) {
+ return best_rd < (thresh * thresh_fact >> 5) || thresh == INT_MAX;
+}
+
+void av1_mv_pred(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame,
+ BLOCK_SIZE block_size);
+
+// Sets the multiplier to convert mv cost to l2 error during motion search.
+static INLINE void av1_set_error_per_bit(int *errorperbit, int rdmult) {
+ *errorperbit = AOMMAX(rdmult >> RD_EPB_SHIFT, 1);
+}
+
+// Gets the thresholds for R-D optimization of coefficients, which depend on
+// the evaluation stage (default, mode decision, or winner mode processing)
+static INLINE void get_rd_opt_coeff_thresh(
+ const uint32_t (*const coeff_opt_threshold)[2],
+ TxfmSearchParams *txfm_params, int enable_winner_mode_for_coeff_opt,
+ int is_winner_mode) {
+ if (!enable_winner_mode_for_coeff_opt) {
+ // Default initialization of threshold
+ txfm_params->coeff_opt_thresholds[0] = coeff_opt_threshold[DEFAULT_EVAL][0];
+ txfm_params->coeff_opt_thresholds[1] = coeff_opt_threshold[DEFAULT_EVAL][1];
+ return;
+ }
+ // TODO(any): Experiment with coeff_opt_dist_threshold values when
+ // enable_winner_mode_for_coeff_opt is ON
+ // TODO(any): Skip the winner mode processing for blocks with lower residual
+ // energy as R-D optimization of coefficients would have been enabled during
+ // mode decision
+
+ // Use conservative threshold during mode decision and perform R-D
+ // optimization of coeffs always for winner modes
+ if (is_winner_mode) {
+ txfm_params->coeff_opt_thresholds[0] =
+ coeff_opt_threshold[WINNER_MODE_EVAL][0];
+ txfm_params->coeff_opt_thresholds[1] =
+ coeff_opt_threshold[WINNER_MODE_EVAL][1];
+ } else {
+ txfm_params->coeff_opt_thresholds[0] = coeff_opt_threshold[MODE_EVAL][0];
+ txfm_params->coeff_opt_thresholds[1] = coeff_opt_threshold[MODE_EVAL][1];
+ }
+}
+
+// Used to reset the state of mb rd hash information
+static INLINE void reset_mb_rd_record(MB_RD_RECORD *const mb_rd_record) {
+ if (!mb_rd_record) return;
+
+ // Reset the state for use_mb_rd_hash
+ mb_rd_record->num = mb_rd_record->index_start = 0;
+}
+
+void av1_setup_pred_block(const MACROBLOCKD *xd,
+ struct buf_2d dst[MAX_MB_PLANE],
+ const YV12_BUFFER_CONFIG *src,
+ const struct scale_factors *scale,
+ const struct scale_factors *scale_uv,
+ const int num_planes);
+
+int av1_get_intra_cost_penalty(int qindex, int qdelta,
+ aom_bit_depth_t bit_depth);
+
+void av1_fill_mode_rates(AV1_COMMON *const cm, ModeCosts *mode_costs,
+ FRAME_CONTEXT *fc);
+
+void av1_fill_lr_rates(ModeCosts *mode_costs, FRAME_CONTEXT *fc);
+
+void av1_fill_coeff_costs(CoeffCosts *coeff_costs, FRAME_CONTEXT *fc,
+ const int num_planes);
+
+void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp,
+ MvCosts *mv_costs);
+
+void av1_fill_dv_costs(const nmv_context *ndvc, IntraBCMVCosts *dv_costs);
+
+int av1_get_adaptive_rdmult(const struct AV1_COMP *cpi, double beta);
+
+int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta);
+
+/*!\brief Adjust current superblock's q_index based on delta q resolution
+ *
+ * \param[in] delta_q_res delta q resolution
+ * \param[in] prev_qindex previous superblock's q index
+ * \param[in] curr_qindex current superblock's q index
+ *
+ * \return the current superblock's adjusted q_index
+ */
+int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex,
+ int curr_qindex);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RD_H_
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c
new file mode 100644
index 0000000000..c17fbccf8c
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt.c
@@ -0,0 +1,6598 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdbool.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/cfl.h"
+#include "av1/common/blockd.h"
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/txb_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/compound_type.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/interp_search.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/mode_prune_model_weights.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/pustats.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/tx_search.h"
+#include "av1/encoder/var_based_part.h"
+
+#define LAST_NEW_MV_INDEX 6
+
+// Mode_threshold multiplication factor table for prune_inter_modes_if_skippable
+// The values are kept in Q12 format and the equation used to derive them is
+// (2.5 - ((float)x->qindex / MAXQ) * 1.5); see the illustrative sketch after
+// the table.
+#define MODE_THRESH_QBITS 12
+static const int mode_threshold_mul_factor[QINDEX_RANGE] = {
+ 10240, 10216, 10192, 10168, 10144, 10120, 10095, 10071, 10047, 10023, 9999,
+ 9975, 9951, 9927, 9903, 9879, 9854, 9830, 9806, 9782, 9758, 9734,
+ 9710, 9686, 9662, 9638, 9614, 9589, 9565, 9541, 9517, 9493, 9469,
+ 9445, 9421, 9397, 9373, 9349, 9324, 9300, 9276, 9252, 9228, 9204,
+ 9180, 9156, 9132, 9108, 9083, 9059, 9035, 9011, 8987, 8963, 8939,
+ 8915, 8891, 8867, 8843, 8818, 8794, 8770, 8746, 8722, 8698, 8674,
+ 8650, 8626, 8602, 8578, 8553, 8529, 8505, 8481, 8457, 8433, 8409,
+ 8385, 8361, 8337, 8312, 8288, 8264, 8240, 8216, 8192, 8168, 8144,
+ 8120, 8096, 8072, 8047, 8023, 7999, 7975, 7951, 7927, 7903, 7879,
+ 7855, 7831, 7806, 7782, 7758, 7734, 7710, 7686, 7662, 7638, 7614,
+ 7590, 7566, 7541, 7517, 7493, 7469, 7445, 7421, 7397, 7373, 7349,
+ 7325, 7301, 7276, 7252, 7228, 7204, 7180, 7156, 7132, 7108, 7084,
+ 7060, 7035, 7011, 6987, 6963, 6939, 6915, 6891, 6867, 6843, 6819,
+ 6795, 6770, 6746, 6722, 6698, 6674, 6650, 6626, 6602, 6578, 6554,
+ 6530, 6505, 6481, 6457, 6433, 6409, 6385, 6361, 6337, 6313, 6289,
+ 6264, 6240, 6216, 6192, 6168, 6144, 6120, 6096, 6072, 6048, 6024,
+ 5999, 5975, 5951, 5927, 5903, 5879, 5855, 5831, 5807, 5783, 5758,
+ 5734, 5710, 5686, 5662, 5638, 5614, 5590, 5566, 5542, 5518, 5493,
+ 5469, 5445, 5421, 5397, 5373, 5349, 5325, 5301, 5277, 5253, 5228,
+ 5204, 5180, 5156, 5132, 5108, 5084, 5060, 5036, 5012, 4987, 4963,
+ 4939, 4915, 4891, 4867, 4843, 4819, 4795, 4771, 4747, 4722, 4698,
+ 4674, 4650, 4626, 4602, 4578, 4554, 4530, 4506, 4482, 4457, 4433,
+ 4409, 4385, 4361, 4337, 4313, 4289, 4265, 4241, 4216, 4192, 4168,
+ 4144, 4120, 4096
+};
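+
+// Illustrative sketch only (not used by the encoder): regenerating an entry
+// of the table above from the formula stated in the comment. For example,
+// qindex 0 yields (int)round(2.5 * 4096) = 10240 and qindex MAXQ yields 4096,
+// matching the first and last entries.
+static INLINE int mode_threshold_mul_factor_entry(int qindex) {
+  const double factor = 2.5 - ((double)qindex / MAXQ) * 1.5;
+  return (int)round(factor * (1 << MODE_THRESH_QBITS));
+}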
+
+static const THR_MODES av1_default_mode_order[MAX_MODES] = {
+ THR_NEARESTMV,
+ THR_NEARESTL2,
+ THR_NEARESTL3,
+ THR_NEARESTB,
+ THR_NEARESTA2,
+ THR_NEARESTA,
+ THR_NEARESTG,
+
+ THR_NEWMV,
+ THR_NEWL2,
+ THR_NEWL3,
+ THR_NEWB,
+ THR_NEWA2,
+ THR_NEWA,
+ THR_NEWG,
+
+ THR_NEARMV,
+ THR_NEARL2,
+ THR_NEARL3,
+ THR_NEARB,
+ THR_NEARA2,
+ THR_NEARA,
+ THR_NEARG,
+
+ THR_GLOBALMV,
+ THR_GLOBALL2,
+ THR_GLOBALL3,
+ THR_GLOBALB,
+ THR_GLOBALA2,
+ THR_GLOBALA,
+ THR_GLOBALG,
+
+ THR_COMP_NEAREST_NEARESTLA,
+ THR_COMP_NEAREST_NEARESTL2A,
+ THR_COMP_NEAREST_NEARESTL3A,
+ THR_COMP_NEAREST_NEARESTGA,
+ THR_COMP_NEAREST_NEARESTLB,
+ THR_COMP_NEAREST_NEARESTL2B,
+ THR_COMP_NEAREST_NEARESTL3B,
+ THR_COMP_NEAREST_NEARESTGB,
+ THR_COMP_NEAREST_NEARESTLA2,
+ THR_COMP_NEAREST_NEARESTL2A2,
+ THR_COMP_NEAREST_NEARESTL3A2,
+ THR_COMP_NEAREST_NEARESTGA2,
+ THR_COMP_NEAREST_NEARESTLL2,
+ THR_COMP_NEAREST_NEARESTLL3,
+ THR_COMP_NEAREST_NEARESTLG,
+ THR_COMP_NEAREST_NEARESTBA,
+
+ THR_COMP_NEAR_NEARLB,
+ THR_COMP_NEW_NEWLB,
+ THR_COMP_NEW_NEARESTLB,
+ THR_COMP_NEAREST_NEWLB,
+ THR_COMP_NEW_NEARLB,
+ THR_COMP_NEAR_NEWLB,
+ THR_COMP_GLOBAL_GLOBALLB,
+
+ THR_COMP_NEAR_NEARLA,
+ THR_COMP_NEW_NEWLA,
+ THR_COMP_NEW_NEARESTLA,
+ THR_COMP_NEAREST_NEWLA,
+ THR_COMP_NEW_NEARLA,
+ THR_COMP_NEAR_NEWLA,
+ THR_COMP_GLOBAL_GLOBALLA,
+
+ THR_COMP_NEAR_NEARL2A,
+ THR_COMP_NEW_NEWL2A,
+ THR_COMP_NEW_NEARESTL2A,
+ THR_COMP_NEAREST_NEWL2A,
+ THR_COMP_NEW_NEARL2A,
+ THR_COMP_NEAR_NEWL2A,
+ THR_COMP_GLOBAL_GLOBALL2A,
+
+ THR_COMP_NEAR_NEARL3A,
+ THR_COMP_NEW_NEWL3A,
+ THR_COMP_NEW_NEARESTL3A,
+ THR_COMP_NEAREST_NEWL3A,
+ THR_COMP_NEW_NEARL3A,
+ THR_COMP_NEAR_NEWL3A,
+ THR_COMP_GLOBAL_GLOBALL3A,
+
+ THR_COMP_NEAR_NEARGA,
+ THR_COMP_NEW_NEWGA,
+ THR_COMP_NEW_NEARESTGA,
+ THR_COMP_NEAREST_NEWGA,
+ THR_COMP_NEW_NEARGA,
+ THR_COMP_NEAR_NEWGA,
+ THR_COMP_GLOBAL_GLOBALGA,
+
+ THR_COMP_NEAR_NEARL2B,
+ THR_COMP_NEW_NEWL2B,
+ THR_COMP_NEW_NEARESTL2B,
+ THR_COMP_NEAREST_NEWL2B,
+ THR_COMP_NEW_NEARL2B,
+ THR_COMP_NEAR_NEWL2B,
+ THR_COMP_GLOBAL_GLOBALL2B,
+
+ THR_COMP_NEAR_NEARL3B,
+ THR_COMP_NEW_NEWL3B,
+ THR_COMP_NEW_NEARESTL3B,
+ THR_COMP_NEAREST_NEWL3B,
+ THR_COMP_NEW_NEARL3B,
+ THR_COMP_NEAR_NEWL3B,
+ THR_COMP_GLOBAL_GLOBALL3B,
+
+ THR_COMP_NEAR_NEARGB,
+ THR_COMP_NEW_NEWGB,
+ THR_COMP_NEW_NEARESTGB,
+ THR_COMP_NEAREST_NEWGB,
+ THR_COMP_NEW_NEARGB,
+ THR_COMP_NEAR_NEWGB,
+ THR_COMP_GLOBAL_GLOBALGB,
+
+ THR_COMP_NEAR_NEARLA2,
+ THR_COMP_NEW_NEWLA2,
+ THR_COMP_NEW_NEARESTLA2,
+ THR_COMP_NEAREST_NEWLA2,
+ THR_COMP_NEW_NEARLA2,
+ THR_COMP_NEAR_NEWLA2,
+ THR_COMP_GLOBAL_GLOBALLA2,
+
+ THR_COMP_NEAR_NEARL2A2,
+ THR_COMP_NEW_NEWL2A2,
+ THR_COMP_NEW_NEARESTL2A2,
+ THR_COMP_NEAREST_NEWL2A2,
+ THR_COMP_NEW_NEARL2A2,
+ THR_COMP_NEAR_NEWL2A2,
+ THR_COMP_GLOBAL_GLOBALL2A2,
+
+ THR_COMP_NEAR_NEARL3A2,
+ THR_COMP_NEW_NEWL3A2,
+ THR_COMP_NEW_NEARESTL3A2,
+ THR_COMP_NEAREST_NEWL3A2,
+ THR_COMP_NEW_NEARL3A2,
+ THR_COMP_NEAR_NEWL3A2,
+ THR_COMP_GLOBAL_GLOBALL3A2,
+
+ THR_COMP_NEAR_NEARGA2,
+ THR_COMP_NEW_NEWGA2,
+ THR_COMP_NEW_NEARESTGA2,
+ THR_COMP_NEAREST_NEWGA2,
+ THR_COMP_NEW_NEARGA2,
+ THR_COMP_NEAR_NEWGA2,
+ THR_COMP_GLOBAL_GLOBALGA2,
+
+ THR_COMP_NEAR_NEARLL2,
+ THR_COMP_NEW_NEWLL2,
+ THR_COMP_NEW_NEARESTLL2,
+ THR_COMP_NEAREST_NEWLL2,
+ THR_COMP_NEW_NEARLL2,
+ THR_COMP_NEAR_NEWLL2,
+ THR_COMP_GLOBAL_GLOBALLL2,
+
+ THR_COMP_NEAR_NEARLL3,
+ THR_COMP_NEW_NEWLL3,
+ THR_COMP_NEW_NEARESTLL3,
+ THR_COMP_NEAREST_NEWLL3,
+ THR_COMP_NEW_NEARLL3,
+ THR_COMP_NEAR_NEWLL3,
+ THR_COMP_GLOBAL_GLOBALLL3,
+
+ THR_COMP_NEAR_NEARLG,
+ THR_COMP_NEW_NEWLG,
+ THR_COMP_NEW_NEARESTLG,
+ THR_COMP_NEAREST_NEWLG,
+ THR_COMP_NEW_NEARLG,
+ THR_COMP_NEAR_NEWLG,
+ THR_COMP_GLOBAL_GLOBALLG,
+
+ THR_COMP_NEAR_NEARBA,
+ THR_COMP_NEW_NEWBA,
+ THR_COMP_NEW_NEARESTBA,
+ THR_COMP_NEAREST_NEWBA,
+ THR_COMP_NEW_NEARBA,
+ THR_COMP_NEAR_NEWBA,
+ THR_COMP_GLOBAL_GLOBALBA,
+
+ THR_DC,
+ THR_PAETH,
+ THR_SMOOTH,
+ THR_SMOOTH_V,
+ THR_SMOOTH_H,
+ THR_H_PRED,
+ THR_V_PRED,
+ THR_D135_PRED,
+ THR_D203_PRED,
+ THR_D157_PRED,
+ THR_D67_PRED,
+ THR_D113_PRED,
+ THR_D45_PRED,
+};
+
+/*!\cond */
+typedef struct SingleInterModeState {
+ int64_t rd;
+ MV_REFERENCE_FRAME ref_frame;
+ int valid;
+} SingleInterModeState;
+
+typedef struct InterModeSearchState {
+ int64_t best_rd;
+ int64_t best_skip_rd[2];
+ MB_MODE_INFO best_mbmode;
+ int best_rate_y;
+ int best_rate_uv;
+ int best_mode_skippable;
+ int best_skip2;
+ THR_MODES best_mode_index;
+ int num_available_refs;
+ int64_t dist_refs[REF_FRAMES];
+ int dist_order_refs[REF_FRAMES];
+ int64_t mode_threshold[MAX_MODES];
+ int64_t best_intra_rd;
+ unsigned int best_pred_sse;
+
+ /*!
+ * \brief Keep track of best intra rd for use in compound mode.
+ */
+ int64_t best_pred_rd[REFERENCE_MODES];
+ // Save a set of single_newmv for each checked ref_mv.
+ int_mv single_newmv[MAX_REF_MV_SEARCH][REF_FRAMES];
+ int single_newmv_rate[MAX_REF_MV_SEARCH][REF_FRAMES];
+ int single_newmv_valid[MAX_REF_MV_SEARCH][REF_FRAMES];
+ int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES];
+ // The rd of simple translation in single inter modes
+ int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES];
+ int64_t best_single_rd[REF_FRAMES];
+ PREDICTION_MODE best_single_mode[REF_FRAMES];
+
+ // Single search results by [directions][modes][reference frames]
+ SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
+ int single_state_cnt[2][SINGLE_INTER_MODE_NUM];
+ SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM]
+ [FWD_REFS];
+ int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM];
+ MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
+ IntraModeSearchState intra_search_state;
+ RD_STATS best_y_rdcost;
+} InterModeSearchState;
+/*!\endcond */
+
+void av1_inter_mode_data_init(TileDataEnc *tile_data) {
+ for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ InterModeRdModel *md = &tile_data->inter_mode_rd_models[i];
+ md->ready = 0;
+ md->num = 0;
+ md->dist_sum = 0;
+ md->ld_sum = 0;
+ md->sse_sum = 0;
+ md->sse_sse_sum = 0;
+ md->sse_ld_sum = 0;
+ }
+}
+
+static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ int64_t sse, int *est_residue_cost,
+ int64_t *est_dist) {
+ const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ if (md->ready) {
+ if (sse < md->dist_mean) {
+ *est_residue_cost = 0;
+ *est_dist = sse;
+ } else {
+ *est_dist = (int64_t)round(md->dist_mean);
+ const double est_ld = md->a * sse + md->b;
+ // Clamp estimated rate cost by INT_MAX / 2.
+ // TODO(angiebird@google.com): find better solution than clamping.
+ if (fabs(est_ld) < 1e-2) {
+ *est_residue_cost = INT_MAX / 2;
+ } else {
+ double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld);
+ if (est_residue_cost_dbl < 0) {
+ *est_residue_cost = 0;
+ } else {
+ *est_residue_cost =
+ (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2);
+ }
+ }
+ if (*est_residue_cost <= 0) {
+ *est_residue_cost = 0;
+ *est_dist = sse;
+ }
+ }
+ return 1;
+ }
+ return 0;
+}
+
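+// The model fitted below is a simple linear regression of the per-block
+// ratio ld = (sse - dist) / residue_cost against sse:
+//   ld ~= a * sse + b, with
+//   a = (E[sse * ld] - E[sse] * E[ld]) / (E[sse^2] - E[sse]^2),
+//   b = E[ld] - a * E[sse].
+// Once the model is ready, new batches are blended into the running means
+// with the previous estimate weighted 3:1 over the new batch.
+// get_est_rate_dist() above inverts the model to estimate the residue rate
+// for a given sse without running a transform search.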
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) {
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ const int block_idx = inter_mode_data_block_idx(bsize);
+ InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ if (block_idx == -1) continue;
+ if ((md->ready == 0 && md->num < 200) || (md->ready == 1 && md->num < 64)) {
+ continue;
+ } else {
+ if (md->ready == 0) {
+ md->dist_mean = md->dist_sum / md->num;
+ md->ld_mean = md->ld_sum / md->num;
+ md->sse_mean = md->sse_sum / md->num;
+ md->sse_sse_mean = md->sse_sse_sum / md->num;
+ md->sse_ld_mean = md->sse_ld_sum / md->num;
+ } else {
+ const double factor = 3;
+ md->dist_mean =
+ (md->dist_mean * factor + (md->dist_sum / md->num)) / (factor + 1);
+ md->ld_mean =
+ (md->ld_mean * factor + (md->ld_sum / md->num)) / (factor + 1);
+ md->sse_mean =
+ (md->sse_mean * factor + (md->sse_sum / md->num)) / (factor + 1);
+ md->sse_sse_mean =
+ (md->sse_sse_mean * factor + (md->sse_sse_sum / md->num)) /
+ (factor + 1);
+ md->sse_ld_mean =
+ (md->sse_ld_mean * factor + (md->sse_ld_sum / md->num)) /
+ (factor + 1);
+ }
+
+ const double my = md->ld_mean;
+ const double mx = md->sse_mean;
+ const double dx = sqrt(md->sse_sse_mean);
+ const double dxy = md->sse_ld_mean;
+
+ md->a = (dxy - mx * my) / (dx * dx - mx * mx);
+ md->b = my - md->a * mx;
+ md->ready = 1;
+
+ md->num = 0;
+ md->dist_sum = 0;
+ md->ld_sum = 0;
+ md->sse_sum = 0;
+ md->sse_sse_sum = 0;
+ md->sse_ld_sum = 0;
+ }
+ (void)rdmult;
+ }
+}
+
+static AOM_INLINE void inter_mode_data_push(TileDataEnc *tile_data,
+ BLOCK_SIZE bsize, int64_t sse,
+ int64_t dist, int residue_cost) {
+ if (residue_cost == 0 || sse == dist) return;
+ const int block_idx = inter_mode_data_block_idx(bsize);
+ if (block_idx == -1) return;
+ InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize];
+ if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) {
+ const double ld = (sse - dist) * 1. / residue_cost;
+ ++rd_model->num;
+ rd_model->dist_sum += dist;
+ rd_model->ld_sum += ld;
+ rd_model->sse_sum += sse;
+ rd_model->sse_sse_sum += (double)sse * (double)sse;
+ rd_model->sse_ld_sum += sse * ld;
+ }
+}
+
+static AOM_INLINE void inter_modes_info_push(InterModesInfo *inter_modes_info,
+ int mode_rate, int64_t sse,
+ int64_t rd, RD_STATS *rd_cost,
+ RD_STATS *rd_cost_y,
+ RD_STATS *rd_cost_uv,
+ const MB_MODE_INFO *mbmi) {
+ const int num = inter_modes_info->num;
+ assert(num < MAX_INTER_MODES);
+ inter_modes_info->mbmi_arr[num] = *mbmi;
+ inter_modes_info->mode_rate_arr[num] = mode_rate;
+ inter_modes_info->sse_arr[num] = sse;
+ inter_modes_info->est_rd_arr[num] = rd;
+ inter_modes_info->rd_cost_arr[num] = *rd_cost;
+ inter_modes_info->rd_cost_y_arr[num] = *rd_cost_y;
+ inter_modes_info->rd_cost_uv_arr[num] = *rd_cost_uv;
+ ++inter_modes_info->num;
+}
+
+static int compare_rd_idx_pair(const void *a, const void *b) {
+  const RdIdxPair *pair_a = (const RdIdxPair *)a;
+  const RdIdxPair *pair_b = (const RdIdxPair *)b;
+  if (pair_a->rd == pair_b->rd) {
+    // To avoid inconsistency in qsort() ordering when two elements are equal,
+    // use idx as the tie-breaker. Refer aomedia:2928
+    if (pair_a->idx == pair_b->idx)
+      return 0;
+    else if (pair_a->idx > pair_b->idx)
+      return 1;
+    else
+      return -1;
+  } else if (pair_a->rd > pair_b->rd) {
+    return 1;
+  } else {
+    return -1;
+  }
+}
+
+static AOM_INLINE void inter_modes_info_sort(
+ const InterModesInfo *inter_modes_info, RdIdxPair *rd_idx_pair_arr) {
+ if (inter_modes_info->num == 0) {
+ return;
+ }
+ for (int i = 0; i < inter_modes_info->num; ++i) {
+ rd_idx_pair_arr[i].idx = i;
+ rd_idx_pair_arr[i].rd = inter_modes_info->est_rd_arr[i];
+ }
+ qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]),
+ compare_rd_idx_pair);
+}
+
+// Similar to get_horver_correlation, but also takes the first row/column into
+// account when computing the horizontal/vertical correlation.
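+// For each direction, the result is the Pearson correlation between a pixel x
+// and its left neighbor y (or top neighbor z) over all adjacent pairs:
+//   hcorr = cov(x, y) / sqrt(var(x) * var(y))
+// Negative correlations are clamped to 0, and 1.0 is returned when either
+// variance is degenerate.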
+void av1_get_horver_correlation_full_c(const int16_t *diff, int stride,
+ int width, int height, float *hcorr,
+ float *vcorr) {
+ // The following notation is used:
+ // x - current pixel
+ // y - left neighbor pixel
+ // z - top neighbor pixel
+ int64_t x_sum = 0, x2_sum = 0, xy_sum = 0, xz_sum = 0;
+ int64_t x_firstrow = 0, x_finalrow = 0, x_firstcol = 0, x_finalcol = 0;
+ int64_t x2_firstrow = 0, x2_finalrow = 0, x2_firstcol = 0, x2_finalcol = 0;
+
+ // First, process horizontal correlation on just the first row
+ x_sum += diff[0];
+ x2_sum += diff[0] * diff[0];
+ x_firstrow += diff[0];
+ x2_firstrow += diff[0] * diff[0];
+ for (int j = 1; j < width; ++j) {
+ const int16_t x = diff[j];
+ const int16_t y = diff[j - 1];
+ x_sum += x;
+ x_firstrow += x;
+ x2_sum += x * x;
+ x2_firstrow += x * x;
+ xy_sum += x * y;
+ }
+
+ // Process vertical correlation in the first column
+ x_firstcol += diff[0];
+ x2_firstcol += diff[0] * diff[0];
+ for (int i = 1; i < height; ++i) {
+ const int16_t x = diff[i * stride];
+ const int16_t z = diff[(i - 1) * stride];
+ x_sum += x;
+ x_firstcol += x;
+ x2_sum += x * x;
+ x2_firstcol += x * x;
+ xz_sum += x * z;
+ }
+
+  // Now process horizontal and vertical correlation through the rest of the
+  // unit
+ for (int i = 1; i < height; ++i) {
+ for (int j = 1; j < width; ++j) {
+ const int16_t x = diff[i * stride + j];
+ const int16_t y = diff[i * stride + j - 1];
+ const int16_t z = diff[(i - 1) * stride + j];
+ x_sum += x;
+ x2_sum += x * x;
+ xy_sum += x * y;
+ xz_sum += x * z;
+ }
+ }
+
+ for (int j = 0; j < width; ++j) {
+ x_finalrow += diff[(height - 1) * stride + j];
+ x2_finalrow +=
+ diff[(height - 1) * stride + j] * diff[(height - 1) * stride + j];
+ }
+ for (int i = 0; i < height; ++i) {
+ x_finalcol += diff[i * stride + width - 1];
+ x2_finalcol += diff[i * stride + width - 1] * diff[i * stride + width - 1];
+ }
+
+ int64_t xhor_sum = x_sum - x_finalcol;
+ int64_t xver_sum = x_sum - x_finalrow;
+ int64_t y_sum = x_sum - x_firstcol;
+ int64_t z_sum = x_sum - x_firstrow;
+ int64_t x2hor_sum = x2_sum - x2_finalcol;
+ int64_t x2ver_sum = x2_sum - x2_finalrow;
+ int64_t y2_sum = x2_sum - x2_firstcol;
+ int64_t z2_sum = x2_sum - x2_firstrow;
+
+ const float num_hor = (float)(height * (width - 1));
+ const float num_ver = (float)((height - 1) * width);
+
+ const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+ const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+
+ const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+ const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+
+ const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+ const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+ if (xhor_var_n > 0 && y_var_n > 0) {
+ *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+ *hcorr = *hcorr < 0 ? 0 : *hcorr;
+ } else {
+ *hcorr = 1.0;
+ }
+ if (xver_var_n > 0 && z_var_n > 0) {
+ *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+ *vcorr = *vcorr < 0 ? 0 : *vcorr;
+ } else {
+ *vcorr = 1.0;
+ }
+}
+
+static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x,
+ int64_t *sse_y) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ int64_t total_sse = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+ unsigned int sse;
+
+ cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, &sse);
+ total_sse += sse;
+ if (!plane && sse_y) *sse_y = sse;
+ }
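+  // Scale the pixel-domain SSE up to the distortion scale used by the RD
+  // code (cf. the matching sse_y << 4 at the call sites in motion_mode_rd()).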
+ total_sse <<= 4;
+ return total_sse;
+}
+
+int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
+
+ for (i = 0; i < block_size; i++) {
+ const int diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ sqcoeff += coeff[i] * coeff[i];
+ }
+
+ *ssz = sqcoeff;
+ return error;
+}
+
+int64_t av1_block_error_lp_c(const int16_t *coeff, const int16_t *dqcoeff,
+ intptr_t block_size) {
+ int64_t error = 0;
+
+ for (int i = 0; i < block_size; i++) {
+ const int diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ }
+
+ return error;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+int64_t av1_highbd_block_error_c(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff, intptr_t block_size,
+ int64_t *ssz, int bd) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
+ int shift = 2 * (bd - 8);
+ int rounding = shift > 0 ? 1 << (shift - 1) : 0;
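+  // The accumulated values are normalized back to the 8-bit scale below:
+  // e.g. for bd == 10, shift == 4, so error and sqcoeff are divided by 16
+  // with rounding.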
+
+ for (i = 0; i < block_size; i++) {
+ const int64_t diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
+#endif
+
+static int conditional_skipintra(PREDICTION_MODE mode,
+ PREDICTION_MODE best_intra_mode) {
+ if (mode == D113_PRED && best_intra_mode != V_PRED &&
+ best_intra_mode != D135_PRED)
+ return 1;
+ if (mode == D67_PRED && best_intra_mode != V_PRED &&
+ best_intra_mode != D45_PRED)
+ return 1;
+ if (mode == D203_PRED && best_intra_mode != H_PRED &&
+ best_intra_mode != D45_PRED)
+ return 1;
+ if (mode == D157_PRED && best_intra_mode != H_PRED &&
+ best_intra_mode != D135_PRED)
+ return 1;
+ return 0;
+}
+
+static int cost_mv_ref(const ModeCosts *const mode_costs, PREDICTION_MODE mode,
+ int16_t mode_context) {
+ if (is_inter_compound_mode(mode)) {
+ return mode_costs
+ ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
+ }
+
+ int mode_cost = 0;
+ int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+
+ assert(is_inter_mode(mode));
+
+ if (mode == NEWMV) {
+ mode_cost = mode_costs->newmv_mode_cost[mode_ctx][0];
+ return mode_cost;
+ } else {
+ mode_cost = mode_costs->newmv_mode_cost[mode_ctx][1];
+ mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+
+ if (mode == GLOBALMV) {
+ mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][0];
+ return mode_cost;
+ } else {
+ mode_cost += mode_costs->zeromv_mode_cost[mode_ctx][1];
+ mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ mode_cost += mode_costs->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
+ return mode_cost;
+ }
+ }
+}
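+
+// The single-reference branch of cost_mv_ref() walks a binary tree, consuming
+// a fresh context from mode_context at each level:
+//   NEWMV?    yes -> newmv_mode_cost[ctx][0]
+//             no  -> newmv_mode_cost[ctx][1], then
+//   GLOBALMV? yes -> + zeromv_mode_cost[ctx][0]
+//             no  -> + zeromv_mode_cost[ctx][1], then
+//   NEARESTMV vs NEARMV -> + refmv_mode_cost[ctx][mode != NEARESTMV]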
+
+static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode,
+ int ref_idx) {
+ return ref_idx ? compound_ref1_mode(this_mode)
+ : compound_ref0_mode(this_mode);
+}
+
+static AOM_INLINE void estimate_ref_frame_costs(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, const ModeCosts *mode_costs,
+ int segment_id, unsigned int *ref_costs_single,
+ unsigned int (*ref_costs_comp)[REF_FRAMES]) {
+ int seg_ref_active =
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+ if (seg_ref_active) {
+ memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single));
+ int ref_frame;
+ for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
+ memset(ref_costs_comp[ref_frame], 0,
+ REF_FRAMES * sizeof((*ref_costs_comp)[0]));
+ } else {
+ int intra_inter_ctx = av1_get_intra_inter_context(xd);
+ ref_costs_single[INTRA_FRAME] =
+ mode_costs->intra_inter_cost[intra_inter_ctx][0];
+ unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1];
+
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
+ ref_costs_single[i] = base_cost;
+
+ const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd);
+ const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd);
+ const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd);
+ const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd);
+ const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd);
+ const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd);
+
+ // Determine cost of a single ref frame, where frame types are represented
+ // by a tree:
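+    //   Level 0 (ctx_p1): forward (LAST/LAST2/LAST3/GOLDEN) vs
+    //                     backward (BWDREF/ALTREF2/ALTREF)
+    //   Level 1 (ctx_p3): LAST/LAST2 vs LAST3/GOLDEN
+    //           (ctx_p2): BWDREF/ALTREF2 vs ALTREF
+    //   Level 2 (ctx_p4): LAST vs LAST2, (ctx_p5): LAST3 vs GOLDEN,
+    //           (ctx_p6): BWDREF vs ALTREF2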
+ // Level 0: add cost whether this ref is a forward or backward ref
+ ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1];
+ ref_costs_single[ALTREF2_FRAME] +=
+ mode_costs->single_ref_cost[ctx_p1][0][1];
+ ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[ctx_p1][0][1];
+
+ // Level 1: if this ref is forward ref,
+ // add cost whether it is last/last2 or last3/golden
+ ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][0];
+ ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][0];
+ ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][1];
+ ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p3][2][1];
+
+ // Level 1: if this ref is backward ref
+ // then add cost whether this ref is altref or backward ref
+ ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][0];
+ ref_costs_single[ALTREF2_FRAME] +=
+ mode_costs->single_ref_cost[ctx_p2][1][0];
+ ref_costs_single[ALTREF_FRAME] += mode_costs->single_ref_cost[ctx_p2][1][1];
+
+ // Level 2: further add cost whether this ref is last or last2
+ ref_costs_single[LAST_FRAME] += mode_costs->single_ref_cost[ctx_p4][3][0];
+ ref_costs_single[LAST2_FRAME] += mode_costs->single_ref_cost[ctx_p4][3][1];
+
+ // Level 2: last3 or golden
+ ref_costs_single[LAST3_FRAME] += mode_costs->single_ref_cost[ctx_p5][4][0];
+ ref_costs_single[GOLDEN_FRAME] += mode_costs->single_ref_cost[ctx_p5][4][1];
+
+ // Level 2: bwdref or altref2
+ ref_costs_single[BWDREF_FRAME] += mode_costs->single_ref_cost[ctx_p6][5][0];
+ ref_costs_single[ALTREF2_FRAME] +=
+ mode_costs->single_ref_cost[ctx_p6][5][1];
+
+ if (cm->current_frame.reference_mode != SINGLE_REFERENCE) {
+ // Similar to single ref, determine cost of compound ref frames.
+ // cost_compound_refs = cost_first_ref + cost_second_ref
+ const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd);
+ const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd);
+ const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd);
+ const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd);
+ const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd);
+
+ const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd);
+ unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 };
+
+ ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] =
+ ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] =
+ base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1];
+ ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0;
+ ref_bicomp_costs[ALTREF_FRAME] = 0;
+
+ // cost of first ref frame
+ ref_bicomp_costs[LAST_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0];
+ ref_bicomp_costs[LAST2_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0];
+ ref_bicomp_costs[LAST3_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1];
+ ref_bicomp_costs[GOLDEN_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1];
+
+ ref_bicomp_costs[LAST_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][0];
+ ref_bicomp_costs[LAST2_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][1];
+
+ ref_bicomp_costs[LAST3_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][0];
+ ref_bicomp_costs[GOLDEN_FRAME] +=
+ mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][1];
+
+ // cost of second ref frame
+ ref_bicomp_costs[BWDREF_FRAME] +=
+ mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+ ref_bicomp_costs[ALTREF2_FRAME] +=
+ mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+ ref_bicomp_costs[ALTREF_FRAME] +=
+ mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][1];
+
+ ref_bicomp_costs[BWDREF_FRAME] +=
+ mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0];
+ ref_bicomp_costs[ALTREF2_FRAME] +=
+ mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1];
+
+ // cost: if one ref frame is forward ref, the other ref is backward ref
+ int ref0, ref1;
+ for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
+ for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) {
+ ref_costs_comp[ref0][ref1] =
+ ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1];
+ }
+ }
+
+      // cost: if both ref frames are on the same side.
+ const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd);
+ const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd);
+ const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd);
+ ref_costs_comp[LAST_FRAME][LAST2_FRAME] =
+ base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0];
+ ref_costs_comp[LAST_FRAME][LAST3_FRAME] =
+ base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0];
+ ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] =
+ base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1];
+ ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] =
+ base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1];
+ } else {
+ int ref0, ref1;
+ for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
+ for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1)
+ ref_costs_comp[ref0][ref1] = 512;
+ }
+ ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512;
+ ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512;
+ ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512;
+ ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512;
+ }
+ }
+}
+
+static AOM_INLINE void store_coding_context(
+#if CONFIG_INTERNAL_STATS
+ MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index,
+#else
+ MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+#endif // CONFIG_INTERNAL_STATS
+ int skippable) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ // Take a snapshot of the coding context so it can be
+ // restored if we decide to encode this way
+ ctx->rd_stats.skip_txfm = x->txfm_search_info.skip_txfm;
+ ctx->skippable = skippable;
+#if CONFIG_INTERNAL_STATS
+ ctx->best_mode_index = mode_index;
+#endif // CONFIG_INTERNAL_STATS
+ ctx->mic = *xd->mi[0];
+ av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext,
+ av1_ref_frame_type(xd->mi[0]->ref_frame));
+}
+
+static AOM_INLINE void setup_buffer_ref_mvs_inter(
+ const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+ BLOCK_SIZE block_size, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref_frame);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, ref_frame);
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame);
+ assert(yv12 != NULL);
+
+ if (scaled_ref_frame) {
+ // Setup pred block based on scaled reference, because av1_mv_pred() doesn't
+ // support scaling.
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], scaled_ref_frame, NULL, NULL,
+ num_planes);
+ } else {
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes);
+ }
+
+ // Gets an initial list of candidate vectors from neighbours and orders them
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+ mbmi_ext->mode_context);
+ // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+ // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
+  // Encoder-side-only further refinement: test the top few candidates in full
+  // and choose the best one as the center point for subsequent searches.
+  // The current implementation doesn't support scaling.
+ av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12_mb[ref_frame][0].stride,
+ ref_frame, block_size);
+
+ // Go back to unscaled reference.
+ if (scaled_ref_frame) {
+ // We had temporarily setup pred block based on scaled reference above. Go
+ // back to unscaled reference now, for subsequent use.
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes);
+ }
+}
+
+#define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
+#define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
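+
+// The margins are expressed in 1/8-pel units (hence the << 3), matching the
+// units of the xd->mb_to_*_edge values used in clamp_mv2() below.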
+
+// TODO(jingning): this mv clamping function should be block size dependent.
+static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
+ const SubpelMvLimits mv_limits = { xd->mb_to_left_edge - LEFT_TOP_MARGIN,
+ xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+ xd->mb_to_top_edge - LEFT_TOP_MARGIN,
+ xd->mb_to_bottom_edge +
+ RIGHT_BOTTOM_MARGIN };
+ clamp_mv(mv, &mv_limits);
+}
+
+/* If the current mode produces the same mv as an already-searched mode with a
+ * lower cost, skip the current mode. */
+static int skip_repeated_mv(const AV1_COMMON *const cm,
+ const MACROBLOCK *const x,
+ PREDICTION_MODE this_mode,
+ const MV_REFERENCE_FRAME ref_frames[2],
+ InterModeSearchState *search_state) {
+ const int is_comp_pred = ref_frames[1] > INTRA_FRAME;
+ const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+ PREDICTION_MODE compare_mode = MB_MODE_COUNT;
+ if (!is_comp_pred) {
+ if (this_mode == NEARMV) {
+ if (ref_mv_count == 0) {
+ // NEARMV has the same motion vector as NEARESTMV
+ compare_mode = NEARESTMV;
+ }
+ if (ref_mv_count == 1 &&
+ cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+ // NEARMV has the same motion vector as GLOBALMV
+ compare_mode = GLOBALMV;
+ }
+ }
+ if (this_mode == GLOBALMV) {
+ if (ref_mv_count == 0 &&
+ cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+ // GLOBALMV has the same motion vector as NEARESTMV
+ compare_mode = NEARESTMV;
+ }
+ if (ref_mv_count == 1) {
+ // GLOBALMV has the same motion vector as NEARMV
+ compare_mode = NEARMV;
+ }
+ }
+
+ if (compare_mode != MB_MODE_COUNT) {
+ // Use modelled_rd to check whether compare mode was searched
+ if (search_state->modelled_rd[compare_mode][0][ref_frames[0]] !=
+ INT64_MAX) {
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames);
+ const int compare_cost =
+ cost_mv_ref(&x->mode_costs, compare_mode, mode_ctx);
+ const int this_cost = cost_mv_ref(&x->mode_costs, this_mode, mode_ctx);
+
+ // Only skip if the mode cost is larger than compare mode cost
+ if (this_cost > compare_cost) {
+ search_state->modelled_rd[this_mode][0][ref_frames[0]] =
+ search_state->modelled_rd[compare_mode][0][ref_frames[0]];
+ return 1;
+ }
+ }
+ }
+ }
+ return 0;
+}
+
+static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv,
+ const AV1_COMMON *cm,
+ const MACROBLOCK *x) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ *out_mv = in_mv;
+ lower_mv_precision(&out_mv->as_mv, cm->features.allow_high_precision_mv,
+ cm->features.cur_frame_force_integer_mv);
+ clamp_mv2(&out_mv->as_mv, xd);
+ return av1_is_fullmv_in_range(&x->mv_limits,
+ get_fullmv_from_mv(&out_mv->as_mv));
+}
+
+// To use a single newmv directly for compound modes, the mv needs to be
+// clamped to the valid mv range. Without this, the encoder can generate
+// out-of-range mvs, as has been seen in 8k encoding.
+static INLINE void clamp_mv_in_range(MACROBLOCK *const x, int_mv *mv,
+ int ref_idx) {
+ const int_mv ref_mv = av1_get_ref_mv(x, ref_idx);
+ SubpelMvLimits mv_limits;
+
+ av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits, &ref_mv.as_mv);
+ clamp_mv(&mv->as_mv, &mv_limits);
+}
+
+static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, int_mv *cur_mv,
+ int *const rate_mv, HandleInterModeArgs *const args,
+ inter_mode_info *mode_info) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_comp_pred = has_second_ref(mbmi);
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ const int refs[2] = { mbmi->ref_frame[0],
+ mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
+ const int ref_mv_idx = mbmi->ref_mv_idx;
+
+ if (is_comp_pred) {
+ const int valid_mv0 = args->single_newmv_valid[ref_mv_idx][refs[0]];
+ const int valid_mv1 = args->single_newmv_valid[ref_mv_idx][refs[1]];
+ if (this_mode == NEW_NEWMV) {
+ if (valid_mv0) {
+ cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
+ clamp_mv_in_range(x, &cur_mv[0], 0);
+ }
+ if (valid_mv1) {
+ cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
+ clamp_mv_in_range(x, &cur_mv[1], 1);
+ }
+ *rate_mv = 0;
+ for (int i = 0; i < 2; ++i) {
+ const int_mv ref_mv = av1_get_ref_mv(x, i);
+ *rate_mv += av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv,
+ x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ }
+ } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+ if (valid_mv1) {
+ cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
+ clamp_mv_in_range(x, &cur_mv[1], 1);
+ }
+ const int_mv ref_mv = av1_get_ref_mv(x, 1);
+ *rate_mv = av1_mv_bit_cost(&cur_mv[1].as_mv, &ref_mv.as_mv,
+ x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ } else {
+ assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV);
+ if (valid_mv0) {
+ cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
+ clamp_mv_in_range(x, &cur_mv[0], 0);
+ }
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ *rate_mv = av1_mv_bit_cost(&cur_mv[0].as_mv, &ref_mv.as_mv,
+ x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ }
+ } else {
+ // Single ref case.
+ const int ref_idx = 0;
+ int search_range = INT_MAX;
+
+ if (cpi->sf.mv_sf.reduce_search_range && mbmi->ref_mv_idx > 0) {
+ const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;
+ int min_mv_diff = INT_MAX;
+ int best_match = -1;
+ MV prev_ref_mv[2] = { { 0 } };
+ for (int idx = 0; idx < mbmi->ref_mv_idx; ++idx) {
+ prev_ref_mv[idx] = av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame,
+ idx, &x->mbmi_ext)
+ .as_mv;
+ const int ref_mv_diff = AOMMAX(abs(ref_mv.row - prev_ref_mv[idx].row),
+ abs(ref_mv.col - prev_ref_mv[idx].col));
+
+ if (min_mv_diff > ref_mv_diff) {
+ min_mv_diff = ref_mv_diff;
+ best_match = idx;
+ }
+ }
+
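+      // Derive a reduced full-pel search range when the current ref_mv lies
+      // within 16 pixels (16 << 3 in 1/8-pel units) of the closest previously
+      // searched ref_mv, based on how far that earlier search moved from its
+      // own ref_mv.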
+ if (min_mv_diff < (16 << 3)) {
+ if (args->single_newmv_valid[best_match][refs[0]]) {
+ search_range = min_mv_diff;
+ search_range +=
+ AOMMAX(abs(args->single_newmv[best_match][refs[0]].as_mv.row -
+ prev_ref_mv[best_match].row),
+ abs(args->single_newmv[best_match][refs[0]].as_mv.col -
+ prev_ref_mv[best_match].col));
+ // Get full pixel search range.
+ search_range = (search_range + 4) >> 3;
+ }
+ }
+ }
+
+ int_mv best_mv;
+ av1_single_motion_search(cpi, x, bsize, ref_idx, rate_mv, search_range,
+ mode_info, &best_mv, args);
+ if (best_mv.as_int == INVALID_MV) return INT64_MAX;
+
+ args->single_newmv[ref_mv_idx][refs[0]] = best_mv;
+ args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv;
+ args->single_newmv_valid[ref_mv_idx][refs[0]] = 1;
+ cur_mv[0].as_int = best_mv.as_int;
+
+ // Return after single_newmv is set.
+ if (mode_info[mbmi->ref_mv_idx].skip) return INT64_MAX;
+ }
+
+ return 0;
+}
+
+static INLINE void update_mode_start_end_index(
+ const AV1_COMP *const cpi, const MB_MODE_INFO *const mbmi,
+ int *mode_index_start, int *mode_index_end, int last_motion_mode_allowed,
+ int interintra_allowed, int eval_motion_mode) {
+ *mode_index_start = (int)SIMPLE_TRANSLATION;
+ *mode_index_end = (int)last_motion_mode_allowed + interintra_allowed;
+ if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) {
+ if (!eval_motion_mode) {
+ *mode_index_end = (int)SIMPLE_TRANSLATION;
+ } else {
+ // Set the start index appropriately to process motion modes other than
+ // simple translation
+ *mode_index_start = 1;
+ }
+ }
+ if (cpi->sf.inter_sf.extra_prune_warped && mbmi->bsize > BLOCK_16X16)
+ *mode_index_end = SIMPLE_TRANSLATION;
+}
+
+/*!\brief AV1 motion mode search
+ *
+ * \ingroup inter_mode_search
+ * Function to search over and determine the motion mode. It will update
+ * mbmi->motion_mode to one of SIMPLE_TRANSLATION, OBMC_CAUSAL, or
+ * WARPED_CAUSAL and determine any necessary side information for the selected
+ * motion mode. It will also perform the full transform search, unless the
+ * input parameter do_tx_search indicates to do an estimation of the RD rather
+ * than an RD corresponding to a full transform search. It will return the
+ * RD for the final motion_mode.
+ * It also does the RD search for the given inter mode, computing all
+ * information relevant to the input mode: the best MV, the compound
+ * parameters (if the mode is a compound mode), and the interpolation
+ * filter parameters.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during
+ * encoding.
+ * \param[in] x Pointer to struct holding all the data for
+ * the current macroblock.
+ * \param[in] bsize Current block size.
+ * \param[in,out] rd_stats Struct to keep track of the overall RD
+ * information.
+ * \param[in,out] rd_stats_y Struct to keep track of the RD information
+ * for only the Y plane.
+ * \param[in,out] rd_stats_uv Struct to keep track of the RD information
+ * for only the UV planes.
+ * \param[in] args HandleInterModeArgs struct holding
+ * miscellaneous arguments for inter mode
+ * search. See the documentation for this
+ * struct for a description of each member.
+ * \param[in] ref_best_rd Best RD found so far for this block.
+ * It is used for early termination of this
+ * search if the RD exceeds this value.
+ * \param[in,out] ref_skip_rd A length 2 array, where ref_skip_rd[0] is the
+ * best total RD for a skip mode so far, and
+ * ref_skip_rd[1] is the best RD for a skip mode
+ * so far in luma. This is used as a speed feature
+ * to skip the transform search if the computed
+ * skip RD for the current mode is not better
+ * than the best ref_skip_rd so far.
+ * \param[in,out] rate_mv The rate associated with the motion vectors.
+ * This will be modified if a motion search is
+ * done in the motion mode search.
+ * \param[in,out] orig_dst A prediction buffer to hold a computed
+ * prediction. This will eventually hold the
+ * final prediction, and the tmp_dst info will
+ * be copied here.
+ * \param[in,out] best_est_rd Estimated RD for motion mode search if
+ * do_tx_search (see below) is 0.
+ * \param[in] do_tx_search Parameter to indicate whether or not to do
+ * a full transform search. This will compute
+ * an estimated RD for the modes without the
+ * transform search and later perform the full
+ * transform search on the best candidates.
+ * \param[in] inter_modes_info InterModesInfo struct to hold inter mode
+ * information to perform a full transform
+ * search only on winning candidates searched
+ * with an estimate for transform coding RD.
+ * \param[in] eval_motion_mode Boolean whether or not to evaluate
+ * motion modes other than SIMPLE_TRANSLATION.
+ * \param[out] yrd Stores the rdcost corresponding to encoding
+ * the luma plane.
+ * \return Returns INT64_MAX if the determined motion mode is invalid and the
+ * current motion mode being tested should be skipped. It returns 0 if the
+ * motion mode search is a success.
+ */
+static int64_t motion_mode_rd(
+ const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv, HandleInterModeArgs *const args, int64_t ref_best_rd,
+ int64_t *ref_skip_rd, int *rate_mv, const BUFFER_SET *orig_dst,
+ int64_t *best_est_rd, int do_tx_search, InterModesInfo *inter_modes_info,
+ int eval_motion_mode, int64_t *yrd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const FeatureFlags *const features = &cm->features;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int is_comp_pred = has_second_ref(mbmi);
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ const int rate2_nocoeff = rd_stats->rate;
+ int best_xskip_txfm = 0;
+ RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const int rate_mv0 = *rate_mv;
+ const int interintra_allowed = cm->seq_params->enable_interintra_compound &&
+ is_interintra_allowed(mbmi) &&
+ mbmi->compound_idx;
+ WARP_SAMPLE_INFO *const warp_sample_info =
+ &x->warp_sample_info[mbmi->ref_frame[0]];
+ int *pts0 = warp_sample_info->pts;
+ int *pts_inref0 = warp_sample_info->pts_inref;
+
+ assert(mbmi->ref_frame[1] != INTRA_FRAME);
+ const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
+ av1_invalid_rd_stats(&best_rd_stats);
+  mbmi->num_proj_ref = 1;  // Assume num_proj_ref >= 1.
+ MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+ *yrd = INT64_MAX;
+ if (features->switchable_motion_mode) {
+ // Determine which motion modes to search if more than SIMPLE_TRANSLATION
+ // is allowed.
+ last_motion_mode_allowed = motion_mode_allowed(
+ xd->global_motion, xd, mbmi, features->allow_warped_motion);
+ }
+
+ if (last_motion_mode_allowed == WARPED_CAUSAL) {
+ // Collect projection samples used in least squares approximation of
+ // the warped motion parameters if WARPED_CAUSAL is going to be searched.
+ if (warp_sample_info->num < 0) {
+ warp_sample_info->num = av1_findSamples(cm, xd, pts0, pts_inref0);
+ }
+ mbmi->num_proj_ref = warp_sample_info->num;
+ }
+ const int total_samples = mbmi->num_proj_ref;
+ if (total_samples == 0) {
+ // Do not search WARPED_CAUSAL if there are no samples to use to determine
+ // warped parameters.
+ last_motion_mode_allowed = OBMC_CAUSAL;
+ }
+
+ const MB_MODE_INFO base_mbmi = *mbmi;
+ MB_MODE_INFO best_mbmi;
+ const int interp_filter = features->interp_filter;
+ const int switchable_rate =
+ av1_is_interp_needed(xd)
+ ? av1_get_switchable_rate(x, xd, interp_filter,
+ cm->seq_params->enable_dual_filter)
+ : 0;
+ int64_t best_rd = INT64_MAX;
+ int best_rate_mv = rate_mv0;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ int mode_index_start, mode_index_end;
+ const int txfm_rd_gate_level =
+ get_txfm_rd_gate_level(cm->seq_params->enable_masked_compound,
+ cpi->sf.inter_sf.txfm_rd_gate_level, bsize,
+ TX_SEARCH_MOTION_MODE, eval_motion_mode);
+
+ // Modify the start and end index according to speed features. For example,
+ // if SIMPLE_TRANSLATION has already been searched according to
+ // the motion_mode_for_winner_cand speed feature, update the mode_index_start
+ // to avoid searching it again.
+ update_mode_start_end_index(cpi, mbmi, &mode_index_start, &mode_index_end,
+ last_motion_mode_allowed, interintra_allowed,
+ eval_motion_mode);
+ // Main function loop. This loops over all of the possible motion modes and
+ // computes RD to determine the best one. This process includes computing
+ // any necessary side information for the motion mode and performing the
+ // transform search.
+ for (int mode_index = mode_index_start; mode_index <= mode_index_end;
+ mode_index++) {
+ if (args->skip_motion_mode && mode_index) continue;
+ int tmp_rate2 = rate2_nocoeff;
+ const int is_interintra_mode = mode_index > (int)last_motion_mode_allowed;
+ int tmp_rate_mv = rate_mv0;
+
+ *mbmi = base_mbmi;
+ if (is_interintra_mode) {
+ // Only use SIMPLE_TRANSLATION for interintra
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ } else {
+ mbmi->motion_mode = (MOTION_MODE)mode_index;
+ assert(mbmi->ref_frame[1] != INTRA_FRAME);
+ }
+
+ // Do not search OBMC if the probability of selecting it is below a
+ // predetermined threshold for this update_type and block size.
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ int use_actual_frame_probs = 1;
+ int prune_obmc;
+#if CONFIG_FPMT_TEST
+ use_actual_frame_probs =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+ if (!use_actual_frame_probs) {
+ prune_obmc = cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize] <
+ cpi->sf.inter_sf.prune_obmc_prob_thresh;
+ }
+#endif
+ if (use_actual_frame_probs) {
+ prune_obmc = cpi->ppi->frame_probs.obmc_probs[update_type][bsize] <
+ cpi->sf.inter_sf.prune_obmc_prob_thresh;
+ }
+ if ((!cpi->oxcf.motion_mode_cfg.enable_obmc || prune_obmc) &&
+ mbmi->motion_mode == OBMC_CAUSAL)
+ continue;
+
+ if (mbmi->motion_mode == SIMPLE_TRANSLATION && !is_interintra_mode) {
+ // SIMPLE_TRANSLATION mode: no need to recalculate.
+ // The prediction is calculated before motion_mode_rd() is called in
+ // handle_inter_mode()
+ } else if (mbmi->motion_mode == OBMC_CAUSAL) {
+ const uint32_t cur_mv = mbmi->mv[0].as_int;
+ // OBMC_CAUSAL not allowed for compound prediction
+ assert(!is_comp_pred);
+ if (have_newmv_in_inter_mode(this_mode)) {
+ av1_single_motion_search(cpi, x, bsize, 0, &tmp_rate_mv, INT_MAX, NULL,
+ &mbmi->mv[0], NULL);
+ tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
+ }
+ if ((mbmi->mv[0].as_int != cur_mv) || eval_motion_mode) {
+ // Build the predictor according to the current motion vector if it has
+ // not already been built
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ 0, av1_num_planes(cm) - 1);
+ }
+ // Build the inter predictor by blending the predictor corresponding to
+ // this MV, and the neighboring blocks using the OBMC model
+ av1_build_obmc_inter_prediction(
+ cm, xd, args->above_pred_buf, args->above_pred_stride,
+ args->left_pred_buf, args->left_pred_stride);
+#if !CONFIG_REALTIME_ONLY
+ } else if (mbmi->motion_mode == WARPED_CAUSAL) {
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ mbmi->motion_mode = WARPED_CAUSAL;
+ mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
+ mbmi->interp_filters =
+ av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
+
+ memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
+ memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+ // Select the samples according to motion vector difference
+ if (mbmi->num_proj_ref > 1) {
+ mbmi->num_proj_ref = av1_selectSamples(
+ &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref, bsize);
+ }
+
+ // Compute the warped motion parameters with a least squares fit
+ // using the collected samples
+ if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
+ mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
+ &mbmi->wm_params, mi_row, mi_col)) {
+ assert(!is_comp_pred);
+ if (have_newmv_in_inter_mode(this_mode)) {
+ // Refine MV for NEWMV mode
+ const int_mv mv0 = mbmi->mv[0];
+ const WarpedMotionParams wm_params0 = mbmi->wm_params;
+ const int num_proj_ref0 = mbmi->num_proj_ref;
+
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize,
+ &ref_mv.as_mv, NULL);
+
+ // Refine MV in a small range.
+ av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0,
+ total_samples, cpi->sf.mv_sf.warp_search_method,
+ cpi->sf.mv_sf.warp_search_iters);
+
+ if (mv0.as_int != mbmi->mv[0].as_int) {
+ // Keep the refined MV and WM parameters.
+ tmp_rate_mv = av1_mv_bit_cost(
+ &mbmi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost,
+ x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+ tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
+ } else {
+ // Restore the old MV and WM parameters.
+ mbmi->mv[0] = mv0;
+ mbmi->wm_params = wm_params0;
+ mbmi->num_proj_ref = num_proj_ref0;
+ }
+ }
+
+ // Build the warped predictor
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
+ } else {
+ continue;
+ }
+#endif // !CONFIG_REALTIME_ONLY
+ } else if (is_interintra_mode) {
+ const int ret =
+ av1_handle_inter_intra_mode(cpi, x, bsize, mbmi, args, ref_best_rd,
+ &tmp_rate_mv, &tmp_rate2, orig_dst);
+ if (ret < 0) continue;
+ }
+
+ // If we are searching newmv and the mv is the same as refmv, skip the
+ // current mode
+ if (!av1_check_newmv_joint_nonzero(cm, x)) continue;
+
+ // Update rd_stats for the current motion mode
+ txfm_info->skip_txfm = 0;
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ rd_stats->skip_txfm = 1;
+ rd_stats->rate = tmp_rate2;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate;
+ if (interintra_allowed) {
+ rd_stats->rate +=
+ mode_costs->interintra_cost[size_group_lookup[bsize]]
+ [mbmi->ref_frame[1] == INTRA_FRAME];
+ }
+ if ((last_motion_mode_allowed > SIMPLE_TRANSLATION) &&
+ (mbmi->ref_frame[1] != INTRA_FRAME)) {
+ if (last_motion_mode_allowed == WARPED_CAUSAL) {
+ rd_stats->rate +=
+ mode_costs->motion_mode_cost[bsize][mbmi->motion_mode];
+ } else {
+ rd_stats->rate +=
+ mode_costs->motion_mode_cost1[bsize][mbmi->motion_mode];
+ }
+ }
+
+ int64_t this_yrd = INT64_MAX;
+
+ if (!do_tx_search) {
+ // Avoid doing a transform search here to speed up the overall mode
+ // search. It will be done later in the mode search if the current
+ // motion mode seems promising.
+ int64_t curr_sse = -1;
+ int64_t sse_y = -1;
+ int est_residue_cost = 0;
+ int64_t est_dist = 0;
+ int64_t est_rd = 0;
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ curr_sse = get_sse(cpi, x, &sse_y);
+ const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse,
+ &est_residue_cost, &est_dist);
+ (void)has_est_rd;
+ assert(has_est_rd);
+ } else if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 2 ||
+ cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ model_rd_sb_fn[MODELRD_TYPE_MOTION_MODE_RD](
+ cpi, bsize, x, xd, 0, num_planes - 1, &est_residue_cost, &est_dist,
+ NULL, &curr_sse, NULL, NULL, NULL);
+ sse_y = x->pred_sse[xd->mi[0]->ref_frame[0]];
+ }
+ est_rd = RDCOST(x->rdmult, rd_stats->rate + est_residue_cost, est_dist);
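+ // Prune this motion mode if its estimated RD is more than 1.25x the best
+ // estimate so far (i.e. est_rd * 0.80 > *best_est_rd).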
+ if (est_rd * 0.80 > *best_est_rd) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ continue;
+ }
+ const int mode_rate = rd_stats->rate;
+ rd_stats->rate += est_residue_cost;
+ rd_stats->dist = est_dist;
+ rd_stats->rdcost = est_rd;
+ if (rd_stats->rdcost < *best_est_rd) {
+ *best_est_rd = rd_stats->rdcost;
+ assert(sse_y >= 0);
+ ref_skip_rd[1] = txfm_rd_gate_level
+ ? RDCOST(x->rdmult, mode_rate, (sse_y << 4))
+ : INT64_MAX;
+ }
+ if (cm->current_frame.reference_mode == SINGLE_REFERENCE) {
+ if (!is_comp_pred) {
+ assert(curr_sse >= 0);
+ inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
+ rd_stats->rdcost, rd_stats, rd_stats_y,
+ rd_stats_uv, mbmi);
+ }
+ } else {
+ assert(curr_sse >= 0);
+ inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
+ rd_stats->rdcost, rd_stats, rd_stats_y,
+ rd_stats_uv, mbmi);
+ }
+ mbmi->skip_txfm = 0;
+ } else {
+ // Perform full transform search
+ int64_t skip_rd = INT64_MAX;
+ int64_t skip_rdy = INT64_MAX;
+ if (txfm_rd_gate_level) {
+ // Check if the mode is good enough based on skip RD
+ int64_t sse_y = INT64_MAX;
+ int64_t curr_sse = get_sse(cpi, x, &sse_y);
+ skip_rd = RDCOST(x->rdmult, rd_stats->rate, curr_sse);
+ skip_rdy = RDCOST(x->rdmult, rd_stats->rate, (sse_y << 4));
+ int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd[0], skip_rd,
+ txfm_rd_gate_level, 0);
+ if (!eval_txfm) continue;
+ }
+
+ // Do transform search
+ const int mode_rate = rd_stats->rate;
+ if (!av1_txfm_search(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv,
+ rd_stats->rate, ref_best_rd)) {
+ if (rd_stats_y->rate == INT_MAX && mode_index == 0) {
+ return INT64_MAX;
+ }
+ continue;
+ }
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int y_rate =
+ rd_stats->skip_txfm
+ ? x->mode_costs.skip_txfm_cost[skip_ctx][1]
+ : (rd_stats_y->rate + x->mode_costs.skip_txfm_cost[skip_ctx][0]);
+ this_yrd = RDCOST(x->rdmult, y_rate + mode_rate, rd_stats_y->dist);
+
+ const int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (curr_rd < ref_best_rd) {
+ ref_best_rd = curr_rd;
+ ref_skip_rd[0] = skip_rd;
+ ref_skip_rd[1] = skip_rdy;
+ }
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ inter_mode_data_push(
+ tile_data, mbmi->bsize, rd_stats->sse, rd_stats->dist,
+ rd_stats_y->rate + rd_stats_uv->rate +
+ mode_costs->skip_txfm_cost[skip_ctx][mbmi->skip_txfm]);
+ }
+ }
+
+ if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) {
+ if (is_nontrans_global_motion(xd, xd->mi[0])) {
+ mbmi->interp_filters =
+ av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
+ }
+ }
+
+ const int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (mode_index == 0) {
+ args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd;
+ }
+ if (mode_index == 0 || tmp_rd < best_rd) {
+ // Update best_rd data if this is the best motion mode so far
+ best_mbmi = *mbmi;
+ best_rd = tmp_rd;
+ best_rd_stats = *rd_stats;
+ best_rd_stats_y = *rd_stats_y;
+ best_rate_mv = tmp_rate_mv;
+ *yrd = this_yrd;
+ if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv;
+ memcpy(best_blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width);
+ best_xskip_txfm = mbmi->skip_txfm;
+ }
+ }
+ // Update RD and mbmi stats for selected motion mode
+ mbmi->ref_frame[1] = ref_frame_1;
+ *rate_mv = best_rate_mv;
+ if (best_rd == INT64_MAX || !av1_check_newmv_joint_nonzero(cm, x)) {
+ av1_invalid_rd_stats(rd_stats);
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ return INT64_MAX;
+ }
+ *mbmi = best_mbmi;
+ *rd_stats = best_rd_stats;
+ *rd_stats_y = best_rd_stats_y;
+ if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv;
+ memcpy(txfm_info->blk_skip, best_blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width);
+ txfm_info->skip_txfm = best_xskip_txfm;
+
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ return 0;
+}
+
+static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
+ MACROBLOCK *const x, BLOCK_SIZE bsize,
+ const BUFFER_SET *const orig_dst, int64_t best_rd) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ int64_t total_sse = 0;
+ int64_t this_rd = INT64_MAX;
+ const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+ rd_stats->rate = x->mode_costs.skip_mode_cost[skip_mode_ctx][1];
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ // Call av1_enc_build_inter_predictor() for one plane at a time.
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ plane, plane);
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+ av1_subtract_plane(x, plane_bsize, plane);
+
+ int64_t sse =
+ av1_pixel_diff_dist(x, plane, 0, 0, plane_bsize, plane_bsize, NULL);
+ if (is_cur_buf_hbd(xd)) sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
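+ // The << 4 below scales the plane SSE to the distortion precision used
+ // with RDCOST here (matching the (sse_y << 4) terms used for the skip-RD
+ // estimates in motion_mode_rd above).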
+ sse <<= 4;
+ total_sse += sse;
+ // When the current rd cost exceeds best_rd, skip evaluating the
+ // remaining planes.
+ this_rd = RDCOST(x->rdmult, rd_stats->rate, total_sse);
+ if (this_rd > best_rd) break;
+ }
+
+ rd_stats->dist = rd_stats->sse = total_sse;
+ rd_stats->rdcost = this_rd;
+
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ return 0;
+}
+
+// Check NEARESTMV, NEARMV and GLOBALMV ref mvs for duplicates and skip the
+// relevant mode
+// Note(rachelbarker): This speed feature currently does not interact correctly
+// with global motion. The issue is that, when global motion is used, GLOBALMV
+// produces a different prediction to NEARESTMV/NEARMV even if the motion
+// vectors are the same. Thus GLOBALMV should not be pruned in this case.
+static INLINE int check_repeat_ref_mv(const MB_MODE_INFO_EXT *mbmi_ext,
+ int ref_idx,
+ const MV_REFERENCE_FRAME *ref_frame,
+ PREDICTION_MODE single_mode) {
+ const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+ const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+ assert(single_mode != NEWMV);
+ if (single_mode == NEARESTMV) {
+ return 0;
+ } else if (single_mode == NEARMV) {
+ // when ref_mv_count == 0, NEARESTMV and NEARMV are the same as GLOBALMV
+ // when ref_mv_count == 1, NEARMV is the same as GLOBALMV
+ if (ref_mv_count < 2) return 1;
+ } else if (single_mode == GLOBALMV) {
+ // when ref_mv_count == 0, GLOBALMV is the same as NEARESTMV
+ if (ref_mv_count == 0) return 1;
+ // when ref_mv_count == 1, NEARMV is the same as GLOBALMV (NEARMV is
+ // pruned in that case, so GLOBALMV is kept)
+ else if (ref_mv_count == 1)
+ return 0;
+
+ int stack_size = AOMMIN(USABLE_REF_MV_STACK_SIZE, ref_mv_count);
+ // Check GLOBALMV is matching with any mv in ref_mv_stack
+ for (int ref_mv_idx = 0; ref_mv_idx < stack_size; ref_mv_idx++) {
+ int_mv this_mv;
+
+ if (ref_idx == 0)
+ this_mv = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+ else
+ this_mv = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+
+ if (this_mv.as_int == mbmi_ext->global_mvs[ref_frame[ref_idx]].as_int)
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static INLINE int get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
+ int ref_idx, int ref_mv_idx,
+ int skip_repeated_ref_mv,
+ const MV_REFERENCE_FRAME *ref_frame,
+ const MB_MODE_INFO_EXT *mbmi_ext) {
+ const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx);
+ assert(is_inter_singleref_mode(single_mode));
+ if (single_mode == NEWMV) {
+ this_mv->as_int = INVALID_MV;
+ } else if (single_mode == GLOBALMV) {
+ if (skip_repeated_ref_mv &&
+ check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode))
+ return 0;
+ *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+ } else {
+ assert(single_mode == NEARMV || single_mode == NEARESTMV);
+ const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+ const int ref_mv_offset = single_mode == NEARESTMV ? 0 : ref_mv_idx + 1;
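+ // NEARESTMV always reads stack entry 0; NEARMV with index k reads entry
+ // k + 1.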
+ if (ref_mv_offset < mbmi_ext->ref_mv_count[ref_frame_type]) {
+ assert(ref_mv_offset >= 0);
+ if (ref_idx == 0) {
+ *this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].this_mv;
+ } else {
+ *this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].comp_mv;
+ }
+ } else {
+ if (skip_repeated_ref_mv &&
+ check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode))
+ return 0;
+ *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+ }
+ }
+ return 1;
+}
+
+// Skip NEARESTMV and NEARMV modes based on refmv weight computed in ref mv list
+// population
+static INLINE int skip_nearest_near_mv_using_refmv_weight(
+ const MACROBLOCK *const x, const PREDICTION_MODE this_mode,
+ const int8_t ref_frame_type, PREDICTION_MODE best_mode) {
+ if (this_mode != NEARESTMV && this_mode != NEARMV) return 0;
+ // Do not skip the mode if the current block has not yet obtained a valid
+ // inter mode.
+ if (!is_inter_mode(best_mode)) return 0;
+
+ const MACROBLOCKD *xd = &x->e_mbd;
+ // Do not skip the mode unless both the top and left neighboring blocks
+ // are available.
+ if (!xd->left_available || !xd->up_available) return 0;
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const uint16_t *const ref_mv_weight = mbmi_ext->weight[ref_frame_type];
+ const int ref_mv_count =
+ AOMMIN(MAX_REF_MV_SEARCH, mbmi_ext->ref_mv_count[ref_frame_type]);
+
+ if (ref_mv_count == 0) return 0;
+ // If the ref mv list has at least one nearest candidate, do not prune
+ // NEARESTMV
+ if (this_mode == NEARESTMV && ref_mv_weight[0] >= REF_CAT_LEVEL) return 0;
+
+ // Count number of ref mvs populated from nearest candidates
+ int nearest_refmv_count = 0;
+ for (int ref_mv_idx = 0; ref_mv_idx < ref_mv_count; ref_mv_idx++) {
+ if (ref_mv_weight[ref_mv_idx] >= REF_CAT_LEVEL) nearest_refmv_count++;
+ }
+
+ // nearest_refmv_count indicates how closely the block's motion
+ // characteristics match those of its spatial neighbors. A small
+ // nearest_refmv_count relative to ref_mv_count means little correlation
+ // with the spatial neighbors, and hence little chance of NEARESTMV or
+ // NEARMV becoming the best mode, since these modes work well for blocks
+ // that share motion characteristics with their neighbors. Thus, NEARMV is
+ // pruned when nearest_refmv_count is relatively small compared to
+ // ref_mv_count, and NEARESTMV is pruned if none of the ref mvs were
+ // populated from a nearest candidate.
+ const int prune_thresh = 1 + (ref_mv_count >= 2);
+ if (nearest_refmv_count < prune_thresh) return 1;
+ return 0;
+}
+
+// This function updates the non-NEWMV mvs for the current prediction mode
+static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode,
+ const AV1_COMMON *cm, const MACROBLOCK *x,
+ int skip_repeated_ref_mv) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const int is_comp_pred = has_second_ref(mbmi);
+
+ int ret = 1;
+ for (int i = 0; i < is_comp_pred + 1; ++i) {
+ int_mv this_mv;
+ this_mv.as_int = INVALID_MV;
+ ret = get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx,
+ skip_repeated_ref_mv, mbmi->ref_frame, &x->mbmi_ext);
+ if (!ret) return 0;
+ const PREDICTION_MODE single_mode = get_single_mode(this_mode, i);
+ if (single_mode == NEWMV) {
+ const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ cur_mv[i] =
+ (i == 0) ? x->mbmi_ext.ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+ .this_mv
+ : x->mbmi_ext.ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
+ .comp_mv;
+ } else {
+ ret &= clamp_and_check_mv(cur_mv + i, this_mv, cm, x);
+ }
+ }
+ return ret;
+}
+
+static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi,
+ const MB_MODE_INFO_EXT *mbmi_ext,
+ const int (*const drl_mode_cost0)[2],
+ int8_t ref_frame_type) {
+ int cost = 0;
+ if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
+ for (int idx = 0; idx < 2; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+ cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != idx];
+ if (mbmi->ref_mv_idx == idx) return cost;
+ }
+ }
+ return cost;
+ }
+
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+ for (int idx = 1; idx < 3; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx);
+ cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != (idx - 1)];
+ if (mbmi->ref_mv_idx == (idx - 1)) return cost;
+ }
+ }
+ return cost;
+ }
+ return cost;
+}
+
+static INLINE int is_single_newmv_valid(const HandleInterModeArgs *const args,
+ const MB_MODE_INFO *const mbmi,
+ PREDICTION_MODE this_mode) {
+ for (int ref_idx = 0; ref_idx < 2; ++ref_idx) {
+ const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx);
+ const MV_REFERENCE_FRAME ref = mbmi->ref_frame[ref_idx];
+ if (single_mode == NEWMV &&
+ args->single_newmv_valid[mbmi->ref_mv_idx][ref] == 0) {
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int get_drl_refmv_count(const MACROBLOCK *const x,
+ const MV_REFERENCE_FRAME *ref_frame,
+ PREDICTION_MODE mode) {
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+ const int has_nearmv = have_nearmv_in_inter_mode(mode) ? 1 : 0;
+ const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+ const int only_newmv = (mode == NEWMV || mode == NEW_NEWMV);
+ const int has_drl =
+ (has_nearmv && ref_mv_count > 2) || (only_newmv && ref_mv_count > 1);
+ const int ref_set =
+ has_drl ? AOMMIN(MAX_REF_MV_SEARCH, ref_mv_count - has_nearmv) : 1;
+
+ return ref_set;
+}
+
+// Checks if a particular ref_mv_idx should be pruned.
+static int prune_ref_mv_idx_using_qindex(const int reduce_inter_modes,
+ const int qindex,
+ const int ref_mv_idx) {
+ if (reduce_inter_modes >= 3) return 1;
+ // Q-index logic based pruning is enabled only for
+ // reduce_inter_modes = 2.
+ assert(reduce_inter_modes == 2);
+ // When reduce_inter_modes=2, pruning happens as below based on q index.
+ // For q index range between 0 and 85: prune if ref_mv_idx >= 1.
+ // For q index range between 86 and 170: prune if ref_mv_idx == 2.
+ // For q index range between 171 and 255: no pruning.
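+ // For example, assuming QINDEX_RANGE == 256, qindex = 100 gives
+ // min_prune_ref_mv_idx = 100 * 3 / 256 + 1 = 2, so only ref_mv_idx >= 2 is
+ // pruned, matching the 86-170 range above.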
+ const int min_prune_ref_mv_idx = (qindex * 3 / QINDEX_RANGE) + 1;
+ return (ref_mv_idx >= min_prune_ref_mv_idx);
+}
+
+// Whether this reference motion vector can be skipped, based on initial
+// heuristics.
+static bool ref_mv_idx_early_breakout(
+ const SPEED_FEATURES *const sf,
+ const RefFrameDistanceInfo *const ref_frame_dist_info, MACROBLOCK *x,
+ const HandleInterModeArgs *const args, int64_t ref_best_rd,
+ int ref_mv_idx) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ const int is_comp_pred = has_second_ref(mbmi);
+ if (sf->inter_sf.reduce_inter_modes && ref_mv_idx > 0) {
+ if (mbmi->ref_frame[0] == LAST2_FRAME ||
+ mbmi->ref_frame[0] == LAST3_FRAME ||
+ mbmi->ref_frame[1] == LAST2_FRAME ||
+ mbmi->ref_frame[1] == LAST3_FRAME) {
+ const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+ if (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] <
+ REF_CAT_LEVEL) {
+ return true;
+ }
+ }
+ // TODO(any): Experiment with reduce_inter_modes for compound prediction
+ if (sf->inter_sf.reduce_inter_modes >= 2 && !is_comp_pred &&
+ have_newmv_in_inter_mode(mbmi->mode)) {
+ if (mbmi->ref_frame[0] != ref_frame_dist_info->nearest_past_ref &&
+ mbmi->ref_frame[0] != ref_frame_dist_info->nearest_future_ref) {
+ const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+ const int do_prune = prune_ref_mv_idx_using_qindex(
+ sf->inter_sf.reduce_inter_modes, x->qindex, ref_mv_idx);
+ if (do_prune &&
+ (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] <
+ REF_CAT_LEVEL)) {
+ return true;
+ }
+ }
+ }
+ }
+
+ mbmi->ref_mv_idx = ref_mv_idx;
+ if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, mbmi->mode))) {
+ return true;
+ }
+ size_t est_rd_rate = args->ref_frame_cost + args->single_comp_cost;
+ const int drl_cost = get_drl_cost(
+ mbmi, mbmi_ext, x->mode_costs.drl_mode_cost0, ref_frame_type);
+ est_rd_rate += drl_cost;
+ if (RDCOST(x->rdmult, est_rd_rate, 0) > ref_best_rd &&
+ mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
+ return true;
+ }
+ return false;
+}
+
+// Compute the estimated RD cost for the motion vector with simple translation.
+static int64_t simple_translation_pred_rd(AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats,
+ HandleInterModeArgs *args,
+ int ref_mv_idx, int64_t ref_best_rd,
+ BLOCK_SIZE bsize) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ const AV1_COMMON *cm = &cpi->common;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const ModeCosts *mode_costs = &x->mode_costs;
+
+ struct macroblockd_plane *p = xd->plane;
+ const BUFFER_SET orig_dst = {
+ { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+ { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+ };
+ av1_init_rd_stats(rd_stats);
+
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = 1;
+ if (mbmi->ref_frame[1] == INTRA_FRAME) {
+ mbmi->ref_frame[1] = NONE_FRAME;
+ }
+ int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+
+ mbmi->num_proj_ref = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->ref_mv_idx = ref_mv_idx;
+
+ rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
+ const int drl_cost =
+ get_drl_cost(mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type);
+ rd_stats->rate += drl_cost;
+
+ int_mv cur_mv[2];
+ if (!build_cur_mv(cur_mv, mbmi->mode, cm, x, 0)) {
+ return INT64_MAX;
+ }
+ assert(have_nearmv_in_inter_mode(mbmi->mode));
+ for (int i = 0; i < is_comp_pred + 1; ++i) {
+ mbmi->mv[i].as_int = cur_mv[i].as_int;
+ }
+ const int ref_mv_cost = cost_mv_ref(mode_costs, mbmi->mode, mode_ctx);
+ rd_stats->rate += ref_mv_cost;
+
+ if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd) {
+ return INT64_MAX;
+ }
+
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->num_proj_ref = 0;
+ if (is_comp_pred) {
+ // Only compound_average
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = 1;
+ }
+ set_default_interp_filters(mbmi, cm->features.interp_filter);
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ int est_rate;
+ int64_t est_dist;
+ model_rd_sb_fn[MODELRD_CURVFIT](cpi, bsize, x, xd, 0, 0, &est_rate, &est_dist,
+ NULL, NULL, NULL, NULL, NULL);
+ return RDCOST(x->rdmult, rd_stats->rate + est_rate, est_dist);
+}
+
+// Represents a set of integers, from 0 to sizeof(int) * 8 - 1, as bits in
+// an integer. 0 for the i-th bit means that integer is excluded, 1 means
+// it is included.
+static INLINE void mask_set_bit(int *mask, int index) { *mask |= (1 << index); }
+
+static INLINE bool mask_check_bit(int mask, int index) {
+ return (mask >> index) & 0x1;
+}
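+
+// Illustrative example: starting from mask == 0, mask_set_bit(&mask, 0) and
+// mask_set_bit(&mask, 2) yield mask == 5 (binary 101), so
+// mask_check_bit(mask, 1) is false while mask_check_bit(mask, 2) is true.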
+
+// Before performing the full MV search in handle_inter_mode, do a simple
+// translation search and see if we can eliminate any motion vectors.
+// Returns an integer where, if the i-th bit is set, the i-th motion vector
+// should be searched. This is only set for NEARMV.
+static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats,
+ HandleInterModeArgs *const args,
+ int64_t ref_best_rd, BLOCK_SIZE bsize,
+ const int ref_set) {
+ // If there is only one ref mv, do not prune it; it is better to evaluate
+ // it than to prune it.
+ if (ref_set == 1) return 1;
+ AV1_COMMON *const cm = &cpi->common;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const PREDICTION_MODE this_mode = mbmi->mode;
+
+ // Only search indices if they have some chance of being good.
+ int good_indices = 0;
+ for (int i = 0; i < ref_set; ++i) {
+ if (ref_mv_idx_early_breakout(&cpi->sf, &cpi->ref_frame_dist_info, x, args,
+ ref_best_rd, i)) {
+ continue;
+ }
+ mask_set_bit(&good_indices, i);
+ }
+
+ // Only prune in NEARMV mode if the speed feature is set and the block size
+ // is large enough. If these conditions are not met, return all good indices
+ // found so far.
+ if (!cpi->sf.inter_sf.prune_mode_search_simple_translation)
+ return good_indices;
+ if (!have_nearmv_in_inter_mode(this_mode)) return good_indices;
+ if (num_pels_log2_lookup[bsize] <= 6) return good_indices;
+ // Do not prune when there is internal resizing. TODO(elliottk) fix this
+ // so b/2384 can be resolved.
+ if (av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[0])) ||
+ (mbmi->ref_frame[1] > 0 &&
+ av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[1])))) {
+ return good_indices;
+ }
+
+ // Calculate the RD cost for the motion vectors using simple translation.
+ int64_t idx_rdcost[] = { INT64_MAX, INT64_MAX, INT64_MAX };
+ for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ // If this index is bad, ignore it.
+ if (!mask_check_bit(good_indices, ref_mv_idx)) {
+ continue;
+ }
+ idx_rdcost[ref_mv_idx] = simple_translation_pred_rd(
+ cpi, x, rd_stats, args, ref_mv_idx, ref_best_rd, bsize);
+ }
+ // Find the index with the best RD cost.
+ int best_idx = 0;
+ for (int i = 1; i < MAX_REF_MV_SEARCH; ++i) {
+ if (idx_rdcost[i] < idx_rdcost[best_idx]) {
+ best_idx = i;
+ }
+ }
+ // Only include indices that are good and within a % of the best.
+ const double dth = has_second_ref(mbmi) ? 1.05 : 1.001;
+ // If the simple translation cost is not within this multiple of
+ // ref_best_rd, skip it. Note that the cutoff is derived experimentally.
+ const double ref_dth = 5;
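+ // e.g. with a single reference, an index is kept only if its simple
+ // translation RD is within 0.1% of the best index (dth = 1.001) and below
+ // 5x ref_best_rd (ref_dth = 5).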
+ int result = 0;
+ for (int i = 0; i < ref_set; ++i) {
+ if (mask_check_bit(good_indices, i) &&
+ (1.0 * idx_rdcost[i]) / idx_rdcost[best_idx] < dth &&
+ (1.0 * idx_rdcost[i]) / ref_best_rd < ref_dth) {
+ mask_set_bit(&result, i);
+ }
+ }
+ return result;
+}
+
+/*!\brief Motion mode information for inter mode search speedup.
+ *
+ * Used in a speed feature to search motion modes other than
+ * SIMPLE_TRANSLATION only on winning candidates.
+ */
+typedef struct motion_mode_candidate {
+ /*!
+ * Mode info for the motion mode candidate.
+ */
+ MB_MODE_INFO mbmi;
+ /*!
+ * Rate describing the cost of the motion vectors for this candidate.
+ */
+ int rate_mv;
+ /*!
+ * Rate before motion mode search and transform coding is applied.
+ */
+ int rate2_nocoeff;
+ /*!
+ * An integer value 0 or 1 which indicates whether or not to skip the motion
+ * mode search and default to SIMPLE_TRANSLATION as a speed feature for this
+ * candidate.
+ */
+ int skip_motion_mode;
+ /*!
+ * Total RD cost for this candidate.
+ */
+ int64_t rd_cost;
+} motion_mode_candidate;
+
+/*!\cond */
+typedef struct motion_mode_best_st_candidate {
+ motion_mode_candidate motion_mode_cand[MAX_WINNER_MOTION_MODES];
+ int num_motion_mode_cand;
+} motion_mode_best_st_candidate;
+
+// Checks if the current reference frame matches either of the neighbouring
+// blocks' (top/left) reference frames
+static AOM_INLINE int ref_match_found_in_nb_blocks(MB_MODE_INFO *cur_mbmi,
+ MB_MODE_INFO *nb_mbmi) {
+ MV_REFERENCE_FRAME nb_ref_frames[2] = { nb_mbmi->ref_frame[0],
+ nb_mbmi->ref_frame[1] };
+ MV_REFERENCE_FRAME cur_ref_frames[2] = { cur_mbmi->ref_frame[0],
+ cur_mbmi->ref_frame[1] };
+ const int is_cur_comp_pred = has_second_ref(cur_mbmi);
+ int match_found = 0;
+
+ for (int i = 0; i < (is_cur_comp_pred + 1); i++) {
+ if ((cur_ref_frames[i] == nb_ref_frames[0]) ||
+ (cur_ref_frames[i] == nb_ref_frames[1]))
+ match_found = 1;
+ }
+ return match_found;
+}
+
+static AOM_INLINE int find_ref_match_in_above_nbs(const int total_mi_cols,
+ MACROBLOCKD *xd) {
+ if (!xd->up_available) return 1;
+ const int mi_col = xd->mi_col;
+ MB_MODE_INFO **cur_mbmi = xd->mi;
+ // prev_row_mi points into the mi array, starting at the beginning of the
+ // previous row.
+ MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
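+ // (xd->mi points at the current block: stepping back mi_col entries
+ // reaches column 0 of the current row, and one more stride reaches the row
+ // above.)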
+ const int end_col = AOMMIN(mi_col + xd->width, total_mi_cols);
+ uint8_t mi_step;
+ for (int above_mi_col = mi_col; above_mi_col < end_col;
+ above_mi_col += mi_step) {
+ MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col;
+ mi_step = mi_size_wide[above_mi[0]->bsize];
+ int match_found = 0;
+ if (is_inter_block(*above_mi))
+ match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *above_mi);
+ if (match_found) return 1;
+ }
+ return 0;
+}
+
+static AOM_INLINE int find_ref_match_in_left_nbs(const int total_mi_rows,
+ MACROBLOCKD *xd) {
+ if (!xd->left_available) return 1;
+ const int mi_row = xd->mi_row;
+ MB_MODE_INFO **cur_mbmi = xd->mi;
+ // prev_col_mi points into the mi array, starting at the top of the
+ // previous column
+ MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
+ const int end_row = AOMMIN(mi_row + xd->height, total_mi_rows);
+ uint8_t mi_step;
+ for (int left_mi_row = mi_row; left_mi_row < end_row;
+ left_mi_row += mi_step) {
+ MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
+ mi_step = mi_size_high[left_mi[0]->bsize];
+ int match_found = 0;
+ if (is_inter_block(*left_mi))
+ match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *left_mi);
+ if (match_found) return 1;
+ }
+ return 0;
+}
+/*!\endcond */
+
+/*! \brief Struct used to hold TPL data to
+ * narrow down parts of the inter mode search.
+ */
+typedef struct {
+ /*!
+ * The best inter cost out of all of the reference frames.
+ */
+ int64_t best_inter_cost;
+ /*!
+ * The inter cost for each reference frame.
+ */
+ int64_t ref_inter_cost[INTER_REFS_PER_FRAME];
+} PruneInfoFromTpl;
+
+#if !CONFIG_REALTIME_ONLY
+// TODO(Remya): Check if get_tpl_stats_b() can be reused
+static AOM_INLINE void get_block_level_tpl_stats(
+ AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int *valid_refs,
+ PruneInfoFromTpl *inter_cost_info_from_tpl) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int tpl_idx = cpi->gf_frame_index;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ if (!av1_tpl_stats_ready(tpl_data, tpl_idx)) return;
+ const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+ const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ const int mi_wide = mi_size_wide[bsize];
+ const int mi_high = mi_size_high[bsize];
+ const int tpl_stride = tpl_frame->stride;
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_col_end_sr =
+ coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+ const int row_step = step;
+ const int col_step_sr =
+ coded_to_superres_mi(step, cm->superres_scale_denominator);
+ for (int row = mi_row; row < AOMMIN(mi_row + mi_high, cm->mi_params.mi_rows);
+ row += row_step) {
+ for (int col = mi_col_sr; col < AOMMIN(mi_col_end_sr, mi_cols_sr);
+ col += col_step_sr) {
+ const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+ row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+
+ // Sums up the inter cost of corresponding ref frames
+ for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) {
+ inter_cost_info_from_tpl->ref_inter_cost[ref_idx] +=
+ this_stats->pred_error[ref_idx];
+ }
+ }
+ }
+
+ // Computes the best inter cost (minimum inter_cost)
+ int64_t best_inter_cost = INT64_MAX;
+ for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) {
+ const int64_t cur_inter_cost =
+ inter_cost_info_from_tpl->ref_inter_cost[ref_idx];
+ // For invalid ref frames, cur_inter_cost is 0; this has to be handled
+ // when calculating the minimum inter_cost
+ if (cur_inter_cost != 0 && (cur_inter_cost < best_inter_cost) &&
+ valid_refs[ref_idx])
+ best_inter_cost = cur_inter_cost;
+ }
+ inter_cost_info_from_tpl->best_inter_cost = best_inter_cost;
+}
+#endif
+
+static AOM_INLINE int prune_modes_based_on_tpl_stats(
+ PruneInfoFromTpl *inter_cost_info_from_tpl, const int *refs, int ref_mv_idx,
+ const PREDICTION_MODE this_mode, int prune_mode_level) {
+ const int have_newmv = have_newmv_in_inter_mode(this_mode);
+ if ((prune_mode_level < 2) && have_newmv) return 0;
+
+ const int64_t best_inter_cost = inter_cost_info_from_tpl->best_inter_cost;
+ if (best_inter_cost == INT64_MAX) return 0;
+
+ const int prune_level = prune_mode_level - 1;
+ int64_t cur_inter_cost;
+
+ const int is_globalmv =
+ (this_mode == GLOBALMV) || (this_mode == GLOBAL_GLOBALMV);
+ const int prune_index = is_globalmv ? MAX_REF_MV_SEARCH : ref_mv_idx;
+
+ // Thresholds used for pruning:
+ // A lower value indicates more aggressive pruning and a higher value more
+ // conservative pruning; the value is chosen based on ref_mv_idx and the
+ // speed feature. 'prune_index' 0, 1, 2 corresponds to ref_mv indices 0, 1
+ // and 2; prune_index 3 corresponds to GLOBALMV/GLOBAL_GLOBALMV.
+ static const int tpl_inter_mode_prune_mul_factor[3][MAX_REF_MV_SEARCH + 1] = {
+ { 6, 6, 6, 4 }, { 6, 4, 4, 4 }, { 5, 4, 4, 4 }
+ };
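+ // For example, prune_level 0 with prune_index 0 gives a factor of 6, so
+ // the mode is pruned once cur_inter_cost > (6 * best_inter_cost) >> 2,
+ // i.e. 1.5x the best inter cost.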
+
+ const int is_comp_pred = (refs[1] > INTRA_FRAME);
+ if (!is_comp_pred) {
+ cur_inter_cost = inter_cost_info_from_tpl->ref_inter_cost[refs[0] - 1];
+ } else {
+ const int64_t inter_cost_ref0 =
+ inter_cost_info_from_tpl->ref_inter_cost[refs[0] - 1];
+ const int64_t inter_cost_ref1 =
+ inter_cost_info_from_tpl->ref_inter_cost[refs[1] - 1];
+ // Choose maximum inter_cost among inter_cost_ref0 and inter_cost_ref1 for
+ // more aggressive pruning
+ cur_inter_cost = AOMMAX(inter_cost_ref0, inter_cost_ref1);
+ }
+
+ // Prune the mode if cur_inter_cost is greater than threshold times
+ // best_inter_cost
+ if (cur_inter_cost >
+ ((tpl_inter_mode_prune_mul_factor[prune_level][prune_index] *
+ best_inter_cost) >>
+ 2))
+ return 1;
+ return 0;
+}
+
+/*!\brief High level function to select parameters for compound mode.
+ *
+ * \ingroup inter_mode_search
+ * The main search functionality is done in the call to av1_compound_type_rd().
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] x Pointer to struct holding all the data for
+ * the current macroblock.
+ * \param[in] args HandleInterModeArgs struct holding
+ * miscellaneous arguments for inter mode
+ * search. See the documentation for this
+ * struct for a description of each member.
+ * \param[in] ref_best_rd Best RD found so far for this block.
+ * It is used for early termination of this
+ * search if the RD exceeds this value.
+ * \param[in,out] cur_mv Current motion vector.
+ * \param[in] bsize Current block size.
+ * \param[in,out] compmode_interinter_cost RD of the selected interinter
+ * compound mode.
+ * \param[in,out] rd_buffers CompoundTypeRdBuffers struct to hold all
+ * allocated buffers for the compound
+ * predictors and masks in the compound type
+ * search.
+ * \param[in,out] orig_dst A prediction buffer to hold a computed
+ * prediction. This will eventually hold the
+ * final prediction, and the tmp_dst info will
+ * be copied here.
+ * \param[in] tmp_dst A temporary prediction buffer to hold a
+ * computed prediction.
+ * \param[in,out] rate_mv The rate associated with the motion vectors.
+ * This will be modified if a motion search is
+ * done in the motion mode search.
+ * \param[in,out] rd_stats Struct to keep track of the overall RD
+ * information.
+ * \param[in,out] skip_rd An array of length 2 where skip_rd[0] is the
+ * best total RD for a skip mode so far, and
+ * skip_rd[1] is the best RD for a skip mode so
+ * far in luma. This is used as a speed feature
+ * to skip the transform search if the computed
+ * skip RD for the current mode is not better
+ * than the best skip_rd so far.
+ * \param[in,out] skip_build_pred Indicates whether or not to build the inter
+ * predictor. If this is 0, the inter predictor
+ * has already been built and thus we can avoid
+ * repeating computation.
+ * \return Returns 1 if this mode is worse than one already seen and 0 if it is
+ * a viable candidate.
+ */
+static int process_compound_inter_mode(
+ AV1_COMP *const cpi, MACROBLOCK *x, HandleInterModeArgs *args,
+ int64_t ref_best_rd, int_mv *cur_mv, BLOCK_SIZE bsize,
+ int *compmode_interinter_cost, const CompoundTypeRdBuffers *rd_buffers,
+ const BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst, int *rate_mv,
+ RD_STATS *rd_stats, int64_t *skip_rd, int *skip_build_pred) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const AV1_COMMON *cm = &cpi->common;
+ const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+ cm->seq_params->enable_masked_compound;
+ int mode_search_mask = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) |
+ (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD);
+
+ const int num_planes = av1_num_planes(cm);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ int is_luma_interp_done = 0;
+ set_default_interp_filters(mbmi, cm->features.interp_filter);
+
+ int64_t best_rd_compound;
+ int64_t rd_thresh;
+ const int comp_type_rd_shift = COMP_TYPE_RD_THRESH_SHIFT;
+ const int comp_type_rd_scale = COMP_TYPE_RD_THRESH_SCALE;
+ rd_thresh = get_rd_thresh_from_best_rd(ref_best_rd, (1 << comp_type_rd_shift),
+ comp_type_rd_scale);
+ // Select compound type and any parameters related to that type
+ // (for example, the mask parameters if it is a masked mode) and compute
+ // the RD
+ *compmode_interinter_cost = av1_compound_type_rd(
+ cpi, x, args, bsize, cur_mv, mode_search_mask, masked_compound_used,
+ orig_dst, tmp_dst, rd_buffers, rate_mv, &best_rd_compound, rd_stats,
+ ref_best_rd, skip_rd[1], &is_luma_interp_done, rd_thresh);
+ if (ref_best_rd < INT64_MAX &&
+ (best_rd_compound >> comp_type_rd_shift) * comp_type_rd_scale >
+ ref_best_rd) {
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ return 1;
+ }
+
+ // Build only uv predictor for COMPOUND_AVERAGE.
+ // Note there is no need to call av1_enc_build_inter_predictor
+ // for luma if COMPOUND_AVERAGE is selected because it is the first
+ // candidate in av1_compound_type_rd, which means it used the dst_buf
+ // rather than the tmp_buf.
+ if (mbmi->interinter_comp.type == COMPOUND_AVERAGE && is_luma_interp_done) {
+ if (num_planes > 1) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_U, num_planes - 1);
+ }
+ *skip_build_pred = 1;
+ }
+ return 0;
+}
+
+// Speed feature to prune out MVs that are similar to previous MVs if they
+// don't achieve the best RD advantage.
+static int prune_ref_mv_idx_search(int ref_mv_idx, int best_ref_mv_idx,
+ int_mv save_mv[MAX_REF_MV_SEARCH - 1][2],
+ MB_MODE_INFO *mbmi, int pruning_factor) {
+ int i;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const int thr = (1 + is_comp_pred) << (pruning_factor + 1);
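+ // e.g. for a single-ref block with pruning_factor 1, thr = 1 << 2 = 4, so
+ // an MV whose summed row + col distance from a previously stored MV is at
+ // most 4 (in 1/8-pel units) is treated as a repeat below.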
+
+ // Skip the evaluation if an MV match is found.
+ if (ref_mv_idx > 0) {
+ for (int idx = 0; idx < ref_mv_idx; ++idx) {
+ if (save_mv[idx][0].as_int == INVALID_MV) continue;
+
+ int mv_diff = 0;
+ for (i = 0; i < 1 + is_comp_pred; ++i) {
+ mv_diff += abs(save_mv[idx][i].as_mv.row - mbmi->mv[i].as_mv.row) +
+ abs(save_mv[idx][i].as_mv.col - mbmi->mv[i].as_mv.col);
+ }
+
+ // If this mode is not the best one, and the current MV is similar to a
+ // previously stored MV, terminate this ref_mv_idx evaluation.
+ if (best_ref_mv_idx == -1 && mv_diff <= thr) return 1;
+ }
+ }
+
+ if (ref_mv_idx < MAX_REF_MV_SEARCH - 1) {
+ for (i = 0; i < is_comp_pred + 1; ++i)
+ save_mv[ref_mv_idx][i].as_int = mbmi->mv[i].as_int;
+ }
+
+ return 0;
+}
+
+/*!\brief Prunes ZeroMV Search Using Best NEWMV's SSE
+ *
+ * \ingroup inter_mode_search
+ *
+ * Compares the sse of zero mv and the best sse found in single new_mv. If the
+ * sse of the zero_mv is higher, returns 1 to signal zero_mv can be skipped.
+ * Else returns 0.
+ *
+ * Note that the SSE here comes from single_motion_search, so it is
+ * interpolated with the filter used in motion search, not the actual
+ * interpolation filter used in encoding.
+ *
+ * \param[in] fn_ptr A table of function pointers to compute SSE.
+ * \param[in] x Pointer to struct holding all the data for
+ * the current macroblock.
+ * \param[in] bsize The current block_size.
+ * \param[in] args The args to handle_inter_mode, used to track
+ * the best SSE.
+ * \param[in] prune_zero_mv_with_sse The argument holds speed feature
+ * prune_zero_mv_with_sse value
+ * \return Returns 1 if zero_mv is pruned, 0 otherwise.
+ */
+static AOM_INLINE int prune_zero_mv_with_sse(
+ const aom_variance_fn_ptr_t *fn_ptr, const MACROBLOCK *x, BLOCK_SIZE bsize,
+ const HandleInterModeArgs *args, int prune_zero_mv_with_sse) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+
+ const int is_comp_pred = has_second_ref(mbmi);
+ const MV_REFERENCE_FRAME *refs = mbmi->ref_frame;
+
+ for (int idx = 0; idx < 1 + is_comp_pred; idx++) {
+ if (xd->global_motion[refs[idx]].wmtype != IDENTITY) {
+ // Pruning logic only works for IDENTITY type models
+ // Note: In theory we could apply similar logic for TRANSLATION
+ // type models, but we do not code these due to a spec bug
+ // (see comments in gm_get_motion_vector() in av1/common/mv.h)
+ assert(xd->global_motion[refs[idx]].wmtype != TRANSLATION);
+ return 0;
+ }
+
+ // Don't prune if we have invalid data
+ assert(mbmi->mv[idx].as_int == 0);
+ if (args->best_single_sse_in_refs[refs[idx]] == INT32_MAX) {
+ return 0;
+ }
+ }
+
+ // Sum up the sse of ZEROMV and best NEWMV
+ unsigned int this_sse_sum = 0;
+ unsigned int best_sse_sum = 0;
+ for (int idx = 0; idx < 1 + is_comp_pred; idx++) {
+ const struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+ const struct macroblockd_plane *pd = xd->plane;
+ const struct buf_2d *src_buf = &p->src;
+ const struct buf_2d *ref_buf = &pd->pre[idx];
+ const uint8_t *src = src_buf->buf;
+ const uint8_t *ref = ref_buf->buf;
+ const int src_stride = src_buf->stride;
+ const int ref_stride = ref_buf->stride;
+
+ unsigned int this_sse;
+ fn_ptr[bsize].vf(ref, ref_stride, src, src_stride, &this_sse);
+ this_sse_sum += this_sse;
+
+ const unsigned int best_sse = args->best_single_sse_in_refs[refs[idx]];
+ best_sse_sum += best_sse;
+ }
+
+ const double mul = prune_zero_mv_with_sse > 1 ? 1.00 : 1.25;
+ if ((double)this_sse_sum > (mul * (double)best_sse_sum)) {
+ return 1;
+ }
+
+ return 0;
+}
+
+/*!\brief Searches for interpolation filter in realtime mode during winner eval
+ *
+ * \ingroup inter_mode_search
+ *
+ * Does a simple interpolation filter search during winner mode evaluation.
+ * This is currently only used by realtime mode, as \ref
+ * av1_interpolation_filter_search is not called during realtime encoding.
+ *
+ * This function only searches over two possible filters. EIGHTTAP_REGULAR is
+ * always searched. For low-res clips (<= 240p), MULTITAP_SHARP is also
+ * searched; for higher-res clips (> 240p), EIGHTTAP_SMOOTH is searched
+ * instead.
+ *
+ * \param[in] cpi Pointer to the compressor. Used for feature
+ * flags.
+ * \param[in,out] x Pointer to macroblock. This is primarily
+ * used to access the buffers.
+ * \param[in] mi_row The current row in mi unit (4X4 pixels).
+ * \param[in] mi_col The current col in mi unit (4X4 pixels).
+ * \param[in] bsize The current block_size.
+ * \return Returns true if a predictor is built in xd->dst, false otherwise.
+ */
+static AOM_INLINE bool fast_interp_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ static const InterpFilters filters_ref_set[3] = {
+ { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR },
+ { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH },
+ { MULTITAP_SHARP, MULTITAP_SHARP }
+ };
+
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ int64_t best_cost = INT64_MAX;
+ int best_filter_index = -1;
+ // dst_bufs[0] stores the new predictor, and dst_bufs[1] stores the best
+ // one found so far.
+ const int num_planes = av1_num_planes(cm);
+ const int is_240p_or_lesser = AOMMIN(cm->width, cm->height) <= 240;
+ assert(is_inter_mode(mi->mode));
+ assert(mi->motion_mode == SIMPLE_TRANSLATION);
+ assert(!is_inter_compound_mode(mi->mode));
+
+ if (!av1_is_interp_needed(xd)) {
+ return false;
+ }
+
+ struct macroblockd_plane *pd = xd->plane;
+ const BUFFER_SET orig_dst = {
+ { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf },
+ { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride },
+ };
+ uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]);
+ const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE,
+ tmp_buf + 2 * MAX_SB_SQUARE },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
+ const BUFFER_SET *dst_bufs[2] = { &orig_dst, &tmp_dst };
+
+ for (int i = 0; i < 3; ++i) {
+ if (is_240p_or_lesser) {
+ if (filters_ref_set[i].x_filter == EIGHTTAP_SMOOTH) {
+ continue;
+ }
+ } else {
+ if (filters_ref_set[i].x_filter == MULTITAP_SHARP) {
+ continue;
+ }
+ }
+ int64_t cost;
+ RD_STATS tmp_rd = { 0 };
+
+ mi->interp_filters.as_filters = filters_ref_set[i];
+ av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+
+ model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model
+ ? MODELRD_LEGACY
+ : MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, AOM_PLANE_Y, AOM_PLANE_Y, &tmp_rd.rate, &tmp_rd.dist,
+ &tmp_rd.skip_txfm, &tmp_rd.sse, NULL, NULL, NULL);
+
+ tmp_rd.rate += av1_get_switchable_rate(x, xd, cm->features.interp_filter,
+ cm->seq_params->enable_dual_filter);
+ cost = RDCOST(x->rdmult, tmp_rd.rate, tmp_rd.dist);
+ if (cost < best_cost) {
+ best_filter_index = i;
+ best_cost = cost;
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ }
+ }
+ assert(best_filter_index >= 0);
+
+ mi->interp_filters.as_filters = filters_ref_set[best_filter_index];
+
+ const bool is_best_pred_in_orig = &orig_dst == dst_bufs[1];
+
+ if (is_best_pred_in_orig) {
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ } else {
+ // Note that xd->pd's buffers are kept in sync with dst_bufs[0]. So if
+ // is_best_pred_in_orig is false, that means the current buffer is the
+ // original one.
+ assert(&orig_dst == dst_bufs[0]);
+ assert(xd->plane[AOM_PLANE_Y].dst.buf == orig_dst.plane[AOM_PLANE_Y]);
+ const int width = block_size_wide[bsize];
+ const int height = block_size_high[bsize];
+#if CONFIG_AV1_HIGHBITDEPTH
+ const bool is_hbd = is_cur_buf_hbd(xd);
+ if (is_hbd) {
+ aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(tmp_dst.plane[AOM_PLANE_Y]),
+ tmp_dst.stride[AOM_PLANE_Y],
+ CONVERT_TO_SHORTPTR(orig_dst.plane[AOM_PLANE_Y]),
+ orig_dst.stride[AOM_PLANE_Y], width, height);
+ } else {
+ aom_convolve_copy(tmp_dst.plane[AOM_PLANE_Y], tmp_dst.stride[AOM_PLANE_Y],
+ orig_dst.plane[AOM_PLANE_Y],
+ orig_dst.stride[AOM_PLANE_Y], width, height);
+ }
+#else
+ aom_convolve_copy(tmp_dst.plane[AOM_PLANE_Y], tmp_dst.stride[AOM_PLANE_Y],
+ orig_dst.plane[AOM_PLANE_Y], orig_dst.stride[AOM_PLANE_Y],
+ width, height);
+#endif
+ }
+
+ // Build the UV planes of the predictor (the Y plane was built above).
+ if (num_planes > 1) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_U, AOM_PLANE_V);
+ }
+
+ return true;
+}
+
+/*!\brief AV1 inter mode RD computation
+ *
+ * \ingroup inter_mode_search
+ * Do the RD search for a given inter mode and compute all information relevant
+ * to the input mode. It will compute the best MV,
+ * compound parameters (if the mode is a compound mode) and interpolation filter
+ * parameters.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] tile_data Pointer to struct holding adaptive
+ * data/contexts/models for the tile during
+ * encoding.
+ * \param[in] x Pointer to structure holding all the data
+ * for the current macroblock.
+ * \param[in] bsize Current block size.
+ * \param[in,out] rd_stats Struct to keep track of the overall RD
+ * information.
+ * \param[in,out] rd_stats_y Struct to keep track of the RD information
+ * for only the Y plane.
+ * \param[in,out] rd_stats_uv Struct to keep track of the RD information
+ * for only the UV planes.
+ * \param[in] args HandleInterModeArgs struct holding
+ * miscellaneous arguments for inter mode
+ * search. See the documentation for this
+ * struct for a description of each member.
+ * \param[in] ref_best_rd Best RD found so far for this block.
+ * It is used for early termination of this
+ * search if the RD exceeds this value.
+ * \param[in] tmp_buf Temporary buffer used to hold predictors
+ * built in this search.
+ * \param[in,out] rd_buffers CompoundTypeRdBuffers struct to hold all
+ * allocated buffers for the compound
+ * predictors and masks in the compound type
+ * search.
+ * \param[in,out] best_est_rd Estimated RD for motion mode search if
+ * do_tx_search (see below) is 0.
+ * \param[in] do_tx_search Parameter to indicate whether or not to do
+ * a full transform search. This will compute
+ * an estimated RD for the modes without the
+ * transform search and later perform the full
+ * transform search on the best candidates.
+ * \param[in,out] inter_modes_info InterModesInfo struct to hold inter mode
+ * information to perform a full transform
+ * search only on winning candidates searched
+ * with an estimate for transform coding RD.
+ * \param[in,out] motion_mode_cand A motion_mode_candidate struct to store
+ * motion mode information used in a speed
+ * feature to search motion modes other than
+ * SIMPLE_TRANSLATION only on winning
+ * candidates.
+ * \param[in,out] skip_rd A length 2 array, where skip_rd[0] is the
+ * best total RD for a skip mode so far, and
+ * skip_rd[1] is the best RD for a skip mode so
+ * far in luma. This is used as a speed feature
+ * to skip the transform search if the computed
+ * skip RD for the current mode is not better
+ * than the best skip_rd so far.
+ * \param[in] inter_cost_info_from_tpl A PruneInfoFromTpl struct used to
+ * narrow down the search based on data
+ * collected in the TPL model.
+ * \param[out] yrd Stores the rdcost corresponding to encoding
+ * the luma plane.
+ *
+ * \return The RD cost for the mode being searched.
+ */
+static int64_t handle_inter_mode(
+ AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *x,
+ BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv, HandleInterModeArgs *args, int64_t ref_best_rd,
+ uint8_t *const tmp_buf, const CompoundTypeRdBuffers *rd_buffers,
+ int64_t *best_est_rd, const int do_tx_search,
+ InterModesInfo *inter_modes_info, motion_mode_candidate *motion_mode_cand,
+ int64_t *skip_rd, PruneInfoFromTpl *inter_cost_info_from_tpl,
+ int64_t *yrd) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const PREDICTION_MODE this_mode = mbmi->mode;
+
+#if CONFIG_REALTIME_ONLY
+ const int prune_modes_based_on_tpl = 0;
+#else // CONFIG_REALTIME_ONLY
+ const TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const int prune_modes_based_on_tpl =
+ cpi->sf.inter_sf.prune_inter_modes_based_on_tpl &&
+ av1_tpl_stats_ready(tpl_data, cpi->gf_frame_index);
+#endif // CONFIG_REALTIME_ONLY
+ int i;
+ // Reference frames for this mode
+ const int refs[2] = { mbmi->ref_frame[0],
+ (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+ int rate_mv = 0;
+ int64_t rd = INT64_MAX;
+ // Do first prediction into the destination buffer. Do the next
+ // prediction into a temporary buffer. Then keep track of which one
+ // of these currently holds the best predictor, and use the other
+ // one for future predictions. In the end, copy from tmp_buf to
+ // dst if necessary.
+ struct macroblockd_plane *pd = xd->plane;
+ const BUFFER_SET orig_dst = {
+ { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf },
+ { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride },
+ };
+ const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE,
+ tmp_buf + 2 * MAX_SB_SQUARE },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
+
+ int64_t ret_val = INT64_MAX;
+ const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
+ int64_t best_rd = INT64_MAX;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ int64_t best_yrd = INT64_MAX;
+ MB_MODE_INFO best_mbmi = *mbmi;
+ int best_xskip_txfm = 0;
+ int64_t newmv_ret_val = INT64_MAX;
+ inter_mode_info mode_info[MAX_REF_MV_SEARCH];
+
+ // Do not prune the mode based on inter cost from tpl if the current ref frame
+ // is the winner ref in neighbouring blocks.
+ int ref_match_found_in_above_nb = 0;
+ int ref_match_found_in_left_nb = 0;
+ if (prune_modes_based_on_tpl) {
+ ref_match_found_in_above_nb =
+ find_ref_match_in_above_nbs(cm->mi_params.mi_cols, xd);
+ ref_match_found_in_left_nb =
+ find_ref_match_in_left_nbs(cm->mi_params.mi_rows, xd);
+ }
+
+ // First, perform a simple translation search for each of the indices. If
+ // an index performs well, it will be fully searched in the main loop
+ // of this function.
+ const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
+ // Save MV results from first 2 ref_mv_idx.
+ int_mv save_mv[MAX_REF_MV_SEARCH - 1][2];
+ int best_ref_mv_idx = -1;
+ const int idx_mask =
+ ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd, bsize, ref_set);
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int ref_mv_cost = cost_mv_ref(mode_costs, this_mode, mode_ctx);
+ const int base_rate =
+ args->ref_frame_cost + args->single_comp_cost + ref_mv_cost;
+
+ for (i = 0; i < MAX_REF_MV_SEARCH - 1; ++i) {
+ save_mv[i][0].as_int = INVALID_MV;
+ save_mv[i][1].as_int = INVALID_MV;
+ }
+ args->start_mv_cnt = 0;
+
+ // Main loop of this function. This will iterate over all of the ref mvs
+ // in the dynamic reference list and do the following:
+ // 1.) Get the current MV. Create newmv MV if necessary
+ // 2.) Search compound type and parameters if applicable
+ // 3.) Do interpolation filter search
+ // 4.) Build the inter predictor
+ // 5.) Pick the motion mode (SIMPLE_TRANSLATION, OBMC_CAUSAL,
+ // WARPED_CAUSAL)
+ // 6.) Update stats if best so far
+ for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ mbmi->ref_mv_idx = ref_mv_idx;
+
+ mode_info[ref_mv_idx].full_search_mv.as_int = INVALID_MV;
+ mode_info[ref_mv_idx].full_mv_bestsme = INT_MAX;
+ const int drl_cost = get_drl_cost(
+ mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type);
+ mode_info[ref_mv_idx].drl_cost = drl_cost;
+ mode_info[ref_mv_idx].skip = 0;
+
+ if (!mask_check_bit(idx_mask, ref_mv_idx)) {
+ // MV did not perform well in simple translation search. Skip it.
+ continue;
+ }
+ if (prune_modes_based_on_tpl && !ref_match_found_in_above_nb &&
+ !ref_match_found_in_left_nb && (ref_best_rd != INT64_MAX)) {
+ // Skip mode if TPL model indicates it will not be beneficial.
+ if (prune_modes_based_on_tpl_stats(
+ inter_cost_info_from_tpl, refs, ref_mv_idx, this_mode,
+ cpi->sf.inter_sf.prune_inter_modes_based_on_tpl))
+ continue;
+ }
+ av1_init_rd_stats(rd_stats);
+
+ // Initialize compound mode data
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = 1;
+ if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME;
+
+ mbmi->num_proj_ref = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+
+ // Compute cost for signalling this DRL index
+ rd_stats->rate = base_rate;
+ rd_stats->rate += drl_cost;
+
+ int rs = 0;
+ int compmode_interinter_cost = 0;
+
+ int_mv cur_mv[2];
+
+ // TODO(Cherma): Extend this speed feature to support compound mode
+ int skip_repeated_ref_mv =
+ is_comp_pred ? 0 : cpi->sf.inter_sf.skip_repeated_ref_mv;
+ // Generate the current mv according to the prediction mode
+ if (!build_cur_mv(cur_mv, this_mode, cm, x, skip_repeated_ref_mv)) {
+ continue;
+ }
+
+ // The above call to build_cur_mv does not handle NEWMV modes. Build
+ // the mv here if we have NEWMV for any predictors.
+ if (have_newmv_in_inter_mode(this_mode)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, handle_newmv_time);
+#endif
+ newmv_ret_val =
+ handle_newmv(cpi, x, bsize, cur_mv, &rate_mv, args, mode_info);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, handle_newmv_time);
+#endif
+
+ if (newmv_ret_val != 0) continue;
+
+ if (is_inter_singleref_mode(this_mode) &&
+ cur_mv[0].as_int != INVALID_MV) {
+ const MV_REFERENCE_FRAME ref = refs[0];
+ const unsigned int this_sse = x->pred_sse[ref];
+ if (this_sse < args->best_single_sse_in_refs[ref]) {
+ args->best_single_sse_in_refs[ref] = this_sse;
+ }
+
+ if (cpi->sf.rt_sf.skip_newmv_mode_based_on_sse) {
+ const int th_idx = cpi->sf.rt_sf.skip_newmv_mode_based_on_sse - 1;
+ const int pix_idx = num_pels_log2_lookup[bsize] - 4;
+ const double scale_factor[3][11] = {
+ { 0.7, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9 },
+ { 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 1, 1, 1, 1, 1 },
+ { 0.7, 0.7, 0.7, 0.7, 1, 1, 1, 1, 1, 1, 1 }
+ };
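+ // scale_factor appears to be indexed by the speed-feature strength
+ // (th_idx) and the log2 block size (pix_idx); larger blocks and stronger
+ // settings use factors closer to 1, making NEWMV easier to skip here.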
+ assert(pix_idx >= 0);
+ assert(th_idx <= 2);
+ if (args->best_pred_sse < scale_factor[th_idx][pix_idx] * this_sse)
+ continue;
+ }
+ }
+
+ rd_stats->rate += rate_mv;
+ }
+ // Copy the motion vector for this mode into mbmi struct
+ for (i = 0; i < is_comp_pred + 1; ++i) {
+ mbmi->mv[i].as_int = cur_mv[i].as_int;
+ }
+
+ if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
+ mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
+ continue;
+ }
+
+ // Skip the rest of the search if prune_ref_mv_idx_search speed feature
+ // is enabled, and the current MV is similar to a previous one.
+ if (cpi->sf.inter_sf.prune_ref_mv_idx_search && is_comp_pred &&
+ prune_ref_mv_idx_search(ref_mv_idx, best_ref_mv_idx, save_mv, mbmi,
+ cpi->sf.inter_sf.prune_ref_mv_idx_search))
+ continue;
+
+ if (cpi->sf.gm_sf.prune_zero_mv_with_sse &&
+ (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV)) {
+ if (prune_zero_mv_with_sse(cpi->ppi->fn_ptr, x, bsize, args,
+ cpi->sf.gm_sf.prune_zero_mv_with_sse)) {
+ continue;
+ }
+ }
+
+ int skip_build_pred = 0;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // Handle a compound predictor; continue if it is determined that this
+ // cannot be the best compound mode
+ if (is_comp_pred) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, compound_type_rd_time);
+#endif
+ const int not_best_mode = process_compound_inter_mode(
+ cpi, x, args, ref_best_rd, cur_mv, bsize, &compmode_interinter_cost,
+ rd_buffers, &orig_dst, &tmp_dst, &rate_mv, rd_stats, skip_rd,
+ &skip_build_pred);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, compound_type_rd_time);
+#endif
+ if (not_best_mode) continue;
+ }
+
+ if (!args->skip_ifs) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, interpolation_filter_search_time);
+#endif
+ // Determine the interpolation filter for this mode
+ ret_val = av1_interpolation_filter_search(
+ x, cpi, tile_data, bsize, &tmp_dst, &orig_dst, &rd, &rs,
+ &skip_build_pred, args, ref_best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, interpolation_filter_search_time);
+#endif
+ if (args->modelled_rd != NULL && !is_comp_pred) {
+ args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
+ }
+ if (ret_val != 0) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ } else if (cpi->sf.inter_sf.model_based_post_interp_filter_breakout &&
+ ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ }
+
+ // Compute modelled RD if enabled
+ if (args->modelled_rd != NULL) {
+ if (is_comp_pred) {
+ const int mode0 = compound_ref0_mode(this_mode);
+ const int mode1 = compound_ref1_mode(this_mode);
+ const int64_t mrd =
+ AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+ args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
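+          // Prune the compound mode if its modelled RD exceeds 4/3 of the
+          // better single-ref modelled RD ((rd >> 3) * 6 == rd * 3 / 4).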
+ if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ }
+ }
+ }
+ }
+
+ rd_stats->rate += compmode_interinter_cost;
+ if (skip_build_pred != 1) {
+ // Build this inter predictor if it has not been previously built
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize, 0,
+ av1_num_planes(cm) - 1);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, motion_mode_rd_time);
+#endif
+ int rate2_nocoeff = rd_stats->rate;
+ // Determine the motion mode. This will be one of SIMPLE_TRANSLATION,
+ // OBMC_CAUSAL or WARPED_CAUSAL
+ int64_t this_yrd;
+ ret_val = motion_mode_rd(cpi, tile_data, x, bsize, rd_stats, rd_stats_y,
+ rd_stats_uv, args, ref_best_rd, skip_rd, &rate_mv,
+ &orig_dst, best_est_rd, do_tx_search,
+ inter_modes_info, 0, &this_yrd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, motion_mode_rd_time);
+#endif
+ assert(
+ IMPLIES(!av1_check_newmv_joint_nonzero(cm, x), ret_val == INT64_MAX));
+
+ if (ret_val != INT64_MAX) {
+ int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ const THR_MODES mode_enum = get_prediction_mode_idx(
+ mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ // Collect mode stats for multiwinner mode processing
+ store_winner_mode_stats(&cpi->common, x, mbmi, rd_stats, rd_stats_y,
+ rd_stats_uv, mode_enum, NULL, bsize, tmp_rd,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type,
+ do_tx_search);
+ if (tmp_rd < best_rd) {
+ best_yrd = this_yrd;
+ // Update the best rd stats if we found the best mode so far
+ best_rd_stats = *rd_stats;
+ best_rd_stats_y = *rd_stats_y;
+ best_rd_stats_uv = *rd_stats_uv;
+ best_rd = tmp_rd;
+ best_mbmi = *mbmi;
+ best_xskip_txfm = txfm_info->skip_txfm;
+ memcpy(best_blk_skip, txfm_info->blk_skip,
+ sizeof(best_blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(best_tx_type_map, xd->tx_type_map,
+ xd->height * xd->width);
+ motion_mode_cand->rate_mv = rate_mv;
+ motion_mode_cand->rate2_nocoeff = rate2_nocoeff;
+ }
+
+ if (tmp_rd < ref_best_rd) {
+ ref_best_rd = tmp_rd;
+ best_ref_mv_idx = ref_mv_idx;
+ }
+ }
+ restore_dst_buf(xd, orig_dst, num_planes);
+ }
+
+ if (best_rd == INT64_MAX) return INT64_MAX;
+
+  // Re-instate the status of the best choice.
+ *rd_stats = best_rd_stats;
+ *rd_stats_y = best_rd_stats_y;
+ *rd_stats_uv = best_rd_stats_uv;
+ *yrd = best_yrd;
+ *mbmi = best_mbmi;
+ txfm_info->skip_txfm = best_xskip_txfm;
+ assert(IMPLIES(mbmi->comp_group_idx == 1,
+ mbmi->interinter_comp.type != COMPOUND_AVERAGE));
+ memcpy(txfm_info->blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width);
+
+ rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+
+ return rd_stats->rdcost;
+}
+
+/*!\brief Search for the best intrabc predictor
+ *
+ * \ingroup intra_mode_search
+ * \callergraph
+ * This function performs a motion search to find the best intrabc predictor.
+ *
+ * \returns The best overall rdcost (including the non-intrabc mode search
+ * performed before this function).
+ */
+static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
+ PICK_MODE_CONTEXT *ctx,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t best_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (!av1_allow_intrabc(cm) || !cpi->oxcf.kf_cfg.enable_intrabc ||
+ !cpi->sf.mv_sf.use_intrabc || cpi->sf.rt_sf.use_nonrd_pick_mode)
+ return INT64_MAX;
+ const int num_planes = av1_num_planes(cm);
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const TileInfo *tile = &xd->tile;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int sb_row = mi_row >> cm->seq_params->mib_size_log2;
+ const int sb_col = mi_col >> cm->seq_params->mib_size_log2;
+
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ const MV_REFERENCE_FRAME ref_frame = INTRA_FRAME;
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+ mbmi_ext->mode_context);
+ // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+ // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
+ int_mv nearestmv, nearmv;
+ av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv,
+ 0);
+
+ if (nearestmv.as_int == INVALID_MV) {
+ nearestmv.as_int = 0;
+ }
+ if (nearmv.as_int == INVALID_MV) {
+ nearmv.as_int = 0;
+ }
+
+ int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
+ if (dv_ref.as_int == 0) {
+ av1_find_ref_dv(&dv_ref, tile, cm->seq_params->mib_size, mi_row);
+ }
+ // Ref DV should not have sub-pel.
+ assert((dv_ref.as_mv.col & 7) == 0);
+ assert((dv_ref.as_mv.row & 7) == 0);
+ mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv = dv_ref;
+
+ struct buf_2d yv12_mb[MAX_MB_PLANE];
+ av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, NULL, NULL, num_planes);
+ for (int i = 0; i < num_planes; ++i) {
+ xd->plane[i].pre[0] = yv12_mb[i];
+ }
+
+ enum IntrabcMotionDirection {
+ IBC_MOTION_ABOVE,
+ IBC_MOTION_LEFT,
+ IBC_MOTION_DIRECTIONS
+ };
+
+ MB_MODE_INFO best_mbmi = *mbmi;
+ RD_STATS best_rdstats = *rd_stats;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+
+ FULLPEL_MOTION_SEARCH_PARAMS fullms_params;
+ const SEARCH_METHODS search_method =
+ av1_get_default_mv_search_method(x, &cpi->sf.mv_sf, bsize);
+ const search_site_config *lookahead_search_sites =
+ cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
+ const FULLPEL_MV start_mv = get_fullmv_from_mv(&dv_ref.as_mv);
+ av1_make_default_fullpel_ms_params(&fullms_params, cpi, x, bsize,
+ &dv_ref.as_mv, start_mv,
+ lookahead_search_sites, search_method,
+ /*fine_search_interval=*/0);
+ const IntraBCMVCosts *const dv_costs = x->dv_costs;
+ av1_set_ms_to_intra_mode(&fullms_params, dv_costs);
+
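+  // Search two regions: the already-coded superblock rows above, and the
+  // already-coded area to the left within the current superblock row. The MV
+  // limits are set per direction inside the loop.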
+ for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE;
+ dir < IBC_MOTION_DIRECTIONS; ++dir) {
+ switch (dir) {
+ case IBC_MOTION_ABOVE:
+ fullms_params.mv_limits.col_min =
+ (tile->mi_col_start - mi_col) * MI_SIZE;
+ fullms_params.mv_limits.col_max =
+ (tile->mi_col_end - mi_col) * MI_SIZE - w;
+ fullms_params.mv_limits.row_min =
+ (tile->mi_row_start - mi_row) * MI_SIZE;
+ fullms_params.mv_limits.row_max =
+ (sb_row * cm->seq_params->mib_size - mi_row) * MI_SIZE - h;
+ break;
+ case IBC_MOTION_LEFT:
+ fullms_params.mv_limits.col_min =
+ (tile->mi_col_start - mi_col) * MI_SIZE;
+ fullms_params.mv_limits.col_max =
+ (sb_col * cm->seq_params->mib_size - mi_col) * MI_SIZE - w;
+ // TODO(aconverse@google.com): Minimize the overlap between above and
+ // left areas.
+ fullms_params.mv_limits.row_min =
+ (tile->mi_row_start - mi_row) * MI_SIZE;
+ int bottom_coded_mi_edge =
+ AOMMIN((sb_row + 1) * cm->seq_params->mib_size, tile->mi_row_end);
+ fullms_params.mv_limits.row_max =
+ (bottom_coded_mi_edge - mi_row) * MI_SIZE - h;
+ break;
+ default: assert(0);
+ }
+    const FullMvLimits tmp_mv_limits = fullms_params.mv_limits;
+    av1_set_mv_search_range(&fullms_params.mv_limits, &dv_ref.as_mv);
+    // av1_set_mv_search_range() intersects the limits with the valid MV
+    // window, so it may only tighten them.
+    assert(fullms_params.mv_limits.col_min >= tmp_mv_limits.col_min);
+    assert(fullms_params.mv_limits.col_max <= tmp_mv_limits.col_max);
+    assert(fullms_params.mv_limits.row_min >= tmp_mv_limits.row_min);
+    assert(fullms_params.mv_limits.row_max <= tmp_mv_limits.row_max);
+
+ if (fullms_params.mv_limits.col_max < fullms_params.mv_limits.col_min ||
+ fullms_params.mv_limits.row_max < fullms_params.mv_limits.row_min) {
+ continue;
+ }
+
+ const int step_param = cpi->mv_search_params.mv_step_param;
+ IntraBCHashInfo *intrabc_hash_info = &x->intrabc_hash_info;
+ int_mv best_mv, best_hash_mv;
+ FULLPEL_MV_STATS best_mv_stats;
+
+ int bestsme =
+ av1_full_pixel_search(start_mv, &fullms_params, step_param, NULL,
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
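+    // Also try hash-based block matching and keep whichever of the two
+    // searches gives the lower error.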
+ const int hashsme = av1_intrabc_hash_search(
+ cpi, xd, &fullms_params, intrabc_hash_info, &best_hash_mv.as_fullmv);
+ if (hashsme < bestsme) {
+ best_mv = best_hash_mv;
+ bestsme = hashsme;
+ }
+
+ if (bestsme == INT_MAX) continue;
+ const MV dv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ if (!av1_is_fullmv_in_range(&fullms_params.mv_limits,
+ get_fullmv_from_mv(&dv)))
+ continue;
+ if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize,
+ cm->seq_params->mib_size_log2))
+ continue;
+
+ // DV should not have sub-pel.
+ assert((dv.col & 7) == 0);
+ assert((dv.row & 7) == 0);
+ memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info));
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->use_intrabc = 1;
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->mv[0].as_mv = dv;
+ mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+ mbmi->skip_txfm = 0;
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
+
+ // TODO(aconverse@google.com): The full motion field defining discount
+ // in MV_COST_WEIGHT is too large. Explore other values.
+ const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, dv_costs->joint_mv,
+ dv_costs->dv_costs, MV_COST_WEIGHT_SUB);
+ const int rate_mode = x->mode_costs.intrabc_cost[1];
+ RD_STATS rd_stats_yuv, rd_stats_y, rd_stats_uv;
+ if (!av1_txfm_search(cpi, x, bsize, &rd_stats_yuv, &rd_stats_y,
+ &rd_stats_uv, rate_mode + rate_mv, INT64_MAX))
+ continue;
+ rd_stats_yuv.rdcost =
+ RDCOST(x->rdmult, rd_stats_yuv.rate, rd_stats_yuv.dist);
+ if (rd_stats_yuv.rdcost < best_rd) {
+ best_rd = rd_stats_yuv.rdcost;
+ best_mbmi = *mbmi;
+ best_rdstats = rd_stats_yuv;
+ memcpy(best_blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
+      av1_copy_array(best_tx_type_map, xd->tx_type_map,
+                     xd->height * xd->width);
+ }
+ }
+ *mbmi = best_mbmi;
+ *rd_stats = best_rdstats;
+ memcpy(txfm_info->blk_skip, best_blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * xd->height * xd->width);
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk);
+#if CONFIG_RD_DEBUG
+ mbmi->rd_stats = *rd_stats;
+#endif
+ return best_rd;
+}
+
+// TODO(chiyotsai@google.com): We are using the underlying struct types (e.g.
+// struct AV1_COMP, struct macroblock) instead of their typedefs here because
+// Doxygen doesn't know about the typedefs yet, so using the typedefs would
+// prevent Doxygen from finding this function and generating the callgraph.
+// Once documentation for AV1_COMP and MACROBLOCK is added to Doxygen, we can
+// revert to using the typedefs.
+void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
+ struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int num_planes = av1_num_planes(cm);
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
+ uint8_t y_skip_txfm = 0, uv_skip_txfm = 0;
+ int64_t dist_y = 0, dist_uv = 0;
+
+ ctx->rd_stats.skip_txfm = 0;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->use_intrabc = 0;
+ mbmi->mv[0].as_int = 0;
+ mbmi->skip_mode = 0;
+
+ const int64_t intra_yrd =
+ av1_rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y,
+ &y_skip_txfm, bsize, best_rd, ctx);
+
+ // Initialize default mode evaluation params
+ set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+ if (intra_yrd < best_rd) {
+ // Search intra modes for uv planes if needed
+ if (num_planes > 1) {
+ // Set up the tx variables for reproducing the y predictions in case we
+ // need it for chroma-from-luma.
+ if (xd->is_chroma_ref && store_cfl_required_rdo(cm, x)) {
+ memcpy(txfm_info->blk_skip, ctx->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk);
+ }
+ const TX_SIZE max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+ av1_rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+ &dist_uv, &uv_skip_txfm, bsize,
+ max_uv_tx_size);
+ }
+
+ // Intra block is always coded as non-skip
+ rd_cost->rate =
+ rate_y + rate_uv +
+ x->mode_costs.skip_txfm_cost[av1_get_skip_txfm_context(xd)][0];
+ rd_cost->dist = dist_y + dist_uv;
+ rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
+ rd_cost->skip_txfm = 0;
+ } else {
+ rd_cost->rate = INT_MAX;
+ }
+
+ if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd)
+ best_rd = rd_cost->rdcost;
+ if (rd_pick_intrabc_mode_sb(cpi, x, ctx, rd_cost, bsize, best_rd) < best_rd) {
+ ctx->rd_stats.skip_txfm = mbmi->skip_txfm;
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ assert(rd_cost->rate != INT_MAX);
+ }
+ if (rd_cost->rate == INT_MAX) return;
+
+ ctx->mic = *xd->mi[0];
+ av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, &x->mbmi_ext,
+ av1_ref_frame_type(xd->mi[0]->ref_frame));
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+}
+
+static AOM_INLINE void calc_target_weighted_pred(
+ const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd,
+ const uint8_t *above, int above_stride, const uint8_t *left,
+ int left_stride);
+
+static AOM_INLINE void rd_pick_skip_mode(
+ RD_STATS *rd_cost, InterModeSearchState *search_state,
+ const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ x->compound_idx = 1; // COMPOUND_AVERAGE
+ RD_STATS skip_mode_rd_stats;
+ av1_invalid_rd_stats(&skip_mode_rd_stats);
+
+ if (skip_mode_info->ref_frame_idx_0 == INVALID_IDX ||
+ skip_mode_info->ref_frame_idx_1 == INVALID_IDX) {
+ return;
+ }
+
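+  // skip_mode implies compound NEAREST_NEARESTMV prediction from the two
+  // reference frames signalled at frame level in skip_mode_info.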
+ const MV_REFERENCE_FRAME ref_frame =
+ LAST_FRAME + skip_mode_info->ref_frame_idx_0;
+ const MV_REFERENCE_FRAME second_ref_frame =
+ LAST_FRAME + skip_mode_info->ref_frame_idx_1;
+ const PREDICTION_MODE this_mode = NEAREST_NEARESTMV;
+ const THR_MODES mode_index =
+ get_prediction_mode_idx(this_mode, ref_frame, second_ref_frame);
+
+ if (mode_index == THR_INVALID) {
+ return;
+ }
+
+ if ((!cpi->oxcf.ref_frm_cfg.enable_onesided_comp ||
+ cpi->sf.inter_sf.disable_onesided_comp) &&
+ cpi->all_one_sided_refs) {
+ return;
+ }
+
+ mbmi->mode = this_mode;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->ref_frame[0] = ref_frame;
+ mbmi->ref_frame[1] = second_ref_frame;
+ const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ if (x->mbmi_ext.ref_mv_count[ref_frame_type] == UINT8_MAX) {
+ MB_MODE_INFO_EXT *mbmi_ext = &x->mbmi_ext;
+ if (mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX ||
+ mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) {
+ return;
+ }
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+ mbmi_ext->mode_context);
+ // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+ // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_type);
+ }
+
+ assert(this_mode == NEAREST_NEARESTMV);
+ if (!build_cur_mv(mbmi->mv, this_mode, cm, x, 0)) {
+ return;
+ }
+
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = x->compound_idx;
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->ref_mv_idx = 0;
+ mbmi->skip_mode = mbmi->skip_txfm = 1;
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+
+ set_default_interp_filters(mbmi, cm->features.interp_filter);
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ BUFFER_SET orig_dst;
+ for (int i = 0; i < num_planes; i++) {
+ orig_dst.plane[i] = xd->plane[i].dst.buf;
+ orig_dst.stride[i] = xd->plane[i].dst.stride;
+ }
+
+ // Compare the use of skip_mode with the best intra/inter mode obtained.
+ const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+ int64_t best_intra_inter_mode_cost = INT64_MAX;
+ if (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX) {
+ const ModeCosts *mode_costs = &x->mode_costs;
+ best_intra_inter_mode_cost = RDCOST(
+ x->rdmult, rd_cost->rate + mode_costs->skip_mode_cost[skip_mode_ctx][0],
+ rd_cost->dist);
+ // Account for non-skip mode rate in total rd stats
+ rd_cost->rate += mode_costs->skip_mode_cost[skip_mode_ctx][0];
+ av1_rd_cost_update(x->rdmult, rd_cost);
+ }
+
+ // Obtain the rdcost for skip_mode.
+ skip_mode_rd(&skip_mode_rd_stats, cpi, x, bsize, &orig_dst,
+ best_intra_inter_mode_cost);
+
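+  // Accept skip_mode only if it is not worse than the best mode found so far;
+  // in lossless segments it must additionally reconstruct exactly (zero
+  // distortion).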
+ if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost &&
+ (!xd->lossless[mbmi->segment_id] || skip_mode_rd_stats.dist == 0)) {
+ assert(mode_index != THR_INVALID);
+    search_state->best_mbmode = *mbmi;
+    search_state->best_mbmode.skip_mode = 1;
+ memset(search_state->best_mbmode.inter_tx_size,
+ search_state->best_mbmode.tx_size,
+ sizeof(search_state->best_mbmode.inter_tx_size));
+ set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->width, xd->height,
+ search_state->best_mbmode.skip_txfm && is_inter_block(mbmi),
+ xd);
+ search_state->best_mode_index = mode_index;
+
+ // Update rd_cost
+ rd_cost->rate = skip_mode_rd_stats.rate;
+ rd_cost->dist = rd_cost->sse = skip_mode_rd_stats.dist;
+ rd_cost->rdcost = skip_mode_rd_stats.rdcost;
+
+ search_state->best_rd = rd_cost->rdcost;
+ search_state->best_skip2 = 1;
+ search_state->best_mode_skippable = 1;
+
+ x->txfm_search_info.skip_txfm = 1;
+ }
+}
+
+// Get winner mode stats of given mode index
+static AOM_INLINE MB_MODE_INFO *get_winner_mode_stats(
+ MACROBLOCK *x, MB_MODE_INFO *best_mbmode, RD_STATS *best_rd_cost,
+ int best_rate_y, int best_rate_uv, THR_MODES *best_mode_index,
+ RD_STATS **winner_rd_cost, int *winner_rate_y, int *winner_rate_uv,
+ THR_MODES *winner_mode_index, MULTI_WINNER_MODE_TYPE multi_winner_mode_type,
+ int mode_idx) {
+ MB_MODE_INFO *winner_mbmi;
+ if (multi_winner_mode_type) {
+ assert(mode_idx >= 0 && mode_idx < x->winner_mode_count);
+ WinnerModeStats *winner_mode_stat = &x->winner_mode_stats[mode_idx];
+ winner_mbmi = &winner_mode_stat->mbmi;
+
+ *winner_rd_cost = &winner_mode_stat->rd_cost;
+ *winner_rate_y = winner_mode_stat->rate_y;
+ *winner_rate_uv = winner_mode_stat->rate_uv;
+ *winner_mode_index = winner_mode_stat->mode_index;
+ } else {
+ winner_mbmi = best_mbmode;
+ *winner_rd_cost = best_rd_cost;
+ *winner_rate_y = best_rate_y;
+ *winner_rate_uv = best_rate_uv;
+ *winner_mode_index = *best_mode_index;
+ }
+ return winner_mbmi;
+}
+
+// Speed feature: fast intra/inter transform type search, used for speed >= 2.
+// When this speed feature is on, only DCT is used during the rd mode search.
+// After the mode is determined, this function is called to select the
+// transform types and obtain an accurate rdcost.
+static AOM_INLINE void refine_winner_mode_tx(
+ const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, THR_MODES *best_mode_index,
+ MB_MODE_INFO *best_mbmode, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE],
+ int best_rate_y, int best_rate_uv, int *best_skip2, int winner_mode_count) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ int64_t best_rd;
+ const int num_planes = av1_num_planes(cm);
+
+ if (!is_winner_mode_processing_enabled(cpi, x, best_mbmode,
+ rd_cost->skip_txfm))
+ return;
+
+ // Set params for winner mode evaluation
+ set_mode_eval_params(cpi, x, WINNER_MODE_EVAL);
+
+ // No best mode identified so far
+ if (*best_mode_index == THR_INVALID) return;
+
+ best_rd = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
+ for (int mode_idx = 0; mode_idx < winner_mode_count; mode_idx++) {
+ RD_STATS *winner_rd_stats = NULL;
+ int winner_rate_y = 0, winner_rate_uv = 0;
+ THR_MODES winner_mode_index = 0;
+
+ // TODO(any): Combine best mode and multi-winner mode processing paths
+ // Get winner mode stats for current mode index
+ MB_MODE_INFO *winner_mbmi = get_winner_mode_stats(
+ x, best_mbmode, rd_cost, best_rate_y, best_rate_uv, best_mode_index,
+ &winner_rd_stats, &winner_rate_y, &winner_rate_uv, &winner_mode_index,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, mode_idx);
+
+ if (xd->lossless[winner_mbmi->segment_id] == 0 &&
+ winner_mode_index != THR_INVALID &&
+ is_winner_mode_processing_enabled(cpi, x, winner_mbmi,
+ rd_cost->skip_txfm)) {
+ RD_STATS rd_stats = *winner_rd_stats;
+ int skip_blk = 0;
+ RD_STATS rd_stats_y, rd_stats_uv;
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+
+ *mbmi = *winner_mbmi;
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (has_second_ref(mbmi))
+ xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ if (is_inter_mode(mbmi->mode)) {
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ bool is_predictor_built = false;
+ const PREDICTION_MODE prediction_mode = mbmi->mode;
+ // Do interpolation filter search for realtime mode if applicable.
+ if (cpi->sf.winner_mode_sf.winner_mode_ifs &&
+ cpi->oxcf.mode == REALTIME &&
+ cm->current_frame.reference_mode == SINGLE_REFERENCE &&
+ is_inter_mode(prediction_mode) &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION &&
+ !is_inter_compound_mode(prediction_mode)) {
+ is_predictor_built =
+ fast_interp_search(cpi, x, mi_row, mi_col, bsize);
+ }
+ if (!is_predictor_built) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
+ }
+ if (mbmi->motion_mode == OBMC_CAUSAL)
+ av1_build_obmc_inter_predictors_sb(cm, xd);
+
+ av1_subtract_plane(x, bsize, 0);
+ if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ !xd->lossless[mbmi->segment_id]) {
+ av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
+ INT64_MAX);
+ assert(rd_stats_y.rate != INT_MAX);
+ } else {
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
+ INT64_MAX);
+ memset(mbmi->inter_tx_size, mbmi->tx_size,
+ sizeof(mbmi->inter_tx_size));
+ for (int i = 0; i < xd->height * xd->width; ++i)
+ set_blk_skip(txfm_info->blk_skip, 0, i, rd_stats_y.skip_txfm);
+ }
+ } else {
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize,
+ INT64_MAX);
+ }
+
+ if (num_planes > 1) {
+ av1_txfm_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ } else {
+ av1_init_rd_stats(&rd_stats_uv);
+ }
+
+ const ModeCosts *mode_costs = &x->mode_costs;
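+      // Re-evaluate the skip_txfm decision: compare coding the residual
+      // (non-skip rate + distortion) against signalling skip (skip rate +
+      // sse) and pick the cheaper option.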
+ if (is_inter_mode(mbmi->mode) &&
+ RDCOST(x->rdmult,
+ mode_costs->skip_txfm_cost[skip_ctx][0] + rd_stats_y.rate +
+ rd_stats_uv.rate,
+ (rd_stats_y.dist + rd_stats_uv.dist)) >
+ RDCOST(x->rdmult, mode_costs->skip_txfm_cost[skip_ctx][1],
+ (rd_stats_y.sse + rd_stats_uv.sse))) {
+ skip_blk = 1;
+ rd_stats_y.rate = mode_costs->skip_txfm_cost[skip_ctx][1];
+ rd_stats_uv.rate = 0;
+ rd_stats_y.dist = rd_stats_y.sse;
+ rd_stats_uv.dist = rd_stats_uv.sse;
+ } else {
+ skip_blk = 0;
+ rd_stats_y.rate += mode_costs->skip_txfm_cost[skip_ctx][0];
+ }
+ int this_rate = rd_stats.rate + rd_stats_y.rate + rd_stats_uv.rate -
+ winner_rate_y - winner_rate_uv;
+ int64_t this_rd =
+ RDCOST(x->rdmult, this_rate, (rd_stats_y.dist + rd_stats_uv.dist));
+ if (best_rd > this_rd) {
+ *best_mbmode = *mbmi;
+ *best_mode_index = winner_mode_index;
+ av1_copy_array(ctx->blk_skip, txfm_info->blk_skip, ctx->num_4x4_blk);
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ rd_cost->rate = this_rate;
+ rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
+ rd_cost->sse = rd_stats_y.sse + rd_stats_uv.sse;
+ rd_cost->rdcost = this_rd;
+ best_rd = this_rd;
+ *best_skip2 = skip_blk;
+ }
+ }
+ }
+}
+
+/*!\cond */
+typedef struct {
+ // Mask for each reference frame, specifying which prediction modes to NOT try
+ // during search.
+ uint32_t pred_modes[REF_FRAMES];
+ // If ref_combo[i][j + 1] is true, do NOT try prediction using combination of
+ // reference frames (i, j).
+ // Note: indexing with 'j + 1' is due to the fact that 2nd reference can be -1
+ // (NONE_FRAME).
+ bool ref_combo[REF_FRAMES][REF_FRAMES + 1];
+} mode_skip_mask_t;
+/*!\endcond */
+
+// Update 'ref_combo' mask to disable given 'ref' in single and compound modes.
+static AOM_INLINE void disable_reference(
+ MV_REFERENCE_FRAME ref, bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) {
+ for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) {
+ ref_combo[ref][ref2 + 1] = true;
+ }
+}
+
+// Update 'ref_combo' mask to disable all inter references except ALTREF.
+static AOM_INLINE void disable_inter_references_except_altref(
+ bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) {
+ disable_reference(LAST_FRAME, ref_combo);
+ disable_reference(LAST2_FRAME, ref_combo);
+ disable_reference(LAST3_FRAME, ref_combo);
+ disable_reference(GOLDEN_FRAME, ref_combo);
+ disable_reference(BWDREF_FRAME, ref_combo);
+ disable_reference(ALTREF2_FRAME, ref_combo);
+}
+
+static const MV_REFERENCE_FRAME reduced_ref_combos[][2] = {
+ { LAST_FRAME, NONE_FRAME }, { ALTREF_FRAME, NONE_FRAME },
+ { LAST_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, NONE_FRAME },
+ { INTRA_FRAME, NONE_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME },
+ { LAST_FRAME, GOLDEN_FRAME }, { LAST_FRAME, INTRA_FRAME },
+ { LAST_FRAME, BWDREF_FRAME }, { LAST_FRAME, LAST3_FRAME },
+ { GOLDEN_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, INTRA_FRAME },
+ { BWDREF_FRAME, NONE_FRAME }, { BWDREF_FRAME, ALTREF_FRAME },
+ { ALTREF_FRAME, INTRA_FRAME }, { BWDREF_FRAME, INTRA_FRAME },
+};
+
+typedef enum { REF_SET_FULL, REF_SET_REDUCED, REF_SET_REALTIME } REF_SET;
+
+static AOM_INLINE void default_skip_mask(mode_skip_mask_t *mask,
+ REF_SET ref_set) {
+ if (ref_set == REF_SET_FULL) {
+ // Everything available by default.
+ memset(mask, 0, sizeof(*mask));
+ } else {
+ // All modes available by default.
+ memset(mask->pred_modes, 0, sizeof(mask->pred_modes));
+ // All references disabled first.
+ for (MV_REFERENCE_FRAME ref1 = INTRA_FRAME; ref1 < REF_FRAMES; ++ref1) {
+ for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) {
+ mask->ref_combo[ref1][ref2 + 1] = true;
+ }
+ }
+ const MV_REFERENCE_FRAME(*ref_set_combos)[2];
+ int num_ref_combos;
+
+ // Then enable reduced set of references explicitly.
+ switch (ref_set) {
+ case REF_SET_REDUCED:
+ ref_set_combos = reduced_ref_combos;
+ num_ref_combos =
+ (int)sizeof(reduced_ref_combos) / sizeof(reduced_ref_combos[0]);
+ break;
+ case REF_SET_REALTIME:
+ ref_set_combos = real_time_ref_combos;
+ num_ref_combos =
+ (int)sizeof(real_time_ref_combos) / sizeof(real_time_ref_combos[0]);
+ break;
+ default: assert(0); num_ref_combos = 0;
+ }
+
+ for (int i = 0; i < num_ref_combos; ++i) {
+ const MV_REFERENCE_FRAME *const this_combo = ref_set_combos[i];
+ mask->ref_combo[this_combo[0]][this_combo[1] + 1] = false;
+ }
+ }
+}
+
+static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ unsigned char segment_id = mbmi->segment_id;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const INTER_MODE_SPEED_FEATURES *const inter_sf = &sf->inter_sf;
+ REF_SET ref_set = REF_SET_FULL;
+
+ if (sf->rt_sf.use_real_time_ref_set)
+ ref_set = REF_SET_REALTIME;
+ else if (cpi->oxcf.ref_frm_cfg.enable_reduced_reference_set)
+ ref_set = REF_SET_REDUCED;
+
+ default_skip_mask(mask, ref_set);
+
+ int min_pred_mv_sad = INT_MAX;
+ MV_REFERENCE_FRAME ref_frame;
+ if (ref_set == REF_SET_REALTIME) {
+ // For real-time encoding, we only look at a subset of ref frames. So the
+ // threshold for pruning should be computed from this subset as well.
+ const int num_rt_refs =
+ sizeof(real_time_ref_combos) / sizeof(*real_time_ref_combos);
+ for (int r_idx = 0; r_idx < num_rt_refs; r_idx++) {
+ const MV_REFERENCE_FRAME ref = real_time_ref_combos[r_idx][0];
+ if (ref != INTRA_FRAME) {
+ min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref]);
+ }
+ }
+ } else {
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+ min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]);
+ }
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame])) {
+ // Skip checking missing reference in both single and compound reference
+ // modes.
+ disable_reference(ref_frame, mask->ref_combo);
+ } else {
+ // Skip fixed mv modes for poor references
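+      // A reference is considered poor when its pred_mv_sad is more than 4x
+      // the best reference's SAD ((sad >> 2) > min_pred_mv_sad).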
+ if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) {
+ mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
+ }
+ }
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+ // Reference not used for the segment.
+ disable_reference(ref_frame, mask->ref_combo);
+ }
+ }
+ // Note: We use the following drop-out only if the SEG_LVL_REF_FRAME feature
+ // is disabled for this segment. This is to prevent the possibility that we
+ // end up unable to pick any mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+ // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame,
+ // unless ARNR filtering is enabled in which case we want
+ // an unfiltered alternative. We allow near/nearest as well
+ // because they may result in zero-zero MVs but be cheaper.
+ if (cpi->rc.is_src_frame_alt_ref &&
+ (cpi->oxcf.algo_cfg.arnr_max_frames == 0)) {
+ disable_inter_references_except_altref(mask->ref_combo);
+
+ mask->pred_modes[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
+ const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME };
+ int_mv near_mv, nearest_mv, global_mv;
+ get_this_mv(&nearest_mv, NEARESTMV, 0, 0, 0, tmp_ref_frames,
+ &x->mbmi_ext);
+ get_this_mv(&near_mv, NEARMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext);
+ get_this_mv(&global_mv, GLOBALMV, 0, 0, 0, tmp_ref_frames, &x->mbmi_ext);
+
+ if (near_mv.as_int != global_mv.as_int)
+ mask->pred_modes[ALTREF_FRAME] |= (1 << NEARMV);
+ if (nearest_mv.as_int != global_mv.as_int)
+ mask->pred_modes[ALTREF_FRAME] |= (1 << NEARESTMV);
+ }
+ }
+
+ if (cpi->rc.is_src_frame_alt_ref) {
+ if (inter_sf->alt_ref_search_fp &&
+ (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME])) {
+ mask->pred_modes[ALTREF_FRAME] = 0;
+ disable_inter_references_except_altref(mask->ref_combo);
+ disable_reference(INTRA_FRAME, mask->ref_combo);
+ }
+ }
+
+ if (inter_sf->alt_ref_search_fp) {
+ if (!cm->show_frame && x->best_pred_mv_sad[0] < INT_MAX) {
+ int sad_thresh = x->best_pred_mv_sad[0] + (x->best_pred_mv_sad[0] >> 3);
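+      // sad_thresh is 1.125x the best pred_mv_sad among past references.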
+ // Conservatively skip the modes w.r.t. BWDREF, ALTREF2 and ALTREF, if
+ // those are past frames
+ MV_REFERENCE_FRAME start_frame =
+ inter_sf->alt_ref_search_fp == 1 ? ALTREF2_FRAME : BWDREF_FRAME;
+ for (ref_frame = start_frame; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] <
+ 0) {
+          // Prune the inter modes only when the relative dist of ALTREF2 and
+          // ALTREF is comparable to (within 1.5x of) the relative dist of
+          // LAST_FRAME.
+ if (inter_sf->alt_ref_search_fp == 1 &&
+ (abs(cpi->ref_frame_dist_info
+ .ref_relative_dist[ref_frame - LAST_FRAME]) >
+ 1.5 * abs(cpi->ref_frame_dist_info
+ .ref_relative_dist[LAST_FRAME - LAST_FRAME]))) {
+ continue;
+ }
+ if (x->pred_mv_sad[ref_frame] > sad_thresh)
+ mask->pred_modes[ref_frame] |= INTER_ALL;
+ }
+ }
+ }
+ }
+
+ if (sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad) {
+ if (x->best_pred_mv_sad[0] < INT_MAX) {
+ int sad_thresh = x->best_pred_mv_sad[0] + (x->best_pred_mv_sad[0] >> 1);
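+      // sad_thresh is 1.5x the best pred_mv_sad among past references.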
+ const int prune_ref_list[2] = { GOLDEN_FRAME, ALTREF_FRAME };
+
+ // Conservatively skip the modes w.r.t. GOLDEN and ALTREF references
+ for (int ref_idx = 0; ref_idx < 2; ref_idx++) {
+ ref_frame = prune_ref_list[ref_idx];
+ if (x->pred_mv_sad[ref_frame] > sad_thresh)
+ mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
+ }
+ }
+ }
+
+ if (bsize > sf->part_sf.max_intra_bsize) {
+ disable_reference(INTRA_FRAME, mask->ref_combo);
+ }
+
+ if (!cpi->oxcf.tool_cfg.enable_global_motion) {
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ mask->pred_modes[ref_frame] |= (1 << GLOBALMV);
+ mask->pred_modes[ref_frame] |= (1 << GLOBAL_GLOBALMV);
+ }
+ }
+
+ mask->pred_modes[INTRA_FRAME] |=
+ ~(uint32_t)sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
+
+  // Prune reference frames which are not the closest to the current frame and
+  // which have a large pred_mv_sad.
+ if (inter_sf->prune_single_ref) {
+ assert(inter_sf->prune_single_ref > 0 && inter_sf->prune_single_ref < 3);
+ const double prune_threshes[2] = { 1.20, 1.05 };
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const RefFrameDistanceInfo *const ref_frame_dist_info =
+ &cpi->ref_frame_dist_info;
+ const int is_closest_ref =
+ (ref_frame == ref_frame_dist_info->nearest_past_ref) ||
+ (ref_frame == ref_frame_dist_info->nearest_future_ref);
+
+ if (!is_closest_ref) {
+ const int dir =
+ (ref_frame_dist_info->ref_relative_dist[ref_frame - LAST_FRAME] < 0)
+ ? 0
+ : 1;
+ if (x->best_pred_mv_sad[dir] < INT_MAX &&
+ x->pred_mv_sad[ref_frame] >
+ prune_threshes[inter_sf->prune_single_ref - 1] *
+ x->best_pred_mv_sad[dir])
+ mask->pred_modes[ref_frame] |= INTER_SINGLE_ALL;
+ }
+ }
+ }
+}
+
+static AOM_INLINE void init_neighbor_pred_buf(
+ const OBMCBuffer *const obmc_buffer, HandleInterModeArgs *const args,
+ int is_hbd) {
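+  // The above/left OBMC prediction buffers pack plane 0 (Y) at offset 0,
+  // plane 1 (U) at MAX_SB_SQUARE / 2 and plane 2 (V) at MAX_SB_SQUARE
+  // samples; for high bit-depth the offsets are scaled to bytes
+  // (sizeof(uint16_t)) before conversion to byte pointers.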
+ if (is_hbd) {
+ const int len = sizeof(uint16_t);
+ args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred);
+ args->above_pred_buf[1] = CONVERT_TO_BYTEPTR(obmc_buffer->above_pred +
+ (MAX_SB_SQUARE >> 1) * len);
+ args->above_pred_buf[2] =
+ CONVERT_TO_BYTEPTR(obmc_buffer->above_pred + MAX_SB_SQUARE * len);
+ args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(obmc_buffer->left_pred);
+ args->left_pred_buf[1] =
+ CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1) * len);
+ args->left_pred_buf[2] =
+ CONVERT_TO_BYTEPTR(obmc_buffer->left_pred + MAX_SB_SQUARE * len);
+ } else {
+ args->above_pred_buf[0] = obmc_buffer->above_pred;
+ args->above_pred_buf[1] = obmc_buffer->above_pred + (MAX_SB_SQUARE >> 1);
+ args->above_pred_buf[2] = obmc_buffer->above_pred + MAX_SB_SQUARE;
+ args->left_pred_buf[0] = obmc_buffer->left_pred;
+ args->left_pred_buf[1] = obmc_buffer->left_pred + (MAX_SB_SQUARE >> 1);
+ args->left_pred_buf[2] = obmc_buffer->left_pred + MAX_SB_SQUARE;
+ }
+}
+
+static AOM_INLINE int prune_ref_frame(const AV1_COMP *cpi, const MACROBLOCK *x,
+ MV_REFERENCE_FRAME ref_frame) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MV_REFERENCE_FRAME rf[2];
+ av1_set_ref_frame(rf, ref_frame);
+
+ if ((cpi->prune_ref_frame_mask >> ref_frame) & 1) return 1;
+
+ if (prune_ref_by_selective_ref_frame(cpi, x, rf,
+ cm->cur_frame->ref_display_order_hint)) {
+ return 1;
+ }
+
+ return 0;
+}
+
+static AOM_INLINE int is_ref_frame_used_by_compound_ref(
+ int ref_frame, int skip_ref_frame_mask) {
+ for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+ if (!(skip_ref_frame_mask & (1 << r))) {
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+ if (rf[0] == ref_frame || rf[1] == ref_frame) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+static AOM_INLINE int is_ref_frame_used_in_cache(MV_REFERENCE_FRAME ref_frame,
+ const MB_MODE_INFO *mi_cache) {
+ if (!mi_cache) {
+ return 0;
+ }
+
+ if (ref_frame < REF_FRAMES) {
+ return (ref_frame == mi_cache->ref_frame[0] ||
+ ref_frame == mi_cache->ref_frame[1]);
+ }
+
+ // if we are here, then the current mode is compound.
+ MV_REFERENCE_FRAME cached_ref_type = av1_ref_frame_type(mi_cache->ref_frame);
+ return ref_frame == cached_ref_type;
+}
+
+// Please add/modify parameter settings in this function so that it stays
+// consistent and easy to read and maintain.
+static AOM_INLINE void set_params_rd_pick_inter_mode(
+ const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args,
+ BLOCK_SIZE bsize, mode_skip_mask_t *mode_skip_mask, int skip_ref_frame_mask,
+ unsigned int *ref_costs_single, unsigned int (*ref_costs_comp)[REF_FRAMES],
+ struct buf_2d (*yv12_mb)[MAX_MB_PLANE]) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+ unsigned char segment_id = mbmi->segment_id;
+
+ init_neighbor_pred_buf(&x->obmc_buffer, args, is_cur_buf_hbd(&x->e_mbd));
+ av1_collect_neighbors_ref_counts(xd);
+ estimate_ref_frame_costs(cm, xd, &x->mode_costs, segment_id, ref_costs_single,
+ ref_costs_comp);
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ x->best_pred_mv_sad[0] = INT_MAX;
+ x->best_pred_mv_sad[1] = INT_MAX;
+
+ for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME;
+ ++ref_frame) {
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+ mbmi_ext->mode_context[ref_frame] = 0;
+ mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+ // Skip the ref frame if the mask says skip and the ref is not used by
+ // compound ref.
+ if (skip_ref_frame_mask & (1 << ref_frame) &&
+ !is_ref_frame_used_by_compound_ref(ref_frame, skip_ref_frame_mask) &&
+ !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) {
+ continue;
+ }
+ assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL);
+ setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, yv12_mb);
+ }
+ if (cpi->sf.inter_sf.alt_ref_search_fp ||
+ cpi->sf.inter_sf.prune_single_ref ||
+ cpi->sf.rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad) {
+ // Store the best pred_mv_sad across all past frames
+ if (cpi->ref_frame_dist_info.ref_relative_dist[ref_frame - LAST_FRAME] <
+ 0)
+ x->best_pred_mv_sad[0] =
+ AOMMIN(x->best_pred_mv_sad[0], x->pred_mv_sad[ref_frame]);
+ else
+ // Store the best pred_mv_sad across all future frames
+ x->best_pred_mv_sad[1] =
+ AOMMIN(x->best_pred_mv_sad[1], x->pred_mv_sad[ref_frame]);
+ }
+ }
+
+ if (!cpi->sf.rt_sf.use_real_time_ref_set && is_comp_ref_allowed(bsize)) {
+ // No second reference on RT ref set, so no need to initialize
+ for (MV_REFERENCE_FRAME ref_frame = EXTREF_FRAME;
+ ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
+ mbmi_ext->mode_context[ref_frame] = 0;
+ mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES];
+ if (!((cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) &&
+ (cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]]))) {
+ continue;
+ }
+
+ if (skip_ref_frame_mask & (1 << ref_frame) &&
+ !is_ref_frame_used_in_cache(ref_frame, x->mb_mode_cache)) {
+ continue;
+ }
+      // Ref mv list population is not required when compound references are
+      // pruned.
+ if (prune_ref_frame(cpi, x, ref_frame)) continue;
+
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+ xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+ mbmi_ext->mode_context);
+ // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+ // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame);
+ }
+ }
+
+ av1_count_overlappable_neighbors(cm, xd);
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ int use_actual_frame_probs = 1;
+ int prune_obmc;
+#if CONFIG_FPMT_TEST
+ use_actual_frame_probs =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+ if (!use_actual_frame_probs) {
+ prune_obmc = cpi->ppi->temp_frame_probs.obmc_probs[update_type][bsize] <
+ cpi->sf.inter_sf.prune_obmc_prob_thresh;
+ }
+#endif
+ if (use_actual_frame_probs) {
+ prune_obmc = cpi->ppi->frame_probs.obmc_probs[update_type][bsize] <
+ cpi->sf.inter_sf.prune_obmc_prob_thresh;
+ }
+ if (cpi->oxcf.motion_mode_cfg.enable_obmc && !prune_obmc) {
+ if (check_num_overlappable_neighbors(mbmi) &&
+ is_motion_variation_allowed_bsize(bsize)) {
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+ MAX_SB_SIZE >> 1 };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+ MAX_SB_SIZE >> 1 };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ av1_build_prediction_by_above_preds(cm, xd, args->above_pred_buf,
+ dst_width1, dst_height1,
+ args->above_pred_stride);
+ av1_build_prediction_by_left_preds(cm, xd, args->left_pred_buf,
+ dst_width2, dst_height2,
+ args->left_pred_stride);
+ const int num_planes = av1_num_planes(cm);
+ av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row,
+ mi_col, 0, num_planes);
+ calc_target_weighted_pred(
+ cm, x, xd, args->above_pred_buf[0], args->above_pred_stride[0],
+ args->left_pred_buf[0], args->left_pred_stride[0]);
+ }
+ }
+
+ init_mode_skip_mask(mode_skip_mask, cpi, x, bsize);
+
+ // Set params for mode evaluation
+ set_mode_eval_params(cpi, x, MODE_EVAL);
+
+ x->comp_rd_stats_idx = 0;
+
+ for (int idx = 0; idx < REF_FRAMES; idx++) {
+ args->best_single_sse_in_refs[idx] = INT32_MAX;
+ }
+}
+
+static AOM_INLINE void init_single_inter_mode_search_state(
+ InterModeSearchState *search_state) {
+ for (int dir = 0; dir < 2; ++dir) {
+ for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) {
+ SingleInterModeState *state;
+
+ state = &search_state->single_state[dir][mode][ref_frame];
+ state->ref_frame = NONE_FRAME;
+ state->rd = INT64_MAX;
+
+ state = &search_state->single_state_modelled[dir][mode][ref_frame];
+ state->ref_frame = NONE_FRAME;
+ state->rd = INT64_MAX;
+
+ search_state->single_rd_order[dir][mode][ref_frame] = NONE_FRAME;
+ }
+ }
+ }
+
+ for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+ search_state->best_single_rd[ref_frame] = INT64_MAX;
+ search_state->best_single_mode[ref_frame] = PRED_MODE_INVALID;
+ }
+ av1_zero(search_state->single_state_cnt);
+ av1_zero(search_state->single_state_modelled_cnt);
+}
+
+static AOM_INLINE void init_inter_mode_search_state(
+ InterModeSearchState *search_state, const AV1_COMP *cpi,
+ const MACROBLOCK *x, BLOCK_SIZE bsize, int64_t best_rd_so_far) {
+ init_intra_mode_search_state(&search_state->intra_search_state);
+ av1_invalid_rd_stats(&search_state->best_y_rdcost);
+
+ search_state->best_rd = best_rd_so_far;
+ search_state->best_skip_rd[0] = INT64_MAX;
+ search_state->best_skip_rd[1] = INT64_MAX;
+
+ av1_zero(search_state->best_mbmode);
+
+ search_state->best_rate_y = INT_MAX;
+
+ search_state->best_rate_uv = INT_MAX;
+
+ search_state->best_mode_skippable = 0;
+
+ search_state->best_skip2 = 0;
+
+ search_state->best_mode_index = THR_INVALID;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const unsigned char segment_id = mbmi->segment_id;
+
+ search_state->num_available_refs = 0;
+ memset(search_state->dist_refs, -1, sizeof(search_state->dist_refs));
+ memset(search_state->dist_order_refs, -1,
+ sizeof(search_state->dist_order_refs));
+
+ for (int i = 0; i <= LAST_NEW_MV_INDEX; ++i)
+ search_state->mode_threshold[i] = 0;
+ const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
+ for (int i = LAST_NEW_MV_INDEX + 1; i < SINGLE_REF_MODE_END; ++i)
+ search_state->mode_threshold[i] =
+ ((int64_t)rd_threshes[i] * x->thresh_freq_fact[bsize][i]) >>
+ RD_THRESH_FAC_FRAC_BITS;
+
+ search_state->best_intra_rd = INT64_MAX;
+
+ search_state->best_pred_sse = UINT_MAX;
+
+ av1_zero(search_state->single_newmv);
+ av1_zero(search_state->single_newmv_rate);
+ av1_zero(search_state->single_newmv_valid);
+ for (int i = SINGLE_INTER_MODE_START; i < SINGLE_INTER_MODE_END; ++i) {
+ for (int j = 0; j < MAX_REF_MV_SEARCH; ++j) {
+ for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+ search_state->modelled_rd[i][j][ref_frame] = INT64_MAX;
+ search_state->simple_rd[i][j][ref_frame] = INT64_MAX;
+ }
+ }
+ }
+
+ for (int i = 0; i < REFERENCE_MODES; ++i) {
+ search_state->best_pred_rd[i] = INT64_MAX;
+ }
+
+ if (cpi->common.current_frame.reference_mode != SINGLE_REFERENCE) {
+ for (int i = SINGLE_REF_MODE_END; i < THR_INTER_MODE_END; ++i)
+ search_state->mode_threshold[i] =
+ ((int64_t)rd_threshes[i] * x->thresh_freq_fact[bsize][i]) >>
+ RD_THRESH_FAC_FRAC_BITS;
+
+ for (int i = COMP_INTER_MODE_START; i < COMP_INTER_MODE_END; ++i) {
+ for (int j = 0; j < MAX_REF_MV_SEARCH; ++j) {
+ for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+ search_state->modelled_rd[i][j][ref_frame] = INT64_MAX;
+ search_state->simple_rd[i][j][ref_frame] = INT64_MAX;
+ }
+ }
+ }
+
+ init_single_inter_mode_search_state(search_state);
+ }
+}
+
+static bool mask_says_skip(const mode_skip_mask_t *mode_skip_mask,
+ const MV_REFERENCE_FRAME *ref_frame,
+ const PREDICTION_MODE this_mode) {
+ if (mode_skip_mask->pred_modes[ref_frame[0]] & (1 << this_mode)) {
+ return true;
+ }
+
+ return mode_skip_mask->ref_combo[ref_frame[0]][ref_frame[1] + 1];
+}
+
+static int inter_mode_compatible_skip(const AV1_COMP *cpi, const MACROBLOCK *x,
+ BLOCK_SIZE bsize,
+ PREDICTION_MODE curr_mode,
+ const MV_REFERENCE_FRAME *ref_frames) {
+ const int comp_pred = ref_frames[1] > INTRA_FRAME;
+ if (comp_pred) {
+ if (!is_comp_ref_allowed(bsize)) return 1;
+ if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frames[1]])) {
+ return 1;
+ }
+
+ const AV1_COMMON *const cm = &cpi->common;
+ if (frame_is_intra_only(cm)) return 1;
+
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ if (current_frame->reference_mode == SINGLE_REFERENCE) return 1;
+
+ const struct segmentation *const seg = &cm->seg;
+ const unsigned char segment_id = x->e_mbd.mi[0]->segment_id;
+    // Do not allow compound prediction if the segment level reference frame
+    // feature is in use, as in this case there can only be one reference.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1;
+ }
+
+ if (ref_frames[0] > INTRA_FRAME && ref_frames[1] == INTRA_FRAME) {
+ // Mode must be compatible
+ if (!is_interintra_allowed_bsize(bsize)) return 1;
+ if (!is_interintra_allowed_mode(curr_mode)) return 1;
+ }
+
+ return 0;
+}
+
+static int fetch_picked_ref_frames_mask(const MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int mib_size) {
+ const int sb_size_mask = mib_size - 1;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const int mi_row_in_sb = mi_row & sb_size_mask;
+ const int mi_col_in_sb = mi_col & sb_size_mask;
+ const int mi_w = mi_size_wide[bsize];
+ const int mi_h = mi_size_high[bsize];
+ int picked_ref_frames_mask = 0;
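+  // picked_ref_frames_mask is stored as a 32x32 grid of mi units (32 being
+  // the superblock dimension MAX_MIB_SIZE); OR together the masks of every
+  // mi unit covered by this block.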
+ for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_h; ++i) {
+ for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_w; ++j) {
+ picked_ref_frames_mask |= x->picked_ref_frames_mask[i * 32 + j];
+ }
+ }
+ return picked_ref_frames_mask;
+}
+
+// Check if the reference frame pair of the current block matches that of the
+// given block.
+static INLINE int match_ref_frame_pair(const MB_MODE_INFO *mbmi,
+ const MV_REFERENCE_FRAME *ref_frames) {
+ return ((ref_frames[0] == mbmi->ref_frame[0]) &&
+ (ref_frames[1] == mbmi->ref_frame[1]));
+}
+
+// Return values:
+//   0: do not skip this mode
+//   1: skip this mode completely
+//   2: skip the motion mode search for this mode, but still try its simple
+//      translation mode
+static int inter_mode_search_order_independent_skip(
+ const AV1_COMP *cpi, const MACROBLOCK *x, mode_skip_mask_t *mode_skip_mask,
+ InterModeSearchState *search_state, int skip_ref_frame_mask,
+ PREDICTION_MODE mode, const MV_REFERENCE_FRAME *ref_frame) {
+ if (mask_says_skip(mode_skip_mask, ref_frame, mode)) {
+ return 1;
+ }
+
+ const int ref_type = av1_ref_frame_type(ref_frame);
+ if (!cpi->sf.rt_sf.use_real_time_ref_set)
+ if (prune_ref_frame(cpi, x, ref_type)) return 1;
+
+  // This is only used in the motion vector unit test.
+ if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test &&
+ ref_frame[0] == INTRA_FRAME)
+ return 1;
+
+ const AV1_COMMON *const cm = &cpi->common;
+ if (skip_repeated_mv(cm, x, mode, ref_frame, search_state)) {
+ return 1;
+ }
+
+ // Reuse the prediction mode in cache
+ if (x->use_mb_mode_cache) {
+ const MB_MODE_INFO *cached_mi = x->mb_mode_cache;
+ const PREDICTION_MODE cached_mode = cached_mi->mode;
+ const MV_REFERENCE_FRAME *cached_frame = cached_mi->ref_frame;
+ const int cached_mode_is_single = cached_frame[1] <= INTRA_FRAME;
+
+ // If the cached mode is intra, then we just need to match the mode.
+ if (is_mode_intra(cached_mode) && mode != cached_mode) {
+ return 1;
+ }
+
+ // If the cached mode is single inter mode, then we match the mode and
+ // reference frame.
+ if (cached_mode_is_single) {
+ if (mode != cached_mode || ref_frame[0] != cached_frame[0]) {
+ return 1;
+ }
+ } else {
+ // If the cached mode is compound, then we need to consider several cases.
+ const int mode_is_single = ref_frame[1] <= INTRA_FRAME;
+ if (mode_is_single) {
+ // If the mode is single, we know the modes can't match. But we might
+ // still want to search it if compound mode depends on the current mode.
+ int skip_motion_mode_only = 0;
+ if (cached_mode == NEW_NEARMV || cached_mode == NEW_NEARESTMV) {
+ skip_motion_mode_only = (ref_frame[0] == cached_frame[0]);
+ } else if (cached_mode == NEAR_NEWMV || cached_mode == NEAREST_NEWMV) {
+ skip_motion_mode_only = (ref_frame[0] == cached_frame[1]);
+ } else if (cached_mode == NEW_NEWMV) {
+ skip_motion_mode_only = (ref_frame[0] == cached_frame[0] ||
+ ref_frame[0] == cached_frame[1]);
+ }
+
+ return 1 + skip_motion_mode_only;
+ } else {
+ // If both modes are compound, then everything must match.
+ if (mode != cached_mode || ref_frame[0] != cached_frame[0] ||
+ ref_frame[1] != cached_frame[1]) {
+ return 1;
+ }
+ }
+ }
+ }
+
+ const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+  // If no valid mode has been found so far in PARTITION_NONE, and finding a
+  // valid partition is required, do not skip any mode.
+ if (search_state->best_rd == INT64_MAX && mbmi->partition == PARTITION_NONE &&
+ x->must_find_valid_partition)
+ return 0;
+
+ const SPEED_FEATURES *const sf = &cpi->sf;
+  // Prune NEARMV and NEAR_NEARMV based on q index and the reference frames of
+  // the left and above neighbor blocks.
+ if (sf->inter_sf.prune_nearmv_using_neighbors &&
+ (mode == NEAR_NEARMV || mode == NEARMV)) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ if (search_state->best_rd != INT64_MAX && xd->left_available &&
+ xd->up_available) {
+ const int thresholds[PRUNE_NEARMV_MAX][3] = { { 1, 0, 0 },
+ { 1, 1, 0 },
+ { 2, 1, 0 } };
+ const int qindex_sub_range = x->qindex * 3 / QINDEX_RANGE;
+
+ assert(sf->inter_sf.prune_nearmv_using_neighbors <= PRUNE_NEARMV_MAX &&
+ qindex_sub_range < 3);
+ const int num_ref_frame_pair_match_thresh =
+ thresholds[sf->inter_sf.prune_nearmv_using_neighbors - 1]
+ [qindex_sub_range];
+
+ assert(num_ref_frame_pair_match_thresh <= 2 &&
+ num_ref_frame_pair_match_thresh >= 0);
+ int num_ref_frame_pair_match = 0;
+
+ num_ref_frame_pair_match = match_ref_frame_pair(xd->left_mbmi, ref_frame);
+ num_ref_frame_pair_match +=
+ match_ref_frame_pair(xd->above_mbmi, ref_frame);
+
+ // Pruning based on ref frame pair match with neighbors.
+ if (num_ref_frame_pair_match < num_ref_frame_pair_match_thresh) return 1;
+ }
+ }
+
+ int skip_motion_mode = 0;
+ if (mbmi->partition != PARTITION_NONE) {
+ int skip_ref = skip_ref_frame_mask & (1 << ref_type);
+ if (ref_type <= ALTREF_FRAME && skip_ref) {
+      // Since the compound ref modes depend on the motion estimation results
+      // of two single ref modes (the best mvs of the single ref modes are used
+      // as the start points), if the current single ref mode is marked to be
+      // skipped, we need to check whether it will be used by a compound ref
+      // mode.
+ if (is_ref_frame_used_by_compound_ref(ref_type, skip_ref_frame_mask)) {
+        // Found a non-skipped compound ref mode which uses the current single
+        // ref, so this single ref cannot be skipped completely. Skip only its
+        // motion mode search and still try its simple translation mode.
+ skip_motion_mode = 1;
+ skip_ref = 0;
+ }
+ }
+ // If we are reusing the prediction from cache, and the current frame is
+ // required by the cache, then we cannot prune it.
+ if (is_ref_frame_used_in_cache(ref_type, x->mb_mode_cache)) {
+ skip_ref = 0;
+ // If the cache only needs the current reference type for compound
+ // prediction, then we can skip motion mode search.
+ skip_motion_mode = (ref_type <= ALTREF_FRAME &&
+ x->mb_mode_cache->ref_frame[1] > INTRA_FRAME);
+ }
+ if (skip_ref) return 1;
+ }
+
+ if (ref_frame[0] == INTRA_FRAME) {
+ if (mode != DC_PRED) {
+ // Disable intra modes other than DC_PRED for blocks with low variance
+ // Threshold for intra skipping based on source variance
+ // TODO(debargha): Specialize the threshold for super block sizes
+ const unsigned int skip_intra_var_thresh = 64;
+ if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
+ x->source_variance < skip_intra_var_thresh)
+ return 1;
+ }
+ }
+
+ if (skip_motion_mode) return 2;
+
+ return 0;
+}
+
+static INLINE void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE curr_mode,
+ const MV_REFERENCE_FRAME *ref_frames,
+ const AV1_COMMON *cm) {
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ mbmi->ref_mv_idx = 0;
+ mbmi->mode = curr_mode;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->ref_frame[0] = ref_frames[0];
+ mbmi->ref_frame[1] = ref_frames[1];
+ pmi->palette_size[0] = 0;
+ pmi->palette_size[1] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+ set_default_interp_filters(mbmi, cm->features.interp_filter);
+}
+
+static AOM_INLINE void collect_single_states(MACROBLOCK *x,
+ InterModeSearchState *search_state,
+ const MB_MODE_INFO *const mbmi) {
+ int i, j;
+ const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ const int dir = ref_frame <= GOLDEN_FRAME ? 0 : 1;
+ const int mode_offset = INTER_OFFSET(this_mode);
+ const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
+
+ // Simple rd
+ int64_t simple_rd = search_state->simple_rd[this_mode][0][ref_frame];
+ for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ const int64_t rd =
+ search_state->simple_rd[this_mode][ref_mv_idx][ref_frame];
+ if (rd < simple_rd) simple_rd = rd;
+ }
+
+ // Insertion sort of single_state
+ const SingleInterModeState this_state_s = { simple_rd, ref_frame, 1 };
+ SingleInterModeState *state_s = search_state->single_state[dir][mode_offset];
+ i = search_state->single_state_cnt[dir][mode_offset];
+ for (j = i; j > 0 && state_s[j - 1].rd > this_state_s.rd; --j)
+ state_s[j] = state_s[j - 1];
+ state_s[j] = this_state_s;
+ search_state->single_state_cnt[dir][mode_offset]++;
+
+ // Modelled rd
+ int64_t modelled_rd = search_state->modelled_rd[this_mode][0][ref_frame];
+ for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ const int64_t rd =
+ search_state->modelled_rd[this_mode][ref_mv_idx][ref_frame];
+ if (rd < modelled_rd) modelled_rd = rd;
+ }
+
+ // Insertion sort of single_state_modelled
+ const SingleInterModeState this_state_m = { modelled_rd, ref_frame, 1 };
+ SingleInterModeState *state_m =
+ search_state->single_state_modelled[dir][mode_offset];
+ i = search_state->single_state_modelled_cnt[dir][mode_offset];
+ for (j = i; j > 0 && state_m[j - 1].rd > this_state_m.rd; --j)
+ state_m[j] = state_m[j - 1];
+ state_m[j] = this_state_m;
+ search_state->single_state_modelled_cnt[dir][mode_offset]++;
+}
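+
+// A minimal illustrative sketch, not upstream code: the hypothetical helper
+// below isolates the ascending insertion performed twice in
+// collect_single_states(), operating on plain rd values for clarity.
+static INLINE void example_sorted_insert(int64_t *rds, int *cnt,
+                                         int64_t new_rd) {
+  int j;
+  // Shift larger rds up by one slot, then place the new value.
+  for (j = *cnt; j > 0 && rds[j - 1] > new_rd; --j) rds[j] = rds[j - 1];
+  rds[j] = new_rd;
+  (*cnt)++;
+}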
+
+static AOM_INLINE void analyze_single_states(
+ const AV1_COMP *cpi, InterModeSearchState *search_state) {
+ const int prune_level = cpi->sf.inter_sf.prune_comp_search_by_single_result;
+ assert(prune_level >= 1);
+ int i, j, dir, mode;
+
+ for (dir = 0; dir < 2; ++dir) {
+ int64_t best_rd;
+ SingleInterModeState(*state)[FWD_REFS];
+ const int prune_factor = prune_level >= 2 ? 6 : 5;
+
+ // Use the best rd of GLOBALMV or NEWMV to prune the unlikely
+ // reference frames for all the modes (NEARESTMV and NEARMV may not
+ // have the same motion vectors). Always keep the best of each mode
+ // because it might form the best possible combination with another mode.
+ state = search_state->single_state[dir];
+ best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+ state[INTER_OFFSET(GLOBALMV)][0].rd);
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) {
+ if (state[mode][i].rd != INT64_MAX &&
+ (state[mode][i].rd >> 3) * prune_factor > best_rd) {
+ state[mode][i].valid = 0;
+ }
+ }
+ }
+
+ state = search_state->single_state_modelled[dir];
+ best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+ state[INTER_OFFSET(GLOBALMV)][0].rd);
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode]; ++i) {
+ if (state[mode][i].rd != INT64_MAX &&
+ (state[mode][i].rd >> 3) * prune_factor > best_rd) {
+ state[mode][i].valid = 0;
+ }
+ }
+ }
+ }
+
+ // Ordering by simple rd first, then by modelled rd
+ for (dir = 0; dir < 2; ++dir) {
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ const int state_cnt_s = search_state->single_state_cnt[dir][mode];
+ const int state_cnt_m =
+ search_state->single_state_modelled_cnt[dir][mode];
+ SingleInterModeState *state_s = search_state->single_state[dir][mode];
+ SingleInterModeState *state_m =
+ search_state->single_state_modelled[dir][mode];
+ int count = 0;
+ const int max_candidates = AOMMAX(state_cnt_s, state_cnt_m);
+ for (i = 0; i < state_cnt_s; ++i) {
+ if (state_s[i].rd == INT64_MAX) break;
+ if (state_s[i].valid) {
+ search_state->single_rd_order[dir][mode][count++] =
+ state_s[i].ref_frame;
+ }
+ }
+ if (count >= max_candidates) continue;
+
+ for (i = 0; i < state_cnt_m && count < max_candidates; ++i) {
+ if (state_m[i].rd == INT64_MAX) break;
+ if (!state_m[i].valid) continue;
+ const int ref_frame = state_m[i].ref_frame;
+ int match = 0;
+ // Check if this ref_frame already exists in the order list.
+ for (j = 0; j < count; ++j) {
+ if (search_state->single_rd_order[dir][mode][j] == ref_frame) {
+ match = 1;
+ break;
+ }
+ }
+ if (match) continue;
+ // Check if this ref_frame was pruned in the simple-rd list.
+ int valid = 1;
+ for (j = 0; j < state_cnt_s; ++j) {
+ if (ref_frame == state_s[j].ref_frame) {
+ valid = state_s[j].valid;
+ break;
+ }
+ }
+ if (valid) {
+ search_state->single_rd_order[dir][mode][count++] = ref_frame;
+ }
+ }
+ }
+ }
+}
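+
+// A minimal illustrative sketch, not upstream code. The pruning test in
+// analyze_single_states(), (rd >> 3) * prune_factor > best_rd, is roughly
+// rd > best_rd * 8 / prune_factor: prune_factor == 5 keeps states within
+// about 1.6x of best_rd, and prune_factor == 6 within about 1.33x. The
+// helper below is hypothetical.
+static INLINE int example_single_state_pruned(int64_t rd, int64_t best_rd,
+                                              int prune_factor) {
+  return rd != INT64_MAX && (rd >> 3) * prune_factor > best_rd;
+}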
+
+static int compound_skip_get_candidates(
+ const AV1_COMP *cpi, const InterModeSearchState *search_state,
+ const int dir, const PREDICTION_MODE mode) {
+ const int mode_offset = INTER_OFFSET(mode);
+ const SingleInterModeState *state =
+ search_state->single_state[dir][mode_offset];
+ const SingleInterModeState *state_modelled =
+ search_state->single_state_modelled[dir][mode_offset];
+
+ int max_candidates = 0;
+ for (int i = 0; i < FWD_REFS; ++i) {
+ if (search_state->single_rd_order[dir][mode_offset][i] == NONE_FRAME) break;
+ max_candidates++;
+ }
+
+ int candidates = max_candidates;
+ if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 2) {
+ candidates = AOMMIN(2, max_candidates);
+ }
+ if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 3) {
+ if (state[0].rd != INT64_MAX && state_modelled[0].rd != INT64_MAX &&
+ state[0].ref_frame == state_modelled[0].ref_frame)
+ candidates = 1;
+ if (mode == NEARMV || mode == GLOBALMV) candidates = 1;
+ }
+
+ if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 4) {
+ // Limit the number of candidates to 1 in each direction for compound
+ // prediction
+ candidates = AOMMIN(1, candidates);
+ }
+ return candidates;
+}
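+
+// Summarizing the logic above: level 1 keeps every ordered candidate;
+// level 2 caps the list at two; level 3 additionally drops to one candidate
+// when the simple-rd and modelled-rd winners agree, and always for
+// NEARMV/GLOBALMV; level 4 allows at most one candidate per direction.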
+
+static int compound_skip_by_single_states(
+ const AV1_COMP *cpi, const InterModeSearchState *search_state,
+ const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame,
+ const MV_REFERENCE_FRAME second_ref_frame, const MACROBLOCK *x) {
+ const MV_REFERENCE_FRAME refs[2] = { ref_frame, second_ref_frame };
+ const int mode[2] = { compound_ref0_mode(this_mode),
+ compound_ref1_mode(this_mode) };
+ const int mode_offset[2] = { INTER_OFFSET(mode[0]), INTER_OFFSET(mode[1]) };
+ const int mode_dir[2] = { refs[0] <= GOLDEN_FRAME ? 0 : 1,
+ refs[1] <= GOLDEN_FRAME ? 0 : 1 };
+ int ref_searched[2] = { 0, 0 };
+ int ref_mv_match[2] = { 1, 1 };
+ int i, j;
+
+ for (i = 0; i < 2; ++i) {
+ const SingleInterModeState *state =
+ search_state->single_state[mode_dir[i]][mode_offset[i]];
+ const int state_cnt =
+ search_state->single_state_cnt[mode_dir[i]][mode_offset[i]];
+ for (j = 0; j < state_cnt; ++j) {
+ if (state[j].ref_frame == refs[i]) {
+ ref_searched[i] = 1;
+ break;
+ }
+ }
+ }
+
+ const int ref_set = get_drl_refmv_count(x, refs, this_mode);
+ for (i = 0; i < 2; ++i) {
+ if (!ref_searched[i] || (mode[i] != NEARESTMV && mode[i] != NEARMV)) {
+ continue;
+ }
+ const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME };
+ for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) {
+ int_mv single_mv;
+ int_mv comp_mv;
+ get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, 0, single_refs,
+ &x->mbmi_ext);
+ get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, 0, refs, &x->mbmi_ext);
+ if (single_mv.as_int != comp_mv.as_int) {
+ ref_mv_match[i] = 0;
+ break;
+ }
+ }
+ }
+
+ for (i = 0; i < 2; ++i) {
+ if (!ref_searched[i] || !ref_mv_match[i]) continue;
+ const int candidates =
+ compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]);
+ const MV_REFERENCE_FRAME *ref_order =
+ search_state->single_rd_order[mode_dir[i]][mode_offset[i]];
+ int match = 0;
+ for (j = 0; j < candidates; ++j) {
+ if (refs[i] == ref_order[j]) {
+ match = 1;
+ break;
+ }
+ }
+ if (!match) return 1;
+ }
+
+ return 0;
+}
+
+// Check if the ref frames of the current block match those of the given block.
+static INLINE void match_ref_frame(const MB_MODE_INFO *const mbmi,
+ const MV_REFERENCE_FRAME *ref_frames,
+ int *const is_ref_match) {
+ if (is_inter_block(mbmi)) {
+ is_ref_match[0] |= ref_frames[0] == mbmi->ref_frame[0];
+ is_ref_match[1] |= ref_frames[1] == mbmi->ref_frame[0];
+ if (has_second_ref(mbmi)) {
+ is_ref_match[0] |= ref_frames[0] == mbmi->ref_frame[1];
+ is_ref_match[1] |= ref_frames[1] == mbmi->ref_frame[1];
+ }
+ }
+}
+
+// Prune compound mode using ref frames of neighbor blocks.
+static INLINE int compound_skip_using_neighbor_refs(
+ MACROBLOCKD *const xd, const PREDICTION_MODE this_mode,
+ const MV_REFERENCE_FRAME *ref_frames, int prune_ext_comp_using_neighbors) {
+ // Exclude non-extended compound modes from pruning
+ if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
+ this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV)
+ return 0;
+
+ if (prune_ext_comp_using_neighbors >= 3) return 1;
+
+ int is_ref_match[2] = { 0 }; // 0 - match for forward refs
+ // 1 - match for backward refs
+ // Check if the ref frames of this block match the left neighbor's.
+ if (xd->left_available)
+ match_ref_frame(xd->left_mbmi, ref_frames, is_ref_match);
+
+ // Check if the ref frames of this block match the above neighbor's.
+ if (xd->up_available)
+ match_ref_frame(xd->above_mbmi, ref_frames, is_ref_match);
+
+ // Combine ref frame match with neighbors in forward and backward refs.
+ const int track_ref_match = is_ref_match[0] + is_ref_match[1];
+
+ // Pruning based on ref frame match with neighbors.
+ if (track_ref_match >= prune_ext_comp_using_neighbors) return 0;
+ return 1;
+}
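+
+// A minimal illustrative sketch, not upstream code: the hypothetical helper
+// below restates the decision above with plain match flags in place of real
+// neighbor blocks. A return value of 1 means the compound mode is pruned.
+static INLINE int example_prune_by_neighbor_matches(int fwd_match,
+                                                    int bwd_match,
+                                                    int prune_level) {
+  // Level >= 3 rejects every extended compound mode unconditionally.
+  if (prune_level >= 3) return 1;
+  // Otherwise require at least prune_level matches across both directions.
+  return (fwd_match + bwd_match) < prune_level;
+}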
+
+// Update best single mode for the given reference frame based on simple rd.
+static INLINE void update_best_single_mode(InterModeSearchState *search_state,
+ const PREDICTION_MODE this_mode,
+ const MV_REFERENCE_FRAME ref_frame,
+ int64_t this_rd) {
+ if (this_rd < search_state->best_single_rd[ref_frame]) {
+ search_state->best_single_rd[ref_frame] = this_rd;
+ search_state->best_single_mode[ref_frame] = this_mode;
+ }
+}
+
+// Prune compound mode using best single mode for the same reference.
+static INLINE int skip_compound_using_best_single_mode_ref(
+ const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME *ref_frames,
+ const PREDICTION_MODE *best_single_mode,
+ int prune_comp_using_best_single_mode_ref) {
+ // Exclude non-extended compound modes from pruning
+ if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
+ this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV)
+ return 0;
+
+ assert(this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV);
+ const PREDICTION_MODE comp_mode_ref0 = compound_ref0_mode(this_mode);
+ // Get ref frame direction corresponding to NEWMV
+ // 0 - NEWMV corresponding to forward direction
+ // 1 - NEWMV corresponding to backward direction
+ const int newmv_dir = comp_mode_ref0 != NEWMV;
+
+ // Avoid pruning the compound mode when the ref frame corresponding to
+ // NEWMV has NEWMV as its single mode winner.
+ // Example: For an extended-compound mode,
+ // {mode, {fwd_frame, bwd_frame}} = {NEAR_NEWMV, {LAST_FRAME, ALTREF_FRAME}}
+ // - Ref frame corresponding to NEWMV is ALTREF_FRAME
+ // - Avoid pruning this mode, if best single mode corresponding to ref frame
+ // ALTREF_FRAME is NEWMV
+ const PREDICTION_MODE single_mode = best_single_mode[ref_frames[newmv_dir]];
+ if (single_mode == NEWMV) return 0;
+
+ // Avoid pruning the compound mode when best single mode is not available
+ if (prune_comp_using_best_single_mode_ref == 1)
+ if (single_mode == MB_MODE_COUNT) return 0;
+ return 1;
+}
+
+static int compare_int64(const void *a, const void *b) {
+ int64_t a64 = *((int64_t *)a);
+ int64_t b64 = *((int64_t *)b);
+ if (a64 < b64) {
+ return -1;
+ } else if (a64 == b64) {
+ return 0;
+ } else {
+ return 1;
+ }
+}
+
+static INLINE void update_search_state(
+ InterModeSearchState *search_state, RD_STATS *best_rd_stats_dst,
+ PICK_MODE_CONTEXT *ctx, const RD_STATS *new_best_rd_stats,
+ const RD_STATS *new_best_rd_stats_y, const RD_STATS *new_best_rd_stats_uv,
+ THR_MODES new_best_mode, const MACROBLOCK *x, int txfm_search_done) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int skip_txfm =
+ mbmi->skip_txfm && !is_mode_intra(av1_mode_defs[new_best_mode].mode);
+ const TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+
+ search_state->best_rd = new_best_rd_stats->rdcost;
+ search_state->best_mode_index = new_best_mode;
+ *best_rd_stats_dst = *new_best_rd_stats;
+ search_state->best_mbmode = *mbmi;
+ search_state->best_skip2 = skip_txfm;
+ search_state->best_mode_skippable = new_best_rd_stats->skip_txfm;
+ // When !txfm_search_done, new_best_rd_stats won't provide the correct
+ // rate_y and rate_uv because the av1_txfm_search process is replaced by
+ // rd estimation. Therefore, we should avoid updating best_rate_y and
+ // best_rate_uv here; these two values will be updated when
+ // av1_txfm_search is called.
+ if (txfm_search_done) {
+ search_state->best_rate_y =
+ new_best_rd_stats_y->rate +
+ x->mode_costs.skip_txfm_cost[skip_ctx]
+ [new_best_rd_stats->skip_txfm || skip_txfm];
+ search_state->best_rate_uv = new_best_rd_stats_uv->rate;
+ }
+ search_state->best_y_rdcost = *new_best_rd_stats_y;
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+}
+
+// Find the best RD for a reference frame (among single reference modes)
+// and store +10% of it in the 0-th element in ref_frame_rd.
+static AOM_INLINE void find_top_ref(int64_t ref_frame_rd[REF_FRAMES]) {
+ assert(ref_frame_rd[0] == INT64_MAX);
+ int64_t ref_copy[REF_FRAMES - 1];
+ memcpy(ref_copy, ref_frame_rd + 1,
+ sizeof(ref_frame_rd[0]) * (REF_FRAMES - 1));
+ qsort(ref_copy, REF_FRAMES - 1, sizeof(int64_t), compare_int64);
+
+ int64_t cutoff = ref_copy[0];
+ // The cut-off is within 10% of the best.
+ if (cutoff != INT64_MAX) {
+ assert(cutoff < INT64_MAX / 200);
+ cutoff = (110 * cutoff) / 100;
+ }
+ ref_frame_rd[0] = cutoff;
+}
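+
+// A minimal illustrative sketch, not upstream code, exercising find_top_ref()
+// on made-up rd values: the best single-ref rd is 1000, so the stored
+// cut-off, later consumed by in_single_ref_cutoff(), is
+// (110 * 1000) / 100 = 1100.
+static INLINE int64_t example_single_ref_cutoff(void) {
+  int64_t rd[REF_FRAMES] = { INT64_MAX, 1500, 1000,      INT64_MAX,
+                             2000,      1200, INT64_MAX, INT64_MAX };
+  find_top_ref(rd);
+  return rd[0];  // 1100
+}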
+
+// Check if either frame is within the cutoff.
+static INLINE bool in_single_ref_cutoff(int64_t ref_frame_rd[REF_FRAMES],
+ MV_REFERENCE_FRAME frame1,
+ MV_REFERENCE_FRAME frame2) {
+ assert(frame2 > 0);
+ return ref_frame_rd[frame1] <= ref_frame_rd[0] ||
+ ref_frame_rd[frame2] <= ref_frame_rd[0];
+}
+
+static AOM_INLINE void evaluate_motion_mode_for_winner_candidates(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, RD_STATS *const rd_cost,
+ HandleInterModeArgs *const args, TileDataEnc *const tile_data,
+ PICK_MODE_CONTEXT *const ctx,
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE],
+ const motion_mode_best_st_candidate *const best_motion_mode_cands,
+ int do_tx_search, const BLOCK_SIZE bsize, int64_t *const best_est_rd,
+ InterModeSearchState *const search_state, int64_t *yrd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ InterModesInfo *const inter_modes_info = x->inter_modes_info;
+ const int num_best_cand = best_motion_mode_cands->num_motion_mode_cand;
+
+ for (int cand = 0; cand < num_best_cand; cand++) {
+ RD_STATS rd_stats;
+ RD_STATS rd_stats_y;
+ RD_STATS rd_stats_uv;
+ av1_init_rd_stats(&rd_stats);
+ av1_init_rd_stats(&rd_stats_y);
+ av1_init_rd_stats(&rd_stats_uv);
+ int rate_mv;
+
+ rate_mv = best_motion_mode_cands->motion_mode_cand[cand].rate_mv;
+ args->skip_motion_mode =
+ best_motion_mode_cands->motion_mode_cand[cand].skip_motion_mode;
+ *mbmi = best_motion_mode_cands->motion_mode_cand[cand].mbmi;
+ rd_stats.rate =
+ best_motion_mode_cands->motion_mode_cand[cand].rate2_nocoeff;
+
+ // Skip compound candidates; only single-reference modes are evaluated here.
+ if (!is_inter_singleref_mode(mbmi->mode)) continue;
+
+ x->txfm_search_info.skip_txfm = 0;
+ struct macroblockd_plane *pd = xd->plane;
+ const BUFFER_SET orig_dst = {
+ { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf },
+ { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride },
+ };
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ // Initialize the motion mode to simple translation; the calculation of
+ // the switchable rate depends on it.
+ mbmi->motion_mode = 0;
+ const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ int64_t skip_rd[2] = { search_state->best_skip_rd[0],
+ search_state->best_skip_rd[1] };
+ int64_t this_yrd = INT64_MAX;
+ int64_t ret_value = motion_mode_rd(
+ cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, args,
+ search_state->best_rd, skip_rd, &rate_mv, &orig_dst, best_est_rd,
+ do_tx_search, inter_modes_info, 1, &this_yrd);
+
+ if (ret_value != INT64_MAX) {
+ rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+ const THR_MODES mode_enum = get_prediction_mode_idx(
+ mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ // Collect mode stats for multiwinner mode processing
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv,
+ mode_enum, NULL, bsize, rd_stats.rdcost,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, do_tx_search);
+ if (rd_stats.rdcost < search_state->best_rd) {
+ *yrd = this_yrd;
+ update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, mode_enum, x, do_tx_search);
+ if (do_tx_search) search_state->best_skip_rd[0] = skip_rd[0];
+ }
+ }
+ }
+}
+
+/*!\cond */
+// Arguments for speed feature pruning of inter mode search
+typedef struct {
+ int *skip_motion_mode;
+ mode_skip_mask_t *mode_skip_mask;
+ InterModeSearchState *search_state;
+ int skip_ref_frame_mask;
+ int reach_first_comp_mode;
+ int mode_thresh_mul_fact;
+ int num_single_modes_processed;
+ int prune_cpd_using_sr_stats_ready;
+} InterModeSFArgs;
+/*!\endcond */
+
+static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
+ int64_t *ref_frame_rd, int midx,
+ InterModeSFArgs *args, int is_low_temp_var) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ // Get the actual prediction mode we are trying in this iteration
+ const THR_MODES mode_enum = av1_default_mode_order[midx];
+ const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
+ const PREDICTION_MODE this_mode = mode_def->mode;
+ const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame;
+ const MV_REFERENCE_FRAME ref_frame = ref_frames[0];
+ const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1];
+ const int comp_pred = second_ref_frame > INTRA_FRAME;
+
+ if (ref_frame == INTRA_FRAME) return 1;
+
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ if (sf->inter_sf.skip_arf_compound && update_type == ARF_UPDATE &&
+ comp_pred) {
+ return 1;
+ }
+
+ // This is for real time encoding.
+ if (is_low_temp_var && !comp_pred && ref_frame != LAST_FRAME &&
+ this_mode != NEARESTMV)
+ return 1;
+
+ // Check if this mode should be skipped because it is incompatible with the
+ // current frame
+ if (inter_mode_compatible_skip(cpi, x, bsize, this_mode, ref_frames))
+ return 1;
+ const int ret = inter_mode_search_order_independent_skip(
+ cpi, x, args->mode_skip_mask, args->search_state,
+ args->skip_ref_frame_mask, this_mode, mode_def->ref_frame);
+ if (ret == 1) return 1;
+ *(args->skip_motion_mode) = (ret == 2);
+
+ // We've reached the first compound prediction mode; get stats from the
+ // single reference predictors to help with pruning.
+ // Disable this pruning logic if interpolation filter search was skipped for
+ // single prediction modes as it can result in aggressive pruning of compound
+ // prediction modes due to the absence of modelled_rd populated by
+ // av1_interpolation_filter_search().
+ // TODO(Remya): Check the impact of the sf
+ // 'prune_comp_search_by_single_result' if compound prediction modes are
+ // enabled in future for REALTIME encode.
+ if (!sf->interp_sf.skip_interp_filter_search &&
+ sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred &&
+ args->reach_first_comp_mode == 0) {
+ analyze_single_states(cpi, args->search_state);
+ args->reach_first_comp_mode = 1;
+ }
+
+ // Prune aggressively when best mode is skippable.
+ int mul_fact = args->search_state->best_mode_skippable
+ ? args->mode_thresh_mul_fact
+ : (1 << MODE_THRESH_QBITS);
+ int64_t mode_threshold =
+ (args->search_state->mode_threshold[mode_enum] * mul_fact) >>
+ MODE_THRESH_QBITS;
+
+ if (args->search_state->best_rd < mode_threshold) return 1;
+
+ // Skip this compound mode based on the RD results from the single prediction
+ // modes
+ if (!sf->interp_sf.skip_interp_filter_search &&
+ sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred) {
+ if (compound_skip_by_single_states(cpi, args->search_state, this_mode,
+ ref_frame, second_ref_frame, x))
+ return 1;
+ }
+
+ if (sf->inter_sf.prune_compound_using_single_ref && comp_pred) {
+ // After we are done with the single reference modes, find the best RD
+ // among the reference frames. Only search compound modes whose reference
+ // frames have an RD within the cut-off (10% above that best RD).
+ if (!args->prune_cpd_using_sr_stats_ready &&
+ args->num_single_modes_processed == NUM_SINGLE_REF_MODES) {
+ find_top_ref(ref_frame_rd);
+ args->prune_cpd_using_sr_stats_ready = 1;
+ }
+ if (args->prune_cpd_using_sr_stats_ready &&
+ !in_single_ref_cutoff(ref_frame_rd, ref_frame, second_ref_frame))
+ return 1;
+ }
+
+ // Skip NEW_NEARMV and NEAR_NEWMV extended compound modes
+ if (sf->inter_sf.skip_ext_comp_nearmv_mode &&
+ (this_mode == NEW_NEARMV || this_mode == NEAR_NEWMV)) {
+ return 1;
+ }
+
+ if (sf->inter_sf.prune_ext_comp_using_neighbors && comp_pred) {
+ if (compound_skip_using_neighbor_refs(
+ xd, this_mode, ref_frames,
+ sf->inter_sf.prune_ext_comp_using_neighbors))
+ return 1;
+ }
+
+ if (sf->inter_sf.prune_comp_using_best_single_mode_ref && comp_pred) {
+ if (skip_compound_using_best_single_mode_ref(
+ this_mode, ref_frames, args->search_state->best_single_mode,
+ sf->inter_sf.prune_comp_using_best_single_mode_ref))
+ return 1;
+ }
+
+ if (sf->inter_sf.prune_nearest_near_mv_using_refmv_weight && !comp_pred) {
+ const int8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+ if (skip_nearest_near_mv_using_refmv_weight(
+ x, this_mode, ref_frame_type,
+ args->search_state->best_mbmode.mode)) {
+ // Ensure the mode is pruned only when the current block has obtained a
+ // valid inter mode.
+ assert(is_inter_mode(args->search_state->best_mbmode.mode));
+ return 1;
+ }
+ }
+
+ if (sf->rt_sf.prune_inter_modes_with_golden_ref &&
+ ref_frame == GOLDEN_FRAME && !comp_pred) {
+ const int subgop_size = AOMMIN(cpi->ppi->gf_group.size, FIXED_GF_INTERVAL);
+ if (cpi->rc.frames_since_golden > (subgop_size >> 2) &&
+ args->search_state->best_mbmode.ref_frame[0] != GOLDEN_FRAME) {
+ if ((bsize > BLOCK_16X16 && this_mode == NEWMV) || this_mode == NEARMV)
+ return 1;
+ }
+ }
+
+ return 0;
+}
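+
+// A minimal illustrative sketch, not upstream code: the hypothetical helper
+// below isolates the fixed-point threshold scaling used in skip_inter_mode().
+// mul_fact == (1 << MODE_THRESH_QBITS) leaves the threshold unchanged, while
+// larger values raise it and therefore prune more modes.
+static INLINE int64_t example_scaled_mode_threshold(int64_t threshold,
+                                                    int mul_fact) {
+  return (threshold * mul_fact) >> MODE_THRESH_QBITS;
+}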
+
+static void record_best_compound(REFERENCE_MODE reference_mode,
+ RD_STATS *rd_stats, int comp_pred, int rdmult,
+ InterModeSearchState *search_state,
+ int compmode_cost) {
+ int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+ if (reference_mode == REFERENCE_MODE_SELECT) {
+ single_rate = rd_stats->rate - compmode_cost;
+ hybrid_rate = rd_stats->rate;
+ } else {
+ single_rate = rd_stats->rate;
+ hybrid_rate = rd_stats->rate + compmode_cost;
+ }
+
+ single_rd = RDCOST(rdmult, single_rate, rd_stats->dist);
+ hybrid_rd = RDCOST(rdmult, hybrid_rate, rd_stats->dist);
+
+ if (!comp_pred) {
+ if (single_rd < search_state->best_pred_rd[SINGLE_REFERENCE])
+ search_state->best_pred_rd[SINGLE_REFERENCE] = single_rd;
+ } else {
+ if (single_rd < search_state->best_pred_rd[COMPOUND_REFERENCE])
+ search_state->best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+ }
+ if (hybrid_rd < search_state->best_pred_rd[REFERENCE_MODE_SELECT])
+ search_state->best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+}
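+
+// A worked example of the rate bookkeeping above, with made-up numbers:
+// given rd_stats->rate = 100 and compmode_cost = 10, REFERENCE_MODE_SELECT
+// already includes the flag cost in the rate, so single_rate = 90 and
+// hybrid_rate = 100; in the fixed-reference case single_rate = 100 and
+// hybrid_rate = 110. The per-category minima are then tracked via RDCOST.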
+
+// Does a transform search over a list of the best inter mode candidates.
+// This is called if the original mode search computed an RD estimate
+// for the transform search rather than doing a full search.
+static void tx_search_best_inter_candidates(
+ AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x,
+ int64_t best_rd_so_far, BLOCK_SIZE bsize,
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], int mi_row, int mi_col,
+ InterModeSearchState *search_state, RD_STATS *rd_cost,
+ PICK_MODE_CONTEXT *ctx, int64_t *yrd) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int num_planes = av1_num_planes(cm);
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ InterModesInfo *inter_modes_info = x->inter_modes_info;
+ inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
+ search_state->best_rd = best_rd_so_far;
+ search_state->best_mode_index = THR_INVALID;
+ // Initialize best mode stats for winner mode processing
+ x->winner_mode_count = 0;
+ store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID,
+ NULL, bsize, best_rd_so_far,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, 0);
+ inter_modes_info->num =
+ inter_modes_info->num < cpi->sf.rt_sf.num_inter_modes_for_tx_search
+ ? inter_modes_info->num
+ : cpi->sf.rt_sf.num_inter_modes_for_tx_search;
+ const int64_t top_est_rd =
+ inter_modes_info->num > 0
+ ? inter_modes_info
+ ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx]
+ : INT64_MAX;
+ *yrd = INT64_MAX;
+ int64_t best_rd_in_this_partition = INT64_MAX;
+ int num_inter_mode_cands = inter_modes_info->num;
+ int newmv_mode_evaled = 0;
+ int max_allowed_cands = INT_MAX;
+ if (cpi->sf.inter_sf.limit_inter_mode_cands) {
+ // Once a NEWMV mode has been evaluated, stop the search after more than
+ // max_allowed_cands candidates have been examined.
+ const int num_allowed_cands[5] = { INT_MAX, 10, 9, 6, 2 };
+ assert(cpi->sf.inter_sf.limit_inter_mode_cands <= 4);
+ max_allowed_cands =
+ num_allowed_cands[cpi->sf.inter_sf.limit_inter_mode_cands];
+ }
+
+ int num_mode_thresh = INT_MAX;
+ if (cpi->sf.inter_sf.limit_txfm_eval_per_mode) {
+ // Bound the number of transform searches per prediction mode once the
+ // total number of candidates exceeds a threshold.
+ const int num_mode_thresh_ary[4] = { INT_MAX, 4, 3, 0 };
+ assert(cpi->sf.inter_sf.limit_txfm_eval_per_mode <= 3);
+ num_mode_thresh =
+ num_mode_thresh_ary[cpi->sf.inter_sf.limit_txfm_eval_per_mode];
+ }
+
+ int num_tx_cands = 0;
+ int num_tx_search_modes[INTER_MODE_END - INTER_MODE_START] = { 0 };
+ // Iterate over best inter mode candidates and perform tx search
+ for (int j = 0; j < num_inter_mode_cands; ++j) {
+ const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
+ *mbmi = inter_modes_info->mbmi_arr[data_idx];
+ const PREDICTION_MODE prediction_mode = mbmi->mode;
+ int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
+ if (curr_est_rd * 0.80 > top_est_rd) break;
+
+ if (num_tx_cands > num_mode_thresh) {
+ if ((prediction_mode != NEARESTMV &&
+ num_tx_search_modes[prediction_mode - INTER_MODE_START] >= 1) ||
+ (prediction_mode == NEARESTMV &&
+ num_tx_search_modes[prediction_mode - INTER_MODE_START] >= 2))
+ continue;
+ }
+
+ txfm_info->skip_txfm = 0;
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ bool is_predictor_built = false;
+
+ // Initialize RD stats
+ RD_STATS rd_stats;
+ RD_STATS rd_stats_y;
+ RD_STATS rd_stats_uv;
+ const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+ int64_t skip_rd = INT64_MAX;
+ const int txfm_rd_gate_level = get_txfm_rd_gate_level(
+ cm->seq_params->enable_masked_compound,
+ cpi->sf.inter_sf.txfm_rd_gate_level, bsize, TX_SEARCH_DEFAULT,
+ /*eval_motion_mode=*/0);
+ if (txfm_rd_gate_level) {
+ // Check if the mode is good enough based on skip RD
+ int64_t curr_sse = inter_modes_info->sse_arr[data_idx];
+ skip_rd = RDCOST(x->rdmult, mode_rate, curr_sse);
+ int eval_txfm = check_txfm_eval(x, bsize, search_state->best_skip_rd[0],
+ skip_rd, txfm_rd_gate_level, 0);
+ if (!eval_txfm) continue;
+ }
+
+ // Build the prediction for this mode
+ if (!is_predictor_built) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
+ }
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
+ av1_build_obmc_inter_predictors_sb(cm, xd);
+ }
+
+ num_tx_cands++;
+ if (have_newmv_in_inter_mode(prediction_mode)) newmv_mode_evaled = 1;
+ num_tx_search_modes[prediction_mode - INTER_MODE_START]++;
+ int64_t this_yrd = INT64_MAX;
+ // Do the transform search
+ if (!av1_txfm_search(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
+ mode_rate, search_state->best_rd)) {
+ continue;
+ } else {
+ const int y_rate =
+ rd_stats.skip_txfm
+ ? mode_costs->skip_txfm_cost[skip_ctx][1]
+ : (rd_stats_y.rate + mode_costs->skip_txfm_cost[skip_ctx][0]);
+ this_yrd = RDCOST(x->rdmult, y_rate + mode_rate, rd_stats_y.dist);
+
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ inter_mode_data_push(
+ tile_data, mbmi->bsize, rd_stats.sse, rd_stats.dist,
+ rd_stats_y.rate + rd_stats_uv.rate +
+ mode_costs->skip_txfm_cost[skip_ctx][mbmi->skip_txfm]);
+ }
+ }
+ rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+ if (rd_stats.rdcost < best_rd_in_this_partition) {
+ best_rd_in_this_partition = rd_stats.rdcost;
+ *yrd = this_yrd;
+ }
+
+ const THR_MODES mode_enum = get_prediction_mode_idx(
+ prediction_mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Collect mode stats for multiwinner mode processing
+ const int txfm_search_done = 1;
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv, mode_enum,
+ NULL, bsize, rd_stats.rdcost,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+
+ if (rd_stats.rdcost < search_state->best_rd) {
+ update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, mode_enum, x, txfm_search_done);
+ search_state->best_skip_rd[0] = skip_rd;
+ // Limit the total number of modes to be evaluated if the first candidate
+ // is valid and uses transform skip or is compound.
+ if (cpi->sf.inter_sf.inter_mode_txfm_breakout) {
+ if (!j && (search_state->best_mbmode.skip_txfm || rd_stats.skip_txfm)) {
+ // Evaluate more candidates at high quantizers where occurrence of
+ // transform skip is high.
+ const int max_cands_cap[5] = { 2, 3, 5, 7, 9 };
+ const int qindex_band = (5 * x->qindex) >> QINDEX_BITS;
+ num_inter_mode_cands =
+ AOMMIN(max_cands_cap[qindex_band], inter_modes_info->num);
+ } else if (!j && has_second_ref(&search_state->best_mbmode)) {
+ const int aggr = cpi->sf.inter_sf.inter_mode_txfm_breakout - 1;
+ // Evaluate more candidates at low quantizers where occurrence of
+ // single reference mode is high.
+ const int max_cands_cap_cmp[2][4] = { { 10, 7, 5, 4 },
+ { 10, 7, 5, 3 } };
+ const int qindex_band_cmp = (4 * x->qindex) >> QINDEX_BITS;
+ num_inter_mode_cands = AOMMIN(
+ max_cands_cap_cmp[aggr][qindex_band_cmp], inter_modes_info->num);
+ }
+ }
+ }
+ // Break once the number of evaluated candidates exceeds max_allowed_cands,
+ // provided a newmv mode has already been evaluated.
+ if ((num_tx_cands > max_allowed_cands) && newmv_mode_evaled) break;
+ }
+}
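+
+// A minimal illustrative sketch, not upstream code, of the quantizer banding
+// used by the inter_mode_txfm_breakout logic above, assuming QINDEX_BITS is
+// 8 so that qindex values 0..255 map to bands 0..4.
+static INLINE int example_qindex_band(int qindex) {
+  return (5 * qindex) >> QINDEX_BITS;  // e.g. qindex 100 -> 1, 240 -> 4.
+}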
+
+// Indicates number of winner simple translation modes to be used
+static const unsigned int num_winner_motion_modes[3] = { 0, 10, 3 };
+
+// Adds a motion mode to the candidate list for motion_mode_for_winner_cand
+// speed feature. This list consists of modes that have only searched
+// SIMPLE_TRANSLATION. The final list will be used to search other motion
+// modes after the initial RD search.
+static void handle_winner_cand(
+ MB_MODE_INFO *const mbmi,
+ motion_mode_best_st_candidate *best_motion_mode_cands,
+ int max_winner_motion_mode_cand, int64_t this_rd,
+ motion_mode_candidate *motion_mode_cand, int skip_motion_mode) {
+ // Number of current motion mode candidates in list
+ const int num_motion_mode_cand = best_motion_mode_cands->num_motion_mode_cand;
+ int valid_motion_mode_cand_loc = num_motion_mode_cand;
+
+ // find the best location to insert new motion mode candidate
+ for (int j = 0; j < num_motion_mode_cand; j++) {
+ if (this_rd < best_motion_mode_cands->motion_mode_cand[j].rd_cost) {
+ valid_motion_mode_cand_loc = j;
+ break;
+ }
+ }
+
+ // Insert motion mode if location is found
+ if (valid_motion_mode_cand_loc < max_winner_motion_mode_cand) {
+ if (num_motion_mode_cand > 0 &&
+ valid_motion_mode_cand_loc < max_winner_motion_mode_cand - 1)
+ memmove(
+ &best_motion_mode_cands
+ ->motion_mode_cand[valid_motion_mode_cand_loc + 1],
+ &best_motion_mode_cands->motion_mode_cand[valid_motion_mode_cand_loc],
+ (AOMMIN(num_motion_mode_cand, max_winner_motion_mode_cand - 1) -
+ valid_motion_mode_cand_loc) *
+ sizeof(best_motion_mode_cands->motion_mode_cand[0]));
+ motion_mode_cand->mbmi = *mbmi;
+ motion_mode_cand->rd_cost = this_rd;
+ motion_mode_cand->skip_motion_mode = skip_motion_mode;
+ best_motion_mode_cands->motion_mode_cand[valid_motion_mode_cand_loc] =
+ *motion_mode_cand;
+ best_motion_mode_cands->num_motion_mode_cand =
+ AOMMIN(max_winner_motion_mode_cand,
+ best_motion_mode_cands->num_motion_mode_cand + 1);
+ }
+}
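+
+// A worked example of the insertion above, with made-up numbers: given
+// max_winner_motion_mode_cand == 3 and current rd_costs { 100, 200, 300 },
+// a new candidate with rd 150 lands in slot 1, 200 shifts to slot 2, the
+// old 300 falls off the end, and num_motion_mode_cand stays at 3.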
+
+/*!\brief Search intra modes in interframes
+ *
+ * \ingroup intra_mode_search
+ *
+ * This function searches for the best intra mode when the current frame is an
+ * interframe. This function, however, does *not* handle luma palette mode.
+ * Palette mode is currently handled by \ref av1_search_palette_mode.
+ *
+ * This function first iterates through the luma mode candidates to find the
+ * best luma intra mode. Once the best luma mode is found, it then searches
+ * for the best chroma mode. Because palette mode is currently not handled
+ * here, a cache of the uv mode is stored in
+ * InterModeSearchState::intra_search_state so it can be reused later by \ref
+ * av1_search_palette_mode.
+ *
+ * \param[in,out] search_state Struct keep track of the prediction mode
+ * search state in interframe.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in,out] x Pointer to struct holding all the data for
+ * the current prediction block.
+ * \param[out] rd_cost Stores the best rd_cost among all the
+ * prediction modes searched.
+ * \param[in] bsize Current block size.
+ * \param[in,out] ctx Structure holding the number of 4x4 blks used
+ * to copy the tx_type and txfm_skip arrays for
+ * the Y plane only.
+ * \param[in] sf_args Stores the list of intra mode candidates
+ * to be searched.
+ * \param[in] intra_ref_frame_cost The entropy cost for signaling that the
+ * current ref frame is an intra frame.
+ * \param[in] yrd_threshold The rdcost threshold for luma intra mode to
+ * terminate chroma intra mode search.
+ *
+ * \remark If a new best mode is found, search_state and rd_costs are updated
+ * correspondingly. While x is also modified, it is only used as a temporary
+ * buffer, and the final decisions are stored in search_state.
+ */
+static AOM_INLINE void search_intra_modes_in_interframe(
+ InterModeSearchState *search_state, const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ const InterModeSFArgs *sf_args, unsigned int intra_ref_frame_cost,
+ int64_t yrd_threshold) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const IntraModeCfg *const intra_mode_cfg = &cpi->oxcf.intra_mode_cfg;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ IntraModeSearchState *intra_search_state = &search_state->intra_search_state;
+
+ int is_best_y_mode_intra = 0;
+ RD_STATS best_intra_rd_stats_y;
+ int64_t best_rd_y = INT64_MAX;
+ int best_mode_cost_y = -1;
+ MB_MODE_INFO best_mbmi = *xd->mi[0];
+ THR_MODES best_mode_enum = THR_INVALID;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const int num_4x4 = bsize_to_num_blk(bsize);
+
+ // Performs luma search
+ int64_t best_model_rd = INT64_MAX;
+ int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT];
+ for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) {
+ top_intra_model_rd[i] = INT64_MAX;
+ }
+ for (int mode_idx = 0; mode_idx < LUMA_MODE_COUNT; ++mode_idx) {
+ if (sf->intra_sf.skip_intra_in_interframe &&
+ search_state->intra_search_state.skip_intra_modes)
+ break;
+ set_y_mode_and_delta_angle(
+ mode_idx, mbmi, sf->intra_sf.prune_luma_odd_delta_angles_in_intra);
+ assert(mbmi->mode < INTRA_MODE_END);
+
+ // Use intra_y_mode_mask speed feature to skip intra mode evaluation.
+ if (sf_args->mode_skip_mask->pred_modes[INTRA_FRAME] & (1 << mbmi->mode))
+ continue;
+
+ const THR_MODES mode_enum =
+ get_prediction_mode_idx(mbmi->mode, INTRA_FRAME, NONE_FRAME);
+ if ((!intra_mode_cfg->enable_smooth_intra ||
+ cpi->sf.intra_sf.disable_smooth_intra) &&
+ (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+ mbmi->mode == SMOOTH_V_PRED))
+ continue;
+ if (!intra_mode_cfg->enable_paeth_intra && mbmi->mode == PAETH_PRED)
+ continue;
+ if (av1_is_directional_mode(mbmi->mode) &&
+ !(av1_use_angle_delta(bsize) && intra_mode_cfg->enable_angle_delta) &&
+ mbmi->angle_delta[PLANE_TYPE_Y] != 0)
+ continue;
+ const PREDICTION_MODE this_mode = mbmi->mode;
+
+ assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME);
+ assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME);
+ init_mbmi(mbmi, this_mode, av1_mode_defs[mode_enum].ref_frame, cm);
+ x->txfm_search_info.skip_txfm = 0;
+
+ if (this_mode != DC_PRED) {
+ // Only search the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+ (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) {
+ if (search_state->best_mode_index != THR_INVALID &&
+ search_state->best_mbmode.ref_frame[0] > INTRA_FRAME)
+ continue;
+ }
+ if (sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(
+ this_mode, search_state->intra_search_state.best_intra_mode))
+ continue;
+ }
+ }
+
+ RD_STATS intra_rd_stats_y;
+ int mode_cost_y;
+ int64_t intra_rd_y = INT64_MAX;
+ const int is_luma_result_valid = av1_handle_intra_y_mode(
+ intra_search_state, cpi, x, bsize, intra_ref_frame_cost, ctx,
+ &intra_rd_stats_y, search_state->best_rd, &mode_cost_y, &intra_rd_y,
+ &best_model_rd, top_intra_model_rd);
+ if (is_luma_result_valid && intra_rd_y < yrd_threshold) {
+ is_best_y_mode_intra = 1;
+ if (intra_rd_y < best_rd_y) {
+ best_intra_rd_stats_y = intra_rd_stats_y;
+ best_mode_cost_y = mode_cost_y;
+ best_rd_y = intra_rd_y;
+ best_mbmi = *mbmi;
+ best_mode_enum = mode_enum;
+ memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(best_blk_skip[0]) * num_4x4);
+ av1_copy_array(best_tx_type_map, xd->tx_type_map, num_4x4);
+ }
+ }
+ }
+
+ if (!is_best_y_mode_intra) {
+ return;
+ }
+
+ assert(best_rd_y < INT64_MAX);
+
+ // Restores the best luma mode
+ *mbmi = best_mbmi;
+ memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * num_4x4);
+ av1_copy_array(xd->tx_type_map, best_tx_type_map, num_4x4);
+
+ // Performs chroma search
+ RD_STATS intra_rd_stats, intra_rd_stats_uv;
+ av1_init_rd_stats(&intra_rd_stats);
+ av1_init_rd_stats(&intra_rd_stats_uv);
+ const int num_planes = av1_num_planes(cm);
+ if (num_planes > 1) {
+ const int intra_uv_mode_valid = av1_search_intra_uv_modes_in_interframe(
+ intra_search_state, cpi, x, bsize, &intra_rd_stats,
+ &best_intra_rd_stats_y, &intra_rd_stats_uv, search_state->best_rd);
+
+ if (!intra_uv_mode_valid) {
+ return;
+ }
+ }
+
+ // Merge the luma and chroma rd stats
+ assert(best_mode_cost_y >= 0);
+ intra_rd_stats.rate = best_intra_rd_stats_y.rate + best_mode_cost_y;
+ if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
+ // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size
+ // in the tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ best_intra_rd_stats_y.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
+ }
+
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const PREDICTION_MODE mode = mbmi->mode;
+ if (num_planes > 1 && xd->is_chroma_ref) {
+ const int uv_mode_cost =
+ mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mode][mbmi->uv_mode];
+ intra_rd_stats.rate +=
+ intra_rd_stats_uv.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost);
+ }
+
+ // Intra block is always coded as non-skip
+ intra_rd_stats.skip_txfm = 0;
+ intra_rd_stats.dist = best_intra_rd_stats_y.dist + intra_rd_stats_uv.dist;
+ // Add in the cost of the no skip flag.
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ intra_rd_stats.rate += mode_costs->skip_txfm_cost[skip_ctx][0];
+ // Calculate the final RD estimate for this mode.
+ const int64_t this_rd =
+ RDCOST(x->rdmult, intra_rd_stats.rate, intra_rd_stats.dist);
+ // Keep record of best intra rd
+ if (this_rd < search_state->best_intra_rd) {
+ search_state->best_intra_rd = this_rd;
+ intra_search_state->best_intra_mode = mode;
+ }
+
+ for (int i = 0; i < REFERENCE_MODES; ++i) {
+ search_state->best_pred_rd[i] =
+ AOMMIN(search_state->best_pred_rd[i], this_rd);
+ }
+
+ intra_rd_stats.rdcost = this_rd;
+
+ // Collect mode stats for multiwinner mode processing
+ const int txfm_search_done = 1;
+ store_winner_mode_stats(
+ &cpi->common, x, mbmi, &intra_rd_stats, &best_intra_rd_stats_y,
+ &intra_rd_stats_uv, best_mode_enum, NULL, bsize, intra_rd_stats.rdcost,
+ cpi->sf.winner_mode_sf.multi_winner_mode_type, txfm_search_done);
+ if (intra_rd_stats.rdcost < search_state->best_rd) {
+ update_search_state(search_state, rd_cost, ctx, &intra_rd_stats,
+ &best_intra_rd_stats_y, &intra_rd_stats_uv,
+ best_mode_enum, x, txfm_search_done);
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Prepare inter_cost and intra_cost from TPL stats, which are used as ML
+// features in intra mode pruning.
+static AOM_INLINE void calculate_cost_from_tpl_data(
+ const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int64_t *inter_cost, int64_t *intra_cost) {
+ const AV1_COMMON *const cm = &cpi->common;
+ // Only consider full SB.
+ const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+ const int tpl_bsize_1d = cpi->ppi->tpl_data.tpl_bsize_1d;
+ const int len = (block_size_wide[sb_size] / tpl_bsize_1d) *
+ (block_size_high[sb_size] / tpl_bsize_1d);
+ SuperBlockEnc *sb_enc = &x->sb_enc;
+ if (sb_enc->tpl_data_count == len) {
+ const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_bsize_1d);
+ const int tpl_stride = sb_enc->tpl_stride;
+ const int tplw = mi_size_wide[tpl_bsize];
+ const int tplh = mi_size_high[tpl_bsize];
+ const int nw = mi_size_wide[bsize] / tplw;
+ const int nh = mi_size_high[bsize] / tplh;
+ if (nw >= 1 && nh >= 1) {
+ const int of_h = mi_row % mi_size_high[sb_size];
+ const int of_w = mi_col % mi_size_wide[sb_size];
+ const int start = of_h / tplh * tpl_stride + of_w / tplw;
+
+ for (int k = 0; k < nh; k++) {
+ for (int l = 0; l < nw; l++) {
+ *inter_cost += sb_enc->tpl_inter_cost[start + k * tpl_stride + l];
+ *intra_cost += sb_enc->tpl_intra_cost[start + k * tpl_stride + l];
+ }
+ }
+ *inter_cost /= nw * nh;
+ *intra_cost /= nw * nh;
+ }
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
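+
+// A worked example of the TPL indexing above, with made-up dimensions: for a
+// 128x128 superblock (32 mi units), a TPL unit of 16x16 (tplw == tplh == 4)
+// and a block at of_h == 8, of_w == 12, accumulation starts at
+// (8 / 4) * tpl_stride + (12 / 4), i.e. row 2, column 3 of the superblock's
+// TPL grid.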
+
+// When the speed feature skip_intra_in_interframe > 0, enable the ML model
+// to prune the intra mode search.
+static AOM_INLINE void skip_intra_modes_in_interframe(
+ AV1_COMMON *const cm, struct macroblock *x, BLOCK_SIZE bsize,
+ InterModeSearchState *search_state, const SPEED_FEATURES *const sf,
+ int64_t inter_cost, int64_t intra_cost) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int comp_pred = search_state->best_mbmode.ref_frame[1] > INTRA_FRAME;
+ if (sf->rt_sf.prune_intra_mode_based_on_mv_range &&
+ bsize > sf->part_sf.max_intra_bsize && !comp_pred) {
+ const MV best_mv = search_state->best_mbmode.mv[0].as_mv;
+ const int mv_thresh = 16 << sf->rt_sf.prune_intra_mode_based_on_mv_range;
+ if (abs(best_mv.row) < mv_thresh && abs(best_mv.col) < mv_thresh &&
+ x->source_variance > 128) {
+ search_state->intra_search_state.skip_intra_modes = 1;
+ return;
+ }
+ }
+
+ const unsigned int src_var_thresh_intra_skip = 1;
+ const int skip_intra_in_interframe = sf->intra_sf.skip_intra_in_interframe;
+ if (!(skip_intra_in_interframe &&
+ (x->source_variance > src_var_thresh_intra_skip)))
+ return;
+
+ // Prune intra search based on the best inter mode being transform skip.
+ if ((skip_intra_in_interframe >= 2) && search_state->best_mbmode.skip_txfm) {
+ const int qindex_thresh[2] = { 200, MAXQ };
+ const int ind = (skip_intra_in_interframe >= 3) ? 1 : 0;
+ if (!have_newmv_in_inter_mode(search_state->best_mbmode.mode) &&
+ (x->qindex <= qindex_thresh[ind])) {
+ search_state->intra_search_state.skip_intra_modes = 1;
+ return;
+ } else if ((skip_intra_in_interframe >= 4) &&
+ (inter_cost < 0 || intra_cost < 0)) {
+ search_state->intra_search_state.skip_intra_modes = 1;
+ return;
+ }
+ }
+ // Use ML model to prune intra search.
+ if (inter_cost >= 0 && intra_cost >= 0) {
+ const NN_CONFIG *nn_config = (AOMMIN(cm->width, cm->height) <= 480)
+ ? &av1_intrap_nn_config
+ : &av1_intrap_hd_nn_config;
+ float nn_features[6];
+ float scores[2] = { 0.0f };
+
+ nn_features[0] = (float)search_state->best_mbmode.skip_txfm;
+ nn_features[1] = (float)mi_size_wide_log2[bsize];
+ nn_features[2] = (float)mi_size_high_log2[bsize];
+ nn_features[3] = (float)intra_cost;
+ nn_features[4] = (float)inter_cost;
+ const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+ const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd);
+ nn_features[5] = (float)(ac_q_max / ac_q);
+
+ av1_nn_predict(nn_features, nn_config, 1, scores);
+
+ // For two parameters, the max prob returned from av1_nn_softmax equals
+ // 1.0 / (1.0 + e^(-|diff_score|)). Here the scores are used directly to
+ // avoid calling av1_nn_softmax.
+ const float thresh[5] = { 1.4f, 1.4f, 1.4f, 1.4f, 1.4f };
+ assert(skip_intra_in_interframe <= 5);
+ if (scores[1] > scores[0] + thresh[skip_intra_in_interframe - 1]) {
+ search_state->intra_search_state.skip_intra_modes = 1;
+ }
+ }
+}
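+
+// A worked note on the thresholds above: for two outputs, a score margin of
+// 1.4 corresponds to a softmax probability of about
+// 1.0 / (1.0 + e^(-1.4)) ~= 0.80, so the intra search is skipped only when
+// the model is roughly 80% confident that inter prediction wins.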
+
+static AOM_INLINE bool skip_interp_filter_search(const AV1_COMP *cpi,
+ int is_single_pred) {
+ const MODE encoding_mode = cpi->oxcf.mode;
+ if (encoding_mode == REALTIME) {
+ return (cpi->common.current_frame.reference_mode == SINGLE_REFERENCE &&
+ (cpi->sf.interp_sf.skip_interp_filter_search ||
+ cpi->sf.winner_mode_sf.winner_mode_ifs));
+ } else if (encoding_mode == GOOD) {
+ // Skip interpolation filter search for single prediction modes.
+ return (cpi->sf.interp_sf.skip_interp_filter_search && is_single_pred);
+ }
+ return false;
+}
+
+static AOM_INLINE int get_block_temp_var(const AV1_COMP *cpi,
+ const MACROBLOCK *x,
+ BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+
+ if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION ||
+ !sf->rt_sf.short_circuit_low_temp_var ||
+ !sf->rt_sf.prune_inter_modes_using_temp_var) {
+ return 0;
+ }
+
+ const int mi_row = x->e_mbd.mi_row;
+ const int mi_col = x->e_mbd.mi_col;
+ int is_low_temp_var = 0;
+
+ if (cm->seq_params->sb_size == BLOCK_64X64)
+ is_low_temp_var = av1_get_force_skip_low_temp_var_small_sb(
+ &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+ else
+ is_low_temp_var = av1_get_force_skip_low_temp_var(
+ &x->part_search_info.variance_low[0], mi_row, mi_col, bsize);
+
+ return is_low_temp_var;
+}
+
+// TODO(chiyotsai@google.com): See the todo for av1_rd_pick_intra_mode_sb.
+void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+ struct macroblock *x, struct RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ AV1_COMMON *const cm = &cpi->common;
+ const FeatureFlags *const features = &cm->features;
+ const int num_planes = av1_num_planes(cm);
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ int i;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *comp_inter_cost =
+ mode_costs->comp_inter_cost[av1_get_reference_mode_context(xd)];
+
+ InterModeSearchState search_state;
+ init_inter_mode_search_state(&search_state, cpi, x, bsize, best_rd_so_far);
+ INTERINTRA_MODE interintra_modes[REF_FRAMES] = {
+ INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES,
+ INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES
+ };
+ HandleInterModeArgs args = { { NULL },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
+ { NULL },
+ { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+ MAX_SB_SIZE >> 1 },
+ NULL,
+ NULL,
+ NULL,
+ search_state.modelled_rd,
+ INT_MAX,
+ INT_MAX,
+ search_state.simple_rd,
+ 0,
+ false,
+ interintra_modes,
+ { { { 0 }, { { 0 } }, { 0 }, 0, 0, 0, 0 } },
+ { { 0, 0 } },
+ { 0 },
+ 0,
+ 0,
+ -1,
+ -1,
+ -1,
+ { 0 },
+ { 0 },
+ UINT_MAX };
+ // Currently, is_low_temp_var is used in real time encoding.
+ const int is_low_temp_var = get_block_temp_var(cpi, x, bsize);
+
+ for (i = 0; i < MODE_CTX_REF_FRAMES; ++i) args.cmp_mode[i] = -1;
+ // Indicates the appropriate number of simple translation winner modes for
+ // exhaustive motion mode evaluation
+ const int max_winner_motion_mode_cand =
+ num_winner_motion_modes[sf->winner_mode_sf.motion_mode_for_winner_cand];
+ assert(max_winner_motion_mode_cand <= MAX_WINNER_MOTION_MODES);
+ motion_mode_candidate motion_mode_cand;
+ motion_mode_best_st_candidate best_motion_mode_cands;
+ // Initializing the number of motion mode candidates to zero.
+ best_motion_mode_cands.num_motion_mode_cand = 0;
+ for (i = 0; i < MAX_WINNER_MOTION_MODES; ++i)
+ best_motion_mode_cands.motion_mode_cand[i].rd_cost = INT64_MAX;
+
+ for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+
+ av1_invalid_rd_stats(rd_cost);
+
+ for (i = 0; i < REF_FRAMES; ++i) {
+ x->warp_sample_info[i].num = -1;
+ }
+
+ // Ref frames that are selected by square partition blocks.
+ int picked_ref_frames_mask = 0;
+ if (sf->inter_sf.prune_ref_frame_for_rect_partitions &&
+ mbmi->partition != PARTITION_NONE) {
+ // prune_ref_frame_for_rect_partitions = 1 implies pruning only for
+ // extended partition blocks; prune_ref_frame_for_rect_partitions >= 2
+ // implies pruning for vert, horiz and extended partition blocks.
+ if ((mbmi->partition != PARTITION_VERT &&
+ mbmi->partition != PARTITION_HORZ) ||
+ sf->inter_sf.prune_ref_frame_for_rect_partitions >= 2) {
+ picked_ref_frames_mask =
+ fetch_picked_ref_frames_mask(x, bsize, cm->seq_params->mib_size);
+ }
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, set_params_rd_pick_inter_mode_time);
+#endif
+ // Skip ref frames that were never selected by square blocks.
+ const int skip_ref_frame_mask =
+ picked_ref_frames_mask ? ~picked_ref_frames_mask : 0;
+ mode_skip_mask_t mode_skip_mask;
+ unsigned int ref_costs_single[REF_FRAMES];
+ unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+ // init params, set frame modes, speed features
+ set_params_rd_pick_inter_mode(cpi, x, &args, bsize, &mode_skip_mask,
+ skip_ref_frame_mask, ref_costs_single,
+ ref_costs_comp, yv12_mb);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, set_params_rd_pick_inter_mode_time);
+#endif
+
+ int64_t best_est_rd = INT64_MAX;
+ const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ // If do_tx_search is 0, only estimated RD should be computed.
+ // If do_tx_search is 1, all modes have TX search performed.
+ const int do_tx_search =
+ !((sf->inter_sf.inter_mode_rd_model_estimation == 1 && md->ready) ||
+ (sf->inter_sf.inter_mode_rd_model_estimation == 2 &&
+ num_pels_log2_lookup[bsize] > 8));
+ InterModesInfo *inter_modes_info = x->inter_modes_info;
+ inter_modes_info->num = 0;
+
+ // Temporary buffers used by handle_inter_mode().
+ uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_pred_bufs[0]);
+
+ // The best RD found for each reference frame, among single reference modes.
+ // Note that the 0-th element will contain a cut-off that is later used
+ // to determine if we should skip a compound mode.
+ int64_t ref_frame_rd[REF_FRAMES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+ INT64_MAX, INT64_MAX, INT64_MAX,
+ INT64_MAX, INT64_MAX };
+
+ // Prepared stats used later to check if we could skip intra mode eval.
+ int64_t inter_cost = -1;
+ int64_t intra_cost = -1;
+ // Need to tweak the threshold for hdres speed 0 & 1.
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ // Obtain the relevant tpl stats for pruning inter modes
+ PruneInfoFromTpl inter_cost_info_from_tpl;
+#if !CONFIG_REALTIME_ONLY
+ if (sf->inter_sf.prune_inter_modes_based_on_tpl) {
+ // x->tpl_keep_ref_frame[id] = 1 => no pruning in
+ // prune_ref_by_selective_ref_frame()
+ // x->tpl_keep_ref_frame[id] = 0 => ref frame can be pruned in
+ // prune_ref_by_selective_ref_frame()
+ // Populating valid_refs[idx] = 1 ensures that
+ // 'inter_cost_info_from_tpl.best_inter_cost' does not correspond to a
+ // pruned ref frame.
+ int valid_refs[INTER_REFS_PER_FRAME];
+ for (MV_REFERENCE_FRAME frame = LAST_FRAME; frame < REF_FRAMES; frame++) {
+ const MV_REFERENCE_FRAME refs[2] = { frame, NONE_FRAME };
+ valid_refs[frame - 1] =
+ x->tpl_keep_ref_frame[frame] ||
+ !prune_ref_by_selective_ref_frame(
+ cpi, x, refs, cm->cur_frame->ref_display_order_hint);
+ }
+ av1_zero(inter_cost_info_from_tpl);
+ get_block_level_tpl_stats(cpi, bsize, mi_row, mi_col, valid_refs,
+ &inter_cost_info_from_tpl);
+ }
+
+ const int do_pruning =
+ (AOMMIN(cm->width, cm->height) > 480 && cpi->speed <= 1) ? 0 : 1;
+ if (do_pruning && sf->intra_sf.skip_intra_in_interframe &&
+ cpi->oxcf.algo_cfg.enable_tpl_model)
+ calculate_cost_from_tpl_data(cpi, x, bsize, mi_row, mi_col, &inter_cost,
+ &intra_cost);
+#endif // !CONFIG_REALTIME_ONLY
+
+ // Initialize best mode stats for winner mode processing.
+ const int max_winner_mode_count =
+ winner_mode_count_allowed[sf->winner_mode_sf.multi_winner_mode_type];
+ zero_winner_mode_stats(bsize, max_winner_mode_count, x->winner_mode_stats);
+ x->winner_mode_count = 0;
+ store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID,
+ NULL, bsize, best_rd_so_far,
+ sf->winner_mode_sf.multi_winner_mode_type, 0);
+
+ int mode_thresh_mul_fact = (1 << MODE_THRESH_QBITS);
+ if (sf->inter_sf.prune_inter_modes_if_skippable) {
+ // Higher multiplication factor values for lower quantizers.
+ mode_thresh_mul_fact = mode_threshold_mul_factor[x->qindex];
+ }
+
+ // Initialize arguments for mode loop speed features
+ InterModeSFArgs sf_args = { &args.skip_motion_mode,
+ &mode_skip_mask,
+ &search_state,
+ skip_ref_frame_mask,
+ 0,
+ mode_thresh_mul_fact,
+ 0,
+ 0 };
+ int64_t best_inter_yrd = INT64_MAX;
+
+ // This is the main loop of this function. It loops over all possible inter
+ // modes and calls handle_inter_mode() to compute the RD for each.
+ // Here midx is just an iterator index that should not be used by itself
+ // except to keep track of the number of modes searched. It should be used
+ // with av1_default_mode_order to get the enum that defines the mode, which
+ // can be used with av1_mode_defs to get the prediction mode and the ref
+ // frames.
+ // TODO(yunqing, any): Setting mode_start and mode_end outside the for-loop
+ // brings a good speedup for the real time case. If we decide to use compound
+ // modes in real time, maybe we can modify the av1_default_mode_order table.
+ THR_MODES mode_start = THR_INTER_MODE_START;
+ THR_MODES mode_end = THR_INTER_MODE_END;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ if (current_frame->reference_mode == SINGLE_REFERENCE) {
+ mode_start = SINGLE_REF_MODE_START;
+ mode_end = SINGLE_REF_MODE_END;
+ }
+
+ for (THR_MODES midx = mode_start; midx < mode_end; ++midx) {
+ // Get the actual prediction mode we are trying in this iteration
+ const THR_MODES mode_enum = av1_default_mode_order[midx];
+ const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
+ const PREDICTION_MODE this_mode = mode_def->mode;
+ const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame;
+
+ const MV_REFERENCE_FRAME ref_frame = ref_frames[0];
+ const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1];
+ const int is_single_pred =
+ ref_frame > INTRA_FRAME && second_ref_frame == NONE_FRAME;
+ const int comp_pred = second_ref_frame > INTRA_FRAME;
+
+ init_mbmi(mbmi, this_mode, ref_frames, cm);
+
+ txfm_info->skip_txfm = 0;
+ sf_args.num_single_modes_processed += is_single_pred;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, skip_inter_mode_time);
+#endif
+ // Apply speed features to decide if this inter mode can be skipped
+ const int is_skip_inter_mode = skip_inter_mode(
+ cpi, x, bsize, ref_frame_rd, midx, &sf_args, is_low_temp_var);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, skip_inter_mode_time);
+#endif
+ if (is_skip_inter_mode) continue;
+
+ // Select prediction reference frames.
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+ }
+
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->ref_mv_idx = 0;
+
+ const int64_t ref_best_rd = search_state.best_rd;
+ RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
+ av1_init_rd_stats(&rd_stats);
+
+ const int ref_frame_cost = comp_pred
+ ? ref_costs_comp[ref_frame][second_ref_frame]
+ : ref_costs_single[ref_frame];
+ const int compmode_cost =
+ is_comp_ref_allowed(mbmi->bsize) ? comp_inter_cost[comp_pred] : 0;
+ const int real_compmode_cost =
+ cm->current_frame.reference_mode == REFERENCE_MODE_SELECT
+ ? compmode_cost
+ : 0;
+ // Point to variables that are maintained between loop iterations
+ args.single_newmv = search_state.single_newmv;
+ args.single_newmv_rate = search_state.single_newmv_rate;
+ args.single_newmv_valid = search_state.single_newmv_valid;
+ args.single_comp_cost = real_compmode_cost;
+ args.ref_frame_cost = ref_frame_cost;
+ args.best_pred_sse = search_state.best_pred_sse;
+ args.skip_ifs = skip_interp_filter_search(cpi, is_single_pred);
+
+ int64_t skip_rd[2] = { search_state.best_skip_rd[0],
+ search_state.best_skip_rd[1] };
+ int64_t this_yrd = INT64_MAX;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, handle_inter_mode_time);
+#endif
+ int64_t this_rd = handle_inter_mode(
+ cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &args,
+ ref_best_rd, tmp_buf, &x->comp_rd_buffer, &best_est_rd, do_tx_search,
+ inter_modes_info, &motion_mode_cand, skip_rd, &inter_cost_info_from_tpl,
+ &this_yrd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, handle_inter_mode_time);
+#endif
+ if (current_frame->reference_mode != SINGLE_REFERENCE) {
+ if (!args.skip_ifs &&
+ sf->inter_sf.prune_comp_search_by_single_result > 0 &&
+ is_inter_singleref_mode(this_mode)) {
+ collect_single_states(x, &search_state, mbmi);
+ }
+
+ if (sf->inter_sf.prune_comp_using_best_single_mode_ref > 0 &&
+ is_inter_singleref_mode(this_mode))
+ update_best_single_mode(&search_state, this_mode, ref_frame, this_rd);
+ }
+
+ if (this_rd == INT64_MAX) continue;
+
+ if (mbmi->skip_txfm) {
+ rd_stats_y.rate = 0;
+ rd_stats_uv.rate = 0;
+ }
+
+ if (sf->inter_sf.prune_compound_using_single_ref && is_single_pred &&
+ this_rd < ref_frame_rd[ref_frame]) {
+ ref_frame_rd[ref_frame] = this_rd;
+ }
+
+    // Did this mode help, i.e., is it the new best mode?
+ if (this_rd < search_state.best_rd) {
+ assert(IMPLIES(comp_pred,
+ cm->current_frame.reference_mode != SINGLE_REFERENCE));
+ search_state.best_pred_sse = x->pred_sse[ref_frame];
+ best_inter_yrd = this_yrd;
+ update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, mode_enum, x, do_tx_search);
+ if (do_tx_search) search_state.best_skip_rd[0] = skip_rd[0];
+ // skip_rd[0] is the best total rd for a skip mode so far.
+ // skip_rd[1] is the best total rd for a skip mode so far in luma.
+ // When do_tx_search = 1, both skip_rd[0] and skip_rd[1] are updated.
+ // When do_tx_search = 0, skip_rd[1] is updated.
+ search_state.best_skip_rd[1] = skip_rd[1];
+ }
+ if (sf->winner_mode_sf.motion_mode_for_winner_cand) {
+ // Add this mode to motion mode candidate list for motion mode search
+ // if using motion_mode_for_winner_cand speed feature
+ handle_winner_cand(mbmi, &best_motion_mode_cands,
+ max_winner_motion_mode_cand, this_rd,
+ &motion_mode_cand, args.skip_motion_mode);
+ }
+
+ /* keep record of best compound/single-only prediction */
+ record_best_compound(cm->current_frame.reference_mode, &rd_stats, comp_pred,
+ x->rdmult, &search_state, compmode_cost);
+ }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, evaluate_motion_mode_for_winner_candidates_time);
+#endif
+ if (sf->winner_mode_sf.motion_mode_for_winner_cand) {
+    // For the single ref winner candidates, evaluate other motion modes
+    // (i.e., non-simple translation).
+ evaluate_motion_mode_for_winner_candidates(
+ cpi, x, rd_cost, &args, tile_data, ctx, yv12_mb,
+ &best_motion_mode_cands, do_tx_search, bsize, &best_est_rd,
+ &search_state, &best_inter_yrd);
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, evaluate_motion_mode_for_winner_candidates_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, do_tx_search_time);
+#endif
+ if (do_tx_search != 1) {
+    // A full tx search has not yet been done; do the tx search for the
+    // top mode candidates.
+ tx_search_best_inter_candidates(cpi, tile_data, x, best_rd_so_far, bsize,
+ yv12_mb, mi_row, mi_col, &search_state,
+ rd_cost, ctx, &best_inter_yrd);
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, do_tx_search_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, handle_intra_mode_time);
+#endif
+  // Gate intra mode evaluation if the best inter mode is skip, except when the
+  // source variance is extremely low; also gate based on the max intra bsize.
+ skip_intra_modes_in_interframe(cm, x, bsize, &search_state, sf, inter_cost,
+ intra_cost);
+
+ const unsigned int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME];
+ search_intra_modes_in_interframe(&search_state, cpi, x, rd_cost, bsize, ctx,
+ &sf_args, intra_ref_frame_cost,
+ best_inter_yrd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, handle_intra_mode_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, refine_winner_mode_tx_time);
+#endif
+ int winner_mode_count =
+ sf->winner_mode_sf.multi_winner_mode_type ? x->winner_mode_count : 1;
+ // In effect only when fast tx search speed features are enabled.
+ refine_winner_mode_tx(
+ cpi, x, rd_cost, bsize, ctx, &search_state.best_mode_index,
+ &search_state.best_mbmode, yv12_mb, search_state.best_rate_y,
+ search_state.best_rate_uv, &search_state.best_skip2, winner_mode_count);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, refine_winner_mode_tx_time);
+#endif
+
+ // Initialize default mode evaluation params
+ set_mode_eval_params(cpi, x, DEFAULT_EVAL);
+
+ // Only try palette mode when the best mode so far is an intra mode.
+ const int try_palette =
+ cpi->oxcf.tool_cfg.enable_palette &&
+ av1_allow_palette(features->allow_screen_content_tools, mbmi->bsize) &&
+ !is_inter_mode(search_state.best_mbmode.mode) && rd_cost->rate != INT_MAX;
+ RD_STATS this_rd_cost;
+ int this_skippable = 0;
+ if (try_palette) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_search_palette_mode_time);
+#endif
+ this_skippable = av1_search_palette_mode(
+ &search_state.intra_search_state, cpi, x, bsize, intra_ref_frame_cost,
+ ctx, &this_rd_cost, search_state.best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_search_palette_mode_time);
+#endif
+ if (this_rd_cost.rdcost < search_state.best_rd) {
+ search_state.best_mode_index = THR_DC;
+ mbmi->mv[0].as_int = 0;
+ rd_cost->rate = this_rd_cost.rate;
+ rd_cost->dist = this_rd_cost.dist;
+ rd_cost->rdcost = this_rd_cost.rdcost;
+ search_state.best_rd = rd_cost->rdcost;
+ search_state.best_mbmode = *mbmi;
+ search_state.best_skip2 = 0;
+ search_state.best_mode_skippable = this_skippable;
+ memcpy(ctx->blk_skip, txfm_info->blk_skip,
+ sizeof(txfm_info->blk_skip[0]) * ctx->num_4x4_blk);
+ av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+ }
+ }
+
+ search_state.best_mbmode.skip_mode = 0;
+ if (cm->current_frame.skip_mode_info.skip_mode_flag &&
+ is_comp_ref_allowed(bsize)) {
+ const struct segmentation *const seg = &cm->seg;
+ unsigned char segment_id = mbmi->segment_id;
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+ rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, yv12_mb);
+ }
+ }
+
+ // Make sure that the ref_mv_idx is only nonzero when we're
+ // using a mode which can support ref_mv_idx
+ if (search_state.best_mbmode.ref_mv_idx != 0 &&
+ !(search_state.best_mbmode.mode == NEWMV ||
+ search_state.best_mbmode.mode == NEW_NEWMV ||
+ have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) {
+ search_state.best_mbmode.ref_mv_idx = 0;
+ }
+
+ if (search_state.best_mode_index == THR_INVALID ||
+ search_state.best_rd >= best_rd_so_far) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+ const InterpFilter interp_filter = features->interp_filter;
+ assert((interp_filter == SWITCHABLE) ||
+ (interp_filter ==
+ search_state.best_mbmode.interp_filters.as_filters.y_filter) ||
+ !is_inter_block(&search_state.best_mbmode));
+ assert((interp_filter == SWITCHABLE) ||
+ (interp_filter ==
+ search_state.best_mbmode.interp_filters.as_filters.x_filter) ||
+ !is_inter_block(&search_state.best_mbmode));
+
+ if (!cpi->rc.is_src_frame_alt_ref && sf->inter_sf.adaptive_rd_thresh) {
+ av1_update_rd_thresh_fact(
+ cm, x->thresh_freq_fact, sf->inter_sf.adaptive_rd_thresh, bsize,
+ search_state.best_mode_index, mode_start, mode_end, THR_DC, MAX_MODES);
+ }
+
+ // macroblock modes
+ *mbmi = search_state.best_mbmode;
+ txfm_info->skip_txfm |= search_state.best_skip2;
+
+ // Note: this section is needed since the mode may have been forced to
+ // GLOBALMV by the all-zero mode handling of ref-mv.
+ if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) {
+ // Correct the interp filters for GLOBALMV
+ if (is_nontrans_global_motion(xd, xd->mi[0])) {
+ int_interpfilters filters =
+ av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
+ assert(mbmi->interp_filters.as_int == filters.as_int);
+ (void)filters;
+ }
+ }
+
+ txfm_info->skip_txfm |= search_state.best_mode_skippable;
+
+ assert(search_state.best_mode_index != THR_INVALID);
+
+#if CONFIG_INTERNAL_STATS
+ store_coding_context(x, ctx, search_state.best_mode_index,
+ search_state.best_mode_skippable);
+#else
+ store_coding_context(x, ctx, search_state.best_mode_skippable);
+#endif // CONFIG_INTERNAL_STATS
+
+ if (mbmi->palette_mode_info.palette_size[1] > 0) {
+ assert(try_palette);
+ av1_restore_uv_color_map(cpi, x);
+ }
+}
+
+void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
+ TileDataEnc *tile_data, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const FeatureFlags *const features = &cm->features;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ unsigned char segment_id = mbmi->segment_id;
+ const int comp_pred = 0;
+ int i;
+ unsigned int ref_costs_single[REF_FRAMES];
+ unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int *comp_inter_cost =
+ mode_costs->comp_inter_cost[av1_get_reference_mode_context(xd)];
+ InterpFilter best_filter = SWITCHABLE;
+ int64_t this_rd = INT64_MAX;
+ int rate2 = 0;
+ const int64_t distortion2 = 0;
+ (void)mi_row;
+ (void)mi_col;
+ (void)tile_data;
+
+ av1_collect_neighbors_ref_counts(xd);
+
+ estimate_ref_frame_costs(cm, xd, mode_costs, segment_id, ref_costs_single,
+ ref_costs_comp);
+
+ for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+ for (i = LAST_FRAME; i < REF_FRAMES; ++i) x->pred_mv_sad[i] = INT_MAX;
+
+ rd_cost->rate = INT_MAX;
+
+ assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
+
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->mode = GLOBALMV;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->uv_mode = UV_DC_PRED;
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME))
+ mbmi->ref_frame[0] = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+ else
+ mbmi->ref_frame[0] = LAST_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->mv[0].as_int =
+ gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]],
+ features->allow_high_precision_mv, bsize, mi_col,
+ mi_row, features->cur_frame_force_integer_mv)
+ .as_int;
+ mbmi->tx_size = max_txsize_lookup[bsize];
+ x->txfm_search_info.skip_txfm = 1;
+
+ mbmi->ref_mv_idx = 0;
+
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ av1_count_overlappable_neighbors(cm, xd);
+ if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) {
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref);
+ // Select the samples according to motion vector difference
+ if (mbmi->num_proj_ref > 1) {
+ mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+ mbmi->num_proj_ref, bsize);
+ }
+ }
+
+ const InterpFilter interp_filter = features->interp_filter;
+ set_default_interp_filters(mbmi, interp_filter);
+
+ if (interp_filter != SWITCHABLE) {
+ best_filter = interp_filter;
+ } else {
+ best_filter = EIGHTTAP_REGULAR;
+ if (av1_is_interp_needed(xd)) {
+ int rs;
+ int best_rs = INT_MAX;
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ mbmi->interp_filters = av1_broadcast_interp_filter(i);
+ rs = av1_get_switchable_rate(x, xd, interp_filter,
+ cm->seq_params->enable_dual_filter);
+ if (rs < best_rs) {
+ best_rs = rs;
+ best_filter = mbmi->interp_filters.as_filters.y_filter;
+ }
+ }
+ }
+ }
+ // Set the appropriate filter
+ mbmi->interp_filters = av1_broadcast_interp_filter(best_filter);
+ rate2 += av1_get_switchable_rate(x, xd, interp_filter,
+ cm->seq_params->enable_dual_filter);
+
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT)
+ rate2 += comp_inter_cost[comp_pred];
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ rate2 += ref_costs_single[LAST_FRAME];
+ this_rd = RDCOST(x->rdmult, rate2, distortion2);
+
+ rd_cost->rate = rate2;
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+
+ if (this_rd >= best_rd_so_far) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+ assert((interp_filter == SWITCHABLE) ||
+ (interp_filter == mbmi->interp_filters.as_filters.y_filter));
+
+ if (cpi->sf.inter_sf.adaptive_rd_thresh) {
+ av1_update_rd_thresh_fact(cm, x->thresh_freq_fact,
+ cpi->sf.inter_sf.adaptive_rd_thresh, bsize,
+ THR_GLOBALMV, THR_INTER_MODE_START,
+ THR_INTER_MODE_END, THR_DC, MAX_MODES);
+ }
+
+#if CONFIG_INTERNAL_STATS
+ store_coding_context(x, ctx, THR_GLOBALMV, 0);
+#else
+ store_coding_context(x, ctx, 0);
+#endif // CONFIG_INTERNAL_STATS
+}
+
+/*!\cond */
+struct calc_target_weighted_pred_ctxt {
+ const OBMCBuffer *obmc_buffer;
+ const uint8_t *tmp;
+ int tmp_stride;
+ int overlap;
+};
+/*!\endcond */
+
+static INLINE void calc_target_weighted_pred_above(
+ MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+ int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) {
+ (void)nb_mi;
+ (void)num_planes;
+ (void)rel_mi_row;
+ (void)dir;
+
+ struct calc_target_weighted_pred_ctxt *ctxt =
+ (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
+
+ const int bw = xd->width << MI_SIZE_LOG2;
+ const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
+
+ int32_t *wsrc = ctxt->obmc_buffer->wsrc + (rel_mi_col * MI_SIZE);
+ int32_t *mask = ctxt->obmc_buffer->mask + (rel_mi_col * MI_SIZE);
+ const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE;
+ const int is_hbd = is_cur_buf_hbd(xd);
+
+ if (!is_hbd) {
+ for (int row = 0; row < ctxt->overlap; ++row) {
+ const uint8_t m0 = mask1d[row];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ for (int col = 0; col < op_mi_size * MI_SIZE; ++col) {
+ wsrc[col] = m1 * tmp[col];
+ mask[col] = m0;
+ }
+ wsrc += bw;
+ mask += bw;
+ tmp += ctxt->tmp_stride;
+ }
+ } else {
+ const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
+
+ for (int row = 0; row < ctxt->overlap; ++row) {
+ const uint8_t m0 = mask1d[row];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ for (int col = 0; col < op_mi_size * MI_SIZE; ++col) {
+ wsrc[col] = m1 * tmp16[col];
+ mask[col] = m0;
+ }
+ wsrc += bw;
+ mask += bw;
+ tmp16 += ctxt->tmp_stride;
+ }
+ }
+}
+
+static INLINE void calc_target_weighted_pred_left(
+ MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size,
+ int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) {
+ (void)nb_mi;
+ (void)num_planes;
+ (void)rel_mi_col;
+ (void)dir;
+
+ struct calc_target_weighted_pred_ctxt *ctxt =
+ (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
+
+ const int bw = xd->width << MI_SIZE_LOG2;
+ const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
+
+ int32_t *wsrc = ctxt->obmc_buffer->wsrc + (rel_mi_row * MI_SIZE * bw);
+ int32_t *mask = ctxt->obmc_buffer->mask + (rel_mi_row * MI_SIZE * bw);
+ const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
+ const int is_hbd = is_cur_buf_hbd(xd);
+
+ if (!is_hbd) {
+ for (int row = 0; row < op_mi_size * MI_SIZE; ++row) {
+ for (int col = 0; col < ctxt->overlap; ++col) {
+ const uint8_t m0 = mask1d[col];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+ (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+ mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+ }
+ wsrc += bw;
+ mask += bw;
+ tmp += ctxt->tmp_stride;
+ }
+ } else {
+ const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
+
+ for (int row = 0; row < op_mi_size * MI_SIZE; ++row) {
+ for (int col = 0; col < ctxt->overlap; ++col) {
+ const uint8_t m0 = mask1d[col];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+ (tmp16[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+ mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+ }
+ wsrc += bw;
+ mask += bw;
+ tmp16 += ctxt->tmp_stride;
+ }
+ }
+}
+
+// This function has a structure similar to av1_build_obmc_inter_prediction
+//
+// The OBMC predictor is computed as:
+//
+// PObmc(x,y) =
+// AOM_BLEND_A64(Mh(x),
+// AOM_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)),
+// PLeft(x, y))
+//
+// Scaling up by AOM_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate
+// rounding, this can be written as:
+//
+//  AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * PObmc(x,y) =
+// Mh(x) * Mv(y) * P(x,y) +
+// Mh(x) * Cv(y) * Pabove(x,y) +
+// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// Where :
+//
+// Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y)
+//  Ch(x) = AOM_BLEND_A64_MAX_ALPHA - Mh(x)
+//
+// This function computes 'wsrc' and 'mask' as:
+//
+// wsrc(x, y) =
+// AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * src(x, y) -
+//      Mh(x) * Cv(y) * Pabove(x,y) -
+// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// mask(x, y) = Mh(x) * Mv(y)
+//
+// These can then be used to efficiently approximate the error for any
+// predictor P in the context of the provided neighbouring predictors by
+// computing:
+//
+//   error(x, y) =
+//     (wsrc(x, y) - mask(x, y) * P(x, y)) / (AOM_BLEND_A64_MAX_ALPHA ** 2)
+//
+// (An illustrative sketch of this computation follows the function below.)
+//
+static AOM_INLINE void calc_target_weighted_pred(
+ const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd,
+ const uint8_t *above, int above_stride, const uint8_t *left,
+ int left_stride) {
+ const BLOCK_SIZE bsize = xd->mi[0]->bsize;
+ const int bw = xd->width << MI_SIZE_LOG2;
+ const int bh = xd->height << MI_SIZE_LOG2;
+ const OBMCBuffer *obmc_buffer = &x->obmc_buffer;
+ int32_t *mask_buf = obmc_buffer->mask;
+ int32_t *wsrc_buf = obmc_buffer->wsrc;
+
+ const int is_hbd = is_cur_buf_hbd(xd);
+ const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
+
+ // plane 0 should not be sub-sampled
+ assert(xd->plane[0].subsampling_x == 0);
+ assert(xd->plane[0].subsampling_y == 0);
+
+ av1_zero_array(wsrc_buf, bw * bh);
+ for (int i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA;
+
+ // handle above row
+ if (xd->up_available) {
+ const int overlap =
+ AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
+ struct calc_target_weighted_pred_ctxt ctxt = { obmc_buffer, above,
+ above_stride, overlap };
+ foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd,
+ max_neighbor_obmc[mi_size_wide_log2[bsize]],
+ calc_target_weighted_pred_above, &ctxt);
+ }
+
+ for (int i = 0; i < bw * bh; ++i) {
+ wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
+ mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
+ }
+
+ // handle left column
+ if (xd->left_available) {
+ const int overlap =
+ AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
+ struct calc_target_weighted_pred_ctxt ctxt = { obmc_buffer, left,
+ left_stride, overlap };
+ foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd,
+ max_neighbor_obmc[mi_size_high_log2[bsize]],
+ calc_target_weighted_pred_left, &ctxt);
+ }
+
+ if (!is_hbd) {
+ const uint8_t *src = x->plane[0].src.buf;
+
+ for (int row = 0; row < bh; ++row) {
+ for (int col = 0; col < bw; ++col) {
+ wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+ }
+ wsrc_buf += bw;
+ src += x->plane[0].src.stride;
+ }
+ } else {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+ for (int row = 0; row < bh; ++row) {
+ for (int col = 0; col < bw; ++col) {
+ wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+ }
+ wsrc_buf += bw;
+ src += x->plane[0].src.stride;
+ }
+ }
+}
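+
+// An illustrative, non-library sketch of how the 'wsrc' and 'mask' buffers
+// built above can approximate the OBMC prediction error for an arbitrary
+// low-bitdepth candidate predictor P, following the derivation in the
+// comment before calc_target_weighted_pred(). The common
+// AOM_BLEND_A64_MAX_ALPHA scale factors cancel when comparing candidates, so
+// the scaled residual can be accumulated directly. Kept under #if 0: it is
+// an editorial example, not part of libaom.
+#if 0
+static int64_t obmc_approx_sse(const int32_t *wsrc, const int32_t *mask,
+                               const uint8_t *pred, int pred_stride, int bw,
+                               int bh) {
+  int64_t sse = 0;
+  for (int row = 0; row < bh; ++row) {
+    for (int col = 0; col < bw; ++col) {
+      // Scaled residual: wsrc(x, y) - mask(x, y) * P(x, y).
+      const int64_t diff = (int64_t)wsrc[col] - (int64_t)mask[col] * pred[col];
+      sse += diff * diff;
+    }
+    wsrc += bw;
+    mask += bw;
+    pred += pred_stride;
+  }
+  return sse;  // Scaled relative to the true SSE by AOM_BLEND_A64_MAX_ALPHA ** 4.
+}
+#endif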
diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h
new file mode 100644
index 0000000000..efb797e5b5
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt.h
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_H_
+#define AOM_AV1_ENCODER_RDOPT_H_
+
+#include <stdbool.h>
+
+#include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/rdopt_utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define COMP_TYPE_RD_THRESH_SCALE 11
+#define COMP_TYPE_RD_THRESH_SHIFT 4
+#define MAX_WINNER_MOTION_MODES 10
+
+struct TileInfo;
+struct macroblock;
+struct RD_STATS;
+
+/*!\brief AV1 intra mode selection for intra frames.
+ *
+ * \ingroup intra_mode_search
+ * \callgraph
+ * Top level function for rd-based intra mode selection during intra frame
+ * encoding. This function will first search for the best luma prediction by
+ * calling av1_rd_pick_intra_sby_mode, then it searches for chroma prediction
+ * with av1_rd_pick_intra_sbuv_mode. If applicable, this function ends the
+ * search with an evaluation for intrabc.
+ *
+ * \param[in] cpi Top-level encoder structure.
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock.
+ * \param[in] rd_cost Struct to keep track of the RD information.
+ * \param[in] bsize Current block size.
+ * \param[in] ctx Structure to hold snapshot of coding context
+ during the mode picking process.
+ * \param[in] best_rd Best RD seen for this block so far.
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
+ struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+
+/*!\brief AV1 inter mode selection.
+ *
+ * \ingroup inter_mode_search
+ * \callgraph
+ * Top level function for inter mode selection. This function will loop over
+ * all possible inter modes and select the best one for the current block by
+ * computing the RD cost. The mode search and RD are computed in
+ * handle_inter_mode(), which is called from this function within the main
+ * loop.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ data/contexts/models for the tile during
+ encoding
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] rd_cost Struct to keep track of the RD information
+ * \param[in] bsize Current block size
+ * \param[in] ctx Structure to hold snapshot of coding context
+ during the mode picking process
+ * \param[in] best_rd_so_far Best RD seen for this block so far
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+ struct macroblock *x, struct RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
+
+/*!\brief AV1 intra mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * \callergraph
+ * Top level function for Non-RD optimized intra mode selection.
+ * This function will loop over a subset of intra modes and select the best one
+ * based on the calculated modelled RD cost. Only 4 intra modes are checked, as
+ * specified in \c intra_mode_list. When calculating the RD cost, a Hadamard
+ * transform of the residual is used to estimate the rate. Estimation of the RD
+ * cost is performed in \c av1_estimate_block_intra, which is called from this
+ * function.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] rd_cost Struct to keep track of the RD information
+ * \param[in] bsize Current block size
+ * \param[in] ctx Structure to hold snapshot of coding context
+ during the mode picking process
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_nonrd_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
+
+/*!\brief AV1 inter mode selection based on Non-RD optimized model.
+ *
+ * \ingroup nonrd_mode_search
+ * \callgraph
+ * Top level function for Non-RD optimized inter mode selection.
+ * This function will loop over a subset of inter modes and select the best one
+ * based on the calculated modelled RD cost. When deciding which modes to
+ * check, this function applies heuristics based on previously checked modes,
+ * block residual variance, block size, and other factors to prune certain
+ * modes and reference frames. Currently only single reference frame modes
+ * are checked. Additional heuristics are applied to decide if intra modes
+ * need to be checked.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] tile_data Pointer to struct holding adaptive
+ data/contexts/models for the tile during
+ encoding
+ * \param[in] x Pointer to structure holding all the data for
+ the current macroblock
+ * \param[in] rd_cost Struct to keep track of the RD information
+ * \param[in] bsize Current block size
+ * \param[in] ctx Structure to hold snapshot of coding context
+ during the mode picking process
+ *
+ * \remark Nothing is returned. Instead, the MB_MODE_INFO struct inside x
+ * is modified to store information about the best mode computed
+ * in this function. The rd_cost struct is also updated with the RD stats
+ * corresponding to the best mode found.
+ */
+void av1_nonrd_pick_inter_mode_sb(struct AV1_COMP *cpi,
+ struct TileDataEnc *tile_data,
+ struct macroblock *x,
+ struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx);
+
+void av1_rd_pick_inter_mode_sb_seg_skip(
+ const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+ struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
+
+static INLINE int coded_to_superres_mi(int mi_col, int denom) {
+ return (mi_col * denom + SCALE_NUMERATOR / 2) / SCALE_NUMERATOR;
+}
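+
+// A brief worked example (not part of the library). With SCALE_NUMERATOR == 8,
+// a superres denominator of 16 corresponds to a 2x horizontal upscale, and a
+// coded-resolution mi_col maps to the unscaled-frame mi_col with rounding:
+//   coded_to_superres_mi(10, 16) == (10 * 16 + 4) / 8 == 20
+//   coded_to_superres_mi(10, 8)  == (10 * 8 + 4) / 8  == 10  // no superres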
+
+static INLINE int av1_encoder_get_relative_dist(int a, int b) {
+ assert(a >= 0 && b >= 0);
+ return (a - b);
+}
+
+// This function returns the number of MIs in a superblock.
+static INLINE int av1_get_sb_mi_size(const AV1_COMMON *const cm) {
+ const int mi_alloc_size_1d = mi_size_wide[cm->mi_params.mi_alloc_bsize];
+ int sb_mi_rows =
+ (mi_size_wide[cm->seq_params->sb_size] + mi_alloc_size_1d - 1) /
+ mi_alloc_size_1d;
+ assert(mi_size_wide[cm->seq_params->sb_size] ==
+ mi_size_high[cm->seq_params->sb_size]);
+ int sb_mi_size = sb_mi_rows * sb_mi_rows;
+
+ return sb_mi_size;
+}
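+
+// A worked example (not part of the library). For a 128x128 superblock
+// (mi_size_wide == 32) with an mi_alloc_bsize of BLOCK_16X16
+// (mi_alloc_size_1d == 4), sb_mi_rows == (32 + 3) / 4 == 8 and the function
+// returns 8 * 8 == 64.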
+
+// This function prunes the mode if either of the reference frames falls in
+// the pruning list.
+static INLINE int prune_ref(const MV_REFERENCE_FRAME *const ref_frame,
+ const unsigned int *const ref_display_order_hint,
+ const unsigned int frame_display_order_hint,
+ const int *ref_frame_list) {
+ for (int i = 0; i < 2; i++) {
+ if (ref_frame_list[i] == NONE_FRAME) continue;
+
+ if (ref_frame[0] == ref_frame_list[i] ||
+ ref_frame[1] == ref_frame_list[i]) {
+ if (av1_encoder_get_relative_dist(
+ ref_display_order_hint[ref_frame_list[i] - LAST_FRAME],
+ frame_display_order_hint) < 0)
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static INLINE int has_closest_ref_frames(const MV_REFERENCE_FRAME *ref_frame,
+ int8_t closest_past_ref,
+ int8_t closest_future_ref) {
+ int has_closest_past_ref =
+ (ref_frame[0] == closest_past_ref) || (ref_frame[1] == closest_past_ref);
+ int has_closest_future_ref = (ref_frame[0] == closest_future_ref) ||
+ (ref_frame[1] == closest_future_ref);
+ return (has_closest_past_ref && has_closest_future_ref);
+}
+
+static INLINE int has_best_pred_mv_sad(const MV_REFERENCE_FRAME *ref_frame,
+ const MACROBLOCK *const x) {
+ int has_best_past_pred_mv_sad = 0;
+ int has_best_future_pred_mv_sad = 0;
+ if (x->best_pred_mv_sad[0] < INT_MAX && x->best_pred_mv_sad[1] < INT_MAX) {
+ has_best_past_pred_mv_sad =
+ (x->pred_mv_sad[ref_frame[0]] == x->best_pred_mv_sad[0]) ||
+ (x->pred_mv_sad[ref_frame[1]] == x->best_pred_mv_sad[0]);
+ has_best_future_pred_mv_sad =
+ (x->pred_mv_sad[ref_frame[0]] == x->best_pred_mv_sad[1]) ||
+ (x->pred_mv_sad[ref_frame[1]] == x->best_pred_mv_sad[1]);
+ }
+ return (has_best_past_pred_mv_sad && has_best_future_pred_mv_sad);
+}
+
+static INLINE int prune_ref_by_selective_ref_frame(
+ const AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const MV_REFERENCE_FRAME *const ref_frame,
+ const unsigned int *const ref_display_order_hint) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ if (!sf->inter_sf.selective_ref_frame) return 0;
+
+ const int comp_pred = ref_frame[1] > INTRA_FRAME;
+
+ if (sf->inter_sf.selective_ref_frame >= 2 ||
+ (sf->inter_sf.selective_ref_frame == 1 && comp_pred)) {
+ int ref_frame_list[2] = { LAST3_FRAME, LAST2_FRAME };
+
+ if (x != NULL) {
+ // Disable pruning if either tpl suggests that we keep the frame or
+ // the pred_mv gives us the best sad
+ if (x->tpl_keep_ref_frame[LAST3_FRAME] ||
+ x->pred_mv_sad[LAST3_FRAME] == x->best_pred_mv_sad[0]) {
+ ref_frame_list[0] = NONE_FRAME;
+ }
+ if (x->tpl_keep_ref_frame[LAST2_FRAME] ||
+ x->pred_mv_sad[LAST2_FRAME] == x->best_pred_mv_sad[0]) {
+ ref_frame_list[1] = NONE_FRAME;
+ }
+ }
+
+ if (prune_ref(ref_frame, ref_display_order_hint,
+ ref_display_order_hint[GOLDEN_FRAME - LAST_FRAME],
+ ref_frame_list))
+ return 1;
+ }
+
+ if (sf->inter_sf.selective_ref_frame >= 3) {
+ int ref_frame_list[2] = { ALTREF2_FRAME, BWDREF_FRAME };
+
+ if (x != NULL) {
+ // Disable pruning if either tpl suggests that we keep the frame or
+ // the pred_mv gives us the best sad
+ if (x->tpl_keep_ref_frame[ALTREF2_FRAME] ||
+ x->pred_mv_sad[ALTREF2_FRAME] == x->best_pred_mv_sad[0]) {
+ ref_frame_list[0] = NONE_FRAME;
+ }
+ if (x->tpl_keep_ref_frame[BWDREF_FRAME] ||
+ x->pred_mv_sad[BWDREF_FRAME] == x->best_pred_mv_sad[0]) {
+ ref_frame_list[1] = NONE_FRAME;
+ }
+ }
+
+ if (prune_ref(ref_frame, ref_display_order_hint,
+ ref_display_order_hint[LAST_FRAME - LAST_FRAME],
+ ref_frame_list))
+ return 1;
+ }
+
+ if (x != NULL && sf->inter_sf.prune_comp_ref_frames && comp_pred) {
+ int closest_ref_frames = has_closest_ref_frames(
+ ref_frame, cpi->ref_frame_dist_info.nearest_past_ref,
+ cpi->ref_frame_dist_info.nearest_future_ref);
+ if (closest_ref_frames == 0) {
+ // Prune reference frames which are not the closest to the current frame.
+ if (sf->inter_sf.prune_comp_ref_frames >= 2) {
+ return 1;
+ } else if (sf->inter_sf.prune_comp_ref_frames == 1) {
+        // Prune reference frames with non-minimum pred_mv_sad.
+ if (has_best_pred_mv_sad(ref_frame, x) == 0) return 1;
+ }
+ }
+ }
+
+ return 0;
+}
+
+// This function will copy the best reference mode information from
+// MB_MODE_INFO_EXT to MB_MODE_INFO_EXT_FRAME.
+static INLINE void av1_copy_mbmi_ext_to_mbmi_ext_frame(
+ MB_MODE_INFO_EXT_FRAME *mbmi_ext_best,
+ const MB_MODE_INFO_EXT *const mbmi_ext, uint8_t ref_frame_type) {
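+  // Note: the sizeof expressions below are unevaluated, and indexing the 2-D
+  // source arrays with a single subscript yields one row; each memcpy
+  // therefore copies a full per-ref-frame-type row of the source array.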
+ memcpy(mbmi_ext_best->ref_mv_stack, mbmi_ext->ref_mv_stack[ref_frame_type],
+ sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE]));
+ memcpy(mbmi_ext_best->weight, mbmi_ext->weight[ref_frame_type],
+ sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE]));
+ mbmi_ext_best->mode_context = mbmi_ext->mode_context[ref_frame_type];
+ mbmi_ext_best->ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+ memcpy(mbmi_ext_best->global_mvs, mbmi_ext->global_mvs,
+ sizeof(mbmi_ext->global_mvs));
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RDOPT_H_
diff --git a/third_party/aom/av1/encoder/rdopt_data_defs.h b/third_party/aom/av1/encoder/rdopt_data_defs.h
new file mode 100644
index 0000000000..ca7ef810f3
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt_data_defs.h
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_
+#define AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static const THR_MODES intra_to_mode_idx[INTRA_MODE_NUM] = {
+ THR_DC, // DC_PRED,
+ THR_V_PRED, // V_PRED,
+ THR_H_PRED, // H_PRED,
+ THR_D45_PRED, // D45_PRED,
+ THR_D135_PRED, // D135_PRED,
+ THR_D113_PRED, // D113_PRED,
+ THR_D157_PRED, // D157_PRED,
+ THR_D203_PRED, // D203_PRED,
+ THR_D67_PRED, // D67_PRED,
+ THR_SMOOTH, // SMOOTH_PRED,
+ THR_SMOOTH_V, // SMOOTH_V_PRED,
+ THR_SMOOTH_H, // SMOOTH_H_PRED,
+ THR_PAETH, // PAETH_PRED,
+};
+
+/* clang-format off */
+static const THR_MODES single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM]
+ [REF_FRAMES] = {
+ // NEARESTMV,
+ { THR_INVALID, THR_NEARESTMV, THR_NEARESTL2, THR_NEARESTL3,
+ THR_NEARESTG, THR_NEARESTB, THR_NEARESTA2, THR_NEARESTA, },
+ // NEARMV,
+ { THR_INVALID, THR_NEARMV, THR_NEARL2, THR_NEARL3,
+ THR_NEARG, THR_NEARB, THR_NEARA2, THR_NEARA, },
+ // GLOBALMV,
+ { THR_INVALID, THR_GLOBALMV, THR_GLOBALL2, THR_GLOBALL3,
+ THR_GLOBALG, THR_GLOBALB, THR_GLOBALA2, THR_GLOBALA, },
+ // NEWMV,
+ { THR_INVALID, THR_NEWMV, THR_NEWL2, THR_NEWL3,
+ THR_NEWG, THR_NEWB, THR_NEWA2, THR_NEWA, },
+};
+/* clang-format on */
+
+/* clang-format off */
+static const THR_MODES comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES]
+ [REF_FRAMES] = {
+ // NEAREST_NEARESTMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEAREST_NEARESTLL2, THR_COMP_NEAREST_NEARESTLL3,
+ THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTLB,
+ THR_COMP_NEAREST_NEARESTLA2, THR_COMP_NEAREST_NEARESTLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEARESTL2B,
+ THR_COMP_NEAREST_NEARESTL2A2, THR_COMP_NEAREST_NEARESTL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEARESTL3B,
+ THR_COMP_NEAREST_NEARESTL3A2, THR_COMP_NEAREST_NEARESTL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEARESTGB,
+ THR_COMP_NEAREST_NEARESTGA2, THR_COMP_NEAREST_NEARESTGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEARESTBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEAR_NEARMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEAR_NEARLL2, THR_COMP_NEAR_NEARLL3,
+ THR_COMP_NEAR_NEARLG, THR_COMP_NEAR_NEARLB,
+ THR_COMP_NEAR_NEARLA2, THR_COMP_NEAR_NEARLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEARL2B,
+ THR_COMP_NEAR_NEARL2A2, THR_COMP_NEAR_NEARL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEARL3B,
+ THR_COMP_NEAR_NEARL3A2, THR_COMP_NEAR_NEARL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEARGB,
+ THR_COMP_NEAR_NEARGA2, THR_COMP_NEAR_NEARGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEARBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEAREST_NEWMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEAREST_NEWLL2, THR_COMP_NEAREST_NEWLL3,
+ THR_COMP_NEAREST_NEWLG, THR_COMP_NEAREST_NEWLB,
+ THR_COMP_NEAREST_NEWLA2, THR_COMP_NEAREST_NEWLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEWL2B,
+ THR_COMP_NEAREST_NEWL2A2, THR_COMP_NEAREST_NEWL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEWL3B,
+ THR_COMP_NEAREST_NEWL3A2, THR_COMP_NEAREST_NEWL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEWGB,
+ THR_COMP_NEAREST_NEWGA2, THR_COMP_NEAREST_NEWGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAREST_NEWBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEW_NEARESTMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEW_NEARESTLL2, THR_COMP_NEW_NEARESTLL3,
+ THR_COMP_NEW_NEARESTLG, THR_COMP_NEW_NEARESTLB,
+ THR_COMP_NEW_NEARESTLA2, THR_COMP_NEW_NEARESTLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARESTL2B,
+ THR_COMP_NEW_NEARESTL2A2, THR_COMP_NEW_NEARESTL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARESTL3B,
+ THR_COMP_NEW_NEARESTL3A2, THR_COMP_NEW_NEARESTL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARESTGB,
+ THR_COMP_NEW_NEARESTGA2, THR_COMP_NEW_NEARESTGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARESTBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEAR_NEWMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEAR_NEWLL2, THR_COMP_NEAR_NEWLL3,
+ THR_COMP_NEAR_NEWLG, THR_COMP_NEAR_NEWLB,
+ THR_COMP_NEAR_NEWLA2, THR_COMP_NEAR_NEWLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEWL2B,
+ THR_COMP_NEAR_NEWL2A2, THR_COMP_NEAR_NEWL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEWL3B,
+ THR_COMP_NEAR_NEWL3A2, THR_COMP_NEAR_NEWL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEWGB,
+ THR_COMP_NEAR_NEWGA2, THR_COMP_NEAR_NEWGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEAR_NEWBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEW_NEARMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEW_NEARLL2, THR_COMP_NEW_NEARLL3,
+ THR_COMP_NEW_NEARLG, THR_COMP_NEW_NEARLB,
+ THR_COMP_NEW_NEARLA2, THR_COMP_NEW_NEARLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARL2B,
+ THR_COMP_NEW_NEARL2A2, THR_COMP_NEW_NEARL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARL3B,
+ THR_COMP_NEW_NEARL3A2, THR_COMP_NEW_NEARL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARGB,
+ THR_COMP_NEW_NEARGA2, THR_COMP_NEW_NEARGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEARBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // GLOBAL_GLOBALMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_GLOBAL_GLOBALLL2, THR_COMP_GLOBAL_GLOBALLL3,
+ THR_COMP_GLOBAL_GLOBALLG, THR_COMP_GLOBAL_GLOBALLB,
+ THR_COMP_GLOBAL_GLOBALLA2, THR_COMP_GLOBAL_GLOBALLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_GLOBAL_GLOBALL2B,
+ THR_COMP_GLOBAL_GLOBALL2A2, THR_COMP_GLOBAL_GLOBALL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_GLOBAL_GLOBALL3B,
+ THR_COMP_GLOBAL_GLOBALL3A2, THR_COMP_GLOBAL_GLOBALL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_GLOBAL_GLOBALGB,
+ THR_COMP_GLOBAL_GLOBALGA2, THR_COMP_GLOBAL_GLOBALGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_GLOBAL_GLOBALBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+ // NEW_NEWMV,
+ {
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID,
+ THR_COMP_NEW_NEWLL2, THR_COMP_NEW_NEWLL3,
+ THR_COMP_NEW_NEWLG, THR_COMP_NEW_NEWLB,
+ THR_COMP_NEW_NEWLA2, THR_COMP_NEW_NEWLA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEWL2B,
+ THR_COMP_NEW_NEWL2A2, THR_COMP_NEW_NEWL2A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEWL3B,
+ THR_COMP_NEW_NEWL3A2, THR_COMP_NEW_NEWL3A, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEWGB,
+ THR_COMP_NEW_NEWGA2, THR_COMP_NEW_NEWGA, },
+ { THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_COMP_NEW_NEWBA, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID,
+ THR_INVALID, THR_INVALID, THR_INVALID, },
+ },
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_
diff --git a/third_party/aom/av1/encoder/rdopt_utils.h b/third_party/aom/av1/encoder/rdopt_utils.h
new file mode 100644
index 0000000000..b6bc4927e3
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt_utils.h
@@ -0,0 +1,797 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_UTILS_H_
+#define AOM_AV1_ENCODER_RDOPT_UTILS_H_
+
+#include "aom/aom_integer.h"
+#include "av1/encoder/block.h"
+#include "av1/common/cfl.h"
+#include "av1/common/pred_common.h"
+#include "av1/encoder/rdopt_data_defs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_REF_MV_SEARCH 3
+#define MAX_TX_RD_GATE_LEVEL 5
+#define INTER_INTRA_RD_THRESH_SCALE 9
+#define INTER_INTRA_RD_THRESH_SHIFT 4
+
+typedef struct {
+ PREDICTION_MODE mode;
+ MV_REFERENCE_FRAME ref_frame[2];
+} MODE_DEFINITION;
+
+// This array defines the mapping from the enums in THR_MODES to the actual
+// prediction modes and reference frames.
+static const MODE_DEFINITION av1_mode_defs[MAX_MODES] = {
+ { NEARESTMV, { LAST_FRAME, NONE_FRAME } },
+ { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
+ { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { NEWMV, { LAST_FRAME, NONE_FRAME } },
+ { NEWMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEWMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
+ { NEWMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { NEARMV, { LAST_FRAME, NONE_FRAME } },
+ { NEARMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEARMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
+ { NEARMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { GLOBALMV, { LAST_FRAME, NONE_FRAME } },
+ { GLOBALMV, { LAST2_FRAME, NONE_FRAME } },
+ { GLOBALMV, { LAST3_FRAME, NONE_FRAME } },
+ { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } },
+ { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } },
+ { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  // TODO(zoeliu): May need to reconsider the order of the modes to check.
+
+ { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
+ { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } },
+
+ { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } },
+
+ { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } },
+
+ { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } },
+
+ { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+ // intra modes
+ { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { H_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { V_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D203_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D157_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D67_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D113_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
+};
+
+// Number of winner modes allowed for different values of the speed feature
+// multi_winner_mode_type.
+static const int winner_mode_count_allowed[MULTI_WINNER_MODE_LEVELS] = {
+ 1, // MULTI_WINNER_MODE_OFF
+ 2, // MULTI_WINNER_MODE_FAST
+ 3 // MULTI_WINNER_MODE_DEFAULT
+};
+
+static AOM_INLINE void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst,
+ const int num_planes) {
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].dst.buf = dst.plane[i];
+ xd->plane[i].dst.stride = dst.stride[i];
+ }
+}
+
+static AOM_INLINE void swap_dst_buf(MACROBLOCKD *xd,
+ const BUFFER_SET *dst_bufs[2],
+ int num_planes) {
+ const BUFFER_SET *buf0 = dst_bufs[0];
+ dst_bufs[0] = dst_bufs[1];
+ dst_bufs[1] = buf0;
+ restore_dst_buf(xd, *dst_bufs[0], num_planes);
+}
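+
+// A hypothetical usage sketch (not part of the library) of the two-buffer
+// ping-pong idiom above. dst_bufs[0] always names the buffer the MACROBLOCKD
+// currently writes into; dst_bufs[1] holds the best prediction so far.
+// 'tmp_buf' and 'orig_dst' are placeholder BUFFER_SETs; kept under #if 0.
+#if 0
+  const BUFFER_SET *dst_bufs[2] = { &tmp_buf, &orig_dst };
+  // A new best prediction now sits in dst_bufs[0]: keep it and point
+  // subsequent predictions at the other buffer.
+  swap_dst_buf(xd, dst_bufs, num_planes);
+#endif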
+
+/* clang-format on */
+// Calculate rd threshold based on ref best rd and relevant scaling factors
+static AOM_INLINE int64_t get_rd_thresh_from_best_rd(int64_t ref_best_rd,
+ int mul_factor,
+ int div_factor) {
+ int64_t rd_thresh = ref_best_rd;
+ if (div_factor != 0) {
+ rd_thresh = ref_best_rd < (div_factor * (INT64_MAX / mul_factor))
+ ? ((ref_best_rd / div_factor) * mul_factor)
+ : INT64_MAX;
+ }
+ return rd_thresh;
+}
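+
+// A brief worked example (not part of the library). With ref_best_rd == 1000,
+// mul_factor == 11 and div_factor == 16, the threshold is
+// (1000 / 16) * 11 == 682. The comparison against
+// div_factor * (INT64_MAX / mul_factor) guarantees the divide-then-multiply
+// cannot overflow; beyond that bound the threshold saturates to INT64_MAX.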
+
+static AOM_INLINE THR_MODES
+get_prediction_mode_idx(PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame,
+ MV_REFERENCE_FRAME second_ref_frame) {
+ if (this_mode < INTRA_MODE_END) {
+ assert(ref_frame == INTRA_FRAME);
+ assert(second_ref_frame == NONE_FRAME);
+ return intra_to_mode_idx[this_mode - INTRA_MODE_START];
+ }
+ if (this_mode >= SINGLE_INTER_MODE_START &&
+ this_mode < SINGLE_INTER_MODE_END) {
+ assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
+ return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START]
+ [ref_frame];
+ }
+ if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END &&
+ second_ref_frame != NONE_FRAME) {
+ assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
+ assert((second_ref_frame > INTRA_FRAME) &&
+ (second_ref_frame <= ALTREF_FRAME));
+ return comp_inter_to_mode_idx[this_mode - COMP_INTER_MODE_START][ref_frame]
+ [second_ref_frame];
+ }
+ assert(0);
+ return THR_INVALID;
+}
+
+static AOM_INLINE int inter_mode_data_block_idx(BLOCK_SIZE bsize) {
+ if (bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 ||
+ bsize == BLOCK_4X16 || bsize == BLOCK_16X4) {
+ return -1;
+ }
+ return 1;
+}
+
+// Get the transform block's dimensions, with the visible width/height cropped
+// to the frame boundary (derived from the MI units).
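+// For example, a 16x16 transform block whose bottom 8 rows lie below the
+// frame border reports *height = 16 but *visible_height = 8.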
+static AOM_INLINE void get_txb_dimensions(const MACROBLOCKD *xd, int plane,
+ BLOCK_SIZE plane_bsize, int blk_row,
+ int blk_col, BLOCK_SIZE tx_bsize,
+ int *width, int *height,
+ int *visible_width,
+ int *visible_height) {
+ assert(tx_bsize <= plane_bsize);
+ const int txb_height = block_size_high[tx_bsize];
+ const int txb_width = block_size_wide[tx_bsize];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ // TODO(aconverse@google.com): Investigate using crop_width/height here rather
+ // than the MI size
+ if (xd->mb_to_bottom_edge >= 0) {
+ *visible_height = txb_height;
+ } else {
+ const int block_height = block_size_high[plane_bsize];
+ const int block_rows =
+ (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + block_height;
+ *visible_height =
+ clamp(block_rows - (blk_row << MI_SIZE_LOG2), 0, txb_height);
+ }
+ if (height) *height = txb_height;
+
+ if (xd->mb_to_right_edge >= 0) {
+ *visible_width = txb_width;
+ } else {
+ const int block_width = block_size_wide[plane_bsize];
+ const int block_cols =
+ (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + block_width;
+ *visible_width =
+ clamp(block_cols - (blk_col << MI_SIZE_LOG2), 0, txb_width);
+ }
+ if (width) *width = txb_width;
+}
+
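+// Number of 4x4 units covered by the block; for example, BLOCK_16X16 yields
+// 1 << (8 - 4) = 16.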
+static AOM_INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) {
+ int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * MI_SIZE_LOG2);
+ return num_blk;
+}
+
+static INLINE int check_txfm_eval(MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int64_t best_skip_rd, int64_t skip_rd,
+ int level, int is_luma_only) {
+ int eval_txfm = 1;
+  // Derive the aggressiveness factor for gating the transform search.
+  // A lower value indicates more aggressiveness. Be more conservative (high
+  // value) for (i) low quantizers and (ii) regions where prediction is poor.
+ const int scale[MAX_TX_RD_GATE_LEVEL + 1] = { INT_MAX, 4, 3, 2, 2, 1 };
+ const int qslope = 2 * (!is_luma_only);
+ const int level_to_qindex_map[MAX_TX_RD_GATE_LEVEL + 1] = { 0, 0, 0,
+ 80, 100, 140 };
+ int aggr_factor = 4;
+ assert(level <= MAX_TX_RD_GATE_LEVEL);
+ const int pred_qindex_thresh = level_to_qindex_map[level];
+ if (!is_luma_only && level <= 2) {
+ aggr_factor = 4 * AOMMAX(1, ROUND_POWER_OF_TWO((MAXQ - x->qindex) * qslope,
+ QINDEX_BITS));
+ }
+ if ((best_skip_rd >
+ (x->source_variance << (num_pels_log2_lookup[bsize] + RDDIV_BITS))) &&
+ (x->qindex >= pred_qindex_thresh))
+ aggr_factor *= scale[level];
+  // For level setting 1, be more conservative for the non-luma-only case even
+  // when prediction is good.
+ else if ((level <= 1) && !is_luma_only)
+ aggr_factor = (aggr_factor >> 2) * 6;
+
+  // Be more conservative for luma-only cases (called from compound type rd),
+  // since best_skip_rd is computed after the interpolation filter search while
+  // skip_rd is computed before it (with 8-bit prediction signals blended for
+  // WEDGE/DIFFWTD rather than 16-bit).
+ const int luma_mul[MAX_TX_RD_GATE_LEVEL + 1] = {
+ INT_MAX, 32, 29, 17, 17, 17
+ };
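+  // For example, on the luma-only path at level 1 (mul_factor = 32):
+  // rd_thresh = best_skip_rd * 4 * 32 >> 6 = 2 * best_skip_rd when the
+  // variance/qindex gate does not fire, and 8 * best_skip_rd when it does
+  // (aggr_factor scaled by scale[1] = 4).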
+ int mul_factor = is_luma_only ? luma_mul[level] : 16;
+ int64_t rd_thresh =
+ (best_skip_rd == INT64_MAX)
+ ? best_skip_rd
+ : (int64_t)(best_skip_rd * aggr_factor * mul_factor >> 6);
+ if (skip_rd > rd_thresh) eval_txfm = 0;
+ return eval_txfm;
+}
+
+static TX_MODE select_tx_mode(
+ const AV1_COMMON *cm, const TX_SIZE_SEARCH_METHOD tx_size_search_method) {
+ if (cm->features.coded_lossless) return ONLY_4X4;
+ if (tx_size_search_method == USE_LARGESTALL) {
+ return TX_MODE_LARGEST;
+ } else {
+ assert(tx_size_search_method == USE_FULL_RD ||
+ tx_size_search_method == USE_FAST_RD);
+ return TX_MODE_SELECT;
+ }
+}
+
+// Checks the conditions to disable winner mode processing
+static INLINE int bypass_winner_mode_processing(const MACROBLOCK *const x,
+ const SPEED_FEATURES *sf,
+ int use_txfm_skip,
+ int actual_txfm_skip,
+ PREDICTION_MODE best_mode) {
+ const int prune_winner_mode_eval_level =
+ sf->winner_mode_sf.prune_winner_mode_eval_level;
+
+  // Disable winner mode processing for blocks with low source variance.
+  // The aggressiveness of this pruning logic reduces as qindex increases.
+  // The threshold decreases linearly from 64 (at qindex 0) to 17 (at qindex
+  // 255).
+ if (prune_winner_mode_eval_level == 1) {
+ const unsigned int src_var_thresh = 64 - 48 * x->qindex / (MAXQ + 1);
+ if (x->source_variance < src_var_thresh) return 1;
+ } else if (prune_winner_mode_eval_level == 2) {
+    // Skip winner mode processing of blocks whose transform turns out to be
+    // skipped based on the eob alone, except for NEWMV mode.
+ if (!have_newmv_in_inter_mode(best_mode) && actual_txfm_skip) return 1;
+ } else if (prune_winner_mode_eval_level == 3) {
+    // Skip winner mode processing of blocks whose transform turns out to be
+    // skipped, except for NEWMV mode; the skip criterion depends on the
+    // quantizer.
+    // At high quantizers: take the conservative approach of considering
+    // transform skip based on the eob alone.
+    // At low quantizers: consider transform skip based on either the eob or
+    // the RD cost evaluation.
+ const int is_txfm_skip =
+ x->qindex > 127 ? actual_txfm_skip : actual_txfm_skip || use_txfm_skip;
+
+ if (!have_newmv_in_inter_mode(best_mode) && is_txfm_skip) return 1;
+ } else if (prune_winner_mode_eval_level >= 4) {
+ // Do not skip winner mode evaluation at low quantizers if normal mode's
+ // transform search was too aggressive.
+ if (sf->rd_sf.perform_coeff_opt >= 5 && x->qindex <= 70) return 0;
+
+ if (use_txfm_skip || actual_txfm_skip) return 1;
+ }
+
+ return 0;
+}
+
+// Checks the conditions to enable winner mode processing
+static INLINE int is_winner_mode_processing_enabled(const struct AV1_COMP *cpi,
+ const MACROBLOCK *const x,
+ MB_MODE_INFO *const mbmi,
+ int actual_txfm_skip) {
+ const SPEED_FEATURES *sf = &cpi->sf;
+ const PREDICTION_MODE best_mode = mbmi->mode;
+
+ if (bypass_winner_mode_processing(x, sf, mbmi->skip_txfm, actual_txfm_skip,
+ best_mode))
+ return 0;
+
+  // TODO(any): Move block-independent condition checks to the frame level.
+ if (is_inter_block(mbmi)) {
+ if (is_inter_mode(best_mode) &&
+ (sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != INT_MAX) &&
+ !cpi->oxcf.txfm_cfg.use_inter_dct_only)
+ return 1;
+ } else {
+ if (sf->tx_sf.tx_type_search.fast_intra_tx_type_search &&
+ !cpi->oxcf.txfm_cfg.use_intra_default_tx_only &&
+ !cpi->oxcf.txfm_cfg.use_intra_dct_only)
+ return 1;
+ }
+
+ // Check speed feature related to winner mode processing
+ if (sf->winner_mode_sf.enable_winner_mode_for_coeff_opt &&
+ cpi->optimize_seg_arr[mbmi->segment_id] != NO_TRELLIS_OPT &&
+ cpi->optimize_seg_arr[mbmi->segment_id] != FINAL_PASS_TRELLIS_OPT)
+ return 1;
+ if (sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch) return 1;
+
+ return 0;
+}
+
+static INLINE void set_tx_size_search_method(
+ const AV1_COMMON *cm, const WinnerModeParams *winner_mode_params,
+ TxfmSearchParams *txfm_params, int enable_winner_mode_for_tx_size_srch,
+ int is_winner_mode) {
+ // Populate transform size search method/transform mode appropriately
+ txfm_params->tx_size_search_method =
+ winner_mode_params->tx_size_search_methods[DEFAULT_EVAL];
+ if (enable_winner_mode_for_tx_size_srch) {
+ if (is_winner_mode)
+ txfm_params->tx_size_search_method =
+ winner_mode_params->tx_size_search_methods[WINNER_MODE_EVAL];
+ else
+ txfm_params->tx_size_search_method =
+ winner_mode_params->tx_size_search_methods[MODE_EVAL];
+ }
+ txfm_params->tx_mode_search_type =
+ select_tx_mode(cm, txfm_params->tx_size_search_method);
+}
+
+static INLINE void set_tx_type_prune(const SPEED_FEATURES *sf,
+ TxfmSearchParams *txfm_params,
+ int winner_mode_tx_type_pruning,
+ int is_winner_mode) {
+  // Populate the transform type pruning mode appropriately.
+ txfm_params->prune_2d_txfm_mode = sf->tx_sf.tx_type_search.prune_2d_txfm_mode;
+ if (!winner_mode_tx_type_pruning) return;
+
+ const int prune_mode[4][2] = { { TX_TYPE_PRUNE_3, TX_TYPE_PRUNE_0 },
+ { TX_TYPE_PRUNE_4, TX_TYPE_PRUNE_0 },
+ { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_2 },
+ { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_3 } };
+ txfm_params->prune_2d_txfm_mode =
+ prune_mode[winner_mode_tx_type_pruning - 1][is_winner_mode];
+}
+
+static INLINE void set_tx_domain_dist_params(
+ const WinnerModeParams *winner_mode_params, TxfmSearchParams *txfm_params,
+ int enable_winner_mode_for_tx_domain_dist, int is_winner_mode) {
+ if (txfm_params->use_qm_dist_metric) {
+ // QM-weighted PSNR is computed in transform space, so we need to forcibly
+ // enable the use of tx domain distortion.
+ txfm_params->use_transform_domain_distortion = 1;
+ txfm_params->tx_domain_dist_threshold = 0;
+ return;
+ }
+
+ if (!enable_winner_mode_for_tx_domain_dist) {
+ txfm_params->use_transform_domain_distortion =
+ winner_mode_params->use_transform_domain_distortion[DEFAULT_EVAL];
+ txfm_params->tx_domain_dist_threshold =
+ winner_mode_params->tx_domain_dist_threshold[DEFAULT_EVAL];
+ return;
+ }
+
+ if (is_winner_mode) {
+ txfm_params->use_transform_domain_distortion =
+ winner_mode_params->use_transform_domain_distortion[WINNER_MODE_EVAL];
+ txfm_params->tx_domain_dist_threshold =
+ winner_mode_params->tx_domain_dist_threshold[WINNER_MODE_EVAL];
+ } else {
+ txfm_params->use_transform_domain_distortion =
+ winner_mode_params->use_transform_domain_distortion[MODE_EVAL];
+ txfm_params->tx_domain_dist_threshold =
+ winner_mode_params->tx_domain_dist_threshold[MODE_EVAL];
+ }
+}
+
+// This function sets mode parameters for different mode evaluation stages
+static INLINE void set_mode_eval_params(const struct AV1_COMP *cpi,
+ MACROBLOCK *x,
+ MODE_EVAL_TYPE mode_eval_type) {
+ const AV1_COMMON *cm = &cpi->common;
+ const SPEED_FEATURES *sf = &cpi->sf;
+ const WinnerModeParams *winner_mode_params = &cpi->winner_mode_params;
+ TxfmSearchParams *txfm_params = &x->txfm_search_params;
+
+ txfm_params->use_qm_dist_metric =
+ cpi->oxcf.tune_cfg.dist_metric == AOM_DIST_METRIC_QM_PSNR;
+
+ switch (mode_eval_type) {
+ case DEFAULT_EVAL:
+ txfm_params->default_inter_tx_type_prob_thresh = INT_MAX;
+ txfm_params->use_default_intra_tx_type = 0;
+ txfm_params->skip_txfm_level =
+ winner_mode_params->skip_txfm_level[DEFAULT_EVAL];
+ txfm_params->predict_dc_level =
+ winner_mode_params->predict_dc_level[DEFAULT_EVAL];
+ // Set default transform domain distortion type
+ set_tx_domain_dist_params(winner_mode_params, txfm_params, 0, 0);
+
+ // Get default threshold for R-D optimization of coefficients
+ get_rd_opt_coeff_thresh(winner_mode_params->coeff_opt_thresholds,
+ txfm_params, 0, 0);
+
+ // Set default transform size search method
+ set_tx_size_search_method(cm, winner_mode_params, txfm_params, 0, 0);
+ // Set default transform type prune
+ set_tx_type_prune(sf, txfm_params, 0, 0);
+ break;
+ case MODE_EVAL:
+ txfm_params->use_default_intra_tx_type =
+ (cpi->sf.tx_sf.tx_type_search.fast_intra_tx_type_search ||
+ cpi->oxcf.txfm_cfg.use_intra_default_tx_only);
+ txfm_params->default_inter_tx_type_prob_thresh =
+ cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh;
+ txfm_params->skip_txfm_level =
+ winner_mode_params->skip_txfm_level[MODE_EVAL];
+ txfm_params->predict_dc_level =
+ winner_mode_params->predict_dc_level[MODE_EVAL];
+ // Set transform domain distortion type for mode evaluation
+ set_tx_domain_dist_params(
+ winner_mode_params, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 0);
+
+ // Get threshold for R-D optimization of coefficients during mode
+ // evaluation
+ get_rd_opt_coeff_thresh(
+ winner_mode_params->coeff_opt_thresholds, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 0);
+
+ // Set the transform size search method for mode evaluation
+ set_tx_size_search_method(
+ cm, winner_mode_params, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 0);
+ // Set transform type prune for mode evaluation
+ set_tx_type_prune(sf, txfm_params,
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning,
+ 0);
+ break;
+ case WINNER_MODE_EVAL:
+ txfm_params->default_inter_tx_type_prob_thresh = INT_MAX;
+ txfm_params->use_default_intra_tx_type = 0;
+ txfm_params->skip_txfm_level =
+ winner_mode_params->skip_txfm_level[WINNER_MODE_EVAL];
+ txfm_params->predict_dc_level =
+ winner_mode_params->predict_dc_level[WINNER_MODE_EVAL];
+
+ // Set transform domain distortion type for winner mode evaluation
+ set_tx_domain_dist_params(
+ winner_mode_params, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 1);
+
+ // Get threshold for R-D optimization of coefficients for winner mode
+ // evaluation
+ get_rd_opt_coeff_thresh(
+ winner_mode_params->coeff_opt_thresholds, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 1);
+
+ // Set the transform size search method for winner mode evaluation
+ set_tx_size_search_method(
+ cm, winner_mode_params, txfm_params,
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
+ // Set default transform type prune mode for winner mode evaluation
+ set_tx_type_prune(sf, txfm_params,
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning,
+ 1);
+ break;
+ default: assert(0);
+ }
+
+  // An rd record collected at a specific mode evaluation stage cannot be used
+  // across other evaluation stages, as the transform parameters are different.
+  // Hence, reset the mb rd record whenever the mode evaluation stage changes.
+ if (txfm_params->mode_eval_type != mode_eval_type)
+ reset_mb_rd_record(x->txfm_search_info.mb_rd_record);
+
+ txfm_params->mode_eval_type = mode_eval_type;
+}
+
+// Similar to store_cfl_required(), but for use during the RDO process,
+// where we haven't yet determined whether this block uses CfL.
+static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm,
+ const MACROBLOCK *x) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+
+  if (cm->seq_params->monochrome || x->skip_chroma_rd) return CFL_DISALLOWED;
+
+ if (!xd->is_chroma_ref) {
+ // For non-chroma-reference blocks, we should always store the luma pixels,
+ // in case the corresponding chroma-reference block uses CfL.
+ // Note that this can only happen for block sizes which are <8 on
+ // their shortest side, as otherwise they would be chroma reference
+ // blocks.
+ return CFL_ALLOWED;
+ }
+
+ // For chroma reference blocks, we should store data in the encoder iff we're
+ // allowed to try out CfL.
+ return is_cfl_allowed(xd);
+}
+
+static AOM_INLINE void init_sbuv_mode(MB_MODE_INFO *const mbmi) {
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+}
+
+// Store best mode stats for winner mode processing
+static INLINE void store_winner_mode_stats(
+ const AV1_COMMON *const cm, MACROBLOCK *x, const MB_MODE_INFO *mbmi,
+ RD_STATS *rd_cost, RD_STATS *rd_cost_y, RD_STATS *rd_cost_uv,
+ THR_MODES mode_index, uint8_t *color_map, BLOCK_SIZE bsize, int64_t this_rd,
+ int multi_winner_mode_type, int txfm_search_done) {
+ WinnerModeStats *winner_mode_stats = x->winner_mode_stats;
+ int mode_idx = 0;
+ int is_palette_mode = mbmi->palette_mode_info.palette_size[PLANE_TYPE_Y] > 0;
+  // Mode stats are not required when multi-winner mode processing is disabled.
+  if (multi_winner_mode_type == MULTI_WINNER_MODE_OFF) return;
+  // Ignore a mode whose rd cost is the maximum (INT64_MAX).
+  if (this_rd == INT64_MAX) return;
+  // TODO(any): Winner mode processing is currently not applicable for palette
+  // mode in inter frames. Clean up the following code once support is added.
+  if (!frame_is_intra_only(cm) && is_palette_mode) return;
+
+ int max_winner_mode_count = winner_mode_count_allowed[multi_winner_mode_type];
+ assert(x->winner_mode_count >= 0 &&
+ x->winner_mode_count <= max_winner_mode_count);
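+  // The stats array is kept sorted by rd in ascending order: a new candidate
+  // is inserted at its sorted position, and when the array is full the
+  // worst (largest-rd) entry falls off the end.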
+
+ if (x->winner_mode_count) {
+ // Find the mode which has higher rd cost than this_rd
+ for (mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++)
+ if (winner_mode_stats[mode_idx].rd > this_rd) break;
+
+ if (mode_idx == max_winner_mode_count) {
+ // No mode has higher rd cost than this_rd
+ return;
+ } else if (mode_idx < max_winner_mode_count - 1) {
+ // Create a slot for current mode and move others to the next slot
+ memmove(
+ &winner_mode_stats[mode_idx + 1], &winner_mode_stats[mode_idx],
+ (max_winner_mode_count - mode_idx - 1) * sizeof(*winner_mode_stats));
+ }
+ }
+ // Add a mode stat for winner mode processing
+ winner_mode_stats[mode_idx].mbmi = *mbmi;
+ winner_mode_stats[mode_idx].rd = this_rd;
+ winner_mode_stats[mode_idx].mode_index = mode_index;
+
+ // Update rd stats required for inter frame
+ if (!frame_is_intra_only(cm) && rd_cost && rd_cost_y && rd_cost_uv) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int is_intra_mode = av1_mode_defs[mode_index].mode < INTRA_MODE_END;
+ const int skip_txfm = mbmi->skip_txfm && !is_intra_mode;
+
+ winner_mode_stats[mode_idx].rd_cost = *rd_cost;
+ if (txfm_search_done) {
+ winner_mode_stats[mode_idx].rate_y =
+ rd_cost_y->rate +
+ x->mode_costs
+ .skip_txfm_cost[skip_ctx][rd_cost->skip_txfm || skip_txfm];
+ winner_mode_stats[mode_idx].rate_uv = rd_cost_uv->rate;
+ }
+ }
+
+ if (color_map) {
+ // Store color_index_map for palette mode
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ int block_width, block_height;
+ av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width,
+ &block_height, NULL, NULL);
+ memcpy(winner_mode_stats[mode_idx].color_index_map, color_map,
+ block_width * block_height * sizeof(color_map[0]));
+ }
+
+ x->winner_mode_count =
+ AOMMIN(x->winner_mode_count + 1, max_winner_mode_count);
+}
+
+unsigned int av1_get_perpixel_variance(const AV1_COMP *cpi,
+ const MACROBLOCKD *xd,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bsize, int plane,
+ int use_hbd);
+
+unsigned int av1_get_perpixel_variance_facade(const struct AV1_COMP *cpi,
+ const MACROBLOCKD *xd,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bsize, int plane);
+
+static INLINE int is_mode_intra(PREDICTION_MODE mode) {
+ return mode < INTRA_MODE_END;
+}
+
+// This function copies the usable entries (ref_mv_stack[ref_frame][4] and
+// weight[ref_frame][4]) out of the full ref_mv_stack[ref_frame][8] and
+// weight[ref_frame][8] arrays in xd.
+static INLINE void av1_copy_usable_ref_mv_stack_and_weight(
+ const MACROBLOCKD *xd, MB_MODE_INFO_EXT *const mbmi_ext,
+ MV_REFERENCE_FRAME ref_frame) {
+ memcpy(mbmi_ext->weight[ref_frame], xd->weight[ref_frame],
+ USABLE_REF_MV_STACK_SIZE * sizeof(xd->weight[0][0]));
+ memcpy(mbmi_ext->ref_mv_stack[ref_frame], xd->ref_mv_stack[ref_frame],
+ USABLE_REF_MV_STACK_SIZE * sizeof(xd->ref_mv_stack[0][0]));
+}
+
+// Get transform rd gate level for the given transform search case.
+static INLINE int get_txfm_rd_gate_level(
+ const int is_masked_compound_enabled,
+ const int txfm_rd_gate_level[TX_SEARCH_CASES], BLOCK_SIZE bsize,
+ TX_SEARCH_CASE tx_search_case, int eval_motion_mode) {
+ assert(tx_search_case < TX_SEARCH_CASES);
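+  // The motion-mode case applies only to blocks with more than 256 pixels
+  // (num_pels_log2_lookup[bsize] > 8), and only when motion modes are not
+  // being explicitly evaluated.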
+ if (tx_search_case == TX_SEARCH_MOTION_MODE && !eval_motion_mode &&
+ num_pels_log2_lookup[bsize] > 8)
+ return txfm_rd_gate_level[TX_SEARCH_MOTION_MODE];
+ // Enable aggressive gating of transform search only when masked compound type
+ // is enabled.
+ else if (tx_search_case == TX_SEARCH_COMP_TYPE_MODE &&
+ is_masked_compound_enabled)
+ return txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE];
+
+ return txfm_rd_gate_level[TX_SEARCH_DEFAULT];
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RDOPT_UTILS_H_
diff --git a/third_party/aom/av1/encoder/reconinter_enc.c b/third_party/aom/av1/encoder/reconinter_enc.c
new file mode 100644
index 0000000000..9b964113a5
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.c
@@ -0,0 +1,701 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/reconinter_enc.h"
+
+static AOM_INLINE void enc_calc_subpel_params(
+ const MV *const src_mv, InterPredParams *const inter_pred_params,
+ uint8_t **pre, SubpelParams *subpel_params, int *src_stride) {
+ struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf;
+ init_subpel_params(src_mv, inter_pred_params, subpel_params, pre_buf->width,
+ pre_buf->height);
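+  // pos_x/pos_y are in units of 1/1024 pel (SCALE_SUBPEL_BITS == 10), so
+  // shifting down by SCALE_SUBPEL_BITS yields the integer-pel anchor of the
+  // prediction within the reference buffer.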
+ *pre = pre_buf->buf0 +
+ (subpel_params->pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (subpel_params->pos_x >> SCALE_SUBPEL_BITS);
+ *src_stride = pre_buf->stride;
+}
+
+#define IS_DEC 0
+#include "av1/common/reconinter_template.inc"
+#undef IS_DEC
+
+void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride,
+ const MV *src_mv,
+ InterPredParams *inter_pred_params) {
+ build_one_inter_predictor(dst, dst_stride, src_mv, inter_pred_params);
+}
+
+static void enc_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int plane, const MB_MODE_INFO *mi,
+ int bw, int bh, int mi_x, int mi_y) {
+ build_inter_predictors(cm, xd, plane, mi, /*build_for_obmc=*/0, bw, bh, mi_x,
+ mi_y);
+}
+
+void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col) {
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ InterPredParams inter_pred_params;
+
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf;
+ const MV mv = xd->mi[0]->mv[0].as_mv;
+ const struct scale_factors *const sf = xd->block_ref_scale_factors[0];
+
+ av1_init_inter_params(&inter_pred_params, pd->width, pd->height, mi_y, mi_x,
+ pd->subsampling_x, pd->subsampling_y, xd->bd,
+ is_cur_buf_hbd(xd), false, sf, pd->pre,
+ xd->mi[0]->interp_filters);
+
+ inter_pred_params.conv_params = get_conv_params_no_round(
+ 0, AOM_PLANE_Y, xd->tmp_conv_dst, MAX_SB_SIZE, false, xd->bd);
+
+ inter_pred_params.conv_params.use_dist_wtd_comp_avg = 0;
+ av1_enc_build_one_inter_predictor(dst, dst_buf->stride, &mv,
+ &inter_pred_params);
+}
+
+void av1_enc_build_inter_predictor_y_nonrd(MACROBLOCKD *xd,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params) {
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *pre_buf = &pd->pre[0];
+ const uint8_t *src =
+ pre_buf->buf0 +
+ (subpel_params->pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (subpel_params->pos_x >> SCALE_SUBPEL_BITS);
+ uint8_t *const dst = dst_buf->buf;
+ int src_stride = pre_buf->stride;
+ int dst_stride = dst_buf->stride;
+ inter_pred_params->ref_frame_buf = *pre_buf;
+
+ // Initialize interp filter for single reference mode.
+ init_interp_filter_params(inter_pred_params->interp_filter_params,
+ &mbmi->interp_filters.as_filters, pd->width,
+ pd->height, /*is_intrabc=*/0);
+
+ av1_make_inter_predictor(src, src_stride, dst, dst_stride, inter_pred_params,
+ subpel_params);
+}
+
+void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ const BUFFER_SET *ctx, BLOCK_SIZE bsize,
+ int plane_from, int plane_to) {
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ enc_build_inter_predictors(cm, xd, plane, xd->mi[0], xd->plane[plane].width,
+ xd->plane[plane].height, mi_x, mi_y);
+
+ if (is_interintra_pred(xd->mi[0])) {
+ BUFFER_SET default_ctx = {
+ { xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
+ { xd->plane[0].dst.stride, xd->plane[1].dst.stride,
+ xd->plane[2].dst.stride }
+ };
+ if (!ctx) {
+ ctx = &default_ctx;
+ }
+ av1_build_interintra_predictor(cm, xd, xd->plane[plane].dst.buf,
+ xd->plane[plane].dst.stride, ctx, plane,
+ bsize);
+ }
+ }
+}
+
+static void setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset,
+ int mi_col_offset, MB_MODE_INFO *ref_mbmi,
+ struct build_prediction_ctxt *ctxt,
+ const int num_planes) {
+ const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->bsize);
+ const int ref_mi_row = xd->mi_row + mi_row_offset;
+ const int ref_mi_col = xd->mi_col + mi_col_offset;
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane],
+ ctxt->tmp_width[plane], ctxt->tmp_height[plane],
+ ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset,
+ NULL, pd->subsampling_x, pd->subsampling_y);
+ }
+
+ const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0];
+
+ const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(ctxt->cm, frame);
+
+ xd->block_ref_scale_factors[0] = sf;
+ if (!av1_is_valid_scale(sf))
+ aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reference frame has invalid dimensions");
+
+ av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf,
+ num_planes);
+}
+
+static INLINE void build_obmc_prediction(MACROBLOCKD *xd, int rel_mi_row,
+ int rel_mi_col, uint8_t op_mi_size,
+ int dir, MB_MODE_INFO *above_mbmi,
+ void *fun_ctxt, const int num_planes) {
+ struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+ setup_address_for_obmc(xd, rel_mi_row, rel_mi_col, above_mbmi, ctxt,
+ num_planes);
+
+ const int mi_x = (xd->mi_col + rel_mi_col) << MI_SIZE_LOG2;
+ const int mi_y = (xd->mi_row + rel_mi_row) << MI_SIZE_LOG2;
+
+ const BLOCK_SIZE bsize = xd->mi[0]->bsize;
+
+ InterPredParams inter_pred_params;
+
+ for (int j = 0; j < num_planes; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ int bw = 0, bh = 0;
+
+ if (dir) {
+ // prepare left reference block size
+ bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
+ block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
+ bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y;
+ } else {
+ // prepare above reference block size
+ bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x;
+ bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
+ block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
+ }
+
+ if (av1_skip_u4x4_pred_in_obmc(bsize, pd, dir)) continue;
+
+ const struct buf_2d *const pre_buf = &pd->pre[0];
+ const MV mv = above_mbmi->mv[0].as_mv;
+
+ av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x,
+ pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0,
+ xd->block_ref_scale_factors[0], pre_buf,
+ above_mbmi->interp_filters);
+ inter_pred_params.conv_params = get_conv_params(0, j, xd->bd);
+
+ av1_enc_build_one_inter_predictor(pd->dst.buf, pd->dst.stride, &mv,
+ &inter_pred_params);
+ }
+}
+
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
+ if (!xd->up_available) return;
+ struct build_prediction_ctxt ctxt = {
+ cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_right_edge, NULL
+ };
+ BLOCK_SIZE bsize = xd->mi[0]->bsize;
+ foreach_overlappable_nb_above(cm, xd,
+ max_neighbor_obmc[mi_size_wide_log2[bsize]],
+ build_obmc_prediction, &ctxt);
+}
+
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
+ if (!xd->left_available) return;
+ struct build_prediction_ctxt ctxt = {
+ cm, tmp_buf, tmp_width, tmp_height, tmp_stride, xd->mb_to_bottom_edge, NULL
+ };
+ BLOCK_SIZE bsize = xd->mi[0]->bsize;
+ foreach_overlappable_nb_left(cm, xd,
+ max_neighbor_obmc[mi_size_high_log2[bsize]],
+ build_obmc_prediction, &ctxt);
+}
+
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd) {
+ const int num_planes = av1_num_planes(cm);
+ uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
+ int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+ av1_setup_obmc_dst_bufs(xd, dst_buf1, dst_buf2);
+
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ av1_build_prediction_by_above_preds(cm, xd, dst_buf1, dst_width1, dst_height1,
+ dst_stride1);
+ av1_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2,
+ dst_stride2);
+ av1_setup_dst_planes(xd->plane, xd->mi[0]->bsize, &cm->cur_frame->buf, mi_row,
+ mi_col, 0, num_planes);
+ av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2,
+ dst_stride2);
+}
+
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref,
+ uint8_t *ext_dst[], int ext_dst_stride[]) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ WarpTypesAllowed warp_types;
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+ warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype);
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+
+ InterPredParams inter_pred_params;
+
+ av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x,
+ pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0,
+ xd->block_ref_scale_factors[ref], &pd->pre[ref],
+ mi->interp_filters);
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi);
+
+ uint8_t *const dst = get_buf_by_bd(xd, ext_dst[plane]);
+ const MV mv = mi->mv[ref].as_mv;
+
+ av1_enc_build_one_inter_predictor(dst, ext_dst_stride[plane], &mv,
+ &inter_pred_params);
+ }
+}
+
+static void build_masked_compound(
+ uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+ int w) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+ const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
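+  // subw/subh are 1 iff w/h are half the nominal luma block dimensions, i.e.
+  // the plane is subsampled in that direction; aom_blend_a64_mask then
+  // downsamples the luma-resolution mask to match.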
+ const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+ aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, block_size_wide[sb_type], w, h, subw, subh);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void build_masked_compound_highbd(
+ uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
+ const uint8_t *src1_8, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+ int w, int bd) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+ const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
+ const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+ aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, block_size_wide[sb_type], w, h,
+ subw, subh, bd);
+}
+#endif
+
+static void build_wedge_inter_predictor_from_buf(
+ MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0,
+ int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_compound = has_second_ref(mbmi);
+ MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+ mbmi->interinter_comp.seg_mask = xd->seg_mask;
+ const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
+ const int is_hbd = is_cur_buf_hbd(xd);
+
+ if (is_compound && is_masked_compound_type(comp_data->type)) {
+ if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_hbd) {
+ av1_build_compound_diffwtd_mask_highbd(
+ comp_data->seg_mask, comp_data->mask_type,
+ CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
+ } else {
+ av1_build_compound_diffwtd_mask(
+ comp_data->seg_mask, comp_data->mask_type, ext_dst0,
+ ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
+ }
+#else
+ (void)is_hbd;
+ av1_build_compound_diffwtd_mask(comp_data->seg_mask, comp_data->mask_type,
+ ext_dst0, ext_dst_stride0, ext_dst1,
+ ext_dst_stride1, h, w);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ }
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_hbd) {
+ build_masked_compound_highbd(
+ dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data, mbmi->bsize,
+ h, w, xd->bd);
+ } else {
+ build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
+ ext_dst1, ext_dst_stride1, comp_data, mbmi->bsize,
+ h, w);
+ }
+#else
+ build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
+ ext_dst1, ext_dst_stride1, comp_data, mbmi->bsize, h,
+ w);
+#endif
+ } else {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_hbd) {
+ aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_SHORTPTR(dst), dst_buf->stride, w, h);
+ } else {
+ aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h);
+ }
+#else
+ aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, w, h);
+#endif
+ }
+}
+
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane_from, int plane_to,
+ uint8_t *ext_dst0[],
+ int ext_dst_stride0[],
+ uint8_t *ext_dst1[],
+ int ext_dst_stride1[]) {
+ int plane;
+ assert(bsize < BLOCK_SIZES_ALL);
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(
+ bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ build_wedge_inter_predictor_from_buf(
+ xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane],
+ ext_dst1[plane], ext_dst_stride1[plane]);
+ }
+}
+
+// Get pred block from up-sampled reference.
+void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, int width, int height,
+ int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, int subpel_search) {
+  // xd is expected to be NULL only in tests.
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+ int plane = 0;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+ InterPredParams inter_pred_params;
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ const int_interpfilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ av1_init_inter_params(
+ &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+ av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+ &inter_pred_params);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter = av1_get_filter(subpel_search);
+
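+  // Three cases follow: integer-pel motion needs only a row copy; pure-
+  // horizontal or pure-vertical subpel needs a single convolve pass; mixed
+  // subpel does a horizontal pass into a temp buffer then a vertical pass.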
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ for (int i = 0; i < height; i++) {
+ memcpy(comp_pred, ref, width * sizeof(*comp_pred));
+ comp_pred += width;
+ ref += ref_stride;
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
+ -1, width, height);
+ } else if (!subpel_x_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
+ 16, width, height);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t,
+ temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+ assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+ aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
+ ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
+ width, intermediate_height);
+ aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
+ MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
+ width, height);
+ }
+}
+
+void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, int subpel_search) {
+ int i, j;
+
+ aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
+ }
+ comp_pred += width;
+ pred += width;
+ }
+}
+
+void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask,
+ int subpel_search) {
+ if (subpel_x_q3 | subpel_y_q3) {
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+ ref = comp_pred;
+ ref_stride = width;
+ }
+ aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
+ mask_stride, invert_mask);
+}
+
+void aom_dist_wtd_comp_avg_upsampled_pred_c(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
+ int i, j;
+ const int fwd_offset = jcp_param->fwd_offset;
+ const int bck_offset = jcp_param->bck_offset;
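+  // fwd_offset and bck_offset sum to 1 << DIST_PRECISION_BITS (i.e. 16), so
+  // the loop below computes a distance-weighted average of the two
+  // predictions.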
+
+ aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
+ tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+ comp_pred[j] = (uint8_t)tmp;
+ }
+ comp_pred += width;
+ pred += width;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
+ const struct AV1Common *const cm, int mi_row,
+ int mi_col, const MV *const mv,
+ uint8_t *comp_pred8, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref8, int ref_stride, int bd,
+ int subpel_search) {
+  // xd is expected to be NULL only in tests.
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+ int plane = 0;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+ InterPredParams inter_pred_params;
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ const int_interpfilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ av1_init_inter_params(
+ &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+ av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+ &inter_pred_params);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter = av1_get_filter(subpel_search);
+
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ for (int i = 0; i < height; i++) {
+ memcpy(comp_pred, ref, width * sizeof(*comp_pred));
+ comp_pred += width;
+ ref += ref_stride;
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel,
+ 16, NULL, -1, width, height, bd);
+ } else if (!subpel_x_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1,
+ kernel, 16, width, height, bd);
+ } else {
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+    // The bound must match the number of rows in the temp buffer above.
+    assert(intermediate_height <= (MAX_SB_SIZE + 16) + 16);
+ aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1),
+ ref_stride, CONVERT_TO_BYTEPTR(temp),
+ MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+ intermediate_height, bd);
+ aom_highbd_convolve8_vert_c(
+ CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
+ MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
+ bd);
+ }
+}
+
+void aom_highbd_comp_avg_upsampled_pred_c(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, int subpel_search) {
+ int i, j;
+
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+ bd, subpel_search);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);
+ }
+ comp_pred += width;
+ pred += width;
+ }
+}
+
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+ int subpel_search) {
+ int i, j;
+ const int fwd_offset = jcp_param->fwd_offset;
+ const int bck_offset = jcp_param->bck_offset;
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8,
+ ref_stride, bd, subpel_search);
+
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
+ tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+ comp_pred[j] = (uint16_t)tmp;
+ }
+ comp_pred += width;
+ pred += width;
+ }
+}
+
+void aom_highbd_comp_mask_upsampled_pred(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+ int bd, int subpel_search) {
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+ bd, subpel_search);
+ aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width,
+ mask, mask_stride, invert_mask);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/reconinter_enc.h b/third_party/aom/av1/encoder/reconinter_enc.h
new file mode 100644
index 0000000000..16932f37a0
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RECONINTER_ENC_H_
+#define AOM_AV1_ENCODER_RECONINTER_ENC_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/warped_motion.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask,
+ int subpel_search);
+
+void aom_highbd_comp_mask_upsampled_pred(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+ int bd, int subpel_search);
+
+// Build single or compound reference inter predictors for all planes.
+// Can also build inter-intra predictors, masked predictors, etc.
+void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ const BUFFER_SET *ctx, BLOCK_SIZE bsize,
+ int plane_from, int plane_to);
+
+void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col);
+
+void av1_enc_build_inter_predictor_y_nonrd(MACROBLOCKD *xd,
+ InterPredParams *inter_pred_params,
+ const SubpelParams *subpel_params);
+
+// Build one inter predictor. It is called to build the predictor for the
+// single reference case, or for just the 1st or 2nd reference in the compound
+// reference case. Can build both regular and masked predictors.
+void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride,
+ const MV *src_mv,
+ InterPredParams *inter_pred_params);
+
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd);
+
+// |ext_dst*| are indexed from |plane_from| to |plane_to| inclusive.
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref,
+ uint8_t *ext_dst[], int ext_dst_stride[]);
+
+// |ext_dst*| are indexed from |plane_from| to |plane_to| inclusive.
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane_from, int plane_to,
+ uint8_t *ext_dst0[],
+ int ext_dst_stride0[],
+ uint8_t *ext_dst1[],
+ int ext_dst_stride1[]);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RECONINTER_ENC_H_
diff --git a/third_party/aom/av1/encoder/saliency_map.c b/third_party/aom/av1/encoder/saliency_map.c
new file mode 100644
index 0000000000..30019bbec0
--- /dev/null
+++ b/third_party/aom/av1/encoder/saliency_map.c
@@ -0,0 +1,1414 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <float.h>
+#include <string.h>
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/saliency_map.h"
+
+// The Gabor filter is generated with the following parameters:
+// ksize = 9
+// sigma = 1
+// theta = y * np.pi / 4, where y is in {0, 1, 2, 3}, i.e., 0, 45, 90, and 135
+// degrees
+// lambda1 = 1
+// gamma = 0.8
+// phi = 0
+static const double kGaborFilter[4][9][9] = { // [angle: 0, 45, 90, 135
+ // degree][ksize][ksize]
+ { { 2.0047323e-06, 6.6387620e-05, 8.0876675e-04, 3.6246411e-03, 5.9760227e-03,
+ 3.6246411e-03, 8.0876675e-04, 6.6387620e-05, 2.0047323e-06 },
+ { 1.8831115e-05, 6.2360091e-04, 7.5970138e-03, 3.4047455e-02, 5.6134764e-02,
+ 3.4047455e-02, 7.5970138e-03, 6.2360091e-04, 1.8831115e-05 },
+ { 9.3271126e-05, 3.0887155e-03, 3.7628256e-02, 1.6863814e-01, 2.7803731e-01,
+ 1.6863814e-01, 3.7628256e-02, 3.0887155e-03, 9.3271126e-05 },
+ { 2.4359586e-04, 8.0667874e-03, 9.8273583e-02, 4.4043165e-01, 7.2614902e-01,
+ 4.4043165e-01, 9.8273583e-02, 8.0667874e-03, 2.4359586e-04 },
+ { 3.3546262e-04, 1.1108996e-02, 1.3533528e-01, 6.0653067e-01, 1.0000000e+00,
+ 6.0653067e-01, 1.3533528e-01, 1.1108996e-02, 3.3546262e-04 },
+ { 2.4359586e-04, 8.0667874e-03, 9.8273583e-02, 4.4043165e-01, 7.2614902e-01,
+ 4.4043165e-01, 9.8273583e-02, 8.0667874e-03, 2.4359586e-04 },
+ { 9.3271126e-05, 3.0887155e-03, 3.7628256e-02, 1.6863814e-01, 2.7803731e-01,
+ 1.6863814e-01, 3.7628256e-02, 3.0887155e-03, 9.3271126e-05 },
+ { 1.8831115e-05, 6.2360091e-04, 7.5970138e-03, 3.4047455e-02, 5.6134764e-02,
+ 3.4047455e-02, 7.5970138e-03, 6.2360091e-04, 1.8831115e-05 },
+ { 2.0047323e-06, 6.6387620e-05, 8.0876675e-04, 3.6246411e-03, 5.9760227e-03,
+ 3.6246411e-03, 8.0876675e-04, 6.6387620e-05, 2.0047323e-06 } },
+
+ { { -6.2165498e-08, 3.8760313e-06, 3.0079011e-06, -4.4602581e-04,
+ 6.6981313e-04, 1.3962291e-03, -9.9486928e-04, -8.1631159e-05,
+ 3.5712848e-05 },
+ { 3.8760313e-06, 5.7044272e-06, -1.6041942e-03, 4.5687673e-03,
+ 1.8061366e-02, -2.4406660e-02, -3.7979286e-03, 3.1511115e-03,
+ -8.1631159e-05 },
+ { 3.0079011e-06, -1.6041942e-03, 8.6645801e-03, 6.4960226e-02,
+ -1.6647682e-01, -4.9129307e-02, 7.7304743e-02, -3.7979286e-03,
+ -9.9486928e-04 },
+ { -4.4602581e-04, 4.5687673e-03, 6.4960226e-02, -3.1572008e-01,
+ -1.7670043e-01, 5.2729243e-01, -4.9129307e-02, -2.4406660e-02,
+ 1.3962291e-03 },
+ { 6.6981313e-04, 1.8061366e-02, -1.6647682e-01, -1.7670043e-01,
+ 1.0000000e+00, -1.7670043e-01, -1.6647682e-01, 1.8061366e-02,
+ 6.6981313e-04 },
+ { 1.3962291e-03, -2.4406660e-02, -4.9129307e-02, 5.2729243e-01,
+ -1.7670043e-01, -3.1572008e-01, 6.4960226e-02, 4.5687673e-03,
+ -4.4602581e-04 },
+ { -9.9486928e-04, -3.7979286e-03, 7.7304743e-02, -4.9129307e-02,
+ -1.6647682e-01, 6.4960226e-02, 8.6645801e-03, -1.6041942e-03,
+ 3.0079011e-06 },
+ { -8.1631159e-05, 3.1511115e-03, -3.7979286e-03, -2.4406660e-02,
+ 1.8061366e-02, 4.5687673e-03, -1.6041942e-03, 5.7044272e-06,
+ 3.8760313e-06 },
+ { 3.5712848e-05, -8.1631159e-05, -9.9486928e-04, 1.3962291e-03,
+ 6.6981313e-04, -4.4602581e-04, 3.0079011e-06, 3.8760313e-06,
+ -6.2165498e-08 } },
+
+ { { 2.0047323e-06, 1.8831115e-05, 9.3271126e-05, 2.4359586e-04, 3.3546262e-04,
+ 2.4359586e-04, 9.3271126e-05, 1.8831115e-05, 2.0047323e-06 },
+ { 6.6387620e-05, 6.2360091e-04, 3.0887155e-03, 8.0667874e-03, 1.1108996e-02,
+ 8.0667874e-03, 3.0887155e-03, 6.2360091e-04, 6.6387620e-05 },
+ { 8.0876675e-04, 7.5970138e-03, 3.7628256e-02, 9.8273583e-02, 1.3533528e-01,
+ 9.8273583e-02, 3.7628256e-02, 7.5970138e-03, 8.0876675e-04 },
+ { 3.6246411e-03, 3.4047455e-02, 1.6863814e-01, 4.4043165e-01, 6.0653067e-01,
+ 4.4043165e-01, 1.6863814e-01, 3.4047455e-02, 3.6246411e-03 },
+ { 5.9760227e-03, 5.6134764e-02, 2.7803731e-01, 7.2614902e-01, 1.0000000e+00,
+ 7.2614902e-01, 2.7803731e-01, 5.6134764e-02, 5.9760227e-03 },
+ { 3.6246411e-03, 3.4047455e-02, 1.6863814e-01, 4.4043165e-01, 6.0653067e-01,
+ 4.4043165e-01, 1.6863814e-01, 3.4047455e-02, 3.6246411e-03 },
+ { 8.0876675e-04, 7.5970138e-03, 3.7628256e-02, 9.8273583e-02, 1.3533528e-01,
+ 9.8273583e-02, 3.7628256e-02, 7.5970138e-03, 8.0876675e-04 },
+ { 6.6387620e-05, 6.2360091e-04, 3.0887155e-03, 8.0667874e-03, 1.1108996e-02,
+ 8.0667874e-03, 3.0887155e-03, 6.2360091e-04, 6.6387620e-05 },
+ { 2.0047323e-06, 1.8831115e-05, 9.3271126e-05, 2.4359586e-04, 3.3546262e-04,
+ 2.4359586e-04, 9.3271126e-05, 1.8831115e-05, 2.0047323e-06 } },
+
+ { { 3.5712848e-05, -8.1631159e-05, -9.9486928e-04, 1.3962291e-03,
+ 6.6981313e-04, -4.4602581e-04, 3.0079011e-06, 3.8760313e-06,
+ -6.2165498e-08 },
+ { -8.1631159e-05, 3.1511115e-03, -3.7979286e-03, -2.4406660e-02,
+ 1.8061366e-02, 4.5687673e-03, -1.6041942e-03, 5.7044272e-06,
+ 3.8760313e-06 },
+ { -9.9486928e-04, -3.7979286e-03, 7.7304743e-02, -4.9129307e-02,
+ -1.6647682e-01, 6.4960226e-02, 8.6645801e-03, -1.6041942e-03,
+ 3.0079011e-06 },
+ { 1.3962291e-03, -2.4406660e-02, -4.9129307e-02, 5.2729243e-01,
+ -1.7670043e-01, -3.1572008e-01, 6.4960226e-02, 4.5687673e-03,
+ -4.4602581e-04 },
+ { 6.6981313e-04, 1.8061366e-02, -1.6647682e-01, -1.7670043e-01,
+ 1.0000000e+00, -1.7670043e-01, -1.6647682e-01, 1.8061366e-02,
+ 6.6981313e-04 },
+ { -4.4602581e-04, 4.5687673e-03, 6.4960226e-02, -3.1572008e-01,
+ -1.7670043e-01, 5.2729243e-01, -4.9129307e-02, -2.4406660e-02,
+ 1.3962291e-03 },
+ { 3.0079011e-06, -1.6041942e-03, 8.6645801e-03, 6.4960226e-02,
+ -1.6647682e-01, -4.9129307e-02, 7.7304743e-02, -3.7979286e-03,
+ -9.9486928e-04 },
+ { 3.8760313e-06, 5.7044272e-06, -1.6041942e-03, 4.5687673e-03,
+ 1.8061366e-02, -2.4406660e-02, -3.7979286e-03, 3.1511115e-03,
+ -8.1631159e-05 },
+ { -6.2165498e-08, 3.8760313e-06, 3.0079011e-06, -4.4602581e-04,
+ 6.6981313e-04, 1.3962291e-03, -9.9486928e-04, -8.1631159e-05,
+ 3.5712848e-05 } }
+};
+
+// This function extracts the red/green/blue channels and calculates the
+// intensity = (r+g+b)/3. Note that it only handles the 8-bit case for now.
+// TODO(linzhen): add high bitdepth support.
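+// The conversion below is approximately the standard analog YUV-to-RGB
+// transform (e.g. R = clamp(Y + 1.370 * (V - 128))); the final divide by 256
+// normalizes each channel and the intensity to [0, 1).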
+static void get_color_intensity(const YV12_BUFFER_CONFIG *src,
+ int subsampling_x, int subsampling_y,
+ double *cr, double *cg, double *cb,
+ double *intensity) {
+ const uint8_t *y = src->buffers[0];
+ const uint8_t *u = src->buffers[1];
+ const uint8_t *v = src->buffers[2];
+
+ const int y_height = src->crop_heights[0];
+ const int y_width = src->crop_widths[0];
+ const int y_stride = src->strides[0];
+ const int c_stride = src->strides[1];
+
+ for (int i = 0; i < y_height; ++i) {
+ for (int j = 0; j < y_width; ++j) {
+ cr[i * y_width + j] =
+ fclamp((double)y[i * y_stride + j] +
+ 1.370 * (double)(v[(i >> subsampling_y) * c_stride +
+ (j >> subsampling_x)] -
+ 128),
+ 0, 255);
+ cg[i * y_width + j] =
+ fclamp((double)y[i * y_stride + j] -
+ 0.698 * (double)(u[(i >> subsampling_y) * c_stride +
+ (j >> subsampling_x)] -
+ 128) -
+ 0.337 * (double)(v[(i >> subsampling_y) * c_stride +
+ (j >> subsampling_x)] -
+ 128),
+ 0, 255);
+ cb[i * y_width + j] =
+ fclamp((double)y[i * y_stride + j] +
+ 1.732 * (double)(u[(i >> subsampling_y) * c_stride +
+ (j >> subsampling_x)] -
+ 128),
+ 0, 255);
+
+ intensity[i * y_width + j] =
+ (cr[i * y_width + j] + cg[i * y_width + j] + cb[i * y_width + j]) /
+ 3.0;
+ assert(intensity[i * y_width + j] >= 0 &&
+ intensity[i * y_width + j] <= 255);
+
+ intensity[i * y_width + j] /= 256;
+ cr[i * y_width + j] /= 256;
+ cg[i * y_width + j] /= 256;
+ cb[i * y_width + j] /= 256;
+ }
+ }
+}
+
+static INLINE double convolve_map(const double *filter, const double *map,
+ const int size) {
+ double result = 0;
+ for (int i = 0; i < size; ++i) {
+ result += filter[i] * map[i]; // symmetric filter is used
+ }
+ return result;
+}
+
+// This function decimates the map by a factor of two and applies a Gaussian
+// filter on top of the downsampled map.
+static INLINE void decimate_map(const double *map, int height, int width,
+ int stride, double *downsampled_map) {
+ const int new_width = width / 2;
+ const int window_size = 5;
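+  // The 5x5 Gaussian kernel below is the separable outer product of the
+  // binomial filter [1, 4, 6, 4, 1] / 16 with itself.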
+ const double gaussian_filter[25] = {
+ 1. / 256, 1.0 / 64, 3. / 128, 1. / 64, 1. / 256, 1. / 64, 1. / 16,
+ 3. / 32, 1. / 16, 1. / 64, 3. / 128, 3. / 32, 9. / 64, 3. / 32,
+ 3. / 128, 1. / 64, 1. / 16, 3. / 32, 1. / 16, 1. / 64, 1. / 256,
+ 1. / 64, 3. / 128, 1. / 64, 1. / 256
+ };
+
+ double map_region[25];
+ for (int y = 0; y < height - 1; y += 2) {
+ for (int x = 0; x < width - 1; x += 2) {
+ int i = 0;
+ for (int yy = y - window_size / 2; yy <= y + window_size / 2; ++yy) {
+ for (int xx = x - window_size / 2; xx <= x + window_size / 2; ++xx) {
+ int yvalue = clamp(yy, 0, height - 1);
+ int xvalue = clamp(xx, 0, width - 1);
+ map_region[i++] = map[yvalue * stride + xvalue];
+ }
+ }
+ downsampled_map[(y / 2) * new_width + (x / 2)] =
+ convolve_map(gaussian_filter, map_region, window_size * window_size);
+ }
+ }
+}
+
+// This function upscales the map from in_level size to out_level size. Note
+// that the map at "level-1" is 2x the size of the map at "level".
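+// For example, upscaling from level 2 to level 0 doubles the width and height
+// twice, using nearest-neighbor replication.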
+static INLINE int upscale_map(const double *input, int in_level, int out_level,
+ int height[9], int width[9], double *output) {
+ for (int level = in_level; level > out_level; level--) {
+ const int cur_width = width[level];
+ const int cur_height = height[level];
+ const int cur_stride = width[level];
+
+ double *original = (level == in_level) ? (double *)input : output;
+
+ assert(level > 0);
+
+ const int h_upscale = height[level - 1];
+ const int w_upscale = width[level - 1];
+ const int s_upscale = width[level - 1];
+
+ double *upscale = aom_malloc(h_upscale * w_upscale * sizeof(*upscale));
+
+ if (!upscale) {
+ return 0;
+ }
+
+ for (int i = 0; i < h_upscale; ++i) {
+ for (int j = 0; j < w_upscale; ++j) {
+ const int ii = clamp((i >> 1), 0, cur_height - 1);
+ const int jj = clamp((j >> 1), 0, cur_width - 1);
+ upscale[j + i * s_upscale] = (double)original[jj + ii * cur_stride];
+ }
+ }
+ memcpy(output, upscale, h_upscale * w_upscale * sizeof(double));
+ aom_free(upscale);
+ }
+
+ return 1;
+}
+
+// This function calculates the differences between a fine scale c and a
+// coarser scale s, yielding the feature maps. c \in {2, 3, 4}, and s = c +
+// delta, where delta \in {3, 4}.
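+// The six output maps are indexed j = 0..5 with (c, s) = (2,5), (2,6), (3,6),
+// (3,7), (4,7), (4,8).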
+static int center_surround_diff(const double *input[9], int height[9],
+ int width[9], saliency_feature_map *output[6]) {
+ int j = 0;
+ for (int k = 2; k < 5; ++k) {
+ int cur_height = height[k];
+ int cur_width = width[k];
+
+ if (upscale_map(input[k + 3], k + 3, k, height, width, output[j]->buf) ==
+ 0) {
+ return 0;
+ }
+
+ for (int r = 0; r < cur_height; ++r) {
+ for (int c = 0; c < cur_width; ++c) {
+ output[j]->buf[r * cur_width + c] =
+ fabs((double)(input[k][r * cur_width + c] -
+ output[j]->buf[r * cur_width + c]));
+ }
+ }
+
+ if (upscale_map(input[k + 4], k + 4, k, height, width,
+ output[j + 1]->buf) == 0) {
+ return 0;
+ }
+
+ for (int r = 0; r < cur_height; ++r) {
+ for (int c = 0; c < cur_width; ++c) {
+ output[j + 1]->buf[r * cur_width + c] =
+ fabs(input[k][r * cur_width + c] -
+ output[j + 1]->buf[r * cur_width + c]);
+ }
+ }
+
+ j += 2;
+ }
+ return 1;
+}
+
+// For color channels, the difference is calculated based on "color
+// double-opponency". For example, the RG feature map is constructed between a
+// fine scale c of the R-G component and a coarser scale s of the G-R
+// component.
+static int center_surround_diff_rgb(const double *input_1[9],
+ const double *input_2[9], int height[9],
+ int width[9],
+ saliency_feature_map *output[6]) {
+ int j = 0;
+ for (int k = 2; k < 5; ++k) {
+ int cur_height = height[k];
+ int cur_width = width[k];
+
+ if (upscale_map(input_2[k + 3], k + 3, k, height, width, output[j]->buf) ==
+ 0) {
+ return 0;
+ }
+
+ for (int r = 0; r < cur_height; ++r) {
+ for (int c = 0; c < cur_width; ++c) {
+ output[j]->buf[r * cur_width + c] =
+ fabs((double)(input_1[k][r * cur_width + c] -
+ output[j]->buf[r * cur_width + c]));
+ }
+ }
+
+ if (upscale_map(input_2[k + 4], k + 4, k, height, width,
+ output[j + 1]->buf) == 0) {
+ return 0;
+ }
+
+ for (int r = 0; r < cur_height; ++r) {
+ for (int c = 0; c < cur_width; ++c) {
+ output[j + 1]->buf[r * cur_width + c] =
+ fabs(input_1[k][r * cur_width + c] -
+ output[j + 1]->buf[r * cur_width + c]);
+ }
+ }
+
+ j += 2;
+ }
+ return 1;
+}
+
+// This function generates Gaussian pyramid images with indices from 0 to 8,
+// and constructs the feature maps by calculating the center-surround
+// differences.
+static int gaussian_pyramid(const double *src, int width[9], int height[9],
+ saliency_feature_map *dst[6]) {
+ double *gaussian_map[9]; // scale = 9
+ gaussian_map[0] =
+ (double *)aom_malloc(width[0] * height[0] * sizeof(*gaussian_map[0]));
+ if (!gaussian_map[0]) {
+ return 0;
+ }
+
+ memcpy(gaussian_map[0], src, width[0] * height[0] * sizeof(double));
+
+ for (int i = 1; i < 9; ++i) {
+ int stride = width[i - 1];
+ int new_width = width[i];
+ int new_height = height[i];
+
+ gaussian_map[i] =
+ (double *)aom_malloc(new_width * new_height * sizeof(*gaussian_map[i]));
+
+ if (!gaussian_map[i]) {
+ for (int l = 0; l < i; ++l) {
+ aom_free(gaussian_map[l]);
+ }
+ return 0;
+ }
+
+ memset(gaussian_map[i], 0, new_width * new_height * sizeof(double));
+
+ decimate_map(gaussian_map[i - 1], height[i - 1], width[i - 1], stride,
+ gaussian_map[i]);
+ }
+
+ if (center_surround_diff((const double **)gaussian_map, height, width, dst) ==
+ 0) {
+ for (int l = 0; l < 9; ++l) {
+ aom_free(gaussian_map[l]);
+ }
+ return 0;
+ }
+
+ for (int i = 0; i < 9; ++i) {
+ aom_free(gaussian_map[i]);
+ }
+ return 1;
+}
+
+static int gaussian_pyramid_rgb(double *src_1, double *src_2, int width[9],
+ int height[9], saliency_feature_map *dst[6]) {
+ double *gaussian_map[2][9]; // scale = 9
+ double *src[2];
+
+ src[0] = src_1;
+ src[1] = src_2;
+
+ for (int k = 0; k < 2; ++k) {
+ gaussian_map[k][0] = (double *)aom_malloc(width[0] * height[0] *
+ sizeof(*gaussian_map[k][0]));
+    if (!gaussian_map[k][0]) {
+      // Free all pyramid levels allocated for earlier channels.
+      for (int l = 0; l < k; ++l) {
+        for (int m = 0; m < 9; ++m) {
+          aom_free(gaussian_map[l][m]);
+        }
+      }
+      return 0;
+    }
+ memcpy(gaussian_map[k][0], src[k], width[0] * height[0] * sizeof(double));
+
+ for (int i = 1; i < 9; ++i) {
+ int stride = width[i - 1];
+ int new_width = width[i];
+ int new_height = height[i];
+
+ gaussian_map[k][i] = (double *)aom_malloc(new_width * new_height *
+ sizeof(*gaussian_map[k][i]));
+      if (!gaussian_map[k][i]) {
+        // Free every map allocated so far.
+        for (int l = 0; l < k; ++l) {
+          for (int m = 0; m < 9; ++m) {
+            aom_free(gaussian_map[l][m]);
+          }
+        }
+        for (int m = 0; m < i; ++m) {
+          aom_free(gaussian_map[k][m]);
+        }
+        return 0;
+      }
+ memset(gaussian_map[k][i], 0, new_width * new_height * sizeof(double));
+ decimate_map(gaussian_map[k][i - 1], height[i - 1], width[i - 1], stride,
+ gaussian_map[k][i]);
+ }
+ }
+
+ if (center_surround_diff_rgb((const double **)gaussian_map[0],
+ (const double **)gaussian_map[1], height, width,
+ dst) == 0) {
+ for (int l = 0; l < 2; ++l) {
+ for (int i = 0; i < 9; ++i) {
+ aom_free(gaussian_map[l][i]);
+ }
+ }
+ return 0;
+ }
+
+ for (int l = 0; l < 2; ++l) {
+ for (int i = 0; i < 9; ++i) {
+ aom_free(gaussian_map[l][i]);
+ }
+ }
+ return 1;
+}
+
+static int get_feature_map_intensity(double *intensity, int width[9],
+                                     int height[9],
+                                     saliency_feature_map *i_map[6]) {
+  return gaussian_pyramid(intensity, width, height, i_map);
+}
+
+static int get_feature_map_rgb(double *cr, double *cg, double *cb, int width[9],
+ int height[9], saliency_feature_map *rg_map[6],
+ saliency_feature_map *by_map[6]) {
+ double *rg_mat = aom_malloc(height[0] * width[0] * sizeof(*rg_mat));
+ double *by_mat = aom_malloc(height[0] * width[0] * sizeof(*by_mat));
+ double *gr_mat = aom_malloc(height[0] * width[0] * sizeof(*gr_mat));
+ double *yb_mat = aom_malloc(height[0] * width[0] * sizeof(*yb_mat));
+
+ if (!rg_mat || !by_mat || !gr_mat || !yb_mat) {
+ aom_free(rg_mat);
+ aom_free(by_mat);
+ aom_free(gr_mat);
+ aom_free(yb_mat);
+ return 0;
+ }
+
+ double r, g, b, y;
+ for (int i = 0; i < height[0]; ++i) {
+ for (int j = 0; j < width[0]; ++j) {
+ r = AOMMAX(0, cr[i * width[0] + j] -
+ (cg[i * width[0] + j] + cb[i * width[0] + j]) / 2);
+ g = AOMMAX(0, cg[i * width[0] + j] -
+ (cr[i * width[0] + j] + cb[i * width[0] + j]) / 2);
+ b = AOMMAX(0, cb[i * width[0] + j] -
+ (cr[i * width[0] + j] + cg[i * width[0] + j]) / 2);
+ y = AOMMAX(0, (cr[i * width[0] + j] + cg[i * width[0] + j]) / 2 -
+ fabs(cr[i * width[0] + j] - cg[i * width[0] + j]) / 2 -
+ cb[i * width[0] + j]);
+
+ rg_mat[i * width[0] + j] = r - g;
+ by_mat[i * width[0] + j] = b - y;
+ gr_mat[i * width[0] + j] = g - r;
+ yb_mat[i * width[0] + j] = y - b;
+ }
+ }
+
+ if (gaussian_pyramid_rgb(rg_mat, gr_mat, width, height, rg_map) == 0 ||
+ gaussian_pyramid_rgb(by_mat, yb_mat, width, height, by_map) == 0) {
+ aom_free(rg_mat);
+ aom_free(by_mat);
+ aom_free(gr_mat);
+ aom_free(yb_mat);
+ return 0;
+ }
+
+ aom_free(rg_mat);
+ aom_free(by_mat);
+ aom_free(gr_mat);
+ aom_free(yb_mat);
+ return 1;
+}
+
+static INLINE void filter2d(const double *input, const double kernel[9][9],
+ int width, int height, double *output) {
+ const int window_size = 9;
+ double map_section[81];
+ for (int y = 0; y <= height - 1; ++y) {
+ for (int x = 0; x <= width - 1; ++x) {
+ int i = 0;
+ for (int yy = y - window_size / 2; yy <= y + window_size / 2; ++yy) {
+ for (int xx = x - window_size / 2; xx <= x + window_size / 2; ++xx) {
+ int yvalue = clamp(yy, 0, height - 1);
+ int xvalue = clamp(xx, 0, width - 1);
+ map_section[i++] = input[yvalue * width + xvalue];
+ }
+ }
+
+ output[y * width + x] = 0;
+ for (int k = 0; k < window_size; ++k) {
+ for (int l = 0; l < window_size; ++l) {
+ output[y * width + x] +=
+ kernel[k][l] * map_section[k * window_size + l];
+ }
+ }
+ }
+ }
+}
+
+static int get_feature_map_orientation(const double *intensity, int width[9],
+ int height[9],
+ saliency_feature_map *dst[24]) {
+ double *gaussian_map[9];
+
+ gaussian_map[0] =
+ (double *)aom_malloc(width[0] * height[0] * sizeof(*gaussian_map[0]));
+ if (!gaussian_map[0]) {
+ return 0;
+ }
+ memcpy(gaussian_map[0], intensity, width[0] * height[0] * sizeof(double));
+
+ for (int i = 1; i < 9; ++i) {
+ int stride = width[i - 1];
+ int new_width = width[i];
+ int new_height = height[i];
+
+ gaussian_map[i] =
+ (double *)aom_malloc(new_width * new_height * sizeof(*gaussian_map[i]));
+ if (!gaussian_map[i]) {
+ for (int l = 0; l < i; ++l) {
+ aom_free(gaussian_map[l]);
+ }
+ return 0;
+ }
+ memset(gaussian_map[i], 0, new_width * new_height * sizeof(double));
+ decimate_map(gaussian_map[i - 1], height[i - 1], width[i - 1], stride,
+ gaussian_map[i]);
+ }
+
+  // [angle: 0, 45, 90, 135 degrees][pyramid level]
+  double *tempGaborOutput[4][9];
+
+ for (int i = 2; i < 9; ++i) {
+ const int cur_height = height[i];
+ const int cur_width = width[i];
+ for (int j = 0; j < 4; ++j) {
+ tempGaborOutput[j][i] = (double *)aom_malloc(
+ cur_height * cur_width * sizeof(*tempGaborOutput[j][i]));
+      if (!tempGaborOutput[j][i]) {
+        for (int l = 0; l < 9; ++l) {
+          aom_free(gaussian_map[l]);
+        }
+        // Free only the Gabor outputs allocated so far; later entries of
+        // tempGaborOutput are still uninitialized.
+        for (int g = 2; g < i; ++g) {
+          for (int h = 0; h < 4; ++h) {
+            aom_free(tempGaborOutput[h][g]);
+          }
+        }
+        for (int h = 0; h < j; ++h) {
+          aom_free(tempGaborOutput[h][i]);
+        }
+        return 0;
+      }
+ filter2d(gaussian_map[i], kGaborFilter[j], cur_width, cur_height,
+ tempGaborOutput[j][i]);
+ }
+ }
+
+ for (int i = 0; i < 9; ++i) {
+ aom_free(gaussian_map[i]);
+ }
+
+  // [angle: 0, 45, 90, 135 degrees][feature map index]
+  saliency_feature_map *tmp[4][6];
+
+ for (int i = 0; i < 6; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ tmp[j][i] = dst[j * 6 + i];
+ }
+ }
+
+ for (int j = 0; j < 4; ++j) {
+ if (center_surround_diff((const double **)tempGaborOutput[j], height, width,
+ tmp[j]) == 0) {
+ for (int h = 0; h < 4; ++h) {
+ for (int g = 2; g < 9; ++g) {
+ aom_free(tempGaborOutput[h][g]);
+ }
+ }
+ return 0;
+ }
+ }
+
+ for (int i = 2; i < 9; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ aom_free(tempGaborOutput[j][i]);
+ }
+ }
+
+ return 1;
+}
+
+static INLINE void find_min_max(const saliency_feature_map *input,
+ double *max_value, double *min_value) {
+ assert(input && input->buf);
+ *min_value = DBL_MAX;
+ *max_value = 0.0;
+
+ for (int i = 0; i < input->height; ++i) {
+ for (int j = 0; j < input->width; ++j) {
+ assert(input->buf[i * input->width + j] >= 0.0);
+ *min_value = fmin(input->buf[i * input->width + j], *min_value);
+ *max_value = fmax(input->buf[i * input->width + j], *max_value);
+ }
+ }
+}
+
+static INLINE double average_local_max(const saliency_feature_map *input,
+ int stepsize) {
+ int numlocal = 0;
+ double lmaxmean = 0, lmax = 0, dummy = 0;
+ saliency_feature_map local_map;
+ local_map.height = stepsize;
+ local_map.width = stepsize;
+ local_map.buf =
+ (double *)aom_malloc(stepsize * stepsize * sizeof(*local_map.buf));
+
+ if (!local_map.buf) {
+ return -1;
+ }
+
+ for (int y = 0; y < input->height - stepsize; y += stepsize) {
+ for (int x = 0; x < input->width - stepsize; x += stepsize) {
+ for (int i = 0; i < stepsize; ++i) {
+ for (int j = 0; j < stepsize; ++j) {
+ local_map.buf[i * stepsize + j] =
+ input->buf[(y + i) * input->width + x + j];
+ }
+ }
+
+ find_min_max(&local_map, &lmax, &dummy);
+ lmaxmean += lmax;
+ numlocal++;
+ }
+ }
+
+  aom_free(local_map.buf);
+
+  // numlocal can be 0 if the input is smaller than stepsize; report an error
+  // in that case rather than dividing by zero.
+  if (numlocal == 0) return -1;
+  return lmaxmean / numlocal;
+}
+
+// Linearly normalize the values in the map to [0,1].
+static void minmax_normalize(saliency_feature_map *input) {
+ double max_value, min_value;
+ find_min_max(input, &max_value, &min_value);
+
+ for (int i = 0; i < input->height; ++i) {
+ for (int j = 0; j < input->width; ++j) {
+ if (max_value != min_value) {
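+        // Equivalent to (x - min) / (max - min).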
+ input->buf[i * input->width + j] =
+ input->buf[i * input->width + j] / (max_value - min_value) +
+ min_value / (min_value - max_value);
+ } else {
+ input->buf[i * input->width + j] -= min_value;
+ }
+ }
+ }
+}
+
+// This function promotes meaningful "activation spots" in the map and
+// suppresses homogeneous areas.
+static int normalization_operator(saliency_feature_map *input, int stepsize) {
+ minmax_normalize(input);
+ double lmaxmean = average_local_max(input, stepsize);
+ if (lmaxmean < 0) {
+ return 0;
+ }
+ double normCoeff = (1 - lmaxmean) * (1 - lmaxmean);
+
+ for (int i = 0; i < input->height; ++i) {
+ for (int j = 0; j < input->width; ++j) {
+ input->buf[i * input->width + j] *= normCoeff;
+ }
+ }
+
+ return 1;
+}
+
+// Normalize the values in feature maps to [0,1], and then upscale all maps to
+// the original frame size.
+static int normalize_fm(saliency_feature_map *input[6], int width[9],
+ int height[9], int num_fm,
+ saliency_feature_map *output[6]) {
+ // Feature maps (FM) are generated by function "center_surround_diff()". The
+ // difference is between a fine scale c and a coarser scale s, where c \in {2,
+ // 3, 4}, and s = c + delta, where delta \in {3, 4}, and the FM size is scale
+ // c. Specifically, i=0: c=2 and s=5, i=1: c=2 and s=6, i=2: c=3 and s=6, i=3:
+ // c=3 and s=7, i=4: c=4 and s=7, i=5: c=4 and s=8.
+ for (int i = 0; i < num_fm; ++i) {
+    if (normalization_operator(input[i], 8) == 0) {
+ return 0;
+ }
+
+ // Upscale FM to original frame size
+ if (upscale_map(input[i]->buf, (i / 2) + 2, 0, height, width,
+ output[i]->buf) == 0) {
+ return 0;
+ }
+ }
+ return 1;
+}
+
+// Combine feature maps with the same category (intensity, color, or
+// orientation) into one conspicuity map.
+static int normalized_map(saliency_feature_map *input[6], int width[9],
+ int height[9], saliency_feature_map *output) {
+ int num_fm = 6;
+
+ saliency_feature_map *n_input[6];
+ for (int i = 0; i < 6; ++i) {
+ n_input[i] = (saliency_feature_map *)aom_malloc(sizeof(*n_input[i]));
+    if (!n_input[i]) {
+      // Free the entries allocated in earlier iterations before bailing out.
+      for (int l = 0; l < i; ++l) {
+        aom_free(n_input[l]->buf);
+        aom_free(n_input[l]);
+      }
+      return 0;
+    }
+    n_input[i]->buf =
+        (double *)aom_malloc(width[0] * height[0] * sizeof(*n_input[i]->buf));
+    if (!n_input[i]->buf) {
+      aom_free(n_input[i]);
+      for (int l = 0; l < i; ++l) {
+        aom_free(n_input[l]->buf);
+        aom_free(n_input[l]);
+      }
+      return 0;
+    }
+ n_input[i]->height = height[0];
+ n_input[i]->width = width[0];
+ }
+
+ if (normalize_fm(input, width, height, num_fm, n_input) == 0) {
+ for (int i = 0; i < num_fm; ++i) {
+ aom_free(n_input[i]->buf);
+ aom_free(n_input[i]);
+ }
+ return 0;
+ }
+
+ // Add up all normalized feature maps with the same category into one map.
+ for (int i = 0; i < num_fm; ++i) {
+ for (int r = 0; r < height[0]; ++r) {
+ for (int c = 0; c < width[0]; ++c) {
+ output->buf[r * width[0] + c] += n_input[i]->buf[r * width[0] + c];
+ }
+ }
+ }
+
+ for (int i = 0; i < num_fm; ++i) {
+ aom_free(n_input[i]->buf);
+ aom_free(n_input[i]);
+ }
+
+  normalization_operator(output, 8);
+ return 1;
+}
+
+static int normalized_map_rgb(saliency_feature_map *rg_map[6],
+ saliency_feature_map *by_map[6], int width[9],
+ int height[9], saliency_feature_map *output) {
+ saliency_feature_map *color_cm[2]; // 0: color_cm_rg, 1: color_cm_by
+ for (int i = 0; i < 2; ++i) {
+ color_cm[i] = aom_malloc(sizeof(*color_cm[i]));
+    if (!color_cm[i]) {
+      // Free the entries allocated in earlier iterations before bailing out.
+      for (int l = 0; l < i; ++l) {
+        aom_free(color_cm[l]->buf);
+        aom_free(color_cm[l]);
+      }
+      return 0;
+    }
+    color_cm[i]->buf =
+        (double *)aom_malloc(width[0] * height[0] * sizeof(*color_cm[i]->buf));
+    if (!color_cm[i]->buf) {
+      aom_free(color_cm[i]);
+      for (int l = 0; l < i; ++l) {
+        aom_free(color_cm[l]->buf);
+        aom_free(color_cm[l]);
+      }
+      return 0;
+    }
+
+ color_cm[i]->width = width[0];
+ color_cm[i]->height = height[0];
+ memset(color_cm[i]->buf, 0,
+ width[0] * height[0] * sizeof(*color_cm[i]->buf));
+ }
+
+ if (normalized_map(rg_map, width, height, color_cm[0]) == 0 ||
+ normalized_map(by_map, width, height, color_cm[1]) == 0) {
+ for (int i = 0; i < 2; ++i) {
+ aom_free(color_cm[i]->buf);
+ aom_free(color_cm[i]);
+ }
+ return 0;
+ }
+
+ for (int r = 0; r < height[0]; ++r) {
+ for (int c = 0; c < width[0]; ++c) {
+ output->buf[r * width[0] + c] = color_cm[0]->buf[r * width[0] + c] +
+ color_cm[1]->buf[r * width[0] + c];
+ }
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ aom_free(color_cm[i]->buf);
+ aom_free(color_cm[i]);
+ }
+
+  normalization_operator(output, 8);
+ return 1;
+}
+
+static int normalized_map_orientation(saliency_feature_map *orientation_map[24],
+ int width[9], int height[9],
+ saliency_feature_map *output) {
+ int num_fms_per_angle = 6;
+
+ saliency_feature_map *ofm[4][6];
+ for (int i = 0; i < num_fms_per_angle; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ ofm[j][i] = orientation_map[j * num_fms_per_angle + i];
+ }
+ }
+
+ // extract conspicuity map for each angle
+ saliency_feature_map *nofm = aom_malloc(sizeof(*nofm));
+ if (!nofm) {
+ return 0;
+ }
+ nofm->buf = (double *)aom_malloc(width[0] * height[0] * sizeof(*nofm->buf));
+ if (!nofm->buf) {
+ aom_free(nofm);
+ return 0;
+ }
+ nofm->height = height[0];
+ nofm->width = width[0];
+
+ for (int i = 0; i < 4; ++i) {
+ memset(nofm->buf, 0, width[0] * height[0] * sizeof(*nofm->buf));
+ if (normalized_map(ofm[i], width, height, nofm) == 0) {
+ aom_free(nofm->buf);
+ aom_free(nofm);
+ return 0;
+ }
+
+ for (int r = 0; r < height[0]; ++r) {
+ for (int c = 0; c < width[0]; ++c) {
+ output->buf[r * width[0] + c] += nofm->buf[r * width[0] + c];
+ }
+ }
+ }
+
+ aom_free(nofm->buf);
+ aom_free(nofm);
+
+  normalization_operator(output, 8);
+ return 1;
+}
+
+// Set the pixel-level saliency mask based on the Itti-Koch algorithm.
+int av1_set_saliency_map(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ int frm_width = cm->width;
+ int frm_height = cm->height;
+
+ int pyr_height[9];
+ int pyr_width[9];
+
+ pyr_height[0] = frm_height;
+ pyr_width[0] = frm_width;
+
+ for (int i = 1; i < 9; ++i) {
+ pyr_width[i] = pyr_width[i - 1] / 2;
+ pyr_height[i] = pyr_height[i - 1] / 2;
+ }
+
+ double *cr = aom_malloc(frm_width * frm_height * sizeof(*cr));
+ double *cg = aom_malloc(frm_width * frm_height * sizeof(*cg));
+ double *cb = aom_malloc(frm_width * frm_height * sizeof(*cb));
+ double *intensity = aom_malloc(frm_width * frm_height * sizeof(*intensity));
+
+ if (!cr || !cg || !cb || !intensity) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+ return 0;
+ }
+
+ // Extract red / green / blue channels and intensity component
+ get_color_intensity(cpi->source, cm->seq_params->subsampling_x,
+ cm->seq_params->subsampling_y, cr, cg, cb, intensity);
+
+ // Feature Map Extraction
+ // intensity map
+ saliency_feature_map *i_map[6];
+ for (int i = 0; i < 6; ++i) {
+ int cur_height = pyr_height[(i / 2) + 2];
+ int cur_width = pyr_width[(i / 2) + 2];
+
+ i_map[i] = (saliency_feature_map *)aom_malloc(sizeof(*i_map[i]));
+ if (!i_map[i]) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+      for (int l = 0; l < i; ++l) {
+        aom_free(i_map[l]->buf);
+        aom_free(i_map[l]);
+      }
+ return 0;
+ }
+ i_map[i]->buf =
+ (double *)aom_malloc(cur_height * cur_width * sizeof(*i_map[i]->buf));
+ if (!i_map[i]->buf) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+ for (int l = 0; l < i; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(i_map[l]);
+ }
+ return 0;
+ }
+ i_map[i]->height = cur_height;
+ i_map[i]->width = cur_width;
+ }
+
+ if (get_feature_map_intensity(intensity, pyr_width, pyr_height, i_map) == 0) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+ for (int l = 0; l < 6; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(i_map[l]);
+ }
+ return 0;
+ }
+
+ // RGB map
+ saliency_feature_map *rg_map[6], *by_map[6];
+ for (int i = 0; i < 6; ++i) {
+ int cur_height = pyr_height[(i / 2) + 2];
+ int cur_width = pyr_width[(i / 2) + 2];
+ rg_map[i] = (saliency_feature_map *)aom_malloc(sizeof(*rg_map[i]));
+ by_map[i] = (saliency_feature_map *)aom_malloc(sizeof(*by_map[i]));
+ if (!rg_map[i] || !by_map[i]) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+      for (int l = 0; l < 6; ++l) {
+        aom_free(i_map[l]->buf);
+        aom_free(i_map[l]);
+      }
+      // Entries past index i are still uninitialized; free only up to i.
+      for (int l = 0; l < i; ++l) {
+        aom_free(rg_map[l]->buf);
+        aom_free(rg_map[l]);
+        aom_free(by_map[l]->buf);
+        aom_free(by_map[l]);
+      }
+      aom_free(rg_map[i]);
+      aom_free(by_map[i]);
+ return 0;
+ }
+ rg_map[i]->buf =
+ (double *)aom_malloc(cur_height * cur_width * sizeof(*rg_map[i]->buf));
+ by_map[i]->buf =
+ (double *)aom_malloc(cur_height * cur_width * sizeof(*by_map[i]->buf));
+ if (!by_map[i]->buf || !rg_map[i]->buf) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+ for (int l = 0; l < 6; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(i_map[l]);
+ }
+      // Use l <= i so the current, partially initialized entry is freed too;
+      // aom_free(NULL) is a no-op.
+      for (int l = 0; l <= i; ++l) {
+        aom_free(rg_map[l]->buf);
+        aom_free(by_map[l]->buf);
+        aom_free(rg_map[l]);
+        aom_free(by_map[l]);
+      }
+ return 0;
+ }
+ rg_map[i]->height = cur_height;
+ rg_map[i]->width = cur_width;
+ by_map[i]->height = cur_height;
+ by_map[i]->width = cur_width;
+ }
+
+ if (get_feature_map_rgb(cr, cg, cb, pyr_width, pyr_height, rg_map, by_map) ==
+ 0) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+ for (int l = 0; l < 6; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(rg_map[l]->buf);
+ aom_free(by_map[l]->buf);
+ aom_free(i_map[l]);
+ aom_free(rg_map[l]);
+ aom_free(by_map[l]);
+ }
+ return 0;
+ }
+
+ // Orientation map
+ saliency_feature_map *orientation_map[24];
+ for (int i = 0; i < 24; ++i) {
+ int cur_height = pyr_height[((i % 6) / 2) + 2];
+ int cur_width = pyr_width[((i % 6) / 2) + 2];
+
+ orientation_map[i] =
+ (saliency_feature_map *)aom_malloc(sizeof(*orientation_map[i]));
+ if (!orientation_map[i]) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+ for (int l = 0; l < 6; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(rg_map[l]->buf);
+ aom_free(by_map[l]->buf);
+ aom_free(i_map[l]);
+ aom_free(rg_map[l]);
+ aom_free(by_map[l]);
+ }
+      for (int h = 0; h < i; ++h) {
+        aom_free(orientation_map[h]->buf);
+        aom_free(orientation_map[h]);
+      }
+ return 0;
+ }
+
+ orientation_map[i]->buf = (double *)aom_malloc(
+ cur_height * cur_width * sizeof(*orientation_map[i]->buf));
+ if (!orientation_map[i]->buf) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+ for (int l = 0; l < 6; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(rg_map[l]->buf);
+ aom_free(by_map[l]->buf);
+ aom_free(i_map[l]);
+ aom_free(rg_map[l]);
+ aom_free(by_map[l]);
+ }
+
+      for (int h = 0; h < i; ++h) {
+        aom_free(orientation_map[h]->buf);
+        aom_free(orientation_map[h]);
+      }
+      aom_free(orientation_map[i]);
+ return 0;
+ }
+
+ orientation_map[i]->height = cur_height;
+ orientation_map[i]->width = cur_width;
+ }
+
+ if (get_feature_map_orientation(intensity, pyr_width, pyr_height,
+ orientation_map) == 0) {
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+ for (int l = 0; l < 6; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(rg_map[l]->buf);
+ aom_free(by_map[l]->buf);
+ aom_free(i_map[l]);
+ aom_free(rg_map[l]);
+ aom_free(by_map[l]);
+ }
+ for (int h = 0; h < 24; ++h) {
+ aom_free(orientation_map[h]->buf);
+ aom_free(orientation_map[h]);
+ }
+ return 0;
+ }
+
+ aom_free(cr);
+ aom_free(cg);
+ aom_free(cb);
+ aom_free(intensity);
+
+ saliency_feature_map
+ *normalized_maps[3]; // 0: intensity, 1: color, 2: orientation
+
+ for (int i = 0; i < 3; ++i) {
+ normalized_maps[i] = aom_malloc(sizeof(*normalized_maps[i]));
+ if (!normalized_maps[i]) {
+ for (int l = 0; l < 6; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(rg_map[l]->buf);
+ aom_free(by_map[l]->buf);
+ aom_free(i_map[l]);
+ aom_free(rg_map[l]);
+ aom_free(by_map[l]);
+ }
+
+ for (int h = 0; h < 24; ++h) {
+ aom_free(orientation_map[h]->buf);
+ aom_free(orientation_map[h]);
+ }
+
+      for (int l = 0; l < i; ++l) {
+        aom_free(normalized_maps[l]->buf);
+        aom_free(normalized_maps[l]);
+      }
+ return 0;
+ }
+ normalized_maps[i]->buf = (double *)aom_malloc(
+ frm_width * frm_height * sizeof(*normalized_maps[i]->buf));
+ if (!normalized_maps[i]->buf) {
+ for (int l = 0; l < 6; ++l) {
+ aom_free(i_map[l]->buf);
+ aom_free(rg_map[l]->buf);
+ aom_free(by_map[l]->buf);
+ aom_free(i_map[l]);
+ aom_free(rg_map[l]);
+ aom_free(by_map[l]);
+ }
+ for (int h = 0; h < 24; ++h) {
+ aom_free(orientation_map[h]->buf);
+ aom_free(orientation_map[h]);
+ }
+ for (int l = 0; l < i; ++l) {
+ aom_free(normalized_maps[l]->buf);
+ aom_free(normalized_maps[l]);
+ }
+ return 0;
+ }
+ normalized_maps[i]->width = frm_width;
+ normalized_maps[i]->height = frm_height;
+ memset(normalized_maps[i]->buf, 0,
+ frm_width * frm_height * sizeof(*normalized_maps[i]->buf));
+ }
+
+ // Conspicuity map generation
+ if (normalized_map(i_map, pyr_width, pyr_height, normalized_maps[0]) == 0 ||
+ normalized_map_rgb(rg_map, by_map, pyr_width, pyr_height,
+ normalized_maps[1]) == 0 ||
+ normalized_map_orientation(orientation_map, pyr_width, pyr_height,
+ normalized_maps[2]) == 0) {
+ for (int i = 0; i < 6; ++i) {
+ aom_free(i_map[i]->buf);
+ aom_free(rg_map[i]->buf);
+ aom_free(by_map[i]->buf);
+ aom_free(i_map[i]);
+ aom_free(rg_map[i]);
+ aom_free(by_map[i]);
+ }
+
+ for (int i = 0; i < 24; ++i) {
+ aom_free(orientation_map[i]->buf);
+ aom_free(orientation_map[i]);
+ }
+
+ for (int i = 0; i < 3; ++i) {
+ aom_free(normalized_maps[i]->buf);
+ aom_free(normalized_maps[i]);
+ }
+ return 0;
+ }
+
+ for (int i = 0; i < 6; ++i) {
+ aom_free(i_map[i]->buf);
+ aom_free(rg_map[i]->buf);
+ aom_free(by_map[i]->buf);
+ aom_free(i_map[i]);
+ aom_free(rg_map[i]);
+ aom_free(by_map[i]);
+ }
+
+ for (int i = 0; i < 24; ++i) {
+ aom_free(orientation_map[i]->buf);
+ aom_free(orientation_map[i]);
+ }
+
+ // Pixel level saliency map
+ saliency_feature_map *combined_saliency_map =
+ aom_malloc(sizeof(*combined_saliency_map));
+ if (!combined_saliency_map) {
+ for (int i = 0; i < 3; ++i) {
+ aom_free(normalized_maps[i]->buf);
+ aom_free(normalized_maps[i]);
+ }
+ return 0;
+ }
+
+ combined_saliency_map->buf = (double *)aom_malloc(
+ frm_width * frm_height * sizeof(*combined_saliency_map->buf));
+ if (!combined_saliency_map->buf) {
+ for (int i = 0; i < 3; ++i) {
+ aom_free(normalized_maps[i]->buf);
+ aom_free(normalized_maps[i]);
+ }
+
+ aom_free(combined_saliency_map);
+ return 0;
+ }
+ combined_saliency_map->height = frm_height;
+ combined_saliency_map->width = frm_width;
+
+ double w_intensity, w_color, w_orient;
+
+ w_intensity = w_color = w_orient = (double)1 / 3;
+
+ for (int r = 0; r < frm_height; ++r) {
+ for (int c = 0; c < frm_width; ++c) {
+ combined_saliency_map->buf[r * frm_width + c] =
+ (w_intensity * normalized_maps[0]->buf[r * frm_width + c] +
+ w_color * normalized_maps[1]->buf[r * frm_width + c] +
+ w_orient * normalized_maps[2]->buf[r * frm_width + c]);
+ }
+ }
+
+ for (int r = 0; r < frm_height; ++r) {
+ for (int c = 0; c < frm_width; ++c) {
+ int index = r * frm_width + c;
+ cpi->saliency_map[index] =
+ (uint8_t)(combined_saliency_map->buf[index] * 255);
+ }
+ }
+
+ for (int i = 0; i < 3; ++i) {
+ aom_free(normalized_maps[i]->buf);
+ aom_free(normalized_maps[i]);
+ }
+
+ aom_free(combined_saliency_map->buf);
+ aom_free(combined_saliency_map);
+
+ return 1;
+}
+
+// Set superblock level saliency mask for rdmult scaling
+int av1_setup_sm_rdmult_scaling_factor(AV1_COMP *cpi, double motion_ratio) {
+ AV1_COMMON *cm = &cpi->common;
+
+ saliency_feature_map *sb_saliency_map =
+ aom_malloc(sizeof(saliency_feature_map));
+
+ if (sb_saliency_map == NULL) {
+ return 0;
+ }
+
+ const BLOCK_SIZE bsize = cm->seq_params->sb_size;
+ const int num_mi_w = mi_size_wide[bsize];
+ const int num_mi_h = mi_size_high[bsize];
+ const int block_width = block_size_wide[bsize];
+ const int block_height = block_size_high[bsize];
+ const int num_sb_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+ const int num_sb_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+
+ sb_saliency_map->height = num_sb_rows;
+ sb_saliency_map->width = num_sb_cols;
+ sb_saliency_map->buf = (double *)aom_malloc(num_sb_rows * num_sb_cols *
+ sizeof(*sb_saliency_map->buf));
+
+ if (sb_saliency_map->buf == NULL) {
+ aom_free(sb_saliency_map);
+ return 0;
+ }
+
+ for (int row = 0; row < num_sb_rows; ++row) {
+ for (int col = 0; col < num_sb_cols; ++col) {
+ const int index = row * num_sb_cols + col;
+ double total_pixel = 0;
+ double total_weight = 0;
+
+ for (int i = 0; i < block_height; i++) {
+ for (int j = 0; j < block_width; j++) {
+ if ((row * block_height + i) >= cpi->common.height ||
+ (col * block_width + j) >= cpi->common.width)
+ continue;
+ total_pixel++;
+ total_weight +=
+ cpi->saliency_map[(row * block_height + i) * cpi->common.width +
+ col * block_width + j];
+ }
+ }
+
+ assert(total_pixel > 0);
+
+ // Calculate the superblock level saliency map from pixel level saliency
+ // map
+ sb_saliency_map->buf[index] = total_weight / total_pixel;
+
+ // Further lower the superblock saliency score for boundary superblocks.
+ if (row < 1 || row > num_sb_rows - 2 || col < 1 ||
+ col > num_sb_cols - 2) {
+ sb_saliency_map->buf[index] /= 5;
+ }
+ }
+ }
+
+ // superblock level saliency map finalization
+ minmax_normalize(sb_saliency_map);
+
+ double log_sum = 0.0;
+ double sum = 0.0;
+ int block_count = 0;
+
+ // Calculate the average superblock sm_scaling_factor for a frame, to be used
+ // for clamping later.
+ for (int row = 0; row < num_sb_rows; ++row) {
+ for (int col = 0; col < num_sb_cols; ++col) {
+ const int index = row * num_sb_cols + col;
+ const double saliency = sb_saliency_map->buf[index];
+
+ cpi->sm_scaling_factor[index] = 1 - saliency;
+ sum += cpi->sm_scaling_factor[index];
+ block_count++;
+ }
+ }
+ assert(block_count > 0);
+ sum /= block_count;
+
+ // Calculate the geometric mean of superblock sm_scaling_factor for a frame,
+ // to be used for normalization.
+ for (int row = 0; row < num_sb_rows; ++row) {
+ for (int col = 0; col < num_sb_cols; ++col) {
+ const int index = row * num_sb_cols + col;
+ log_sum += log(fmax(cpi->sm_scaling_factor[index], 0.001));
+ cpi->sm_scaling_factor[index] =
+ fmax(cpi->sm_scaling_factor[index], 0.8 * sum);
+ }
+ }
+
+ log_sum = exp(log_sum / block_count);
+
+ // Normalize the sm_scaling_factor by geometric mean.
+ for (int row = 0; row < num_sb_rows; ++row) {
+ for (int col = 0; col < num_sb_cols; ++col) {
+ const int index = row * num_sb_cols + col;
+ assert(log_sum > 0);
+ cpi->sm_scaling_factor[index] /= log_sum;
+
+ // Modulate the sm_scaling_factor by frame basis motion factor
+ cpi->sm_scaling_factor[index] =
+ cpi->sm_scaling_factor[index] * motion_ratio;
+ }
+ }
+
+ aom_free(sb_saliency_map->buf);
+ aom_free(sb_saliency_map);
+ return 1;
+}
+
+// av1_setup_motion_ratio() is only enabled when CONFIG_REALTIME_ONLY is 0,
+// because the computations need to access the first-pass stats, which are
+// only available when CONFIG_REALTIME_ONLY is 0.
+#if !CONFIG_REALTIME_ONLY
+// Compute a motion_ratio that reflects the amount of motion between two
+// consecutive frames. motion_ratio is used to set up the saliency-map-based
+// rdmult scaling factor: the less motion there is, the more bits are spent on
+// this frame, and vice versa.
+double av1_setup_motion_ratio(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ int frames_since_key =
+ cm->current_frame.display_order_hint - cpi->rc.frames_since_key;
+ const FIRSTPASS_STATS *cur_stats = av1_firstpass_info_peek(
+ &cpi->ppi->twopass.firstpass_info, frames_since_key);
+ assert(cur_stats != NULL);
+ assert(cpi->ppi->twopass.firstpass_info.total_stats.count > 0);
+
+ const double avg_intra_error =
+ exp(cpi->ppi->twopass.firstpass_info.total_stats.log_intra_error /
+ cpi->ppi->twopass.firstpass_info.total_stats.count);
+ const double avg_inter_error =
+ exp(cpi->ppi->twopass.firstpass_info.total_stats.log_coded_error /
+ cpi->ppi->twopass.firstpass_info.total_stats.count);
+
+ double inter_error = cur_stats->coded_error;
+ double error_stdev = 0;
+ const double avg_error =
+ cpi->ppi->twopass.firstpass_info.total_stats.intra_error /
+ cpi->ppi->twopass.firstpass_info.total_stats.count;
+ for (int i = 0; i < cpi->ppi->twopass.firstpass_info.total_stats.count; i++) {
+ const FIRSTPASS_STATS *stats =
+ &cpi->ppi->twopass.firstpass_info.stats_buf[i];
+ error_stdev +=
+ (stats->intra_error - avg_error) * (stats->intra_error - avg_error);
+ }
+ error_stdev =
+ sqrt(error_stdev / cpi->ppi->twopass.firstpass_info.total_stats.count);
+
+ double motion_ratio = 1;
+ if (error_stdev / fmax(avg_intra_error, 1) > 0.1) {
+ motion_ratio = inter_error / fmax(1, avg_inter_error);
+ motion_ratio = AOMMIN(motion_ratio, 1.5);
+ motion_ratio = AOMMAX(motion_ratio, 0.8);
+ }
+
+ return motion_ratio;
+}
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/av1/encoder/saliency_map.h b/third_party/aom/av1/encoder/saliency_map.h
new file mode 100644
index 0000000000..0d27f83633
--- /dev/null
+++ b/third_party/aom/av1/encoder/saliency_map.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SALIENCY_MAP_H_
+#define AOM_AV1_ENCODER_SALIENCY_MAP_H_
+#include "av1/encoder/encoder.h"
+
+typedef struct saliency_feature_map {
+ double *buf; // stores values of the map in 1D array
+ int height;
+ int width;
+} saliency_feature_map;
+
+int av1_set_saliency_map(AV1_COMP *cpi);
+#if !CONFIG_REALTIME_ONLY
+double av1_setup_motion_ratio(AV1_COMP *cpi);
+#endif
+int av1_setup_sm_rdmult_scaling_factor(AV1_COMP *cpi, double motion_ratio);
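+
+// A hypothetical usage sketch (not taken from the encoder itself), showing
+// the intended call order: compute the motion ratio first (two-pass builds
+// only), then the pixel-level saliency map, then the superblock-level rdmult
+// scaling factors.
+//
+//   double motion_ratio = 1.0;
+// #if !CONFIG_REALTIME_ONLY
+//   motion_ratio = av1_setup_motion_ratio(cpi);
+// #endif
+//   if (av1_set_saliency_map(cpi) &&
+//       av1_setup_sm_rdmult_scaling_factor(cpi, motion_ratio)) {
+//     // cpi->sm_scaling_factor[] now holds the per-superblock factors.
+//   }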
+
+#endif // AOM_AV1_ENCODER_SALIENCY_MAP_H_
diff --git a/third_party/aom/av1/encoder/segmentation.c b/third_party/aom/av1/encoder/segmentation.c
new file mode 100644
index 0000000000..4b4e78779c
--- /dev/null
+++ b/third_party/aom/av1/encoder/segmentation.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/pred_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/segmentation.h"
+
+void av1_enable_segmentation(struct segmentation *seg) {
+ seg->enabled = 1;
+ seg->update_map = 1;
+ seg->update_data = 1;
+ seg->temporal_update = 0;
+}
+
+void av1_disable_segmentation(struct segmentation *seg) {
+ seg->enabled = 0;
+ seg->update_map = 0;
+ seg->update_data = 0;
+ seg->temporal_update = 0;
+}
+
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ seg->feature_mask[segment_id] &= ~(1u << feature_id);
+}
+
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ seg->feature_data[segment_id][feature_id] = 0;
+}
+
+void av1_reset_segment_features(AV1_COMMON *cm) {
+ struct segmentation *seg = &cm->seg;
+
+ // Set up default state for MB feature flags
+ seg->enabled = 0;
+ seg->update_map = 0;
+ seg->update_data = 0;
+ av1_clearall_segfeatures(seg);
+}
diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h
new file mode 100644
index 0000000000..1ad13d66a9
--- /dev/null
+++ b/third_party/aom/av1/encoder/segmentation.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SEGMENTATION_H_
+#define AOM_AV1_ENCODER_SEGMENTATION_H_
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_enable_segmentation(struct segmentation *seg);
+void av1_disable_segmentation(struct segmentation *seg);
+
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd);
+
+void av1_reset_segment_features(AV1_COMMON *cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SEGMENTATION_H_
diff --git a/third_party/aom/av1/encoder/sorting_network.h b/third_party/aom/av1/encoder/sorting_network.h
new file mode 100644
index 0000000000..54f4c19dcd
--- /dev/null
+++ b/third_party/aom/av1/encoder/sorting_network.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*! \file
+ * This file contains several utility functions used to sort small arrays with
+ * sorting networks.
+ *
+ * A sorting network is a (potentially branch-less) way to quickly sort small
+ * arrays of known size. For more details, see
+ * https://en.wikipedia.org/wiki/Sorting_network.
+ */
+#ifndef AOM_AV1_ENCODER_SORTING_NETWORK_H_
+#define AOM_AV1_ENCODER_SORTING_NETWORK_H_
+
+#include "aom/aom_integer.h"
+
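+// Compare-and-swap: after SWAP(i, j), index i holds the key/value pair with
+// the larger key, so repeated application sorts in descending order.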
+#define SWAP(i, j) \
+ do { \
+ const float maxf = (k[i] >= k[j]) ? k[i] : k[j]; \
+ const float minf = (k[i] >= k[j]) ? k[j] : k[i]; \
+ const int maxi = (k[i] >= k[j]) ? v[i] : v[j]; \
+ const int mini = (k[i] >= k[j]) ? v[j] : v[i]; \
+ k[i] = maxf; \
+ k[j] = minf; \
+ v[i] = maxi; \
+ v[j] = mini; \
+ } while (0)
+
+/*!\brief Sorts a pair of length-16 key/value arrays in descending order of
+ * the keys.
+ *
+ * \param[in,out]   k   A length-16 array of floats that serves as the keys.
+ * \param[in,out]   v   A length-16 array of int32s that serves as the values.
+ */
+static AOM_INLINE void av1_sort_fi32_16(float k[], int32_t v[]) {
+ SWAP(0, 1);
+ SWAP(2, 3);
+ SWAP(4, 5);
+ SWAP(6, 7);
+ SWAP(8, 9);
+ SWAP(10, 11);
+ SWAP(12, 13);
+ SWAP(14, 15);
+ SWAP(0, 2);
+ SWAP(1, 3);
+ SWAP(4, 6);
+ SWAP(5, 7);
+ SWAP(8, 10);
+ SWAP(9, 11);
+ SWAP(12, 14);
+ SWAP(13, 15);
+ SWAP(1, 2);
+ SWAP(5, 6);
+ SWAP(0, 4);
+ SWAP(3, 7);
+ SWAP(9, 10);
+ SWAP(13, 14);
+ SWAP(8, 12);
+ SWAP(11, 15);
+ SWAP(1, 5);
+ SWAP(2, 6);
+ SWAP(9, 13);
+ SWAP(10, 14);
+ SWAP(0, 8);
+ SWAP(7, 15);
+ SWAP(1, 4);
+ SWAP(3, 6);
+ SWAP(9, 12);
+ SWAP(11, 14);
+ SWAP(2, 4);
+ SWAP(3, 5);
+ SWAP(10, 12);
+ SWAP(11, 13);
+ SWAP(1, 9);
+ SWAP(6, 14);
+ SWAP(3, 4);
+ SWAP(11, 12);
+ SWAP(1, 8);
+ SWAP(2, 10);
+ SWAP(5, 13);
+ SWAP(7, 14);
+ SWAP(3, 11);
+ SWAP(2, 8);
+ SWAP(4, 12);
+ SWAP(7, 13);
+ SWAP(3, 10);
+ SWAP(5, 12);
+ SWAP(3, 9);
+ SWAP(6, 12);
+ SWAP(3, 8);
+ SWAP(7, 12);
+ SWAP(5, 9);
+ SWAP(6, 10);
+ SWAP(4, 8);
+ SWAP(7, 11);
+ SWAP(5, 8);
+ SWAP(7, 10);
+ SWAP(6, 8);
+ SWAP(7, 9);
+ SWAP(7, 8);
+}
+
+/*!\brief Sorts a pair of length-8 key/value arrays in descending order of
+ * the keys.
+ *
+ * \param[in,out]   k   A length-8 array of floats that serves as the keys.
+ * \param[in,out]   v   A length-8 array of int32s that serves as the values.
+ */
+static AOM_INLINE void av1_sort_fi32_8(float k[], int32_t v[]) {
+ SWAP(0, 1);
+ SWAP(2, 3);
+ SWAP(4, 5);
+ SWAP(6, 7);
+ SWAP(0, 2);
+ SWAP(1, 3);
+ SWAP(4, 6);
+ SWAP(5, 7);
+ SWAP(1, 2);
+ SWAP(5, 6);
+ SWAP(0, 4);
+ SWAP(3, 7);
+ SWAP(1, 5);
+ SWAP(2, 6);
+ SWAP(1, 4);
+ SWAP(3, 6);
+ SWAP(2, 4);
+ SWAP(3, 5);
+ SWAP(3, 4);
+}
+#undef SWAP
+#endif // AOM_AV1_ENCODER_SORTING_NETWORK_H_
diff --git a/third_party/aom/av1/encoder/sparse_linear_solver.c b/third_party/aom/av1/encoder/sparse_linear_solver.c
new file mode 100644
index 0000000000..e47c78e148
--- /dev/null
+++ b/third_party/aom/av1/encoder/sparse_linear_solver.c
@@ -0,0 +1,472 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/sparse_linear_solver.h"
+#include "config/aom_config.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/alloccommon.h"
+
+#if CONFIG_OPTICAL_FLOW_API
+/*
+ * Input:
+ * rows: array of row positions
+ * cols: array of column positions
+ * values: array of element values
+ * num_elem: total number of elements in the matrix
+ * num_rows: number of rows in the matrix
+ * num_cols: number of columns in the matrix
+ *
+ * Output:
+ * sm: pointer to the sparse matrix to be initialized
+ *
+ * Return: 0 - success
+ * -1 - failed
+ */
+int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values,
+ int num_elem, int num_rows, int num_cols,
+ SPARSE_MTX *sm) {
+ sm->n_elem = num_elem;
+ sm->n_rows = num_rows;
+ sm->n_cols = num_cols;
+ if (num_elem == 0) {
+ sm->row_pos = NULL;
+ sm->col_pos = NULL;
+ sm->value = NULL;
+ return 0;
+ }
+ sm->row_pos = aom_calloc(num_elem, sizeof(*sm->row_pos));
+ sm->col_pos = aom_calloc(num_elem, sizeof(*sm->col_pos));
+ sm->value = aom_calloc(num_elem, sizeof(*sm->value));
+
+ if (!sm->row_pos || !sm->col_pos || !sm->value) {
+ av1_free_sparse_mtx_elems(sm);
+ return -1;
+ }
+
+ memcpy(sm->row_pos, rows, num_elem * sizeof(*sm->row_pos));
+ memcpy(sm->col_pos, cols, num_elem * sizeof(*sm->col_pos));
+ memcpy(sm->value, values, num_elem * sizeof(*sm->value));
+
+ return 0;
+}
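+
+/* Example with hypothetical values: the 2x2 matrix
+ *     | 4 1 |
+ *     | 0 3 |
+ * has three non-zero elements and could be initialized as:
+ *
+ *   int rows[] = { 0, 0, 1 };
+ *   int cols[] = { 0, 1, 1 };
+ *   double vals[] = { 4.0, 1.0, 3.0 };
+ *   SPARSE_MTX sm;
+ *   if (av1_init_sparse_mtx(rows, cols, vals, 3, 2, 2, &sm) == 0) {
+ *     ... use sm ...
+ *     av1_free_sparse_mtx_elems(&sm);
+ *   }
+ */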
+
+/*
+ * Combines two sparse matrices (allocating new space).
+ *
+ * Input:
+ * sm1, sm2: matrices to be combined
+ * row_offset1, row_offset2: row offset of each matrix in the new matrix
+ * col_offset1, col_offset2: column offset of each matrix in the new matrix
+ * new_n_rows, new_n_cols: number of rows and columns in the new matrix
+ *
+ * Output:
+ * sm: the combined matrix
+ *
+ * Return: 0 - success
+ * -1 - failed
+ */
+int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2,
+ SPARSE_MTX *sm, int row_offset1,
+ int col_offset1, int row_offset2,
+ int col_offset2, int new_n_rows,
+ int new_n_cols) {
+ sm->n_elem = sm1->n_elem + sm2->n_elem;
+ sm->n_cols = new_n_cols;
+ sm->n_rows = new_n_rows;
+
+ if (sm->n_elem == 0) {
+ sm->row_pos = NULL;
+ sm->col_pos = NULL;
+ sm->value = NULL;
+ return 0;
+ }
+
+ sm->row_pos = aom_calloc(sm->n_elem, sizeof(*sm->row_pos));
+ sm->col_pos = aom_calloc(sm->n_elem, sizeof(*sm->col_pos));
+ sm->value = aom_calloc(sm->n_elem, sizeof(*sm->value));
+
+ if (!sm->row_pos || !sm->col_pos || !sm->value) {
+ av1_free_sparse_mtx_elems(sm);
+ return -1;
+ }
+
+ for (int i = 0; i < sm1->n_elem; i++) {
+ sm->row_pos[i] = sm1->row_pos[i] + row_offset1;
+ sm->col_pos[i] = sm1->col_pos[i] + col_offset1;
+ }
+ memcpy(sm->value, sm1->value, sm1->n_elem * sizeof(*sm1->value));
+ int n_elem1 = sm1->n_elem;
+ for (int i = 0; i < sm2->n_elem; i++) {
+ sm->row_pos[n_elem1 + i] = sm2->row_pos[i] + row_offset2;
+ sm->col_pos[n_elem1 + i] = sm2->col_pos[i] + col_offset2;
+ }
+ memcpy(sm->value + n_elem1, sm2->value, sm2->n_elem * sizeof(*sm2->value));
+ return 0;
+}
+
+void av1_free_sparse_mtx_elems(SPARSE_MTX *sm) {
+ sm->n_cols = 0;
+ sm->n_rows = 0;
+ if (sm->n_elem != 0) {
+ aom_free(sm->row_pos);
+ aom_free(sm->col_pos);
+ aom_free(sm->value);
+ }
+ sm->n_elem = 0;
+}
+
+/*
+ * Calculate the matrix-vector product A*b
+ *
+ * Input:
+ *    sm: matrix A
+ *    srcv: the vector b to multiply by
+ *    dstl: the length of the result vector dstv
+ *
+ * Output:
+ *    dstv: pointer to the resulting vector
+ */
+void av1_mtx_vect_multi_right(const SPARSE_MTX *sm, const double *srcv,
+ double *dstv, int dstl) {
+ memset(dstv, 0, sizeof(*dstv) * dstl);
+ for (int i = 0; i < sm->n_elem; i++) {
+ dstv[sm->row_pos[i]] += srcv[sm->col_pos[i]] * sm->value[i];
+ }
+}
+/*
+ * Calculate the vector-matrix product b*A
+ *
+ * Input:
+ *    sm: matrix A
+ *    srcv: the vector b to multiply by
+ *    dstl: the length of the result vector dstv
+ *
+ * Output:
+ *    dstv: pointer to the resulting vector
+ */
+void av1_mtx_vect_multi_left(const SPARSE_MTX *sm, const double *srcv,
+ double *dstv, int dstl) {
+ memset(dstv, 0, sizeof(*dstv) * dstl);
+ for (int i = 0; i < sm->n_elem; i++) {
+ dstv[sm->col_pos[i]] += srcv[sm->row_pos[i]] * sm->value[i];
+ }
+}
+
+/*
+ * Calculate inner product of two vectors
+ *
+ * Input:
+ *    src1, src2: the vectors to be multiplied
+ * src1l: length of the vectors
+ *
+ * Output:
+ * the inner product
+ */
+double av1_vect_vect_multi(const double *src1, int src1l, const double *src2) {
+ double result = 0;
+ for (int i = 0; i < src1l; i++) {
+ result += src1[i] * src2[i];
+ }
+ return result;
+}
+
+/*
+ * Multiply each element in the matrix sm with a constant c
+ */
+void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c) {
+ for (int i = 0; i < sm->n_elem; i++) {
+ sm->value[i] *= c;
+ }
+}
+
+static INLINE void free_solver_local_buf(double *buf1, double *buf2,
+ double *buf3, double *buf4,
+ double *buf5, double *buf6,
+ double *buf7) {
+ aom_free(buf1);
+ aom_free(buf2);
+ aom_free(buf3);
+ aom_free(buf4);
+ aom_free(buf5);
+ aom_free(buf6);
+ aom_free(buf7);
+}
+
+/*
+ * Solve for Ax = b using the bi-conjugate gradient method;
+ * no special requirement on A
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0 - success
+ * -1 - failed
+ */
+int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b,
+ int bl, double *x) {
+ double *r = NULL, *r_hat = NULL, *p = NULL, *p_hat = NULL, *Ap = NULL,
+ *p_hatA = NULL, *x_hat = NULL;
+ double alpha, beta, rtr, r_norm_2;
+ double denormtemp;
+
+ // initialize
+ r = aom_calloc(bl, sizeof(*r));
+ r_hat = aom_calloc(bl, sizeof(*r_hat));
+ p = aom_calloc(bl, sizeof(*p));
+ p_hat = aom_calloc(bl, sizeof(*p_hat));
+ Ap = aom_calloc(bl, sizeof(*Ap));
+ p_hatA = aom_calloc(bl, sizeof(*p_hatA));
+ x_hat = aom_calloc(bl, sizeof(*x_hat));
+ if (!r || !r_hat || !p || !p_hat || !Ap || !p_hatA || !x_hat) {
+ free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat);
+ return -1;
+ }
+
+ int i;
+ for (i = 0; i < bl; i++) {
+ r[i] = b[i];
+ r_hat[i] = b[i];
+ p[i] = r[i];
+ p_hat[i] = r_hat[i];
+ x[i] = 0;
+ x_hat[i] = 0;
+ }
+ r_norm_2 = av1_vect_vect_multi(r_hat, bl, r);
+ for (int k = 0; k < MAX_CG_SP_ITER; k++) {
+ rtr = r_norm_2;
+ av1_mtx_vect_multi_right(A, p, Ap, bl);
+ av1_mtx_vect_multi_left(A, p_hat, p_hatA, bl);
+
+ denormtemp = av1_vect_vect_multi(p_hat, bl, Ap);
+ if (denormtemp < 1e-10) break;
+ alpha = rtr / denormtemp;
+ r_norm_2 = 0;
+ for (i = 0; i < bl; i++) {
+ x[i] += alpha * p[i];
+ x_hat[i] += alpha * p_hat[i];
+ r[i] -= alpha * Ap[i];
+ r_hat[i] -= alpha * p_hatA[i];
+ r_norm_2 += r_hat[i] * r[i];
+ }
+ if (sqrt(r_norm_2) < 1e-2) {
+ break;
+ }
+ if (rtr < 1e-10) break;
+ beta = r_norm_2 / rtr;
+ for (i = 0; i < bl; i++) {
+ p[i] = r[i] + beta * p[i];
+ p_hat[i] = r_hat[i] + beta * p_hat[i];
+ }
+ }
+ // free
+ free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat);
+ return 0;
+}
+
+/*
+ * Solve for Ax = b using the conjugate gradient method, when A is symmetric
+ * and positive definite
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0 - success
+ * -1 - failed
+ */
+int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl,
+ double *x) {
+ double *r = NULL, *p = NULL, *Ap = NULL;
+ double alpha, beta, rtr, r_norm_2;
+ double denormtemp;
+
+ // initialize
+ r = aom_calloc(bl, sizeof(*r));
+ p = aom_calloc(bl, sizeof(*p));
+ Ap = aom_calloc(bl, sizeof(*Ap));
+ if (!r || !p || !Ap) {
+ free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL);
+ return -1;
+ }
+
+ int i;
+ for (i = 0; i < bl; i++) {
+ r[i] = b[i];
+ p[i] = r[i];
+ x[i] = 0;
+ }
+ r_norm_2 = av1_vect_vect_multi(r, bl, r);
+ int k;
+ for (k = 0; k < MAX_CG_SP_ITER; k++) {
+ rtr = r_norm_2;
+ av1_mtx_vect_multi_right(A, p, Ap, bl);
+ denormtemp = av1_vect_vect_multi(p, bl, Ap);
+ if (denormtemp < 1e-10) break;
+ alpha = rtr / denormtemp;
+ r_norm_2 = 0;
+ for (i = 0; i < bl; i++) {
+ x[i] += alpha * p[i];
+ r[i] -= alpha * Ap[i];
+ r_norm_2 += r[i] * r[i];
+ }
+ if (r_norm_2 < 1e-8 * bl) break;
+ if (rtr < 1e-10) break;
+ beta = r_norm_2 / rtr;
+ for (i = 0; i < bl; i++) {
+ p[i] = r[i] + beta * p[i];
+ }
+ }
+ // free
+ free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL);
+
+ return 0;
+}
+
+/*
+ * Solve for Ax = b using the Jacobi method
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0 - success
+ * -1 - failed
+ */
+int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x) {
+ double *diags = NULL, *Rx = NULL, *x_last = NULL, *x_cur = NULL,
+ *tempx = NULL;
+ double resi2;
+
+ diags = aom_calloc(bl, sizeof(*diags));
+ Rx = aom_calloc(bl, sizeof(*Rx));
+ x_last = aom_calloc(bl, sizeof(*x_last));
+ x_cur = aom_calloc(bl, sizeof(*x_cur));
+
+ if (!diags || !Rx || !x_last || !x_cur) {
+ free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
+ return -1;
+ }
+
+ int i;
+ memset(x_last, 0, sizeof(*x_last) * bl);
+ // get the diagonals of A
+ memset(diags, 0, sizeof(*diags) * bl);
+ for (int c = 0; c < A->n_elem; c++) {
+ if (A->row_pos[c] != A->col_pos[c]) continue;
+ diags[A->row_pos[c]] = A->value[c];
+ }
+ int k;
+ for (k = 0; k < MAX_CG_SP_ITER; k++) {
+ // R = A - diag(diags)
+ // get R*x_last
+ memset(Rx, 0, sizeof(*Rx) * bl);
+ for (int c = 0; c < A->n_elem; c++) {
+ if (A->row_pos[c] == A->col_pos[c]) continue;
+ Rx[A->row_pos[c]] += x_last[A->col_pos[c]] * A->value[c];
+ }
+ resi2 = 0;
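+    // Jacobi update: x_{k+1} = D^{-1} * (b - R * x_k), where D = diag(A).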
+ for (i = 0; i < bl; i++) {
+ x_cur[i] = (b[i] - Rx[i]) / diags[i];
+ resi2 += (x_last[i] - x_cur[i]) * (x_last[i] - x_cur[i]);
+ }
+ if (resi2 <= 1e-10 * bl) break;
+ // swap last & cur buffer ptrs
+ tempx = x_last;
+ x_last = x_cur;
+ x_cur = tempx;
+ }
+ for (i = 0; i < bl; i++) {
+ x[i] = x_cur[i];
+ }
+ free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
+ return 0;
+}
+
+/*
+ * Solve for Ax = b using the steepest descent method
+ *
+ * Input:
+ * A: the sparse matrix
+ * b: the vector b
+ * bl: length of b
+ * x: the vector x
+ *
+ * Output:
+ * x: pointer to the solution vector
+ *
+ * Return: 0 - success
+ * -1 - failed
+ */
+int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl,
+ double *x) {
+ double *d = NULL, *Ad = NULL, *Ax = NULL;
+ double resi2, resi2_last, dAd, temp;
+
+ d = aom_calloc(bl, sizeof(*d));
+ Ax = aom_calloc(bl, sizeof(*Ax));
+ Ad = aom_calloc(bl, sizeof(*Ad));
+
+ if (!d || !Ax || !Ad) {
+ free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL);
+ return -1;
+ }
+
+ int i;
+ // initialize with 0s
+ resi2 = 0;
+ for (i = 0; i < bl; i++) {
+ x[i] = 0;
+ d[i] = b[i];
+ resi2 += d[i] * d[i] / bl;
+ }
+ int k;
+ for (k = 0; k < MAX_CG_SP_ITER; k++) {
+    // get A*d
+ av1_mtx_vect_multi_right(A, d, Ad, bl);
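+    // Step size alpha = (d^T d) / (d^T A d); resi2 holds (d^T d) / bl, and d
+    // is the current residual b - A*x, so this is the steepest-descent step.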
+ dAd = resi2 * bl / av1_vect_vect_multi(d, bl, Ad);
+ for (i = 0; i < bl; i++) {
+ temp = dAd * d[i];
+ x[i] = x[i] + temp;
+ }
+ av1_mtx_vect_multi_right(A, x, Ax, bl);
+ resi2_last = resi2;
+ resi2 = 0;
+ for (i = 0; i < bl; i++) {
+ d[i] = b[i] - Ax[i];
+ resi2 += d[i] * d[i] / bl;
+ }
+ if (resi2 <= 1e-8) break;
+ if (resi2_last - resi2 < 1e-8) {
+ break;
+ }
+ }
+ free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL);
+
+ return 0;
+}
+
+#endif // CONFIG_OPTICAL_FLOW_API
diff --git a/third_party/aom/av1/encoder/sparse_linear_solver.h b/third_party/aom/av1/encoder/sparse_linear_solver.h
new file mode 100644
index 0000000000..f30fc0f5b1
--- /dev/null
+++ b/third_party/aom/av1/encoder/sparse_linear_solver.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_
+#define AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+#if CONFIG_OPTICAL_FLOW_API
+
+// Maximum number of iterations when solving the linear system.
+#define MAX_CG_SP_ITER 100
+
+typedef struct {
+ int n_elem; // number of non-zero elements
+ int n_rows;
+ int n_cols;
+  // Non-zero elements stored in coordinate form: element i has value value[i]
+  // at row row_pos[i] and column col_pos[i]. Indices are 0-based.
+  int *col_pos;
+  int *row_pos;
+  double *value;
+} SPARSE_MTX;
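+
+// Example: the 2x3 matrix
+//   [ 0 5 0 ]
+//   [ 7 0 2 ]
+// is stored as n_elem = 3, n_rows = 2, n_cols = 3, row_pos = { 0, 1, 1 },
+// col_pos = { 1, 0, 2 }, value = { 5, 7, 2 }.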
+
+int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values,
+ int num_elem, int num_rows, int num_cols,
+ SPARSE_MTX *sm);
+int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2,
+ SPARSE_MTX *sm, int row_offset1,
+ int col_offset1, int row_offset2,
+ int col_offset2, int new_n_rows,
+ int new_n_cols);
+void av1_free_sparse_mtx_elems(SPARSE_MTX *sm);
+
+void av1_mtx_vect_multi_right(const SPARSE_MTX *sm, const double *srcv,
+ double *dstv, int dstl);
+void av1_mtx_vect_multi_left(const SPARSE_MTX *sm, const double *srcv,
+ double *dstv, int dstl);
+double av1_vect_vect_multi(const double *src1, int src1l, const double *src2);
+void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c);
+
+int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl,
+ double *x);
+int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b,
+ int bl, double *x);
+int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x);
+int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl,
+ double *x);
+
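+// Illustrative usage sketch (assuming the 0-on-success / -1-on-failure
+// convention of the solvers above): solving the symmetric positive definite
+// system [2 1; 1 3] * x = [3; 5] with the conjugate gradient solver.
+//
+//   int rows[] = { 0, 0, 1, 1 };
+//   int cols[] = { 0, 1, 0, 1 };
+//   double vals[] = { 2.0, 1.0, 1.0, 3.0 };
+//   double b[2] = { 3.0, 5.0 }, x[2];
+//   SPARSE_MTX A;
+//   if (av1_init_sparse_mtx(rows, cols, vals, 4, 2, 2, &A) == 0 &&
+//       av1_conjugate_gradient_sparse(&A, b, 2, x) == 0) {
+//     // x is approximately { 0.8, 1.4 }.
+//   }
+//   av1_free_sparse_mtx_elems(&A);
+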
+#endif // CONFIG_OPTICAL_FLOW_API
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /* AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ */
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
new file mode 100644
index 0000000000..a6c0971096
--- /dev/null
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -0,0 +1,2715 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/rdopt.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method
+// Max speed setting for tx domain evaluation
+#define MAX_TX_DOMAIN_EVAL_SPEED 5
+static MESH_PATTERN
+ good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+ { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ };
+
+// TODO(huisu@google.com): These settings are pretty relaxed, tune them for
+// each speed setting
+static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
+ { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+};
+
+// Threshold values used for pruning the txfm_domain_distortion based on
+// block MSE.
+// Index 0: Default mode evaluation, where winner mode processing is not
+// applicable (e.g., IntraBC). Index 1: Mode evaluation.
+// Index 2: Winner mode evaluation. Indices 1 and 2 apply when the
+// enable_winner_mode_for_use_tx_domain_dist speed feature is ON.
+// TODO(any): Experiment with threshold logic based on a variance metric.
+static unsigned int tx_domain_dist_thresholds[4][MODE_EVAL_TYPES] = {
+ { UINT_MAX, UINT_MAX, UINT_MAX },
+ { 22026, 22026, 22026 },
+ { 1377, 1377, 1377 },
+ { 0, 0, 0 }
+};
+
+// Number of different levels of aggressiveness in using transform domain
+// distortion during the R-D evaluation based on the speed feature
+// tx_domain_dist_level.
+#define TX_DOMAIN_DIST_LEVELS 4
+
+// Transform domain distortion type to be used for default, mode and winner
+// mode evaluation.
+// Index 0: Default mode evaluation, where winner mode processing is not
+// applicable (e.g., IntraBC). Index 1: Mode evaluation. Index 2: Winner mode
+// evaluation. Indices 1 and 2 apply when the
+// enable_winner_mode_for_use_tx_domain_dist speed feature is ON.
+static unsigned int
+ tx_domain_dist_types[TX_DOMAIN_DIST_LEVELS][MODE_EVAL_TYPES] = {
+ { 0, 2, 0 }, { 1, 2, 0 }, { 2, 2, 0 }, { 2, 2, 2 }
+ };
+
+// Threshold values to be used for disabling coeff RD-optimization
+// based on block MSE / qstep^2.
+// TODO(any): Experiment with threshold logic based on a variance metric.
+// Each entry holds a dist and a satd threshold: index 0: dist, index 1: satd.
+// For each row, the column indices are as follows.
+// Index 0: Default mode evaluation, where winner mode processing is not
+// applicable (e.g., IntraBC).
+// Index 1: Mode evaluation.
+// Index 2: Winner mode evaluation.
+// Indices 1 and 2 apply when the enable_winner_mode_for_coeff_opt speed
+// feature is ON.
+// There are 9 levels with increasing aggressiveness, mapping to the row
+// indices.
+static unsigned int coeff_opt_thresholds[9][MODE_EVAL_TYPES][2] = {
+ { { UINT_MAX, UINT_MAX }, { UINT_MAX, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+ { { 3200, UINT_MAX }, { 250, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+ { { 1728, UINT_MAX }, { 142, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+ { { 864, UINT_MAX }, { 142, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+ { { 432, UINT_MAX }, { 86, UINT_MAX }, { UINT_MAX, UINT_MAX } },
+ { { 864, 97 }, { 142, 16 }, { UINT_MAX, UINT_MAX } },
+ { { 432, 97 }, { 86, 16 }, { UINT_MAX, UINT_MAX } },
+ { { 216, 25 }, { 86, 10 }, { UINT_MAX, UINT_MAX } },
+ { { 216, 25 }, { 0, 10 }, { UINT_MAX, UINT_MAX } }
+};
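+
+// For example, coeff_opt_thresholds[5][1] == { 142, 16 }: at level 5, mode
+// evaluation uses a dist threshold of 142 and a satd threshold of 16 when
+// deciding whether to disable coeff RD-optimization.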
+
+// Transform size search method to be used for default, mode and winner mode
+// evaluation.
+// Index 0: Default mode evaluation, where winner mode processing is not
+// applicable (e.g., IntraBC). Index 1: Mode evaluation. Index 2: Winner mode
+// evaluation. Indices 1 and 2 apply when the
+// enable_winner_mode_for_tx_size_srch speed feature is ON.
+static TX_SIZE_SEARCH_METHOD tx_size_search_methods[4][MODE_EVAL_TYPES] = {
+ { USE_FULL_RD, USE_LARGESTALL, USE_FULL_RD },
+ { USE_FAST_RD, USE_LARGESTALL, USE_FULL_RD },
+ { USE_LARGESTALL, USE_LARGESTALL, USE_FULL_RD },
+ { USE_LARGESTALL, USE_LARGESTALL, USE_LARGESTALL }
+};
+
+// Predict transform skip levels to be used for default, mode and winner mode
+// evaluation. Index 0: Default mode evaluation, where winner mode processing
+// is not applicable. Index 1: Mode evaluation. Index 2: Winner mode
+// evaluation.
+// Values indicate the aggressiveness of skip flag prediction:
+// 0: no early skip prediction
+// 1: conservative early skip prediction using DCT_DCT
+// 2: early skip prediction based on SSE
+static unsigned int predict_skip_levels[3][MODE_EVAL_TYPES] = { { 0, 0, 0 },
+ { 1, 1, 1 },
+ { 1, 2, 1 } };
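+
+// For example, predict_skip_levels[2] == { 1, 2, 1 }: at level 2, default and
+// winner mode evaluation use the conservative DCT_DCT based prediction, while
+// mode evaluation uses the SSE based prediction.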
+
+// Predict skip or DC block level used during transform type search. It is
+// indexed using the following:
+// First index: Speed feature 'dc_blk_pred_level' (0 to 3)
+// Second index: Mode evaluation type (DEFAULT_EVAL, MODE_EVAL and
+// WINNER_MODE_EVAL).
+//
+// The values of predict_dc_levels[][] indicate the aggressiveness of
+// predicting a block as transform skip or DC only.
+// Type 0: no skip block or DC only block prediction
+// Type 1: prediction of a skip block based on residual mean and variance
+// Type 2: prediction of a skip block or DC only block based on residual mean
+// and variance
+static unsigned int predict_dc_levels[4][MODE_EVAL_TYPES] = {
+ { 0, 0, 0 }, { 1, 1, 0 }, { 2, 2, 0 }, { 2, 2, 2 }
+};
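+
+// For example, predict_dc_levels[2] == { 2, 2, 0 }: at dc_blk_pred_level 2,
+// Type 2 prediction is applied during default and mode evaluation, while no
+// prediction is done during winner mode evaluation.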
+
+#if !CONFIG_FPMT_TEST
+// This table holds the maximum number of reference frames for global motion.
+// The table is indexed as per the speed feature 'gm_search_type'.
+// 0 : All reference frames are allowed.
+// 1 : All reference frames except L2 and L3 are allowed.
+// 2 : All reference frames except L2, L3 and ARF2 are allowed.
+// 3 : No reference frame is allowed.
+static int gm_available_reference_frames[GM_DISABLE_SEARCH + 1] = {
+ INTER_REFS_PER_FRAME, INTER_REFS_PER_FRAME - 2, INTER_REFS_PER_FRAME - 3, 0
+};
+#endif  // !CONFIG_FPMT_TEST
+
+// Qindex threshold levels used for selecting full-pel motion search.
+// ms_qindex_thresh[i][j][k] indicates the qindex boundary value for the k'th
+// qindex band, for resolution index 'j' and aggressiveness level 'i'.
+// Aggressiveness increases from i = 0 to 2.
+// j = 0: lower than 720p resolution, j = 1: 720p or larger resolution.
+// Currently invoked only for speed 0, 1 and 2.
+static int ms_qindex_thresh[3][2][2] = { { { 200, 70 }, { MAXQ, 200 } },
+ { { 170, 50 }, { MAXQ, 200 } },
+ { { 170, 40 }, { 200, 40 } } };
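+
+// For example, ms_qindex_thresh[1][0] == { 170, 50 }: at aggressiveness level
+// 1 for resolutions below 720p, the qindex band boundaries are 170 and 50.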
+
+// Full-pel search methods for aggressive search based on qindex.
+// Index 0 is for resolutions lower than 720p, index 1 for 720p or larger
+// resolutions. Currently invoked only for speed 1 and 2.
+static SEARCH_METHODS motion_search_method[2] = { CLAMPED_DIAMOND, DIAMOND };
+
+// Intra-only frames, golden frames (except alt-ref overlays) and alt-ref
+// frames tend to be coded at a higher-than-ambient quality.
+static int frame_is_boosted(const AV1_COMP *cpi) {
+ return frame_is_kf_gf_arf(cpi);
+}
+
+// Set transform rd gate level for all transform search cases.
+static AOM_INLINE void set_txfm_rd_gate_level(
+ int txfm_rd_gate_level[TX_SEARCH_CASES], int level) {
+ assert(level <= MAX_TX_RD_GATE_LEVEL);
+ for (int idx = 0; idx < TX_SEARCH_CASES; idx++)
+ txfm_rd_gate_level[idx] = level;
+}
+
+static void set_allintra_speed_feature_framesize_dependent(
+ const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+ const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160;
+ const bool use_hbd = cpi->oxcf.use_highbitdepth;
+
+ if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+ if (is_720p_or_larger)
+ sf->part_sf.auto_max_partition_based_on_simple_motion = ADAPT_PRED;
+ else
+ sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 1;
+ }
+
+ if (is_4k_or_larger) {
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ }
+
+ // TODO(huisu@google.com): train models for 720P and above.
+ if (!is_720p_or_larger) {
+ sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->part_sf.ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64
+ sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ sf->part_sf.ml_early_term_after_part_split_level = 1;
+ }
+
+ if (is_720p_or_larger) {
+ // TODO(chiyotsai@google.com): make this speed feature adaptive based on
+    // the current block's vertical texture instead of hardcoding it by
+    // resolution.
+ sf->mv_sf.use_downsampled_sad = 2;
+ }
+
+ if (speed >= 1) {
+ if (is_720p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->part_sf.ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64
+ sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ }
+ sf->part_sf.ml_early_term_after_part_split_level = 2;
+ }
+
+ if (speed >= 2) {
+ if (is_720p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ }
+
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
+ sf->part_sf.partition_search_breakout_rate_thr = 120;
+ } else {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 22);
+ sf->part_sf.partition_search_breakout_rate_thr = 100;
+ }
+
+ if (is_480p_or_larger) {
+ sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1;
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 2;
+ } else {
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
+ }
+ }
+
+ if (speed >= 3) {
+ sf->part_sf.ml_early_term_after_part_split_level = 0;
+
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
+ sf->part_sf.partition_search_breakout_rate_thr = 200;
+ } else {
+ sf->part_sf.max_intra_bsize = BLOCK_32X32;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 23);
+ sf->part_sf.partition_search_breakout_rate_thr = 120;
+ }
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
+ }
+
+ if (speed >= 4) {
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
+ } else {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
+ }
+
+ if (is_480p_or_larger) {
+ sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2;
+ }
+ }
+
+ if (speed >= 6) {
+ if (is_720p_or_larger) {
+ sf->part_sf.auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+ }
+
+ if (is_1080p_or_larger) {
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ }
+
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16;
+ }
+
+ if (speed >= 7) {
+ // TODO(kyslov): add more speed features to control speed/quality
+ }
+
+ if (speed >= 8) {
+ if (!is_480p_or_larger) {
+ sf->rt_sf.nonrd_check_partition_merge_mode = 2;
+ }
+ if (is_720p_or_larger) {
+ sf->rt_sf.force_large_partition_blocks_intra = 1;
+ }
+ }
+
+ if (speed >= 9) {
+ // TODO(kyslov): add more speed features to control speed/quality
+ if (!is_4k_or_larger) {
+ // In av1_select_sb_size(), superblock size is set to 64x64 only for
+ // resolutions less than 4k in speed>=9, to improve the multithread
+ // performance. If cost update levels are set to INTERNAL_COST_UPD_OFF
+ // for resolutions >= 4k, the SB size setting can be modified for these
+ // resolutions as well.
+ sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_OFF;
+ sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_OFF;
+ }
+ }
+}
+
+static void set_allintra_speed_features_framesize_independent(
+ const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int allow_screen_content_tools =
+ cm->features.allow_screen_content_tools;
+ const int use_hbd = cpi->oxcf.use_highbitdepth;
+
+ sf->part_sf.less_rectangular_check_level = 1;
+ sf->part_sf.ml_prune_partition = 1;
+ sf->part_sf.prune_ext_partition_types_search_level = 1;
+ sf->part_sf.prune_part4_search = 2;
+ sf->part_sf.simple_motion_search_prune_rect = 1;
+ sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3;
+ sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
+ sf->part_sf.use_best_rd_for_pruning = 1;
+
+ sf->intra_sf.intra_pruning_with_hog = 1;
+ sf->intra_sf.prune_luma_palette_size_search_level = 1;
+ sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF;
+ sf->intra_sf.early_term_chroma_palette_size_search = 1;
+
+ sf->tx_sf.adaptive_txb_search_level = 1;
+ sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 1;
+ sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
+
+ sf->rt_sf.use_nonrd_pick_mode = 0;
+ sf->rt_sf.use_real_time_ref_set = 0;
+
+ if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION ||
+ cpi->use_screen_content_tools) {
+ sf->mv_sf.exhaustive_searches_thresh = (1 << 20);
+ } else {
+ sf->mv_sf.exhaustive_searches_thresh = (1 << 25);
+ }
+
+ sf->rd_sf.perform_coeff_opt = 1;
+ sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL;
+
+ if (speed >= 1) {
+ sf->part_sf.intra_cnn_based_part_prune_level =
+ allow_screen_content_tools ? 0 : 2;
+ sf->part_sf.simple_motion_search_early_term_none = 1;
+    // TODO(Venkat): Clean up the frame type dependency for
+    // simple_motion_search_split in the partition search function and set the
+    // speed feature accordingly.
+ sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2;
+ sf->part_sf.ml_predict_breakout_level = use_hbd ? 2 : 3;
+ sf->part_sf.reuse_best_prediction_for_part_ab = 1;
+
+ sf->mv_sf.exhaustive_searches_thresh <<= 1;
+
+ sf->intra_sf.prune_palette_search_level = 1;
+ sf->intra_sf.prune_luma_palette_size_search_level = 2;
+ sf->intra_sf.top_intra_model_count_allowed = 3;
+
+ sf->tx_sf.adaptive_txb_search_level = 2;
+ sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 0;
+ sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
+ sf->tx_sf.tx_type_search.skip_tx_search = 1;
+
+ sf->rd_sf.perform_coeff_opt = 2;
+ sf->rd_sf.tx_domain_dist_level = 1;
+ sf->rd_sf.tx_domain_dist_thres_level = 1;
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1;
+ sf->lpf_sf.dual_sgr_penalty_level = 1;
+ sf->lpf_sf.enable_sgr_ep_pruning = 1;
+ }
+
+ if (speed >= 2) {
+ sf->mv_sf.auto_mv_step_size = 1;
+
+ sf->intra_sf.disable_smooth_intra = 1;
+ sf->intra_sf.intra_pruning_with_hog = 2;
+ sf->intra_sf.prune_filter_intra_level = 1;
+
+ sf->rd_sf.perform_coeff_opt = 3;
+
+ sf->lpf_sf.prune_wiener_based_on_src_var = 1;
+ sf->lpf_sf.prune_sgr_based_on_wiener = 1;
+ }
+
+ if (speed >= 3) {
+ sf->hl_sf.high_precision_mv_usage = CURRENT_Q;
+ sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
+
+ sf->part_sf.less_rectangular_check_level = 2;
+ sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL1;
+ sf->part_sf.prune_ext_part_using_split_info = 1;
+
+ sf->mv_sf.full_pixel_search_level = 1;
+ sf->mv_sf.search_method = DIAMOND;
+
+ // TODO(chiyotsai@google.com): the thresholds chosen for intra hog are
+ // inherited directly from luma hog with some minor tweaking. Eventually we
+    // should run this with a Bayesian optimizer to find the Pareto frontier.
+ sf->intra_sf.chroma_intra_pruning_with_hog = 2;
+ sf->intra_sf.intra_pruning_with_hog = 3;
+ sf->intra_sf.prune_palette_search_level = 2;
+
+ sf->tx_sf.adaptive_txb_search_level = 2;
+ sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
+ sf->tx_sf.use_rd_based_breakout_for_intra_tx_search = true;
+
+ // TODO(any): evaluate if these lpf features can be moved to speed 2.
+    // For screen content, "prune_sgr_based_on_wiener = 2" causes a large
+    // quality loss.
+ sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 1 : 2;
+ sf->lpf_sf.disable_loop_restoration_chroma = 0;
+ sf->lpf_sf.reduce_wiener_window_size = 1;
+ sf->lpf_sf.prune_wiener_based_on_src_var = 2;
+ }
+
+ if (speed >= 4) {
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+
+ sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL2;
+ sf->part_sf.simple_motion_search_reduce_search_steps = 4;
+ sf->part_sf.prune_ext_part_using_split_info = 2;
+ sf->part_sf.early_term_after_none_split = 1;
+ sf->part_sf.ml_predict_breakout_level = 3;
+
+ sf->intra_sf.prune_chroma_modes_using_luma_winner = 1;
+
+ sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL;
+
+ sf->tpl_sf.prune_starting_mv = 2;
+ sf->tpl_sf.subpel_force_stop = HALF_PEL;
+ sf->tpl_sf.search_method = FAST_BIGDIA;
+
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2;
+ sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
+ sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1;
+
+ sf->rd_sf.perform_coeff_opt = 5;
+ sf->rd_sf.tx_domain_dist_thres_level = 3;
+
+ sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL;
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL3;
+
+ sf->mv_sf.reduce_search_range = 1;
+
+ sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1;
+ sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1;
+ sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_DEFAULT;
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
+ }
+
+ if (speed >= 5) {
+ sf->part_sf.simple_motion_search_prune_agg = SIMPLE_AGG_LVL3;
+ sf->part_sf.ext_partition_eval_thresh =
+ allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16;
+ sf->part_sf.intra_cnn_based_part_prune_level =
+ allow_screen_content_tools ? 1 : 2;
+
+ sf->intra_sf.chroma_intra_pruning_with_hog = 3;
+
+ sf->lpf_sf.use_coarse_filter_level_search = 0;
+ // Disable Wiener and Self-guided Loop restoration filters.
+ sf->lpf_sf.disable_wiener_filter = true;
+ sf->lpf_sf.disable_sgr_filter = true;
+
+ sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_2;
+
+ sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_FAST;
+ }
+
+ if (speed >= 6) {
+ sf->intra_sf.prune_smooth_intra_mode_for_chroma = 1;
+ sf->intra_sf.prune_filter_intra_level = 2;
+ sf->intra_sf.chroma_intra_pruning_with_hog = 4;
+ sf->intra_sf.intra_pruning_with_hog = 4;
+ sf->intra_sf.cfl_search_range = 1;
+ sf->intra_sf.top_intra_model_count_allowed = 2;
+ sf->intra_sf.adapt_top_model_rd_count_using_neighbors = 1;
+ sf->intra_sf.prune_luma_odd_delta_angles_in_intra = 1;
+
+ sf->part_sf.prune_rectangular_split_based_on_qidx =
+ allow_screen_content_tools ? 0 : 2;
+ sf->part_sf.prune_rect_part_using_4x4_var_deviation = true;
+ sf->part_sf.prune_rect_part_using_none_pred_mode = true;
+ sf->part_sf.prune_sub_8x8_partition_level =
+ allow_screen_content_tools ? 0 : 1;
+ sf->part_sf.prune_part4_search = 3;
+ // TODO(jingning): This might not be a good trade off if the
+ // target image quality is very low.
+ sf->part_sf.default_max_partition_size = BLOCK_32X32;
+
+ sf->mv_sf.use_bsize_dependent_search_method = 1;
+
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 3;
+ sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 0;
+ sf->tx_sf.prune_intra_tx_depths_using_nn = true;
+
+ sf->rd_sf.perform_coeff_opt = 6;
+ sf->rd_sf.tx_domain_dist_level = 3;
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
+ sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+
+ sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF;
+ sf->winner_mode_sf.prune_winner_mode_eval_level = 1;
+ sf->winner_mode_sf.dc_blk_pred_level = 1;
+ }
+  // The following should make all-intra mode speed 7 approximately equal to
+  // real-time speed 6, all-intra speed 8 close to real-time speed 7, and
+  // all-intra speed 9 close to real-time speed 8.
+ if (speed >= 7) {
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
+ sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+ sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+ sf->rt_sf.var_part_split_threshold_shift = 7;
+ }
+
+ if (speed >= 8) {
+ sf->rt_sf.hybrid_intra_pickmode = 1;
+ sf->rt_sf.use_nonrd_pick_mode = 1;
+ sf->rt_sf.nonrd_check_partition_merge_mode = 1;
+ sf->rt_sf.var_part_split_threshold_shift = 8;
+ // Set mask for intra modes.
+ for (int i = 0; i < BLOCK_SIZES; ++i)
+ if (i >= BLOCK_32X32)
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+ else
+ // Use DC, H, V intra mode for block sizes < 32X32.
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
+ }
+
+ if (speed >= 9) {
+ sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+
+ sf->rt_sf.nonrd_check_partition_merge_mode = 0;
+ sf->rt_sf.hybrid_intra_pickmode = 0;
+ sf->rt_sf.var_part_split_threshold_shift = 9;
+ sf->rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var = true;
+ sf->rt_sf.prune_h_pred_using_best_mode_so_far = true;
+ sf->rt_sf.enable_intra_mode_pruning_using_neighbors = true;
+ sf->rt_sf.prune_intra_mode_using_best_sad_so_far = true;
+ }
+
+ // As the speed feature prune_chroma_modes_using_luma_winner already
+ // constrains the number of chroma directional mode evaluations to a maximum
+  // of 1, the HOG computation and the associated pruning logic do not seem to
+  // help speed up the chroma mode evaluations. Hence disable the speed feature
+ // chroma_intra_pruning_with_hog when prune_chroma_modes_using_luma_winner is
+ // enabled.
+ if (sf->intra_sf.prune_chroma_modes_using_luma_winner)
+ sf->intra_sf.chroma_intra_pruning_with_hog = 0;
+}
+
+static void set_good_speed_feature_framesize_dependent(
+ const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+ const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160;
+ const bool use_hbd = cpi->oxcf.use_highbitdepth;
+ // Speed features applicable for temporal filtering and tpl modules may be
+  // changed based on frame type at places where the sf is applied (example:
+  // use_downsampled_sad). This is because temporal filtering and tpl modules
+ // are called before this function (except for the first key frame).
+ // TODO(deepa.kg@ittiam.com): For the speed features applicable to temporal
+ // filtering and tpl modules, modify the sf initialization appropriately
+ // before calling the modules.
+ const int boosted = frame_is_boosted(cpi);
+ const int is_boosted_arf2_bwd_type =
+ boosted ||
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+ const int is_lf_frame =
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == LF_UPDATE;
+ const int allow_screen_content_tools =
+ cm->features.allow_screen_content_tools;
+
+ if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+ if (is_720p_or_larger)
+ sf->part_sf.auto_max_partition_based_on_simple_motion = ADAPT_PRED;
+ else
+ sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 1;
+ }
+
+ if (is_4k_or_larger) {
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ }
+
+ // TODO(huisu@google.com): train models for 720P and above.
+ if (!is_720p_or_larger) {
+ sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->part_sf.ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64
+ sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ sf->part_sf.ml_early_term_after_part_split_level = 1;
+ }
+
+ if (is_720p_or_larger) {
+ // TODO(chiyotsai@google.com): make this speed feature adaptive based on
+    // the current block's vertical texture instead of hardcoding it by
+    // resolution.
+ sf->mv_sf.use_downsampled_sad = 2;
+ }
+
+ if (!is_720p_or_larger) {
+ const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+ const int rate_tolerance =
+ AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct);
+ sf->hl_sf.recode_tolerance = 25 + (rate_tolerance >> 2);
+ }
+
+ if (speed >= 1) {
+ if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 1;
+
+ if (is_720p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->part_sf.ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64
+ sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ }
+ sf->part_sf.ml_early_term_after_part_split_level = 2;
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1;
+ }
+
+ if (speed >= 2) {
+ if (is_720p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ }
+
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
+ sf->part_sf.partition_search_breakout_rate_thr = 120;
+ } else {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 22);
+ sf->part_sf.partition_search_breakout_rate_thr = 100;
+ }
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.prune_obmc_prob_thresh = 16;
+ } else {
+ sf->inter_sf.prune_obmc_prob_thresh = 8;
+ }
+
+ if (is_480p_or_larger) {
+ sf->inter_sf.disable_interintra_wedge_var_thresh = 100;
+ } else {
+ sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
+ }
+
+ if (is_480p_or_lesser) sf->inter_sf.skip_ext_comp_nearmv_mode = 1;
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 1 : 0;
+ } else {
+ sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 2 : 0;
+ }
+
+ if (is_480p_or_larger) {
+ sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1;
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 2;
+ } else {
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = boosted ? 0 : 1;
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = boosted ? 0 : 1;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->mv_sf.disable_second_mv = 1;
+ sf->mv_sf.auto_mv_step_size = 2;
+ } else {
+ sf->mv_sf.disable_second_mv = boosted ? 0 : 2;
+ sf->mv_sf.auto_mv_step_size = 1;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->hl_sf.recode_tolerance = 50;
+ sf->inter_sf.disable_interinter_wedge_newmv_search =
+ is_boosted_arf2_bwd_type ? 0 : 1;
+ sf->inter_sf.enable_fast_wedge_mask_search = 1;
+ }
+ }
+
+ if (speed >= 3) {
+ sf->inter_sf.enable_fast_wedge_mask_search = 1;
+ sf->inter_sf.skip_newmv_in_drl = 2;
+ sf->inter_sf.skip_ext_comp_nearmv_mode = 1;
+ sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 3 : 0;
+ sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1;
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1;
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch =
+ frame_is_intra_only(&cpi->common) ? 0 : 1;
+
+ sf->part_sf.ml_early_term_after_part_split_level = 0;
+
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
+ sf->part_sf.partition_search_breakout_rate_thr = 200;
+ sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 2 : 0;
+ } else {
+ sf->part_sf.max_intra_bsize = BLOCK_32X32;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 23);
+ sf->part_sf.partition_search_breakout_rate_thr = 120;
+ sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 1 : 0;
+ }
+ if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
+
+ if (is_480p_or_larger) {
+ sf->part_sf.early_term_after_none_split = 1;
+ } else {
+ sf->part_sf.early_term_after_none_split = 0;
+ }
+ if (is_720p_or_larger) {
+ sf->intra_sf.skip_intra_in_interframe = boosted ? 1 : 2;
+ } else {
+ sf->intra_sf.skip_intra_in_interframe = boosted ? 1 : 3;
+ }
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
+ sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 1;
+ } else {
+ sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;
+ sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 2;
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL2;
+ }
+
+ sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
+ }
+
+ if (speed >= 4) {
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2;
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
+ if (is_720p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
+ } else {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
+ }
+ sf->part_sf.early_term_after_none_split = 1;
+
+ if (is_480p_or_larger) {
+ sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2;
+ } else {
+ sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 1;
+ }
+
+ sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;
+ sf->inter_sf.prune_obmc_prob_thresh = INT_MAX;
+ sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 2;
+ if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 3;
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.prune_comp_ref_frames = 1;
+ } else if (is_480p_or_larger) {
+ sf->inter_sf.prune_comp_ref_frames = is_boosted_arf2_bwd_type ? 0 : 1;
+ }
+
+ if (is_720p_or_larger)
+ sf->hl_sf.recode_tolerance = 32;
+ else
+ sf->hl_sf.recode_tolerance = 55;
+
+ sf->intra_sf.skip_intra_in_interframe = 4;
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL3;
+ }
+
+ if (speed >= 5) {
+ if (is_720p_or_larger) {
+ sf->inter_sf.prune_warped_prob_thresh = 16;
+ } else if (is_480p_or_larger) {
+ sf->inter_sf.prune_warped_prob_thresh = 8;
+ }
+ if (is_720p_or_larger) sf->hl_sf.recode_tolerance = 40;
+
+ sf->inter_sf.skip_newmv_in_drl = 4;
+ sf->inter_sf.prune_comp_ref_frames = 1;
+ sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 1;
+
+ if (!is_720p_or_larger) {
+ sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW_SET;
+ sf->inter_sf.prune_nearest_near_mv_using_refmv_weight =
+ (boosted || allow_screen_content_tools) ? 0 : 1;
+ sf->mv_sf.use_downsampled_sad = 1;
+ }
+
+ if (!is_480p_or_larger) {
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
+ }
+
+ if (is_480p_or_lesser) {
+ sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL1;
+ } else {
+ sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL2;
+ }
+
+ if (is_720p_or_larger)
+ sf->part_sf.ext_part_eval_based_on_cur_best =
+ (allow_screen_content_tools || frame_is_intra_only(cm)) ? 0 : 1;
+
+ if (is_480p_or_larger) {
+ sf->tpl_sf.reduce_num_frames = 1;
+ }
+ }
+
+ if (speed >= 6) {
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4;
+ sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL3;
+ sf->inter_sf.prune_comp_ref_frames = 2;
+ sf->inter_sf.prune_nearest_near_mv_using_refmv_weight =
+ (boosted || allow_screen_content_tools) ? 0 : 1;
+ sf->mv_sf.skip_fullpel_search_using_startmv = boosted ? 0 : 2;
+
+ if (is_720p_or_larger) {
+ sf->part_sf.auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+ } else if (is_480p_or_larger) {
+ sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED;
+ }
+
+ if (is_480p_or_larger) {
+ sf->hl_sf.allow_sub_blk_me_in_tf = 1;
+ }
+
+ if (is_1080p_or_larger) {
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ }
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.disable_masked_comp = 1;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ }
+
+ if (is_720p_or_larger) {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 28);
+ } else {
+ sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 26);
+ }
+
+ if (is_720p_or_larger) {
+ sf->inter_sf.prune_ref_mv_idx_search = 2;
+ } else {
+ sf->inter_sf.prune_ref_mv_idx_search = 1;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh =
+ is_boosted_arf2_bwd_type ? 450 : 150;
+ }
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
+
+ sf->hl_sf.recode_tolerance = 55;
+ }
+}
+
+static void set_good_speed_features_framesize_independent(
+ const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int boosted = frame_is_boosted(cpi);
+ const int is_boosted_arf2_bwd_type =
+ boosted || gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+ const int is_inter_frame =
+ gf_group->frame_type[cpi->gf_frame_index] == INTER_FRAME;
+ const int allow_screen_content_tools =
+ cm->features.allow_screen_content_tools;
+ const int use_hbd = cpi->oxcf.use_highbitdepth;
+ if (!cpi->oxcf.tile_cfg.enable_large_scale_tile) {
+ sf->hl_sf.high_precision_mv_usage = LAST_MV_DATA;
+ }
+
+  // Speed 0: speed features that give a neutral coding performance change.
+ sf->gm_sf.gm_search_type = boosted ? GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2
+ : GM_SEARCH_CLOSEST_REFS_ONLY;
+ sf->gm_sf.prune_ref_frame_for_gm_search = boosted ? 0 : 1;
+ sf->gm_sf.disable_gm_search_based_on_stats = 1;
+
+ sf->part_sf.less_rectangular_check_level = 1;
+ sf->part_sf.ml_prune_partition = 1;
+ sf->part_sf.prune_ext_partition_types_search_level = 1;
+ sf->part_sf.prune_part4_search = 2;
+ sf->part_sf.simple_motion_search_prune_rect = 1;
+ sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3;
+ sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
+ sf->part_sf.use_best_rd_for_pruning = 1;
+ sf->part_sf.simple_motion_search_prune_agg =
+ allow_screen_content_tools ? NO_PRUNING : SIMPLE_AGG_LVL0;
+
+ // TODO(debargha): Test, tweak and turn on either 1 or 2
+ sf->inter_sf.inter_mode_rd_model_estimation = 1;
+ sf->inter_sf.model_based_post_interp_filter_breakout = 1;
+ sf->inter_sf.prune_compound_using_single_ref = 1;
+ sf->inter_sf.prune_mode_search_simple_translation = 1;
+ sf->inter_sf.prune_ref_frame_for_rect_partitions =
+ (boosted || (allow_screen_content_tools))
+ ? 0
+ : (is_boosted_arf2_bwd_type ? 1 : 2);
+ sf->inter_sf.reduce_inter_modes = boosted ? 1 : 2;
+ sf->inter_sf.selective_ref_frame = 1;
+ sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
+
+ sf->interp_sf.use_fast_interpolation_filter_search = 1;
+
+ sf->intra_sf.intra_pruning_with_hog = 1;
+
+ sf->tx_sf.adaptive_txb_search_level = 1;
+ sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 1;
+ sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
+
+ sf->tpl_sf.search_method = NSTEP_8PT;
+
+ sf->rt_sf.use_nonrd_pick_mode = 0;
+ sf->rt_sf.use_real_time_ref_set = 0;
+
+ if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION ||
+ cpi->use_screen_content_tools) {
+ sf->mv_sf.exhaustive_searches_thresh = (1 << 20);
+ } else {
+ sf->mv_sf.exhaustive_searches_thresh = (1 << 25);
+ }
+
+ sf->rd_sf.perform_coeff_opt = 1;
+ sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL;
+
+ if (speed >= 1) {
+ sf->hl_sf.adjust_num_frames_for_arf_filtering =
+ allow_screen_content_tools ? 0 : 1;
+
+ sf->part_sf.intra_cnn_based_part_prune_level =
+ allow_screen_content_tools ? 0 : 2;
+ sf->part_sf.simple_motion_search_early_term_none = 1;
+    // TODO(Venkat): Clean up the frame type dependency for
+    // simple_motion_search_split in the partition search function and set the
+    // speed feature accordingly.
+ sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 1 : 2;
+ sf->part_sf.ml_predict_breakout_level = use_hbd ? 2 : 3;
+
+ sf->mv_sf.exhaustive_searches_thresh <<= 1;
+ sf->mv_sf.obmc_full_pixel_search_level = 1;
+ sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS;
+ sf->mv_sf.disable_extensive_joint_motion_search = 1;
+
+ sf->inter_sf.prune_comp_search_by_single_result = boosted ? 2 : 1;
+ sf->inter_sf.prune_comp_type_by_comp_avg = 1;
+ sf->inter_sf.prune_comp_type_by_model_rd = boosted ? 0 : 1;
+ sf->inter_sf.prune_ref_frame_for_rect_partitions =
+ (frame_is_intra_only(&cpi->common) || (allow_screen_content_tools))
+ ? 0
+ : (boosted ? 1 : 2);
+ sf->inter_sf.reduce_inter_modes = boosted ? 1 : 3;
+ sf->inter_sf.reuse_inter_intra_mode = 1;
+ sf->inter_sf.selective_ref_frame = 2;
+ sf->inter_sf.skip_arf_compound = 1;
+
+ sf->interp_sf.use_interp_filter = 1;
+
+ sf->intra_sf.prune_palette_search_level = 1;
+
+ sf->tx_sf.adaptive_txb_search_level = 2;
+ sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 0;
+ sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
+ sf->tx_sf.tx_type_search.skip_tx_search = 1;
+
+ sf->rd_sf.perform_coeff_opt = boosted ? 2 : 3;
+ sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2;
+ sf->rd_sf.tx_domain_dist_thres_level = 1;
+
+ sf->lpf_sf.dual_sgr_penalty_level = 1;
+ sf->lpf_sf.enable_sgr_ep_pruning = 1;
+
+ // TODO(any, yunqing): move this feature to speed 0.
+ sf->tpl_sf.skip_alike_starting_mv = 1;
+ }
+
+ if (speed >= 2) {
+ sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
+
+ sf->fp_sf.skip_motion_search_threshold = 25;
+
+ sf->gm_sf.num_refinement_steps = 2;
+
+ sf->part_sf.reuse_best_prediction_for_part_ab =
+ !frame_is_intra_only(&cpi->common);
+
+ sf->mv_sf.simple_motion_subpel_force_stop = QUARTER_PEL;
+ sf->mv_sf.subpel_iters_per_step = 1;
+ sf->mv_sf.reduce_search_range = 1;
+
+ // TODO(chiyotsai@google.com): We can get 10% speed up if we move
+ // adaptive_rd_thresh to speed 1. But currently it performs poorly on some
+ // clips (e.g. 5% loss on dinner_1080p). We need to examine the sequence a
+ // bit more closely to figure out why.
+ sf->inter_sf.adaptive_rd_thresh = 1;
+ sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
+ sf->inter_sf.fast_interintra_wedge_search = 1;
+ sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 1;
+ sf->inter_sf.prune_ext_comp_using_neighbors = 1;
+ sf->inter_sf.prune_comp_using_best_single_mode_ref = 2;
+ sf->inter_sf.prune_comp_type_by_comp_avg = 2;
+ sf->inter_sf.selective_ref_frame = 3;
+ sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
+ sf->inter_sf.enable_fast_compound_mode_search = 1;
+ sf->inter_sf.reuse_mask_search_results = 1;
+ set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level, boosted ? 0 : 1);
+ sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 1;
+ sf->inter_sf.alt_ref_search_fp = 1;
+
+ sf->interp_sf.adaptive_interp_filter_search = 1;
+ sf->interp_sf.disable_dual_filter = 1;
+
+ sf->intra_sf.disable_smooth_intra =
+ !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key > 1);
+ sf->intra_sf.intra_pruning_with_hog = 2;
+ sf->intra_sf.skip_intra_in_interframe = is_inter_frame ? 2 : 1;
+ sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
+
+ sf->tpl_sf.prune_starting_mv = 1;
+ sf->tpl_sf.search_method = DIAMOND;
+
+ sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 3 : 4;
+ sf->rd_sf.use_mb_rd_hash = 1;
+
+ sf->lpf_sf.prune_wiener_based_on_src_var = 1;
+ sf->lpf_sf.prune_sgr_based_on_wiener = 1;
+ sf->lpf_sf.disable_loop_restoration_chroma = boosted ? 0 : 1;
+ sf->lpf_sf.reduce_wiener_window_size = boosted ? 0 : 1;
+
+    // TODO(any): Re-evaluate setting this feature to 1 for speed 2.
+ sf->tpl_sf.allow_compound_pred = 0;
+ sf->tpl_sf.prune_ref_frames_in_tpl = 1;
+ }
+
+ if (speed >= 3) {
+ sf->hl_sf.high_precision_mv_usage = CURRENT_Q;
+
+ sf->gm_sf.prune_ref_frame_for_gm_search = 1;
+ sf->gm_sf.prune_zero_mv_with_sse = 1;
+ sf->gm_sf.num_refinement_steps = 0;
+
+ sf->part_sf.less_rectangular_check_level = 2;
+ sf->part_sf.simple_motion_search_prune_agg =
+ allow_screen_content_tools
+ ? SIMPLE_AGG_LVL0
+ : (boosted ? SIMPLE_AGG_LVL1 : QIDX_BASED_AGG_LVL1);
+ sf->part_sf.prune_ext_part_using_split_info = 1;
+ sf->part_sf.simple_motion_search_rect_split = 1;
+
+ sf->mv_sf.full_pixel_search_level = 1;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
+ sf->mv_sf.search_method = DIAMOND;
+ sf->mv_sf.disable_second_mv = 2;
+ sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_1;
+ sf->mv_sf.use_intrabc = 0;
+
+ sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1;
+ sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ sf->inter_sf.disable_onesided_comp = 1;
+ sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
+ // TODO(any): Experiment with the early exit mechanism for speeds 0, 1 and 2
+    // and clean up the speed feature.
+ sf->inter_sf.perform_best_rd_based_gating_for_chroma = 1;
+ sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 1;
+ sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 2;
+ sf->inter_sf.selective_ref_frame = 5;
+ sf->inter_sf.reuse_compound_type_decision = 1;
+ set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level,
+ boosted ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2));
+ sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 2;
+
+ sf->interp_sf.adaptive_interp_filter_search = 2;
+
+ // TODO(chiyotsai@google.com): the thresholds chosen for intra hog are
+ // inherited directly from luma hog with some minor tweaking. Eventually we
+    // should run this with a Bayesian optimizer to find the Pareto frontier.
+ sf->intra_sf.chroma_intra_pruning_with_hog = 2;
+ sf->intra_sf.intra_pruning_with_hog = 3;
+ sf->intra_sf.prune_palette_search_level = 2;
+ sf->intra_sf.top_intra_model_count_allowed = 2;
+
+ sf->tpl_sf.prune_starting_mv = 2;
+ sf->tpl_sf.skip_alike_starting_mv = 2;
+ sf->tpl_sf.prune_intra_modes = 1;
+ sf->tpl_sf.reduce_first_step_size = 6;
+ sf->tpl_sf.subpel_force_stop = QUARTER_PEL;
+ sf->tpl_sf.gop_length_decision_method = 1;
+
+ sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3;
+ sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
+
+    // TODO(any): Refactor the code related to the following winner mode speed
+    // features.
+ sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1;
+ sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1;
+ sf->winner_mode_sf.motion_mode_for_winner_cand =
+ boosted ? 0
+ : gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE ? 1
+ : 2;
+ sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 4;
+
+    // For screen content, "prune_sgr_based_on_wiener = 2" causes a large
+    // quality loss.
+ sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 1 : 2;
+ sf->lpf_sf.prune_wiener_based_on_src_var = 2;
+ sf->lpf_sf.use_coarse_filter_level_search =
+ frame_is_intra_only(&cpi->common) ? 0 : 1;
+ sf->lpf_sf.use_downsampled_wiener_stats = 1;
+ }
+
+ if (speed >= 4) {
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+
+ sf->gm_sf.prune_zero_mv_with_sse = 2;
+
+ sf->part_sf.simple_motion_search_prune_agg =
+ allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL2;
+ sf->part_sf.simple_motion_search_reduce_search_steps = 4;
+ sf->part_sf.prune_ext_part_using_split_info = 2;
+ sf->part_sf.ml_predict_breakout_level = 3;
+ sf->part_sf.prune_rectangular_split_based_on_qidx =
+ (allow_screen_content_tools || frame_is_intra_only(&cpi->common)) ? 0
+ : 1;
+
+ sf->inter_sf.alt_ref_search_fp = 2;
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_DEFAULT] = boosted ? 0 : 3;
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_MOTION_MODE] = boosted ? 0 : 5;
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE] = boosted ? 0 : 3;
+
+ sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 2;
+ sf->inter_sf.prune_ext_comp_using_neighbors = 2;
+ sf->inter_sf.prune_obmc_prob_thresh = INT_MAX;
+ sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;
+
+ sf->interp_sf.cb_pred_filter_search = 1;
+ sf->interp_sf.skip_sharp_interp_filter_search = 1;
+ sf->interp_sf.use_interp_filter = 2;
+
+ sf->intra_sf.intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
+ sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
+ sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
+ // TODO(any): "intra_y_mode_mask" doesn't help much at speed 4.
+ // sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ // sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ // sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
+ sf->intra_sf.skip_intra_in_interframe = 4;
+
+ sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL;
+ sf->mv_sf.prune_mesh_search = PRUNE_MESH_SEARCH_LVL_2;
+
+ sf->tpl_sf.subpel_force_stop = HALF_PEL;
+ sf->tpl_sf.search_method = FAST_BIGDIA;
+ sf->tpl_sf.use_sad_for_mode_decision = 1;
+
+ sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
+
+ sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 5 : 7;
+
+ // TODO(any): Extend multi-winner mode processing support for inter frames
+ sf->winner_mode_sf.multi_winner_mode_type =
+ frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_DEFAULT
+ : MULTI_WINNER_MODE_OFF;
+ sf->winner_mode_sf.dc_blk_pred_level = boosted ? 0 : 2;
+
+ sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL;
+ }
+
+ if (speed >= 5) {
+ sf->hl_sf.weight_calc_level_in_tf = 1;
+ sf->hl_sf.adjust_num_frames_for_arf_filtering =
+ allow_screen_content_tools ? 0 : 2;
+
+ sf->fp_sf.reduce_mv_step_param = 4;
+
+ sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH;
+
+ sf->part_sf.simple_motion_search_prune_agg =
+ allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL3;
+ sf->part_sf.ext_partition_eval_thresh =
+ allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16;
+ sf->part_sf.prune_sub_8x8_partition_level =
+ allow_screen_content_tools ? 1 : 2;
+
+ sf->mv_sf.warp_search_method = WARP_SEARCH_DIAMOND;
+
+ sf->inter_sf.prune_inter_modes_if_skippable = 1;
+ sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 1;
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_DEFAULT] = boosted ? 0 : 4;
+ sf->inter_sf.txfm_rd_gate_level[TX_SEARCH_COMP_TYPE_MODE] = boosted ? 0 : 5;
+ sf->inter_sf.enable_fast_compound_mode_search = 2;
+
+ sf->interp_sf.skip_interp_filter_search = boosted ? 0 : 1;
+
+ sf->intra_sf.chroma_intra_pruning_with_hog = 3;
+
+ // TODO(any): Extend multi-winner mode processing support for inter frames
+ sf->winner_mode_sf.multi_winner_mode_type =
+ frame_is_intra_only(&cpi->common) ? MULTI_WINNER_MODE_FAST
+ : MULTI_WINNER_MODE_OFF;
+
+ // Disable Self-guided Loop restoration filter.
+ sf->lpf_sf.disable_sgr_filter = true;
+ sf->lpf_sf.disable_wiener_coeff_refine_search = true;
+
+ sf->tpl_sf.prune_starting_mv = 3;
+ sf->tpl_sf.use_y_only_rate_distortion = 1;
+ sf->tpl_sf.subpel_force_stop = FULL_PEL;
+ sf->tpl_sf.gop_length_decision_method = 2;
+ sf->tpl_sf.use_sad_for_mode_decision = 2;
+
+ sf->winner_mode_sf.dc_blk_pred_level = 2;
+
+ sf->fp_sf.disable_recon = 1;
+ }
+
+ if (speed >= 6) {
+ sf->hl_sf.disable_extra_sc_testing = 1;
+ sf->hl_sf.second_alt_ref_filtering = 0;
+
+ sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3;
+ sf->inter_sf.selective_ref_frame = 6;
+ sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 2;
+ sf->inter_sf.prune_ext_comp_using_neighbors = 3;
+
+ sf->intra_sf.chroma_intra_pruning_with_hog = 4;
+ sf->intra_sf.intra_pruning_with_hog = 4;
+ sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC;
+ sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC;
+ sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC;
+ sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC;
+ sf->intra_sf.early_term_chroma_palette_size_search = 1;
+
+ sf->part_sf.prune_rectangular_split_based_on_qidx =
+ boosted || allow_screen_content_tools ? 0 : 2;
+
+ sf->part_sf.prune_part4_search = 3;
+
+ sf->mv_sf.simple_motion_subpel_force_stop = FULL_PEL;
+ sf->mv_sf.use_bsize_dependent_search_method = 1;
+
+ sf->tpl_sf.gop_length_decision_method = 3;
+
+ sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 6 : 8;
+
+ sf->winner_mode_sf.dc_blk_pred_level = 3;
+ sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF;
+
+ sf->fp_sf.skip_zeromv_motion_search = 1;
+ }
+}
+
+static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
+ SPEED_FEATURES *const sf,
+ int speed) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int boosted = frame_is_boosted(cpi);
+ const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+ const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360;
+
+ if (!is_360p_or_larger) {
+ sf->rt_sf.prune_intra_mode_based_on_mv_range = 1;
+ sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 1;
+ if (speed >= 6)
+ sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 2;
+ if (speed == 7) sf->rt_sf.prefer_large_partition_blocks = 2;
+ if (speed >= 7) {
+ sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+ sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true;
+ sf->rt_sf.use_rtc_tf = 2;
+ }
+ if (speed == 8) sf->rt_sf.prefer_large_partition_blocks = 1;
+ if (speed >= 8) {
+ sf->rt_sf.use_nonrd_filter_search = 1;
+ sf->rt_sf.tx_size_level_based_on_qstep = 1;
+ }
+ if (speed >= 9) {
+ sf->rt_sf.use_comp_ref_nonrd = 0;
+ sf->rt_sf.nonrd_aggressive_skip = 1;
+ sf->rt_sf.skip_intra_pred = 1;
+ // Only turn on enable_ref_short_signaling for low resolution when only
+ // LAST and GOLDEN ref frames are used.
+ sf->rt_sf.enable_ref_short_signaling =
+ (!sf->rt_sf.use_nonrd_altref_frame &&
+ (!sf->rt_sf.use_comp_ref_nonrd ||
+ (!sf->rt_sf.ref_frame_comp_nonrd[1] &&
+ !sf->rt_sf.ref_frame_comp_nonrd[2])));
+
+// TODO(kyslov): Re-enable when AV1 models are trained.
+#if 0
+#if CONFIG_RT_ML_PARTITIONING
+ if (!frame_is_intra_only(cm)) {
+ sf->part_sf.partition_search_type = ML_BASED_PARTITION;
+ sf->rt_sf.reuse_inter_pred_nonrd = 0;
+ }
+#endif
+#endif
+ sf->rt_sf.use_adaptive_subpel_search = false;
+ }
+ if (speed >= 10) {
+      // TODO(yunqingwang@google.com): To be conservative,
+      // sf->rt_sf.estimate_motion_for_var_based_partition = 3 is disabled for
+      // speed 10 / QVGA for now. It may be enabled in the future.
+ sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+ sf->rt_sf.skip_intra_pred = 2;
+ sf->rt_sf.hybrid_intra_pickmode = 3;
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 1;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 2;
+ sf->rt_sf.use_nonrd_filter_search = 0;
+ }
+ } else {
+ sf->rt_sf.prune_intra_mode_based_on_mv_range = 2;
+ sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
+ if (speed <= 5) {
+ sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh =
+ boosted ? INT_MAX : 350;
+ sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 2;
+ }
+ if (speed == 6) sf->part_sf.disable_8x8_part_based_on_qidx = 1;
+ if (speed >= 6) sf->rt_sf.skip_newmv_mode_based_on_sse = 2;
+ if (speed == 7) {
+ sf->rt_sf.prefer_large_partition_blocks = 1;
+ // Enable this feature for [360p, 720p] resolution range initially.
+ // Only enable for low bitdepth to mitigate issue: b/303023614.
+ if (!cpi->rc.rtc_external_ratectrl &&
+ AOMMIN(cm->width, cm->height) <= 720 && !cpi->oxcf.use_highbitdepth)
+ sf->hl_sf.accurate_bit_estimate = cpi->oxcf.q_cfg.aq_mode == NO_AQ;
+ }
+ if (speed >= 7) {
+ sf->rt_sf.use_rtc_tf = 1;
+ }
+ if (speed == 8 && !cpi->ppi->use_svc) {
+ sf->rt_sf.short_circuit_low_temp_var = 0;
+ sf->rt_sf.use_nonrd_altref_frame = 1;
+ }
+ if (speed >= 8) sf->rt_sf.tx_size_level_based_on_qstep = 2;
+ if (speed >= 9) {
+ sf->rt_sf.gf_length_lvl = 1;
+ sf->rt_sf.skip_cdef_sb = 1;
+ sf->rt_sf.sad_based_adp_altref_lag = 2;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 2;
+ sf->rt_sf.use_adaptive_subpel_search = true;
+ sf->interp_sf.cb_pred_filter_search = 1;
+ }
+ if (speed >= 10) {
+ sf->rt_sf.hybrid_intra_pickmode = 2;
+ sf->rt_sf.sad_based_adp_altref_lag = 4;
+ sf->rt_sf.tx_size_level_based_on_qstep = 0;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 3;
+ sf->rt_sf.use_adaptive_subpel_search = false;
+ sf->interp_sf.cb_pred_filter_search = 2;
+ }
+ }
+ if (!is_480p_or_larger) {
+ if (speed == 7) {
+ sf->rt_sf.nonrd_check_partition_merge_mode = 2;
+ }
+ }
+ if (!is_720p_or_larger) {
+ if (speed >= 9) {
+ sf->rt_sf.force_large_partition_blocks_intra = 1;
+ }
+ } else {
+ if (speed >= 6) sf->rt_sf.skip_newmv_mode_based_on_sse = 3;
+ if (speed == 7) sf->rt_sf.prefer_large_partition_blocks = 0;
+ if (speed >= 7) {
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 2;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 1;
+ }
+ if (speed >= 9) {
+ sf->rt_sf.sad_based_adp_altref_lag = 1;
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 0;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 2;
+ }
+ if (speed >= 10) {
+ sf->rt_sf.sad_based_adp_altref_lag = 3;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 3;
+ }
+ }
+ // TODO(Any): Check/Tune settings of other sfs for 1080p.
+ if (is_1080p_or_larger) {
+ if (speed >= 7) {
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 0;
+ sf->rt_sf.use_adaptive_subpel_search = 0;
+ }
+ if (speed >= 9) sf->interp_sf.cb_pred_filter_search = 0;
+ } else {
+ if (speed >= 9) sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+ if (speed >= 10) sf->rt_sf.nonrd_aggressive_skip = 1;
+ }
+ // TODO(marpan): Tune settings for speed 11 video mode,
+ // for resolutions below 720p.
+ if (speed >= 11 && !is_720p_or_larger &&
+ cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) {
+ sf->rt_sf.skip_cdef_sb = 2;
+ sf->rt_sf.force_only_last_ref = 1;
+ sf->rt_sf.selective_cdf_update = 1;
+ sf->rt_sf.use_nonrd_filter_search = 0;
+ if (is_360p_or_larger) {
+ sf->part_sf.fixed_partition_size = BLOCK_32X32;
+ sf->rt_sf.use_fast_fixed_part = 1;
+ }
+ sf->rt_sf.increase_source_sad_thresh = 1;
+ sf->rt_sf.part_early_exit_zeromv = 2;
+ sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2;
+ for (int i = 0; i < BLOCK_SIZES; ++i) {
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+ }
+ }
+ // Setting for SVC, or when the ref_frame_config control is
+ // used to set the reference structure.
+ if (cpi->ppi->use_svc || cpi->ppi->rtc_ref.set_ref_frame_config) {
+ const RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+    // For SVC: for two or more temporal layers, use better mv search on
+    // base temporal layers, and only on the base spatial layer if the
+    // highest resolution is above 640x360.
+ if (cpi->svc.number_temporal_layers >= 2 &&
+ cpi->svc.temporal_layer_id == 0 &&
+ (cpi->svc.spatial_layer_id == 0 ||
+ cpi->oxcf.frm_dim_cfg.width * cpi->oxcf.frm_dim_cfg.height <=
+ 640 * 360)) {
+ sf->mv_sf.search_method = NSTEP;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
+ sf->rt_sf.fullpel_search_step_param = 10;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 0;
+ if (cm->width * cm->height <= 352 * 288)
+ sf->rt_sf.nonrd_prune_ref_frame_search = 2;
+ sf->rt_sf.force_large_partition_blocks_intra = 0;
+ }
+ if (speed >= 8) {
+ if (cpi->svc.number_temporal_layers > 2)
+ sf->rt_sf.disable_cdf_update_non_reference_frame = true;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 3;
+ if (rtc_ref->non_reference_frame) {
+ sf->rt_sf.nonrd_aggressive_skip = 1;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+ }
+ }
+ if (speed <= 9 && cpi->svc.number_temporal_layers > 2 &&
+ cpi->svc.temporal_layer_id == 0)
+ sf->rt_sf.check_only_zero_zeromv_on_large_blocks = false;
+ else
+ sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true;
+ sf->rt_sf.frame_level_mode_cost_update = false;
+
+ // Compound mode enabling.
+ if (rtc_ref->ref_frame_comp[0] || rtc_ref->ref_frame_comp[1] ||
+ rtc_ref->ref_frame_comp[2]) {
+ sf->rt_sf.use_comp_ref_nonrd = 1;
+ sf->rt_sf.ref_frame_comp_nonrd[0] =
+ rtc_ref->ref_frame_comp[0] && rtc_ref->reference[GOLDEN_FRAME - 1];
+ sf->rt_sf.ref_frame_comp_nonrd[1] =
+ rtc_ref->ref_frame_comp[1] && rtc_ref->reference[LAST2_FRAME - 1];
+ sf->rt_sf.ref_frame_comp_nonrd[2] =
+ rtc_ref->ref_frame_comp[2] && rtc_ref->reference[ALTREF_FRAME - 1];
+ } else {
+ sf->rt_sf.use_comp_ref_nonrd = 0;
+ }
+
+ if (cpi->svc.number_spatial_layers > 1 ||
+ cpi->svc.number_temporal_layers > 1)
+ sf->hl_sf.accurate_bit_estimate = 0;
+
+ sf->rt_sf.estimate_motion_for_var_based_partition = 1;
+
+    // For single-layer RPS: bias/adjustment for the recovery frame.
+ if (cpi->ppi->rtc_ref.bias_recovery_frame) {
+ sf->mv_sf.search_method = NSTEP;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE;
+ sf->rt_sf.fullpel_search_step_param = 8;
+ sf->rt_sf.nonrd_aggressive_skip = 0;
+ }
+ }
+ // Screen settings.
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ // TODO(marpan): Check settings for speed 7 and 8.
+ if (speed >= 7) {
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 1;
+ sf->mv_sf.use_bsize_dependent_search_method = 0;
+ sf->rt_sf.skip_cdef_sb = 1;
+ sf->rt_sf.increase_color_thresh_palette = 1;
+ if (!frame_is_intra_only(cm)) sf->rt_sf.dct_only_palette_nonrd = 1;
+ }
+ if (speed >= 8) {
+ sf->rt_sf.nonrd_check_partition_merge_mode = 3;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 1;
+ sf->rt_sf.use_nonrd_filter_search = 0;
+ sf->rt_sf.prune_hv_pred_modes_using_src_sad = false;
+ }
+ if (speed >= 9) {
+ sf->rt_sf.prune_idtx_nonrd = 1;
+ sf->rt_sf.part_early_exit_zeromv = 2;
+ sf->rt_sf.skip_lf_screen = 1;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 3;
+ sf->rt_sf.var_part_split_threshold_shift = 10;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 3;
+ sf->rt_sf.reduce_mv_pel_precision_lowcomplex = 1;
+ sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+ sf->rt_sf.nonrd_check_partition_merge_mode = 0;
+ sf->interp_sf.cb_pred_filter_search = 0;
+ }
+ if (speed >= 10) {
+ if (cm->width * cm->height > 1920 * 1080)
+ sf->part_sf.disable_8x8_part_based_on_qidx = 1;
+ sf->rt_sf.screen_content_cdef_filter_qindex_thresh = 80;
+ sf->rt_sf.part_early_exit_zeromv = 1;
+ sf->rt_sf.nonrd_aggressive_skip = 1;
+ }
+ if (speed >= 11) {
+ sf->rt_sf.skip_lf_screen = 2;
+ sf->rt_sf.skip_cdef_sb = 2;
+ sf->rt_sf.part_early_exit_zeromv = 2;
+ sf->rt_sf.prune_palette_nonrd = 1;
+ sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2;
+ sf->rt_sf.increase_color_thresh_palette = 0;
+ }
+ sf->rt_sf.use_nonrd_altref_frame = 0;
+ sf->rt_sf.use_rtc_tf = 0;
+ sf->rt_sf.use_comp_ref_nonrd = 0;
+ sf->rt_sf.source_metrics_sb_nonrd = 1;
+ if (cpi->rc.high_source_sad == 1) {
+ sf->rt_sf.prefer_large_partition_blocks = 0;
+ sf->part_sf.max_intra_bsize = BLOCK_128X128;
+ for (int i = 0; i < BLOCK_SIZES; ++i) {
+ if (i > BLOCK_32X32)
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+ else
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
+ }
+ }
+ if (cpi->rc.max_block_source_sad > 20000 &&
+ cpi->rc.frame_source_sad > 100 && speed >= 6 &&
+ (cpi->rc.percent_blocks_with_motion > 1 ||
+ cpi->svc.last_layer_dropped[0])) {
+ sf->mv_sf.search_method = NSTEP;
+ sf->rt_sf.fullpel_search_step_param = 2;
+ }
+ sf->rt_sf.partition_direct_merging = 0;
+ sf->hl_sf.accurate_bit_estimate = 0;
+ // This feature is for nonrd_pickmode.
+ if (sf->rt_sf.use_nonrd_pick_mode)
+ sf->rt_sf.estimate_motion_for_var_based_partition = 1;
+ else
+ sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+ }
+ if (is_lossless_requested(&cpi->oxcf.rc_cfg)) {
+ sf->rt_sf.use_rtc_tf = 0;
+ // TODO(aomedia:3412): The setting accurate_bit_estimate = 0
+ // can be removed once it's fixed for lossless mode.
+ sf->hl_sf.accurate_bit_estimate = 0;
+ }
+ if (cpi->oxcf.use_highbitdepth) {
+ // Disable for use_highbitdepth = 1 to mitigate issue: b/303023614.
+ sf->rt_sf.estimate_motion_for_var_based_partition = 0;
+ }
+ if (cpi->oxcf.superres_cfg.enable_superres) {
+ sf->rt_sf.use_rtc_tf = 0;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 1;
+ }
+}
+
+// TODO(kyslov): this is currently very similar to
+// set_good_speed_features_framesize_independent, except that it sets the
+// non-rd flag at speed 8. This function will likely be modified in the
+// future with RT-specific speed features.
+static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi,
+ SPEED_FEATURES *sf,
+ int speed) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int boosted = frame_is_boosted(cpi);
+
+  // Currently, rt speeds 0, 1, 2, 3, 4, and 5 are the same.
+  // The following set of speed features does not impact the encoder's
+  // decisions, as the relevant tools are disabled by default.
+ sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH;
+ sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
+ sf->inter_sf.reuse_inter_intra_mode = 1;
+ sf->inter_sf.prune_compound_using_single_ref = 0;
+ sf->inter_sf.prune_comp_search_by_single_result = 2;
+ sf->inter_sf.prune_comp_type_by_comp_avg = 2;
+ sf->inter_sf.fast_wedge_sign_estimate = 1;
+ sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
+ sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
+ sf->interp_sf.cb_pred_filter_search = 0;
+ sf->interp_sf.skip_interp_filter_search = 1;
+ sf->part_sf.ml_prune_partition = 1;
+ sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
+ sf->part_sf.prune_ext_partition_types_search_level = 2;
+ sf->part_sf.less_rectangular_check_level = 2;
+ sf->mv_sf.obmc_full_pixel_search_level = 1;
+ sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF;
+ sf->tx_sf.model_based_prune_tx_search_level = 0;
+ sf->lpf_sf.dual_sgr_penalty_level = 1;
+ // Disable Wiener and Self-guided Loop restoration filters.
+ sf->lpf_sf.disable_wiener_filter = true;
+ sf->lpf_sf.disable_sgr_filter = true;
+ sf->intra_sf.prune_palette_search_level = 2;
+ sf->intra_sf.prune_luma_palette_size_search_level = 2;
+ sf->intra_sf.early_term_chroma_palette_size_search = 1;
+
+ // End of set
+
+ // TODO(any, yunqing): tune these features for real-time use cases.
+ sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_SOLO;
+ sf->hl_sf.frame_parameter_update = 0;
+
+ sf->inter_sf.model_based_post_interp_filter_breakout = 1;
+  // TODO(any): As per the experiments, this speed feature does redundant
+  // computation, since the model-rd-based pruning logic is similar to
+  // model-rd-based gating when inter_mode_rd_model_estimation = 2. Enable
+  // this SF if any of the following conditions becomes true:
+  // (1) inter_mode_rd_model_estimation != 2
+  // (2) skip_interp_filter_search == 0
+  // (3) Motion mode or compound mode is enabled
+ sf->inter_sf.prune_mode_search_simple_translation = 0;
+ sf->inter_sf.prune_ref_frame_for_rect_partitions = !boosted;
+ sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
+ sf->inter_sf.selective_ref_frame = 4;
+ sf->inter_sf.alt_ref_search_fp = 2;
+ set_txfm_rd_gate_level(sf->inter_sf.txfm_rd_gate_level, boosted ? 0 : 4);
+ sf->inter_sf.limit_txfm_eval_per_mode = 3;
+
+ sf->inter_sf.adaptive_rd_thresh = 4;
+ sf->inter_sf.inter_mode_rd_model_estimation = 2;
+ sf->inter_sf.prune_inter_modes_if_skippable = 1;
+ sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL3;
+ sf->inter_sf.reduce_inter_modes = boosted ? 1 : 3;
+ sf->inter_sf.skip_newmv_in_drl = 4;
+
+ sf->interp_sf.use_fast_interpolation_filter_search = 1;
+ sf->interp_sf.use_interp_filter = 1;
+ sf->interp_sf.adaptive_interp_filter_search = 1;
+ sf->interp_sf.disable_dual_filter = 1;
+
+ sf->part_sf.default_max_partition_size = BLOCK_128X128;
+ sf->part_sf.default_min_partition_size = BLOCK_8X8;
+ sf->part_sf.use_best_rd_for_pruning = 1;
+ sf->part_sf.early_term_after_none_split = 1;
+ sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
+ sf->part_sf.max_intra_bsize = BLOCK_16X16;
+ sf->part_sf.partition_search_breakout_rate_thr = 500;
+ sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
+ sf->part_sf.adjust_var_based_rd_partitioning = 2;
+
+ sf->mv_sf.full_pixel_search_level = 1;
+ sf->mv_sf.exhaustive_searches_thresh = INT_MAX;
+ sf->mv_sf.auto_mv_step_size = 1;
+ sf->mv_sf.subpel_iters_per_step = 1;
+ sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS;
+ sf->mv_sf.search_method = FAST_DIAMOND;
+ sf->mv_sf.subpel_force_stop = EIGHTH_PEL;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
+
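+  // Restrict intra prediction to DC for luma and to DC/CFL for chroma
+  // across all transform sizes.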
+ for (int i = 0; i < TX_SIZES; ++i) {
+ sf->intra_sf.intra_y_mode_mask[i] = INTRA_DC;
+ sf->intra_sf.intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
+ }
+ sf->intra_sf.skip_intra_in_interframe = 5;
+ sf->intra_sf.disable_smooth_intra = 1;
+ sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
+
+ sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
+ sf->tx_sf.adaptive_txb_search_level = 2;
+ sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.tx_size_search_lgr_block = 1;
+ sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+ sf->tx_sf.tx_type_search.skip_tx_search = 1;
+ sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
+ sf->tx_sf.refine_fast_tx_search_results = 0;
+ sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
+ sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4;
+
+ sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT;
+ sf->rd_sf.simple_model_rd_from_var = 1;
+ sf->rd_sf.tx_domain_dist_level = 2;
+ sf->rd_sf.tx_domain_dist_thres_level = 2;
+
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
+ sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+
+ sf->winner_mode_sf.dc_blk_pred_level = frame_is_intra_only(cm) ? 0 : 3;
+ sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
+ sf->winner_mode_sf.tx_size_search_level = 1;
+ sf->winner_mode_sf.winner_mode_ifs = 1;
+
+ sf->rt_sf.check_intra_pred_nonrd = 1;
+ sf->rt_sf.estimate_motion_for_var_based_partition = 2;
+ sf->rt_sf.hybrid_intra_pickmode = 1;
+ sf->rt_sf.use_comp_ref_nonrd = 0;
+ sf->rt_sf.ref_frame_comp_nonrd[0] = 0;
+ sf->rt_sf.ref_frame_comp_nonrd[1] = 0;
+ sf->rt_sf.ref_frame_comp_nonrd[2] = 0;
+ sf->rt_sf.use_nonrd_filter_search = 1;
+ sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+ sf->rt_sf.num_inter_modes_for_tx_search = 5;
+ sf->rt_sf.prune_inter_modes_using_temp_var = 1;
+ sf->rt_sf.use_real_time_ref_set = 1;
+ sf->rt_sf.use_simple_rd_model = 1;
+ sf->rt_sf.prune_inter_modes_with_golden_ref = boosted ? 0 : 1;
+ // TODO(any): This sf could be removed.
+ sf->rt_sf.short_circuit_low_temp_var = 1;
+ sf->rt_sf.check_scene_detection = 1;
+ if (cpi->rc.rtc_external_ratectrl) sf->rt_sf.check_scene_detection = 0;
+ if (cm->current_frame.frame_type != KEY_FRAME &&
+ cpi->oxcf.rc_cfg.mode == AOM_CBR)
+ sf->rt_sf.overshoot_detection_cbr = FAST_DETECTION_MAXQ;
+ // Enable noise estimation only for high resolutions for now.
+ //
+ // Since use_temporal_noise_estimate has no effect for all-intra frame
+ // encoding, it is disabled for this case.
+ if (cpi->oxcf.kf_cfg.key_freq_max != 0 && cm->width * cm->height > 640 * 480)
+ sf->rt_sf.use_temporal_noise_estimate = 1;
+ sf->rt_sf.skip_tx_no_split_var_based_partition = 1;
+ sf->rt_sf.skip_newmv_mode_based_on_sse = 1;
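+  // Key frames keep the full mode search; inter frames enable the early
+  // termination and intra/inter skip shortcuts below.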
+ sf->rt_sf.mode_search_skip_flags =
+ (cm->current_frame.frame_type == KEY_FRAME)
+ ? 0
+ : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
+ FLAG_EARLY_TERMINATE;
+ sf->rt_sf.var_part_split_threshold_shift = 5;
+ if (!frame_is_intra_only(&cpi->common)) sf->rt_sf.var_part_based_on_qidx = 1;
+ sf->rt_sf.use_fast_fixed_part = 0;
+ sf->rt_sf.increase_source_sad_thresh = 0;
+
+ if (speed >= 6) {
+ sf->mv_sf.use_fullpel_costlist = 1;
+
+ sf->rd_sf.tx_domain_dist_thres_level = 3;
+
+ sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = 0;
+ sf->inter_sf.limit_inter_mode_cands = 4;
+ sf->inter_sf.prune_warped_prob_thresh = 8;
+ sf->inter_sf.extra_prune_warped = 1;
+
+ sf->rt_sf.gf_refresh_based_on_qp = 1;
+ sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 1;
+ sf->rt_sf.var_part_split_threshold_shift = 7;
+ if (!frame_is_intra_only(&cpi->common))
+ sf->rt_sf.var_part_based_on_qidx = 2;
+
+ sf->winner_mode_sf.prune_winner_mode_eval_level = boosted ? 0 : 3;
+ }
+
+ if (speed >= 7) {
+ sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_1;
+ sf->rt_sf.use_comp_ref_nonrd = 1;
+ sf->rt_sf.ref_frame_comp_nonrd[2] = 1; // LAST_ALTREF
+ sf->tx_sf.intra_tx_size_search_init_depth_sqr = 2;
+ sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
+ sf->part_sf.max_intra_bsize = BLOCK_32X32;
+
+ sf->mv_sf.search_method = FAST_DIAMOND;
+ sf->mv_sf.subpel_force_stop = QUARTER_PEL;
+
+ sf->inter_sf.inter_mode_rd_model_estimation = 2;
+ // This sf is not applicable in non-rd path.
+ sf->inter_sf.skip_newmv_in_drl = 0;
+
+ sf->interp_sf.skip_interp_filter_search = 0;
+
+ // Disable intra_y_mode_mask pruning since the performance at speed 7 isn't
+ // good. May need more study.
+ for (int i = 0; i < TX_SIZES; ++i) {
+ sf->intra_sf.intra_y_mode_mask[i] = INTRA_ALL;
+ }
+
+ sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+ sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL5;
+
+ sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 1;
+ // This is for rd path only.
+ sf->rt_sf.prune_inter_modes_using_temp_var = 0;
+ sf->rt_sf.prune_inter_modes_wrt_gf_arf_based_on_sad = 0;
+ sf->rt_sf.prune_intra_mode_based_on_mv_range = 0;
+#if !CONFIG_REALTIME_ONLY
+ sf->rt_sf.reuse_inter_pred_nonrd =
+ (cpi->oxcf.motion_mode_cfg.enable_warped_motion == 0);
+#else
+ sf->rt_sf.reuse_inter_pred_nonrd = 1;
+#endif
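+    // When the temporal denoiser is compiled in, reuse of the inter
+    // predictor is instead tied to the configured noise sensitivity.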
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ sf->rt_sf.reuse_inter_pred_nonrd = (cpi->oxcf.noise_sensitivity == 0);
+#endif
+ sf->rt_sf.short_circuit_low_temp_var = 0;
+    // For spatial layers, only LAST and GOLDEN are currently used in the SVC
+    // for nonrd. The flag use_nonrd_altref_frame can disable GOLDEN in
+    // get_ref_frame_flags() for some patterns, so disable it here for
+    // spatial layers.
+ sf->rt_sf.use_nonrd_altref_frame =
+ (cpi->svc.number_spatial_layers > 1) ? 0 : 1;
+ sf->rt_sf.use_nonrd_pick_mode = 1;
+ sf->rt_sf.nonrd_check_partition_merge_mode = 3;
+ sf->rt_sf.skip_intra_pred = 1;
+ sf->rt_sf.source_metrics_sb_nonrd = 1;
+ // Set mask for intra modes.
+ for (int i = 0; i < BLOCK_SIZES; ++i)
+ if (i >= BLOCK_32X32)
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+ else
+ // Use DC, H, V intra mode for block sizes < 32X32.
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
+
+ sf->winner_mode_sf.dc_blk_pred_level = 0;
+ sf->rt_sf.var_part_based_on_qidx = 3;
+ sf->rt_sf.prune_compoundmode_with_singlecompound_var = true;
+ sf->rt_sf.prune_compoundmode_with_singlemode_var = true;
+ sf->rt_sf.skip_compound_based_on_var = true;
+ sf->rt_sf.use_adaptive_subpel_search = true;
+ }
+
+ if (speed >= 8) {
+ sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_2;
+ sf->intra_sf.intra_pruning_with_hog = 1;
+ sf->rt_sf.short_circuit_low_temp_var = 1;
+ sf->rt_sf.use_nonrd_altref_frame = 0;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 2;
+ sf->rt_sf.nonrd_check_partition_merge_mode = 0;
+ sf->rt_sf.var_part_split_threshold_shift = 8;
+ sf->rt_sf.var_part_based_on_qidx = 4;
+ sf->rt_sf.partition_direct_merging = 1;
+ sf->rt_sf.prune_compoundmode_with_singlemode_var = false;
+ sf->mv_sf.use_bsize_dependent_search_method = 2;
+ sf->rt_sf.prune_hv_pred_modes_using_src_sad = true;
+ }
+ if (speed >= 9) {
+ sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_3;
+ sf->rt_sf.estimate_motion_for_var_based_partition = 3;
+ sf->rt_sf.prefer_large_partition_blocks = 3;
+ sf->rt_sf.skip_intra_pred = 2;
+ sf->rt_sf.var_part_split_threshold_shift = 9;
+ for (int i = 0; i < BLOCK_SIZES; ++i)
+ sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+ sf->rt_sf.var_part_based_on_qidx = 0;
+ sf->rt_sf.frame_level_mode_cost_update = true;
+ sf->rt_sf.check_only_zero_zeromv_on_large_blocks = true;
+ sf->rt_sf.reduce_mv_pel_precision_highmotion = 0;
+ sf->rt_sf.use_adaptive_subpel_search = true;
+ sf->mv_sf.use_bsize_dependent_search_method = 0;
+ }
+ if (speed >= 10) {
+ sf->rt_sf.sse_early_term_inter_search = EARLY_TERM_IDX_4;
+ sf->rt_sf.nonrd_prune_ref_frame_search = 3;
+ sf->rt_sf.var_part_split_threshold_shift = 10;
+ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+ }
+ if (speed >= 11 && !frame_is_intra_only(cm) &&
+ cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ sf->winner_mode_sf.dc_blk_pred_level = 3;
+ }
+}
+
+static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) {
+ // best quality defaults
+ hl_sf->frame_parameter_update = 1;
+ hl_sf->recode_loop = ALLOW_RECODE;
+ // Recode loop tolerance %.
+ hl_sf->recode_tolerance = 25;
+ hl_sf->high_precision_mv_usage = CURRENT_Q;
+ hl_sf->superres_auto_search_type = SUPERRES_AUTO_ALL;
+ hl_sf->disable_extra_sc_testing = 0;
+ hl_sf->second_alt_ref_filtering = 1;
+ hl_sf->adjust_num_frames_for_arf_filtering = 0;
+ hl_sf->accurate_bit_estimate = 0;
+ hl_sf->weight_calc_level_in_tf = 0;
+ hl_sf->allow_sub_blk_me_in_tf = 0;
+}
+
+static AOM_INLINE void init_fp_sf(FIRST_PASS_SPEED_FEATURES *fp_sf) {
+ fp_sf->reduce_mv_step_param = 3;
+ fp_sf->skip_motion_search_threshold = 0;
+ fp_sf->disable_recon = 0;
+ fp_sf->skip_zeromv_motion_search = 0;
+}
+
+static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) {
+ tpl_sf->gop_length_decision_method = 0;
+ tpl_sf->prune_intra_modes = 0;
+ tpl_sf->prune_starting_mv = 0;
+ tpl_sf->reduce_first_step_size = 0;
+ tpl_sf->skip_alike_starting_mv = 0;
+ tpl_sf->subpel_force_stop = EIGHTH_PEL;
+ tpl_sf->search_method = NSTEP;
+ tpl_sf->prune_ref_frames_in_tpl = 0;
+ tpl_sf->allow_compound_pred = 1;
+ tpl_sf->use_y_only_rate_distortion = 0;
+ tpl_sf->use_sad_for_mode_decision = 0;
+ tpl_sf->reduce_num_frames = 0;
+}
+
+static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) {
+ gm_sf->gm_search_type = GM_FULL_SEARCH;
+ gm_sf->prune_ref_frame_for_gm_search = 0;
+ gm_sf->prune_zero_mv_with_sse = 0;
+ gm_sf->disable_gm_search_based_on_stats = 0;
+ gm_sf->num_refinement_steps = GM_MAX_REFINEMENT_STEPS;
+}
+
+static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
+ part_sf->partition_search_type = SEARCH_PARTITION;
+ part_sf->less_rectangular_check_level = 0;
+ part_sf->use_square_partition_only_threshold = BLOCK_128X128;
+ part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+ part_sf->default_max_partition_size = BLOCK_LARGEST;
+ part_sf->default_min_partition_size = BLOCK_4X4;
+ part_sf->adjust_var_based_rd_partitioning = 0;
+ part_sf->max_intra_bsize = BLOCK_LARGEST;
+ // This setting only takes effect when partition_search_type is set
+ // to FIXED_PARTITION.
+ part_sf->fixed_partition_size = BLOCK_16X16;
+  // Partition search breakout thresholds (distortion and rate).
+ part_sf->partition_search_breakout_dist_thr = 0;
+ part_sf->partition_search_breakout_rate_thr = 0;
+ part_sf->prune_ext_partition_types_search_level = 0;
+ part_sf->prune_part4_search = 0;
+ part_sf->ml_prune_partition = 0;
+ part_sf->ml_early_term_after_part_split_level = 0;
+ for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) {
+ part_sf->ml_partition_search_breakout_thresh[i] =
+ -1; // -1 means not enabled.
+ }
+ part_sf->simple_motion_search_prune_agg = SIMPLE_AGG_LVL0;
+ part_sf->simple_motion_search_split = 0;
+ part_sf->simple_motion_search_prune_rect = 0;
+ part_sf->simple_motion_search_early_term_none = 0;
+ part_sf->simple_motion_search_reduce_search_steps = 0;
+ part_sf->intra_cnn_based_part_prune_level = 0;
+ part_sf->ext_partition_eval_thresh = BLOCK_8X8;
+ part_sf->rect_partition_eval_thresh = BLOCK_128X128;
+ part_sf->ext_part_eval_based_on_cur_best = 0;
+ part_sf->prune_ext_part_using_split_info = 0;
+ part_sf->prune_rectangular_split_based_on_qidx = 0;
+ part_sf->prune_rect_part_using_4x4_var_deviation = false;
+ part_sf->prune_rect_part_using_none_pred_mode = false;
+ part_sf->early_term_after_none_split = 0;
+ part_sf->ml_predict_breakout_level = 0;
+ part_sf->prune_sub_8x8_partition_level = 0;
+ part_sf->simple_motion_search_rect_split = 0;
+ part_sf->reuse_prev_rd_results_for_part_ab = 0;
+ part_sf->reuse_best_prediction_for_part_ab = 0;
+ part_sf->use_best_rd_for_pruning = 0;
+ part_sf->skip_non_sq_part_based_on_none = 0;
+ part_sf->disable_8x8_part_based_on_qidx = 0;
+}
+
+static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) {
+ mv_sf->full_pixel_search_level = 0;
+ mv_sf->auto_mv_step_size = 0;
+ mv_sf->exhaustive_searches_thresh = 0;
+ mv_sf->obmc_full_pixel_search_level = 0;
+ mv_sf->prune_mesh_search = PRUNE_MESH_SEARCH_DISABLED;
+ mv_sf->reduce_search_range = 0;
+ mv_sf->search_method = NSTEP;
+ mv_sf->simple_motion_subpel_force_stop = EIGHTH_PEL;
+ mv_sf->subpel_force_stop = EIGHTH_PEL;
+ mv_sf->subpel_iters_per_step = 2;
+ mv_sf->subpel_search_method = SUBPEL_TREE;
+ mv_sf->use_accurate_subpel_search = USE_8_TAPS;
+ mv_sf->use_bsize_dependent_search_method = 0;
+ mv_sf->use_fullpel_costlist = 0;
+ mv_sf->use_downsampled_sad = 0;
+ mv_sf->disable_extensive_joint_motion_search = 0;
+ mv_sf->disable_second_mv = 0;
+ mv_sf->skip_fullpel_search_using_startmv = 0;
+ mv_sf->warp_search_method = WARP_SEARCH_SQUARE;
+ mv_sf->warp_search_iters = 8;
+ mv_sf->use_intrabc = 1;
+}
+
+static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) {
+ inter_sf->adaptive_rd_thresh = 0;
+ inter_sf->model_based_post_interp_filter_breakout = 0;
+ inter_sf->reduce_inter_modes = 0;
+ inter_sf->alt_ref_search_fp = 0;
+ inter_sf->prune_single_ref = 0;
+ inter_sf->prune_comp_ref_frames = 0;
+ inter_sf->selective_ref_frame = 0;
+ inter_sf->prune_ref_frame_for_rect_partitions = 0;
+ inter_sf->fast_wedge_sign_estimate = 0;
+ inter_sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED;
+ inter_sf->reuse_inter_intra_mode = 0;
+ inter_sf->mv_cost_upd_level = INTERNAL_COST_UPD_SB;
+ inter_sf->coeff_cost_upd_level = INTERNAL_COST_UPD_SB;
+ inter_sf->mode_cost_upd_level = INTERNAL_COST_UPD_SB;
+ inter_sf->prune_inter_modes_based_on_tpl = 0;
+ inter_sf->prune_nearmv_using_neighbors = PRUNE_NEARMV_OFF;
+ inter_sf->prune_comp_search_by_single_result = 0;
+ inter_sf->skip_repeated_ref_mv = 0;
+ inter_sf->skip_newmv_in_drl = 0;
+ inter_sf->inter_mode_rd_model_estimation = 0;
+ inter_sf->prune_compound_using_single_ref = 0;
+ inter_sf->prune_ext_comp_using_neighbors = 0;
+ inter_sf->skip_ext_comp_nearmv_mode = 0;
+ inter_sf->prune_comp_using_best_single_mode_ref = 0;
+ inter_sf->prune_nearest_near_mv_using_refmv_weight = 0;
+ inter_sf->disable_onesided_comp = 0;
+ inter_sf->prune_mode_search_simple_translation = 0;
+ inter_sf->prune_comp_type_by_comp_avg = 0;
+ inter_sf->disable_interinter_wedge_newmv_search = 0;
+ inter_sf->fast_interintra_wedge_search = 0;
+ inter_sf->prune_comp_type_by_model_rd = 0;
+ inter_sf->perform_best_rd_based_gating_for_chroma = 0;
+ inter_sf->prune_obmc_prob_thresh = 0;
+ inter_sf->disable_interinter_wedge_var_thresh = 0;
+ inter_sf->disable_interintra_wedge_var_thresh = 0;
+ inter_sf->prune_ref_mv_idx_search = 0;
+ inter_sf->prune_warped_prob_thresh = 0;
+ inter_sf->reuse_compound_type_decision = 0;
+ inter_sf->prune_inter_modes_if_skippable = 0;
+ inter_sf->disable_masked_comp = 0;
+ inter_sf->enable_fast_compound_mode_search = 0;
+ inter_sf->reuse_mask_search_results = 0;
+ inter_sf->enable_fast_wedge_mask_search = 0;
+ inter_sf->inter_mode_txfm_breakout = 0;
+ inter_sf->limit_inter_mode_cands = 0;
+ inter_sf->limit_txfm_eval_per_mode = 0;
+ inter_sf->skip_arf_compound = 0;
+ set_txfm_rd_gate_level(inter_sf->txfm_rd_gate_level, 0);
+}
+
+static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) {
+ interp_sf->adaptive_interp_filter_search = 0;
+ interp_sf->cb_pred_filter_search = 0;
+ interp_sf->disable_dual_filter = 0;
+ interp_sf->skip_sharp_interp_filter_search = 0;
+ interp_sf->use_fast_interpolation_filter_search = 0;
+ interp_sf->use_interp_filter = 0;
+ interp_sf->skip_interp_filter_search = 0;
+}
+
+static AOM_INLINE void init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) {
+ intra_sf->dv_cost_upd_level = INTERNAL_COST_UPD_SB;
+ intra_sf->skip_intra_in_interframe = 1;
+ intra_sf->intra_pruning_with_hog = 0;
+ intra_sf->chroma_intra_pruning_with_hog = 0;
+ intra_sf->prune_palette_search_level = 0;
+ intra_sf->prune_luma_palette_size_search_level = 0;
+
+ for (int i = 0; i < TX_SIZES; i++) {
+ intra_sf->intra_y_mode_mask[i] = INTRA_ALL;
+ intra_sf->intra_uv_mode_mask[i] = UV_INTRA_ALL;
+ }
+ intra_sf->disable_smooth_intra = 0;
+ intra_sf->prune_smooth_intra_mode_for_chroma = 0;
+ intra_sf->prune_filter_intra_level = 0;
+ intra_sf->prune_chroma_modes_using_luma_winner = 0;
+ intra_sf->cfl_search_range = 3;
+ intra_sf->top_intra_model_count_allowed = TOP_INTRA_MODEL_COUNT;
+ intra_sf->adapt_top_model_rd_count_using_neighbors = 0;
+ intra_sf->early_term_chroma_palette_size_search = 0;
+ intra_sf->skip_filter_intra_in_inter_frames = 0;
+ intra_sf->prune_luma_odd_delta_angles_in_intra = 0;
+}
+
+static AOM_INLINE void init_tx_sf(TX_SPEED_FEATURES *tx_sf) {
+ tx_sf->inter_tx_size_search_init_depth_sqr = 0;
+ tx_sf->inter_tx_size_search_init_depth_rect = 0;
+ tx_sf->intra_tx_size_search_init_depth_rect = 0;
+ tx_sf->intra_tx_size_search_init_depth_sqr = 0;
+ tx_sf->tx_size_search_lgr_block = 0;
+ tx_sf->model_based_prune_tx_search_level = 0;
+ tx_sf->tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_1;
+ tx_sf->tx_type_search.ml_tx_split_thresh = 8500;
+ tx_sf->tx_type_search.use_skip_flag_prediction = 1;
+ tx_sf->tx_type_search.use_reduced_intra_txset = 0;
+ tx_sf->tx_type_search.fast_intra_tx_type_search = 0;
+ tx_sf->tx_type_search.fast_inter_tx_type_prob_thresh = INT_MAX;
+ tx_sf->tx_type_search.skip_tx_search = 0;
+ tx_sf->tx_type_search.prune_tx_type_using_stats = 0;
+ tx_sf->tx_type_search.prune_tx_type_est_rd = 0;
+ tx_sf->tx_type_search.winner_mode_tx_type_pruning = 0;
+ tx_sf->txb_split_cap = 1;
+ tx_sf->adaptive_txb_search_level = 0;
+ tx_sf->refine_fast_tx_search_results = 1;
+ tx_sf->prune_tx_size_level = 0;
+ tx_sf->prune_intra_tx_depths_using_nn = false;
+ tx_sf->use_rd_based_breakout_for_intra_tx_search = false;
+}
+
+static AOM_INLINE void init_rd_sf(RD_CALC_SPEED_FEATURES *rd_sf,
+ const AV1EncoderConfig *oxcf) {
+ const int disable_trellis_quant = oxcf->algo_cfg.disable_trellis_quant;
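+  // Map the disable_trellis_quant config to a trellis optimization level:
+  // 0 keeps full trellis, 1 disables it, 2 restricts it to the final encode
+  // pass, and 3 (per the enum name) skips trellis only for the Y-rd
+  // estimation stage. Lossless coding always falls back to NO_TRELLIS_OPT.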
+ if (disable_trellis_quant == 3) {
+ rd_sf->optimize_coefficients = !is_lossless_requested(&oxcf->rc_cfg)
+ ? NO_ESTIMATE_YRD_TRELLIS_OPT
+ : NO_TRELLIS_OPT;
+ } else if (disable_trellis_quant == 2) {
+ rd_sf->optimize_coefficients = !is_lossless_requested(&oxcf->rc_cfg)
+ ? FINAL_PASS_TRELLIS_OPT
+ : NO_TRELLIS_OPT;
+ } else if (disable_trellis_quant == 0) {
+ if (is_lossless_requested(&oxcf->rc_cfg)) {
+ rd_sf->optimize_coefficients = NO_TRELLIS_OPT;
+ } else {
+ rd_sf->optimize_coefficients = FULL_TRELLIS_OPT;
+ }
+ } else if (disable_trellis_quant == 1) {
+ rd_sf->optimize_coefficients = NO_TRELLIS_OPT;
+ } else {
+ assert(0 && "Invalid disable_trellis_quant value");
+ }
+ rd_sf->use_mb_rd_hash = 0;
+ rd_sf->simple_model_rd_from_var = 0;
+ rd_sf->tx_domain_dist_level = 0;
+ rd_sf->tx_domain_dist_thres_level = 0;
+ rd_sf->perform_coeff_opt = 0;
+}
+
+static AOM_INLINE void init_winner_mode_sf(
+ WINNER_MODE_SPEED_FEATURES *winner_mode_sf) {
+ winner_mode_sf->motion_mode_for_winner_cand = 0;
+ // Set this at the appropriate speed levels
+ winner_mode_sf->tx_size_search_level = 0;
+ winner_mode_sf->enable_winner_mode_for_coeff_opt = 0;
+ winner_mode_sf->enable_winner_mode_for_tx_size_srch = 0;
+ winner_mode_sf->enable_winner_mode_for_use_tx_domain_dist = 0;
+ winner_mode_sf->multi_winner_mode_type = 0;
+ winner_mode_sf->dc_blk_pred_level = 0;
+ winner_mode_sf->winner_mode_ifs = 0;
+ winner_mode_sf->prune_winner_mode_eval_level = 0;
+}
+
+static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) {
+ lpf_sf->disable_loop_restoration_chroma = 0;
+ lpf_sf->disable_loop_restoration_luma = 0;
+ lpf_sf->min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE;
+ lpf_sf->max_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+ lpf_sf->prune_wiener_based_on_src_var = 0;
+ lpf_sf->prune_sgr_based_on_wiener = 0;
+ lpf_sf->enable_sgr_ep_pruning = 0;
+ lpf_sf->reduce_wiener_window_size = 0;
+ lpf_sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
+ lpf_sf->use_coarse_filter_level_search = 0;
+ lpf_sf->cdef_pick_method = CDEF_FULL_SEARCH;
+  // Set the decoder-side speed feature to use fewer dual sgr modes.
+ lpf_sf->dual_sgr_penalty_level = 0;
+ // Enable Wiener and Self-guided Loop restoration filters by default.
+ lpf_sf->disable_wiener_filter = false;
+ lpf_sf->disable_sgr_filter = false;
+ lpf_sf->disable_wiener_coeff_refine_search = false;
+ lpf_sf->use_downsampled_wiener_stats = 0;
+}
+
+static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) {
+ rt_sf->check_intra_pred_nonrd = 0;
+ rt_sf->skip_intra_pred = 0;
+ rt_sf->estimate_motion_for_var_based_partition = 0;
+ rt_sf->nonrd_check_partition_merge_mode = 0;
+ rt_sf->nonrd_check_partition_split = 0;
+ rt_sf->mode_search_skip_flags = 0;
+ rt_sf->nonrd_prune_ref_frame_search = 0;
+ rt_sf->use_nonrd_pick_mode = 0;
+ rt_sf->use_nonrd_altref_frame = 0;
+ rt_sf->use_comp_ref_nonrd = 0;
+ rt_sf->use_real_time_ref_set = 0;
+ rt_sf->short_circuit_low_temp_var = 0;
+ rt_sf->reuse_inter_pred_nonrd = 0;
+ rt_sf->num_inter_modes_for_tx_search = INT_MAX;
+ rt_sf->use_nonrd_filter_search = 0;
+ rt_sf->use_simple_rd_model = 0;
+ rt_sf->hybrid_intra_pickmode = 0;
+ rt_sf->source_metrics_sb_nonrd = 0;
+ rt_sf->overshoot_detection_cbr = NO_DETECTION;
+ rt_sf->check_scene_detection = 0;
+ rt_sf->prefer_large_partition_blocks = 0;
+ rt_sf->use_temporal_noise_estimate = 0;
+ rt_sf->fullpel_search_step_param = 0;
+ for (int i = 0; i < BLOCK_SIZES; ++i)
+ rt_sf->intra_y_mode_bsize_mask_nrd[i] = INTRA_ALL;
+ rt_sf->prune_hv_pred_modes_using_src_sad = false;
+ rt_sf->nonrd_aggressive_skip = 0;
+ rt_sf->skip_cdef_sb = 0;
+ rt_sf->force_large_partition_blocks_intra = 0;
+ rt_sf->skip_tx_no_split_var_based_partition = 0;
+ rt_sf->skip_newmv_mode_based_on_sse = 0;
+ rt_sf->gf_length_lvl = 0;
+ rt_sf->prune_inter_modes_with_golden_ref = 0;
+ rt_sf->prune_inter_modes_wrt_gf_arf_based_on_sad = 0;
+ rt_sf->prune_inter_modes_using_temp_var = 0;
+ rt_sf->reduce_mv_pel_precision_highmotion = 0;
+ rt_sf->reduce_mv_pel_precision_lowcomplex = 0;
+ rt_sf->prune_intra_mode_based_on_mv_range = 0;
+ rt_sf->var_part_split_threshold_shift = 7;
+ rt_sf->gf_refresh_based_on_qp = 0;
+ rt_sf->use_rtc_tf = 0;
+ rt_sf->prune_idtx_nonrd = 0;
+ rt_sf->prune_palette_nonrd = 0;
+ rt_sf->dct_only_palette_nonrd = 0;
+ rt_sf->part_early_exit_zeromv = 0;
+ rt_sf->sse_early_term_inter_search = EARLY_TERM_DISABLED;
+ rt_sf->skip_lf_screen = 0;
+ rt_sf->sad_based_adp_altref_lag = 0;
+ rt_sf->partition_direct_merging = 0;
+ rt_sf->var_part_based_on_qidx = 0;
+ rt_sf->tx_size_level_based_on_qstep = 0;
+ rt_sf->vbp_prune_16x16_split_using_min_max_sub_blk_var = false;
+ rt_sf->prune_compoundmode_with_singlecompound_var = false;
+ rt_sf->frame_level_mode_cost_update = false;
+ rt_sf->prune_h_pred_using_best_mode_so_far = false;
+ rt_sf->enable_intra_mode_pruning_using_neighbors = false;
+ rt_sf->prune_intra_mode_using_best_sad_so_far = false;
+ rt_sf->check_only_zero_zeromv_on_large_blocks = false;
+ rt_sf->disable_cdf_update_non_reference_frame = false;
+ rt_sf->prune_compoundmode_with_singlemode_var = false;
+ rt_sf->skip_compound_based_on_var = false;
+ rt_sf->set_zeromv_skip_based_on_source_sad = 1;
+ rt_sf->use_adaptive_subpel_search = false;
+ rt_sf->screen_content_cdef_filter_qindex_thresh = 0;
+ rt_sf->enable_ref_short_signaling = false;
+ rt_sf->check_globalmv_on_single_ref = true;
+ rt_sf->increase_color_thresh_palette = false;
+ rt_sf->selective_cdf_update = 0;
+ rt_sf->force_only_last_ref = 0;
+}
+
+static fractional_mv_step_fp
+ *const fractional_mv_search[SUBPEL_SEARCH_METHODS] = {
+ av1_find_best_sub_pixel_tree, // SUBPEL_TREE = 0
+ av1_find_best_sub_pixel_tree_pruned, // SUBPEL_TREE_PRUNED = 1
+ av1_find_best_sub_pixel_tree_pruned_more // SUBPEL_TREE_PRUNED_MORE = 2
+ };
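+// Note: this table is indexed directly by the SUBPEL_SEARCH_METHOD enum in
+// set_subpel_search_method(), so its entries must stay in enum order.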
+
+// Populate the appropriate sub-pel search method based on speed features and
+// user-specified settings.
+static void set_subpel_search_method(
+ MotionVectorSearchParams *mv_search_params,
+ unsigned int motion_vector_unit_test,
+ SUBPEL_SEARCH_METHOD subpel_search_method) {
+ assert(subpel_search_method <= SUBPEL_TREE_PRUNED_MORE);
+ mv_search_params->find_fractional_mv_step =
+ fractional_mv_search[subpel_search_method];
+
+  // This is only used in the motion vector unit test.
+ if (motion_vector_unit_test == 1)
+ mv_search_params->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
+ else if (motion_vector_unit_test == 2)
+ mv_search_params->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+}
+
+void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ switch (oxcf->mode) {
+ case GOOD:
+ set_good_speed_feature_framesize_dependent(cpi, sf, speed);
+ break;
+ case ALLINTRA:
+ set_allintra_speed_feature_framesize_dependent(cpi, sf, speed);
+ break;
+ case REALTIME:
+ set_rt_speed_feature_framesize_dependent(cpi, sf, speed);
+ break;
+ }
+
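+  // Sequence-level tool flags can only be tightened while the sequence
+  // parameters have not yet been locked.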
+ if (!cpi->ppi->seq_params_locked) {
+ cpi->common.seq_params->enable_masked_compound &=
+ !sf->inter_sf.disable_masked_comp;
+ cpi->common.seq_params->enable_interintra_compound &=
+ (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX);
+ }
+
+ set_subpel_search_method(&cpi->mv_search_params,
+ cpi->oxcf.unit_test_cfg.motion_vector_unit_test,
+ sf->mv_sf.subpel_search_method);
+
+  // For the multi-threaded use case with row_mt enabled, updating costs over
+  // a set of SB rows is not desirable. Hence, the sf mv_cost_upd_level is
+  // set to INTERNAL_COST_UPD_SBROW in such cases.
+ if ((cpi->oxcf.row_mt == 1) && (cpi->mt_info.num_workers > 1)) {
+ if (sf->inter_sf.mv_cost_upd_level == INTERNAL_COST_UPD_SBROW_SET) {
+ // Set mv_cost_upd_level to use row level update.
+ sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+ }
+ }
+}
+
+void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ int i;
+
+ init_hl_sf(&sf->hl_sf);
+ init_fp_sf(&sf->fp_sf);
+ init_tpl_sf(&sf->tpl_sf);
+ init_gm_sf(&sf->gm_sf);
+ init_part_sf(&sf->part_sf);
+ init_mv_sf(&sf->mv_sf);
+ init_inter_sf(&sf->inter_sf);
+ init_interp_sf(&sf->interp_sf);
+ init_intra_sf(&sf->intra_sf);
+ init_tx_sf(&sf->tx_sf);
+ init_rd_sf(&sf->rd_sf, oxcf);
+ init_winner_mode_sf(&sf->winner_mode_sf);
+ init_lpf_sf(&sf->lpf_sf);
+ init_rt_sf(&sf->rt_sf);
+
+ switch (oxcf->mode) {
+ case GOOD:
+ set_good_speed_features_framesize_independent(cpi, sf, speed);
+ break;
+ case ALLINTRA:
+ set_allintra_speed_features_framesize_independent(cpi, sf, speed);
+ break;
+ case REALTIME:
+ set_rt_speed_features_framesize_independent(cpi, sf, speed);
+ break;
+ }
+
+ // Note: when use_nonrd_pick_mode is true, the transform size is the
+ // minimum of 16x16 and the largest possible size of the current block,
+ // which conflicts with the speed feature "enable_tx_size_search".
+ if (!oxcf->txfm_cfg.enable_tx_size_search &&
+ sf->rt_sf.use_nonrd_pick_mode == 0) {
+ sf->winner_mode_sf.tx_size_search_level = 3;
+ }
+
+ if (cpi->mt_info.num_workers > 1) {
+    // The loop restoration stage is conditionally disabled for speeds 5 and
+    // 6 when num_workers > 1. Since av1_pick_filter_restoration() is not
+    // multi-threaded, enabling the loop restoration stage would increase
+    // encode time by 3% to 7%, depending on frame resolution.
+    // TODO(aomedia:3446): Implement multi-threading of
+    // av1_pick_filter_restoration() and enable the Wiener filter for speeds
+    // 5 and 6, similar to the single-thread encoding path.
+ if (speed >= 5) {
+ sf->lpf_sf.disable_sgr_filter = true;
+ sf->lpf_sf.disable_wiener_filter = true;
+ }
+ }
+
+ if (!cpi->ppi->seq_params_locked) {
+ cpi->common.seq_params->order_hint_info.enable_dist_wtd_comp &=
+ (sf->inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED);
+ cpi->common.seq_params->enable_dual_filter &=
+ !sf->interp_sf.disable_dual_filter;
+    // Set the flag 'enable_restoration' if at least one of the loop
+    // restoration filters (i.e., Wiener or Self-guided) is enabled.
+ cpi->common.seq_params->enable_restoration &=
+ (!sf->lpf_sf.disable_wiener_filter || !sf->lpf_sf.disable_sgr_filter);
+
+ cpi->common.seq_params->enable_interintra_compound &=
+ (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX);
+ }
+
+ const int mesh_speed = AOMMIN(speed, MAX_MESH_SPEED);
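+  // Copy the mesh search pattern (range/interval per step) selected by the
+  // clamped speed level.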
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mv_sf.mesh_patterns[i].range =
+ good_quality_mesh_patterns[mesh_speed][i].range;
+ sf->mv_sf.mesh_patterns[i].interval =
+ good_quality_mesh_patterns[mesh_speed][i].interval;
+ }
+
+  // Update the mesh pattern of exhaustive motion search for intraBC.
+  // Though the intraBC mesh pattern is populated for all frame types, it is
+  // used only for intra frames of screen content.
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mv_sf.intrabc_mesh_patterns[i].range =
+ intrabc_mesh_patterns[mesh_speed][i].range;
+ sf->mv_sf.intrabc_mesh_patterns[i].interval =
+ intrabc_mesh_patterns[mesh_speed][i].interval;
+ }
+
+  // Slow quant, dct and trellis are not worthwhile for the first pass,
+  // so make sure they are always turned off.
+ if (is_stat_generation_stage(cpi))
+ sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT;
+
+ // No recode for 1 pass.
+ if (oxcf->pass == AOM_RC_ONE_PASS && has_no_stats_stage(cpi))
+ sf->hl_sf.recode_loop = DISALLOW_RECODE;
+
+ set_subpel_search_method(&cpi->mv_search_params,
+ cpi->oxcf.unit_test_cfg.motion_vector_unit_test,
+ sf->mv_sf.subpel_search_method);
+
+  // assert ensures that tx_domain_dist_thres_level is accessed correctly
+ assert(cpi->sf.rd_sf.tx_domain_dist_thres_level >= 0 &&
+ cpi->sf.rd_sf.tx_domain_dist_thres_level < 4);
+ memcpy(winner_mode_params->tx_domain_dist_threshold,
+ tx_domain_dist_thresholds[cpi->sf.rd_sf.tx_domain_dist_thres_level],
+ sizeof(winner_mode_params->tx_domain_dist_threshold));
+
+ assert(cpi->sf.rd_sf.tx_domain_dist_level >= 0 &&
+ cpi->sf.rd_sf.tx_domain_dist_level < TX_DOMAIN_DIST_LEVELS);
+ memcpy(winner_mode_params->use_transform_domain_distortion,
+ tx_domain_dist_types[cpi->sf.rd_sf.tx_domain_dist_level],
+ sizeof(winner_mode_params->use_transform_domain_distortion));
+
+ // assert ensures that coeff_opt_thresholds is accessed correctly
+ assert(cpi->sf.rd_sf.perform_coeff_opt >= 0 &&
+ cpi->sf.rd_sf.perform_coeff_opt < 9);
+ memcpy(winner_mode_params->coeff_opt_thresholds,
+ &coeff_opt_thresholds[cpi->sf.rd_sf.perform_coeff_opt],
+ sizeof(winner_mode_params->coeff_opt_thresholds));
+
+ // assert ensures that predict_skip_levels is accessed correctly
+ assert(cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction >= 0 &&
+ cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction < 3);
+ memcpy(winner_mode_params->skip_txfm_level,
+ predict_skip_levels[cpi->sf.tx_sf.tx_type_search
+ .use_skip_flag_prediction],
+ sizeof(winner_mode_params->skip_txfm_level));
+
+ // assert ensures that tx_size_search_level is accessed correctly
+ assert(cpi->sf.winner_mode_sf.tx_size_search_level >= 0 &&
+ cpi->sf.winner_mode_sf.tx_size_search_level <= 3);
+ memcpy(winner_mode_params->tx_size_search_methods,
+ tx_size_search_methods[cpi->sf.winner_mode_sf.tx_size_search_level],
+ sizeof(winner_mode_params->tx_size_search_methods));
+ memcpy(winner_mode_params->predict_dc_level,
+ predict_dc_levels[cpi->sf.winner_mode_sf.dc_blk_pred_level],
+ sizeof(winner_mode_params->predict_dc_level));
+
+ if (cpi->oxcf.row_mt == 1 && (cpi->mt_info.num_workers > 1)) {
+ if (sf->inter_sf.inter_mode_rd_model_estimation == 1) {
+ // Revert to type 2
+ sf->inter_sf.inter_mode_rd_model_estimation = 2;
+ }
+
+#if !CONFIG_FPMT_TEST
+ // Disable the speed feature 'prune_ref_frame_for_gm_search' to achieve
+ // better parallelism when number of threads available are greater than or
+ // equal to maximum number of reference frames allowed for global motion.
+ if (sf->gm_sf.gm_search_type != GM_DISABLE_SEARCH &&
+ (cpi->mt_info.num_workers >=
+ gm_available_reference_frames[sf->gm_sf.gm_search_type]))
+ sf->gm_sf.prune_ref_frame_for_gm_search = 0;
+#endif
+ }
+
+  // This only applies to real-time mode. Adaptive gf refresh is disabled if
+  // the gf_cbr_boost_pct set by the user is larger than 0.
+ if (cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 0)
+ sf->rt_sf.gf_refresh_based_on_qp = 0;
+}
+
+// Override some speed features based on qindex
+void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) {
+ AV1_COMMON *const cm = &cpi->common;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params;
+ const int boosted = frame_is_boosted(cpi);
+ const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
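+  // Note: a minimum dimension of exactly 480 satisfies both
+  // is_480p_or_lesser and is_480p_or_larger.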
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
+ const int is_1440p_or_larger = AOMMIN(cm->width, cm->height) >= 1440;
+ const int is_arf2_bwd_type =
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+
+ if (cpi->oxcf.mode == REALTIME) {
+ if (speed >= 6) {
+ const int qindex_thresh = boosted ? 190 : (is_720p_or_larger ? 120 : 150);
+ sf->part_sf.adjust_var_based_rd_partitioning =
+ frame_is_intra_only(cm)
+ ? 0
+ : cm->quant_params.base_qindex > qindex_thresh;
+ }
+ return;
+ }
+
+ if (speed == 0) {
+ // qindex_thresh for resolution < 720p
+ const int qindex_thresh = boosted ? 70 : (is_arf2_bwd_type ? 110 : 140);
+ if (!is_720p_or_larger && cm->quant_params.base_qindex <= qindex_thresh) {
+ sf->part_sf.simple_motion_search_split =
+ cm->features.allow_screen_content_tools ? 1 : 2;
+ sf->part_sf.simple_motion_search_early_term_none = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 0;
+ }
+
+ if (is_720p_or_larger && cm->quant_params.base_qindex <= 128) {
+ sf->rd_sf.perform_coeff_opt = 2 + is_1080p_or_larger;
+ memcpy(winner_mode_params->coeff_opt_thresholds,
+ &coeff_opt_thresholds[sf->rd_sf.perform_coeff_opt],
+ sizeof(winner_mode_params->coeff_opt_thresholds));
+ sf->part_sf.simple_motion_search_split =
+ cm->features.allow_screen_content_tools ? 1 : 2;
+ sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+ sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+ sf->tx_sf.model_based_prune_tx_search_level = 0;
+
+ if (is_1080p_or_larger && cm->quant_params.base_qindex <= 108) {
+ sf->inter_sf.selective_ref_frame = 2;
+ sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2;
+ sf->rd_sf.tx_domain_dist_thres_level = 1;
+ sf->part_sf.simple_motion_search_early_term_none = 1;
+ sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+ sf->interp_sf.cb_pred_filter_search = 0;
+ sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
+ sf->tx_sf.tx_type_search.skip_tx_search = 1;
+ }
+ }
+ }
+
+ if (speed >= 2) {
+ // Disable extended partitions for lower quantizers
+ const int aggr = AOMMIN(4, speed - 2);
+ const int qindex_thresh1[4] = { 50, 50, 80, 100 };
+ const int qindex_thresh2[4] = { 80, 100, 120, 160 };
+ int qindex_thresh;
+ if (aggr <= 1) {
+ const int qthresh2 =
+ (!aggr && !is_480p_or_larger) ? 70 : qindex_thresh2[aggr];
+ qindex_thresh = cm->features.allow_screen_content_tools
+ ? qindex_thresh1[aggr]
+ : qthresh2;
+ if (cm->quant_params.base_qindex <= qindex_thresh && !boosted)
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ } else if (aggr <= 2) {
+ qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr];
+ if (cm->quant_params.base_qindex <= qindex_thresh &&
+ !frame_is_intra_only(cm))
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ } else if (aggr <= 3) {
+ if (!is_480p_or_larger) {
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ } else if (!is_720p_or_larger && !frame_is_intra_only(cm) &&
+ !cm->features.allow_screen_content_tools) {
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ } else {
+ qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr];
+ if (cm->quant_params.base_qindex <= qindex_thresh &&
+ !frame_is_intra_only(cm))
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ }
+ } else {
+ sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+ }
+ }
+
+ if (speed >= 4) {
+ // Disable rectangular partitions for lower quantizers
+ const int aggr = AOMMIN(1, speed - 4);
+ const int qindex_thresh[2] = { 65, 80 };
+    const int disable_rect_part = !boosted;
+ if (cm->quant_params.base_qindex <= qindex_thresh[aggr] &&
+ disable_rect_part && is_480p_or_larger) {
+ sf->part_sf.rect_partition_eval_thresh = BLOCK_8X8;
+ }
+ }
+
+ if (speed <= 2) {
+ if (!is_stat_generation_stage(cpi)) {
+ // Use faster full-pel motion search for high quantizers.
+ // Also use reduced total search range for low resolutions at high
+ // quantizers.
+ const int aggr = speed;
+ const int qindex_thresh1 = ms_qindex_thresh[aggr][is_720p_or_larger][0];
+ const int qindex_thresh2 = ms_qindex_thresh[aggr][is_720p_or_larger][1];
+ const SEARCH_METHODS search_method =
+ motion_search_method[is_720p_or_larger];
+ if (cm->quant_params.base_qindex > qindex_thresh1) {
+ sf->mv_sf.search_method = search_method;
+ sf->tpl_sf.search_method = search_method;
+ } else if (cm->quant_params.base_qindex > qindex_thresh2) {
+ sf->mv_sf.search_method = NSTEP_8PT;
+ }
+ }
+ }
+
+ if (speed >= 4) {
+ // Disable LR search at low and high quantizers and enable only for
+ // mid-quantizer range.
+ if (!boosted && !is_arf2_bwd_type) {
+ const int qindex_low[2] = { 100, 60 };
+ const int qindex_high[2] = { 180, 160 };
+ if (cm->quant_params.base_qindex <= qindex_low[is_720p_or_larger] ||
+ cm->quant_params.base_qindex > qindex_high[is_720p_or_larger]) {
+ sf->lpf_sf.disable_loop_restoration_luma = 1;
+ }
+ }
+ }
+
+ if (speed == 1) {
+ // Reuse interinter wedge mask search from first search for non-boosted
+ // non-internal-arf frames, except at very high quantizers.
+ if (cm->quant_params.base_qindex <= 200) {
+ if (!boosted && !is_arf2_bwd_type)
+ sf->inter_sf.reuse_mask_search_results = 1;
+ }
+ }
+
+ if (speed == 5) {
+ if (!(frame_is_intra_only(&cpi->common) ||
+ cm->features.allow_screen_content_tools)) {
+ const int qindex[2] = { 256, 128 };
+ // Set the sf value as 3 for low resolution and
+ // for higher resolutions with low quantizers.
+ if (cm->quant_params.base_qindex < qindex[is_480p_or_larger])
+ sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 3;
+ }
+ }
+
+ if (speed >= 5) {
+    // Disable the sf for low quantizers in case of low-resolution screen
+    // content.
+ if (cm->features.allow_screen_content_tools &&
+ cm->quant_params.base_qindex < 128 && is_480p_or_lesser) {
+ sf->part_sf.prune_sub_8x8_partition_level = 0;
+ }
+ }
+
+  // Loop restoration size search.
+  // At speed 0, always search all available sizes for the maximum possible
+  // gain.
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_PROC_UNIT_SIZE;
+ sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+
+ if (speed >= 1) {
+    // For large frames, small restoration units are almost never useful,
+    // so prune them away.
+ if (is_1440p_or_larger) {
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+ } else if (is_720p_or_larger) {
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1;
+ }
+ }
+
+ if (speed >= 3 || (cpi->oxcf.mode == ALLINTRA && speed >= 1)) {
+    // At this speed, a full search is too expensive. Instead, pick a single
+    // size based on the frame size and qindex. Note that higher quantizers
+    // (== lower quality) and larger frames generally want larger
+    // restoration units.
+ int qindex_thresh = 96;
+ if (cm->quant_params.base_qindex <= qindex_thresh && !is_1440p_or_larger) {
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1;
+ sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX >> 1;
+ } else {
+ sf->lpf_sf.min_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+ sf->lpf_sf.max_lr_unit_size = RESTORATION_UNITSIZE_MAX;
+ }
+ }
+
+ set_subpel_search_method(&cpi->mv_search_params,
+ cpi->oxcf.unit_test_cfg.motion_vector_unit_test,
+ sf->mv_sf.subpel_search_method);
+}
diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h
new file mode 100644
index 0000000000..60c000e4f4
--- /dev/null
+++ b/third_party/aom/av1/encoder/speed_features.h
@@ -0,0 +1,2025 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SPEED_FEATURES_H_
+#define AOM_AV1_ENCODER_SPEED_FEATURES_H_
+
+#include "av1/common/enums.h"
+#include "av1/encoder/enc_enums.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/encodemb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! @file */
+
+/*!\cond */
+#define MAX_MESH_STEP 4
+
+typedef struct MESH_PATTERN {
+ int range;
+ int interval;
+} MESH_PATTERN;
+
+enum {
+ GM_FULL_SEARCH,
+ GM_REDUCED_REF_SEARCH_SKIP_L2_L3,
+ GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2,
+
+  // Same as GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2, but with extra filtering
+  // to keep at most two ref frames.
+ GM_SEARCH_CLOSEST_REFS_ONLY,
+
+ GM_DISABLE_SEARCH
+} UENUM1BYTE(GM_SEARCH_TYPE);
+
+enum {
+ DIST_WTD_COMP_ENABLED,
+ DIST_WTD_COMP_SKIP_MV_SEARCH,
+ DIST_WTD_COMP_DISABLED,
+} UENUM1BYTE(DIST_WTD_COMP_FLAG);
+
+enum {
+ INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) |
+ (1 << D135_PRED) | (1 << D113_PRED) | (1 << D157_PRED) |
+ (1 << D203_PRED) | (1 << D67_PRED) | (1 << SMOOTH_PRED) |
+ (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) | (1 << PAETH_PRED),
+ UV_INTRA_ALL =
+ (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) |
+ (1 << UV_D45_PRED) | (1 << UV_D135_PRED) | (1 << UV_D113_PRED) |
+ (1 << UV_D157_PRED) | (1 << UV_D203_PRED) | (1 << UV_D67_PRED) |
+ (1 << UV_SMOOTH_PRED) | (1 << UV_SMOOTH_V_PRED) |
+ (1 << UV_SMOOTH_H_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC = (1 << UV_DC_PRED),
+ UV_INTRA_DC_CFL = (1 << UV_DC_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED),
+ UV_INTRA_DC_PAETH_CFL =
+ (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC_H_V = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED),
+ UV_INTRA_DC_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) |
+ (1 << UV_H_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC_PAETH_H_V = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) |
+ (1 << UV_V_PRED) | (1 << UV_H_PRED),
+ UV_INTRA_DC_PAETH_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) |
+ (1 << UV_V_PRED) | (1 << UV_H_PRED) |
+ (1 << UV_CFL_PRED),
+ INTRA_DC = (1 << DC_PRED),
+ INTRA_DC_TM = (1 << DC_PRED) | (1 << PAETH_PRED),
+ INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+ INTRA_DC_H_V_SMOOTH =
+ (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << SMOOTH_PRED),
+ INTRA_DC_PAETH_H_V =
+ (1 << DC_PRED) | (1 << PAETH_PRED) | (1 << V_PRED) | (1 << H_PRED)
+};
+
+enum {
+ INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) |
+ (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) |
+ (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) |
+ (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV),
+ INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
+ (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) |
+ (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) |
+ (1 << NEAR_NEARMV),
+ INTER_SINGLE_ALL =
+ (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | (1 << NEWMV),
+};
+
+enum {
+ DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) | (1 << THR_GOLD) | (1 << THR_LAST),
+
+ DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
+
+ DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
+
+ LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) | (1 << THR_GOLD)
+};
+
+enum {
+ TXFM_CODING_SF = 1,
+ INTER_PRED_SF = 2,
+ INTRA_PRED_SF = 4,
+ PARTITION_SF = 8,
+ LOOP_FILTER_SF = 16,
+ RD_SKIP_SF = 32,
+ RESERVE_2_SF = 64,
+ RESERVE_3_SF = 128,
+} UENUM1BYTE(DEV_SPEED_FEATURES);
+
+/* This enumeration defines when the rate control recode loop will be
+ * enabled.
+ */
+enum {
+ /*
+ * No recodes allowed
+ */
+ DISALLOW_RECODE = 0,
+ /*
+ * Allow recode only for KF/ARF/GF frames
+ */
+ ALLOW_RECODE_KFARFGF = 1,
+ /*
+ * Allow recode for all frame types based on bitrate constraints.
+ */
+ ALLOW_RECODE = 2,
+} UENUM1BYTE(RECODE_LOOP_TYPE);
+
+enum {
+ SUBPEL_TREE = 0,
+ SUBPEL_TREE_PRUNED = 1, // Prunes 1/2-pel searches
+ SUBPEL_TREE_PRUNED_MORE = 2, // Prunes 1/2-pel searches more aggressively
+ SUBPEL_SEARCH_METHODS
+} UENUM1BYTE(SUBPEL_SEARCH_METHOD);
+
+enum {
+ // Try the full image with different values.
+ LPF_PICK_FROM_FULL_IMAGE,
+ // Try the full image filter search with non-dual filter only.
+ LPF_PICK_FROM_FULL_IMAGE_NON_DUAL,
+ // Try a small portion of the image with different values.
+ LPF_PICK_FROM_SUBIMAGE,
+ // Estimate the level based on quantizer and frame type
+ LPF_PICK_FROM_Q,
+ // Pick 0 to disable LPF if LPF was enabled last frame
+ LPF_PICK_MINIMAL_LPF
+} UENUM1BYTE(LPF_PICK_METHOD);
+/*!\endcond */
+
+/*!\enum CDEF_PICK_METHOD
+ * \brief This enumeration defines a variety of CDEF pick methods
+ */
+typedef enum {
+ CDEF_FULL_SEARCH, /**< Full search */
+ CDEF_FAST_SEARCH_LVL1, /**< Search among a subset of all possible filters. */
+ CDEF_FAST_SEARCH_LVL2, /**< Search a smaller subset of filters than Level 1. */
+ CDEF_FAST_SEARCH_LVL3, /**< Search a smaller subset of secondary filters
+ than Level 2. */
+ CDEF_FAST_SEARCH_LVL4, /**< Search a smaller subset of filters than Level 3. */
+ CDEF_FAST_SEARCH_LVL5, /**< Search a smaller subset of filters than Level 4. */
+ CDEF_PICK_FROM_Q, /**< Estimate filter strength based on quantizer. */
+ CDEF_PICK_METHODS
+} CDEF_PICK_METHOD;
+
+/*!\cond */
+enum {
+ // Terminate search early based on distortion so far compared to
+ // qp step, distortion in the neighborhood of the frame, etc.
+ FLAG_EARLY_TERMINATE = 1 << 0,
+
+ // Skips comp inter modes if the best so far is an intra mode.
+ FLAG_SKIP_COMP_BESTINTRA = 1 << 1,
+
+ // Skips oblique intra modes if the best so far is an inter mode.
+ FLAG_SKIP_INTRA_BESTINTER = 1 << 3,
+
+ // Skips oblique intra modes at angles 27, 63, 117, 153 if the best
+ // intra so far is not one of the neighboring directions.
+ FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4,
+
+ // Skips intra modes other than DC_PRED if the source variance is small
+ FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
+} UENUM1BYTE(MODE_SEARCH_SKIP_LOGIC);
+
+enum {
+ // No tx type pruning
+ TX_TYPE_PRUNE_0 = 0,
+ // Adaptively prunes the least promising tx types out of all 16
+ // (tuned to provide negligible quality loss)
+ TX_TYPE_PRUNE_1 = 1,
+ // Similar, but applies much more aggressive pruning to get a better speed-up
+ TX_TYPE_PRUNE_2 = 2,
+ TX_TYPE_PRUNE_3 = 3,
+ // More aggressive pruning based on tx type score and allowed tx count
+ TX_TYPE_PRUNE_4 = 4,
+ TX_TYPE_PRUNE_5 = 5,
+} UENUM1BYTE(TX_TYPE_PRUNE_MODE);
+
+enum {
+ // No reaction to rate control on a detected slide/scene change.
+ NO_DETECTION = 0,
+
+ // Set to larger Q based only on the detected slide/scene change and
+ // current/past Q.
+ FAST_DETECTION_MAXQ = 1,
+} UENUM1BYTE(OVERSHOOT_DETECTION_CBR);
+
+enum {
+ // Turns off multi-winner mode: txfm search is then done either on all modes
+ // (if winner mode processing is off) or only on the single winner mode.
+ MULTI_WINNER_MODE_OFF = 0,
+
+ // Limits the number of winner modes to at most 2
+ MULTI_WINNER_MODE_FAST = 1,
+
+ // Uses the default number of winner modes, which is 3 for intra mode, and 1
+ // for inter mode.
+ MULTI_WINNER_MODE_DEFAULT = 2,
+
+ // Total number of multi-winner mode levels.
+ MULTI_WINNER_MODE_LEVELS,
+} UENUM1BYTE(MULTI_WINNER_MODE_TYPE);
+
+enum {
+ PRUNE_NEARMV_OFF = 0, // Turn off nearmv pruning
+ PRUNE_NEARMV_LEVEL1 = 1, // Prune nearmv for qindex (0-85)
+ PRUNE_NEARMV_LEVEL2 = 2, // Prune nearmv for qindex (0-170)
+ PRUNE_NEARMV_LEVEL3 = 3, // Prune nearmv more aggressively for qindex (0-170)
+ PRUNE_NEARMV_MAX = PRUNE_NEARMV_LEVEL3,
+} UENUM1BYTE(PRUNE_NEARMV_LEVEL);
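+
+ // One way to read the levels above is as a qindex ceiling below which nearmv
+ // pruning applies, with level 3 reusing the level-2 range but pruning harder.
+ // An illustrative table (not actual encoder data), indexed by level:
+ //
+ //   static const int kPruneNearmvMaxQindex[] = { -1, 85, 170, 170 };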
+
+enum {
+ // Default transform search used in evaluation of best inter candidates
+ // (MODE_EVAL stage) and motion mode winner processing (WINNER_MODE_EVAL
+ // stage).
+ TX_SEARCH_DEFAULT = 0,
+ // Transform search in motion mode rd during MODE_EVAL stage.
+ TX_SEARCH_MOTION_MODE,
+ // Transform search in compound type mode rd during MODE_EVAL stage.
+ TX_SEARCH_COMP_TYPE_MODE,
+ // All transform search cases
+ TX_SEARCH_CASES
+} UENUM1BYTE(TX_SEARCH_CASE);
+
+typedef struct {
+ TX_TYPE_PRUNE_MODE prune_2d_txfm_mode;
+ int fast_intra_tx_type_search;
+
+ // INT_MAX: Disable fast search.
+ // 1 - 1024: Probability threshold used for conditionally forcing tx type,
+ // during mode search.
+ // 0: Force tx type to be DCT_DCT unconditionally, during
+ // mode search.
+ int fast_inter_tx_type_prob_thresh;
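+
+ // The three regimes above, sketched with hypothetical names (prob is the
+ // estimated tx-type probability on the same 1 - 1024 scale as the threshold):
+ //
+ //   if (thresh == INT_MAX) { /* full tx type search */ }
+ //   else if (thresh == 0) tx_type = DCT_DCT;         // unconditional force
+ //   else if (prob >= thresh) tx_type = top_tx_type;  // conditional force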
+
+ // Prune less-likely-chosen transforms for each intra mode. The speed
+ // feature ranges from 0 to 2, for different speed / compression trade-offs.
+ int use_reduced_intra_txset;
+
+ // Use a skip flag prediction model to detect blocks with skip = 1 early
+ // and avoid doing full TX type search for such blocks.
+ int use_skip_flag_prediction;
+
+ // Threshold used by the ML based method to predict TX block split decisions.
+ int ml_tx_split_thresh;
+
+ // Skip the remaining transform type search when the rdcost of skipping the
+ // transform is found to be better than applying it.
+ int skip_tx_search;
+
+ // Prune tx type search using previous frame stats.
+ int prune_tx_type_using_stats;
+ // Prune tx type search using estimated RDcost
+ int prune_tx_type_est_rd;
+
+ // Flag used to control the winner mode processing for tx type pruning for
+ // inter blocks. It enables further tx type mode pruning based on ML model for
+ // mode evaluation and disables tx type mode pruning for winner mode
+ // processing.
+ int winner_mode_tx_type_pruning;
+} TX_TYPE_SEARCH;
+
+enum {
+ // Search partitions using RD criterion
+ SEARCH_PARTITION,
+
+ // Always use a fixed size partition
+ FIXED_PARTITION,
+
+ // Partition using source variance
+ VAR_BASED_PARTITION,
+
+#if CONFIG_RT_ML_PARTITIONING
+ // Partition using ML model
+ ML_BASED_PARTITION
+#endif
+} UENUM1BYTE(PARTITION_SEARCH_TYPE);
+
+enum {
+ NOT_IN_USE,
+ DIRECT_PRED,
+ RELAXED_PRED,
+ ADAPT_PRED
+} UENUM1BYTE(MAX_PART_PRED_MODE);
+
+enum {
+ LAST_MV_DATA,
+ CURRENT_Q,
+ QTR_ONLY,
+} UENUM1BYTE(MV_PREC_LOGIC);
+
+enum {
+ SUPERRES_AUTO_ALL, // Tries all possible superres ratios
+ SUPERRES_AUTO_DUAL, // Tries no superres and q-based superres ratios
+ SUPERRES_AUTO_SOLO, // Only apply the q-based superres ratio
+} UENUM1BYTE(SUPERRES_AUTO_SEARCH_TYPE);
+/*!\endcond */
+
+/*!\enum INTERNAL_COST_UPDATE_TYPE
+ * \brief This enum decides internally how often to update the entropy costs
+ *
+ * INTERNAL_COST_UPDATE_TYPE is similar to \ref COST_UPDATE_TYPE but has
+ * slightly more flexibility in update frequency. This enum is separate from
+ * \ref COST_UPDATE_TYPE because although \ref COST_UPDATE_TYPE is not exposed,
+ * its values are public, so it cannot be modified without breaking the public
+ * API. Due to the use of AOMMIN() in populate_unified_cost_update_freq() to
+ * compute the unified cost update frequencies (out of COST_UPDATE_TYPE and
+ * INTERNAL_COST_UPDATE_TYPE), the values of this enum type must be listed in
+ * order of increasing update frequency.
+ *
+ * \warning In case of any updates/modifications to the enum COST_UPDATE_TYPE,
+ * update the enum INTERNAL_COST_UPDATE_TYPE as well.
+ */
+typedef enum {
+ INTERNAL_COST_UPD_OFF, /*!< Turn off cost updates. */
+ INTERNAL_COST_UPD_TILE, /*!< Update every tile. */
+ INTERNAL_COST_UPD_SBROW_SET, /*!< Update every row set of 256-pixel height. */
+ INTERNAL_COST_UPD_SBROW, /*!< Update every SB row inside a tile. */
+ INTERNAL_COST_UPD_SB, /*!< Update every SB. */
+} INTERNAL_COST_UPDATE_TYPE;
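+
+ // Because the enumerators are listed in order of increasing update frequency,
+ // the unified frequency is simply the less frequent (smaller) of the two
+ // settings. A rough sketch with hypothetical variable names:
+ //
+ //   const INTERNAL_COST_UPDATE_TYPE unified =
+ //       AOMMIN(internal_upd_level,
+ //              (INTERNAL_COST_UPDATE_TYPE)external_upd_level);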
+
+/*!\enum SIMPLE_MOTION_SEARCH_PRUNE_LEVEL
+ * \brief This enumeration defines a variety of simple motion search based
+ * partition prune levels
+ */
+typedef enum {
+ NO_PRUNING = -1,
+ SIMPLE_AGG_LVL0, /*!< Simple prune aggressiveness level 0. */
+ SIMPLE_AGG_LVL1, /*!< Simple prune aggressiveness level 1. */
+ SIMPLE_AGG_LVL2, /*!< Simple prune aggressiveness level 2. */
+ SIMPLE_AGG_LVL3, /*!< Simple prune aggressiveness level 3. */
+ QIDX_BASED_AGG_LVL1, /*!< Qindex-based prune aggressiveness level; maps to
+ simple agg level 1 or 2 based on qindex. */
+ TOTAL_SIMPLE_AGG_LVLS = QIDX_BASED_AGG_LVL1, /*!< Total number of simple prune
+ aggressiveness levels. */
+ TOTAL_QINDEX_BASED_AGG_LVLS =
+ QIDX_BASED_AGG_LVL1 -
+ SIMPLE_AGG_LVL3, /*!< Total number of qindex based simple prune
+ aggressiveness levels. */
+ TOTAL_AGG_LVLS = TOTAL_SIMPLE_AGG_LVLS +
+ TOTAL_QINDEX_BASED_AGG_LVLS, /*!< Total number of levels. */
+} SIMPLE_MOTION_SEARCH_PRUNE_LEVEL;
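+
+ // With NO_PRUNING = -1, the simple levels occupy values 0..3 and
+ // QIDX_BASED_AGG_LVL1 == 4, so the derived counts evaluate to
+ // TOTAL_SIMPLE_AGG_LVLS == 4, TOTAL_QINDEX_BASED_AGG_LVLS == 1 and
+ // TOTAL_AGG_LVLS == 5. A compile-time check of that arithmetic could read
+ // (C11, illustrative only):
+ //
+ //   _Static_assert(TOTAL_AGG_LVLS == 5, "4 simple + 1 qindex-based level");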
+
+/*!\enum PRUNE_MESH_SEARCH_LEVEL
+ * \brief This enumeration defines a variety of mesh search prune levels.
+ */
+typedef enum {
+ PRUNE_MESH_SEARCH_DISABLED = 0, /*!< Prune mesh search level 0. */
+ PRUNE_MESH_SEARCH_LVL_1 = 1, /*!< Prune mesh search level 1. */
+ PRUNE_MESH_SEARCH_LVL_2 = 2, /*!< Prune mesh search level 2. */
+} PRUNE_MESH_SEARCH_LEVEL;
+
+/*!\enum INTER_SEARCH_EARLY_TERM_IDX
+ * \brief This enumeration defines inter search early termination index in
+ * non-rd path based on sse value.
+ */
+typedef enum {
+ EARLY_TERM_DISABLED =
+ 0, /*!< Early terminate inter mode search based on sse disabled. */
+ EARLY_TERM_IDX_1 =
+ 1, /*!< Early terminate inter mode search based on sse, index 1. */
+ EARLY_TERM_IDX_2 =
+ 2, /*!< Early terminate inter mode search based on sse, index 2. */
+ EARLY_TERM_IDX_3 =
+ 3, /*!< Early terminate inter mode search based on sse, index 3. */
+ EARLY_TERM_IDX_4 =
+ 4, /*!< Early terminate inter mode search based on sse, index 4. */
+ EARLY_TERM_INDICES, /*!< Total number of early terminate indices */
+} INTER_SEARCH_EARLY_TERM_IDX;
+
+/*!
+ * \brief Sequence/frame level speed vs quality features
+ */
+typedef struct HIGH_LEVEL_SPEED_FEATURES {
+ /*! Frame level coding parameter update. */
+ int frame_parameter_update;
+
+ /*!
+ * Cases and frame types for which the recode loop is enabled.
+ */
+ RECODE_LOOP_TYPE recode_loop;
+
+ /*!
+ * Controls the tolerance vs target rate used in deciding whether to
+ * recode a frame. It has no meaning if recode is disabled.
+ */
+ int recode_tolerance;
+
+ /*!
+ * Determine how motion vector precision is chosen. The possibilities are:
+ * LAST_MV_DATA: use the mv data from the last coded frame
+ * CURRENT_Q: use the current q as a threshold
+ * QTR_ONLY: use quarter pel precision only.
+ */
+ MV_PREC_LOGIC high_precision_mv_usage;
+
+ /*!
+ * Always set to 0. If on, it enables zero-cost background transmission
+ * (except for the initial transmission of the segmentation). The feature is
+ * disabled because the addition of very large block sizes makes the
+ * backgrounds very cheap to encode, and the segmentation we have adds
+ * overhead.
+ */
+ int static_segmentation;
+
+ /*!
+ * Superres-auto mode search type.
+ */
+ SUPERRES_AUTO_SEARCH_TYPE superres_auto_search_type;
+
+ /*!
+ * Enable/disable extra screen content test by encoding key frame twice.
+ */
+ int disable_extra_sc_testing;
+
+ /*!
+ * Enable/disable second_alt_ref temporal filtering.
+ */
+ int second_alt_ref_filtering;
+
+ /*!
+ * The number of frames to be used during temporal filtering of an ARF frame
+ * is adjusted based on noise level of the current frame. The sf has three
+ * levels to decide number of frames to be considered for filtering:
+ * 0 : Use default number of frames
+ * 1 and 2 : Reduce the number of frames based on noise level with varied
+ * aggressiveness
+ */
+ int adjust_num_frames_for_arf_filtering;
+
+ /*!
+ * Decide the bit estimation approach used in qindex decision.
+ * 0: estimate bits based on a constant value;
+ * 1: estimate bits more accurately based on the frame complexity.
+ */
+ int accurate_bit_estimate;
+
+ /*!
+ * Decide the approach for weight calculation during temporal filtering.
+ * 0: Calculate weight using exp()
+ * 1: Calculate weight using a lookup table that approximates exp().
+ */
+ int weight_calc_level_in_tf;
+
+ /*!
+ * Decide whether to perform motion estimation at split block (i.e. 16x16)
+ * level or not.
+ * 0: Always allow motion estimation.
+ * 1: Conditionally allow motion estimation based on 4x4 sub-blocks variance.
+ */
+ int allow_sub_blk_me_in_tf;
+} HIGH_LEVEL_SPEED_FEATURES;
+
+/*!
+ * Speed features for the first pass.
+ */
+typedef struct FIRST_PASS_SPEED_FEATURES {
+ /*!
+ * \brief Reduces the mv search window.
+ * By default, the initial search window is around
+ * MIN(MIN(dims), MAX_FULL_PEL_VAL) = MIN(MIN(dims), 1023).
+ * Each step of reduction decreases the window size by about a factor of 2.
+ */
+ int reduce_mv_step_param;
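+
+ // A rough sketch of the effect (width and height stand in for the frame
+ // dimensions; hypothetical, not actual encoder code):
+ //
+ //   const int base = AOMMIN(AOMMIN(width, height), MAX_FULL_PEL_VAL);
+ //   const int window = base >> reduce_mv_step_param;  // ~halved per step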
+
+ /*!
+ * \brief Skips the motion search when the zero mv has small sse.
+ */
+ int skip_motion_search_threshold;
+
+ /*!
+ * \brief Skips reconstruction by using source buffers for prediction
+ */
+ int disable_recon;
+
+ /*!
+ * \brief Skips the motion search centered on 0,0 mv.
+ */
+ int skip_zeromv_motion_search;
+} FIRST_PASS_SPEED_FEATURES;
+
+/*!\cond */
+typedef struct TPL_SPEED_FEATURES {
+ // GOP length adaptive decision.
+ // If set to 0, tpl model decides whether a shorter gf interval is better.
+ // If set to 1, tpl stats of ARFs from base layer, (base+1) layer and
+ // (base+2) layer decide whether a shorter gf interval is better.
+ // If set to 2, tpl stats of ARFs from base layer, (base+1) layer and GF boost
+ // decide whether a shorter gf interval is better.
+ // If set to 3, gop length adaptive decision is disabled.
+ int gop_length_decision_method;
+ // Prune the intra modes search by tpl.
+ // If set to 0, we will search all intra modes from DC_PRED to PAETH_PRED.
+ // If set to 1, we only search DC_PRED, V_PRED, and H_PRED.
+ int prune_intra_modes;
+ // This parameter controls which step in the n-step process we start at.
+ int reduce_first_step_size;
+ // Skip motion estimation based on the precision of center MVs and the
+ // difference between center MVs.
+ // If set to 0, motion estimation is skipped for duplicate center MVs
+ // (default). If set to 1, motion estimation is skipped for duplicate
+ // full-pixel center MVs. If set to 2, motion estimation is skipped if the
+ // difference between center MVs is less than the threshold.
+ int skip_alike_starting_mv;
+
+ // When to stop subpel search.
+ SUBPEL_FORCE_STOP subpel_force_stop;
+
+ // Which search method to use.
+ SEARCH_METHODS search_method;
+
+ // Prune starting mvs in TPL based on sad scores.
+ int prune_starting_mv;
+
+ // Prune reference frames in TPL.
+ int prune_ref_frames_in_tpl;
+
+ // Support compound predictions.
+ int allow_compound_pred;
+
+ // Calculate rate and distortion based on Y plane only.
+ int use_y_only_rate_distortion;
+
+ // Use SAD instead of SATD during intra/inter mode search.
+ // If set to 0, use SATD always.
+ // If set to 1, use SAD during intra/inter mode search for frames in the
+ // higher temporal layers of the hierarchical prediction structure.
+ // If set to 2, use SAD during intra/inter mode search for all frames.
+ // This sf is disabled for the first GF group of the key-frame interval,
+ // i.e., SATD is used during intra/inter mode search of the first GF group.
+ int use_sad_for_mode_decision;
+
+ // Skip tpl processing for frames of type LF_UPDATE.
+ // This sf is disabled for the first GF group of the key-frame interval.
+ int reduce_num_frames;
+} TPL_SPEED_FEATURES;
+
+typedef struct GLOBAL_MOTION_SPEED_FEATURES {
+ GM_SEARCH_TYPE gm_search_type;
+
+ // During global motion estimation, prune remaining reference frames in a
+ // given direction (past/future) if the evaluated ref_frame in that direction
+ // yields gm_type as INVALID/TRANSLATION/IDENTITY.
+ int prune_ref_frame_for_gm_search;
+
+ // When the current GM type is set to ZEROMV, prune ZEROMV if its performance
+ // is worse than NEWMV under SSE metric.
+ // 0 : no pruning
+ // 1 : conservative pruning
+ // 2 : aggressive pruning
+ int prune_zero_mv_with_sse;
+
+ // Disable global motion estimation based on stats of previous frames in the
+ // GF group
+ int disable_gm_search_based_on_stats;
+
+ // Number of refinement steps to apply after initial model generation
+ int num_refinement_steps;
+} GLOBAL_MOTION_SPEED_FEATURES;
+
+typedef struct PARTITION_SPEED_FEATURES {
+ PARTITION_SEARCH_TYPE partition_search_type;
+
+ // Used if partition_search_type = FIXED_PARTITION
+ BLOCK_SIZE fixed_partition_size;
+
+ // Prune extended partition types search based on the current best partition
+ // and the combined rdcost of the subblocks estimated from previous
+ // partitions. Can take values 0 - 2, 0 referring to no pruning, and 1 - 2
+ // increasing aggressiveness of pruning in order.
+ int prune_ext_partition_types_search_level;
+
+ // Prune part4 based on block size
+ int prune_part4_search;
+
+ // Use a ML model to prune rectangular, ab and 4-way horz
+ // and vert partitions
+ int ml_prune_partition;
+
+ // Use a ML model to adaptively terminate partition search after trying
+ // PARTITION_SPLIT. Can take values 0 - 2, 0 meaning not being enabled, and
+ // 1 - 2 increasing aggressiveness in order.
+ int ml_early_term_after_part_split_level;
+
+ // Skip rectangular partition test when partition type none gives better
+ // rd than partition type split. Can take values 0 - 2, 0 referring to no
+ // skipping, and 1 - 2 increasing aggressiveness of skipping in order.
+ int less_rectangular_check_level;
+
+ // Use square partition only beyond this block size.
+ BLOCK_SIZE use_square_partition_only_threshold;
+
+ // Sets max square partition levels for this superblock based on
+ // motion vector and prediction error distribution produced from 16x16
+ // simple motion search
+ MAX_PART_PRED_MODE auto_max_partition_based_on_simple_motion;
+
+ // Min and max square partition size we enable (block_size) as per auto
+ // min max, but also used by adjust partitioning, and pick_partitioning.
+ BLOCK_SIZE default_min_partition_size;
+ BLOCK_SIZE default_max_partition_size;
+
+ // Sets level of adjustment of variance-based partitioning during
+ // rd_use_partition: 0 - no partition adjustment, 1 - try to merge partitions
+ // for small blocks and high QP, 2 - try to merge partitions, 3 - try to
+ // merge and split leaf partitions, with 0 - 3 in decreasing order of
+ // aggressiveness.
+ int adjust_var_based_rd_partitioning;
+
+ // Partition search early breakout thresholds.
+ int64_t partition_search_breakout_dist_thr;
+ int partition_search_breakout_rate_thr;
+
+ // Thresholds for ML based partition search breakout.
+ int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES];
+
+ // Aggressiveness levels for pruning split and rectangular partitions based on
+ // simple_motion_search. SIMPLE_AGG_LVL0 to SIMPLE_AGG_LVL3 correspond to
+ // simple motion search based pruning. QIDX_BASED_AGG_LVL1 corresponds to
+ // qindex based and simple motion search based pruning.
+ int simple_motion_search_prune_agg;
+
+ // Perform simple_motion_search on each possible subblock and use it to prune
+ // PARTITION_HORZ and PARTITION_VERT.
+ int simple_motion_search_prune_rect;
+
+ // Perform simple motion search before none_partition to decide if we
+ // want to remove all partitions other than PARTITION_SPLIT. If set to 0, this
+ // model is disabled. If set to 1, the model attempts to perform
+ // PARTITION_SPLIT only. If set to 2, the model also attempts to prune
+ // PARTITION_SPLIT.
+ int simple_motion_search_split;
+
+ // Use features from simple_motion_search to terminate prediction block
+ // partition after PARTITION_NONE
+ int simple_motion_search_early_term_none;
+
+ // Controls whether to reduce the number of motion search steps. If this is 0,
+ // then simple_motion_search has the same number of steps as
+ // single_motion_search (assuming no other speed features). Otherwise, reduce
+ // the number of steps by the value contained in this variable.
+ int simple_motion_search_reduce_search_steps;
+
+ // This variable controls the maximum block size where intra blocks can be
+ // used in inter frames.
+ // TODO(aconverse): Fold this into one of the other many mode skips
+ BLOCK_SIZE max_intra_bsize;
+
+ // Use a CNN on the luma pixels of the source frame for each 64x64 subblock
+ // to perform partition pruning in intra frames.
+ // 0: No Pruning
+ // 1: Prune split and rectangular partitions only
+ // 2: Prune none, split and rectangular partitions
+ int intra_cnn_based_part_prune_level;
+
+ // Disable extended partition search if the current bsize is greater than the
+ // threshold. Must be a square block size BLOCK_8X8 or higher.
+ BLOCK_SIZE ext_partition_eval_thresh;
+
+ // Use best partition decision so far to tune 'ext_partition_eval_thresh'
+ int ext_part_eval_based_on_cur_best;
+
+ // Disable rectangular partitions for larger block sizes.
+ int rect_partition_eval_thresh;
+
+ // Prune extended partition search based on whether the split/rect partitions
+ // provided an improvement in the previous search.
+ // 0 : no pruning
+ // 1 : prune 1:4 partition search using winner info from split partitions
+ // 2 : prune 1:4 and AB partition search using split and HORZ/VERT info
+ int prune_ext_part_using_split_info;
+
+ // Prune rectangular, AB and 4-way partitions based on qindex and block size
+ // 0 : no pruning
+ // 1 : prune sub_8x8 at very low quantizers
+ // 2 : prune all block size based on qindex
+ int prune_rectangular_split_based_on_qidx;
+
+ // Prune rectangular partitions based on 4x4 sub-block variance
+ // false : no pruning
+ // true : prune rectangular partitions based on 4x4 sub-block variance
+ // deviation
+ //
+ // For allintra encode, this speed feature reduces instruction count by 6.4%
+ // for speed=6 with coding performance change less than 0.24%. For AVIF image
+ // encode, this speed feature reduces encode time by 8.14% for speed 6 on a
+ // typical image dataset with coding performance change less than 0.16%. This
+ // speed feature is not applicable to speed >= 7.
+ bool prune_rect_part_using_4x4_var_deviation;
+
+ // Prune rectangular partitions based on prediction mode chosen by NONE
+ // partition.
+ // false : no pruning
+ // true : prunes rectangular partition as described below
+ // If prediction mode chosen by NONE partition is
+ // DC_PRED or SMOOTH_PRED: Prunes both horizontal and vertical partitions if
+ // at least one of the left and top neighbor blocks is larger than the
+ // current block.
+ // Directional Mode: Prunes either the horizontal or the vertical partition
+ // based on center angle of the prediction mode chosen by NONE partition. For
+ // example, vertical partition is pruned if center angle of the prediction
+ // mode chosen by NONE partition is close to 180 degrees (i.e. horizontal
+ // direction) and vice versa.
+ // For allintra encode, this speed feature reduces instruction count by 5.1%
+ // for speed=6 with coding performance change less than 0.22%. For AVIF image
+ // encode, this speed feature reduces encode time by 4.44% for speed 6 on a
+ // typical image dataset with coding performance change less than 0.15%.
+ // For speed >= 7, variance-based logic is used to determine the partition
+ // structure instead of recursive partition search. Therefore, this speed
+ // feature is not applicable in such cases.
+ bool prune_rect_part_using_none_pred_mode;
+
+ // Terminate partition search for the child partition when the NONE and
+ // SPLIT partition rd_costs are INT64_MAX.
+ int early_term_after_none_split;
+
+ // Level used to adjust the threshold for av1_ml_predict_breakout(). At lower
+ // levels, a more conservative threshold is used, and a value of 0 indicates
+ // that av1_ml_predict_breakout() is disabled. A value of 3 corresponds to the
+ // default case with no adjustment to lbd thresholds.
+ int ml_predict_breakout_level;
+
+ // Prune sub_8x8 (BLOCK_4X4, BLOCK_4X8 and BLOCK_8X4) partitions.
+ // 0 : no pruning
+ // 1 : pruning based on neighbour block information
+ // 2 : prune always
+ int prune_sub_8x8_partition_level;
+
+ // Prune rectangular split based on simple motion search split/no_split score.
+ // 0: disable pruning, 1: enable pruning
+ int simple_motion_search_rect_split;
+
+ // The current encoder adopts a DFS search for block partitions.
+ // Therefore the mode selection and associated rdcost is ready for smaller
+ // blocks before the mode selection for some partition types.
+ // AB partition could use previous rd information and skip mode search.
+ // An example is:
+ //
+ //  current block
+ //  +---+---+
+ //  |       |
+ //  +       +
+ //  |       |
+ //  +-------+
+ //
+ //  SPLIT partition has been searched first before trying HORZ_A
+ //  +---+---+
+ //  | R | R |
+ //  +---+---+
+ //  | R | R |
+ //  +---+---+
+ //
+ //  HORZ_A
+ //  +---+---+
+ //  |   |   |
+ //  +---+---+
+ //  |       |
+ //  +-------+
+ //
+ // With this speed feature, the top two sub blocks can directly use rdcost
+ // searched in split partition, and the mode info is also copied from
+ // saved info. Similarly, the bottom rectangular block can also use
+ // the available information from previous rectangular search.
+ int reuse_prev_rd_results_for_part_ab;
+
+ // Reuse the best prediction modes found in PARTITION_SPLIT and PARTITION_RECT
+ // when encoding PARTITION_AB.
+ int reuse_best_prediction_for_part_ab;
+
+ // The current partition search records the best rdcost so far and uses it
+ // in mode search and transform search to skip early when certain criteria
+ // are met. For example, when the current rdcost is larger than the best
+ // rdcost, or the model rdcost is larger than the best rdcost times some
+ // threshold. By default, this feature is turned on to speed up the encoder
+ // partition search.
+ // If it is disabled, at speed 0 over 30 frames we could get about a 0.25%
+ // quality gain (psnr, ssim, vmaf), with about a 13% slowdown.
+ int use_best_rd_for_pruning;
+
+ // Skip evaluation of non-square partitions based on the corresponding NONE
+ // partition.
+ // 0: no pruning
+ // 1: prune extended partitions if NONE is skippable
+ // 2: on top of 1, prune rectangular partitions if NONE is inter, not a newmv
+ // mode and skippable
+ int skip_non_sq_part_based_on_none;
+
+ // Disables 8x8 and below partitions for low quantizers.
+ int disable_8x8_part_based_on_qidx;
+} PARTITION_SPEED_FEATURES;
+
+typedef struct MV_SPEED_FEATURES {
+ // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
+ SEARCH_METHODS search_method;
+
+ // Enable the use of faster, less accurate mv search method
+ // 0: disable, 1: if bsize >= BLOCK_32X32, 2: based on bsize, SAD and qp
+ // TODO(chiyotsai@google.com): Take the clip's resolution and mv activity into
+ // account.
+ int use_bsize_dependent_search_method;
+
+ // If this is set to 1, we limit the motion search range to 2 times the
+ // largest motion vector found in the last frame.
+ int auto_mv_step_size;
+
+ // subpel_search_method can only be subpel_tree, which does a subpixel
+ // logarithmic search that keeps stepping at 1/2-pixel units until
+ // it stops getting a gain, then moves on to 1/4-pixel units and repeats
+ // the same process. Along the way it skips many diagonals.
+ SUBPEL_SEARCH_METHOD subpel_search_method;
+
+ // Maximum number of steps in logarithmic subpel search before giving up.
+ int subpel_iters_per_step;
+
+ // When to stop subpel search.
+ SUBPEL_FORCE_STOP subpel_force_stop;
+
+ // When to stop subpel search in simple motion search.
+ SUBPEL_FORCE_STOP simple_motion_subpel_force_stop;
+
+ // If true, sub-pixel search uses the exact convolve function used for final
+ // encoding and decoding; otherwise, it uses bilinear interpolation.
+ SUBPEL_SEARCH_TYPE use_accurate_subpel_search;
+
+ // Threshold for allowing exhaustive motion search.
+ int exhaustive_searches_thresh;
+
+ // Pattern to be used for any exhaustive mesh searches (except intraBC ME).
+ MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
+
+ // Pattern to be used for exhaustive mesh searches of intraBC ME.
+ MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_STEP];
+
+ // Reduce single motion search range based on MV result of prior ref_mv_idx.
+ int reduce_search_range;
+
+ // Prune mesh search.
+ PRUNE_MESH_SEARCH_LEVEL prune_mesh_search;
+
+ // Use the rd cost around the best FULLPEL_MV to speed up subpel search
+ int use_fullpel_costlist;
+
+ // Set the full pixel search level of obmc
+ // 0: obmc_full_pixel_diamond
+ // 1: obmc_refining_search_sad (faster)
+ int obmc_full_pixel_search_level;
+
+ // Accurate full pixel motion search based on TPL stats.
+ int full_pixel_search_level;
+
+ // Allow intrabc motion search
+ int use_intrabc;
+
+ // Whether to downsample the rows in sad calculation during motion search.
+ // This is only active when there are at least 16 rows. When this sf is
+ // active, if there is a large discrepancy in the SAD values for the final
+ // motion vector between skipping vs not skipping, motion search is redone
+ // with skip row features off.
+ // 0: Disabled (do not downsample rows)
+ // 1: Skip SAD calculation of odd rows if the SAD deviation of the even and
+ // odd rows for the starting MV is small. Redo motion search with sf off
+ // when SAD deviation is high for the final motion vector.
+ // 2: Skip SAD calculation of odd rows. SAD deviation is not tested for the
+ // start MV and tested only for the final MV.
+ int use_downsampled_sad;
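+
+ // Conceptually, the downsampled SAD visits only the even rows and scales the
+ // result to stay comparable with the full SAD. A sketch (not the actual
+ // kernels; strides and scaling are illustrative):
+ //
+ //   int sad = 0;
+ //   for (int r = 0; r < rows; r += 2)
+ //     for (int c = 0; c < cols; ++c)
+ //       sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
+ //   sad *= 2;  // compensate for the skipped odd rows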
+
+ // Enable/disable extensive joint motion search.
+ int disable_extensive_joint_motion_search;
+
+ // Enable second best mv check in joint mv search.
+ // 0: allow second MV (use rd cost as the metric)
+ // 1: use var as the metric
+ // 2: disable second MV
+ int disable_second_mv;
+
+ // Skips full pixel search based on start mv of prior ref_mv_idx.
+ // 0: Disabled
+ // 1: Skips the full pixel search up to 4 neighboring full-pel MV positions.
+ // 2: Skips the full pixel search up to 8 neighboring full-pel MV positions.
+ int skip_fullpel_search_using_startmv;
+
+ // Method to use for refining WARPED_CAUSAL motion vectors
+ // TODO(rachelbarker): Can this be unified with OBMC in some way?
+ WARP_SEARCH_METHOD warp_search_method;
+
+ // Maximum number of iterations in WARPED_CAUSAL refinement search
+ int warp_search_iters;
+} MV_SPEED_FEATURES;
+
+typedef struct INTER_MODE_SPEED_FEATURES {
+ // 2-pass inter mode model estimation where the preliminary pass skips
+ // transform search and uses a model to estimate rd, while the final pass
+ // computes the full transform search. Two types of models are supported:
+ // 0: not used
+ // 1: used with online dynamic rd model
+ // 2: used with static rd model
+ int inter_mode_rd_model_estimation;
+
+ // Bypass transform search based on skip rd at following stages
+ // i. Compound type mode search
+ // ii. Motion mode search (mode evaluation and winner motion mode stage)
+ // iii. Transform search for best inter candidates
+ int txfm_rd_gate_level[TX_SEARCH_CASES];
+
+ // Limit the inter mode tested in the RD loop
+ int reduce_inter_modes;
+
+ // This variable is used to cap the maximum number of times we skip testing a
+ // mode to be evaluated. A high value means we will be faster.
+ int adaptive_rd_thresh;
+
+ // Aggressively prune inter modes when best mode is skippable.
+ int prune_inter_modes_if_skippable;
+
+ // Drop reference frames that are less likely to be picked in the RD search.
+ // Has seven levels for now: 0, 1, 2, 3, 4, 5 and 6, where higher levels prune
+ // more aggressively than lower ones. (0 means no pruning.)
+ int selective_ref_frame;
+
+ // Prune reference frames for rectangular partitions.
+ // 0 implies no pruning
+ // 1 implies prune for extended partition
+ // 2 implies prune horiz, vert and extended partition
+ int prune_ref_frame_for_rect_partitions;
+
+ // Prune inter modes w.r.t past reference frames
+ // 0 no pruning
+ // 1 prune inter modes w.r.t ALTREF2 and ALTREF reference frames
+ // 2 prune inter modes w.r.t BWDREF, ALTREF2 and ALTREF reference frames
+ int alt_ref_search_fp;
+
+ // Prune reference frames for single prediction modes based on temporal
+ // distance and pred MV SAD. Feasible values are 0, 1, 2. The feature is
+ // disabled for 0. An increasing value indicates more aggressive pruning
+ // threshold.
+ int prune_single_ref;
+
+ // Prune compound reference frames
+ // 0 no pruning
+ // 1 prune compound references which do not satisfy the two conditions:
+ // a) The references are at a nearest distance from the current frame in
+ // both past and future direction.
+ // b) The references have minimum pred_mv_sad in both past and future
+ // direction.
+ // 2 prune compound references except the one with nearest distance from the
+ // current frame in both past and future direction.
+ int prune_comp_ref_frames;
+
+ // Skip the current ref_mv in NEW_MV mode based on mv, rate cost, etc.
+ // This speed feature equaling 0 means no skipping.
+ // If the speed feature equals 1 or 2, skip the current ref_mv in NEW_MV mode
+ // if we have already encountered another ref_mv in the drl such that:
+ // 1. The other drl entry yields the same mv as the current mv during the
+ // SIMPLE_TRANSLATION search process.
+ // 2. The rate needed to encode the current mv is larger than that for the
+ // other ref_mv.
+ // The speed feature equaling 1 means using subpel mv in the comparison.
+ // The speed feature equaling 2 means using fullpel mv in the comparison.
+ // If the speed feature >= 3, skip the current ref_mv in NEW_MV mode based on
+ // known full_mv bestsme and drl cost.
+ int skip_newmv_in_drl;
+
+ // This speed feature checks duplicate ref MVs among NEARESTMV, NEARMV,
+ // GLOBALMV and skips NEARMV or GLOBALMV (in order) if a duplicate is found
+ // TODO(any): Instead of skipping repeated ref mv, use the recalculated
+ // rd-cost based on mode rate and skip the mode evaluation
+ int skip_repeated_ref_mv;
+
+ // Flag used to control the ref_best_rd based gating for chroma
+ int perform_best_rd_based_gating_for_chroma;
+
+ // Reuse the inter_intra_mode search result from NEARESTMV mode to other
+ // single ref modes
+ int reuse_inter_intra_mode;
+
+ // prune wedge and compound segment approximate rd evaluation based on
+ // compound average modeled rd
+ int prune_comp_type_by_model_rd;
+
+ // prune wedge and compound segment approximate rd evaluation based on
+ // compound average rd/ref_best_rd
+ int prune_comp_type_by_comp_avg;
+
+ // Skip some ref frames in compound motion search using the single motion
+ // search result. Takes values 0 - 3: 0 refers to no skipping, and 1 - 3
+ // increase the aggressiveness of skipping in order.
+ // Note: The search order might affect the result. It assumes that the single
+ // reference modes are searched before compound modes. It is better to search
+ // same single inter mode as a group.
+ int prune_comp_search_by_single_result;
+
+ // Instead of performing a full MV search, do a simple translation first
+ // and only perform a full MV search on the motion vectors that performed
+ // well.
+ int prune_mode_search_simple_translation;
+
+ // Only search compound modes with at least one "good" reference frame.
+ // A reference frame is good if, after looking at its performance among
+ // the single reference modes, it is one of the two best performers.
+ int prune_compound_using_single_ref;
+
+ // Skip extended compound mode (NEAREST_NEWMV, NEW_NEARESTMV, NEAR_NEWMV,
+ // NEW_NEARMV) using ref frames of above and left neighbor
+ // blocks.
+ // 0 : no pruning
+ // 1 : prune ext compound modes using neighbor blocks (less aggressiveness)
+ // 2 : prune ext compound modes using neighbor blocks (high aggressiveness)
+ // 3 : prune ext compound modes unconditionally (highest aggressiveness)
+ int prune_ext_comp_using_neighbors;
+
+ // Skip NEW_NEARMV and NEAR_NEWMV extended compound modes
+ int skip_ext_comp_nearmv_mode;
+
+ // Skip extended compound mode when ref frame corresponding to NEWMV does not
+ // have NEWMV as single mode winner.
+ // 0 : no pruning
+ // 1 : prune extended compound mode (less aggressiveness)
+ // 2 : prune extended compound mode (high aggressiveness)
+ int prune_comp_using_best_single_mode_ref;
+
+ // Skip NEARESTMV and NEARMV using weight computed in ref mv list population
+ //
+ // Pruning is enabled only when both the top and left neighbor blocks are
+ // available and when the current block already has a valid inter prediction.
+ int prune_nearest_near_mv_using_refmv_weight;
+
+ // Based on previous ref_mv_idx search result, prune the following search.
+ int prune_ref_mv_idx_search;
+
+ // Disable one sided compound modes.
+ int disable_onesided_comp;
+
+ // Prune obmc search using previous frame stats.
+ // INT_MAX : disable obmc search
+ int prune_obmc_prob_thresh;
+
+ // Prune warped motion search using previous frame stats.
+ int prune_warped_prob_thresh;
+
+ // Variance threshold to enable/disable Interintra wedge search
+ unsigned int disable_interintra_wedge_var_thresh;
+
+ // Variance threshold to enable/disable Interinter wedge search
+ unsigned int disable_interinter_wedge_var_thresh;
+
+ // Decouple wedge and mode search during interintra RDO.
+ int fast_interintra_wedge_search;
+
+ // Whether fast wedge sign estimate is used
+ int fast_wedge_sign_estimate;
+
+ // Enable/disable ME for interinter wedge search.
+ int disable_interinter_wedge_newmv_search;
+
+ // Decide when and how to use joint_comp.
+ DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag;
+
+ // Clip the frequency of updating the mv cost.
+ INTERNAL_COST_UPDATE_TYPE mv_cost_upd_level;
+
+ // Clip the frequency of updating the coeff cost.
+ INTERNAL_COST_UPDATE_TYPE coeff_cost_upd_level;
+
+ // Clip the frequency of updating the mode cost.
+ INTERNAL_COST_UPDATE_TYPE mode_cost_upd_level;
+
+ // Prune inter modes based on tpl stats
+ // 0 : no pruning
+ // 1 - 3 indicate increasing aggressiveness in order.
+ int prune_inter_modes_based_on_tpl;
+
+ // Skip NEARMV and NEAR_NEARMV modes using ref frames of above and left
+ // neighbor blocks and qindex.
+ PRUNE_NEARMV_LEVEL prune_nearmv_using_neighbors;
+
+ // Model based breakout after interpolation filter search
+ // 0: no breakout
+ // 1: use model based rd breakout
+ int model_based_post_interp_filter_breakout;
+
+ // Reuse compound type rd decision when exact match is found
+ // 0: No reuse
+ // 1: Reuse the compound type decision
+ int reuse_compound_type_decision;
+
+ // Enable/disable masked compound.
+ int disable_masked_comp;
+
+ // Enable/disable MV refinement for compound modes corresponding to the
+ // compound types COMPOUND_AVERAGE, COMPOUND_DISTWTD (currently, this compound
+ // type is disabled for speeds >= 2 via the sf 'use_dist_wtd_comp_flag') and
+ // COMPOUND_DIFFWTD, based on availability. Levels 0 to 3 indicate
+ // increasing order of aggressiveness in disabling MV refinement.
+ // 0: MV Refinement is enabled and for NEW_NEWMV mode used two iterations of
+ // refinement in av1_joint_motion_search().
+ // 1: MV Refinement is disabled for COMPOUND_DIFFWTD and enabled for
+ // COMPOUND_AVERAGE & COMPOUND_DISTWTD.
+ // 2: MV Refinement is enabled for COMPOUND_AVERAGE & COMPOUND_DISTWTD for
+ // NEW_NEWMV mode with one iteration of refinement in
+ // av1_joint_motion_search() and MV Refinement is disabled for other compound
+ // type modes.
+ // 3: MV Refinement is disabled.
+ int enable_fast_compound_mode_search;
+
+ // Reuse masked compound type search results
+ int reuse_mask_search_results;
+
+ // Enable/disable fast search for wedge masks
+ int enable_fast_wedge_mask_search;
+
+ // Early breakout from transform search of inter modes
+ int inter_mode_txfm_breakout;
+
+ // Limit number of inter modes for txfm search if a newmv mode gets
+ // evaluated among the top modes.
+ // 0: no pruning
+ // 1 to 3 indicate increasing order of aggressiveness
+ int limit_inter_mode_cands;
+
+ // Cap the no. of txfm searches for a given prediction mode.
+ // 0: no cap, 1: cap beyond first 4 searches, 2: cap beyond first 3 searches.
+ int limit_txfm_eval_per_mode;
+
+ // Prune warped motion search based on block size.
+ int extra_prune_warped;
+
+ // Do not search compound modes for ARF.
+ // The intuition is that ARF is predicted by frames far away from it,
+ // whose temporal correlations with the ARF are likely low.
+ // It is therefore likely that compound modes do not work as well for ARF
+ // as other inter frames.
+ // Speed/quality impact:
+ // Speed 1: 12% faster, 0.1% psnr loss.
+ // Speed 2: 2% faster, 0.05% psnr loss.
+ // No change for speed 3 and up, because |disable_onesided_comp| is true.
+ int skip_arf_compound;
+} INTER_MODE_SPEED_FEATURES;
+
+typedef struct INTERP_FILTER_SPEED_FEATURES {
+ // Do limited interpolation filter search for dual filters, since the best
+ // choice usually includes EIGHTTAP_REGULAR.
+ int use_fast_interpolation_filter_search;
+
+ // Disable dual filter
+ int disable_dual_filter;
+
+ // Save results of av1_interpolation_filter_search for a block.
+ // Check mv and ref_frames before the search; if they are very close to
+ // previously saved results, the filter search can be skipped.
+ int use_interp_filter;
+
+ // skip sharp_filter evaluation based on regular and smooth filter rd for
+ // dual_filter=0 case
+ int skip_sharp_interp_filter_search;
+
+ // skip interpolation filter search for a block in chessboard pattern
+ int cb_pred_filter_search;
+
+ // adaptive interp_filter search to allow skip of certain filter types.
+ int adaptive_interp_filter_search;
+
+ // Forces interpolation filter to EIGHTTAP_REGULAR and skips interpolation
+ // filter search.
+ int skip_interp_filter_search;
+} INTERP_FILTER_SPEED_FEATURES;
+
+typedef struct INTRA_MODE_SPEED_FEATURES {
+ // These bit masks allow you to enable or disable intra modes for each
+ // transform size separately.
+ int intra_y_mode_mask[TX_SIZES];
+ int intra_uv_mode_mask[TX_SIZES];
+
+ // flag to allow skipping intra mode for inter frame prediction
+ int skip_intra_in_interframe;
+
+ // Prune intra mode candidates based on source block histogram of gradient.
+ // Applies to luma plane only.
+ // Feasible values are 0..4. The feature is disabled for 0. An increasing
+ // value indicates more aggressive pruning threshold.
+ int intra_pruning_with_hog;
+
+ // Prune intra mode candidates based on source block histogram of gradient.
+ // Applies to chroma plane only.
+ // Feasible values are 0..4. The feature is disabled for 0. An increasing
+ // value indicates more aggressive pruning threshold.
+ int chroma_intra_pruning_with_hog;
+
+ // Enable/disable smooth intra modes.
+ int disable_smooth_intra;
+
+ // Prune UV_SMOOTH_PRED mode for chroma based on chroma source variance.
+ // false : No pruning
+ // true : Prune UV_SMOOTH_PRED mode based on chroma source variance
+ //
+ // For allintra encode, this speed feature reduces instruction count
+ // by 1.90%, 2.21% and 1.97% for speed 6, 7 and 8 with coding performance
+ // change less than 0.04%. For AVIF image encode, this speed feature reduces
+ // encode time by 1.56%, 2.14% and 0.90% for speed 6, 7 and 8 on a typical
+ // image dataset with coding performance change less than 0.05%.
+ bool prune_smooth_intra_mode_for_chroma;
+
+ // Prune filter intra modes in intra frames.
+ // 0 : No pruning
+ // 1 : Evaluate applicable filter intra modes based on best intra mode so far
+ // 2 : Do not evaluate filter intra modes
+ int prune_filter_intra_level;
+
+ // prune palette search
+ // 0: No pruning
+ // 1: Perform coarse search to prune the palette colors. For winner colors,
+ // neighbors are also evaluated using a finer search.
+ // 2: Perform 2 way palette search from max colors to min colors (and min
+ // colors to remaining colors) and terminate the search if current number of
+ // palette colors is not the winner.
+ int prune_palette_search_level;
+
+ // Terminate early in luma palette_size search. Speed feature values indicate
+ // increasing level of pruning.
+ // 0: No early termination
+ // 1: Terminate early for higher luma palette_size, if header rd cost of lower
+ // palette_size is more than 2 * best_rd. This level of pruning is more
+ // conservative when compared to sf level 2 as the cases which will get pruned
+ // with sf level 1 is a subset of the cases which will get pruned with sf
+ // level 2.
+ // 2: Terminate early for higher luma palette_size, if header rd cost of lower
+ // palette_size is more than best_rd.
+ // For allintra encode, this sf reduces instruction count by 2.49%, 1.07%,
+ // 2.76%, 2.30%, 1.84%, 2.69%, 2.04%, 2.05% and 1.44% for speed 0, 1, 2, 3, 4,
+ // 5, 6, 7 and 8 on screen content set with coding performance change less
+ // than 0.01% for speed <= 2 and less than 0.03% for speed >= 3. For AVIF
+ // image encode, this sf reduces instruction count by 1.94%, 1.13%, 1.29%,
+ // 0.93%, 0.89%, 1.03%, 1.07%, 1.20% and 0.18% for speed 0, 1, 2, 3, 4, 5, 6,
+ // 7 and 8 on a typical image dataset with coding performance change less than
+ // 0.01%.
+ int prune_luma_palette_size_search_level;
+
+ // Prune chroma intra modes based on luma intra mode winner.
+ // 0: No pruning
+ // 1: Prune chroma intra modes other than UV_DC_PRED, UV_SMOOTH_PRED,
+ // UV_CFL_PRED and the mode that corresponds to luma intra mode winner.
+ int prune_chroma_modes_using_luma_winner;
+
+ // Clip the frequency of updating the mv cost for intrabc.
+ INTERNAL_COST_UPDATE_TYPE dv_cost_upd_level;
+
+ // We use the DCT_DCT transform followed by computing SATD (Sum of Absolute
+ // Transformed Differences) as an estimate of the RD score to quickly find the
+ // best possible Chroma from Luma (CFL) parameter. Then we do a full RD search
+ // near the best possible parameter. The search range is set here.
+ // The range of cfl_search_range should be [1, 33], and the following are the
+ // recommended values.
+ // 1: Fastest mode.
+ // 3: Default mode that provides good speedup without losing compression
+ // performance at speed 0.
+ // 33: Exhaustive rd search (33 == CFL_MAGS_SIZE). This mode should only
+ // be used for debugging purposes.
+ int cfl_search_range;
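+
+ // One plausible reading of the range (index names are hypothetical): the
+ // full RD search visits a window of about cfl_search_range candidates
+ // centered on the SATD-best index, clamped to the valid magnitudes:
+ //
+ //   const int lo = AOMMAX(0, best_idx - cfl_search_range / 2);
+ //   const int hi = AOMMIN(CFL_MAGS_SIZE - 1, best_idx + cfl_search_range / 2);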
+
+ // TOP_INTRA_MODEL_COUNT (4) is the number of top model-rd candidates stored
+ // in the intra mode decision. This speed feature reduces that number for
+ // higher speeds.
+ int top_intra_model_count_allowed;
+
+ // Adapt top_intra_model_count_allowed locally to prune luma intra modes using
+ // neighbor block and quantizer information.
+ int adapt_top_model_rd_count_using_neighbors;
+
+ // Prune the evaluation of odd delta angles of directional luma intra modes by
+ // using the rdcosts of neighbouring delta angles.
+ // For allintra encode, this speed feature reduces instruction count
+ // by 4.461%, 3.699% and 3.536% for speed 6, 7 and 8 on a typical video
+ // dataset with coding performance change less than 0.26%. For AVIF image
+ // encode, this speed feature reduces encode time by 2.849%, 2.471%,
+ // and 2.051% for speed 6, 7 and 8 on a typical image dataset with coding
+ // performance change less than 0.27%.
+ int prune_luma_odd_delta_angles_in_intra;
+
+ // Terminate early in chroma palette_size search.
+ // 0: No early termination
+ // 1: Terminate early for higher palette_size, if header rd cost of lower
+ // palette_size is more than best_rd.
+ // For allintra encode, this sf reduces instruction count by 0.45%,
+ // 0.62%, 1.73%, 2.50%, 2.89%, 3.09% and 3.86% for speed 0 to 6 on screen
+ // content set with coding performance change less than 0.01%.
+ // For AVIF image encode, this sf reduces instruction count by 0.45%, 0.81%,
+ // 0.85%, 1.05%, 1.45%, 1.66% and 1.95% for speed 0 to 6 on a typical image
+ // dataset with no quality drop.
+ int early_term_chroma_palette_size_search;
+
+ // Skips the evaluation of filter intra modes in inter frames if rd evaluation
+ // of luma intra dc mode results in invalid rd stats.
+ int skip_filter_intra_in_inter_frames;
+} INTRA_MODE_SPEED_FEATURES;
+
+typedef struct TX_SPEED_FEATURES {
+ // Init search depth for square and rectangular transform partitions.
+ // Values:
+ // 0 - search full tree, 1 - search 1 level, 2 - search the highest level only
+ int inter_tx_size_search_init_depth_sqr;
+ int inter_tx_size_search_init_depth_rect;
+ int intra_tx_size_search_init_depth_sqr;
+ int intra_tx_size_search_init_depth_rect;
+
+ // If any dimension of a coding block exceeds 64, always search the
+ // largest transform only, since the largest transform block size is 64x64.
+ int tx_size_search_lgr_block;
+
+ TX_TYPE_SEARCH tx_type_search;
+
+ // Skip split transform block partition when the collocated bigger block
+ // is selected as all zero coefficients.
+ int txb_split_cap;
+
+ // Shortcut the transform block partition and type search when the target
+ // rdcost is relatively low.
+ // Values are 0 (not used), or 1 - 2 with progressively increasing
+ // aggressiveness.
+ int adaptive_txb_search_level;
+
+ // Prune level for tx_size_type search for inter based on rd model
+ // 0: no pruning
+ // 1-2: progressively increasing aggressiveness of pruning
+ int model_based_prune_tx_search_level;
+
+ // Refine TX type after fast TX search.
+ int refine_fast_tx_search_results;
+
+ // Prune transform split/no_split eval based on residual properties. A value
+ // of 0 indicates no pruning, and the aggressiveness of pruning progressively
+ // increases from levels 1 to 3.
+ int prune_tx_size_level;
+
+ // Prune the evaluation of transform depths as decided by the NN model.
+ // false: No pruning.
+ // true : Avoid the evaluation of specific transform depths using NN model.
+ //
+ // For allintra encode, this speed feature reduces instruction count
+ // by 4.76%, 8.92% and 11.28% for speed 6, 7 and 8 with coding performance
+ // change less than 0.32%. For AVIF image encode, this speed feature reduces
+ // encode time by 4.65%, 9.16% and 10.45% for speed 6, 7 and 8 on a typical
+ // image dataset with coding performance change less than 0.19%.
+ bool prune_intra_tx_depths_using_nn;
+
+ // Enable/disable early breakout during transform search of intra modes, by
+ // using the minimum rd cost possible. By using this approach, the rd
+ // evaluation of applicable transform blocks (in the current block) can be
+ // avoided as
+ // 1) best_rd evolves during the search in choose_tx_size_type_from_rd()
+ // 2) appropriate ref_best_rd is passed in intra_block_yrd()
+ //
+ // For allintra encode, this speed feature reduces instruction count
+ // by 1.11%, 1.08%, 1.02% and 0.93% for speed 3, 6, 7 and 8 with coding
+ // performance change less than 0.02%. For AVIF image encode, this speed
+ // feature reduces encode time by 0.93%, 1.46%, 1.07%, 0.84%, 0.99% and 0.73%
+ // for speed 3, 4, 5, 6, 7 and 8 on a typical image dataset with coding
+ // performance change less than 0.004%.
+ bool use_rd_based_breakout_for_intra_tx_search;
+} TX_SPEED_FEATURES;
+
+typedef struct RD_CALC_SPEED_FEATURES {
+ // Fast approximation of av1_model_rd_from_var_lapndz
+ int simple_model_rd_from_var;
+
+ // Perform faster distortion computation during the R-D evaluation by trying
+ // to approximate the prediction error with transform coefficients (faster but
+ // less accurate) rather than computing distortion in the pixel domain (slower
+ // but more accurate). The following methods are used for distortion
+ // computation:
+ // Method 0: Always compute distortion in the pixel domain
+ // Method 1: Based on block error, try using transform domain distortion for
+ // tx_type search and compute distortion in pixel domain for final RD_STATS
+ // Method 2: Based on block error, try to compute distortion in transform
+ // domain
+ // Methods 1 and 2 may fall back to computing distortion in the pixel domain
+ // in case the block error is less than the threshold, which is controlled by
+ // the speed feature tx_domain_dist_thres_level.
+ //
+ // The speed feature tx_domain_dist_level decides which of the above methods
+ // needs to be used across different mode evaluation stages as described
+ // below:
+ // Eval type: Default     Mode        Winner
+ // Level 0  : Method 0    Method 2    Method 0
+ // Level 1  : Method 1    Method 2    Method 0
+ // Level 2  : Method 2    Method 2    Method 0
+ // Level 3  : Method 2    Method 2    Method 2
+ int tx_domain_dist_level;
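+
+ // The level/stage mapping above can be expressed as a lookup indexed by
+ // [level][eval stage] (an illustrative constant, not from the encoder):
+ //
+ //   static const int kTxDomainDistMethod[4][3] = {
+ //     // Default, Mode, Winner
+ //     { 0, 2, 0 }, { 1, 2, 0 }, { 2, 2, 0 }, { 2, 2, 2 },
+ //   };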
+
+ // Transform domain distortion threshold level
+ int tx_domain_dist_thres_level;
+
+ // Trellis (dynamic programming) optimization of quantized values
+ TRELLIS_OPT_TYPE optimize_coefficients;
+
+ // Use hash table to store macroblock RD search results
+ // to avoid repeated search on the same residue signal.
+ int use_mb_rd_hash;
+
+ // Flag used to control the extent of coeff R-D optimization
+ int perform_coeff_opt;
+} RD_CALC_SPEED_FEATURES;
+
+typedef struct WINNER_MODE_SPEED_FEATURES {
+ // Flag used to control the winner mode processing for better R-D optimization
+ // of quantized coeffs
+ int enable_winner_mode_for_coeff_opt;
+
+ // Flag used to control the winner mode processing for transform size
+ // search method
+ int enable_winner_mode_for_tx_size_srch;
+
+ // Control transform size search level
+ // Eval type: Default      Mode         Winner
+ // Level 0  : FULL RD      LARGEST ALL  FULL RD
+ // Level 1  : FAST RD      LARGEST ALL  FULL RD
+ // Level 2  : LARGEST ALL  LARGEST ALL  FULL RD
+ // Level 3  : LARGEST ALL  LARGEST ALL  LARGEST ALL
+ int tx_size_search_level;
+
+ // Flag used to control the winner mode processing for use transform
+ // domain distortion
+ int enable_winner_mode_for_use_tx_domain_dist;
+
+ // Flag used to enable processing of multiple winner modes
+ MULTI_WINNER_MODE_TYPE multi_winner_mode_type;
+
+ // Motion mode for winner candidates:
+ // 0: speed feature OFF
+ // 1 / 2 : Use configured number of winner candidates
+ int motion_mode_for_winner_cand;
+
+ // Controls the prediction of transform skip block or DC only block.
+ //
+ // Different speed feature values (0 to 3) decide the aggressiveness of
+ // prediction (refer to predict_dc_levels[][] in speed_features.c) to be used
+ // during different mode evaluation stages.
+ int dc_blk_pred_level;
+
+ // If on, disables interpolation filter search in handle_inter_mode loop, and
+ // performs it during winner mode processing by \ref
+ // tx_search_best_inter_candidates.
+ int winner_mode_ifs;
+
+ // Controls the disabling of winner mode processing. Speed feature levels
+ // are ordered in increasing aggressiveness of pruning. The method considered
+ // for disabling depends on the sf level value, as described below.
+ // 0: Do not disable
+ // 1: Disable for blocks with low source variance.
+ // 2: Disable for blocks which turn out to be transform skip (skipped based on
+ // eob) during MODE_EVAL stage except NEWMV mode.
+ // 3: Disable for blocks which turn out to be transform skip during MODE_EVAL
+ // stage except NEWMV mode. For high quantizers, prune conservatively based on
+ // transform skip (skipped based on eob) except for NEWMV mode.
+ // 4: Disable for blocks which turn out to be transform skip during MODE_EVAL
+ // stage.
+ int prune_winner_mode_eval_level;
+} WINNER_MODE_SPEED_FEATURES;
+
+typedef struct LOOP_FILTER_SPEED_FEATURES {
+ // This feature controls how the loop filter level is determined.
+ LPF_PICK_METHOD lpf_pick;
+
+ // Skip some final iterations in the determination of the best loop filter
+ // level.
+ int use_coarse_filter_level_search;
+
+ // Control how the CDEF strength is determined.
+ CDEF_PICK_METHOD cdef_pick_method;
+
+ // Decoder-side speed feature to add a penalty for the use of dual-sgr
+ // filters. Takes values 0 - 10, 0 indicating no penalty, with each additional
+ // level adding a penalty of 1%.
+ int dual_sgr_penalty_level;
+
+ // Prune sgr ep using a binary-search-like mechanism
+ int enable_sgr_ep_pruning;
+
+ // Disable loop restoration for Chroma plane
+ int disable_loop_restoration_chroma;
+
+ // Disable loop restoration for luma plane
+ int disable_loop_restoration_luma;
+
+ // Range of loop restoration unit sizes to search
+ // The minimum size is clamped against the superblock size in
+ // av1_pick_filter_restoration, so that the code which sets this value does
+ // not need to know the superblock size ahead of time.
+ int min_lr_unit_size;
+ int max_lr_unit_size;
+
+ // Prune RESTORE_WIENER evaluation based on source variance
+ // 0 : no pruning
+ // 1 : conservative pruning
+ // 2 : aggressive pruning
+ int prune_wiener_based_on_src_var;
+
+ // Prune self-guided loop restoration based on wiener search results
+ // 0 : no pruning
+ // 1 : pruning based on rdcost ratio of RESTORE_WIENER and RESTORE_NONE
+ // 2 : pruning based on winner restoration type among RESTORE_WIENER and
+ // RESTORE_NONE
+ int prune_sgr_based_on_wiener;
+
+ // Reduce the wiener filter window size for luma
+ int reduce_wiener_window_size;
+
+ // Flag to disable Wiener Loop restoration filter.
+ bool disable_wiener_filter;
+
+ // Flag to disable Self-guided Loop restoration filter.
+ bool disable_sgr_filter;
+
+ // Disable the refinement search around the wiener filter coefficients.
+ bool disable_wiener_coeff_refine_search;
+
+ // Whether to downsample the rows in computation of wiener stats.
+ int use_downsampled_wiener_stats;
+} LOOP_FILTER_SPEED_FEATURES;
+
+typedef struct REAL_TIME_SPEED_FEATURES {
+ // check intra prediction for non-RD mode.
+ int check_intra_pred_nonrd;
+
+ // Skip checking intra prediction.
+ // 0 - don't skip
+ // 1 - skip if TX is skipped and best mode is not NEWMV
+ // 2 - skip if TX is skipped
+ // Skipping aggressiveness increases from level 1 to 2.
+ int skip_intra_pred;
+
+ // Estimate motion before calculating variance in variance-based partition
+ // 0 - Only use zero MV
+ // 1 - perform coarse ME
+ // 2 - perform coarse ME, and also use neighbours' MVs
+ // 3 - use neighbours' MVs without performing coarse ME
+ int estimate_motion_for_var_based_partition;
+
+ // For nonrd_use_partition: mode of extra check of leaf partition
+ // 0 - don't check merge
+ // 1 - always check merge
+ // 2 - check merge and prune checking final split
+ // 3 - check merge and prune checking final split based on bsize and qindex
+ int nonrd_check_partition_merge_mode;
+
+ // For nonrd_use_partition: check of leaf partition extra split
+ int nonrd_check_partition_split;
+
+ // Implements various heuristics to skip searching modes
+ // The heuristics selected are based on flags
+ // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
+ unsigned int mode_search_skip_flags;
+
+ // For nonrd: Reduces ref frame search.
+  // 0 - low level of pruning of the search of non-LAST reference frames
+  // 1 - pruned search of non-LAST reference frames
+  // 2 - more pruned search of non-LAST reference frames
+ int nonrd_prune_ref_frame_search;
+
+ // This flag controls the use of non-RD mode decision.
+ int use_nonrd_pick_mode;
+
+ // Use ALTREF frame in non-RD mode decision.
+ int use_nonrd_altref_frame;
+
+ // Use compound reference for non-RD mode.
+ int use_comp_ref_nonrd;
+
+ // Reference frames for compound prediction for nonrd pickmode:
+ // LAST_GOLDEN (0), LAST_LAST2 (1), or LAST_ALTREF (2).
+ int ref_frame_comp_nonrd[3];
+
+  // Use reduced ref set for real-time mode
+ int use_real_time_ref_set;
+
+ // Skip a number of expensive mode evaluations for blocks with very low
+ // temporal variance.
+ int short_circuit_low_temp_var;
+
+ // Reuse inter prediction in fast non-rd mode.
+ int reuse_inter_pred_nonrd;
+
+ // Number of best inter modes to search transform. INT_MAX - search all.
+ int num_inter_modes_for_tx_search;
+
+ // Use interpolation filter search in non-RD mode decision.
+ int use_nonrd_filter_search;
+
+ // Use simplified RD model for interpolation search and Intra
+ int use_simple_rd_model;
+
+ // For nonrd mode: use hybrid intra mode search for intra only frames based on
+ // block properties.
+ // 0 : use nonrd pick intra for all blocks
+ // 1 : use rd for bsize < 16x16, nonrd otherwise
+ // 2 : use rd for bsize < 16x16 and src var >= 101, nonrd otherwise
+ int hybrid_intra_pickmode;
+
+ // Compute variance/sse on source difference, prior to encoding superblock.
+ int source_metrics_sb_nonrd;
+
+ // Flag to indicate process for handling overshoot on slide/scene change,
+ // for real-time CBR mode.
+ OVERSHOOT_DETECTION_CBR overshoot_detection_cbr;
+
+ // Check for scene/content change detection on every frame before encoding.
+ int check_scene_detection;
+
+  // For nonrd mode: Prefer larger partition blocks in variance based
+  // partitioning. 0: disabled, 1-3: increasing aggressiveness
+ int prefer_large_partition_blocks;
+
+  // Use results of temporal noise estimate
+ int use_temporal_noise_estimate;
+
+ // Parameter indicating initial search window to be used in full-pixel search
+ // for nonrd_pickmode. Range [0, MAX_MVSEARCH_STEPS - 1]. Lower value
+ // indicates larger window. If set to 0, step_param is set based on internal
+ // logic in set_mv_search_params().
+ int fullpel_search_step_param;
+
+ // Bit mask to enable or disable intra modes for each prediction block size
+ // separately, for nonrd_pickmode. Currently, the sf is not respected when
+  // 'force_intra_check' is true in the 'av1_estimate_intra_mode()' function.
+  // Also, H and V pred modes allowed through this sf can be further pruned
+  // when 'prune_hv_pred_modes_using_src_sad' sf is true.
+ int intra_y_mode_bsize_mask_nrd[BLOCK_SIZES];
+
+  // Prune H and V intra prediction mode evaluation in inter frames.
+  // The sf does not have any impact in the following cases:
+ // i. when frame_source_sad is 1.1 times greater than avg_source_sad
+ // ii. when cyclic_refresh_segment_id_boosted is enabled
+ // iii. when SB level source sad is greater than kMedSad
+ // iv. when color sensitivity is non zero for both the chroma channels
+ bool prune_hv_pred_modes_using_src_sad;
+
+ // Skips mode checks more aggressively in nonRD mode
+ int nonrd_aggressive_skip;
+
+  // Skip cdef on 64x64 blocks.
+ // 0: disabled
+ // 1: skip when NEWMV or INTRA is not picked or color sensitivity is off.
+ // When color sensitivity is on for a superblock, all 64x64 blocks within
+ // will not skip.
+  // 2: more aggressive mode where skip is done for all frames where
+  // rc->high_source_sad = 0 (no slide changes), and color sensitivity is off.
+ int skip_cdef_sb;
+
+ // Force selective cdf update.
+ int selective_cdf_update;
+
+ // Force only single reference (LAST) for prediction.
+ int force_only_last_ref;
+
+ // Forces larger partition blocks in variance based partitioning for intra
+ // frames
+ int force_large_partition_blocks_intra;
+
+ // Use fixed partition for superblocks based on source_sad.
+ // 0: disabled
+ // 1: enabled
+ int use_fast_fixed_part;
+
+ // Increase source_sad thresholds in nonrd pickmode.
+ int increase_source_sad_thresh;
+
+ // Skip evaluation of no split in tx size selection for merge partition
+ int skip_tx_no_split_var_based_partition;
+
+  // Intermediate termination of NEWMV mode evaluation based on the so-far
+  // best mode sse
+ int skip_newmv_mode_based_on_sse;
+
+ // Define gf length multiplier.
+ // Level 0: use large multiplier, level 1: use medium multiplier.
+ int gf_length_lvl;
+
+ // Prune inter modes with golden frame as reference for NEARMV and NEWMV modes
+ int prune_inter_modes_with_golden_ref;
+
+ // Prune inter modes w.r.t golden or alt-ref frame based on sad
+ int prune_inter_modes_wrt_gf_arf_based_on_sad;
+
+ // Prune inter mode search in rd path based on current block's temporal
+ // variance wrt LAST reference.
+ int prune_inter_modes_using_temp_var;
+
+  // Reduce MV precision to halfpel for high integer MV values & frame-level
+  // motion
+ // 0: disabled
+ // 1-2: Reduce precision to halfpel, fullpel based on conservative
+ // thresholds, aggressiveness increases with increase in level
+ // 3: Reduce precision to halfpel using more aggressive thresholds
+ int reduce_mv_pel_precision_highmotion;
+
+ // Reduce MV precision for low complexity blocks
+ // 0: disabled
+ // 1: Reduce the mv resolution for zero mv if the variance is low
+ // 2: Switch to halfpel, fullpel based on low block spatial-temporal
+ // complexity.
+ int reduce_mv_pel_precision_lowcomplex;
+
+ // Prune intra mode evaluation in inter frames based on mv range.
+ BLOCK_SIZE prune_intra_mode_based_on_mv_range;
+  // The number of times to left shift the splitting thresholds in variance
+  // based partitioning. The minimum value should be 7 to avoid left shifting
+  // by a negative number.
+ int var_part_split_threshold_shift;
+
+ // Qindex based variance partition threshold index, which determines
+ // the aggressiveness of partition pruning
+ // 0: disabled for speeds 9,10
+ // 1,2: (rd-path) lowers qindex thresholds conditionally (for low SAD sb)
+ // 3,4: (non-rd path) uses pre-tuned qindex thresholds
+ int var_part_based_on_qidx;
+
+ // Enable GF refresh based on Q value.
+ int gf_refresh_based_on_qp;
+
+ // Temporal filtering
+ // The value can be 1 or 2, which indicates the threshold to use.
+ // Must be off for lossless mode.
+ int use_rtc_tf;
+
+ // Prune the use of the identity transform in nonrd_pickmode,
+ // used for screen content mode: only for smaller blocks
+ // and higher spatial variance, and when skip_txfm is not
+ // already set.
+ int prune_idtx_nonrd;
+
+  // Prune the use of palette mode in nonrd pickmode.
+ int prune_palette_nonrd;
+
+ // Force to only use dct for palette search in nonrd pickmode.
+ int dct_only_palette_nonrd;
+
+  // Skip loopfilter for static content, after a slide change
+  // or key frame, once quality has ramped up.
+  // 0: disabled
+  // 1: skip only after quality is ramped up.
+  // 2: aggressive mode, where skip is done for all frames where
+  // rc->high_source_sad = 0 (no slide changes).
+ int skip_lf_screen;
+
+ // For nonrd: early exit out of variance partition that sets the
+ // block size to superblock size, and sets mode to zeromv-last skip.
+ // 0: disabled
+ // 1: zeromv-skip is enabled at SB level only
+ // 2: zeromv-skip is enabled at SB level and coding block level
+ int part_early_exit_zeromv;
+
+ // Early terminate inter mode search based on sse in non-rd path.
+ INTER_SEARCH_EARLY_TERM_IDX sse_early_term_inter_search;
+
+ // SAD based adaptive altref selection
+ int sad_based_adp_altref_lag;
+
+ // Enable/disable partition direct merging.
+ int partition_direct_merging;
+
+ // Level of aggressiveness for obtaining tx size based on qstep
+ int tx_size_level_based_on_qstep;
+
+ // Avoid the partitioning of a 16x16 block in variance based partitioning
+ // (VBP) by making use of minimum and maximum sub-block variances.
+ // For allintra encode, this speed feature reduces instruction count by 5.39%
+ // for speed 9 on a typical video dataset with coding performance gain
+ // of 1.44%.
+ // For AVIF image encode, this speed feature reduces encode time
+ // by 8.44% for speed 9 on a typical image dataset with coding performance
+ // gain of 0.78%.
+ bool vbp_prune_16x16_split_using_min_max_sub_blk_var;
+
+ // A qindex threshold that determines whether to use qindex based CDEF filter
+ // strength estimation for screen content types. The strength estimation model
+ // used for screen contents prefers to allow cdef filtering for more frames.
+  // This sf is used to limit the frames which go through cdef filtering; the
+  // following explains its settings.
+  // MAXQ (255): Disables the usage of this sf. Frames do not use the screen
+  // content model, which reduces the number of frames that go through cdef
+  // filtering.
+  // MINQ (0): Frames always use the screen content model, which increases the
+  // number of frames that go through cdef filtering.
+  // This speed feature gives a substantial gain on coding metrics, with a
+  // moderate increase in encoding time. Select the threshold based on the
+  // speed vs quality trade-off.
+ int screen_content_cdef_filter_qindex_thresh;
+
+ // Prune compound mode if its variance is higher than the variance of single
+ // modes.
+ bool prune_compoundmode_with_singlecompound_var;
+
+  // Allow mode cost update at frame level every couple of frames. This
+ // overrides the command line setting --mode-cost-upd-freq=3 (never update
+ // except on key frame and first delta).
+ bool frame_level_mode_cost_update;
+
+ // Prune H_PRED during intra mode evaluation in the nonrd path based on best
+ // mode so far.
+ //
+ // For allintra encode, this speed feature reduces instruction count by 1.10%
+ // for speed 9 with coding performance change less than 0.04%.
+ // For AVIF image encode, this speed feature reduces encode time by 1.03% for
+ // speed 9 on a typical image dataset with coding performance change less than
+ // 0.08%.
+ bool prune_h_pred_using_best_mode_so_far;
+
+ // Enable pruning of intra mode evaluations in nonrd path based on source
+ // variance and best mode so far. The pruning logic is enabled only if the
+ // mode is not a winner mode of both the neighboring blocks (left/top).
+ //
+ // For allintra encode, this speed feature reduces instruction count by 3.96%
+ // for speed 9 with coding performance change less than 0.38%.
+ // For AVIF image encode, this speed feature reduces encode time by 3.46% for
+ // speed 9 on a typical image dataset with coding performance change less than
+ // -0.06%.
+ bool enable_intra_mode_pruning_using_neighbors;
+
+ // Prune intra mode evaluations in nonrd path based on best sad so far.
+ //
+ // For allintra encode, this speed feature reduces instruction count by 3.05%
+ // for speed 9 with coding performance change less than 0.24%.
+ // For AVIF image encode, this speed feature reduces encode time by 1.87% for
+ // speed 9 on a typical image dataset with coding performance change less than
+ // 0.16%.
+ bool prune_intra_mode_using_best_sad_so_far;
+
+  // If compound is enabled, and the current block size is >= BLOCK_16X16,
+ // limit the compound modes to GLOBAL_GLOBALMV. This does not apply to the
+ // base layer of svc.
+ bool check_only_zero_zeromv_on_large_blocks;
+
+  // Allow for disabling cdf update for non-reference frames in svc mode.
+ bool disable_cdf_update_non_reference_frame;
+
+  // Prune compound modes if the single mode variances do not perform well.
+ bool prune_compoundmode_with_singlemode_var;
+
+ // Skip searching all compound mode if the variance of single_mode residue is
+ // sufficiently low.
+ bool skip_compound_based_on_var;
+
+  // Sets force_zeromv_skip based on the available source sad. Aggressiveness
+  // increases with the level set for this speed feature.
+ // 0: No setting
+ // 1: If source sad is kZeroSad
+ // 2: If source sad <= kVeryLowSad
+ int set_zeromv_skip_based_on_source_sad;
+
+  // Downgrades the block-level subpel motion search to
+  // av1_find_best_sub_pixel_tree_pruned_more for higher QP and when fullpel
+  // search performed well, zeromv has low sad, or source_var is low
+ bool use_adaptive_subpel_search;
+
+ // A flag used in RTC case to control frame_refs_short_signaling. Note that
+ // the final decision is made in check_frame_refs_short_signaling(). The flag
+ // can only be turned on when res < 360p and speed >= 9, in which case only
+ // LAST and GOLDEN ref frames are used now.
+ bool enable_ref_short_signaling;
+
+ // A flag that controls if we check or bypass GLOBALMV in rtc single ref frame
+ // case.
+ bool check_globalmv_on_single_ref;
+
+ // Allows for increasing the color_threshold for palette prediction.
+ // This generally leads to better coding efficiency but with some speed loss.
+ // Only used for screen content and for nonrd_pickmode.
+ bool increase_color_thresh_palette;
+} REAL_TIME_SPEED_FEATURES;
+
+/*!\endcond */
+
+/*!
+ * \brief Top level speed vs quality trade off data structure.
+ */
+typedef struct SPEED_FEATURES {
+ /*!
+ * Sequence/frame level speed features:
+ */
+ HIGH_LEVEL_SPEED_FEATURES hl_sf;
+
+ /*!
+ * Speed features for the first pass.
+ */
+ FIRST_PASS_SPEED_FEATURES fp_sf;
+
+ /*!
+ * Speed features related to how tpl's searches are done.
+ */
+ TPL_SPEED_FEATURES tpl_sf;
+
+ /*!
+ * Global motion speed features:
+ */
+ GLOBAL_MOTION_SPEED_FEATURES gm_sf;
+
+ /*!
+ * Partition search speed features:
+ */
+ PARTITION_SPEED_FEATURES part_sf;
+
+ /*!
+ * Motion search speed features:
+ */
+ MV_SPEED_FEATURES mv_sf;
+
+ /*!
+ * Inter mode search speed features:
+ */
+ INTER_MODE_SPEED_FEATURES inter_sf;
+
+ /*!
+ * Interpolation filter search speed features:
+ */
+ INTERP_FILTER_SPEED_FEATURES interp_sf;
+
+ /*!
+ * Intra mode search speed features:
+ */
+ INTRA_MODE_SPEED_FEATURES intra_sf;
+
+ /*!
+ * Transform size/type search speed features:
+ */
+ TX_SPEED_FEATURES tx_sf;
+
+ /*!
+ * RD calculation speed features:
+ */
+ RD_CALC_SPEED_FEATURES rd_sf;
+
+ /*!
+ * Two-pass mode evaluation features:
+ */
+ WINNER_MODE_SPEED_FEATURES winner_mode_sf;
+
+ /*!
+ * In-loop filter speed features:
+ */
+ LOOP_FILTER_SPEED_FEATURES lpf_sf;
+
+ /*!
+ * Real-time mode speed features:
+ */
+ REAL_TIME_SPEED_FEATURES rt_sf;
+} SPEED_FEATURES;
+/*!\cond */
+
+struct AV1_COMP;
+
+/*!\endcond */
+/*!\brief Frame size independent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in] cpi Top-level encoder instance structure
+ * \param[in] speed Speed setting passed in from the command line
+ *
+ * \remark No return value but configures the various speed trade off flags
+ * based on the passed in speed setting. (Higher speed gives lower
+ * quality)
+ */
+void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi,
+ int speed);
+
+/*!\brief Frame size dependent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in] cpi Top-level encoder instance structure
+ * \param[in] speed Speed setting passed in from the command line
+ *
+ * \remark No return value but configures the various speed trade off flags
+ * based on the passed in speed setting and frame size. (Higher speed
+ * corresponds to lower quality)
+ */
+void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi,
+ int speed);
+/*!\brief Q index dependent speed vs quality trade off flags
+ *
+ *\ingroup speed_features
+ *
+ * \param[in] cpi Top-level encoder instance structure
+ * \param[in] speed Speed setting passed in from the command line
+ *
+ * \remark No return value but configures the various speed trade off flags
+ * based on the passed in speed setting and current frame's Q index.
+ * (Higher speed corresponds to lower quality)
+ */
+void av1_set_speed_features_qindex_dependent(struct AV1_COMP *cpi, int speed);
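+
+// A typical call order, sketched for illustration only (the exact call sites
+// are in the encoder's configuration and per-frame encode paths):
+//   av1_set_speed_features_framesize_independent(cpi, speed);
+//   av1_set_speed_features_framesize_dependent(cpi, speed);
+//   // ... after the frame's q index has been chosen ...
+//   av1_set_speed_features_qindex_dependent(cpi, speed);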
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SPEED_FEATURES_H_
diff --git a/third_party/aom/av1/encoder/superres_scale.c b/third_party/aom/av1/encoder/superres_scale.c
new file mode 100644
index 0000000000..3b47909b15
--- /dev/null
+++ b/third_party/aom/av1/encoder/superres_scale.c
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encoder_alloc.h"
+#include "av1/encoder/superres_scale.h"
+#include "av1/encoder/random.h"
+
+// Compute the horizontal frequency components' energy in a frame
+// by calculating the 16x4 horizontal DCT. This is used to
+// decide the superresolution parameters.
+static void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
+ uint64_t freq_energy[16] = { 0 };
+ const YV12_BUFFER_CONFIG *buf = cpi->source;
+ const int bd = cpi->td.mb.e_mbd.bd;
+ const int width = buf->y_crop_width;
+ const int height = buf->y_crop_height;
+ DECLARE_ALIGNED(16, int32_t, coeff[16 * 4]);
+ int n = 0;
+ if (buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const int16_t *src16 = (const int16_t *)CONVERT_TO_SHORTPTR(buf->y_buffer);
+ for (int i = 0; i < height - 4; i += 4) {
+ for (int j = 0; j < width - 16; j += 16) {
+ av1_fwd_txfm2d_16x4(src16 + i * buf->y_stride + j, coeff, buf->y_stride,
+ H_DCT, bd);
+ for (int k = 1; k < 16; ++k) {
+ const uint64_t this_energy =
+ ((int64_t)coeff[k] * coeff[k]) +
+ ((int64_t)coeff[k + 16] * coeff[k + 16]) +
+ ((int64_t)coeff[k + 32] * coeff[k + 32]) +
+ ((int64_t)coeff[k + 48] * coeff[k + 48]);
+ freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8));
+ }
+ n++;
+ }
+ }
+ } else {
+ assert(bd == 8);
+ DECLARE_ALIGNED(16, int16_t, src16[16 * 4]);
+ for (int i = 0; i < height - 4; i += 4) {
+ for (int j = 0; j < width - 16; j += 16) {
+ for (int ii = 0; ii < 4; ++ii)
+ for (int jj = 0; jj < 16; ++jj)
+ src16[ii * 16 + jj] =
+ buf->y_buffer[(i + ii) * buf->y_stride + (j + jj)];
+ av1_fwd_txfm2d_16x4(src16, coeff, 16, H_DCT, bd);
+ for (int k = 1; k < 16; ++k) {
+ const uint64_t this_energy =
+ ((int64_t)coeff[k] * coeff[k]) +
+ ((int64_t)coeff[k + 16] * coeff[k + 16]) +
+ ((int64_t)coeff[k + 32] * coeff[k + 32]) +
+ ((int64_t)coeff[k + 48] * coeff[k + 48]);
+ freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2);
+ }
+ n++;
+ }
+ }
+ }
+ if (n) {
+ for (int k = 1; k < 16; ++k) energy[k] = (double)freq_energy[k] / n;
+ // Convert to cumulative energy
+ for (int k = 14; k > 0; --k) energy[k] += energy[k + 1];
+ } else {
+ for (int k = 1; k < 16; ++k) energy[k] = 1e+20;
+ }
+}
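+
+// Note: after the cumulative conversion above, energy[k] holds the average
+// energy summed over horizontal frequency bins k..15, so energy[1] is the
+// total AC energy of the analyzed 16x4 blocks and energy[15] is the energy
+// in the highest-frequency bin alone.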
+
+static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) {
+  // Arbitrary fixed seed for the pseudo-random scale selection.
+ static unsigned int seed = 56789;
+ const ResizeCfg *resize_cfg = &cpi->oxcf.resize_cfg;
+ if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
+ uint8_t new_denom = SCALE_NUMERATOR;
+
+ if (cpi->common.seq_params->reduced_still_picture_hdr) return SCALE_NUMERATOR;
+ switch (resize_cfg->resize_mode) {
+ case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break;
+ case RESIZE_FIXED:
+ if (cpi->common.current_frame.frame_type == KEY_FRAME)
+ new_denom = resize_cfg->resize_kf_scale_denominator;
+ else
+ new_denom = resize_cfg->resize_scale_denominator;
+ break;
+ case RESIZE_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
+ default: assert(0);
+ }
+ return new_denom;
+}
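+
+// For RESIZE_RANDOM above, lcg_rand16(&seed) % 9 + 8 yields a denominator in
+// [8, 16]; with SCALE_NUMERATOR == 8 that corresponds to scale factors from
+// 8/8 (no resize) down to 8/16 (half width and height).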
+
+int av1_superres_in_recode_allowed(const AV1_COMP *const cpi) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ // Empirically found to not be beneficial for image coding.
+ return oxcf->superres_cfg.superres_mode == AOM_SUPERRES_AUTO &&
+ cpi->sf.hl_sf.superres_auto_search_type != SUPERRES_AUTO_SOLO &&
+ cpi->rc.frames_to_key > 1;
+}
+
+#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO 0.012
+#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME 0.008
+#define SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME 0.008
+#define SUPERRES_ENERGY_BY_AC_THRESH 0.2
+
+static double get_energy_by_q2_thresh(const GF_GROUP *gf_group,
+ const RATE_CONTROL *rc,
+ int gf_frame_index) {
+ // TODO(now): Return keyframe thresh * factor based on frame type / pyramid
+ // level.
+ if (gf_group->update_type[gf_frame_index] == ARF_UPDATE) {
+ return SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME;
+ } else if (gf_group->update_type[gf_frame_index] == KF_UPDATE) {
+ if (rc->frames_to_key <= 1)
+ return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO;
+ else
+ return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME;
+ } else {
+ assert(0);
+ }
+ return 0;
+}
+
+static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy,
+ double threshq,
+ double threshp) {
+ const double q = av1_convert_qindex_to_q(qindex, AOM_BITS_8);
+ const double tq = threshq * q * q;
+ const double tp = threshp * energy[1];
+ const double thresh = AOMMIN(tq, tp);
+ int k;
+ for (k = SCALE_NUMERATOR * 2; k > SCALE_NUMERATOR; --k) {
+ if (energy[k - 1] > thresh) break;
+ }
+ return 3 * SCALE_NUMERATOR - k;
+}
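+
+// Worked example: with SCALE_NUMERATOR == 8 the loop above scans k = 16 down
+// to 9 and the function returns 3 * 8 - k, i.e. a denominator in [8, 16]. If
+// the cumulative energy already exceeds the threshold at k == 16 the result
+// is 8 (no superres); if it never does, the loop exits at k == 8 and the
+// result is 16 (strongest scaling).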
+
+static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex,
+ int sr_kf, int sr_arf) {
+ // Use superres for Key-frames and Alt-ref frames only.
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ if (gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE &&
+ gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE) {
+ return SCALE_NUMERATOR;
+ }
+ if (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE && !sr_kf) {
+ return SCALE_NUMERATOR;
+ }
+ if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE && !sr_arf) {
+ return SCALE_NUMERATOR;
+ }
+
+ double energy[16];
+ analyze_hor_freq(cpi, energy);
+
+ const double energy_by_q2_thresh =
+ get_energy_by_q2_thresh(gf_group, &cpi->rc, cpi->gf_frame_index);
+ int denom = get_superres_denom_from_qindex_energy(
+ qindex, energy, energy_by_q2_thresh, SUPERRES_ENERGY_BY_AC_THRESH);
+ /*
+ printf("\nenergy = [");
+ for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]);
+ printf("]\n");
+ printf("boost = %d\n",
+ (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE)
+ ? cpi->ppi->p_rc.kf_boost
+ : cpi->rc.gfu_boost);
+ printf("denom = %d\n", denom);
+ */
+ if (av1_superres_in_recode_allowed(cpi)) {
+ assert(cpi->superres_mode != AOM_SUPERRES_NONE);
+ // Force superres to be tried in the recode loop, as full-res is also going
+ // to be tried anyway.
+ denom = AOMMAX(denom, SCALE_NUMERATOR + 1);
+ }
+ return denom;
+}
+
+static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
+ // Choose an arbitrary random number
+ static unsigned int seed = 34567;
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ const SuperResCfg *const superres_cfg = &oxcf->superres_cfg;
+ const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+ const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+ if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
+ uint8_t new_denom = SCALE_NUMERATOR;
+
+ // Make sure that superres mode of the frame is consistent with the
+ // sequence-level flag.
+ assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_NONE,
+ cpi->common.seq_params->enable_superres));
+ assert(IMPLIES(!cpi->common.seq_params->enable_superres,
+ superres_cfg->superres_mode == AOM_SUPERRES_NONE));
+ // Make sure that superres mode for current encoding is consistent with user
+ // provided superres mode.
+ assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_AUTO,
+ cpi->superres_mode == superres_cfg->superres_mode));
+
+ // Note: we must look at the current superres_mode to be tried in 'cpi' here,
+ // not the user given mode in 'oxcf'.
+ switch (cpi->superres_mode) {
+ case AOM_SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break;
+ case AOM_SUPERRES_FIXED:
+ if (cpi->common.current_frame.frame_type == KEY_FRAME)
+ new_denom = superres_cfg->superres_kf_scale_denominator;
+ else
+ new_denom = superres_cfg->superres_scale_denominator;
+ break;
+ case AOM_SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
+ case AOM_SUPERRES_QTHRESH: {
+ // Do not use superres when screen content tools are used.
+ if (cpi->common.features.allow_screen_content_tools) break;
+ if (rc_cfg->mode == AOM_VBR || rc_cfg->mode == AOM_CQ)
+ av1_set_target_rate(cpi, frm_dim_cfg->width, frm_dim_cfg->height);
+
+ // Now decide the use of superres based on 'q'.
+ int bottom_index, top_index;
+ const int q = av1_rc_pick_q_and_bounds(
+ cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index,
+ &bottom_index, &top_index);
+
+ const int qthresh = (frame_is_intra_only(&cpi->common))
+ ? superres_cfg->superres_kf_qthresh
+ : superres_cfg->superres_qthresh;
+ if (q <= qthresh) {
+ new_denom = SCALE_NUMERATOR;
+ } else {
+ new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1);
+ }
+ break;
+ }
+ case AOM_SUPERRES_AUTO: {
+ if (cpi->common.features.allow_screen_content_tools) break;
+ if (rc_cfg->mode == AOM_VBR || rc_cfg->mode == AOM_CQ)
+ av1_set_target_rate(cpi, frm_dim_cfg->width, frm_dim_cfg->height);
+
+ // Now decide the use of superres based on 'q'.
+ int bottom_index, top_index;
+ const int q = av1_rc_pick_q_and_bounds(
+ cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index,
+ &bottom_index, &top_index);
+
+ const SUPERRES_AUTO_SEARCH_TYPE sr_search_type =
+ cpi->sf.hl_sf.superres_auto_search_type;
+ const int qthresh = (sr_search_type == SUPERRES_AUTO_SOLO) ? 128 : 0;
+ if (q <= qthresh) {
+ new_denom = SCALE_NUMERATOR; // Don't use superres.
+ } else {
+ if (sr_search_type == SUPERRES_AUTO_ALL) {
+ if (cpi->common.current_frame.frame_type == KEY_FRAME)
+ new_denom = superres_cfg->superres_kf_scale_denominator;
+ else
+ new_denom = superres_cfg->superres_scale_denominator;
+ } else {
+ new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1);
+ }
+ }
+ break;
+ }
+ default: assert(0);
+ }
+ return new_denom;
+}
+
+static int dimension_is_ok(int orig_dim, int resized_dim, int denom) {
+ return (resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2);
+}
+
+static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) {
+ // Only need to check the width, as scaling is horizontal only.
+ (void)oheight;
+ return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom);
+}
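+
+// The checks above enforce that the total downscaling from resize plus
+// superres never exceeds a factor of two: with SCALE_NUMERATOR == 8, a
+// superres denom of 8 (no superres) allows a resized width as small as half
+// the original, while the maximum denom of 16 leaves no room for resizing.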
+
+static int validate_size_scales(RESIZE_MODE resize_mode,
+ aom_superres_mode superres_mode, int owidth,
+ int oheight, size_params_type *rsz) {
+ if (dimensions_are_ok(owidth, oheight, rsz)) { // Nothing to do.
+ return 1;
+ }
+
+ // Calculate current resize scale.
+ int resize_denom =
+ AOMMAX(DIVIDE_AND_ROUND(owidth * SCALE_NUMERATOR, rsz->resize_width),
+ DIVIDE_AND_ROUND(oheight * SCALE_NUMERATOR, rsz->resize_height));
+
+ if (resize_mode != RESIZE_RANDOM && superres_mode == AOM_SUPERRES_RANDOM) {
+ // Alter superres scale as needed to enforce conformity.
+ rsz->superres_denom =
+ (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / resize_denom;
+ if (!dimensions_are_ok(owidth, oheight, rsz)) {
+ if (rsz->superres_denom > SCALE_NUMERATOR) --rsz->superres_denom;
+ }
+ } else if (resize_mode == RESIZE_RANDOM &&
+ superres_mode != AOM_SUPERRES_RANDOM) {
+ // Alter resize scale as needed to enforce conformity.
+ resize_denom =
+ (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / rsz->superres_denom;
+ rsz->resize_width = owidth;
+ rsz->resize_height = oheight;
+ av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+ resize_denom);
+ if (!dimensions_are_ok(owidth, oheight, rsz)) {
+ if (resize_denom > SCALE_NUMERATOR) {
+ --resize_denom;
+ rsz->resize_width = owidth;
+ rsz->resize_height = oheight;
+ av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+ resize_denom);
+ }
+ }
+ } else if (resize_mode == RESIZE_RANDOM &&
+ superres_mode == AOM_SUPERRES_RANDOM) {
+ // Alter both resize and superres scales as needed to enforce conformity.
+ do {
+ if (resize_denom > rsz->superres_denom)
+ --resize_denom;
+ else
+ --rsz->superres_denom;
+ rsz->resize_width = owidth;
+ rsz->resize_height = oheight;
+ av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+ resize_denom);
+ } while (!dimensions_are_ok(owidth, oheight, rsz) &&
+ (resize_denom > SCALE_NUMERATOR ||
+ rsz->superres_denom > SCALE_NUMERATOR));
+ } else { // We are allowed to alter neither resize scale nor superres
+ // scale.
+ return 0;
+ }
+ return dimensions_are_ok(owidth, oheight, rsz);
+}
+
+// Calculates resize and superres params for next frame
+static size_params_type calculate_next_size_params(AV1_COMP *cpi) {
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ ResizePendingParams *resize_pending_params = &cpi->resize_pending_params;
+ const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
+ size_params_type rsz = { frm_dim_cfg->width, frm_dim_cfg->height,
+ SCALE_NUMERATOR };
+ int resize_denom = SCALE_NUMERATOR;
+ if (has_no_stats_stage(cpi) && cpi->ppi->use_svc &&
+ (cpi->common.width != cpi->oxcf.frm_dim_cfg.width ||
+ cpi->common.height != cpi->oxcf.frm_dim_cfg.height)) {
+ rsz.resize_width = cpi->common.width;
+ rsz.resize_height = cpi->common.height;
+ return rsz;
+ }
+ if (is_stat_generation_stage(cpi)) return rsz;
+ if (resize_pending_params->width && resize_pending_params->height) {
+ rsz.resize_width = resize_pending_params->width;
+ rsz.resize_height = resize_pending_params->height;
+ resize_pending_params->width = resize_pending_params->height = 0;
+ if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE) return rsz;
+ } else {
+ resize_denom = calculate_next_resize_scale(cpi);
+ rsz.resize_width = frm_dim_cfg->width;
+ rsz.resize_height = frm_dim_cfg->height;
+ av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height,
+ resize_denom);
+ }
+ rsz.superres_denom = calculate_next_superres_scale(cpi);
+ if (!validate_size_scales(oxcf->resize_cfg.resize_mode, cpi->superres_mode,
+ frm_dim_cfg->width, frm_dim_cfg->height, &rsz))
+ assert(0 && "Invalid scale parameters");
+ return rsz;
+}
+
+static void setup_frame_size_from_params(AV1_COMP *cpi,
+ const size_params_type *rsz) {
+ int encode_width = rsz->resize_width;
+ int encode_height = rsz->resize_height;
+
+ AV1_COMMON *cm = &cpi->common;
+ cm->superres_upscaled_width = encode_width;
+ cm->superres_upscaled_height = encode_height;
+ cm->superres_scale_denominator = rsz->superres_denom;
+ av1_calculate_scaled_superres_size(&encode_width, &encode_height,
+ rsz->superres_denom);
+ av1_set_frame_size(cpi, encode_width, encode_height);
+}
+
+void av1_setup_frame_size(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ // Reset superres params from previous frame.
+ cm->superres_scale_denominator = SCALE_NUMERATOR;
+ const size_params_type rsz = calculate_next_size_params(cpi);
+ setup_frame_size_from_params(cpi, &rsz);
+
+ assert(av1_is_min_tile_width_satisfied(cm));
+}
+
+void av1_superres_post_encode(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+
+ assert(cpi->oxcf.superres_cfg.enable_superres);
+ assert(!is_lossless_requested(&cpi->oxcf.rc_cfg));
+ assert(!cm->features.all_lossless);
+
+ av1_superres_upscale(cm, NULL, cpi->image_pyramid_levels);
+
+  // If regular resizing is occurring, the source will need to be downscaled
+  // to match the upscaled superres resolution. Otherwise, the original source
+  // is used.
+ if (!av1_resize_scaled(cm)) {
+ cpi->source = cpi->unscaled_source;
+ if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source;
+ } else {
+ assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width);
+ assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height);
+ // Do downscale. cm->(width|height) has been updated by
+ // av1_superres_upscale
+ cpi->source = realloc_and_scale_source(cpi, cm->superres_upscaled_width,
+ cm->superres_upscaled_height);
+ }
+}
diff --git a/third_party/aom/av1/encoder/superres_scale.h b/third_party/aom/av1/encoder/superres_scale.h
new file mode 100644
index 0000000000..450a4ed902
--- /dev/null
+++ b/third_party/aom/av1/encoder/superres_scale.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SUPERRES_SCALE_H_
+#define AOM_AV1_ENCODER_SUPERRES_SCALE_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int av1_superres_in_recode_allowed(const AV1_COMP *const cpi);
+void av1_superres_post_encode(AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SUPERRES_SCALE_H_
diff --git a/third_party/aom/av1/encoder/svc_layercontext.c b/third_party/aom/av1/encoder/svc_layercontext.c
new file mode 100644
index 0000000000..2c99cb89b8
--- /dev/null
+++ b/third_party/aom/av1/encoder/svc_layercontext.c
@@ -0,0 +1,701 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encoder_alloc.h"
+
+static void swap_ptr(void *a, void *b) {
+ void **a_p = (void **)a;
+ void **b_p = (void **)b;
+ void *c = *a_p;
+ *a_p = *b_p;
+ *b_p = c;
+}
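+
+// Layer contexts are stored in a flat array indexed as
+// sl * number_temporal_layers + tl (see LAYER_IDS_TO_IDX). For example, with
+// 3 spatial and 3 temporal layers there are 9 contexts, and (sl = 1, tl = 2)
+// maps to index 5.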
+
+void av1_init_layer_context(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ SVC *const svc = &cpi->svc;
+ int mi_rows = cpi->common.mi_params.mi_rows;
+ int mi_cols = cpi->common.mi_params.mi_cols;
+ svc->base_framerate = 30.0;
+ svc->current_superframe = 0;
+ svc->force_zero_mode_spatial_ref = 1;
+ svc->num_encoded_top_layer = 0;
+ svc->use_flexible_mode = 0;
+ svc->has_lower_quality_layer = 0;
+
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ RATE_CONTROL *const lrc = &lc->rc;
+ PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
+ lrc->ni_av_qi = oxcf->rc_cfg.worst_allowed_q;
+ lp_rc->total_actual_bits = 0;
+ lrc->ni_tot_qi = 0;
+ lp_rc->tot_q = 0.0;
+ lp_rc->avg_q = 0.0;
+ lp_rc->ni_frames = 0;
+ lrc->decimation_count = 0;
+ lrc->decimation_factor = 0;
+ lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q);
+ lrc->best_quality = av1_quantizer_to_qindex(lc->min_q);
+ lrc->rtc_external_ratectrl = 0;
+ for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+ lp_rc->rate_correction_factors[i] = 1.0;
+ }
+ lc->target_bandwidth = lc->layer_target_bitrate;
+ lp_rc->last_q[INTER_FRAME] = lrc->worst_quality;
+ lp_rc->avg_frame_qindex[INTER_FRAME] = lrc->worst_quality;
+ lp_rc->avg_frame_qindex[KEY_FRAME] = lrc->worst_quality;
+ lp_rc->buffer_level =
+ oxcf->rc_cfg.starting_buffer_level_ms * lc->target_bandwidth / 1000;
+ lp_rc->bits_off_target = lp_rc->buffer_level;
+      // Initialize the cyclic refresh parameters. If spatial layers are used
+      // (i.e., number_spatial_layers > 1), these need to be updated per
+      // spatial layer. Cyclic refresh is only applied on base temporal layer.
+ if (svc->number_spatial_layers > 1 && tl == 0) {
+ lc->sb_index = 0;
+ lc->actual_num_seg1_blocks = 0;
+ lc->actual_num_seg2_blocks = 0;
+ lc->counter_encode_maxq_scene_change = 0;
+ aom_free(lc->map);
+ CHECK_MEM_ERROR(cm, lc->map,
+ aom_calloc(mi_rows * mi_cols, sizeof(*lc->map)));
+ }
+ }
+ svc->downsample_filter_type[sl] = BILINEAR;
+ svc->downsample_filter_phase[sl] = 8;
+ svc->last_layer_dropped[sl] = false;
+ svc->drop_spatial_layer[sl] = false;
+ }
+ if (svc->number_spatial_layers == 3) {
+ svc->downsample_filter_type[0] = EIGHTTAP_SMOOTH;
+ }
+}
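+
+// For illustration: the buffer_level initialization above converts the
+// configured starting level from milliseconds to bits, so a 600 ms starting
+// level at a 1 Mbps layer target gives 600 * 1000000 / 1000 = 600000 bits.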
+
+bool av1_alloc_layer_context(AV1_COMP *cpi, int num_layers) {
+ SVC *const svc = &cpi->svc;
+ if (svc->layer_context == NULL || svc->num_allocated_layers < num_layers) {
+ assert(num_layers > 1);
+ aom_free(svc->layer_context);
+ svc->num_allocated_layers = 0;
+ svc->layer_context =
+ (LAYER_CONTEXT *)aom_calloc(num_layers, sizeof(*svc->layer_context));
+ if (svc->layer_context == NULL) return false;
+ svc->num_allocated_layers = num_layers;
+ }
+ return true;
+}
+
+// Update the layer context from a change_config() call.
+void av1_update_layer_context_change_config(AV1_COMP *const cpi,
+ const int64_t target_bandwidth) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+ AV1_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+ int layer = 0;
+ int64_t spatial_layer_target = 0;
+ float bitrate_alloc = 1.0;
+ const int mi_rows = cm->mi_params.mi_rows;
+ const int mi_cols = cm->mi_params.mi_cols;
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ svc->layer_context[layer].target_bandwidth = lc->layer_target_bitrate;
+ }
+ spatial_layer_target = svc->layer_context[layer].target_bandwidth;
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ LAYER_CONTEXT *const lc =
+ &svc->layer_context[sl * svc->number_temporal_layers + tl];
+ RATE_CONTROL *const lrc = &lc->rc;
+ PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
+ lc->spatial_layer_target_bandwidth = spatial_layer_target;
+ if (target_bandwidth != 0) {
+ bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+ }
+ lp_rc->starting_buffer_level =
+ (int64_t)(p_rc->starting_buffer_level * bitrate_alloc);
+ lp_rc->optimal_buffer_level =
+ (int64_t)(p_rc->optimal_buffer_level * bitrate_alloc);
+ lp_rc->maximum_buffer_size =
+ (int64_t)(p_rc->maximum_buffer_size * bitrate_alloc);
+ lp_rc->bits_off_target =
+ AOMMIN(lp_rc->bits_off_target, lp_rc->maximum_buffer_size);
+ lp_rc->buffer_level =
+ AOMMIN(lp_rc->buffer_level, lp_rc->maximum_buffer_size);
+ lc->framerate = cpi->framerate / lc->framerate_factor;
+ lrc->avg_frame_bandwidth =
+ (int)round(lc->target_bandwidth / lc->framerate);
+ lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
+ lrc->rtc_external_ratectrl = rc->rtc_external_ratectrl;
+ lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q);
+ lrc->best_quality = av1_quantizer_to_qindex(lc->min_q);
+ if (rc->use_external_qp_one_pass) {
+ lrc->worst_quality = rc->worst_quality;
+ lrc->best_quality = rc->best_quality;
+ }
+      // Reset the cyclic refresh parameters if needed, i.e., if the map is
+      // NULL or the number of spatial layers has changed.
+      // Cyclic refresh is only applied on base temporal layer.
+ if (svc->number_spatial_layers > 1 && tl == 0 &&
+ (lc->map == NULL ||
+ svc->prev_number_spatial_layers != svc->number_spatial_layers)) {
+ lc->sb_index = 0;
+ lc->actual_num_seg1_blocks = 0;
+ lc->actual_num_seg2_blocks = 0;
+ lc->counter_encode_maxq_scene_change = 0;
+ aom_free(lc->map);
+ CHECK_MEM_ERROR(cm, lc->map,
+ aom_calloc(mi_rows * mi_cols, sizeof(*lc->map)));
+ }
+ }
+ }
+}
+
+/*!\brief Return layer context for current layer.
+ *
+ * \ingroup rate_control
+ * \param[in] cpi Top level encoder structure
+ *
+ * \return LAYER_CONTEXT for current layer.
+ */
+static LAYER_CONTEXT *get_layer_context(AV1_COMP *const cpi) {
+ return &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+ cpi->svc.number_temporal_layers +
+ cpi->svc.temporal_layer_id];
+}
+
+void av1_update_temporal_layer_framerate(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
+ RATE_CONTROL *const lrc = &lc->rc;
+ const int tl = svc->temporal_layer_id;
+ lc->framerate = cpi->framerate / lc->framerate_factor;
+ lrc->avg_frame_bandwidth = (int)round(lc->target_bandwidth / lc->framerate);
+ lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
+ // Update the average layer frame size (non-cumulative per-frame-bw).
+ if (tl == 0) {
+ lc->avg_frame_size = lrc->avg_frame_bandwidth;
+ } else {
+ int prev_layer = svc->spatial_layer_id * svc->number_temporal_layers +
+ svc->temporal_layer_id - 1;
+ LAYER_CONTEXT *const lcprev = &svc->layer_context[prev_layer];
+ const double prev_layer_framerate =
+ cpi->framerate / lcprev->framerate_factor;
+ const int64_t prev_layer_target_bandwidth = lcprev->layer_target_bitrate;
+ lc->avg_frame_size =
+ (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) /
+ (lc->framerate - prev_layer_framerate));
+ }
+}
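+
+// Illustrative numbers for the enhancement-layer branch above: if TL0 targets
+// 200 kbps at 15 fps and TL0+TL1 together target 300 kbps at 30 fps, the TL1
+// per-frame budget is (300000 - 200000) / (30 - 15) ~= 6667 bits per frame.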
+
+static AOM_INLINE bool check_ref_is_low_spatial_res_super_frame(
+ int ref_frame, const SVC *svc, const RTC_REF *rtc_ref) {
+ int ref_frame_idx = rtc_ref->ref_idx[ref_frame - 1];
+ return rtc_ref->buffer_time_index[ref_frame_idx] == svc->current_superframe &&
+ rtc_ref->buffer_spatial_layer[ref_frame_idx] <=
+ svc->spatial_layer_id - 1;
+}
+
+void av1_restore_layer_context(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ const AV1_COMMON *const cm = &cpi->common;
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
+ const int old_frame_since_key = cpi->rc.frames_since_key;
+ const int old_frame_to_key = cpi->rc.frames_to_key;
+ const int max_consec_drop = cpi->rc.max_consec_drop;
+ // Restore layer rate control.
+ cpi->rc = lc->rc;
+ cpi->ppi->p_rc = lc->p_rc;
+ cpi->oxcf.rc_cfg.target_bandwidth = lc->target_bandwidth;
+ cpi->gf_frame_index = 0;
+ cpi->mv_search_params.max_mv_magnitude = lc->max_mv_magnitude;
+ if (cpi->mv_search_params.max_mv_magnitude == 0)
+ cpi->mv_search_params.max_mv_magnitude = AOMMAX(cm->width, cm->height);
+ // Reset the frames_since_key and frames_to_key counters to their values
+ // before the layer restore. Keep these defined for the stream (not layer).
+ cpi->rc.frames_since_key = old_frame_since_key;
+ cpi->rc.frames_to_key = old_frame_to_key;
+ // Reset to value before the layer restore.
+ cpi->rc.max_consec_drop = max_consec_drop;
+ // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
+ // for the base temporal layer.
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ swap_ptr(&cr->map, &lc->map);
+ cr->sb_index = lc->sb_index;
+ cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks;
+ cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks;
+ cr->counter_encode_maxq_scene_change = lc->counter_encode_maxq_scene_change;
+ }
+ svc->skip_mvsearch_last = 0;
+ svc->skip_mvsearch_gf = 0;
+ svc->skip_mvsearch_altref = 0;
+  // For each reference (LAST/GOLDEN/ALTREF) set the corresponding
+  // skip_mvsearch flag.
+ // This is to skip searching mv for that reference if it was last
+ // refreshed (i.e., buffer slot holding that reference was refreshed) on the
+ // previous spatial layer(s) at the same time (current_superframe).
+ if (rtc_ref->set_ref_frame_config && svc->force_zero_mode_spatial_ref &&
+ cpi->sf.rt_sf.use_nonrd_pick_mode) {
+ if (check_ref_is_low_spatial_res_super_frame(LAST_FRAME, svc, rtc_ref)) {
+ svc->skip_mvsearch_last = 1;
+ }
+ if (check_ref_is_low_spatial_res_super_frame(GOLDEN_FRAME, svc, rtc_ref)) {
+ svc->skip_mvsearch_gf = 1;
+ }
+ if (check_ref_is_low_spatial_res_super_frame(ALTREF_FRAME, svc, rtc_ref)) {
+ svc->skip_mvsearch_altref = 1;
+ }
+ }
+}
+
+void av1_svc_update_buffer_slot_refreshed(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ const unsigned int current_frame =
+ cpi->ppi->use_svc ? svc->current_superframe
+ : cpi->common.current_frame.frame_number;
+ // For any buffer slot that is refreshed, update it with
+ // the spatial_layer_id and the current_superframe.
+ if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+ // All slots are refreshed on KEY.
+ for (unsigned int i = 0; i < REF_FRAMES; i++) {
+ rtc_ref->buffer_time_index[i] = current_frame;
+ rtc_ref->buffer_spatial_layer[i] = svc->spatial_layer_id;
+ }
+ } else if (rtc_ref->set_ref_frame_config) {
+ for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ const int ref_frame_map_idx = rtc_ref->ref_idx[i];
+ if (cpi->ppi->rtc_ref.refresh[ref_frame_map_idx]) {
+ rtc_ref->buffer_time_index[ref_frame_map_idx] = current_frame;
+ rtc_ref->buffer_spatial_layer[ref_frame_map_idx] =
+ svc->spatial_layer_id;
+ }
+ }
+ }
+}
+
+void av1_save_layer_context(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ const AV1_COMMON *const cm = &cpi->common;
+ LAYER_CONTEXT *lc = get_layer_context(cpi);
+ lc->rc = cpi->rc;
+ lc->p_rc = cpi->ppi->p_rc;
+ lc->target_bandwidth = (int)cpi->oxcf.rc_cfg.target_bandwidth;
+ lc->group_index = cpi->gf_frame_index;
+ lc->max_mv_magnitude = cpi->mv_search_params.max_mv_magnitude;
+ if (svc->spatial_layer_id == 0) svc->base_framerate = cpi->framerate;
+ // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
+ // for the base temporal layer.
+ if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->svc.number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ signed char *temp = lc->map;
+ lc->map = cr->map;
+ cr->map = temp;
+ lc->sb_index = cr->sb_index;
+ lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks;
+ lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks;
+ lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change;
+ }
+ av1_svc_update_buffer_slot_refreshed(cpi);
+ for (unsigned int i = 0; i < REF_FRAMES; i++) {
+ if (frame_is_intra_only(cm) ||
+ cm->current_frame.refresh_frame_flags & (1 << i)) {
+ svc->spatial_layer_fb[i] = svc->spatial_layer_id;
+ svc->temporal_layer_fb[i] = svc->temporal_layer_id;
+ }
+ }
+ if (svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+ svc->current_superframe++;
+ // Reset drop flag to false for next superframe.
+ for (int sl = 0; sl < svc->number_spatial_layers; sl++)
+ svc->drop_spatial_layer[sl] = false;
+ }
+}
+
+int av1_svc_primary_ref_frame(const AV1_COMP *const cpi) {
+ const SVC *const svc = &cpi->svc;
+ const AV1_COMMON *const cm = &cpi->common;
+ int fb_idx = -1;
+ int primary_ref_frame = PRIMARY_REF_NONE;
+ if (cpi->svc.number_spatial_layers > 1 ||
+ cpi->svc.number_temporal_layers > 1) {
+ // Set the primary_ref_frame to LAST_FRAME if that buffer slot for LAST
+ // was last updated on a lower temporal layer (or base TL0) and for the
+ // same spatial layer. For RTC patterns this allows for continued decoding
+ // when set of enhancement layers are dropped (continued decoding starting
+ // at next base TL0), so error_resilience can be off/0 for all layers.
+ fb_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
+ if (svc->spatial_layer_fb[fb_idx] == svc->spatial_layer_id &&
+ (svc->temporal_layer_fb[fb_idx] < svc->temporal_layer_id ||
+ svc->temporal_layer_fb[fb_idx] == 0)) {
+ primary_ref_frame = 0; // LAST_FRAME: ref_frame - LAST_FRAME
+ }
+ } else if (cpi->ppi->rtc_ref.set_ref_frame_config) {
+ const ExternalFlags *const ext_flags = &cpi->ext_flags;
+ int flags = ext_flags->ref_frame_flags;
+ if (flags & AOM_LAST_FLAG) {
+ primary_ref_frame = 0; // LAST_FRAME: ref_frame - LAST_FRAME
+ } else if (flags & AOM_GOLD_FLAG) {
+ primary_ref_frame = GOLDEN_FRAME - LAST_FRAME;
+ } else if (flags & AOM_ALT_FLAG) {
+ primary_ref_frame = ALTREF_FRAME - LAST_FRAME;
+ }
+ }
+ return primary_ref_frame;
+}
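+
+// Note: primary_ref_frame is encoded as an offset from LAST_FRAME, so under
+// AV1's reference enum LAST_FRAME maps to 0, GOLDEN_FRAME to 3 and
+// ALTREF_FRAME to 6, matching the branches above.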
+
+void av1_free_svc_cyclic_refresh(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ aom_free(lc->map);
+ lc->map = NULL;
+ }
+ }
+}
+
+void av1_svc_reset_temporal_layers(AV1_COMP *const cpi, int is_key) {
+ SVC *const svc = &cpi->svc;
+ LAYER_CONTEXT *lc = NULL;
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl];
+ if (is_key) lc->frames_from_key_frame = 0;
+ }
+ }
+ av1_update_temporal_layer_framerate(cpi);
+ av1_restore_layer_context(cpi);
+}
+
+void av1_get_layer_resolution(const int width_org, const int height_org,
+ const int num, const int den, int *width_out,
+ int *height_out) {
+ int w, h;
+ if (width_out == NULL || height_out == NULL || den == 0) return;
+ if (den == 1 && num == 1) {
+ *width_out = width_org;
+ *height_out = height_org;
+ return;
+ }
+ w = width_org * num / den;
+ h = height_org * num / den;
+ // Make height and width even.
+ w += w % 2;
+ h += h % 2;
+ *width_out = w;
+ *height_out = h;
+}
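+
+// Example (illustrative dimensions): a 1280x720 source with num/den = 1/2
+// gives 640x360; an odd intermediate result such as 427 (1281 * 1 / 3) would
+// be rounded up to 428 by the make-even adjustment above.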
+
+void av1_one_pass_cbr_svc_start_layer(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ AV1_COMMON *const cm = &cpi->common;
+ LAYER_CONTEXT *lc = NULL;
+ int width = 0, height = 0;
+ lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers +
+ svc->temporal_layer_id];
+ // Set the lower quality layer flag.
+ svc->has_lower_quality_layer = 0;
+ if (cpi->svc.spatial_layer_id > 0) {
+ const LAYER_CONTEXT *lc_prev =
+ &svc->layer_context[(svc->spatial_layer_id - 1) *
+ svc->number_temporal_layers +
+ svc->temporal_layer_id];
+ if (lc_prev->scaling_factor_den == 1 && lc_prev->scaling_factor_num == 1)
+ svc->has_lower_quality_layer = 1;
+ }
+ av1_get_layer_resolution(cpi->oxcf.frm_dim_cfg.width,
+ cpi->oxcf.frm_dim_cfg.height, lc->scaling_factor_num,
+ lc->scaling_factor_den, &width, &height);
+  // Use EIGHTTAP_SMOOTH for low resolutions.
+ if (width * height <= 320 * 240)
+ svc->downsample_filter_type[svc->spatial_layer_id] = EIGHTTAP_SMOOTH;
+
+ cm->width = width;
+ cm->height = height;
+ alloc_mb_mode_info_buffers(cpi);
+ av1_update_frame_size(cpi);
+ if (svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+ svc->mi_cols_full_resoln = cm->mi_params.mi_cols;
+ svc->mi_rows_full_resoln = cm->mi_params.mi_rows;
+ }
+}
+
+enum {
+ SVC_LAST_FRAME = 0,
+ SVC_LAST2_FRAME,
+ SVC_LAST3_FRAME,
+ SVC_GOLDEN_FRAME,
+ SVC_BWDREF_FRAME,
+ SVC_ALTREF2_FRAME,
+ SVC_ALTREF_FRAME
+};
+
+// For fixed svc mode: fixed pattern is set based on the number of
+// spatial and temporal layers, and the ksvc_fixed_mode.
+void av1_set_svc_fixed_mode(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ int i;
+ assert(svc->use_flexible_mode == 0);
+ // Fixed SVC mode only supports at most 3 spatial or temporal layers.
+ assert(svc->number_spatial_layers >= 1 && svc->number_spatial_layers <= 3 &&
+ svc->number_temporal_layers >= 1 && svc->number_temporal_layers <= 3);
+ rtc_ref->set_ref_frame_config = 1;
+ int superframe_cnt = svc->current_superframe;
+ // Set the reference map buffer idx for the 7 references:
+ // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+ // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = i;
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->reference[i] = 0;
+ for (i = 0; i < REF_FRAMES; i++) rtc_ref->refresh[i] = 0;
+ // Always reference LAST, and reference GOLDEN on SL > 0.
+ // For KSVC: GOLDEN reference will be removed on INTER_FRAMES later
+ // when frame_type is set.
+ rtc_ref->reference[SVC_LAST_FRAME] = 1;
+ if (svc->spatial_layer_id > 0) rtc_ref->reference[SVC_GOLDEN_FRAME] = 1;
+ if (svc->temporal_layer_id == 0) {
+ // Base temporal layer.
+ if (svc->spatial_layer_id == 0) {
+ // Set all buffer_idx to 0. Update slot 0 (LAST).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->refresh[0] = 1;
+ } else if (svc->spatial_layer_id == 1) {
+ // Set buffer_idx for LAST to slot 1, GOLDEN (and all other refs) to
+ // slot 0. Update slot 1 (LAST).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 1;
+ rtc_ref->refresh[1] = 1;
+ } else if (svc->spatial_layer_id == 2) {
+ // Set buffer_idx for LAST to slot 2, GOLDEN (and all other refs) to
+ // slot 1. Update slot 2 (LAST).
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 1;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 2;
+ rtc_ref->refresh[2] = 1;
+ }
+ } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 1) % 4 == 0) {
+ // First top temporal enhancement layer.
+ if (svc->spatial_layer_id == 0) {
+ // Reference LAST (slot 0).
+ // Set GOLDEN to slot 3 and update slot 3.
+ // Set all other buffer_idx to slot 0.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ rtc_ref->refresh[3] = 1;
+ }
+ } else if (svc->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 3.
+ // Set LAST2 to slot 4 and Update slot 4.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 3;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 1;
+ if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_LAST2_FRAME] = 4;
+ rtc_ref->refresh[4] = 1;
+ }
+ } else if (svc->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+ // GOLDEN (and all other refs) to slot 4.
+ // No update.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 4;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 2;
+ }
+ } else if (svc->temporal_layer_id == 1) {
+ // Middle temporal enhancement layer.
+ if (svc->spatial_layer_id == 0) {
+ // Reference LAST.
+ // Set all buffer_idx to 0.
+ // Set GOLDEN to slot 5 and update slot 5.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ if (svc->temporal_layer_id < svc->number_temporal_layers - 1 ||
+ svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 5;
+ rtc_ref->refresh[5] = 1;
+ }
+ } else if (svc->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1,
+ // GOLDEN (and all other refs) to slot 5.
+ // Set LAST3 to slot 6 and update slot 6.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 5;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 1;
+ if (svc->temporal_layer_id < svc->number_temporal_layers - 1 ||
+ svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_LAST3_FRAME] = 6;
+ rtc_ref->refresh[6] = 1;
+ }
+ } else if (svc->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2,
+ // GOLDEN (and all other refs) to slot 6.
+ // Set LAST3 to slot 7 and update slot 7.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 6;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 2;
+ if (svc->temporal_layer_id < svc->number_temporal_layers - 1) {
+ rtc_ref->ref_idx[SVC_LAST3_FRAME] = 7;
+ rtc_ref->refresh[7] = 1;
+ }
+ }
+ } else if (svc->temporal_layer_id == 2 && (superframe_cnt - 3) % 4 == 0) {
+ // Second top temporal enhancement layer.
+ if (svc->spatial_layer_id == 0) {
+ // Set LAST to slot 5 and reference LAST.
+ // Set GOLDEN to slot 3 and update slot 3.
+ // Set all other buffer_idx to 0.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 5;
+ if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ rtc_ref->refresh[3] = 1;
+ }
+ } else if (svc->spatial_layer_id == 1) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6,
+ // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 6;
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 3;
+ if (svc->spatial_layer_id < svc->number_spatial_layers - 1) {
+ rtc_ref->ref_idx[SVC_LAST2_FRAME] = 4;
+ rtc_ref->refresh[4] = 1;
+ }
+ } else if (svc->spatial_layer_id == 2) {
+ // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7,
+ // GOLDEN to slot 4. No update.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) rtc_ref->ref_idx[i] = 0;
+ rtc_ref->ref_idx[SVC_LAST_FRAME] = 7;
+ rtc_ref->ref_idx[SVC_GOLDEN_FRAME] = 4;
+ }
+ }
+}
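+// Recap of the fixed-pattern slot usage above (3 spatial x 3 temporal
+// layers): slots 0-2 hold the most recent TL0 frame of spatial layers 0-2,
+// slots 3-4 are refreshed by the first TL2 frames, and slots 5-7 are
+// refreshed by the TL1 frames (and referenced again by the second TL2
+// frames).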
+
+void av1_svc_check_reset_layer_rc_flag(AV1_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    // Check for reset based on avg_frame_bandwidth for spatial layer sl.
+    // If avg_frame_bandwidth for the top temporal layer is not set
+    // (because the enhancement layer was inactive), use the base TL0 layer.
+ int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ int avg_frame_bandwidth = lrc->avg_frame_bandwidth;
+ int prev_avg_frame_bandwidth = lrc->prev_avg_frame_bandwidth;
+ if (avg_frame_bandwidth == 0 || prev_avg_frame_bandwidth == 0) {
+ // Use base TL0.
+ layer = LAYER_IDS_TO_IDX(sl, 0, svc->number_temporal_layers);
+ lc = &svc->layer_context[layer];
+ lrc = &lc->rc;
+ avg_frame_bandwidth = lrc->avg_frame_bandwidth;
+ prev_avg_frame_bandwidth = lrc->prev_avg_frame_bandwidth;
+ }
+ if (avg_frame_bandwidth > (3 * prev_avg_frame_bandwidth >> 1) ||
+ avg_frame_bandwidth < (prev_avg_frame_bandwidth >> 1)) {
+ // Reset for all temporal layers with spatial layer sl.
+ for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ int layer2 = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc2 = &svc->layer_context[layer2];
+ RATE_CONTROL *lrc2 = &lc2->rc;
+        PRIMARY_RATE_CONTROL *lp_rc2 = &lc2->p_rc;
+        lrc2->rc_1_frame = 0;
+        lrc2->rc_2_frame = 0;
+        lp_rc2->bits_off_target = lp_rc2->optimal_buffer_level;
+        lp_rc2->buffer_level = lp_rc2->optimal_buffer_level;
+ }
+ }
+ }
+}
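+// Illustrative example of the reset threshold above (hypothetical numbers):
+// with prev_avg_frame_bandwidth = 1000, rate control is reset when
+// avg_frame_bandwidth rises above 1500 (= 3 * 1000 >> 1) or falls below
+// 500 (= 1000 >> 1), i.e. on a change of more than +/-50%.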
+
+void av1_svc_set_last_source(AV1_COMP *const cpi, EncodeFrameInput *frame_input,
+ YV12_BUFFER_CONFIG *prev_source) {
+  frame_input->last_source = prev_source;
+ if (!cpi->ppi->use_svc && cpi->rc.prev_frame_is_dropped &&
+ cpi->rc.frame_number_encoded > 0) {
+ frame_input->last_source = &cpi->svc.source_last_TL0;
+ } else {
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ if (cpi->svc.spatial_layer_id == 0) {
+      // For the base spatial layer: if the LAST reference (index 0) is not
+      // the previous (super)frame, set last_source to the source
+      // corresponding to the last TL0; otherwise keep it at prev_source.
+      // Always use source_last_TL0 if the previous base TL0 frame was
+      // dropped.
+ if (cpi->svc.current_superframe > 0) {
+ const int buffslot_last = rtc_ref->ref_idx[0];
+ // Check if previous frame was dropped on base TL0 layer.
+ const int layer =
+ LAYER_IDS_TO_IDX(0, 0, cpi->svc.number_temporal_layers);
+ LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ if (lrc->prev_frame_is_dropped ||
+ rtc_ref->buffer_time_index[buffslot_last] <
+ cpi->svc.current_superframe - 1) {
+ frame_input->last_source = &cpi->svc.source_last_TL0;
+ }
+ }
+ } else if (cpi->svc.spatial_layer_id > 0) {
+      // For spatial enhancement layers: the previous source (prev_source)
+      // corresponds to the lower spatial layer of the same superframe (i.e.,
+      // the same source), so it cannot serve as a temporal reference; always
+      // set last_source to the source of the last TL0.
+ if (cpi->svc.current_superframe > 0)
+ frame_input->last_source = &cpi->svc.source_last_TL0;
+ else
+ frame_input->last_source = NULL;
+ }
+ }
+}
+
+int av1_svc_get_min_ref_dist(const AV1_COMP *cpi) {
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ int min_dist = INT_MAX;
+ const unsigned int current_frame_num =
+ cpi->ppi->use_svc ? cpi->svc.current_superframe
+ : cpi->common.current_frame.frame_number;
+ for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ if (cpi->ppi->rtc_ref.reference[i]) {
+ const int ref_frame_map_idx = rtc_ref->ref_idx[i];
+ const int dist =
+ current_frame_num - rtc_ref->buffer_time_index[ref_frame_map_idx];
+ if (dist < min_dist) min_dist = dist;
+ }
+ }
+ return min_dist;
+}
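+// Illustrative example: if the current superframe is 10 and the enabled
+// references map to buffer slots last refreshed at superframes 9 and 6,
+// the distances are 1 and 4, so av1_svc_get_min_ref_dist() returns 1.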
+
+void av1_svc_set_reference_was_previous(AV1_COMP *cpi) {
+ RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref;
+ // Check if the encoded frame had some reference that was the
+ // previous frame.
+ const unsigned int current_frame =
+ cpi->ppi->use_svc ? cpi->svc.current_superframe
+ : cpi->common.current_frame.frame_number;
+ rtc_ref->reference_was_previous_frame = true;
+ if (current_frame > 0) {
+ rtc_ref->reference_was_previous_frame = false;
+ for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ if (rtc_ref->reference[i]) {
+ const int ref_frame_map_idx = rtc_ref->ref_idx[i];
+ if (rtc_ref->buffer_time_index[ref_frame_map_idx] == current_frame - 1)
+ rtc_ref->reference_was_previous_frame = true;
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/svc_layercontext.h b/third_party/aom/av1/encoder/svc_layercontext.h
new file mode 100644
index 0000000000..93118be2d4
--- /dev/null
+++ b/third_party/aom/av1/encoder/svc_layercontext.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
+#define AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
+
+#include "aom_scale/yv12config.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * \brief The structure of quantities related to each spatial and temporal layer.
+ * \ingroup SVC
+ */
+typedef struct {
+ /*!\cond */
+ RATE_CONTROL rc;
+ PRIMARY_RATE_CONTROL p_rc;
+ int framerate_factor;
+ int64_t layer_target_bitrate; // In bits per second.
+ int scaling_factor_num;
+ int scaling_factor_den;
+ int64_t target_bandwidth;
+ int64_t spatial_layer_target_bandwidth;
+ double framerate;
+ int avg_frame_size;
+ int max_q;
+ int min_q;
+ int frames_from_key_frame;
+ /*!\endcond */
+
+ /*!
+   * Cyclic refresh parameters (aq-mode=3) that need to be updated per frame.
+ */
+ int sb_index;
+ /*!
+ * Segmentation map
+ */
+ int8_t *map;
+ /*!
+ * Number of blocks on segment 1
+ */
+ int actual_num_seg1_blocks;
+
+ /*!
+ * Number of blocks on segment 2
+ */
+ int actual_num_seg2_blocks;
+ /*!
+ * Counter used to detect scene change.
+ */
+ int counter_encode_maxq_scene_change;
+
+ /*!
+ * Speed settings for each layer.
+ */
+ uint8_t speed;
+ /*!
+ * GF group index.
+ */
+ unsigned char group_index;
+ /*!
+   * Whether the current layer is a key frame.
+ */
+ int is_key_frame;
+ /*!
+ * Maximum motion magnitude of previous encoded layer.
+ */
+ int max_mv_magnitude;
+} LAYER_CONTEXT;
+
+/*!
+ * \brief The structure of SVC.
+ * \ingroup SVC
+ */
+typedef struct SVC {
+ /*!\cond */
+ int spatial_layer_id;
+ int temporal_layer_id;
+ int number_spatial_layers;
+ int number_temporal_layers;
+ int prev_number_spatial_layers;
+ int use_flexible_mode;
+ int ksvc_fixed_mode;
+ /*!\endcond */
+
+ /*!\cond */
+ double base_framerate;
+ unsigned int current_superframe;
+ int skip_mvsearch_last;
+ int skip_mvsearch_gf;
+ int skip_mvsearch_altref;
+ int spatial_layer_fb[REF_FRAMES];
+ int temporal_layer_fb[REF_FRAMES];
+ int num_encoded_top_layer;
+ int first_layer_denoise;
+ YV12_BUFFER_CONFIG source_last_TL0;
+ int mi_cols_full_resoln;
+ int mi_rows_full_resoln;
+ /*!\endcond */
+
+ /*!
+ * Layer context used for rate control in CBR mode.
+ * An array. The index for spatial layer `sl` and temporal layer `tl` is
+ * sl * number_temporal_layers + tl.
+ */
+ LAYER_CONTEXT *layer_context;
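+  /*
+   * Usage sketch (illustrative): with number_spatial_layers = 2 and
+   * number_temporal_layers = 3, the context for sl = 1, tl = 2 is
+   *   LAYER_CONTEXT *lc = &svc->layer_context[1 * 3 + 2];  // index 5
+   * which matches LAYER_IDS_TO_IDX(1, 2, 3).
+   */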
+
+ /*!
+ * Number of layers allocated for layer_context. If nonzero, must be greater
+ * than or equal to number_spatial_layers * number_temporal_layers.
+ */
+ int num_allocated_layers;
+
+ /*!
+ * EIGHTTAP_SMOOTH or BILINEAR
+ */
+ InterpFilter downsample_filter_type[AOM_MAX_SS_LAYERS];
+
+ /*!
+   * Downsample filter phase: 0 performs plain sub-sampling (no weighted
+   * average); 8 centers the target pixel and yields a symmetric averaging
+   * filter.
+ int downsample_filter_phase[AOM_MAX_SS_LAYERS];
+
+ /*!
+ * Force zero-mv in mode search for the spatial/inter-layer reference.
+ */
+ int force_zero_mode_spatial_ref;
+
+ /*!
+ * Flag to indicate that current spatial layer has a lower quality layer
+ * (at the same timestamp) that can be used as a reference.
+ * Lower quality layer refers to the same resolution but encoded at
+ * different/lower bitrate.
+ */
+ int has_lower_quality_layer;
+
+ /*!
+   * Frame drop mode for SVC: either AOM_LAYER_DROP (default) or
+   * AOM_FULL_SUPERFRAME_DROP.
+ */
+ AOM_SVC_FRAME_DROP_MODE framedrop_mode;
+
+ /*!
+ * Flag to indicate if frame was dropped for a given spatial_layer_id on
+ * previous superframe.
+ */
+ bool last_layer_dropped[AOM_MAX_SS_LAYERS];
+
+ /*!
+   * Flag to indicate if a previous spatial layer was dropped for the same
+   * superframe.
+ */
+ bool drop_spatial_layer[AOM_MAX_SS_LAYERS];
+} SVC;
+
+struct AV1_COMP;
+struct EncodeFrameInput;
+
+/*!\brief Initialize layer context data from init_config().
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Nothing returned. Set cpi->svc.
+ */
+void av1_init_layer_context(struct AV1_COMP *const cpi);
+
+/*!\brief Allocate layer context data.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] num_layers Number of layers to be allocated
+ *
+ * \remark Allocates memory for cpi->svc.layer_context.
+ * \return True on success, false on allocation failure.
+ */
+bool av1_alloc_layer_context(struct AV1_COMP *cpi, int num_layers);
+
+/*!\brief Update the layer context from a change_config() call.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] target_bandwidth Total target bandwidth
+ *
+ * \remark Nothing returned. Buffer level for each layer is set.
+ */
+void av1_update_layer_context_change_config(struct AV1_COMP *const cpi,
+ const int64_t target_bandwidth);
+
+/*!\brief Prior to encoding the frame, update framerate-related quantities
+ for the current temporal layer.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Nothing returned. Frame related quantities for current temporal
+ layer are updated.
+ */
+void av1_update_temporal_layer_framerate(struct AV1_COMP *const cpi);
+
+/*!\brief Prior to encoding the frame, set the layer context, for the current
+ layer to be encoded, to the cpi struct.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \remark Nothing returned. Layer context for current layer is set.
+ */
+void av1_restore_layer_context(struct AV1_COMP *const cpi);
+
+/*!\brief Save the layer context after encoding the frame.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ */
+void av1_save_layer_context(struct AV1_COMP *const cpi);
+
+/*!\brief Free the memory used for cyclic refresh in layer context.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ */
+void av1_free_svc_cyclic_refresh(struct AV1_COMP *const cpi);
+
+/*!\brief Reset on key frame: reset counters, references and buffer updates.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] is_key Whether current layer is key frame
+ */
+void av1_svc_reset_temporal_layers(struct AV1_COMP *const cpi, int is_key);
+
+/*!\brief Before encoding, set resolutions and allocate compressor data.
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ */
+void av1_one_pass_cbr_svc_start_layer(struct AV1_COMP *const cpi);
+
+/*!\brief Get primary reference frame for current layer
+ *
+ * \ingroup SVC
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ *
+ * \return The primary reference frame for current layer.
+ */
+int av1_svc_primary_ref_frame(const struct AV1_COMP *const cpi);
+
+/*!\brief Get resolution for current layer.
+ *
+ * \ingroup SVC
+ * \param[in] width_org Original width, unscaled
+ * \param[in] height_org Original height, unscaled
+ * \param[in] num Numerator for the scale ratio
+ * \param[in] den Denominator for the scale ratio
+ * \param[out] width_out Output width, scaled for current layer
+ * \param[out] height_out Output height, scaled for current layer
+ *
+ * \remark Nothing is returned. Instead the scaled width and height are set.
+ */
+void av1_get_layer_resolution(const int width_org, const int height_org,
+ const int num, const int den, int *width_out,
+ int *height_out);
+
+void av1_set_svc_fixed_mode(struct AV1_COMP *const cpi);
+
+void av1_svc_check_reset_layer_rc_flag(struct AV1_COMP *const cpi);
+
+void av1_svc_set_last_source(struct AV1_COMP *const cpi,
+ struct EncodeFrameInput *frame_input,
+ YV12_BUFFER_CONFIG *prev_source);
+
+void av1_svc_update_buffer_slot_refreshed(struct AV1_COMP *const cpi);
+
+int av1_svc_get_min_ref_dist(const struct AV1_COMP *cpi);
+
+void av1_svc_set_reference_was_previous(struct AV1_COMP *cpi);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_
diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c
new file mode 100644
index 0000000000..7d4d25de6a
--- /dev/null
+++ b/third_party/aom/av1/encoder/temporal_filter.c
@@ -0,0 +1,1520 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <float.h>
+#include <math.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/odintrin.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/intra_mode_search_utils.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/temporal_filter.h"
+
+/*!\cond */
+
+// NOTE: All `tf` in this file means `temporal filtering`.
+
+// Forward Declaration.
+static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+ MV *subblock_mvs, int *subblock_mses);
+
+// This function returns the minimum and maximum log variances for 4x4 sub
+// blocks in the current block.
+static INLINE void get_log_var_4x4sub_blk(
+ AV1_COMP *cpi, const YV12_BUFFER_CONFIG *const frame_to_filter, int mb_row,
+ int mb_col, BLOCK_SIZE block_size, double *blk_4x4_var_min,
+ double *blk_4x4_var_max, int is_hbd) {
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ int var_min = INT_MAX;
+ int var_max = 0;
+
+ // Derive the source buffer.
+ const int src_stride = frame_to_filter->y_stride;
+ const int y_offset = mb_row * mb_height * src_stride + mb_col * mb_width;
+ const uint8_t *src_buf = frame_to_filter->y_buffer + y_offset;
+
+ for (int i = 0; i < mb_height; i += MI_SIZE) {
+ for (int j = 0; j < mb_width; j += MI_SIZE) {
+ // Calculate the 4x4 sub-block variance.
+ const int var = av1_calc_normalized_variance(
+ cpi->ppi->fn_ptr[BLOCK_4X4].vf, src_buf + (i * src_stride) + j,
+ src_stride, is_hbd);
+
+ // Record min and max for over-arching block
+ var_min = AOMMIN(var_min, var);
+ var_max = AOMMAX(var_max, var);
+ }
+ }
+
+ *blk_4x4_var_min = log1p(var_min / 16.0);
+ *blk_4x4_var_max = log1p(var_max / 16.0);
+}
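+// Note: each 4x4 variance is divided by the 16 pixels of the sub-block
+// before the log1p() mapping, so, illustratively, var = 160 maps to
+// log1p(10.0) ~= 2.4, and a max/min spread of 4.0 in this log domain
+// corresponds to roughly a 55x ratio in raw variance.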
+
+/*!\endcond */
+/*!\brief Does motion search for blocks in temporal filtering. This is
+ * the first step for temporal filtering. More specifically, given a frame to
+ * be filtered and another frame as reference, this function searches the
+ * reference frame for the block most similar to the one in the frame to be
+ * filtered. The found block will be further used for weighted averaging.
+ *
+ * NOTE: Besides doing motion search for the entire block, this function will
+ * also do motion search for each 1/4 sub-block to get more precise
+ * predictions. Then, this function determines whether to use 4
+ * sub-blocks to replace the entire block. If we do need to split the
+ * entire block, 4 elements in `subblock_mvs` and `subblock_mses` refer to
+ * the searched motion vector and search error (MSE) w.r.t. each sub-block
+ * respectively. Otherwise, the 4 elements will be the same, all of which
+ * are assigned as the searched motion vector and search error (MSE) for
+ * the entire block.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] mb Pointer to macroblock
+ * \param[in] frame_to_filter Pointer to the frame to be filtered
+ * \param[in] ref_frame Pointer to the reference frame
+ * \param[in] block_size Block size used for motion search
+ * \param[in] mb_row Row index of the block in the frame
+ * \param[in] mb_col Column index of the block in the frame
+ * \param[in] ref_mv Reference motion vector, which is commonly
+ * inherited from the motion search result of
+ * previous frame.
+ * \param[in] allow_me_for_sub_blks Flag to indicate whether motion search at
+ * 16x16 sub-block level is needed or not.
+ * \param[out] subblock_mvs Pointer to the motion vectors for
+ * 4 sub-blocks
+ * \param[out] subblock_mses Pointer to the search errors (MSE) for
+ * 4 sub-blocks
+ *
+ * \remark Nothing will be returned. Results are saved in subblock_mvs and
+ * subblock_mses
+ */
+static void tf_motion_search(AV1_COMP *cpi, MACROBLOCK *mb,
+ const YV12_BUFFER_CONFIG *frame_to_filter,
+ const YV12_BUFFER_CONFIG *ref_frame,
+ const BLOCK_SIZE block_size, const int mb_row,
+ const int mb_col, MV *ref_mv,
+ bool allow_me_for_sub_blks, MV *subblock_mvs,
+ int *subblock_mses) {
+ // Frame information
+ const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height);
+
+ // Block information (ONLY Y-plane is used for motion search).
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int mb_pels = mb_height * mb_width;
+ const int y_stride = frame_to_filter->y_stride;
+ const int src_width = frame_to_filter->y_width;
+ const int ref_width = ref_frame->y_width;
+ assert(y_stride == ref_frame->y_stride);
+ assert(src_width == ref_width);
+ const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+
+ // Save input state.
+ MACROBLOCKD *const mbd = &mb->e_mbd;
+ const struct buf_2d ori_src_buf = mb->plane[0].src;
+ const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0];
+
+ // Parameters used for motion search.
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ const int step_param = av1_init_search_range(
+ AOMMAX(frame_to_filter->y_crop_width, frame_to_filter->y_crop_height));
+ const SUBPEL_SEARCH_TYPE subpel_search_type = USE_8_TAPS;
+ const int force_integer_mv = cpi->common.features.cur_frame_force_integer_mv;
+ const MV_COST_TYPE mv_cost_type =
+ min_frame_size >= 720
+ ? MV_COST_L1_HDRES
+ : (min_frame_size >= 480 ? MV_COST_L1_MIDRES : MV_COST_L1_LOWRES);
+
+ // Starting position for motion search.
+ FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv);
+ // Baseline position for motion search (used for rate distortion comparison).
+ const MV baseline_mv = kZeroMv;
+
+ // Setup.
+ mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset;
+ mb->plane[0].src.stride = y_stride;
+ mb->plane[0].src.width = src_width;
+ mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset;
+ mbd->plane[0].pre[0].stride = y_stride;
+ mbd->plane[0].pre[0].width = ref_width;
+
+ const SEARCH_METHODS search_method = NSTEP;
+ const search_site_config *search_site_cfg =
+ av1_get_search_site_config(cpi, mb, search_method);
+
+ // Unused intermediate results for motion search.
+ unsigned int sse, error;
+ int distortion;
+ int cost_list[5];
+
+ // Do motion search.
+ int_mv best_mv; // Searched motion vector.
+ FULLPEL_MV_STATS best_mv_stats;
+ int block_mse = INT_MAX;
+ MV block_mv = kZeroMv;
+ const int q = av1_get_q(cpi);
+
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
+ &baseline_mv, start_mv, search_site_cfg,
+ search_method,
+ /*fine_search_interval=*/0);
+ full_ms_params.run_mesh_search = 1;
+ full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
+
+ if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
+ // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
+ full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1;
+ full_ms_params.mesh_search_mv_diff_threshold = 2;
+ }
+
+ av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list), &best_mv.as_fullmv,
+ &best_mv_stats, NULL);
+
+ if (force_integer_mv == 1) { // Only do full search on the entire block.
+ const int mv_row = best_mv.as_mv.row;
+ const int mv_col = best_mv.as_mv.col;
+ best_mv.as_mv.row = GET_MV_SUBPEL(mv_row);
+ best_mv.as_mv.col = GET_MV_SUBPEL(mv_col);
+ const int mv_offset = mv_row * y_stride + mv_col;
+ error = cpi->ppi->fn_ptr[block_size].vf(
+ ref_frame->y_buffer + y_offset + mv_offset, y_stride,
+ frame_to_filter->y_buffer + y_offset, y_stride, &sse);
+ block_mse = DIVIDE_AND_ROUND(error, mb_pels);
+ block_mv = best_mv.as_mv;
+ } else { // Do fractional search on the entire block and all sub-blocks.
+ av1_make_default_subpel_ms_params(&ms_params, cpi, mb, block_size,
+ &baseline_mv, cost_list);
+ ms_params.forced_stop = EIGHTH_PEL;
+ ms_params.var_params.subpel_search_type = subpel_search_type;
+ // Since we are merely refining the result from full pixel search, we don't
+ // need regularization for subpel search
+ ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+ best_mv_stats.err_cost = 0;
+
+ MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+ error = cpi->mv_search_params.find_fractional_mv_step(
+ &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv_stats,
+ &best_mv.as_mv, &distortion, &sse, NULL);
+ block_mse = DIVIDE_AND_ROUND(error, mb_pels);
+ block_mv = best_mv.as_mv;
+ *ref_mv = best_mv.as_mv;
+
+ if (allow_me_for_sub_blks) {
+ // On 4 sub-blocks.
+ const BLOCK_SIZE subblock_size = av1_ss_size_lookup[block_size][1][1];
+ const int subblock_height = block_size_high[subblock_size];
+ const int subblock_width = block_size_wide[subblock_size];
+ const int subblock_pels = subblock_height * subblock_width;
+ start_mv = get_fullmv_from_mv(ref_mv);
+
+ int subblock_idx = 0;
+ for (int i = 0; i < mb_height; i += subblock_height) {
+ for (int j = 0; j < mb_width; j += subblock_width) {
+ const int offset = i * y_stride + j;
+ mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset;
+ mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset;
+ av1_make_default_fullpel_ms_params(
+ &full_ms_params, cpi, mb, subblock_size, &baseline_mv, start_mv,
+ search_site_cfg, search_method,
+ /*fine_search_interval=*/0);
+ full_ms_params.run_mesh_search = 1;
+ full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
+
+ if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
+ // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
+ full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1;
+ full_ms_params.mesh_search_mv_diff_threshold = 2;
+ }
+ av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list),
+ &best_mv.as_fullmv, &best_mv_stats, NULL);
+
+ av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size,
+ &baseline_mv, cost_list);
+ ms_params.forced_stop = EIGHTH_PEL;
+ ms_params.var_params.subpel_search_type = subpel_search_type;
+ // Since we are merely refining the result from full pixel search, we
+ // don't need regularization for subpel search
+ ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+ best_mv_stats.err_cost = 0;
+
+ subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+ assert(
+ av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+ error = cpi->mv_search_params.find_fractional_mv_step(
+ &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv,
+ &best_mv_stats, &best_mv.as_mv, &distortion, &sse, NULL);
+ subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels);
+ subblock_mvs[subblock_idx] = best_mv.as_mv;
+ ++subblock_idx;
+ }
+ }
+ }
+ }
+
+ // Restore input state.
+ mb->plane[0].src = ori_src_buf;
+ mbd->plane[0].pre[0] = ori_pre_buf;
+
+ // Make partition decision.
+ if (allow_me_for_sub_blks) {
+ tf_determine_block_partition(block_mv, block_mse, subblock_mvs,
+ subblock_mses);
+ } else {
+    // Copy the whole-block mv and mse values to all sub-blocks.
+ for (int i = 0; i < 4; ++i) {
+ subblock_mvs[i] = block_mv;
+ subblock_mses[i] = block_mse;
+ }
+ }
+ // Do not pass down the reference motion vector if error is too large.
+ const int thresh = (min_frame_size >= 720) ? 12 : 3;
+ if (block_mse > (thresh << (mbd->bd - 8))) {
+ *ref_mv = kZeroMv;
+ }
+}
+/*!\cond */
+
+// Determines whether to split the entire block into 4 sub-blocks for
+// filtering.
+// In particular, this decision is made based on the comparison between the
+// motion search error of the entire block and the errors of all sub-blocks.
+// Inputs:
+// block_mv: Motion vector for the entire block (ONLY as reference).
+// block_mse: Motion search error (MSE) for the entire block (ONLY as
+// reference).
+// subblock_mvs: Pointer to the motion vectors for 4 sub-blocks (will be
+// modified based on the partition decision).
+// subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks (will
+// be modified based on the partition decision).
+// Returns:
+// Nothing will be returned. Results are saved in `subblock_mvs` and
+// `subblock_mses`.
+static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+ MV *subblock_mvs, int *subblock_mses) {
+ int min_subblock_mse = INT_MAX;
+ int max_subblock_mse = INT_MIN;
+ int64_t sum_subblock_mse = 0;
+ for (int i = 0; i < 4; ++i) {
+ sum_subblock_mse += subblock_mses[i];
+ min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
+ max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
+ }
+
+// TODO(any): The following magic numbers may be tuned to improve
+// performance, or a way may be found to get rid of them.
+ if (((block_mse * 15 < sum_subblock_mse * 4) &&
+ max_subblock_mse - min_subblock_mse < 48) ||
+ ((block_mse * 14 < sum_subblock_mse * 4) &&
+ max_subblock_mse - min_subblock_mse < 24)) { // No split.
+ for (int i = 0; i < 4; ++i) {
+ subblock_mvs[i] = block_mv;
+ subblock_mses[i] = block_mse;
+ }
+ }
+}
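+// Reading the no-split conditions above (illustrative): with
+// sum_subblock_mse = 4 * avg_mse, "block_mse * 15 < sum_subblock_mse * 4"
+// means block_mse < (16 / 15) * avg_mse, i.e. the whole-block error is at
+// most ~7% above the average sub-block error; the "* 14" variant allows up
+// to ~14% but requires a tighter sub-block spread (24 instead of 48). In
+// both cases the split is skipped when sub-blocks barely reduce the MSE
+// and their errors are uniform.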
+
+// Helper function to determine whether a frame is encoded with high bit-depth.
+static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
+ return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+}
+
+/*!\endcond */
+/*!\brief Builds predictor for blocks in temporal filtering. This is the
+ * second step for temporal filtering, which is to construct predictions from
+ * all reference frames INCLUDING the frame to be filtered itself. These
+ * predictors are built based on the motion search results (motion vector is
+ * set as 0 for the frame to be filtered), and will be futher used for
+ * weighted averaging.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] ref_frame Pointer to the reference frame (or the frame
+ * to be filtered)
+ * \param[in] mbd Pointer to the block for filtering. Besides
+ * containing the subsampling information of all
+ * planes, this field also gives the searched
+ * motion vector for the entire block, i.e.,
+ * `mbd->mi[0]->mv[0]`. This vector should be 0
+ * if the `ref_frame` itself is the frame to be
+ * filtered.
+ * \param[in] block_size Size of the block
+ * \param[in] mb_row Row index of the block in the frame
+ * \param[in] mb_col Column index of the block in the frame
+ * \param[in] num_planes Number of planes in the frame
+ * \param[in] scale Scaling factor
+ * \param[in] subblock_mvs The motion vectors for each sub-block (row-major
+ * order)
+ * \param[out] pred Pointer to the predictor to be built
+ *
+ * \remark Nothing returned, but the contents of `pred` will be modified
+ */
+static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
+ const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row,
+ const int mb_col, const int num_planes,
+ const struct scale_factors *scale,
+ const MV *subblock_mvs, uint8_t *pred) {
+ // Information of the entire block.
+ const int mb_height = block_size_high[block_size]; // Height.
+ const int mb_width = block_size_wide[block_size]; // Width.
+ const int mb_y = mb_height * mb_row; // Y-coord (Top-left).
+ const int mb_x = mb_width * mb_col; // X-coord (Top-left).
+ const int bit_depth = mbd->bd; // Bit depth.
+ const int is_intrabc = 0; // Is intra-copied?
+ const int is_high_bitdepth = is_frame_high_bitdepth(ref_frame);
+
+ // Default interpolation filters.
+ const int_interpfilters interp_filters =
+ av1_broadcast_interp_filter(MULTITAP_SHARP2);
+
+ // Handle Y-plane, U-plane and V-plane (if needed) in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int subsampling_y = mbd->plane[plane].subsampling_y;
+ const int subsampling_x = mbd->plane[plane].subsampling_x;
+ // Information of each sub-block in current plane.
+ const int plane_h = mb_height >> subsampling_y; // Plane height.
+ const int plane_w = mb_width >> subsampling_x; // Plane width.
+ const int plane_y = mb_y >> subsampling_y; // Y-coord (Top-left).
+ const int plane_x = mb_x >> subsampling_x; // X-coord (Top-left).
+ const int h = plane_h >> 1; // Sub-block height.
+ const int w = plane_w >> 1; // Sub-block width.
+ const int is_y_plane = (plane == 0); // Is Y-plane?
+
+ const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane],
+ ref_frame->widths[is_y_plane ? 0 : 1],
+ ref_frame->heights[is_y_plane ? 0 : 1],
+ ref_frame->strides[is_y_plane ? 0 : 1] };
+
+ // Handle each subblock.
+ int subblock_idx = 0;
+ for (int i = 0; i < plane_h; i += h) {
+ for (int j = 0; j < plane_w; j += w) {
+ // Choose proper motion vector.
+ const MV mv = subblock_mvs[subblock_idx++];
+ assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
+ mv.col >= INT16_MIN && mv.col <= INT16_MAX);
+
+ const int y = plane_y + i;
+ const int x = plane_x + j;
+
+        // Build the predictor for each sub-block on the current plane.
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
+ subsampling_y, bit_depth, is_high_bitdepth,
+ is_intrabc, scale, &ref_buf, interp_filters);
+ inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+ av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j],
+ plane_w, &mv, &inter_pred_params);
+ }
+ }
+ plane_offset += plane_h * plane_w;
+ }
+}
+/*!\cond */
+
+// Computes temporal filter weights and accumulators for the frame to be
+// filtered. More concretely, the filter weights for all pixels are the same.
+// Inputs:
+// mbd: Pointer to the block for filtering, which is ONLY used to get
+// subsampling information of all planes as well as the bit-depth.
+// block_size: Size of the block.
+// num_planes: Number of planes in the frame.
+// pred: Pointer to the well-built predictors.
+// accum: Pointer to the pixel-wise accumulator for filtering.
+//   count: Pointer to the pixel-wise counter for filtering.
+// Returns:
+//   Nothing will be returned. But the content to which `accum` and `count`
+// point will be modified.
+void tf_apply_temporal_filter_self(const YV12_BUFFER_CONFIG *ref_frame,
+ const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size,
+ const int mb_row, const int mb_col,
+ const int num_planes, uint32_t *accum,
+ uint16_t *count) {
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int is_high_bitdepth = is_cur_buf_hbd(mbd);
+
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int subsampling_y = mbd->plane[plane].subsampling_y;
+ const int subsampling_x = mbd->plane[plane].subsampling_x;
+ const int h = mb_height >> subsampling_y; // Plane height.
+ const int w = mb_width >> subsampling_x; // Plane width.
+
+ const int frame_stride = ref_frame->strides[plane == AOM_PLANE_Y ? 0 : 1];
+ const uint8_t *buf8 = ref_frame->buffers[plane];
+ const uint16_t *buf16 = CONVERT_TO_SHORTPTR(buf8);
+ const int frame_offset = mb_row * h * frame_stride + mb_col * w;
+
+ int pred_idx = 0;
+ int pixel_idx = 0;
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ const int idx = plane_offset + pred_idx; // Index with plane shift.
+ const int pred_value = is_high_bitdepth
+ ? buf16[frame_offset + pixel_idx]
+ : buf8[frame_offset + pixel_idx];
+ accum[idx] += TF_WEIGHT_SCALE * pred_value;
+ count[idx] += TF_WEIGHT_SCALE;
+ ++pred_idx;
+ ++pixel_idx;
+ }
+ pixel_idx += (frame_stride - w);
+ }
+ plane_offset += h * w;
+ }
+}
+
+// Function to compute pixel-wise squared difference between two buffers.
+// Inputs:
+// ref: Pointer to reference buffer.
+// ref_offset: Start position of reference buffer for computation.
+// ref_stride: Stride for reference buffer.
+// tgt: Pointer to target buffer.
+// tgt_offset: Start position of target buffer for computation.
+// tgt_stride: Stride for target buffer.
+// height: Height of block for computation.
+// width: Width of block for computation.
+// is_high_bitdepth: Whether the two buffers point to high bit-depth frames.
+//   square_diff: Pointer to save the squared differences.
+// Returns:
+// Nothing will be returned. But the content to which `square_diff` points
+// will be modified.
+static INLINE void compute_square_diff(const uint8_t *ref, const int ref_offset,
+ const int ref_stride, const uint8_t *tgt,
+ const int tgt_offset,
+ const int tgt_stride, const int height,
+ const int width,
+ const int is_high_bitdepth,
+ uint32_t *square_diff) {
+ const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+ const uint16_t *tgt16 = CONVERT_TO_SHORTPTR(tgt);
+
+ int ref_idx = 0;
+ int tgt_idx = 0;
+ int idx = 0;
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const uint16_t ref_value = is_high_bitdepth ? ref16[ref_offset + ref_idx]
+ : ref[ref_offset + ref_idx];
+ const uint16_t tgt_value = is_high_bitdepth ? tgt16[tgt_offset + tgt_idx]
+ : tgt[tgt_offset + tgt_idx];
+ const uint32_t diff = (ref_value > tgt_value) ? (ref_value - tgt_value)
+ : (tgt_value - ref_value);
+ square_diff[idx] = diff * diff;
+
+ ++ref_idx;
+ ++tgt_idx;
+ ++idx;
+ }
+ ref_idx += (ref_stride - width);
+ tgt_idx += (tgt_stride - width);
+ }
+}
+
+// Function to accumulate pixel-wise squared difference between two luma buffers
+// to be consumed while filtering the chroma planes.
+// Inputs:
+// square_diff: Pointer to squared differences from luma plane.
+// luma_sse_sum: Pointer to save the sum of luma squared differences.
+// block_height: Height of block for computation.
+// block_width: Width of block for computation.
+// ss_x_shift: Chroma subsampling shift in 'X' direction
+// ss_y_shift: Chroma subsampling shift in 'Y' direction
+// Returns:
+// Nothing will be returned. But the content to which `luma_sse_sum` points
+// will be modified.
+void compute_luma_sq_error_sum(uint32_t *square_diff, uint32_t *luma_sse_sum,
+ int block_height, int block_width,
+ int ss_x_shift, int ss_y_shift) {
+ for (int i = 0; i < block_height; ++i) {
+ for (int j = 0; j < block_width; ++j) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ const int ww = block_width << ss_x_shift; // Width of Y-plane.
+ luma_sse_sum[i * block_width + j] += square_diff[yy * ww + xx];
+ }
+ }
+ }
+ }
+}
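+// Illustrative example: for 4:2:0 content, ss_x_shift = ss_y_shift = 1, so
+// each chroma position (i, j) accumulates the squared differences of the
+// 1 << (1 + 1) = 4 luma pixels it covers on the Y plane.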
+
+/*!\endcond */
+/*!\brief Applies temporal filtering. NOTE that various optimised versions
+ * of this function are called where the appropriate instruction set is
+ * supported.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] frame_to_filter Pointer to the frame to be filtered, which is
+ * used as reference to compute squared
+ * difference from the predictor.
+ * \param[in] mbd Pointer to the block for filtering, ONLY used
+ * to get subsampling information for the planes
+ * \param[in] block_size Size of the block
+ * \param[in] mb_row Row index of the block in the frame
+ * \param[in] mb_col Column index of the block in the frame
+ * \param[in] num_planes Number of planes in the frame
+ * \param[in] noise_levels Estimated noise levels for each plane
+ * in the frame (Y,U,V)
+ * \param[in] subblock_mvs Pointer to the motion vectors for 4 sub-blocks
+ * \param[in] subblock_mses Pointer to the search errors (MSE) for 4
+ * sub-blocks
+ * \param[in] q_factor Quantization factor. This is actually the `q`
+ * defined in libaom, converted from `qindex`
+ * \param[in] filter_strength Filtering strength. This value lies in range
+ * [0, 6] where 6 is the maximum strength.
+ * \param[in] tf_wgt_calc_lvl Controls the weight calculation method during
+ * temporal filtering
+ * \param[in] pred Pointer to the well-built predictors
+ * \param[out] accum Pointer to the pixel-wise accumulator for
+ * filtering
+ * \param[out] count Pointer to the pixel-wise counter for
+ * filtering
+ *
+ * \remark Nothing returned, but the contents of `accum` and `count`
+ * will be modified
+ */
+void av1_apply_temporal_filter_c(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int mb_pels = mb_height * mb_width;
+ const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
+ const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
+ // Frame information.
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Decay factors for non-local mean approach.
+ double decay_factor[MAX_MB_PLANE] = { 0 };
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ for (int plane = 0; plane < num_planes; plane++) {
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ decay_factor[plane] = 1 / (n_decay * q_decay * s_decay);
+ }
+ double d_factor[4] = { 0 };
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+  // Allocate memory for pixel-wise squared differences. Regardless of the
+  // subsampling, they are allocated with memory of size `mb_pels`.
+ uint32_t *square_diff = aom_memalign(16, mb_pels * sizeof(uint32_t));
+ if (!square_diff) {
+ aom_internal_error(mbd->error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating temporal filter data");
+ }
+ memset(square_diff, 0, mb_pels * sizeof(square_diff[0]));
+
+ // Allocate memory for accumulated luma squared error. This value will be
+ // consumed while filtering the chroma planes.
+ uint32_t *luma_sse_sum = aom_memalign(32, mb_pels * sizeof(uint32_t));
+ if (!luma_sse_sum) {
+ aom_free(square_diff);
+ aom_internal_error(mbd->error_info, AOM_CODEC_MEM_ERROR,
+ "Error allocating temporal filter data");
+ }
+ memset(luma_sse_sum, 0, mb_pels * sizeof(luma_sse_sum[0]));
+
+ // Get window size for pixel-wise filtering.
+ assert(TF_WINDOW_LENGTH % 2 == 1);
+ const int half_window = TF_WINDOW_LENGTH >> 1;
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ // Locate pixel on reference frame.
+ const int subsampling_y = mbd->plane[plane].subsampling_y;
+ const int subsampling_x = mbd->plane[plane].subsampling_x;
+ const int h = mb_height >> subsampling_y; // Plane height.
+ const int w = mb_width >> subsampling_x; // Plane width.
+ const int frame_stride =
+ frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
+ const int frame_offset = mb_row * h * frame_stride + mb_col * w;
+ const uint8_t *ref = frame_to_filter->buffers[plane];
+ const int ss_y_shift =
+ subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int ss_x_shift =
+ subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+
+ // Filter U-plane and V-plane using Y-plane. This is because motion
+ // search is only done on Y-plane, so the information from Y-plane will
+ // be more accurate. The luma sse sum is reused in both chroma planes.
+ if (plane == AOM_PLANE_U)
+ compute_luma_sq_error_sum(square_diff, luma_sse_sum, h, w, ss_x_shift,
+ ss_y_shift);
+ compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset, w,
+ h, w, is_high_bitdepth, square_diff);
+
+ // Perform filtering.
+ int pred_idx = 0;
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ // non-local mean approach
+ uint64_t sum_square_diff = 0;
+
+ for (int wi = -half_window; wi <= half_window; ++wi) {
+ for (int wj = -half_window; wj <= half_window; ++wj) {
+ const int y = CLIP(i + wi, 0, h - 1); // Y-coord on current plane.
+ const int x = CLIP(j + wj, 0, w - 1); // X-coord on current plane.
+ sum_square_diff += square_diff[y * w + x];
+ }
+ }
+
+ sum_square_diff += luma_sse_sum[i * w + j];
+
+ // Scale down the difference for high bit depth input.
+ if (mbd->bd > 8) sum_square_diff >>= ((mbd->bd - 8) * 2);
+
+ // Combine window error and block error, and normalize it.
+ const double window_error = sum_square_diff * inv_num_ref_pixels;
+ const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
+ const double block_error = (double)subblock_mses[subblock_idx];
+ const double combined_error =
+ weight_factor * window_error + block_error * inv_factor;
+
+ // Compute filter weight.
+ double scaled_error =
+ combined_error * d_factor[subblock_idx] * decay_factor[plane];
+ scaled_error = AOMMIN(scaled_error, 7);
+ int weight;
+ if (tf_wgt_calc_lvl == 0) {
+ weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+ } else {
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ weight = iroundpf(fweight);
+ }
+
+ const int idx = plane_offset + pred_idx; // Index with plane shift.
+ const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
+ accum[idx] += weight * pred_value;
+ count[idx] += weight;
+
+ ++pred_idx;
+ }
+ }
+ plane_offset += h * w;
+ }
+
+ aom_free(square_diff);
+ aom_free(luma_sse_sum);
+}
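+// Worked example of the weight computation above, assuming the default
+// TF_WINDOW_BLOCK_BALANCE_WEIGHT = 5, TF_SEARCH_ERROR_NORM_WEIGHT = 20 and
+// TF_WEIGHT_SCALE = 1000 (all other numbers hypothetical): with
+// window_error = 20.0 and block_error = 400.0, inv_factor = 1 / 120 and
+// weight_factor = 5 / 120, so combined_error ~= 4.17. With d_factor = 1 and
+// decay_factor = 0.25, scaled_error ~= 1.04 and the resulting weight is
+// (int)(exp(-1.04) * 1000) ~= 352.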
+#if CONFIG_AV1_HIGHBITDEPTH
+// Calls the high bit-depth temporal filter.
+void av1_highbd_apply_temporal_filter_c(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ av1_apply_temporal_filter_c(frame_to_filter, mbd, block_size, mb_row, mb_col,
+ num_planes, noise_levels, subblock_mvs,
+ subblock_mses, q_factor, filter_strength,
+ tf_wgt_calc_lvl, pred, accum, count);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+/*!\brief Normalizes the accumulated filtering result to produce the filtered
+ * frame
+ *
+ * \ingroup src_frame_proc
+ * \param[in] mbd Pointer to the block for filtering, which is
+ * ONLY used to get subsampling information for
+ * all the planes
+ * \param[in] block_size Size of the block
+ * \param[in] mb_row Row index of the block in the frame
+ * \param[in] mb_col Column index of the block in the frame
+ * \param[in] num_planes Number of planes in the frame
+ * \param[in] accum Pointer to the pre-computed accumulator
+ * \param[in] count Pointer to the pre-computed count
+ * \param[out] result_buffer Pointer to result buffer
+ *
+ * \remark Nothing returned, but the content to which `result_buffer` points
+ * will be modified
+ */
+static void tf_normalize_filtered_frame(
+ const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row,
+ const int mb_col, const int num_planes, const uint32_t *accum,
+ const uint16_t *count, YV12_BUFFER_CONFIG *result_buffer) {
+ // Block information.
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int is_high_bitdepth = is_frame_high_bitdepth(result_buffer);
+
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const int frame_stride = result_buffer->strides[plane == 0 ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+ uint8_t *const buf = result_buffer->buffers[plane];
+ uint16_t *const buf16 = CONVERT_TO_SHORTPTR(buf);
+
+ int plane_idx = 0; // Pixel index on current plane (block-base).
+ int frame_idx = frame_offset; // Pixel index on the entire frame.
+ for (int i = 0; i < plane_h; ++i) {
+ for (int j = 0; j < plane_w; ++j) {
+ const int idx = plane_idx + plane_offset;
+ const uint16_t rounding = count[idx] >> 1;
+ if (is_high_bitdepth) {
+ buf16[frame_idx] =
+ (uint16_t)OD_DIVU(accum[idx] + rounding, count[idx]);
+ } else {
+ buf[frame_idx] = (uint8_t)OD_DIVU(accum[idx] + rounding, count[idx]);
+ }
+ ++plane_idx;
+ ++frame_idx;
+ }
+ frame_idx += (frame_stride - plane_w);
+ }
+ plane_offset += plane_h * plane_w;
+ }
+}
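+// The per-pixel division above rounds to nearest: e.g. (illustrative)
+// accum = 2550 and count = 4 give (2550 + 2) / 4 = 638 rather than the
+// truncated 2550 / 4 = 637.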
+
+int av1_get_q(const AV1_COMP *cpi) {
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index];
+ const int q =
+ (int)av1_convert_qindex_to_q(cpi->ppi->p_rc.avg_frame_qindex[frame_type],
+ cpi->common.seq_params->bit_depth);
+ return q;
+}
+
+void av1_tf_do_filtering_row(AV1_COMP *cpi, ThreadData *td, int mb_row) {
+ TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+ YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+ const int num_frames = tf_ctx->num_frames;
+ const int filter_frame_idx = tf_ctx->filter_frame_idx;
+ const int compute_frame_diff = tf_ctx->compute_frame_diff;
+ const struct scale_factors *scale = &tf_ctx->sf;
+ const double *noise_levels = tf_ctx->noise_levels;
+ const int num_pels = tf_ctx->num_pels;
+ const int q_factor = tf_ctx->q_factor;
+ const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+ const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
+ MACROBLOCK *const mb = &td->mb;
+ MACROBLOCKD *const mbd = &mb->e_mbd;
+ TemporalFilterData *const tf_data = &td->tf_data;
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int mi_h = mi_size_high_log2[block_size];
+ const int mi_w = mi_size_wide_log2[block_size];
+ const int num_planes = av1_num_planes(&cpi->common);
+ const int weight_calc_level_in_tf = cpi->sf.hl_sf.weight_calc_level_in_tf;
+ uint32_t *accum = tf_data->accum;
+ uint16_t *count = tf_data->count;
+ uint8_t *pred = tf_data->pred;
+
+  // Factor to control the filtering strength.
+ const int filter_strength = cpi->oxcf.algo_cfg.arnr_strength;
+
+ // Do filtering.
+ FRAME_DIFF *diff = &td->tf_data.diff;
+ av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+ (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+ cpi->oxcf.border_in_pixels);
+ for (int mb_col = 0; mb_col < tf_ctx->mb_cols; mb_col++) {
+ av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+ (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
+ cpi->oxcf.border_in_pixels);
+ memset(accum, 0, num_pels * sizeof(accum[0]));
+ memset(count, 0, num_pels * sizeof(count[0]));
+ MV ref_mv = kZeroMv; // Reference motion vector passed down along frames.
+ // Perform temporal filtering frame by frame.
+
+ // Decide whether to perform motion search at 16x16 sub-block level or not
+ // based on 4x4 sub-blocks source variance. Allow motion search for split
+ // partition only if the difference between max and min source variance of
+ // 4x4 blocks is greater than a threshold (which is derived empirically).
+ bool allow_me_for_sub_blks = true;
+ if (cpi->sf.hl_sf.allow_sub_blk_me_in_tf) {
+ const int is_hbd = is_frame_high_bitdepth(frame_to_filter);
+ // Initialize minimum variance to a large value and maximum variance to 0.
+ double blk_4x4_var_min = DBL_MAX;
+ double blk_4x4_var_max = 0;
+ get_log_var_4x4sub_blk(cpi, frame_to_filter, mb_row, mb_col,
+ TF_BLOCK_SIZE, &blk_4x4_var_min, &blk_4x4_var_max,
+ is_hbd);
+ // TODO(sanampudi.venkatarao@ittiam.com): Experiment and adjust the
+ // threshold for high bit depth.
+ if ((blk_4x4_var_max - blk_4x4_var_min) <= 4.0)
+ allow_me_for_sub_blks = false;
+ }
+
+ for (int frame = 0; frame < num_frames; frame++) {
+ if (frames[frame] == NULL) continue;
+
+ // Motion search.
+ MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
+ int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+      if (frame == filter_frame_idx) {  // Frame to be filtered.
+ // Change ref_mv sign for following frames.
+ ref_mv.row *= -1;
+ ref_mv.col *= -1;
+ } else { // Other reference frames.
+ tf_motion_search(cpi, mb, frame_to_filter, frames[frame], block_size,
+ mb_row, mb_col, &ref_mv, allow_me_for_sub_blks,
+ subblock_mvs, subblock_mses);
+ }
+
+ // Perform weighted averaging.
+ if (frame == filter_frame_idx) { // Frame to be filtered.
+ tf_apply_temporal_filter_self(frames[frame], mbd, block_size, mb_row,
+ mb_col, num_planes, accum, count);
+ } else { // Other reference frames.
+ tf_build_predictor(frames[frame], mbd, block_size, mb_row, mb_col,
+ num_planes, scale, subblock_mvs, pred);
+
+        // All variants of av1_apply_temporal_filter() contain floating point
+        // operations.
+
+ // TODO(any): avx2/sse2 version should be changed to align with C
+ // function before using. In particular, current avx2/sse2 function
+ // only supports 32x32 block size and 5x5 filtering window.
+ if (is_frame_high_bitdepth(frame_to_filter)) { // for high bit-depth
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
+ av1_highbd_apply_temporal_filter(
+ frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+ noise_levels, subblock_mvs, subblock_mses, q_factor,
+ filter_strength, weight_calc_level_in_tf, pred, accum, count);
+ } else {
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ av1_apply_temporal_filter_c(
+ frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+ noise_levels, subblock_mvs, subblock_mses, q_factor,
+ filter_strength, weight_calc_level_in_tf, pred, accum, count);
+#if CONFIG_AV1_HIGHBITDEPTH
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ } else {
+ // for 8-bit
+ if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
+ av1_apply_temporal_filter(
+ frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+ noise_levels, subblock_mvs, subblock_mses, q_factor,
+ filter_strength, weight_calc_level_in_tf, pred, accum, count);
+ } else {
+ av1_apply_temporal_filter_c(
+ frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+ noise_levels, subblock_mvs, subblock_mses, q_factor,
+ filter_strength, weight_calc_level_in_tf, pred, accum, count);
+ }
+ }
+ }
+ }
+ tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes,
+ accum, count, tf_ctx->output_frame);
+
+ if (compute_frame_diff) {
+ const int y_height = mb_height >> mbd->plane[0].subsampling_y;
+ const int y_width = mb_width >> mbd->plane[0].subsampling_x;
+ const int source_y_stride = frame_to_filter->y_stride;
+ const int filter_y_stride = tf_ctx->output_frame->y_stride;
+ const int source_offset =
+ mb_row * y_height * source_y_stride + mb_col * y_width;
+ const int filter_offset =
+ mb_row * y_height * filter_y_stride + mb_col * y_width;
+ unsigned int sse = 0;
+ cpi->ppi->fn_ptr[block_size].vf(
+ frame_to_filter->y_buffer + source_offset, source_y_stride,
+ tf_ctx->output_frame->y_buffer + filter_offset, filter_y_stride,
+ &sse);
+ diff->sum += sse;
+ diff->sse += sse * (int64_t)sse;
+ }
+ }
+}
+
+/*!\brief Does temporal filtering for a given frame.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] cpi Top level encoder instance structure
+ *
+ * \remark Nothing will be returned, but the contents of td->tf_data.diff will
+ be modified.
+ */
+static void tf_do_filtering(AV1_COMP *cpi) {
+ // Basic information.
+ ThreadData *td = &cpi->td;
+ TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+ const struct scale_factors *scale = &tf_ctx->sf;
+ const int num_planes = av1_num_planes(&cpi->common);
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+ MACROBLOCKD *mbd = &td->mb.e_mbd;
+ uint8_t *input_buffer[MAX_MB_PLANE];
+ MB_MODE_INFO **input_mb_mode_info;
+ tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes);
+ tf_setup_macroblockd(mbd, &td->tf_data, scale);
+
+ // Perform temporal filtering for each row.
+ for (int mb_row = 0; mb_row < tf_ctx->mb_rows; mb_row++)
+ av1_tf_do_filtering_row(cpi, td, mb_row);
+
+ tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes);
+}
+
+/*!\brief Sets up the frame buffer for temporal filtering. This function
+ * determines how many frames will be used for temporal filtering and then
+ * groups them into a buffer. This function will also estimate the noise level
+ * of the to-filter frame.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] cpi Top level encoder instance structure
+ * \param[in] filter_frame_lookahead_idx The index of the to-filter frame
+ * in the lookahead buffer cpi->lookahead
+ * \param[in] gf_frame_index GOP index
+ *
+ * \remark Nothing will be returned. But the fields `frames`, `num_frames`,
+ * `filter_frame_idx` and `noise_levels` will be updated in cpi->tf_ctx.
+ */
+static void tf_setup_filtering_buffer(AV1_COMP *cpi,
+ int filter_frame_lookahead_idx,
+ int gf_frame_index) {
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_frame_index];
+ const FRAME_TYPE frame_type = gf_group->frame_type[gf_frame_index];
+ const int is_forward_keyframe =
+ av1_gop_check_forward_keyframe(gf_group, gf_frame_index);
+
+ TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+ YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+ // Number of frames used for filtering. Set `arnr_max_frames` as 1 to disable
+ // temporal filtering.
+ int num_frames = AOMMAX(cpi->oxcf.algo_cfg.arnr_max_frames, 1);
+  int num_before = 0;  // Number of filtering frames before the to-filter frame.
+  int num_after = 0;   // Number of filtering frames after the to-filter frame.
+ const int lookahead_depth =
+ av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
+
+ // Temporal filtering should not go beyond key frames
+ const int key_to_curframe =
+ AOMMAX(cpi->rc.frames_since_key + filter_frame_lookahead_idx, 0);
+ const int curframe_to_key =
+ AOMMAX(cpi->rc.frames_to_key - filter_frame_lookahead_idx - 1, 0);
+
+ // Number of buffered frames before the to-filter frame.
+ int max_before = AOMMIN(filter_frame_lookahead_idx, key_to_curframe);
+
+ // Number of buffered frames after the to-filter frame.
+ int max_after =
+ AOMMIN(lookahead_depth - filter_frame_lookahead_idx - 1, curframe_to_key);
+
+ // Estimate noises for each plane.
+ const struct lookahead_entry *to_filter_buf = av1_lookahead_peek(
+ cpi->ppi->lookahead, filter_frame_lookahead_idx, cpi->compressor_stage);
+ assert(to_filter_buf != NULL);
+ const YV12_BUFFER_CONFIG *to_filter_frame = &to_filter_buf->img;
+ const int num_planes = av1_num_planes(&cpi->common);
+ double *noise_levels = tf_ctx->noise_levels;
+ av1_estimate_noise_level(to_filter_frame, noise_levels, AOM_PLANE_Y,
+ num_planes - 1, cpi->common.seq_params->bit_depth,
+ NOISE_ESTIMATION_EDGE_THRESHOLD);
+ // Get quantization factor.
+ const int q = av1_get_q(cpi);
+  // Get correlation estimates from the first pass.
+ const FIRSTPASS_STATS *stats =
+ cpi->twopass_frame.stats_in - (cpi->rc.frames_since_key == 0);
+ double accu_coeff0 = 1.0, accu_coeff1 = 1.0;
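+  // The loops below accumulate the product of the per-frame correlation
+  // coefficients and then take its n-th root, i.e. the geometric mean.
+  // Illustrative example: with max_after = 3 and coefficients 0.9, 0.8 and
+  // 0.7, the product is 0.504 and accu_coeff1 = 0.504^(1/3) ~= 0.80.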
+ for (int i = 1; i <= max_after; i++) {
+ if (stats + filter_frame_lookahead_idx + i >=
+ cpi->ppi->twopass.stats_buf_ctx->stats_in_end) {
+ max_after = i - 1;
+ break;
+ }
+ accu_coeff1 *=
+ AOMMAX(stats[filter_frame_lookahead_idx + i].cor_coeff, 0.001);
+ }
+ if (max_after >= 1) {
+ accu_coeff1 = pow(accu_coeff1, 1.0 / (double)max_after);
+ }
+ for (int i = 1; i <= max_before; i++) {
+ if (stats + filter_frame_lookahead_idx - i + 1 <=
+ cpi->ppi->twopass.stats_buf_ctx->stats_in_start) {
+ max_before = i - 1;
+ break;
+ }
+ accu_coeff0 *=
+ AOMMAX(stats[filter_frame_lookahead_idx - i + 1].cor_coeff, 0.001);
+ }
+ if (max_before >= 1) {
+ accu_coeff0 = pow(accu_coeff0, 1.0 / (double)max_before);
+ }
+
+  // Adjust the number of filtering frames based on the quantization factor.
+  // When the quantization factor is small enough (lossless compression), the
+  // number of frames for key frame filtering is left unchanged to avoid a
+  // visual quality drop.
+ int adjust_num = 6;
+ const int adjust_num_frames_for_arf_filtering =
+ cpi->sf.hl_sf.adjust_num_frames_for_arf_filtering;
+ if (num_frames == 1) { // `arnr_max_frames = 1` is used to disable filtering.
+ adjust_num = 0;
+ } else if ((update_type == KF_UPDATE) && q <= 10) {
+ adjust_num = 0;
+ } else if (adjust_num_frames_for_arf_filtering > 0 &&
+ update_type != KF_UPDATE && (cpi->rc.frames_since_key > 0)) {
+ // Since screen content detection happens after temporal filtering,
+ // 'frames_since_key' check is added to ensure the sf is disabled for the
+ // first alt-ref frame.
+ // Adjust number of frames to be considered for filtering based on noise
+ // level of the current frame. For low-noise frame, use more frames to
+ // filter such that the filtered frame can provide better predictions for
+ // subsequent frames and vice versa.
+ const uint8_t av1_adjust_num_using_noise_lvl[2][3] = { { 6, 4, 2 },
+ { 4, 2, 0 } };
+ const uint8_t *adjust_num_frames =
+ av1_adjust_num_using_noise_lvl[adjust_num_frames_for_arf_filtering - 1];
+
+ if (noise_levels[AOM_PLANE_Y] < 0.5)
+ adjust_num = adjust_num_frames[0];
+ else if (noise_levels[AOM_PLANE_Y] < 1.0)
+ adjust_num = adjust_num_frames[1];
+ else
+ adjust_num = adjust_num_frames[2];
+ }
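+  // Illustrative example: with adjust_num_frames_for_arf_filtering = 1, the
+  // row { 6, 4, 2 } is selected above, so a Y-plane noise level of 0.7
+  // (falling in [0.5, 1.0)) yields adjust_num = 4.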
+ num_frames = AOMMIN(num_frames + adjust_num, lookahead_depth);
+
+ if (frame_type == KEY_FRAME) {
+ num_before = AOMMIN(is_forward_keyframe ? num_frames / 2 : 0, max_before);
+ num_after = AOMMIN(num_frames - 1, max_after);
+ } else {
+ int gfu_boost = av1_calc_arf_boost(&cpi->ppi->twopass, &cpi->twopass_frame,
+ &cpi->ppi->p_rc, &cpi->frame_info,
+ filter_frame_lookahead_idx, max_before,
+ max_after, NULL, NULL, 0);
+
+ num_frames = AOMMIN(num_frames, gfu_boost / 150);
+ num_frames += !(num_frames & 1); // Make the number odd.
+
+ // Only use 2 neighbours for the second ARF.
+ if (update_type == INTNL_ARF_UPDATE) num_frames = AOMMIN(num_frames, 3);
+ if (AOMMIN(max_after, max_before) >= num_frames / 2) {
+      // Split the frames evenly before and after.
+ num_before = num_frames / 2;
+ num_after = num_frames / 2;
+ } else {
+ if (max_after < num_frames / 2) {
+ num_after = max_after;
+ num_before = AOMMIN(num_frames - 1 - num_after, max_before);
+ } else {
+ num_before = max_before;
+ num_after = AOMMIN(num_frames - 1 - num_before, max_after);
+ }
+      // Adjust asymmetry based on frame-level correlation.
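+      // Illustrative example: if the frames on the longer side are highly
+      // correlated, e.g. accu_coeff1 = 0.9, then insym = (int)(0.4 / 0.1) = 4
+      // below, so num_before may exceed num_after by at most 4 frames; weaker
+      // correlation shrinks the allowed asymmetry.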
+ if (max_after > 0 && max_before > 0) {
+ if (num_after < num_before) {
+ const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff1, 0.01));
+ num_before = AOMMIN(num_before, num_after + insym);
+ } else {
+ const int insym = (int)(0.4 / AOMMAX(1 - accu_coeff0, 0.01));
+ num_after = AOMMIN(num_after, num_before + insym);
+ }
+ }
+ }
+ }
+ num_frames = num_before + 1 + num_after;
+
+ // Setup the frame buffer.
+ for (int frame = 0; frame < num_frames; ++frame) {
+ const int lookahead_idx = frame - num_before + filter_frame_lookahead_idx;
+ struct lookahead_entry *buf = av1_lookahead_peek(
+ cpi->ppi->lookahead, lookahead_idx, cpi->compressor_stage);
+ assert(buf != NULL);
+ frames[frame] = &buf->img;
+ }
+ tf_ctx->num_frames = num_frames;
+ tf_ctx->filter_frame_idx = num_before;
+ assert(frames[tf_ctx->filter_frame_idx] == to_filter_frame);
+
+ av1_setup_src_planes(&cpi->td.mb, &to_filter_buf->img, 0, 0, num_planes,
+ cpi->common.seq_params->sb_size);
+ av1_setup_block_planes(&cpi->td.mb.e_mbd,
+ cpi->common.seq_params->subsampling_x,
+ cpi->common.seq_params->subsampling_y, num_planes);
+}
+
+/*!\cond */
+
+double av1_estimate_noise_from_single_plane_c(const uint8_t *src, int height,
+ int width, int stride,
+ int edge_thresh) {
+ int64_t accum = 0;
+ int count = 0;
+
+ for (int i = 1; i < height - 1; ++i) {
+ for (int j = 1; j < width - 1; ++j) {
+ // Setup a small 3x3 matrix.
+ const int center_idx = i * stride + j;
+ int mat[3][3];
+ for (int ii = -1; ii <= 1; ++ii) {
+ for (int jj = -1; jj <= 1; ++jj) {
+ const int idx = center_idx + ii * stride + jj;
+ mat[ii + 1][jj + 1] = src[idx];
+ }
+ }
+      // Compute Sobel gradients.
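+      // Equivalently, Gx and Gy apply the 3x3 Sobel masks
+      //   Gx: [ 1 0 -1 ; 2 0 -2 ; 1 0 -1 ]
+      //   Gy: [ 1 2 1 ; 0 0 0 ; -1 -2 -1 ]
+      // to the neighborhood in `mat`.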
+ const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+ 2 * (mat[1][0] - mat[1][2]);
+ const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+ 2 * (mat[0][1] - mat[2][1]);
+ const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), 0);
+ // Accumulate Laplacian.
+ if (Ga < edge_thresh) { // Only count smooth pixels.
+ const int v = 4 * mat[1][1] -
+ 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+ (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+ accum += ROUND_POWER_OF_TWO(abs(v), 0);
+ ++count;
+ }
+ }
+ }
+
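+  // A sketch of where the constants below come from, assuming zero-mean
+  // i.i.d. Gaussian noise with standard deviation sigma: the Laplacian mask
+  // used above has squared coefficients summing to 4 * 1 + 4 * 4 + 16 = 36,
+  // so its response to pure noise has standard deviation 6 * sigma. Since
+  // E|X| = sigma_X * sqrt(2 / pi) for a Gaussian, sigma is recovered as
+  // sqrt(pi / 2) * (accum / count) / 6, matching the return expression.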
+ // Return -1.0 (unreliable estimation) if there are too few smooth pixels.
+ return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+double av1_highbd_estimate_noise_from_single_plane_c(const uint16_t *src16,
+ int height, int width,
+ const int stride,
+ int bit_depth,
+ int edge_thresh) {
+ int64_t accum = 0;
+ int count = 0;
+ for (int i = 1; i < height - 1; ++i) {
+ for (int j = 1; j < width - 1; ++j) {
+ // Setup a small 3x3 matrix.
+ const int center_idx = i * stride + j;
+ int mat[3][3];
+ for (int ii = -1; ii <= 1; ++ii) {
+ for (int jj = -1; jj <= 1; ++jj) {
+ const int idx = center_idx + ii * stride + jj;
+ mat[ii + 1][jj + 1] = src16[idx];
+ }
+ }
+      // Compute Sobel gradients.
+ const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+ 2 * (mat[1][0] - mat[1][2]);
+ const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+ 2 * (mat[0][1] - mat[2][1]);
+ const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), bit_depth - 8);
+ // Accumulate Laplacian.
+ if (Ga < edge_thresh) { // Only count smooth pixels.
+ const int v = 4 * mat[1][1] -
+ 2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+ (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+ accum += ROUND_POWER_OF_TWO(abs(v), bit_depth - 8);
+ ++count;
+ }
+ }
+ }
+
+ // Return -1.0 (unreliable estimation) if there are too few smooth pixels.
+ return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
+}
+#endif
+
+void av1_estimate_noise_level(const YV12_BUFFER_CONFIG *frame,
+ double *noise_level, int plane_from, int plane_to,
+ int bit_depth, int edge_thresh) {
+ for (int plane = plane_from; plane <= plane_to; plane++) {
+ const bool is_uv_plane = (plane != AOM_PLANE_Y);
+ const int height = frame->crop_heights[is_uv_plane];
+ const int width = frame->crop_widths[is_uv_plane];
+ const int stride = frame->strides[is_uv_plane];
+ const uint8_t *src = frame->buffers[plane];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const int is_high_bitdepth = is_frame_high_bitdepth(frame);
+ if (is_high_bitdepth) {
+ noise_level[plane] = av1_highbd_estimate_noise_from_single_plane(
+ src16, height, width, stride, bit_depth, edge_thresh);
+ } else {
+ noise_level[plane] = av1_estimate_noise_from_single_plane(
+ src, height, width, stride, edge_thresh);
+ }
+#else
+ (void)bit_depth;
+ noise_level[plane] = av1_estimate_noise_from_single_plane(
+ src, height, width, stride, edge_thresh);
+#endif
+ }
+}
+
+// Initializes the members of TemporalFilterCtx.
+// Inputs:
+//   cpi: Top level encoder instance structure.
+//   filter_frame_lookahead_idx: The index of the frame to be filtered in the
+//                               lookahead buffer cpi->lookahead.
+//   gf_frame_index: Index of the frame in the GOP.
+//   compute_frame_diff: If 1, accumulate the sum and sse of the difference
+//                       between the original and filtered frames.
+//   output_frame: Buffer that receives the filtered frame.
+// Returns:
+//   Nothing will be returned. But the contents of cpi->tf_ctx will be modified.
+static void init_tf_ctx(AV1_COMP *cpi, int filter_frame_lookahead_idx,
+ int gf_frame_index, int compute_frame_diff,
+ YV12_BUFFER_CONFIG *output_frame) {
+ TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+ // Setup frame buffer for filtering.
+ YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
+ tf_ctx->num_frames = 0;
+ tf_ctx->filter_frame_idx = -1;
+ tf_ctx->output_frame = output_frame;
+ tf_ctx->compute_frame_diff = compute_frame_diff;
+ tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, gf_frame_index);
+ assert(tf_ctx->num_frames > 0);
+ assert(tf_ctx->filter_frame_idx < tf_ctx->num_frames);
+
+ // Setup scaling factors. Scaling on each of the arnr frames is not
+ // supported.
+ // ARF is produced at the native frame size and resized when coded.
+ struct scale_factors *sf = &tf_ctx->sf;
+ av1_setup_scale_factors_for_frame(
+ sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+ frames[0]->y_crop_width, frames[0]->y_crop_height);
+
+ // Initialize temporal filter parameters.
+ MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
+ const int filter_frame_idx = tf_ctx->filter_frame_idx;
+ const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
+ const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int mb_width = block_size_wide[block_size];
+ const int mb_height = block_size_high[block_size];
+ const int mb_rows = get_num_blocks(frame_height, mb_height);
+ const int mb_cols = get_num_blocks(frame_width, mb_width);
+ const int mb_pels = mb_width * mb_height;
+ const int is_highbitdepth = is_frame_high_bitdepth(frame_to_filter);
+ const int num_planes = av1_num_planes(&cpi->common);
+ int num_pels = 0;
+ for (int i = 0; i < num_planes; i++) {
+ const int subsampling_x = mbd->plane[i].subsampling_x;
+ const int subsampling_y = mbd->plane[i].subsampling_y;
+ num_pels += mb_pels >> (subsampling_x + subsampling_y);
+ }
+ tf_ctx->num_pels = num_pels;
+ tf_ctx->mb_rows = mb_rows;
+ tf_ctx->mb_cols = mb_cols;
+ tf_ctx->is_highbitdepth = is_highbitdepth;
+ tf_ctx->q_factor = av1_get_q(cpi);
+}
+
+int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame,
+ const FRAME_DIFF *frame_diff, int q_index,
+ aom_bit_depth_t bit_depth) {
+ const int frame_height = frame->y_crop_height;
+ const int frame_width = frame->y_crop_width;
+ const int block_height = block_size_high[TF_BLOCK_SIZE];
+ const int block_width = block_size_wide[TF_BLOCK_SIZE];
+ const int mb_rows = get_num_blocks(frame_height, block_height);
+ const int mb_cols = get_num_blocks(frame_width, block_width);
+ const int num_mbs = AOMMAX(1, mb_rows * mb_cols);
+ const float mean = (float)frame_diff->sum / num_mbs;
+ const float std = (float)sqrt((float)frame_diff->sse / num_mbs - mean * mean);
+
+ const int ac_q_step = av1_ac_quant_QTX(q_index, 0, bit_depth);
+ const float threshold = 0.7f * ac_q_step * ac_q_step;
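+  // Illustrative example (hypothetical numbers): if ac_q_step were 32, the
+  // threshold would be 0.7 * 32 * 32 ~= 717. The frame qualifies when the
+  // mean per-block SSE is below this and its standard deviation is below
+  // 1.2 times the mean.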
+
+ if (mean < threshold && std < mean * 1.2) {
+ return 1;
+ }
+ return 0;
+}
+
+void av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
+ int gf_frame_index, FRAME_DIFF *frame_diff,
+ YV12_BUFFER_CONFIG *output_frame) {
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+  // Basic information of the current frame.
+ TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
+ TemporalFilterData *tf_data = &cpi->td.tf_data;
+ const int compute_frame_diff = frame_diff != NULL;
+ // TODO(anyone): Currently, we enforce the filtering strength on internal
+ // ARFs except the second ARF to be zero. We should investigate in which case
+ // it is more beneficial to use non-zero strength filtering.
+ // Only parallel level 0 frames go through temporal filtering.
+ assert(cpi->ppi->gf_group.frame_parallel_level[gf_frame_index] == 0);
+
+ // Initialize temporal filter context structure.
+ init_tf_ctx(cpi, filter_frame_lookahead_idx, gf_frame_index,
+ compute_frame_diff, output_frame);
+
+ // Allocate and reset temporal filter buffers.
+ const int is_highbitdepth = tf_ctx->is_highbitdepth;
+ if (!tf_alloc_and_reset_data(tf_data, tf_ctx->num_pels, is_highbitdepth)) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating temporal filter data");
+ }
+
+ // Perform temporal filtering process.
+ if (mt_info->num_workers > 1)
+ av1_tf_do_filtering_mt(cpi);
+ else
+ tf_do_filtering(cpi);
+
+ if (compute_frame_diff) {
+ *frame_diff = tf_data->diff;
+ }
+ // Deallocate temporal filter buffers.
+ tf_dealloc_data(tf_data, is_highbitdepth);
+}
+
+int av1_is_temporal_filter_on(const AV1EncoderConfig *oxcf) {
+ return oxcf->algo_cfg.arnr_max_frames > 0 && oxcf->gf_cfg.lag_in_frames > 1;
+}
+
+bool av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, const AV1_COMP *cpi) {
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ tf_info->is_temporal_filter_on = av1_is_temporal_filter_on(oxcf);
+ if (tf_info->is_temporal_filter_on == 0) return true;
+
+ const AV1_COMMON *cm = &cpi->common;
+ const SequenceHeader *const seq_params = cm->seq_params;
+ for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) {
+ if (aom_realloc_frame_buffer(
+ &tf_info->tf_buf[i], oxcf->frm_dim_cfg.width,
+ oxcf->frm_dim_cfg.height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+ NULL, cpi->image_pyramid_levels, 0)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info) {
+ if (tf_info->is_temporal_filter_on == 0) return;
+ for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) {
+ aom_free_frame_buffer(&tf_info->tf_buf[i]);
+ }
+ aom_free_frame_buffer(&tf_info->tf_buf_second_arf);
+}
+
+void av1_tf_info_reset(TEMPORAL_FILTER_INFO *tf_info) {
+ av1_zero(tf_info->tf_buf_valid);
+ av1_zero(tf_info->tf_buf_gf_index);
+ av1_zero(tf_info->tf_buf_display_index_offset);
+}
+
+void av1_tf_info_filtering(TEMPORAL_FILTER_INFO *tf_info, AV1_COMP *cpi,
+ const GF_GROUP *gf_group) {
+ if (tf_info->is_temporal_filter_on == 0) return;
+ const AV1_COMMON *const cm = &cpi->common;
+ for (int gf_index = 0; gf_index < gf_group->size; ++gf_index) {
+ int update_type = gf_group->update_type[gf_index];
+ if (update_type == KF_UPDATE || update_type == ARF_UPDATE) {
+ int buf_idx = gf_group->frame_type[gf_index] == INTER_FRAME;
+ int lookahead_idx = gf_group->arf_src_offset[gf_index] +
+ gf_group->cur_frame_idx[gf_index];
+ // This function is designed to be called multiple times after
+      // av1_tf_info_reset(). It only generates filtered frames that do not
+      // already exist.
+ if (tf_info->tf_buf_valid[buf_idx] == 0 ||
+ tf_info->tf_buf_display_index_offset[buf_idx] != lookahead_idx) {
+ YV12_BUFFER_CONFIG *out_buf = &tf_info->tf_buf[buf_idx];
+ av1_temporal_filter(cpi, lookahead_idx, gf_index,
+ &tf_info->frame_diff[buf_idx], out_buf);
+ aom_extend_frame_borders(out_buf, av1_num_planes(cm));
+ tf_info->tf_buf_gf_index[buf_idx] = gf_index;
+ tf_info->tf_buf_display_index_offset[buf_idx] = lookahead_idx;
+ tf_info->tf_buf_valid[buf_idx] = 1;
+ }
+ }
+ }
+}
+
+YV12_BUFFER_CONFIG *av1_tf_info_get_filtered_buf(TEMPORAL_FILTER_INFO *tf_info,
+ int gf_index,
+ FRAME_DIFF *frame_diff) {
+ if (tf_info->is_temporal_filter_on == 0) return NULL;
+ YV12_BUFFER_CONFIG *out_buf = NULL;
+ for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) {
+ if (tf_info->tf_buf_valid[i] && tf_info->tf_buf_gf_index[i] == gf_index) {
+ out_buf = &tf_info->tf_buf[i];
+ *frame_diff = tf_info->frame_diff[i];
+ }
+ }
+ return out_buf;
+}
+/*!\endcond */
diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h
new file mode 100644
index 0000000000..6504b91b66
--- /dev/null
+++ b/third_party/aom/av1/encoder/temporal_filter.h
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+#define AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!\cond */
+struct AV1_COMP;
+struct AV1EncoderConfig;
+struct ThreadData;
+// TODO(wtc): These two macros are only used in the avx2, sse2 and neon
+// implementations, where the block size is still hard coded to TF_BLOCK_SIZE.
+// This should be fixed to align with the C implementation.
+#define BH 32
+#define BW 32
+
+// Block size used in temporal filtering.
+#define TF_BLOCK_SIZE BLOCK_32X32
+
+// Window size for temporal filtering.
+#define TF_WINDOW_LENGTH 5
+
+// A constant number, sqrt(pi / 2), used for noise estimation.
+static const double SQRT_PI_BY_2 = 1.25331413732;
+
+// Hyper-parameters used to compute filtering weight. These hyper-parameters
+// can be tuned for better performance.
+// 0. A scale factor used in temporal filtering to raise the filter weight from
+// `double` with range [0, 1] to `int` with range [0, 1000].
+#define TF_WEIGHT_SCALE 1000
+// 1. Weight factor used to balance the weighted-average between window error
+// and block error. The weight is for window error while the weight for block
+// error is always set as 1.
+#define TF_WINDOW_BLOCK_BALANCE_WEIGHT 5
+// 2. Threshold for using q to adjust the filtering weight. Concretely, when
+// using a small q (high bitrate), we would like to reduce the filtering
+// strength such that more detailed information can be preserved. Hence, when
+// q is smaller than this threshold, we will adjust the filtering weight
+// based on the q-value.
+#define TF_Q_DECAY_THRESHOLD 20
+// 3. Normalization factor used to normalize the motion search error. Since the
+// motion search error can be large and uncontrollable, we will simply
+// normalize it before using it to compute the filtering weight.
+#define TF_SEARCH_ERROR_NORM_WEIGHT 20
+// 4. Threshold for using `arnr_strength` to adjust the filtering strength.
+// Concretely, users can use `arnr_strength` arguments to control the
+// strength of temporal filtering. When `arnr_strength` is small enough (
+// i.e., smaller than this threshold), we will adjust the filtering weight
+// based on the strength value.
+#define TF_STRENGTH_THRESHOLD 4
+// 5. Threshold for using motion search distance to adjust the filtering weight.
+// Concretely, larger motion search vector leads to a higher probability of
+// unreliable search. Hence, we would like to reduce the filtering strength
+// when the distance is large enough. Considering that the distance actually
+// relies on the frame size, this threshold is also a resolution-based
+// threshold. Taking 720p videos as an instance, if this field equals to 0.1,
+// then the actual threshold will be 720 * 0.1 = 72. Similarly, the threshold
+// for 360p videos will be 360 * 0.1 = 36.
+#define TF_SEARCH_DISTANCE_THRESHOLD 0.1
+// 6. Threshold to identify if the q is in a relative high range.
+// Above this cutoff q, a stronger filtering is applied.
+// For a high q, the quantization throws away more information, and thus a
+// stronger filtering is less likely to distort the encoded quality, while a
+// stronger filtering could reduce bit rates.
+//    For a low q, more details are expected to be retained. Filtering is thus
+// more conservative.
+#define TF_QINDEX_CUTOFF 128
+
+#define NOISE_ESTIMATION_EDGE_THRESHOLD 50
+
+// Sum and SSE source vs filtered frame difference returned by
+// temporal filter.
+typedef struct {
+ int64_t sum;
+ int64_t sse;
+} FRAME_DIFF;
+
+/*!\endcond */
+
+/*!
+ * \brief Parameters related to temporal filtering.
+ */
+typedef struct {
+ /*!
+ * Frame buffers used for temporal filtering.
+ */
+ YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+ /*!
+ * Number of frames in the frame buffer.
+ */
+ int num_frames;
+
+ /*!
+ * Output filtered frame
+ */
+ YV12_BUFFER_CONFIG *output_frame;
+
+ /*!
+ * Index of the frame to be filtered.
+ */
+ int filter_frame_idx;
+ /*!
+ * Whether to accumulate diff for show existing condition check.
+ */
+ int compute_frame_diff;
+ /*!
+ * Frame scaling factor.
+ */
+ struct scale_factors sf;
+ /*!
+ * Estimated noise levels for each plane in the frame.
+ */
+ double noise_levels[MAX_MB_PLANE];
+ /*!
+ * Number of pixels in the temporal filtering block across all planes.
+ */
+ int num_pels;
+ /*!
+ * Number of temporal filtering block rows.
+ */
+ int mb_rows;
+ /*!
+ * Number of temporal filtering block columns.
+ */
+ int mb_cols;
+ /*!
+ * Whether the frame is high-bitdepth or not.
+ */
+ int is_highbitdepth;
+ /*!
+ * Quantization factor used in temporal filtering.
+ */
+ int q_factor;
+} TemporalFilterCtx;
+
+/*!
+ * buffer count in TEMPORAL_FILTER_INFO
+ * Currently we only apply filtering on KEY and ARF after
+ * define_gf_group(). Hence, the count is two.
+ */
+#define TF_INFO_BUF_COUNT 2
+
+/*!
+ * \brief Temporal filter info for a gop
+ */
+typedef struct TEMPORAL_FILTER_INFO {
+ /*!
+   * A flag indicating whether the temporal filter should be applied.
+   * This flag stores the result of av1_is_temporal_filter_on().
+ */
+ int is_temporal_filter_on;
+ /*!
+ * buffers used for temporal filtering in a GOP
+ * index 0 for key frame and index 1 for ARF
+ */
+ YV12_BUFFER_CONFIG tf_buf[TF_INFO_BUF_COUNT];
+
+ /*!
+   * Buffer used for temporal filtering of the second ARF
+   * (INTNL_ARF_UPDATE). See av1_gop_is_second_arf() for the
+   * detailed definition of second_arf.
+ */
+ YV12_BUFFER_CONFIG tf_buf_second_arf;
+ /*!
+   * Sum and SSE of the source vs filtered frame difference for each buffer,
+   * used to decide whether the buffer can be shown directly.
+ */
+ FRAME_DIFF frame_diff[TF_INFO_BUF_COUNT];
+ /*!
+ * the corresponding gf_index for the buffer.
+ */
+ int tf_buf_gf_index[TF_INFO_BUF_COUNT];
+ /*!
+   * The display_index offset between the next show frame and the frames in
+   * the GOP.
+ */
+ int tf_buf_display_index_offset[TF_INFO_BUF_COUNT];
+ /*!
+ * whether the buf is valid or not.
+ */
+ int tf_buf_valid[TF_INFO_BUF_COUNT];
+} TEMPORAL_FILTER_INFO;
+
+/*!\brief Check whether we should apply temporal filter at all.
+ * \param[in] oxcf AV1 encoder config
+ *
+ * \return 1: temporal filter is on; 0: temporal filter is off
+ */
+int av1_is_temporal_filter_on(const struct AV1EncoderConfig *oxcf);
+
+/*!\brief Allocate buffers for TEMPORAL_FILTER_INFO
+ * \param[in,out] tf_info Temporal filter info for a gop
+ * \param[in,out] cpi Top level encoder instance structure
+ *
+ * \return True on success, false on memory allocation failure.
+ */
+bool av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info,
+ const struct AV1_COMP *cpi);
+
+/*!\brief Free buffers for TEMPORAL_FILTER_INFO
+ * \param[in,out] tf_info Temporal filter info for a gop
+ */
+void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info);
+
+/*!\brief Reset validity of tf_buf in TEMPORAL_FILTER_INFO
+ * \param[in,out] tf_info Temporal filter info for a gop
+ */
+void av1_tf_info_reset(TEMPORAL_FILTER_INFO *tf_info);
+
+/*!\brief Apply temporal filter for key frame and ARF in a gop
+ * \param[in,out] tf_info Temporal filter info for a gop
+ * \param[in,out] cpi Top level encoder instance structure
+ * \param[in] gf_group GF/ARF group data structure
+ */
+void av1_tf_info_filtering(TEMPORAL_FILTER_INFO *tf_info, struct AV1_COMP *cpi,
+ const GF_GROUP *gf_group);
+
+/*!\brief Get a filtered buffer from TEMPORAL_FILTER_INFO
+ * \param[in,out] tf_info Temporal filter info for a gop
+ * \param[in] gf_index gf_index for the target buffer
+ * \param[out]      frame_diff  Receives the sum and SSE of the source vs
+ *                              filtered frame difference for the target buffer
+ *
+ * \return The filtered buffer for gf_index, or NULL if it is unavailable
+ */
+YV12_BUFFER_CONFIG *av1_tf_info_get_filtered_buf(TEMPORAL_FILTER_INFO *tf_info,
+ int gf_index,
+ FRAME_DIFF *frame_diff);
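+
+/* A minimal illustrative call sequence for the functions above (a sketch,
+ * assuming `cpi`, `gf_group`, `gf_index`, `q_index` and `bit_depth` come from
+ * the encoder; the real encoder's call order differs in details):
+ *
+ *   TEMPORAL_FILTER_INFO tf_info;
+ *   if (!av1_tf_info_alloc(&tf_info, cpi)) abort();  // once per resolution
+ *   av1_tf_info_reset(&tf_info);                     // at each GOP start
+ *   av1_tf_info_filtering(&tf_info, cpi, gf_group);  // filter KEY/ARF
+ *   FRAME_DIFF diff;
+ *   YV12_BUFFER_CONFIG *buf =
+ *       av1_tf_info_get_filtered_buf(&tf_info, gf_index, &diff);
+ *   if (buf != NULL &&
+ *       av1_check_show_filtered_frame(buf, &diff, q_index, bit_depth)) {
+ *     // The filtered frame is close enough to be shown directly.
+ *   }
+ *   av1_tf_info_free(&tf_info);                      // on teardown
+ */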
+
+/*!\cond */
+
+// Data related to temporal filtering.
+typedef struct {
+ // Source vs filtered frame error.
+ FRAME_DIFF diff;
+ // Pointer to temporary block info used to store state in temporal filtering
+ // process.
+ MB_MODE_INFO *tmp_mbmi;
+ // Pointer to accumulator buffer used in temporal filtering process.
+ uint32_t *accum;
+ // Pointer to count buffer used in temporal filtering process.
+ uint16_t *count;
+ // Pointer to predictor used in temporal filtering process.
+ uint8_t *pred;
+} TemporalFilterData;
+
+// Data related to temporal filter multi-thread synchronization.
+typedef struct {
+#if CONFIG_MULTITHREAD
+ // Mutex lock used for dispatching jobs.
+ pthread_mutex_t *mutex_;
+#endif // CONFIG_MULTITHREAD
+ // Next temporal filter block row to be filtered.
+ int next_tf_row;
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool tf_mt_exit;
+} AV1TemporalFilterSync;
+
+// Estimates noise level from a given frame using a single plane (Y, U, or V).
+// This is an adaptation of the method in the following paper:
+// Shen-Chuan Tai, Shih-Ming Yang, "A fast method for image noise
+// estimation using Laplacian operator and adaptive edge detection",
+// Proc. 3rd International Symposium on Communications, Control and
+// Signal Processing, 2008, St Julians, Malta.
+// Inputs:
+// frame: Pointer to the frame to estimate noise level from.
+// noise_level: Pointer to store the estimated noise.
+// plane_from: Index of the starting plane used for noise estimation.
+// Commonly, 0 for Y-plane, 1 for U-plane, and 2 for V-plane.
+// plane_to: Index of the end plane used for noise estimation.
+// bit_depth: Actual bit-depth instead of the encoding bit-depth of the frame.
+// edge_thresh: Edge threshold.
+void av1_estimate_noise_level(const YV12_BUFFER_CONFIG *frame,
+ double *noise_level, int plane_from, int plane_to,
+ int bit_depth, int edge_thresh);
+/*!\endcond */
+
+/*!\brief Performs temporal filtering for a given macroblock row.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] cpi     Top level encoder instance structure
+ * \param[in] td      Pointer to thread data
+ * \param[in] mb_row  Macroblock row to be filtered
+ *
+ * \remark Nothing will be returned, but the contents of td->diff will be
+ * modified.
+ */
+void av1_tf_do_filtering_row(struct AV1_COMP *cpi, struct ThreadData *td,
+ int mb_row);
+
+/*!\brief Performs temporal filtering if needed on a source frame.
+ * For example to create a filtered alternate reference frame (ARF)
+ *
+ * In this function, the lookahead index is different from the 0-based
+ * real index. For example, if we want to filter the first frame in the
+ * pre-fetched buffer `cpi->lookahead`, the lookahead index will be -1 instead
+ * of 0. More concretely, 0 indicates the first LOOKAHEAD frame, which is the
+ * second frame in the pre-fetched buffer. Another example: if we want to filter
+ * the 17-th frame, which is an ARF, the lookahead index is 15 instead of 16.
+ * Furthermore, a negative number is used for a key frame in one-pass mode,
+ * where the key frame is filtered with the frames before it instead of after
+ * it. For example,
+ * -15 means to filter the 17-th frame, which is a key frame in one-pass mode.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] cpi Top level encoder instance
+ * structure
+ * \param[in] filter_frame_lookahead_idx The index of the
+ * to-filter frame in the lookahead
+ * buffer cpi->lookahead.
+ * \param[in] gf_frame_index Index of GOP
+ * \param[in,out]  frame_diff              Receives the sum and SSE of the
+ *                                         source vs filtered frame difference.
+ * \param[out]     output_frame            Output filtered frame.
+ */
+void av1_temporal_filter(struct AV1_COMP *cpi,
+ const int filter_frame_lookahead_idx,
+ int gf_frame_index, FRAME_DIFF *frame_diff,
+ YV12_BUFFER_CONFIG *output_frame);
+
+/*!\brief Check whether a filtered frame can be shown directly
+ *
+ * This function uses the filtered frame's sse and the current q index
+ * to make the decision.
+ *
+ * \ingroup src_frame_proc
+ * \param[in] frame filtered frame's buffer
+ * \param[in] frame_diff structure of sse and sum of the
+ * filtered frame.
+ * \param[in] q_index q_index used for this frame
+ * \param[in] bit_depth bit depth
+ * \return return 1 if this frame can be shown directly, otherwise
+ * return 0
+ */
+int av1_check_show_filtered_frame(const YV12_BUFFER_CONFIG *frame,
+ const FRAME_DIFF *frame_diff, int q_index,
+ aom_bit_depth_t bit_depth);
+
+/*!\cond */
+// Helper function to get `q` used for encoding.
+int av1_get_q(const struct AV1_COMP *cpi);
+
+// Allocates memory for members of TemporalFilterData.
+// Inputs:
+// tf_data: Pointer to the structure containing temporal filter related data.
+// num_pels: Number of pixels in the block across all planes.
+// is_high_bitdepth: Whether the frame is high-bitdepth or not.
+// Returns:
+// True if allocation is successful and false otherwise.
+static AOM_INLINE bool tf_alloc_and_reset_data(TemporalFilterData *tf_data,
+ int num_pels,
+ int is_high_bitdepth) {
+ tf_data->tmp_mbmi = (MB_MODE_INFO *)aom_calloc(1, sizeof(*tf_data->tmp_mbmi));
+ tf_data->accum =
+ (uint32_t *)aom_memalign(16, num_pels * sizeof(*tf_data->accum));
+ tf_data->count =
+ (uint16_t *)aom_memalign(16, num_pels * sizeof(*tf_data->count));
+ if (is_high_bitdepth)
+ tf_data->pred = CONVERT_TO_BYTEPTR(
+ aom_memalign(32, num_pels * 2 * sizeof(*tf_data->pred)));
+ else
+ tf_data->pred =
+ (uint8_t *)aom_memalign(32, num_pels * sizeof(*tf_data->pred));
+ // In case of an allocation failure, other successfully allocated buffers will
+ // be freed by the tf_dealloc_data() call in encoder_destroy().
+ if (!(tf_data->tmp_mbmi && tf_data->accum && tf_data->count && tf_data->pred))
+ return false;
+ memset(&tf_data->diff, 0, sizeof(tf_data->diff));
+ return true;
+}
+
+// Setup macroblockd params for temporal filtering process.
+// Inputs:
+// mbd: Pointer to the block for filtering.
+// tf_data: Pointer to the structure containing temporal filter related data.
+// scale: Scaling factor.
+// Returns:
+// Nothing will be returned. Contents of mbd will be modified.
+static AOM_INLINE void tf_setup_macroblockd(MACROBLOCKD *mbd,
+ TemporalFilterData *tf_data,
+ const struct scale_factors *scale) {
+ mbd->block_ref_scale_factors[0] = scale;
+ mbd->block_ref_scale_factors[1] = scale;
+ mbd->mi = &tf_data->tmp_mbmi;
+ mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+}
+
+// Deallocates the memory allocated for members of TemporalFilterData.
+// Inputs:
+// tf_data: Pointer to the structure containing temporal filter related data.
+// is_high_bitdepth: Whether the frame is high-bitdepth or not.
+// Returns:
+// Nothing will be returned.
+static AOM_INLINE void tf_dealloc_data(TemporalFilterData *tf_data,
+ int is_high_bitdepth) {
+ if (is_high_bitdepth)
+ tf_data->pred = (uint8_t *)CONVERT_TO_SHORTPTR(tf_data->pred);
+ aom_free(tf_data->tmp_mbmi);
+ tf_data->tmp_mbmi = NULL;
+ aom_free(tf_data->accum);
+ tf_data->accum = NULL;
+ aom_free(tf_data->count);
+ tf_data->count = NULL;
+ aom_free(tf_data->pred);
+ tf_data->pred = NULL;
+}
+
+// Saves the state prior to temporal filter process.
+// Inputs:
+// mbd: Pointer to the block for filtering.
+// input_mbmi: Backup block info to save input state.
+// input_buffer: Backup buffer pointer to save input state.
+// num_planes: Number of planes.
+// Returns:
+// Nothing will be returned. Contents of input_mbmi and input_buffer will be
+// modified.
+static INLINE void tf_save_state(MACROBLOCKD *mbd, MB_MODE_INFO ***input_mbmi,
+ uint8_t **input_buffer, int num_planes) {
+ for (int i = 0; i < num_planes; i++) {
+ input_buffer[i] = mbd->plane[i].pre[0].buf;
+ }
+ *input_mbmi = mbd->mi;
+}
+
+// Restores the initial state after temporal filter process.
+// Inputs:
+// mbd: Pointer to the block for filtering.
+// input_mbmi: Backup block info from where input state is restored.
+// input_buffer: Backup buffer pointer from where input state is restored.
+// num_planes: Number of planes.
+// Returns:
+// Nothing will be returned. Contents of mbd will be modified.
+static INLINE void tf_restore_state(MACROBLOCKD *mbd, MB_MODE_INFO **input_mbmi,
+ uint8_t **input_buffer, int num_planes) {
+ for (int i = 0; i < num_planes; i++) {
+ mbd->plane[i].pre[0].buf = input_buffer[i];
+ }
+ mbd->mi = input_mbmi;
+}
+
+/*!\endcond */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
diff --git a/third_party/aom/av1/encoder/thirdpass.c b/third_party/aom/av1/encoder/thirdpass.c
new file mode 100644
index 0000000000..a25522fbc5
--- /dev/null
+++ b/third_party/aom/av1/encoder/thirdpass.c
@@ -0,0 +1,877 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/encoder/thirdpass.h"
+
+#if CONFIG_THREE_PASS && CONFIG_AV1_DECODER
+#include "aom/aom_codec.h"
+#include "aom/aomdx.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/av1_iface_common.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/common/blockd.h"
+#include "common/ivfdec.h"
+
+static void setup_two_pass_stream_input(
+ struct AvxInputContext **input_ctx_ptr, const char *input_file_name,
+ struct aom_internal_error_info *err_info) {
+ FILE *infile;
+ infile = fopen(input_file_name, "rb");
+ if (!infile) {
+ aom_internal_error(err_info, AOM_CODEC_INVALID_PARAM,
+ "Failed to open input file '%s'.", input_file_name);
+ }
+ struct AvxInputContext *aom_input_ctx = aom_malloc(sizeof(*aom_input_ctx));
+ if (!aom_input_ctx) {
+ fclose(infile);
+ aom_internal_error(err_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate memory for third-pass context.");
+ }
+ memset(aom_input_ctx, 0, sizeof(*aom_input_ctx));
+ aom_input_ctx->filename = input_file_name;
+ aom_input_ctx->file = infile;
+
+ if (file_is_ivf(aom_input_ctx)) {
+ aom_input_ctx->file_type = FILE_TYPE_IVF;
+ } else {
+ fclose(infile);
+ aom_free(aom_input_ctx);
+ aom_internal_error(err_info, AOM_CODEC_INVALID_PARAM,
+ "Unrecognized input file type.");
+ }
+ *input_ctx_ptr = aom_input_ctx;
+}
+
+static void init_third_pass(THIRD_PASS_DEC_CTX *ctx) {
+ if (!ctx->input_ctx) {
+ if (ctx->input_file_name == NULL) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_INVALID_PARAM,
+ "No third pass input specified.");
+ }
+ setup_two_pass_stream_input(&ctx->input_ctx, ctx->input_file_name,
+ ctx->err_info);
+ }
+
+ if (!ctx->decoder.iface) {
+ aom_codec_iface_t *decoder_iface = &aom_codec_av1_inspect_algo;
+ if (aom_codec_dec_init(&ctx->decoder, decoder_iface, NULL, 0)) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to initialize decoder.");
+ }
+ }
+}
+
+// Return 0: success
+// 1: cannot read because this is end of file
+// -1: failure to read the frame
+static int read_frame(THIRD_PASS_DEC_CTX *ctx) {
+ if (!ctx->input_ctx || !ctx->decoder.iface) {
+ init_third_pass(ctx);
+ }
+ if (!ctx->have_frame) {
+ if (ivf_read_frame(ctx->input_ctx, &ctx->buf, &ctx->bytes_in_buffer,
+ &ctx->buffer_size, NULL) != 0) {
+ if (feof(ctx->input_ctx->file)) {
+ return 1;
+ } else {
+ return -1;
+ }
+ }
+ ctx->frame = ctx->buf;
+ ctx->end_frame = ctx->frame + ctx->bytes_in_buffer;
+ ctx->have_frame = 1;
+ }
+
+ Av1DecodeReturn adr;
+ if (aom_codec_decode(&ctx->decoder, ctx->frame,
+ (unsigned int)ctx->bytes_in_buffer,
+ &adr) != AOM_CODEC_OK) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to decode frame for third pass.");
+ }
+ ctx->this_frame_bits = (int)(adr.buf - ctx->frame) << 3;
+ ctx->frame = adr.buf;
+ ctx->bytes_in_buffer = ctx->end_frame - ctx->frame;
+ if (ctx->frame == ctx->end_frame) ctx->have_frame = 0;
+ return 0;
+}
+
+static void free_frame_info(THIRD_PASS_FRAME_INFO *frame_info) {
+ if (!frame_info) return;
+ aom_free(frame_info->mi_info);
+ frame_info->mi_info = NULL;
+}
+
+// This function gets the information needed from the recently decoded frame,
+// via various decoder APIs, and saves the info into ctx->frame_info.
+// Return 0: success
+// 1: cannot read because this is end of file
+// -1: failure to read the frame
+static int get_frame_info(THIRD_PASS_DEC_CTX *ctx) {
+ int ret = read_frame(ctx);
+ if (ret != 0) return ret;
+ int cur = ctx->frame_info_count;
+
+ ctx->frame_info[cur].actual_bits = ctx->this_frame_bits;
+
+ if (cur >= MAX_THIRD_PASS_BUF) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Third pass frame info ran out of available slots.");
+ }
+ aom_codec_frame_flags_t frame_type_flags = 0;
+ if (aom_codec_control(&ctx->decoder, AOMD_GET_FRAME_FLAGS,
+ &frame_type_flags) != AOM_CODEC_OK) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read frame flags.");
+ }
+ if (frame_type_flags & AOM_FRAME_IS_KEY) {
+ ctx->frame_info[cur].frame_type = KEY_FRAME;
+ } else if (frame_type_flags & AOM_FRAME_IS_INTRAONLY) {
+ ctx->frame_info[cur].frame_type = INTRA_ONLY_FRAME;
+ } else if (frame_type_flags & AOM_FRAME_IS_SWITCH) {
+ ctx->frame_info[cur].frame_type = S_FRAME;
+ } else {
+ ctx->frame_info[cur].frame_type = INTER_FRAME;
+ }
+
+ // Get frame width and height
+ int frame_size[2];
+ if (aom_codec_control(&ctx->decoder, AV1D_GET_FRAME_SIZE, frame_size) !=
+ AOM_CODEC_OK) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read frame size.");
+ }
+
+ // Check if we need to re-alloc the mi fields.
+ const int mi_cols = (frame_size[0] + 3) >> 2;
+ const int mi_rows = (frame_size[1] + 3) >> 2;
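+  // For example, a 1920x1080 frame maps to (1920 + 3) >> 2 = 480 columns and
+  // (1080 + 3) >> 2 = 270 rows of 4x4 MI units.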
+ ctx->frame_info[cur].mi_stride = mi_cols;
+ ctx->frame_info[cur].mi_rows = mi_rows;
+ ctx->frame_info[cur].mi_cols = mi_cols;
+
+ if (ctx->frame_info[cur].width != frame_size[0] ||
+ ctx->frame_info[cur].height != frame_size[1] ||
+ !ctx->frame_info[cur].mi_info) {
+ free_frame_info(&ctx->frame_info[cur]);
+
+ ctx->frame_info[cur].mi_info =
+ aom_malloc(mi_cols * mi_rows * sizeof(*ctx->frame_info[cur].mi_info));
+
+ if (!ctx->frame_info[cur].mi_info) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate mi buffer for the third pass.");
+ }
+ }
+
+ ctx->frame_info[cur].width = frame_size[0];
+ ctx->frame_info[cur].height = frame_size[1];
+
+ // Get frame base q idx
+ if (aom_codec_control(&ctx->decoder, AOMD_GET_BASE_Q_IDX,
+ &ctx->frame_info[cur].base_q_idx) != AOM_CODEC_OK) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read base q index.");
+ }
+
+ // Get show existing frame flag
+ if (aom_codec_control(&ctx->decoder, AOMD_GET_SHOW_EXISTING_FRAME_FLAG,
+ &ctx->frame_info[cur].is_show_existing_frame) !=
+ AOM_CODEC_OK) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read show existing frame flag.");
+ }
+
+ // Get show frame flag
+ if (aom_codec_control(&ctx->decoder, AOMD_GET_SHOW_FRAME_FLAG,
+ &ctx->frame_info[cur].is_show_frame) != AOM_CODEC_OK) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read show frame flag.");
+ }
+
+ // Get order hint
+ if (aom_codec_control(&ctx->decoder, AOMD_GET_ORDER_HINT,
+ &ctx->frame_info[cur].order_hint) != AOM_CODEC_OK) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read order hint.");
+ }
+
+ // Clear MI info
+ for (int mi_row = 0; mi_row < mi_rows; mi_row++) {
+ for (int mi_col = 0; mi_col < mi_cols; mi_col++) {
+ ctx->frame_info[cur].mi_info[mi_row * mi_cols + mi_col].bsize =
+ BLOCK_INVALID;
+ }
+ }
+
+ // Get relevant information regarding each 4x4 MI
+ MB_MODE_INFO cur_mi_info;
+ THIRD_PASS_MI_INFO *const this_mi = ctx->frame_info[cur].mi_info;
+ for (int mi_row = 0; mi_row < mi_rows; mi_row++) {
+ for (int mi_col = 0; mi_col < mi_cols; mi_col++) {
+ const int offset = mi_row * mi_cols + mi_col;
+ if (this_mi[offset].bsize != BLOCK_INVALID) {
+ continue;
+ }
+ // Get info of this MI
+ if (aom_codec_control(&ctx->decoder, AV1D_GET_MI_INFO, mi_row, mi_col,
+ &cur_mi_info) != AOM_CODEC_OK) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read mi info.");
+ }
+ const int blk_mi_rows = mi_size_high[cur_mi_info.bsize];
+ const int blk_mi_cols = mi_size_wide[cur_mi_info.bsize];
+
+ for (int h = 0; h < blk_mi_rows; h++) {
+ for (int w = 0; w < blk_mi_cols; w++) {
+ if (h + mi_row >= mi_rows || w + mi_col >= mi_cols) {
+ continue;
+ }
+ const int this_offset = offset + h * mi_cols + w;
+ this_mi[this_offset].bsize = cur_mi_info.bsize;
+ this_mi[this_offset].partition = cur_mi_info.partition;
+ this_mi[this_offset].mi_row_start = mi_row;
+ this_mi[this_offset].mi_col_start = mi_col;
+ this_mi[this_offset].mv[0] = cur_mi_info.mv[0];
+ this_mi[this_offset].mv[1] = cur_mi_info.mv[1];
+ this_mi[this_offset].ref_frame[0] = cur_mi_info.ref_frame[0];
+ this_mi[this_offset].ref_frame[1] = cur_mi_info.ref_frame[1];
+ this_mi[this_offset].pred_mode = cur_mi_info.mode;
+ }
+ }
+ }
+ }
+
+ ctx->frame_info_count++;
+
+ return 0;
+}
+
+#define USE_SECOND_PASS_FILE 1
+
+#if !USE_SECOND_PASS_FILE
+// Parse the frames in the GOP and determine the last frame of the current GOP.
+// Decode more frames if necessary. The variable max_num is the maximum static
+// GOP length if we detect an IPPP structure, and it is expected that
+// max_num >= MAX_GF_INTERVAL.
+static void get_current_gop_end(THIRD_PASS_DEC_CTX *ctx, int max_num,
+ int *last_idx) {
+ assert(max_num >= MAX_GF_INTERVAL);
+ *last_idx = 0;
+ int cur_idx = 0;
+ int arf_order_hint = -1;
+ int num_show_frames = 0;
+ while (num_show_frames < max_num) {
+ assert(cur_idx < MAX_THIRD_PASS_BUF);
+ // Read in from bitstream if needed.
+ if (cur_idx >= ctx->frame_info_count) {
+ int ret = get_frame_info(ctx);
+ if (ret == 1) {
+ // At the end of the file, GOP ends in the prev frame.
+ if (arf_order_hint >= 0) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to derive GOP length.");
+ }
+ *last_idx = cur_idx - 1;
+ return;
+ }
+ if (ret < 0) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read frame for third pass.");
+ }
+ }
+
+ // TODO(bohanli): verify that fwd_kf works here.
+ if (ctx->frame_info[cur_idx].frame_type == KEY_FRAME &&
+ ctx->frame_info[cur_idx].is_show_frame) {
+ if (cur_idx != 0) {
+ // If this is a key frame and is not the first kf in this kf group, we
+ // have reached the next key frame. Stop here.
+ *last_idx = cur_idx - 1;
+ return;
+ }
+ } else if (!ctx->frame_info[cur_idx].is_show_frame &&
+ arf_order_hint == -1) {
+ // If this is an arf (the first no show)
+ if (num_show_frames <= 1) {
+ // This is an arf and we should end the GOP with its overlay.
+ arf_order_hint = ctx->frame_info[cur_idx].order_hint;
+ } else {
+        // There are multiple show frames before this arf, so we treat the
+ // frames previous to this arf as a GOP.
+ *last_idx = cur_idx - 1;
+ return;
+ }
+ } else if (arf_order_hint >= 0 && ctx->frame_info[cur_idx].order_hint ==
+ (unsigned int)arf_order_hint) {
+ // If this is the overlay/show existing of the arf
+ assert(ctx->frame_info[cur_idx].is_show_frame);
+ *last_idx = cur_idx;
+ return;
+ } else {
+ // This frame is part of the GOP.
+ if (ctx->frame_info[cur_idx].is_show_frame) num_show_frames++;
+ }
+ cur_idx++;
+ }
+ // This is a long IPPP GOP and we will use a length of max_num here.
+ assert(arf_order_hint < 0);
+ *last_idx = max_num - 1;
+ return;
+}
+#endif
+
+static AOM_INLINE void read_gop_frames(THIRD_PASS_DEC_CTX *ctx) {
+ int cur_idx = 0;
+ while (cur_idx < ctx->gop_info.num_frames) {
+ assert(cur_idx < MAX_THIRD_PASS_BUF);
+ // Read in from bitstream if needed.
+ if (cur_idx >= ctx->frame_info_count) {
+ int ret = get_frame_info(ctx);
+ if (ret != 0) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Failed to read frame for third pass.");
+ }
+ }
+ cur_idx++;
+ }
+ return;
+}
+
+void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx) {
+ // Read in future frames in the current GOP.
+ read_gop_frames(ctx);
+
+ int gf_len = 0;
+ // Check the GOP length against the value read from second_pass_file
+ for (int i = 0; i < ctx->gop_info.num_frames; i++) {
+ if (ctx->frame_info[i].is_show_frame) gf_len++;
+ }
+
+ if (gf_len != ctx->gop_info.gf_length) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "Mismatch in third pass GOP length!");
+ }
+}
+
+void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx) {
+ if (ctx->frame_info_count == 0) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+ "No available frame info for third pass.");
+ }
+ ctx->frame_info_count--;
+ free_frame_info(&ctx->frame_info[0]);
+ for (int i = 0; i < ctx->frame_info_count; i++) {
+ ctx->frame_info[i] = ctx->frame_info[i + 1];
+ }
+ ctx->frame_info[ctx->frame_info_count].mi_info = NULL;
+}
+
+void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx,
+ const char *file) {
+ av1_free_thirdpass_ctx(*ctx);
+ CHECK_MEM_ERROR(cm, *ctx, aom_calloc(1, sizeof(**ctx)));
+ THIRD_PASS_DEC_CTX *ctx_ptr = *ctx;
+ ctx_ptr->input_file_name = file;
+ ctx_ptr->prev_gop_end = -1;
+ ctx_ptr->err_info = cm->error;
+}
+
+void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx) {
+ if (ctx == NULL) return;
+ if (ctx->decoder.iface) {
+ aom_codec_destroy(&ctx->decoder);
+ }
+ if (ctx->input_ctx && ctx->input_ctx->file) fclose(ctx->input_ctx->file);
+ aom_free(ctx->input_ctx);
+ if (ctx->buf) free(ctx->buf);
+ for (int i = 0; i < MAX_THIRD_PASS_BUF; i++) {
+ free_frame_info(&ctx->frame_info[i]);
+ }
+ aom_free(ctx);
+}
+
+void av1_write_second_pass_gop_info(AV1_COMP *cpi) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+
+ if (oxcf->pass == AOM_RC_SECOND_PASS && oxcf->second_pass_log) {
+ // Write the GOP length to a log file.
+ av1_open_second_pass_log(cpi, 0);
+
+ THIRD_PASS_GOP_INFO gop_info;
+
+ gop_info.num_frames = gf_group->size;
+ gop_info.use_arf = (gf_group->arf_index >= 0);
+ gop_info.gf_length = p_rc->baseline_gf_interval;
+
+ size_t count =
+ fwrite(&gop_info, sizeof(gop_info), 1, cpi->second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Could not write to second pass log file!");
+ }
+ }
+}
+
+void av1_write_second_pass_per_frame_info(AV1_COMP *cpi, int gf_index) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+
+ if (oxcf->pass == AOM_RC_SECOND_PASS && oxcf->second_pass_log) {
+ // write target bitrate
+ int bits = gf_group->bit_allocation[gf_index];
+ size_t count = fwrite(&bits, sizeof(bits), 1, cpi->second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Could not write to second pass log file!");
+ }
+
+ // write sse
+ uint64_t sse = 0;
+ int pkt_idx = cpi->ppi->output_pkt_list->cnt - 1;
+ if (pkt_idx >= 0 &&
+ cpi->ppi->output_pkt_list->pkts[pkt_idx].kind == AOM_CODEC_PSNR_PKT) {
+ sse = cpi->ppi->output_pkt_list->pkts[pkt_idx].data.psnr.sse[0];
+#if CONFIG_INTERNAL_STATS
+ } else if (cpi->ppi->b_calculate_psnr) {
+ sse = cpi->ppi->total_sq_error[0];
+#endif
+ } else {
+ const YV12_BUFFER_CONFIG *orig = cpi->source;
+ const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
+ PSNR_STATS psnr;
+#if CONFIG_AV1_HIGHBITDEPTH
+ const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
+ const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+ aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
+#else
+ aom_calc_psnr(orig, recon, &psnr);
+#endif
+ sse = psnr.sse[0];
+ }
+
+ count = fwrite(&sse, sizeof(sse), 1, cpi->second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Could not write to second pass log file!");
+ }
+
+ // write bpm_factor
+ double factor = cpi->ppi->twopass.bpm_factor;
+ count = fwrite(&factor, sizeof(factor), 1, cpi->second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Could not write to second pass log file!");
+ }
+ }
+}
+
+void av1_open_second_pass_log(AV1_COMP *cpi, int is_read) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ if (oxcf->second_pass_log == NULL) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_INVALID_PARAM,
+ "No second pass log file specified for the third pass!");
+ }
+  // Open the second pass log file for reading or writing, as requested.
+ if (!cpi->second_pass_log_stream) {
+ if (is_read) {
+ cpi->second_pass_log_stream = fopen(cpi->oxcf.second_pass_log, "rb");
+ } else {
+ cpi->second_pass_log_stream = fopen(cpi->oxcf.second_pass_log, "wb");
+ }
+ if (!cpi->second_pass_log_stream) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Could not open second pass log file!");
+ }
+ }
+}
+
+void av1_close_second_pass_log(AV1_COMP *cpi) {
+ if (cpi->second_pass_log_stream) {
+ int ret = fclose(cpi->second_pass_log_stream);
+ if (ret != 0) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+ "Could not close second pass log file!");
+ }
+ cpi->second_pass_log_stream = 0;
+ }
+}
+
+void av1_read_second_pass_gop_info(FILE *second_pass_log_stream,
+ THIRD_PASS_GOP_INFO *gop_info,
+ struct aom_internal_error_info *error) {
+ size_t count = fread(gop_info, sizeof(*gop_info), 1, second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(error, AOM_CODEC_ERROR,
+ "Could not read from second pass log file!");
+ }
+}
+
+void av1_read_second_pass_per_frame_info(
+ FILE *second_pass_log_stream, THIRD_PASS_FRAME_INFO *frame_info_arr,
+ int frame_info_count, struct aom_internal_error_info *error) {
+ for (int i = 0; i < frame_info_count; i++) {
+ // read target bits
+ int bits = 0;
+ size_t count = fread(&bits, sizeof(bits), 1, second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(error, AOM_CODEC_ERROR,
+ "Could not read from second pass log file!");
+ }
+ frame_info_arr[i].bits_allocated = bits;
+
+ // read distortion
+ uint64_t sse;
+ count = fread(&sse, sizeof(sse), 1, second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(error, AOM_CODEC_ERROR,
+ "Could not read from second pass log file!");
+ }
+ frame_info_arr[i].sse = sse;
+
+ // read bpm factor
+ double factor;
+ count = fread(&factor, sizeof(factor), 1, second_pass_log_stream);
+ if (count < 1) {
+ aom_internal_error(error, AOM_CODEC_ERROR,
+ "Could not read from second pass log file!");
+ }
+ frame_info_arr[i].bpm_factor = factor;
+ }
+}
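+
+// Note on the log layout implied by the raw fread()/fwrite() calls above:
+// each GOP record in the second pass log is one THIRD_PASS_GOP_INFO struct
+// followed by, per frame, an int (target bits), a uint64_t (sse) and a
+// double (bpm_factor), all written in native byte order and struct layout,
+// so the log is only portable between builds with an identical ABI.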
+
+int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx) {
+ if (ctx == NULL) return -1;
+ int use_arf = 0;
+ for (int i = 0; i < ctx->gop_info.gf_length; i++) {
+ if (ctx->frame_info[i].order_hint != 0 &&
+ ctx->frame_info[i].is_show_frame == 0) {
+ use_arf = 1;
+ }
+ }
+ if (use_arf != ctx->gop_info.use_arf) {
+ aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Mismatch in third pass ARF usage!");
+ }
+ return use_arf;
+}
+
+void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight,
+ int fwidth, double *ratio_h, double *ratio_w) {
+ assert(ctx);
+ assert(fidx < ctx->frame_info_count);
+ const int fheight_second_pass = ctx->frame_info[fidx].height;
+ const int fwidth_second_pass = ctx->frame_info[fidx].width;
+ assert(fheight_second_pass <= fheight && fwidth_second_pass <= fwidth);
+
+ *ratio_h = (double)fheight / fheight_second_pass;
+ *ratio_w = (double)fwidth / fwidth_second_pass;
+}
+
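+// Illustrative example (hypothetical sizes): if the second pass ran at
+// 960x540 and the third pass runs at 1920x1080, then ratio_h and ratio_w are
+// both 2.0 above, and av1_get_third_pass_mi() below maps third-pass MI
+// position (20, 40) to second-pass position (10, 20) before clamping.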
+THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx,
+ int mi_row, int mi_col,
+ double ratio_h, double ratio_w) {
+ assert(ctx);
+ assert(fidx < ctx->frame_info_count);
+
+ const int mi_rows_second_pass = ctx->frame_info[fidx].mi_rows;
+ const int mi_cols_second_pass = ctx->frame_info[fidx].mi_cols;
+
+ const int mi_row_second_pass =
+ clamp((int)round(mi_row / ratio_h), 0, mi_rows_second_pass - 1);
+ const int mi_col_second_pass =
+ clamp((int)round(mi_col / ratio_w), 0, mi_cols_second_pass - 1);
+
+ const int mi_stride_second_pass = ctx->frame_info[fidx].mi_stride;
+ THIRD_PASS_MI_INFO *this_mi = ctx->frame_info[fidx].mi_info +
+ mi_row_second_pass * mi_stride_second_pass +
+ mi_col_second_pass;
+ return this_mi;
+}
+
+void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi,
+ double ratio_h, double ratio_w, int *mi_row,
+ int *mi_col) {
+ *mi_row = (int)round(third_pass_mi->mi_row_start * ratio_h);
+ *mi_col = (int)round(third_pass_mi->mi_col_start * ratio_w);
+}
+
+int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h, double ratio_w,
+ MV_REFERENCE_FRAME frame) {
+ assert(this_mi != NULL);
+ int_mv cur_mv;
+ cur_mv.as_int = INVALID_MV;
+
+ if (frame < LAST_FRAME || frame > ALTREF_FRAME) return cur_mv;
+
+ for (int r = 0; r < 2; r++) {
+ if (this_mi->ref_frame[r] == frame) {
+ cur_mv.as_mv.row = (int16_t)round(this_mi->mv[r].as_mv.row * ratio_h);
+ cur_mv.as_mv.col = (int16_t)round(this_mi->mv[r].as_mv.col * ratio_w);
+ }
+ }
+
+ return cur_mv;
+}
+
+BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h,
+ double ratio_w) {
+ assert(this_mi != NULL);
+ BLOCK_SIZE bsize = BLOCK_INVALID;
+
+ const BLOCK_SIZE bsize_second_pass = this_mi->bsize;
+ assert(bsize_second_pass != BLOCK_INVALID);
+
+ const int w_second_pass = block_size_wide[bsize_second_pass];
+ const int h_second_pass = block_size_high[bsize_second_pass];
+
+ int part_type;
+
+ if (w_second_pass == h_second_pass) {
+ part_type = PARTITION_NONE;
+ } else if (w_second_pass / h_second_pass == 2) {
+ part_type = PARTITION_HORZ;
+ } else if (w_second_pass / h_second_pass == 4) {
+ part_type = PARTITION_HORZ_4;
+ } else if (h_second_pass / w_second_pass == 2) {
+ part_type = PARTITION_VERT;
+ } else if (h_second_pass / w_second_pass == 4) {
+ part_type = PARTITION_VERT_4;
+ } else {
+ part_type = PARTITION_INVALID;
+ }
+ assert(part_type != PARTITION_INVALID);
+
+ const int w = (int)(round(w_second_pass * ratio_w));
+ const int h = (int)(round(h_second_pass * ratio_h));
+
+ for (int i = 0; i < SQR_BLOCK_SIZES; i++) {
+ const BLOCK_SIZE this_bsize = subsize_lookup[part_type][i];
+ if (this_bsize == BLOCK_INVALID) continue;
+
+ const int this_w = block_size_wide[this_bsize];
+ const int this_h = block_size_high[this_bsize];
+
+ if (this_w >= w && this_h >= h) {
+      // Find the smallest block size that contains the mapped block.
+ bsize = this_bsize;
+ break;
+ }
+ }
+ if (bsize == BLOCK_INVALID) {
+    // No suitable size was found; fall back to the largest block size.
+ bsize = BLOCK_128X128;
+ }
+
+ return bsize;
+}
+
+PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx,
+ THIRD_PASS_MI_INFO *this_mi) {
+ int mi_stride = ctx->frame_info[0].mi_stride;
+
+ int mi_row = this_mi->mi_row_start;
+ int mi_col = this_mi->mi_col_start;
+
+ THIRD_PASS_MI_INFO *corner_mi =
+ &ctx->frame_info[0].mi_info[mi_row * mi_stride + mi_col];
+
+ return corner_mi->partition;
+}
+
+#else // !(CONFIG_THREE_PASS && CONFIG_AV1_DECODER)
+void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx,
+ const char *file) {
+ (void)ctx;
+ (void)file;
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "To utilize three-pass encoding, libaom must be built "
+ "with CONFIG_THREE_PASS=1 & CONFIG_AV1_DECODER=1.");
+}
+
+void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; }
+
+void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; }
+
+void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx) { (void)ctx; }
+
+void av1_open_second_pass_log(struct AV1_COMP *cpi, int is_read) {
+ (void)cpi;
+ (void)is_read;
+}
+
+void av1_close_second_pass_log(struct AV1_COMP *cpi) { (void)cpi; }
+
+void av1_write_second_pass_gop_info(struct AV1_COMP *cpi) { (void)cpi; }
+
+void av1_write_second_pass_per_frame_info(struct AV1_COMP *cpi, int gf_index) {
+ (void)cpi;
+ (void)gf_index;
+}
+
+void av1_read_second_pass_gop_info(FILE *second_pass_log_stream,
+ THIRD_PASS_GOP_INFO *gop_info,
+ struct aom_internal_error_info *error) {
+ (void)second_pass_log_stream;
+ (void)gop_info;
+ (void)error;
+}
+
+void av1_read_second_pass_per_frame_info(
+ FILE *second_pass_log_stream, THIRD_PASS_FRAME_INFO *frame_info_arr,
+ int frame_info_count, struct aom_internal_error_info *error) {
+ (void)second_pass_log_stream;
+ (void)frame_info_arr;
+ (void)frame_info_count;
+ (void)error;
+}
+
+int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx) {
+ (void)ctx;
+ return 1;
+}
+
+void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight,
+ int fwidth, double *ratio_h, double *ratio_w) {
+ (void)ctx;
+ (void)fidx;
+ (void)fheight;
+ (void)fwidth;
+ (void)ratio_h;
+ (void)ratio_w;
+}
+
+THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx,
+ int mi_row, int mi_col,
+ double ratio_h, double ratio_w) {
+ (void)ctx;
+ (void)fidx;
+ (void)mi_row;
+ (void)mi_col;
+ (void)ratio_h;
+ (void)ratio_w;
+ return NULL;
+}
+
+int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h, double ratio_w,
+ MV_REFERENCE_FRAME frame) {
+ (void)this_mi;
+ (void)ratio_h;
+ (void)ratio_w;
+ (void)frame;
+ int_mv mv;
+ mv.as_int = INVALID_MV;
+ return mv;
+}
+
+BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h,
+ double ratio_w) {
+ (void)this_mi;
+ (void)ratio_h;
+ (void)ratio_w;
+ return BLOCK_INVALID;
+}
+
+void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi,
+ double ratio_h, double ratio_w, int *mi_row,
+ int *mi_col) {
+ (void)third_pass_mi;
+ (void)ratio_h;
+ (void)ratio_w;
+ (void)mi_row;
+ (void)mi_col;
+}
+
+PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx,
+ THIRD_PASS_MI_INFO *this_mi) {
+ (void)ctx;
+ (void)this_mi;
+ return PARTITION_INVALID;
+}
+#endif // CONFIG_THREE_PASS && CONFIG_AV1_DECODER
+
+#if CONFIG_BITRATE_ACCURACY
+static void fwrite_and_check(const void *ptr, size_t size, size_t nmemb,
+ FILE *stream,
+ struct aom_internal_error_info *error) {
+ size_t count = fwrite(ptr, size, nmemb, stream);
+ if (count < nmemb) {
+ aom_internal_error(error, AOM_CODEC_ERROR, "fwrite_and_check failed\n");
+ }
+}
+
+static void fread_and_check(void *ptr, size_t size, size_t nmemb, FILE *stream,
+ struct aom_internal_error_info *error) {
+ size_t count = fread(ptr, size, nmemb, stream);
+ if (count < nmemb) {
+ aom_internal_error(error, AOM_CODEC_ERROR, "fread_and_check failed\n");
+ }
+}
+
+void av1_pack_tpl_info(TPL_INFO *tpl_info, const GF_GROUP *gf_group,
+ const TplParams *tpl_data) {
+ tpl_info->tpl_ready = tpl_data->ready;
+ if (tpl_info->tpl_ready) {
+ tpl_info->gf_length = gf_group->size;
+ for (int i = 0; i < tpl_info->gf_length; ++i) {
+ tpl_info->txfm_stats_list[i] = tpl_data->txfm_stats_list[i];
+ tpl_info->qstep_ratio_ls[i] = av1_tpl_get_qstep_ratio(tpl_data, i);
+ tpl_info->update_type_list[i] = gf_group->update_type[i];
+ }
+ }
+}
+
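+// On-disk record layout shared by av1_write_tpl_info()/av1_read_tpl_info():
+// [tpl_ready] and, when tpl_ready is nonzero, [gf_length],
+// [txfm_stats_list x gf_length], [qstep_ratio_ls x gf_length] and
+// [update_type_list x gf_length], all written with native endianness.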
+void av1_write_tpl_info(const TPL_INFO *tpl_info, FILE *log_stream,
+ struct aom_internal_error_info *error) {
+ fwrite_and_check(&tpl_info->tpl_ready, sizeof(tpl_info->tpl_ready), 1,
+ log_stream, error);
+ if (tpl_info->tpl_ready) {
+ fwrite_and_check(&tpl_info->gf_length, sizeof(tpl_info->gf_length), 1,
+ log_stream, error);
+ assert(tpl_info->gf_length <= MAX_LENGTH_TPL_FRAME_STATS);
+ fwrite_and_check(&tpl_info->txfm_stats_list,
+ sizeof(tpl_info->txfm_stats_list[0]), tpl_info->gf_length,
+ log_stream, error);
+ fwrite_and_check(&tpl_info->qstep_ratio_ls,
+ sizeof(tpl_info->qstep_ratio_ls[0]), tpl_info->gf_length,
+ log_stream, error);
+ fwrite_and_check(&tpl_info->update_type_list,
+ sizeof(tpl_info->update_type_list[0]), tpl_info->gf_length,
+ log_stream, error);
+ }
+}
+
+void av1_read_tpl_info(TPL_INFO *tpl_info, FILE *log_stream,
+ struct aom_internal_error_info *error) {
+ av1_zero(*tpl_info);
+ fread_and_check(&tpl_info->tpl_ready, sizeof(tpl_info->tpl_ready), 1,
+ log_stream, error);
+ if (tpl_info->tpl_ready) {
+ fread_and_check(&tpl_info->gf_length, sizeof(tpl_info->gf_length), 1,
+ log_stream, error);
+ assert(tpl_info->gf_length <= MAX_LENGTH_TPL_FRAME_STATS);
+ fread_and_check(&tpl_info->txfm_stats_list,
+ sizeof(tpl_info->txfm_stats_list[0]), tpl_info->gf_length,
+ log_stream, error);
+ fread_and_check(&tpl_info->qstep_ratio_ls,
+ sizeof(tpl_info->qstep_ratio_ls[0]), tpl_info->gf_length,
+ log_stream, error);
+ fread_and_check(&tpl_info->update_type_list,
+ sizeof(tpl_info->update_type_list[0]), tpl_info->gf_length,
+ log_stream, error);
+ }
+}
+#endif // CONFIG_BITRATE_ACCURACY
diff --git a/third_party/aom/av1/encoder/thirdpass.h b/third_party/aom/av1/encoder/thirdpass.h
new file mode 100644
index 0000000000..8080c06cb6
--- /dev/null
+++ b/third_party/aom/av1/encoder/thirdpass.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_THIRDPASS_H_
+#define AOM_AV1_ENCODER_THIRDPASS_H_
+
+#include "av1/common/enums.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/tpl_model.h"
+
+struct AV1_COMP;
+
+// TODO(bohanli): optimize this number
+#define MAX_THIRD_PASS_BUF \
+ (AOMMAX((2 * MAX_GF_INTERVAL + 1), MAX_STATIC_GF_GROUP_LENGTH))
+
+// Struct to store useful information related to a GOP, in addition to what is
+// available in the bitstream.
+typedef struct {
+ int gf_length;
+ int num_frames;
+ int use_arf;
+} THIRD_PASS_GOP_INFO;
+
+#if CONFIG_BITRATE_ACCURACY
+typedef struct TPL_INFO {
+ int gf_length;
+ int tpl_ready;
+ TplTxfmStats txfm_stats_list[MAX_LENGTH_TPL_FRAME_STATS];
+ double qstep_ratio_ls[MAX_LENGTH_TPL_FRAME_STATS];
+ FRAME_UPDATE_TYPE update_type_list[MAX_LENGTH_TPL_FRAME_STATS];
+} TPL_INFO;
+#endif // CONFIG_BITRATE_ACCURACY
+
+typedef struct {
+ BLOCK_SIZE bsize;
+ PARTITION_TYPE partition;
+ int mi_row_start;
+ int mi_col_start;
+ int_mv mv[2];
+ MV_REFERENCE_FRAME ref_frame[2];
+ PREDICTION_MODE pred_mode;
+} THIRD_PASS_MI_INFO;
+
+// Struct to store useful information about a frame for the third pass.
+// The members are extracted from the decoder by function get_frame_info.
+typedef struct {
+ int width;
+ int height;
+ int mi_stride;
+ int mi_rows;
+ int mi_cols;
+ int base_q_idx;
+ int is_show_existing_frame;
+ int is_show_frame;
+ int bits_allocated;
+ int actual_bits;
+ uint64_t sse;
+ double bpm_factor;
+ FRAME_TYPE frame_type;
+ unsigned int order_hint;
+ THIRD_PASS_MI_INFO *mi_info;
+} THIRD_PASS_FRAME_INFO;
+
+typedef struct {
+ /* --- Input and decoding related members --- */
+ // the input file
+ const char *input_file_name;
+#if CONFIG_THREE_PASS
+ // input context
+ struct AvxInputContext *input_ctx;
+#endif
+ // decoder codec context
+ aom_codec_ctx_t decoder;
+ // start of the frame in buf
+ const unsigned char *frame;
+ // end of the frame(s) in buf
+ const unsigned char *end_frame;
+ // whether we still have following frames in buf
+ int have_frame;
+ // pointer to buffer for the read frames
+ uint8_t *buf;
+ // size of data in buffer
+ size_t bytes_in_buffer;
+ // current buffer size
+ size_t buffer_size;
+ // error info pointer
+ struct aom_internal_error_info *err_info;
+
+ int this_frame_bits;
+
+ /* --- Members for third pass encoding --- */
+ // Array to store info about each frame.
+ // frame_info[0] should point to the current frame.
+ THIRD_PASS_FRAME_INFO frame_info[MAX_THIRD_PASS_BUF];
+ // number of frames available in frame_info
+ int frame_info_count;
+ // the end of the previous GOP (order hint)
+ int prev_gop_end;
+ THIRD_PASS_GOP_INFO gop_info;
+} THIRD_PASS_DEC_CTX;
+
+void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx,
+ const char *file);
+void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx);
+
+// Set the GOP structure from the twopass bitstream.
+// TODO(bohanli): This is currently a skeleton; we only return the GOP length.
+// This function also saves all frame information in the array ctx->frame_info
+// for this GOP.
+void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx);
+
+// Pop one frame out of the array ctx->frame_info. This function is used to make
+// sure that frame_info[0] always corresponds to the current frame.
+void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx);
+
+void av1_open_second_pass_log(struct AV1_COMP *cpi, int is_read);
+void av1_close_second_pass_log(struct AV1_COMP *cpi);
+
+// Write the current GOP information into the second pass log file.
+void av1_write_second_pass_gop_info(struct AV1_COMP *cpi);
+// Write the information of the frames in this GOP into the second pass log
+// file.
+void av1_write_second_pass_per_frame_info(struct AV1_COMP *cpi, int gf_index);
+
+// Read the next GOP information from the second pass log file.
+void av1_read_second_pass_gop_info(FILE *second_pass_log_stream,
+ THIRD_PASS_GOP_INFO *gop_info,
+ struct aom_internal_error_info *error);
+// Read the information of the frames in the next GOP from the second pass
+// log file.
+void av1_read_second_pass_per_frame_info(FILE *second_pass_log_stream,
+ THIRD_PASS_FRAME_INFO *frame_info_arr,
+ int frame_info_count,
+ struct aom_internal_error_info *error);
+
+int av1_check_use_arf(THIRD_PASS_DEC_CTX *ctx);
+
+// Calculate the ratio of third pass frame dimensions over second pass frame
+// dimensions. Return them in ratio_h and ratio_w.
+void av1_get_third_pass_ratio(THIRD_PASS_DEC_CTX *ctx, int fidx, int fheight,
+ int fwidth, double *ratio_h, double *ratio_w);
+
+// Get the pointer to a second pass mi info, where mi_row and mi_col are the mi
+// location in the thirdpass frame.
+THIRD_PASS_MI_INFO *av1_get_third_pass_mi(THIRD_PASS_DEC_CTX *ctx, int fidx,
+ int mi_row, int mi_col,
+ double ratio_h, double ratio_w);
+
+// Get the adjusted MV of this_mi associated with the given reference frame.
+// If no MV is associated with that reference frame, INVALID_MV is returned.
+int_mv av1_get_third_pass_adjusted_mv(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h, double ratio_w,
+ MV_REFERENCE_FRAME frame);
+
+// Get the adjusted block size of this_mi.
+BLOCK_SIZE av1_get_third_pass_adjusted_blk_size(THIRD_PASS_MI_INFO *this_mi,
+ double ratio_h, double ratio_w);
+
+// Get the adjusted mi position of a given third_pass_mi in the third pass
+// frame. The location is returned in mi_row and mi_col.
+void av1_third_pass_get_adjusted_mi(THIRD_PASS_MI_INFO *third_pass_mi,
+ double ratio_h, double ratio_w, int *mi_row,
+ int *mi_col);
+
+PARTITION_TYPE av1_third_pass_get_sb_part_type(THIRD_PASS_DEC_CTX *ctx,
+ THIRD_PASS_MI_INFO *this_mi);
+
+#if CONFIG_BITRATE_ACCURACY
+
+void av1_pack_tpl_info(TPL_INFO *tpl_info, const GF_GROUP *gf_group,
+ const TplParams *tpl_data);
+
+void av1_write_tpl_info(const TPL_INFO *tpl_info, FILE *log_stream,
+ struct aom_internal_error_info *error);
+
+void av1_read_tpl_info(TPL_INFO *tpl_info, FILE *log_stream,
+ struct aom_internal_error_info *error);
+
+#endif // CONFIG_BITRATE_ACCURACY
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_THIRDPASS_H_
diff --git a/third_party/aom/av1/encoder/tokenize.c b/third_party/aom/av1/encoder/tokenize.c
new file mode 100644
index 0000000000..ffac886e32
--- /dev/null
+++ b/third_party/aom/av1/encoder/tokenize.c
@@ -0,0 +1,396 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/entropy.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
+
+static AOM_INLINE int av1_fast_palette_color_index_context_on_edge(
+ const uint8_t *color_map, int stride, int r, int c, int *color_idx) {
+ const bool has_left = (c - 1 >= 0);
+ const bool has_above = (r - 1 >= 0);
+ assert(r > 0 || c > 0);
+ assert(has_above ^ has_left);
+ assert(color_idx);
+ (void)has_left;
+
+ const uint8_t color_neighbor = has_above
+ ? color_map[(r - 1) * stride + (c - 0)]
+ : color_map[(r - 0) * stride + (c - 1)];
+  // If the neighbor color has a higher index than the current color index,
+  // then we move up by 1.
+ const uint8_t current_color = *color_idx = color_map[r * stride + c];
+ if (color_neighbor > current_color) {
+ (*color_idx)++;
+ } else if (color_neighbor == current_color) {
+ *color_idx = 0;
+ }
+
+ // Get hash value of context.
+ // The non-diagonal neighbors get a weight of 2.
+ const uint8_t color_score = 2;
+ const uint8_t hash_multiplier = 1;
+ const uint8_t color_index_ctx_hash = color_score * hash_multiplier;
+
+ // Lookup context from hash.
+ const int color_index_ctx =
+ av1_palette_color_index_context_lookup[color_index_ctx_hash];
+ assert(color_index_ctx == 0);
+ (void)color_index_ctx;
+ return 0;
+}
+
+#define SWAP(i, j) \
+ do { \
+ const uint8_t tmp_score = score_rank[i]; \
+ const uint8_t tmp_color = color_rank[i]; \
+ score_rank[i] = score_rank[j]; \
+ color_rank[i] = color_rank[j]; \
+ score_rank[j] = tmp_score; \
+ color_rank[j] = tmp_color; \
+ } while (0)
+#define INVALID_COLOR_IDX (UINT8_MAX)
+
+// A faster version of av1_get_palette_color_index_context used by the
+// encoder, exploiting the fact that the encoder does not need to maintain a
+// color order.
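+// Worked example: when all three neighbors share one color, the scores merge
+// to {5}, so the context hash is 5 * 1 = 5 and the resulting context is
+// 9 - 5 = 4. With three distinct neighbors the sorted scores stay {2, 2, 1},
+// giving a hash of 2*1 + 2*2 + 1*2 = 8 and context 9 - 8 = 1.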
+static AOM_INLINE int av1_fast_palette_color_index_context(
+ const uint8_t *color_map, int stride, int r, int c, int *color_idx) {
+ assert(r > 0 || c > 0);
+
+ const bool has_above = (r - 1 >= 0);
+ const bool has_left = (c - 1 >= 0);
+ assert(has_above || has_left);
+ if (has_above ^ has_left) {
+ return av1_fast_palette_color_index_context_on_edge(color_map, stride, r, c,
+ color_idx);
+ }
+
+  // This goes in the order of left, top, and top-left. This ordering has the
+  // advantage that, unless some entries are duplicated or invalid, the array
+  // will already be in sorted order. Furthermore, if either of the first two
+  // entries is invalid, we know the last one is also invalid.
+ uint8_t color_neighbors[NUM_PALETTE_NEIGHBORS];
+ color_neighbors[0] = color_map[(r - 0) * stride + (c - 1)];
+ color_neighbors[1] = color_map[(r - 1) * stride + (c - 0)];
+ color_neighbors[2] = color_map[(r - 1) * stride + (c - 1)];
+
+ // Aggregate duplicated values.
+  // Since our array is so small, using a couple of if statements is faster.
+ uint8_t scores[NUM_PALETTE_NEIGHBORS] = { 2, 2, 1 };
+ uint8_t num_invalid_colors = 0;
+ if (color_neighbors[0] == color_neighbors[1]) {
+ scores[0] += scores[1];
+ color_neighbors[1] = INVALID_COLOR_IDX;
+ num_invalid_colors += 1;
+
+ if (color_neighbors[0] == color_neighbors[2]) {
+ scores[0] += scores[2];
+ num_invalid_colors += 1;
+ }
+ } else if (color_neighbors[0] == color_neighbors[2]) {
+ scores[0] += scores[2];
+ num_invalid_colors += 1;
+ } else if (color_neighbors[1] == color_neighbors[2]) {
+ scores[1] += scores[2];
+ num_invalid_colors += 1;
+ }
+
+ const uint8_t num_valid_colors = NUM_PALETTE_NEIGHBORS - num_invalid_colors;
+
+ uint8_t *color_rank = color_neighbors;
+ uint8_t *score_rank = scores;
+
+ // Sort everything
+ if (num_valid_colors > 1) {
+ if (color_neighbors[1] == INVALID_COLOR_IDX) {
+ scores[1] = scores[2];
+ color_neighbors[1] = color_neighbors[2];
+ }
+
+    // Swap the first two elements if the scores are out of order, or if they
+    // have the same score but the color indices are not in the right order.
+ if (score_rank[0] < score_rank[1] ||
+ (score_rank[0] == score_rank[1] && color_rank[0] > color_rank[1])) {
+ SWAP(0, 1);
+ }
+ if (num_valid_colors > 2) {
+ if (score_rank[0] < score_rank[2]) {
+ SWAP(0, 2);
+ }
+ if (score_rank[1] < score_rank[2]) {
+ SWAP(1, 2);
+ }
+ }
+ }
+
+  // If any of the neighbor colors has a higher index than the current color
+  // index, then we move up by 1 unless the current color is the same as one
+  // of the neighbors.
+ const uint8_t current_color = *color_idx = color_map[r * stride + c];
+ for (int idx = 0; idx < num_valid_colors; idx++) {
+ if (color_rank[idx] > current_color) {
+ (*color_idx)++;
+ } else if (color_rank[idx] == current_color) {
+ *color_idx = idx;
+ break;
+ }
+ }
+
+ // Get hash value of context.
+ uint8_t color_index_ctx_hash = 0;
+ static const uint8_t hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 };
+ for (int idx = 0; idx < num_valid_colors; ++idx) {
+ color_index_ctx_hash += score_rank[idx] * hash_multipliers[idx];
+ }
+ assert(color_index_ctx_hash > 0);
+ assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH);
+
+ // Lookup context from hash.
+ const int color_index_ctx = 9 - color_index_ctx_hash;
+ assert(color_index_ctx ==
+ av1_palette_color_index_context_lookup[color_index_ctx_hash]);
+ assert(color_index_ctx >= 0);
+ assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
+ return color_index_ctx;
+}
+#undef INVALID_COLOR_IDX
+#undef SWAP
+
+static int cost_and_tokenize_map(Av1ColorMapParam *param, TokenExtra **t,
+ int plane, int calc_rate, int allow_update_cdf,
+ FRAME_COUNTS *counts) {
+ const uint8_t *const color_map = param->color_map;
+ MapCdf map_cdf = param->map_cdf;
+ ColorCost color_cost = param->color_cost;
+ const int plane_block_width = param->plane_width;
+ const int rows = param->rows;
+ const int cols = param->cols;
+ const int n = param->n_colors;
+ const int palette_size_idx = n - PALETTE_MIN_SIZE;
+ int this_rate = 0;
+
+ (void)plane;
+ (void)counts;
+
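+  // Traverse the color map in anti-diagonal (wavefront) order so that the
+  // left, above, and above-left neighbors of each pixel have already been
+  // visited when its context is computed.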
+ for (int k = 1; k < rows + cols - 1; ++k) {
+ for (int j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) {
+ int i = k - j;
+ int color_new_idx;
+ const int color_ctx = av1_fast_palette_color_index_context(
+ color_map, plane_block_width, i, j, &color_new_idx);
+ assert(color_new_idx >= 0 && color_new_idx < n);
+ if (calc_rate) {
+ this_rate += color_cost[palette_size_idx][color_ctx][color_new_idx];
+ } else {
+ (*t)->token = color_new_idx;
+ (*t)->color_ctx = color_ctx;
+ ++(*t);
+ if (allow_update_cdf)
+ update_cdf(map_cdf[palette_size_idx][color_ctx], color_new_idx, n);
+#if CONFIG_ENTROPY_STATS
+ if (plane) {
+ ++counts->palette_uv_color_index[palette_size_idx][color_ctx]
+ [color_new_idx];
+ } else {
+ ++counts->palette_y_color_index[palette_size_idx][color_ctx]
+ [color_new_idx];
+ }
+#endif
+ }
+ }
+ }
+ if (calc_rate) return this_rate;
+ return 0;
+}
+
+static void get_palette_params(const MACROBLOCK *const x, int plane,
+ BLOCK_SIZE bsize, Av1ColorMapParam *params) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ params->color_map = xd->plane[plane].color_index_map;
+ params->map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf
+ : xd->tile_ctx->palette_y_color_index_cdf;
+ params->color_cost = plane ? x->mode_costs.palette_uv_color_cost
+ : x->mode_costs.palette_y_color_cost;
+ params->n_colors = pmi->palette_size[plane];
+ av1_get_block_dimensions(bsize, plane, xd, &params->plane_width, NULL,
+ &params->rows, &params->cols);
+}
+
+// TODO(any): Remove this function
+static void get_color_map_params(const MACROBLOCK *const x, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ COLOR_MAP_TYPE type,
+ Av1ColorMapParam *params) {
+ (void)tx_size;
+ memset(params, 0, sizeof(*params));
+ switch (type) {
+ case PALETTE_MAP: get_palette_params(x, plane, bsize, params); break;
+ default: assert(0 && "Invalid color map type"); return;
+ }
+}
+
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
+ TX_SIZE tx_size, COLOR_MAP_TYPE type) {
+ assert(plane == 0 || plane == 1);
+ Av1ColorMapParam color_map_params;
+ get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params);
+ return cost_and_tokenize_map(&color_map_params, NULL, plane, 1, 0, NULL);
+}
+
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
+ TokenExtra **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ COLOR_MAP_TYPE type, int allow_update_cdf,
+ FRAME_COUNTS *counts) {
+ assert(plane == 0 || plane == 1);
+ Av1ColorMapParam color_map_params;
+ get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params);
+  // The first color index does not use context or entropy coding.
+ (*t)->token = color_map_params.color_map[0];
+ (*t)->color_ctx = -1;
+ ++(*t);
+ cost_and_tokenize_map(&color_map_params, t, plane, 0, allow_update_cdf,
+ counts);
+}
+
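+// Recursively walk the variable transform-size tree: when the current tx size
+// matches the coded tx size (always the case for chroma planes), tokenize the
+// transform block; otherwise split into sub-transforms and recurse.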
+static void tokenize_vartx(ThreadData *td, TX_SIZE tx_size,
+ BLOCK_SIZE plane_bsize, int blk_row, int blk_col,
+ int block, int plane, void *arg) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ const TX_SIZE plane_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi->bsize, pd->subsampling_x,
+ pd->subsampling_y)
+ : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+ blk_col)];
+
+ if (tx_size == plane_tx_size || plane) {
+ plane_bsize =
+ get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+
+ struct tokenize_b_args *args = arg;
+ if (args->allow_update_cdf)
+ av1_update_and_record_txb_context(plane, block, blk_row, blk_col,
+ plane_bsize, tx_size, arg);
+ else
+ av1_record_txb_context(plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, arg);
+
+ } else {
+    // Half the block size in transform block units.
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int step = bsw * bsh;
+ const int row_end =
+ AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+ const int col_end =
+ AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+
+ assert(bsw > 0 && bsh > 0);
+
+ for (int row = 0; row < row_end; row += bsh) {
+ const int offsetr = blk_row + row;
+ for (int col = 0; col < col_end; col += bsw) {
+ const int offsetc = blk_col + col;
+
+ tokenize_vartx(td, sub_txs, plane_bsize, offsetr, offsetc, block, plane,
+ arg);
+ block += step;
+ }
+ }
+ }
+}
+
+void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+ uint8_t allow_update_cdf) {
+ assert(bsize < BLOCK_SIZES_ALL);
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+ if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
+ return;
+
+ const int num_planes = av1_num_planes(cm);
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run };
+
+ if (mbmi->skip_txfm) {
+ av1_reset_entropy_context(xd, bsize, num_planes);
+ return;
+ }
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (plane && !xd->is_chroma_ref) break;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ const int mi_width = mi_size_wide[plane_bsize];
+ const int mi_height = mi_size_high[plane_bsize];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+ const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+ const int bw = mi_size_wide[txb_size];
+ const int bh = mi_size_high[txb_size];
+ int block = 0;
+ const int step =
+ tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+
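+    // Tokenize in units of at most 64x64 luma pixels so transform blocks are
+    // visited in the same order in which the bitstream writer emits them.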
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, ss_x, ss_y);
+ int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+ int mu_blocks_high = mi_size_high[max_unit_bsize];
+
+ mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(mi_height, mu_blocks_high);
+
+ for (int idy = 0; idy < mi_height; idy += mu_blocks_high) {
+ for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) {
+ const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height);
+ const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width);
+ for (int blk_row = idy; blk_row < unit_height; blk_row += bh) {
+ for (int blk_col = idx; blk_col < unit_width; blk_col += bw) {
+ tokenize_vartx(td, max_tx_size, plane_bsize, blk_row, blk_col,
+ block, plane, &arg);
+ block += step;
+ }
+ }
+ }
+ }
+ }
+ if (rate) *rate += arg.this_rate;
+}
diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h
new file mode 100644
index 0000000000..f675c489ae
--- /dev/null
+++ b/third_party/aom/av1/encoder/tokenize.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TOKENIZE_H_
+#define AOM_AV1_ENCODER_TOKENIZE_H_
+
+#include "av1/common/entropy.h"
+#include "av1/encoder/block.h"
+#include "aom_dsp/bitwriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The token and color_ctx members of the TokenExtra structure are used
+// to store the indices of color and color context of each pixel in
+// case of palette mode.
+// 1) token can take values in the range of [0, 7], as the maximum number of
+// possible colors is 8 (PALETTE_COLORS). Hence token requires 3 bits
+// (unsigned).
+// 2) The reserved field (1-bit) is positioned such that color_ctx occupies the
+// most significant bits and token occupies the least significant bits of the
+// byte. Thus accesses to token and color_ctx are optimal. If TokenExtra is
+// defined as:
+// typedef struct {
+// int8_t color_ctx : 4;
+// uint8_t token : 3;
+// } TokenExtra;
+// then reading color_ctx requires an extra left shift to facilitate sign
+// extension, and writing token requires an extra masking operation.
+// 3) color_ctx can take 5 (PALETTE_COLOR_INDEX_CONTEXTS) valid values, i.e.,
+// from 0 to 4. As per the current implementation it can take values in the
+// range of [-1, 4]. Here -1 corresponds to invalid color index context and is
+// used for default initialization. Hence color_ctx requires 4 bits (signed).
+typedef struct {
+ uint8_t token : 3;
+ uint8_t reserved : 1;
+ int8_t color_ctx : 4;
+} TokenExtra;
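+// A minimal sanity sketch (an assumption about bit-field packing, which the
+// layout notes above rely on but do not enforce):
+//   static_assert(sizeof(TokenExtra) == 1, "TokenExtra should be one byte");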
+
+typedef struct {
+ TokenExtra *start;
+ unsigned int count;
+} TokenList;
+
+typedef struct {
+ // Number of tile tokens for which memory is allocated.
+ unsigned int tokens_allocated;
+ // tile_tok[i][j] is a pointer to the buffer storing palette tokens of the ith
+ // tile row, jth tile column.
+ TokenExtra *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
+ // tplist[i][j][k] holds the start pointer of tile_tok[i][j] and the count of
+ // palette tokens for the kth superblock row of the ith tile row, jth tile
+ // column.
+ TokenList *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
+} TokenInfo;
+
+struct AV1_COMP;
+struct ThreadData;
+struct FRAME_COUNTS;
+
+enum {
+ OUTPUT_ENABLED = 0,
+ DRY_RUN_NORMAL,
+ DRY_RUN_COSTCOEFFS,
+} UENUM1BYTE(RUN_TYPE);
+
+struct tokenize_b_args {
+ const struct AV1_COMP *cpi;
+ struct ThreadData *td;
+ int this_rate;
+ uint8_t allow_update_cdf;
+ RUN_TYPE dry_run;
+};
+
+// Note: in all the tokenize functions, if rate is non-NULL it is incremented
+// by the coefficient token cost only when dry_run == DRY_RUN_COSTCOEFFS;
+// otherwise rate is not incremented.
+void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+ uint8_t allow_update_cdf);
+
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
+ TX_SIZE tx_size, COLOR_MAP_TYPE type);
+
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
+ TokenExtra **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ COLOR_MAP_TYPE type, int allow_update_cdf,
+ struct FRAME_COUNTS *counts);
+
+static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id,
+ TX_SIZE tx_size) {
+ const int eob_max = av1_get_max_eob(tx_size);
+ return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+}
+
+// Token buffer is only used for palette tokens.
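+// Illustrative sizing (hypothetical numbers): a 1280x720 frame has
+// mb_rows = 45 and mb_cols = 80; with 128x128 superblocks (sb_size_log2 = 7)
+// and num_planes = 3 this gives sb_rows = 6, sb_cols = 10 and
+// sb_palette_toks = 2 * 128 * 128 = 32768, i.e. 6 * 10 * 32768 = 1966080
+// tokens in total.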
+static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols,
+ int sb_size_log2,
+ const int num_planes) {
+  // Calculate the maximum number of superblocks in the image.
+ const int shift = sb_size_log2 - 4;
+ const int sb_size = 1 << sb_size_log2;
+ const int sb_size_square = sb_size * sb_size;
+ const int sb_rows = CEIL_POWER_OF_TWO(mb_rows, shift);
+ const int sb_cols = CEIL_POWER_OF_TWO(mb_cols, shift);
+
+ // One palette token for each pixel. There can be palettes on two planes.
+ const int sb_palette_toks = AOMMIN(2, num_planes) * sb_size_square;
+
+ return sb_rows * sb_cols * sb_palette_toks;
+}
+
+// Allocate memory for token related info.
+static AOM_INLINE void alloc_token_info(AV1_COMMON *cm, TokenInfo *token_info,
+ unsigned int tokens_required) {
+ int sb_rows =
+ CEIL_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2);
+ token_info->tokens_allocated = tokens_required;
+
+ CHECK_MEM_ERROR(cm, token_info->tile_tok[0][0],
+ (TokenExtra *)aom_calloc(
+ tokens_required, sizeof(*token_info->tile_tok[0][0])));
+
+ CHECK_MEM_ERROR(
+ cm, token_info->tplist[0][0],
+ (TokenList *)aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS,
+ sizeof(*token_info->tplist[0][0])));
+}
+
+// Check if memory allocation has been done for token related info.
+static AOM_INLINE bool is_token_info_allocated(const TokenInfo *token_info) {
+ return ((token_info->tile_tok[0][0] != NULL) &&
+ (token_info->tplist[0][0] != NULL));
+}
+
+// Free memory from token related variables.
+static AOM_INLINE void free_token_info(TokenInfo *token_info) {
+ aom_free(token_info->tile_tok[0][0]);
+ token_info->tile_tok[0][0] = NULL;
+
+ aom_free(token_info->tplist[0][0]);
+ token_info->tplist[0][0] = NULL;
+
+ token_info->tokens_allocated = 0;
+}
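+// Typical lifecycle of the helpers above (a hypothetical caller sketch, not
+// code from this library):
+//   TokenInfo info = { 0 };
+//   const unsigned int need =
+//       get_token_alloc(mb_rows, mb_cols, sb_size_log2, num_planes);
+//   if (!is_token_info_allocated(&info)) alloc_token_info(cm, &info, need);
+//   ... encode, writing palette tokens into info.tile_tok ...
+//   free_token_info(&info);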
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TOKENIZE_H_
diff --git a/third_party/aom/av1/encoder/tpl_model.c b/third_party/aom/av1/encoder/tpl_model.c
new file mode 100644
index 0000000000..ca60e4981e
--- /dev/null
+++ b/third_party/aom/av1/encoder/tpl_model.c
@@ -0,0 +1,2511 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <float.h>
+#include <stdint.h>
+
+#include "av1/encoder/thirdpass.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tpl_model.h"
+
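+// Clamp exp() to the representable double range. For instance,
+// exp_bounded(710) returns DBL_MAX rather than letting exp() overflow to
+// +inf, and exp_bounded(-710) returns 0.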
+static INLINE double exp_bounded(double v) {
+  // When v > 700 or v < -700, exp(v) is close to overflowing (or
+  // underflowing) the range of a double.
+ // For details, see the "Notes" in the following link.
+ // https://en.cppreference.com/w/c/numeric/math/exp
+ if (v > 700) {
+ return DBL_MAX;
+ } else if (v < -700) {
+ return 0;
+ }
+ return exp(v);
+}
+
+void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats) {
+ tpl_txfm_stats->ready = 0;
+ tpl_txfm_stats->coeff_num = 256;
+ tpl_txfm_stats->txfm_block_count = 0;
+ memset(tpl_txfm_stats->abs_coeff_sum, 0,
+ sizeof(tpl_txfm_stats->abs_coeff_sum[0]) * tpl_txfm_stats->coeff_num);
+ memset(tpl_txfm_stats->abs_coeff_mean, 0,
+ sizeof(tpl_txfm_stats->abs_coeff_mean[0]) * tpl_txfm_stats->coeff_num);
+}
+
+#if CONFIG_BITRATE_ACCURACY
+void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats,
+ TplTxfmStats *accumulated_stats) {
+ accumulated_stats->txfm_block_count += sub_stats->txfm_block_count;
+ for (int i = 0; i < accumulated_stats->coeff_num; ++i) {
+ accumulated_stats->abs_coeff_sum[i] += sub_stats->abs_coeff_sum[i];
+ }
+}
+
+void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats,
+ const tran_low_t *coeff) {
+  // For transforms larger than 16x16, the scale of coeff needs to be
+  // adjusted; it is not simply LOSSLESS_Q_STEP.
+ assert(tpl_txfm_stats->coeff_num <= 256);
+ for (int i = 0; i < tpl_txfm_stats->coeff_num; ++i) {
+ tpl_txfm_stats->abs_coeff_sum[i] += abs(coeff[i]) / (double)LOSSLESS_Q_STEP;
+ }
+ ++tpl_txfm_stats->txfm_block_count;
+}
+
+void av1_tpl_txfm_stats_update_abs_coeff_mean(TplTxfmStats *txfm_stats) {
+ if (txfm_stats->txfm_block_count > 0) {
+ for (int j = 0; j < txfm_stats->coeff_num; j++) {
+ txfm_stats->abs_coeff_mean[j] =
+ txfm_stats->abs_coeff_sum[j] / txfm_stats->txfm_block_count;
+ }
+ txfm_stats->ready = 1;
+ } else {
+ txfm_stats->ready = 0;
+ }
+}
+
+static AOM_INLINE void av1_tpl_store_txfm_stats(
+ TplParams *tpl_data, const TplTxfmStats *tpl_txfm_stats,
+ const int frame_index) {
+ tpl_data->txfm_stats_list[frame_index] = *tpl_txfm_stats;
+}
+#endif // CONFIG_BITRATE_ACCURACY
+
+static AOM_INLINE void get_quantize_error(const MACROBLOCK *x, int plane,
+ const tran_low_t *coeff,
+ tran_low_t *qcoeff,
+ tran_low_t *dqcoeff, TX_SIZE tx_size,
+ uint16_t *eob, int64_t *recon_error,
+ int64_t *sse) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+ int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+ const int shift = tx_size == TX_32X32 ? 0 : 2;
+
+ QUANT_PARAM quant_param;
+ av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob,
+ scan_order, &quant_param);
+ *recon_error =
+ av1_highbd_block_error(coeff, dqcoeff, pix_num, sse, xd->bd) >> shift;
+ } else {
+ av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order,
+ &quant_param);
+ *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+ }
+#else
+ (void)xd;
+ av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, eob, scan_order,
+ &quant_param);
+ *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+ *recon_error = AOMMAX(*recon_error, 1);
+
+ *sse = (*sse) >> shift;
+ *sse = AOMMAX(*sse, 1);
+}
+
+static AOM_INLINE void set_tpl_stats_block_size(uint8_t *block_mis_log2,
+ uint8_t *tpl_bsize_1d) {
+ // tpl stats bsize: 2 means 16x16
+ *block_mis_log2 = 2;
+ // Block size used in tpl motion estimation
+ *tpl_bsize_1d = 16;
+  // The minimum supported TPL block size is 16 (MIN_TPL_BSIZE_1D).
+ assert(*tpl_bsize_1d >= 16);
+}
+
+void av1_setup_tpl_buffers(AV1_PRIMARY *const ppi,
+ CommonModeInfoParams *const mi_params, int width,
+ int height, int byte_alignment, int lag_in_frames) {
+ SequenceHeader *const seq_params = &ppi->seq_params;
+ TplParams *const tpl_data = &ppi->tpl_data;
+ set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2,
+ &tpl_data->tpl_bsize_1d);
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+ tpl_data->border_in_pixels =
+ ALIGN_POWER_OF_TWO(tpl_data->tpl_bsize_1d + 2 * AOM_INTERP_EXTEND, 5);
+
+ const int alloc_y_plane_only =
+ ppi->cpi->sf.tpl_sf.use_y_only_rate_distortion ? 1 : 0;
+ for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) {
+ const int mi_cols =
+ ALIGN_POWER_OF_TWO(mi_params->mi_cols, MAX_MIB_SIZE_LOG2);
+ const int mi_rows =
+ ALIGN_POWER_OF_TWO(mi_params->mi_rows, MAX_MIB_SIZE_LOG2);
+ TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame];
+ tpl_frame->is_valid = 0;
+ tpl_frame->width = mi_cols >> block_mis_log2;
+ tpl_frame->height = mi_rows >> block_mis_log2;
+ tpl_frame->stride = tpl_data->tpl_stats_buffer[frame].width;
+ tpl_frame->mi_rows = mi_params->mi_rows;
+ tpl_frame->mi_cols = mi_params->mi_cols;
+ }
+ tpl_data->tpl_frame = &tpl_data->tpl_stats_buffer[REF_FRAMES + 1];
+
+ // If lag_in_frames <= 1, TPL module is not invoked. Hence dynamic memory
+ // allocations are avoided for buffers in tpl_data.
+ if (lag_in_frames <= 1) return;
+
+ AOM_CHECK_MEM_ERROR(&ppi->error, tpl_data->txfm_stats_list,
+ aom_calloc(MAX_LENGTH_TPL_FRAME_STATS,
+ sizeof(*tpl_data->txfm_stats_list)));
+
+ for (int frame = 0; frame < lag_in_frames; ++frame) {
+ AOM_CHECK_MEM_ERROR(
+ &ppi->error, tpl_data->tpl_stats_pool[frame],
+ aom_calloc(tpl_data->tpl_stats_buffer[frame].width *
+ tpl_data->tpl_stats_buffer[frame].height,
+ sizeof(*tpl_data->tpl_stats_buffer[frame].tpl_stats_ptr)));
+
+ if (aom_alloc_frame_buffer(
+ &tpl_data->tpl_rec_pool[frame], width, height,
+ seq_params->subsampling_x, seq_params->subsampling_y,
+ seq_params->use_highbitdepth, tpl_data->border_in_pixels,
+ byte_alignment, 0, alloc_y_plane_only))
+ aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+}
+
+static AOM_INLINE int32_t tpl_get_satd_cost(BitDepthInfo bd_info,
+ int16_t *src_diff, int diff_stride,
+ const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ tran_low_t *coeff, int bw, int bh,
+ TX_SIZE tx_size) {
+ const int pix_num = bw * bh;
+
+ av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride,
+ dst, dst_stride);
+ av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff);
+ return aom_satd(coeff, pix_num);
+}
+
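+// A crude rate model: each coefficient up to the eob is charged
+// get_msb(|level| + 1) + 1 bits for the magnitude plus one bit for the sign
+// when nonzero, and the total is scaled into the AV1_PROB_COST_SHIFT domain.
+// E.g. a coefficient of magnitude 3 contributes get_msb(4) + 1 + 1 = 4.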
+static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) {
+ const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+
+ assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob);
+ int rate_cost = 1;
+
+ for (int idx = 0; idx < eob; ++idx) {
+ unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]);
+ rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0);
+ }
+
+ return (rate_cost << AV1_PROB_COST_SHIFT);
+}
+
+static AOM_INLINE void txfm_quant_rdcost(
+ const MACROBLOCK *x, int16_t *src_diff, int diff_stride, uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride, tran_low_t *coeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, int bw, int bh, TX_SIZE tx_size,
+ int do_recon, int *rate_cost, int64_t *recon_error, int64_t *sse) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ uint16_t eob;
+ av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride,
+ dst, dst_stride);
+ av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff);
+
+ get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &eob, recon_error,
+ sse);
+
+ *rate_cost = rate_estimator(qcoeff, eob, tx_size);
+
+ if (do_recon)
+ av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst,
+ dst_stride, eob, 0);
+}
+
+static uint32_t motion_estimation(AV1_COMP *cpi, MACROBLOCK *x,
+ uint8_t *cur_frame_buf,
+ uint8_t *ref_frame_buf, int stride,
+ int ref_stride, int width, int ref_width,
+ BLOCK_SIZE bsize, MV center_mv,
+ int_mv *best_mv) {
+ AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf;
+ int step_param;
+ uint32_t bestsme = UINT_MAX;
+ FULLPEL_MV_STATS best_mv_stats;
+ int distortion;
+ uint32_t sse;
+ int cost_list[5];
+ FULLPEL_MV start_mv = get_fullmv_from_mv(&center_mv);
+
+ // Setup frame pointers
+ x->plane[0].src.buf = cur_frame_buf;
+ x->plane[0].src.stride = stride;
+ x->plane[0].src.width = width;
+ xd->plane[0].pre[0].buf = ref_frame_buf;
+ xd->plane[0].pre[0].stride = ref_stride;
+ xd->plane[0].pre[0].width = ref_width;
+
+ step_param = tpl_sf->reduce_first_step_size;
+ step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ const search_site_config *search_site_cfg =
+ cpi->mv_search_params.search_site_cfg[SS_CFG_SRC];
+ if (search_site_cfg->stride != ref_stride)
+ search_site_cfg = cpi->mv_search_params.search_site_cfg[SS_CFG_LOOKAHEAD];
+ assert(search_site_cfg->stride == ref_stride);
+
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &center_mv,
+ start_mv, search_site_cfg,
+ tpl_sf->search_method,
+ /*fine_search_interval=*/0);
+
+ bestsme = av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list),
+ &best_mv->as_fullmv, &best_mv_stats, NULL);
+
+ // When sub-pel motion search is skipped, populate sub-pel precision MV and
+ // return.
+ if (tpl_sf->subpel_force_stop == FULL_PEL) {
+ best_mv->as_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+ return bestsme;
+ }
+
+ SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+ av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &center_mv,
+ cost_list);
+ ms_params.forced_stop = tpl_sf->subpel_force_stop;
+ ms_params.var_params.subpel_search_type = USE_2_TAPS;
+ ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+ best_mv_stats.err_cost = 0;
+ MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+ assert(av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
+ bestsme = cpi->mv_search_params.find_fractional_mv_step(
+ xd, cm, &ms_params, subpel_start_mv, &best_mv_stats, &best_mv->as_mv,
+ &distortion, &sse, NULL);
+
+ return bestsme;
+}
+
+typedef struct {
+ int_mv mv;
+ int sad;
+} center_mv_t;
+
+static int compare_sad(const void *a, const void *b) {
+ const int diff = ((center_mv_t *)a)->sad - ((center_mv_t *)b)->sad;
+ if (diff < 0)
+ return -1;
+ else if (diff > 0)
+ return 1;
+ return 0;
+}
+
+static int is_alike_mv(int_mv candidate_mv, center_mv_t *center_mvs,
+ int center_mvs_count, int skip_alike_starting_mv) {
+  // The MV difference threshold is in 1/8-pel precision.
+ const int mv_diff_thr[3] = { 1, (8 << 3), (16 << 3) };
+ int thr = mv_diff_thr[skip_alike_starting_mv];
+ int i;
+
+ for (i = 0; i < center_mvs_count; i++) {
+ if (abs(center_mvs[i].mv.as_mv.col - candidate_mv.as_mv.col) < thr &&
+ abs(center_mvs[i].mv.as_mv.row - candidate_mv.as_mv.row) < thr)
+ return 1;
+ }
+
+ return 0;
+}
+
+static void get_rate_distortion(
+ int *rate_cost, int64_t *recon_error, int64_t *pred_error,
+ int16_t *src_diff, tran_low_t *coeff, tran_low_t *qcoeff,
+ tran_low_t *dqcoeff, AV1_COMMON *cm, MACROBLOCK *x,
+ const YV12_BUFFER_CONFIG *ref_frame_ptr[2], uint8_t *rec_buffer_pool[3],
+ const int rec_stride_pool[3], TX_SIZE tx_size, PREDICTION_MODE best_mode,
+ int mi_row, int mi_col, int use_y_only_rate_distortion, int do_recon,
+ TplTxfmStats *tpl_txfm_stats) {
+ const SequenceHeader *seq_params = cm->seq_params;
+ *rate_cost = 0;
+ *recon_error = 1;
+ *pred_error = 1;
+
+ (void)tpl_txfm_stats;
+
+ MACROBLOCKD *xd = &x->e_mbd;
+ int is_compound = (best_mode == NEW_NEWMV);
+ int num_planes = use_y_only_rate_distortion ? 1 : MAX_MB_PLANE;
+
+ uint8_t *src_buffer_pool[MAX_MB_PLANE] = {
+ xd->cur_buf->y_buffer,
+ xd->cur_buf->u_buffer,
+ xd->cur_buf->v_buffer,
+ };
+ const int src_stride_pool[MAX_MB_PLANE] = {
+ xd->cur_buf->y_stride,
+ xd->cur_buf->uv_stride,
+ xd->cur_buf->uv_stride,
+ };
+
+ const int_interpfilters kernel =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ BLOCK_SIZE bsize_plane =
+ av1_ss_size_lookup[txsize_to_bsize[tx_size]][pd->subsampling_x]
+ [pd->subsampling_y];
+
+ int dst_buffer_stride = rec_stride_pool[plane];
+ int dst_mb_offset =
+ ((mi_row * MI_SIZE * dst_buffer_stride) >> pd->subsampling_y) +
+ ((mi_col * MI_SIZE) >> pd->subsampling_x);
+ uint8_t *dst_buffer = rec_buffer_pool[plane] + dst_mb_offset;
+ for (int ref = 0; ref < 1 + is_compound; ++ref) {
+ if (!is_inter_mode(best_mode)) {
+ av1_predict_intra_block(
+ xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+ block_size_wide[bsize_plane], block_size_high[bsize_plane],
+ max_txsize_rect_lookup[bsize_plane], best_mode, 0, 0,
+ FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, dst_buffer,
+ dst_buffer_stride, 0, 0, plane);
+ } else {
+ int_mv best_mv = xd->mi[0]->mv[ref];
+ uint8_t *ref_buffer_pool[MAX_MB_PLANE] = {
+ ref_frame_ptr[ref]->y_buffer,
+ ref_frame_ptr[ref]->u_buffer,
+ ref_frame_ptr[ref]->v_buffer,
+ };
+ InterPredParams inter_pred_params;
+ struct buf_2d ref_buf = {
+ NULL, ref_buffer_pool[plane],
+ plane ? ref_frame_ptr[ref]->uv_width : ref_frame_ptr[ref]->y_width,
+ plane ? ref_frame_ptr[ref]->uv_height : ref_frame_ptr[ref]->y_height,
+ plane ? ref_frame_ptr[ref]->uv_stride : ref_frame_ptr[ref]->y_stride
+ };
+ av1_init_inter_params(&inter_pred_params, block_size_wide[bsize_plane],
+ block_size_high[bsize_plane],
+ (mi_row * MI_SIZE) >> pd->subsampling_y,
+ (mi_col * MI_SIZE) >> pd->subsampling_x,
+ pd->subsampling_x, pd->subsampling_y, xd->bd,
+ is_cur_buf_hbd(xd), 0,
+ xd->block_ref_scale_factors[0], &ref_buf, kernel);
+ if (is_compound) av1_init_comp_mode(&inter_pred_params);
+ inter_pred_params.conv_params = get_conv_params_no_round(
+ ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
+
+ av1_enc_build_one_inter_predictor(dst_buffer, dst_buffer_stride,
+ &best_mv.as_mv, &inter_pred_params);
+ }
+ }
+
+ int src_stride = src_stride_pool[plane];
+ int src_mb_offset = ((mi_row * MI_SIZE * src_stride) >> pd->subsampling_y) +
+ ((mi_col * MI_SIZE) >> pd->subsampling_x);
+
+ int this_rate = 1;
+ int64_t this_recon_error = 1;
+ int64_t sse;
+ txfm_quant_rdcost(
+ x, src_diff, block_size_wide[bsize_plane],
+ src_buffer_pool[plane] + src_mb_offset, src_stride, dst_buffer,
+ dst_buffer_stride, coeff, qcoeff, dqcoeff, block_size_wide[bsize_plane],
+ block_size_high[bsize_plane], max_txsize_rect_lookup[bsize_plane],
+ do_recon, &this_rate, &this_recon_error, &sse);
+
+#if CONFIG_BITRATE_ACCURACY
+ if (plane == 0 && tpl_txfm_stats) {
+      // We only collect the Y plane's transform coefficients.
+ av1_record_tpl_txfm_block(tpl_txfm_stats, coeff);
+ }
+#endif // CONFIG_BITRATE_ACCURACY
+
+ *recon_error += this_recon_error;
+ *pred_error += sse;
+ *rate_cost += this_rate;
+ }
+}
+
+static AOM_INLINE int32_t get_inter_cost(const AV1_COMP *cpi, MACROBLOCKD *xd,
+ const uint8_t *src_mb_buffer,
+ int src_stride,
+ TplBuffers *tpl_tmp_buffers,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ int mi_row, int mi_col, int rf_idx,
+ MV *rfidx_mv, int use_pred_sad) {
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ TplParams *tpl_data = &cpi->ppi->tpl_data;
+ const YV12_BUFFER_CONFIG *const ref_frame_ptr =
+ tpl_data->src_ref_frame[rf_idx];
+ int16_t *src_diff = tpl_tmp_buffers->src_diff;
+ tran_low_t *coeff = tpl_tmp_buffers->coeff;
+ const int bw = 4 << mi_size_wide_log2[bsize];
+ const int bh = 4 << mi_size_high_log2[bsize];
+ int32_t inter_cost;
+
+ if (cpi->sf.tpl_sf.subpel_force_stop != FULL_PEL) {
+ const int_interpfilters kernel =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ uint8_t *predictor8 = tpl_tmp_buffers->predictor8;
+ uint8_t *predictor =
+ is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
+ struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer,
+ ref_frame_ptr->y_width, ref_frame_ptr->y_height,
+ ref_frame_ptr->y_stride };
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
+ mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0,
+ &tpl_data->sf, &ref_buf, kernel);
+ inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd);
+
+ av1_enc_build_one_inter_predictor(predictor, bw, rfidx_mv,
+ &inter_pred_params);
+
+ if (use_pred_sad) {
+ inter_cost = (int)cpi->ppi->fn_ptr[bsize].sdf(src_mb_buffer, src_stride,
+ predictor, bw);
+ } else {
+ inter_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+ }
+ } else {
+ int ref_mb_offset =
+ mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE;
+ uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset;
+ int ref_stride = ref_frame_ptr->y_stride;
+ const FULLPEL_MV fullmv = get_fullmv_from_mv(rfidx_mv);
+    // Since sub-pel motion search is not performed, use the prediction pixels
+    // directly from the reference block ref_mb.
+ if (use_pred_sad) {
+ inter_cost = (int)cpi->ppi->fn_ptr[bsize].sdf(
+ src_mb_buffer, src_stride,
+ &ref_mb[fullmv.row * ref_stride + fullmv.col], ref_stride);
+ } else {
+ inter_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ &ref_mb[fullmv.row * ref_stride + fullmv.col],
+ ref_stride, coeff, bw, bh, tx_size);
+ }
+ }
+ return inter_cost;
+}
+
+static AOM_INLINE void mode_estimation(AV1_COMP *cpi,
+ TplTxfmStats *tpl_txfm_stats,
+ TplBuffers *tpl_tmp_buffers,
+ MACROBLOCK *x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ TplDepStats *tpl_stats) {
+ AV1_COMMON *cm = &cpi->common;
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf;
+
+ (void)gf_group;
+
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ TplParams *tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx];
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+
+ const int bw = 4 << mi_size_wide_log2[bsize];
+ const int bh = 4 << mi_size_high_log2[bsize];
+
+ int frame_offset = tpl_data->frame_idx - cpi->gf_frame_index;
+
+ int32_t best_intra_cost = INT32_MAX;
+ int32_t intra_cost;
+ PREDICTION_MODE best_mode = DC_PRED;
+
+ const int mb_y_offset =
+ mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+ uint8_t *src_mb_buffer = xd->cur_buf->y_buffer + mb_y_offset;
+ const int src_stride = xd->cur_buf->y_stride;
+ const int src_width = xd->cur_buf->y_width;
+
+ int dst_mb_offset =
+ mi_row * MI_SIZE * tpl_frame->rec_picture->y_stride + mi_col * MI_SIZE;
+ uint8_t *dst_buffer = tpl_frame->rec_picture->y_buffer + dst_mb_offset;
+ int dst_buffer_stride = tpl_frame->rec_picture->y_stride;
+ int use_y_only_rate_distortion = tpl_sf->use_y_only_rate_distortion;
+
+ uint8_t *rec_buffer_pool[3] = {
+ tpl_frame->rec_picture->y_buffer,
+ tpl_frame->rec_picture->u_buffer,
+ tpl_frame->rec_picture->v_buffer,
+ };
+
+ const int rec_stride_pool[3] = {
+ tpl_frame->rec_picture->y_stride,
+ tpl_frame->rec_picture->uv_stride,
+ tpl_frame->rec_picture->uv_stride,
+ };
+
+ for (int plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ pd->subsampling_x = xd->cur_buf->subsampling_x;
+ pd->subsampling_y = xd->cur_buf->subsampling_y;
+ }
+
+ uint8_t *predictor8 = tpl_tmp_buffers->predictor8;
+ int16_t *src_diff = tpl_tmp_buffers->src_diff;
+ tran_low_t *coeff = tpl_tmp_buffers->coeff;
+ tran_low_t *qcoeff = tpl_tmp_buffers->qcoeff;
+ tran_low_t *dqcoeff = tpl_tmp_buffers->dqcoeff;
+ uint8_t *predictor =
+ is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
+ int64_t recon_error = 1;
+ int64_t pred_error = 1;
+
+ memset(tpl_stats, 0, sizeof(*tpl_stats));
+ tpl_stats->ref_frame_index[0] = -1;
+ tpl_stats->ref_frame_index[1] = -1;
+
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+ mi_row, mi_col);
+ set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width,
+ cm->mi_params.mi_rows, cm->mi_params.mi_cols);
+ set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize],
+ av1_num_planes(cm));
+ xd->mi[0]->bsize = bsize;
+ xd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+
+ // Intra prediction search
+ xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+
+  // Pre-load the bottom-left line by replicating the last available pixel of
+  // the left neighboring column.
+ if (xd->left_available &&
+ mi_row + tx_size_high_unit[tx_size] < xd->tile.mi_row_end) {
+ if (is_cur_buf_hbd(xd)) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_buffer);
+ for (int i = 0; i < bw; ++i)
+ dst[(bw + i) * dst_buffer_stride - 1] =
+ dst[(bw - 1) * dst_buffer_stride - 1];
+ } else {
+ for (int i = 0; i < bw; ++i)
+ dst_buffer[(bw + i) * dst_buffer_stride - 1] =
+ dst_buffer[(bw - 1) * dst_buffer_stride - 1];
+ }
+ }
+
+  // If cpi->sf.tpl_sf.prune_intra_modes is on, then search only DC_PRED,
+  // H_PRED, and V_PRED.
+ const PREDICTION_MODE last_intra_mode =
+ tpl_sf->prune_intra_modes ? D45_PRED : INTRA_MODE_END;
+ const SequenceHeader *seq_params = cm->seq_params;
+ for (PREDICTION_MODE mode = INTRA_MODE_START; mode < last_intra_mode;
+ ++mode) {
+ av1_predict_intra_block(xd, seq_params->sb_size,
+ seq_params->enable_intra_edge_filter,
+ block_size_wide[bsize], block_size_high[bsize],
+ tx_size, mode, 0, 0, FILTER_INTRA_MODES, dst_buffer,
+ dst_buffer_stride, predictor, bw, 0, 0, 0);
+
+ if (tpl_frame->use_pred_sad) {
+ intra_cost = (int32_t)cpi->ppi->fn_ptr[bsize].sdf(
+ src_mb_buffer, src_stride, predictor, bw);
+ } else {
+ intra_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+ }
+
+ if (intra_cost < best_intra_cost) {
+ best_intra_cost = intra_cost;
+ best_mode = mode;
+ }
+ }
+  // Calculate the SATD of the best intra mode if SAD was used for the mode
+  // decision, as best_intra_cost is used in the ML model to skip intra mode
+  // evaluation.
+ if (tpl_frame->use_pred_sad) {
+ av1_predict_intra_block(
+ xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+ block_size_wide[bsize], block_size_high[bsize], tx_size, best_mode, 0,
+ 0, FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, predictor, bw, 0,
+ 0, 0);
+ best_intra_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+ }
+
+ int rate_cost = 1;
+
+ if (cpi->use_ducky_encode) {
+ get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+ qcoeff, dqcoeff, cm, x, NULL, rec_buffer_pool,
+ rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+ use_y_only_rate_distortion, 1 /*do_recon*/, NULL);
+
+ tpl_stats->intra_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->intra_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->intra_rate = rate_cost;
+ }
+
+ if (cpi->third_pass_ctx &&
+ frame_offset < cpi->third_pass_ctx->frame_info_count &&
+ tpl_data->frame_idx < gf_group->size) {
+ double ratio_h, ratio_w;
+ av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height,
+ cm->width, &ratio_h, &ratio_w);
+ THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+ cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w);
+
+ PREDICTION_MODE third_pass_mode = this_mi->pred_mode;
+
+ if (third_pass_mode >= last_intra_mode &&
+ third_pass_mode < INTRA_MODE_END) {
+ av1_predict_intra_block(
+ xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+ block_size_wide[bsize], block_size_high[bsize], tx_size,
+ third_pass_mode, 0, 0, FILTER_INTRA_MODES, dst_buffer,
+ dst_buffer_stride, predictor, bw, 0, 0, 0);
+
+ intra_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+
+ if (intra_cost < best_intra_cost) {
+ best_intra_cost = intra_cost;
+ best_mode = third_pass_mode;
+ }
+ }
+ }
+
+ // Motion compensated prediction
+ xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+ xd->mi[0]->ref_frame[1] = NONE_FRAME;
+ xd->mi[0]->compound_idx = 1;
+
+ int best_rf_idx = -1;
+ int_mv best_mv[2];
+ int32_t inter_cost;
+ int32_t best_inter_cost = INT32_MAX;
+ int rf_idx;
+ int_mv single_mv[INTER_REFS_PER_FRAME];
+
+ best_mv[0].as_int = INVALID_MV;
+ best_mv[1].as_int = INVALID_MV;
+
+ for (rf_idx = 0; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx) {
+ single_mv[rf_idx].as_int = INVALID_MV;
+ if (tpl_data->ref_frame[rf_idx] == NULL ||
+ tpl_data->src_ref_frame[rf_idx] == NULL) {
+ tpl_stats->mv[rf_idx].as_int = INVALID_MV;
+ continue;
+ }
+
+ const YV12_BUFFER_CONFIG *ref_frame_ptr = tpl_data->src_ref_frame[rf_idx];
+ const int ref_mb_offset =
+ mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE;
+ uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset;
+ const int ref_stride = ref_frame_ptr->y_stride;
+ const int ref_width = ref_frame_ptr->y_width;
+
+ int_mv best_rfidx_mv = { 0 };
+ uint32_t bestsme = UINT32_MAX;
+
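+    // Candidate starting MVs for the full-pel search: the zero MV plus the
+    // MVs chosen by the above, left and above-right tpl blocks for this
+    // reference frame (near-duplicates are skipped via is_alike_mv()).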
+ center_mv_t center_mvs[4] = { { { 0 }, INT_MAX },
+ { { 0 }, INT_MAX },
+ { { 0 }, INT_MAX },
+ { { 0 }, INT_MAX } };
+ int refmv_count = 1;
+ int idx;
+
+ if (xd->up_available) {
+ TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+ mi_row - mi_height, mi_col, tpl_frame->stride, block_mis_log2)];
+ if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
+ tpl_sf->skip_alike_starting_mv)) {
+ center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int;
+ ++refmv_count;
+ }
+ }
+
+ if (xd->left_available) {
+ TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+ mi_row, mi_col - mi_width, tpl_frame->stride, block_mis_log2)];
+ if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
+ tpl_sf->skip_alike_starting_mv)) {
+ center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int;
+ ++refmv_count;
+ }
+ }
+
+ if (xd->up_available && mi_col + mi_width < xd->tile.mi_col_end) {
+ TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+ mi_row - mi_height, mi_col + mi_width, tpl_frame->stride,
+ block_mis_log2)];
+ if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count,
+ tpl_sf->skip_alike_starting_mv)) {
+ center_mvs[refmv_count].mv.as_int = ref_tpl_stats->mv[rf_idx].as_int;
+ ++refmv_count;
+ }
+ }
+
+ if (cpi->third_pass_ctx &&
+ frame_offset < cpi->third_pass_ctx->frame_info_count &&
+ tpl_data->frame_idx < gf_group->size) {
+ double ratio_h, ratio_w;
+ av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height,
+ cm->width, &ratio_h, &ratio_w);
+ THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+ cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w);
+
+ int_mv tp_mv = av1_get_third_pass_adjusted_mv(this_mi, ratio_h, ratio_w,
+ rf_idx + LAST_FRAME);
+ if (tp_mv.as_int != INVALID_MV &&
+ !is_alike_mv(tp_mv, center_mvs + 1, refmv_count - 1,
+ tpl_sf->skip_alike_starting_mv)) {
+ center_mvs[0].mv = tp_mv;
+ }
+ }
+
+ // Prune starting mvs
+ if (tpl_sf->prune_starting_mv && refmv_count > 1) {
+ // Get each center mv's sad.
+ for (idx = 0; idx < refmv_count; ++idx) {
+ FULLPEL_MV mv = get_fullmv_from_mv(&center_mvs[idx].mv.as_mv);
+ clamp_fullmv(&mv, &x->mv_limits);
+ center_mvs[idx].sad = (int)cpi->ppi->fn_ptr[bsize].sdf(
+ src_mb_buffer, src_stride, &ref_mb[mv.row * ref_stride + mv.col],
+ ref_stride);
+ }
+
+ // Rank center_mv using sad.
+ qsort(center_mvs, refmv_count, sizeof(center_mvs[0]), compare_sad);
+
+ refmv_count = AOMMIN(4 - tpl_sf->prune_starting_mv, refmv_count);
+ // Further reduce number of refmv based on sad difference.
+ if (refmv_count > 1) {
+ int last_sad = center_mvs[refmv_count - 1].sad;
+ int second_to_last_sad = center_mvs[refmv_count - 2].sad;
+ if ((last_sad - second_to_last_sad) * 5 > second_to_last_sad)
+ refmv_count--;
+ }
+ }
+
+ for (idx = 0; idx < refmv_count; ++idx) {
+ int_mv this_mv;
+ uint32_t thissme = motion_estimation(
+ cpi, x, src_mb_buffer, ref_mb, src_stride, ref_stride, src_width,
+ ref_width, bsize, center_mvs[idx].mv.as_mv, &this_mv);
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ best_rfidx_mv = this_mv;
+ }
+ }
+
+ tpl_stats->mv[rf_idx].as_int = best_rfidx_mv.as_int;
+ single_mv[rf_idx] = best_rfidx_mv;
+
+ inter_cost = get_inter_cost(
+ cpi, xd, src_mb_buffer, src_stride, tpl_tmp_buffers, bsize, tx_size,
+ mi_row, mi_col, rf_idx, &best_rfidx_mv.as_mv, tpl_frame->use_pred_sad);
+ // Store inter cost for each ref frame. This is used to prune inter modes.
+ tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost);
+
+ if (inter_cost < best_inter_cost) {
+ best_rf_idx = rf_idx;
+
+ best_inter_cost = inter_cost;
+ best_mv[0].as_int = best_rfidx_mv.as_int;
+ }
+ }
+  // Calculate the SATD of the best inter mode if SAD was used for the mode
+  // decision, as best_inter_cost is used in the ML model to skip intra mode
+  // evaluation.
+ if (best_inter_cost < INT32_MAX && tpl_frame->use_pred_sad) {
+ assert(best_rf_idx != -1);
+ best_inter_cost = get_inter_cost(
+ cpi, xd, src_mb_buffer, src_stride, tpl_tmp_buffers, bsize, tx_size,
+ mi_row, mi_col, best_rf_idx, &best_mv[0].as_mv, 0 /* use_pred_sad */);
+ }
+
+ if (best_rf_idx != -1 && best_inter_cost < best_intra_cost) {
+ best_mode = NEWMV;
+ xd->mi[0]->ref_frame[0] = best_rf_idx + LAST_FRAME;
+ xd->mi[0]->mv[0].as_int = best_mv[0].as_int;
+ }
+
+  // Start compound prediction search.
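+  // Each entry of comp_ref_frames is a pair of reference frame offsets from
+  // LAST_FRAME: { LAST, BWDREF }, { LAST, ALTREF } and { GOLDEN, ALTREF }.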
+ int comp_ref_frames[3][2] = {
+ { 0, 4 },
+ { 0, 6 },
+ { 3, 6 },
+ };
+
+ int start_rf = 0;
+ int end_rf = 3;
+ if (!tpl_sf->allow_compound_pred) end_rf = 0;
+ if (cpi->third_pass_ctx &&
+ frame_offset < cpi->third_pass_ctx->frame_info_count &&
+ tpl_data->frame_idx < gf_group->size) {
+ double ratio_h, ratio_w;
+ av1_get_third_pass_ratio(cpi->third_pass_ctx, frame_offset, cm->height,
+ cm->width, &ratio_h, &ratio_w);
+ THIRD_PASS_MI_INFO *this_mi = av1_get_third_pass_mi(
+ cpi->third_pass_ctx, frame_offset, mi_row, mi_col, ratio_h, ratio_w);
+
+ if (this_mi->ref_frame[0] >= LAST_FRAME &&
+ this_mi->ref_frame[1] >= LAST_FRAME) {
+ int found = 0;
+ for (int i = 0; i < 3; i++) {
+ if (comp_ref_frames[i][0] + LAST_FRAME == this_mi->ref_frame[0] &&
+ comp_ref_frames[i][1] + LAST_FRAME == this_mi->ref_frame[1]) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found || !tpl_sf->allow_compound_pred) {
+ comp_ref_frames[2][0] = this_mi->ref_frame[0] - LAST_FRAME;
+ comp_ref_frames[2][1] = this_mi->ref_frame[1] - LAST_FRAME;
+ if (!tpl_sf->allow_compound_pred) {
+ start_rf = 2;
+ end_rf = 3;
+ }
+ }
+ }
+ }
+
+ xd->mi_row = mi_row;
+ xd->mi_col = mi_col;
+ int best_cmp_rf_idx = -1;
+ const int_interpfilters kernel =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ for (int cmp_rf_idx = start_rf; cmp_rf_idx < end_rf; ++cmp_rf_idx) {
+ int rf_idx0 = comp_ref_frames[cmp_rf_idx][0];
+ int rf_idx1 = comp_ref_frames[cmp_rf_idx][1];
+
+ if (tpl_data->ref_frame[rf_idx0] == NULL ||
+ tpl_data->src_ref_frame[rf_idx0] == NULL ||
+ tpl_data->ref_frame[rf_idx1] == NULL ||
+ tpl_data->src_ref_frame[rf_idx1] == NULL) {
+ continue;
+ }
+
+ const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = {
+ tpl_data->src_ref_frame[rf_idx0],
+ tpl_data->src_ref_frame[rf_idx1],
+ };
+
+ xd->mi[0]->ref_frame[0] = rf_idx0 + LAST_FRAME;
+ xd->mi[0]->ref_frame[1] = rf_idx1 + LAST_FRAME;
+ xd->mi[0]->mode = NEW_NEWMV;
+ const int8_t ref_frame_type = av1_ref_frame_type(xd->mi[0]->ref_frame);
+ // Set up ref_mv for av1_joint_motion_search().
+ CANDIDATE_MV *this_ref_mv_stack = x->mbmi_ext.ref_mv_stack[ref_frame_type];
+ this_ref_mv_stack[xd->mi[0]->ref_mv_idx].this_mv = single_mv[rf_idx0];
+ this_ref_mv_stack[xd->mi[0]->ref_mv_idx].comp_mv = single_mv[rf_idx1];
+
+ struct buf_2d yv12_mb[2][MAX_MB_PLANE];
+ for (int i = 0; i < 2; ++i) {
+ av1_setup_pred_block(xd, yv12_mb[i], ref_frame_ptr[i],
+ xd->block_ref_scale_factors[i],
+ xd->block_ref_scale_factors[i], MAX_MB_PLANE);
+ for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ xd->plane[plane].pre[i] = yv12_mb[i][plane];
+ }
+ }
+
+ int_mv tmp_mv[2] = { single_mv[rf_idx0], single_mv[rf_idx1] };
+ int rate_mv;
+ av1_joint_motion_search(cpi, x, bsize, tmp_mv, NULL, 0, &rate_mv,
+ !cpi->sf.mv_sf.disable_second_mv,
+ NUM_JOINT_ME_REFINE_ITER);
+
+ for (int ref = 0; ref < 2; ++ref) {
+ struct buf_2d ref_buf = { NULL, ref_frame_ptr[ref]->y_buffer,
+ ref_frame_ptr[ref]->y_width,
+ ref_frame_ptr[ref]->y_height,
+ ref_frame_ptr[ref]->y_stride };
+ InterPredParams inter_pred_params;
+ av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE,
+ mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd),
+ 0, &tpl_data->sf, &ref_buf, kernel);
+ av1_init_comp_mode(&inter_pred_params);
+
+ inter_pred_params.conv_params = get_conv_params_no_round(
+ ref, 0, xd->tmp_conv_dst, MAX_SB_SIZE, 1, xd->bd);
+
+ av1_enc_build_one_inter_predictor(predictor, bw, &tmp_mv[ref].as_mv,
+ &inter_pred_params);
+ }
+ inter_cost =
+ tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+ predictor, bw, coeff, bw, bh, tx_size);
+ if (inter_cost < best_inter_cost) {
+ best_cmp_rf_idx = cmp_rf_idx;
+ best_inter_cost = inter_cost;
+ best_mv[0] = tmp_mv[0];
+ best_mv[1] = tmp_mv[1];
+ }
+ }
+
+ if (best_cmp_rf_idx != -1 && best_inter_cost < best_intra_cost) {
+ best_mode = NEW_NEWMV;
+ const int best_rf_idx0 = comp_ref_frames[best_cmp_rf_idx][0];
+ const int best_rf_idx1 = comp_ref_frames[best_cmp_rf_idx][1];
+ xd->mi[0]->ref_frame[0] = best_rf_idx0 + LAST_FRAME;
+ xd->mi[0]->ref_frame[1] = best_rf_idx1 + LAST_FRAME;
+ }
+
+ if (best_inter_cost < INT32_MAX && is_inter_mode(best_mode)) {
+ xd->mi[0]->mv[0].as_int = best_mv[0].as_int;
+ xd->mi[0]->mv[1].as_int = best_mv[1].as_int;
+ const YV12_BUFFER_CONFIG *ref_frame_ptr[2] = {
+ best_cmp_rf_idx >= 0
+ ? tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]
+ : tpl_data->src_ref_frame[best_rf_idx],
+ best_cmp_rf_idx >= 0
+ ? tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]
+ : NULL,
+ };
+ rate_cost = 1;
+ get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+ qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+ rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+ use_y_only_rate_distortion, 0 /*do_recon*/, NULL);
+ tpl_stats->srcrf_rate = rate_cost;
+ }
+
+ best_intra_cost = AOMMAX(best_intra_cost, 1);
+ best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost);
+ tpl_stats->inter_cost = best_inter_cost;
+ tpl_stats->intra_cost = best_intra_cost;
+
+ tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
+
+ // Final encode
+ rate_cost = 0;
+ const YV12_BUFFER_CONFIG *ref_frame_ptr[2];
+
+ ref_frame_ptr[0] =
+ best_mode == NEW_NEWMV
+ ? tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]]
+ : best_rf_idx >= 0 ? tpl_data->ref_frame[best_rf_idx]
+ : NULL;
+ ref_frame_ptr[1] =
+ best_mode == NEW_NEWMV
+ ? tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]
+ : NULL;
+ get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+ qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+ rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+ use_y_only_rate_distortion, 1 /*do_recon*/,
+ tpl_txfm_stats);
+
+ tpl_stats->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->recrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->recrf_rate = rate_cost;
+
+ if (!is_inter_mode(best_mode)) {
+ tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->srcrf_rate = rate_cost;
+ tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
+ }
+
+ tpl_stats->recrf_dist = AOMMAX(tpl_stats->srcrf_dist, tpl_stats->recrf_dist);
+ tpl_stats->recrf_rate = AOMMAX(tpl_stats->srcrf_rate, tpl_stats->recrf_rate);
+
+ if (best_mode == NEW_NEWMV) {
+ ref_frame_ptr[0] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]];
+ ref_frame_ptr[1] =
+ tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]];
+ get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+ qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+ rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+ use_y_only_rate_distortion, 1 /*do_recon*/, NULL);
+ tpl_stats->cmp_recrf_dist[0] = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->cmp_recrf_rate[0] = rate_cost;
+
+ tpl_stats->cmp_recrf_dist[0] =
+ AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[0]);
+ tpl_stats->cmp_recrf_rate[0] =
+ AOMMAX(tpl_stats->srcrf_rate, tpl_stats->cmp_recrf_rate[0]);
+
+ tpl_stats->cmp_recrf_dist[0] =
+ AOMMIN(tpl_stats->recrf_dist, tpl_stats->cmp_recrf_dist[0]);
+ tpl_stats->cmp_recrf_rate[0] =
+ AOMMIN(tpl_stats->recrf_rate, tpl_stats->cmp_recrf_rate[0]);
+
+ rate_cost = 0;
+ ref_frame_ptr[0] =
+ tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]];
+ ref_frame_ptr[1] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]];
+ get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+ qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+ rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
+ use_y_only_rate_distortion, 1 /*do_recon*/, NULL);
+ tpl_stats->cmp_recrf_dist[1] = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->cmp_recrf_rate[1] = rate_cost;
+
+ tpl_stats->cmp_recrf_dist[1] =
+ AOMMAX(tpl_stats->srcrf_dist, tpl_stats->cmp_recrf_dist[1]);
+ tpl_stats->cmp_recrf_rate[1] =
+ AOMMAX(tpl_stats->srcrf_rate, tpl_stats->cmp_recrf_rate[1]);
+
+ tpl_stats->cmp_recrf_dist[1] =
+ AOMMIN(tpl_stats->recrf_dist, tpl_stats->cmp_recrf_dist[1]);
+ tpl_stats->cmp_recrf_rate[1] =
+ AOMMIN(tpl_stats->recrf_rate, tpl_stats->cmp_recrf_rate[1]);
+ }
+
+ if (best_mode == NEWMV) {
+ tpl_stats->mv[best_rf_idx] = best_mv[0];
+ tpl_stats->ref_frame_index[0] = best_rf_idx;
+ tpl_stats->ref_frame_index[1] = NONE_FRAME;
+ } else if (best_mode == NEW_NEWMV) {
+ tpl_stats->ref_frame_index[0] = comp_ref_frames[best_cmp_rf_idx][0];
+ tpl_stats->ref_frame_index[1] = comp_ref_frames[best_cmp_rf_idx][1];
+ tpl_stats->mv[tpl_stats->ref_frame_index[0]] = best_mv[0];
+ tpl_stats->mv[tpl_stats->ref_frame_index[1]] = best_mv[1];
+ }
+
+ for (int idy = 0; idy < mi_height; ++idy) {
+ for (int idx = 0; idx < mi_width; ++idx) {
+ if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > idx &&
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > idy) {
+ xd->mi[idx + idy * cm->mi_params.mi_stride] = xd->mi[0];
+ }
+ }
+ }
+}
+
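+// Divides ref_pos by bsize_pix, rounding towards negative infinity, e.g.
+// round_floor(-1, 16) == -1 whereas plain integer division would give 0.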
+static int round_floor(int ref_pos, int bsize_pix) {
+ int round;
+ if (ref_pos < 0)
+ round = -(1 + (-ref_pos - 1) / bsize_pix);
+ else
+ round = ref_pos / bsize_pix;
+
+ return round;
+}
+
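+// Returns the intersection area of two width x height blocks whose top-left
+// corners are at (row_a, col_a) and (row_b, col_b), or 0 if they are
+// disjoint.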
+int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width,
+ int height) {
+ int min_row = AOMMAX(row_a, row_b);
+ int max_row = AOMMIN(row_a + height, row_b + height);
+ int min_col = AOMMAX(col_a, col_b);
+ int max_col = AOMMIN(col_a + width, col_b + width);
+ if (min_row < max_row && min_col < max_col) {
+ return (max_row - min_row) * (max_col - min_col);
+ }
+ return 0;
+}
+
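+// Maps an mi unit position to its index in the tpl stats grid, where each
+// grid cell spans (1 << right_shift) mi units in each dimension.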
+int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift) {
+ return (mi_row >> right_shift) * stride + (mi_col >> right_shift);
+}
+
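+// Estimates the rate cost propagated to a reference block given the rate
+// difference delta_rate and the distortion ratio beta = srcrf_dist /
+// recrf_dist. The computation works in bits per pixel in the log2 domain and
+// converts the result back to the tpl fixed-point cost scale.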
+int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist,
+ int64_t srcrf_dist, int pix_num) {
+ double beta = (double)srcrf_dist / recrf_dist;
+ int64_t rate_cost = delta_rate;
+
+ if (srcrf_dist <= 128) return rate_cost;
+
+ double dr =
+ (double)(delta_rate >> (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT)) /
+ pix_num;
+
+ double log_den = log(beta) / log(2.0) + 2.0 * dr;
+
+ if (log_den > log(10.0) / log(2.0)) {
+ rate_cost = (int64_t)((log(1.0 / beta) * pix_num) / log(2.0) / 2.0);
+ rate_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT);
+ return rate_cost;
+ }
+
+ double num = pow(2.0, log_den);
+ double den = num * beta + (1 - beta) * beta;
+
+ rate_cost = (int64_t)((pix_num * log(num / den)) / log(2.0) / 2.0);
+
+ rate_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT);
+
+ return rate_cost;
+}
+
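+// Propagates the dependency stats of the block at (mi_row, mi_col) in frame
+// frame_idx back to the (up to four) grid-aligned blocks that its motion
+// compensated prediction overlaps in the selected reference frame, weighting
+// each contribution by the overlap area.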
+static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row,
+ int mi_col, const BLOCK_SIZE bsize,
+ int frame_idx, int ref) {
+ TplDepFrame *tpl_frame_ptr = &tpl_data->tpl_frame[frame_idx];
+ TplDepStats *tpl_ptr = tpl_frame_ptr->tpl_stats_ptr;
+ TplDepFrame *tpl_frame = tpl_data->tpl_frame;
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+ TplDepStats *tpl_stats_ptr = &tpl_ptr[av1_tpl_ptr_pos(
+ mi_row, mi_col, tpl_frame->stride, block_mis_log2)];
+
+ int is_compound = tpl_stats_ptr->ref_frame_index[1] >= 0;
+
+ if (tpl_stats_ptr->ref_frame_index[ref] < 0) return;
+ const int ref_frame_index = tpl_stats_ptr->ref_frame_index[ref];
+ TplDepFrame *ref_tpl_frame =
+ &tpl_frame[tpl_frame[frame_idx].ref_map_index[ref_frame_index]];
+ TplDepStats *ref_stats_ptr = ref_tpl_frame->tpl_stats_ptr;
+
+ if (tpl_frame[frame_idx].ref_map_index[ref_frame_index] < 0) return;
+
+ const FULLPEL_MV full_mv =
+ get_fullmv_from_mv(&tpl_stats_ptr->mv[ref_frame_index].as_mv);
+ const int ref_pos_row = mi_row * MI_SIZE + full_mv.row;
+ const int ref_pos_col = mi_col * MI_SIZE + full_mv.col;
+
+ const int bw = 4 << mi_size_wide_log2[bsize];
+ const int bh = 4 << mi_size_high_log2[bsize];
+ const int mi_height = mi_size_high[bsize];
+ const int mi_width = mi_size_wide[bsize];
+ const int pix_num = bw * bh;
+
+  // Top-left pixel position of the grid-aligned block containing ref_pos.
+ int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh;
+ int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
+ int block;
+
+ int64_t srcrf_dist = is_compound ? tpl_stats_ptr->cmp_recrf_dist[!ref]
+ : tpl_stats_ptr->srcrf_dist;
+ int64_t srcrf_rate =
+ is_compound
+ ? (tpl_stats_ptr->cmp_recrf_rate[!ref] << TPL_DEP_COST_SCALE_LOG2)
+ : (tpl_stats_ptr->srcrf_rate << TPL_DEP_COST_SCALE_LOG2);
+
+ int64_t cur_dep_dist = tpl_stats_ptr->recrf_dist - srcrf_dist;
+ int64_t mc_dep_dist =
+ (int64_t)(tpl_stats_ptr->mc_dep_dist *
+ ((double)(tpl_stats_ptr->recrf_dist - srcrf_dist) /
+ tpl_stats_ptr->recrf_dist));
+ int64_t delta_rate =
+ (tpl_stats_ptr->recrf_rate << TPL_DEP_COST_SCALE_LOG2) - srcrf_rate;
+ int64_t mc_dep_rate =
+ av1_delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist,
+ srcrf_dist, pix_num);
+
+ for (block = 0; block < 4; ++block) {
+ int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
+ int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
+
+ if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
+ grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
+ int overlap_area = av1_get_overlap_area(grid_pos_row, grid_pos_col,
+ ref_pos_row, ref_pos_col, bw, bh);
+ int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
+ int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
+ assert((1 << block_mis_log2) == mi_height);
+ assert((1 << block_mis_log2) == mi_width);
+ TplDepStats *des_stats = &ref_stats_ptr[av1_tpl_ptr_pos(
+ ref_mi_row, ref_mi_col, ref_tpl_frame->stride, block_mis_log2)];
+ des_stats->mc_dep_dist +=
+ ((cur_dep_dist + mc_dep_dist) * overlap_area) / pix_num;
+ des_stats->mc_dep_rate +=
+ ((delta_rate + mc_dep_rate) * overlap_area) / pix_num;
+ }
+ }
+}
+
+static AOM_INLINE void tpl_model_update(TplParams *const tpl_data, int mi_row,
+ int mi_col, int frame_idx) {
+ const BLOCK_SIZE tpl_stats_block_size =
+ convert_length_to_bsize(MI_SIZE << tpl_data->tpl_stats_block_mis_log2);
+ tpl_model_update_b(tpl_data, mi_row, mi_col, tpl_stats_block_size, frame_idx,
+ 0);
+ tpl_model_update_b(tpl_data, mi_row, mi_col, tpl_stats_block_size, frame_idx,
+ 1);
+}
+
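+// Stores src_stats into the tpl stats grid, clamping every cost, rate and
+// distortion field to at least 1 so that later ratio and log computations
+// never see a zero.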
+static AOM_INLINE void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row,
+ int mi_col, int stride,
+ const TplDepStats *src_stats,
+ uint8_t block_mis_log2) {
+ int index = av1_tpl_ptr_pos(mi_row, mi_col, stride, block_mis_log2);
+ TplDepStats *tpl_ptr = &tpl_stats_ptr[index];
+ *tpl_ptr = *src_stats;
+ tpl_ptr->intra_cost = AOMMAX(1, tpl_ptr->intra_cost);
+ tpl_ptr->inter_cost = AOMMAX(1, tpl_ptr->inter_cost);
+ tpl_ptr->srcrf_dist = AOMMAX(1, tpl_ptr->srcrf_dist);
+ tpl_ptr->srcrf_sse = AOMMAX(1, tpl_ptr->srcrf_sse);
+ tpl_ptr->recrf_dist = AOMMAX(1, tpl_ptr->recrf_dist);
+ tpl_ptr->srcrf_rate = AOMMAX(1, tpl_ptr->srcrf_rate);
+ tpl_ptr->recrf_rate = AOMMAX(1, tpl_ptr->recrf_rate);
+ tpl_ptr->cmp_recrf_dist[0] = AOMMAX(1, tpl_ptr->cmp_recrf_dist[0]);
+ tpl_ptr->cmp_recrf_dist[1] = AOMMAX(1, tpl_ptr->cmp_recrf_dist[1]);
+ tpl_ptr->cmp_recrf_rate[0] = AOMMAX(1, tpl_ptr->cmp_recrf_rate[0]);
+ tpl_ptr->cmp_recrf_rate[1] = AOMMAX(1, tpl_ptr->cmp_recrf_rate[1]);
+}
+
+// Reset the ref and source frame pointers of tpl_data.
+static AOM_INLINE void tpl_reset_src_ref_frames(TplParams *tpl_data) {
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ tpl_data->ref_frame[i] = NULL;
+ tpl_data->src_ref_frame[i] = NULL;
+ }
+}
+
+static AOM_INLINE int get_gop_length(const GF_GROUP *gf_group) {
+ int gop_length = AOMMIN(gf_group->size, MAX_TPL_FRAME_IDX - 1);
+ return gop_length;
+}
+
+// Initialize the mc_flow parameters used in computing tpl data.
+static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx,
+ int pframe_qindex) {
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
+ const YV12_BUFFER_CONFIG *this_frame = tpl_frame->gf_picture;
+ const YV12_BUFFER_CONFIG *ref_frames_ordered[INTER_REFS_PER_FRAME];
+ uint32_t ref_frame_display_indices[INTER_REFS_PER_FRAME];
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf;
+ int ref_pruning_enabled = is_frame_eligible_for_ref_pruning(
+ gf_group, cpi->sf.inter_sf.selective_ref_frame,
+ tpl_sf->prune_ref_frames_in_tpl, frame_idx);
+ int gop_length = get_gop_length(gf_group);
+ int ref_frame_flags;
+ AV1_COMMON *cm = &cpi->common;
+ int rdmult, idx;
+ ThreadData *td = &cpi->td;
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats;
+ tpl_data->frame_idx = frame_idx;
+ tpl_reset_src_ref_frames(tpl_data);
+ av1_tile_init(&xd->tile, cm, 0, 0);
+
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+ // Setup scaling factor
+ av1_setup_scale_factors_for_frame(
+ &tpl_data->sf, this_frame->y_crop_width, this_frame->y_crop_height,
+ this_frame->y_crop_width, this_frame->y_crop_height);
+
+ xd->cur_buf = this_frame;
+
+ for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
+ TplDepFrame *tpl_ref_frame =
+ &tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]];
+ tpl_data->ref_frame[idx] = tpl_ref_frame->rec_picture;
+ tpl_data->src_ref_frame[idx] = tpl_ref_frame->gf_picture;
+ ref_frame_display_indices[idx] = tpl_ref_frame->frame_display_index;
+ }
+
+ // Store the reference frames based on priority order
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ ref_frames_ordered[i] =
+ tpl_data->ref_frame[ref_frame_priority_order[i] - 1];
+ }
+
+ // Work out which reference frame slots may be used.
+ ref_frame_flags =
+ get_ref_frame_flags(&cpi->sf, is_one_pass_rt_params(cpi),
+ ref_frames_ordered, cpi->ext_flags.ref_frame_flags);
+
+ enforce_max_ref_frames(cpi, &ref_frame_flags, ref_frame_display_indices,
+ tpl_frame->frame_display_index);
+
+ // Prune reference frames
+ for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
+ if ((ref_frame_flags & (1 << idx)) == 0) {
+ tpl_data->ref_frame[idx] = NULL;
+ }
+ }
+
+ // Skip motion estimation w.r.t. reference frames which are not
+ // considered in RD search, using "selective_ref_frame" speed feature.
+ // The reference frame pruning is not enabled for frames beyond the gop
+ // length, as there are fewer reference frames and the reference frames
+ // differ from the frames considered during RD search.
+ if (ref_pruning_enabled && (frame_idx < gop_length)) {
+ for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) {
+ const MV_REFERENCE_FRAME refs[2] = { idx + 1, NONE_FRAME };
+ if (prune_ref_by_selective_ref_frame(cpi, NULL, refs,
+ ref_frame_display_indices)) {
+ tpl_data->ref_frame[idx] = NULL;
+ }
+ }
+ }
+
+ // Make a temporary mbmi for tpl model
+ MB_MODE_INFO mbmi;
+ memset(&mbmi, 0, sizeof(mbmi));
+ MB_MODE_INFO *mbmi_ptr = &mbmi;
+ xd->mi = &mbmi_ptr;
+
+ xd->block_ref_scale_factors[0] = &tpl_data->sf;
+ xd->block_ref_scale_factors[1] = &tpl_data->sf;
+
+ const int base_qindex =
+ cpi->use_ducky_encode ? gf_group->q_val[frame_idx] : pframe_qindex;
+ // Get rd multiplier set up.
+ rdmult = (int)av1_compute_rd_mult(
+ base_qindex, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
+ if (rdmult < 1) rdmult = 1;
+ av1_set_error_per_bit(&x->errorperbit, rdmult);
+ av1_set_sad_per_bit(cpi, &x->sadperbit, base_qindex);
+
+ tpl_frame->is_valid = 1;
+
+ cm->quant_params.base_qindex = base_qindex;
+ av1_frame_init_quantizer(cpi);
+
+ const BitDepthInfo bd_info = get_bit_depth_info(xd);
+ const FRAME_UPDATE_TYPE update_type =
+ gf_group->update_type[cpi->gf_frame_index];
+ tpl_frame->base_rdmult = av1_compute_rd_mult_based_on_qindex(
+ bd_info.bit_depth, update_type, base_qindex) /
+ 6;
+
+ if (cpi->use_ducky_encode)
+ tpl_frame->base_rdmult = gf_group->rdmult_val[frame_idx];
+
+ av1_init_tpl_txfm_stats(tpl_txfm_stats);
+
+ // Initialize x->mbmi_ext when compound predictions are enabled.
+ if (tpl_sf->allow_compound_pred) av1_zero(x->mbmi_ext);
+
+ // Set the pointer to null since mbmi is only allocated inside this function.
+ assert(xd->mi == &mbmi_ptr);
+ xd->mi = NULL;
+
+  // The TPL module is called before frame-level speed features are set.
+  // Thus, turning off this speed feature for key frames is done here rather
+  // than in the speed feature settings themselves.
+ const int layer_depth_th = (tpl_sf->use_sad_for_mode_decision == 1) ? 5 : 0;
+ tpl_frame->use_pred_sad =
+ tpl_sf->use_sad_for_mode_decision &&
+ gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE &&
+ gf_group->layer_depth[frame_idx] >= layer_depth_th;
+}
+
+// Computes and stores the motion estimation dependencies (tpl stats) of all
+// the blocks in a row.
+void av1_mc_flow_dispenser_row(AV1_COMP *cpi, TplTxfmStats *tpl_txfm_stats,
+ TplBuffers *tpl_tmp_buffers, MACROBLOCK *x,
+ int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ AV1_COMMON *const cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const int mi_width = mi_size_wide[bsize];
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx];
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ const int tplb_cols_in_tile =
+ ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]);
+ const int tplb_row = ROUND_POWER_OF_TWO(mi_row, mi_size_high_log2[bsize]);
+ assert(mi_size_high[bsize] == (1 << tpl_data->tpl_stats_block_mis_log2));
+ assert(mi_size_wide[bsize] == (1 << tpl_data->tpl_stats_block_mis_log2));
+
+ for (int mi_col = 0, tplb_col_in_tile = 0; mi_col < mi_params->mi_cols;
+ mi_col += mi_width, tplb_col_in_tile++) {
+ (*tpl_row_mt->sync_read_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+ tplb_col_in_tile);
+
+#if CONFIG_MULTITHREAD
+ if (mt_info->num_workers > 1) {
+ pthread_mutex_lock(tpl_row_mt->mutex_);
+ const bool tpl_mt_exit = tpl_row_mt->tpl_mt_exit;
+ pthread_mutex_unlock(tpl_row_mt->mutex_);
+ // Exit in case any worker has encountered an error.
+ if (tpl_mt_exit) return;
+ }
+#endif
+
+ TplDepStats tpl_stats;
+
+ // Motion estimation column boundary
+ av1_set_mv_col_limits(mi_params, &x->mv_limits, mi_col, mi_width,
+ tpl_data->border_in_pixels);
+ xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
+ xd->mb_to_right_edge =
+ GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col);
+ mode_estimation(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row, mi_col,
+ bsize, tx_size, &tpl_stats);
+
+ // Motion flow dependency dispenser.
+ tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, tpl_frame->stride,
+ &tpl_stats, tpl_data->tpl_stats_block_mis_log2);
+ (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
+ tplb_col_in_tile, tplb_cols_in_tile);
+ }
+}
+
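+// Computes the tpl stats of every block row in the current frame, setting up
+// the motion estimation row boundaries before dispatching each row.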
+static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ ThreadData *td = &cpi->td;
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BLOCK_SIZE bsize =
+ convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
+ const TX_SIZE tx_size = max_txsize_lookup[bsize];
+ const int mi_height = mi_size_high[bsize];
+ for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) {
+ // Motion estimation row boundary
+ av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height,
+ cpi->ppi->tpl_data.border_in_pixels);
+ xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+ xd->mb_to_bottom_edge =
+ GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
+ av1_mc_flow_dispenser_row(cpi, &td->tpl_txfm_stats, &td->tpl_tmp_buffers, x,
+ mi_row, bsize, tx_size);
+ }
+}
+
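+// Runs tpl_model_update() over every tpl block of frame frame_idx, i.e. one
+// full backward-propagation step of the motion flow for that frame.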
+static void mc_flow_synthesizer(TplParams *tpl_data, int frame_idx, int mi_rows,
+ int mi_cols) {
+ if (!frame_idx) {
+ return;
+ }
+ const BLOCK_SIZE bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d);
+ const int mi_height = mi_size_high[bsize];
+ const int mi_width = mi_size_wide[bsize];
+ assert(mi_height == (1 << tpl_data->tpl_stats_block_mis_log2));
+ assert(mi_width == (1 << tpl_data->tpl_stats_block_mis_log2));
+
+ for (int mi_row = 0; mi_row < mi_rows; mi_row += mi_height) {
+ for (int mi_col = 0; mi_col < mi_cols; mi_col += mi_width) {
+ tpl_model_update(tpl_data, mi_row, mi_col, frame_idx);
+ }
+ }
+}
+
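+// Sets up, for each frame in the GF group (plus lookahead frames beyond it
+// when available), the source and reconstruction buffers and the reference
+// map indices used by the tpl model, and tracks pframe_qindex from LF_UPDATE
+// frames.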
+static AOM_INLINE void init_gop_frames_for_tpl(
+ AV1_COMP *cpi, const EncodeFrameParams *const init_frame_params,
+ GF_GROUP *gf_group, int *tpl_group_frames, int *pframe_qindex) {
+ AV1_COMMON *cm = &cpi->common;
+ assert(cpi->gf_frame_index == 0);
+ *pframe_qindex = 0;
+
+ RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
+ init_ref_map_pair(cpi, ref_frame_map_pairs);
+
+ int remapped_ref_idx[REF_FRAMES];
+
+ EncodeFrameParams frame_params = *init_frame_params;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+
+ int ref_picture_map[REF_FRAMES];
+
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ if (frame_params.frame_type == KEY_FRAME) {
+ tpl_data->tpl_frame[-i - 1].gf_picture = NULL;
+ tpl_data->tpl_frame[-i - 1].rec_picture = NULL;
+ tpl_data->tpl_frame[-i - 1].frame_display_index = 0;
+ } else {
+ tpl_data->tpl_frame[-i - 1].gf_picture = &cm->ref_frame_map[i]->buf;
+ tpl_data->tpl_frame[-i - 1].rec_picture = &cm->ref_frame_map[i]->buf;
+ tpl_data->tpl_frame[-i - 1].frame_display_index =
+ cm->ref_frame_map[i]->display_order_hint;
+ }
+
+ ref_picture_map[i] = -i - 1;
+ }
+
+ *tpl_group_frames = 0;
+
+ int gf_index;
+ int process_frame_count = 0;
+ const int gop_length = get_gop_length(gf_group);
+
+ for (gf_index = 0; gf_index < gop_length; ++gf_index) {
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index];
+ FRAME_UPDATE_TYPE frame_update_type = gf_group->update_type[gf_index];
+ int lookahead_index =
+ gf_group->cur_frame_idx[gf_index] + gf_group->arf_src_offset[gf_index];
+ frame_params.show_frame = frame_update_type != ARF_UPDATE &&
+ frame_update_type != INTNL_ARF_UPDATE;
+ frame_params.show_existing_frame =
+ frame_update_type == INTNL_OVERLAY_UPDATE ||
+ frame_update_type == OVERLAY_UPDATE;
+ frame_params.frame_type = gf_group->frame_type[gf_index];
+
+ if (frame_update_type == LF_UPDATE)
+ *pframe_qindex = gf_group->q_val[gf_index];
+
+ const struct lookahead_entry *buf = av1_lookahead_peek(
+ cpi->ppi->lookahead, lookahead_index, cpi->compressor_stage);
+ if (buf == NULL) break;
+ tpl_frame->gf_picture = &buf->img;
+
+    // Use the filtered frame buffer if available. This makes the tpl stats
+    // more precise.
+ FRAME_DIFF frame_diff;
+ const YV12_BUFFER_CONFIG *tf_buf =
+ av1_tf_info_get_filtered_buf(&cpi->ppi->tf_info, gf_index, &frame_diff);
+ if (tf_buf != NULL) {
+ tpl_frame->gf_picture = tf_buf;
+ }
+
+ // 'cm->current_frame.frame_number' is the display number
+ // of the current frame.
+ // 'lookahead_index' is frame offset within the gf group.
+ // 'lookahead_index + cm->current_frame.frame_number'
+ // is the display index of the frame.
+ tpl_frame->frame_display_index =
+ lookahead_index + cm->current_frame.frame_number;
+ assert(buf->display_idx ==
+ cpi->frame_index_set.show_frame_count + lookahead_index);
+
+ if (frame_update_type != OVERLAY_UPDATE &&
+ frame_update_type != INTNL_OVERLAY_UPDATE) {
+ tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count];
+ tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count];
+ ++process_frame_count;
+ }
+ const int true_disp = (int)(tpl_frame->frame_display_index);
+
+ av1_get_ref_frames(ref_frame_map_pairs, true_disp, cpi, gf_index, 0,
+ remapped_ref_idx);
+
+ int refresh_mask =
+ av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type,
+ gf_index, true_disp, ref_frame_map_pairs);
+
+    // Treat frames marked as is_frame_non_ref as non-reference frames.
+ if (cpi->ppi->gf_group.is_frame_non_ref[gf_index]) refresh_mask = 0;
+
+ int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask);
+
+ if (refresh_frame_map_index < REF_FRAMES &&
+ refresh_frame_map_index != INVALID_IDX) {
+ ref_frame_map_pairs[refresh_frame_map_index].disp_order =
+ AOMMAX(0, true_disp);
+ ref_frame_map_pairs[refresh_frame_map_index].pyr_level =
+ get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp,
+ cpi->ppi->gf_group.max_layer_depth);
+ }
+
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
+ tpl_frame->ref_map_index[i - LAST_FRAME] =
+ ref_picture_map[remapped_ref_idx[i - LAST_FRAME]];
+
+ if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index;
+
+ ++*tpl_group_frames;
+ }
+
+ const int tpl_extend = cpi->oxcf.gf_cfg.lag_in_frames - MAX_GF_INTERVAL;
+ int extend_frame_count = 0;
+ int extend_frame_length = AOMMIN(
+ tpl_extend, cpi->rc.frames_to_key - cpi->ppi->p_rc.baseline_gf_interval);
+
+ int frame_display_index = gf_group->cur_frame_idx[gop_length - 1] +
+ gf_group->arf_src_offset[gop_length - 1] + 1;
+
+ for (;
+ gf_index < MAX_TPL_FRAME_IDX && extend_frame_count < extend_frame_length;
+ ++gf_index) {
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index];
+ FRAME_UPDATE_TYPE frame_update_type = LF_UPDATE;
+ frame_params.show_frame = frame_update_type != ARF_UPDATE &&
+ frame_update_type != INTNL_ARF_UPDATE;
+ frame_params.show_existing_frame =
+ frame_update_type == INTNL_OVERLAY_UPDATE;
+ frame_params.frame_type = INTER_FRAME;
+
+ int lookahead_index = frame_display_index;
+ struct lookahead_entry *buf = av1_lookahead_peek(
+ cpi->ppi->lookahead, lookahead_index, cpi->compressor_stage);
+
+ if (buf == NULL) break;
+
+ tpl_frame->gf_picture = &buf->img;
+ tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count];
+ tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count];
+ // 'cm->current_frame.frame_number' is the display number
+ // of the current frame.
+ // 'frame_display_index' is frame offset within the gf group.
+ // 'frame_display_index + cm->current_frame.frame_number'
+ // is the display index of the frame.
+ tpl_frame->frame_display_index =
+ frame_display_index + cm->current_frame.frame_number;
+
+ ++process_frame_count;
+
+ gf_group->update_type[gf_index] = LF_UPDATE;
+
+#if CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+ if (cpi->oxcf.pass == AOM_RC_SECOND_PASS) {
+ if (cpi->oxcf.rc_cfg.mode == AOM_Q) {
+ *pframe_qindex = cpi->oxcf.rc_cfg.cq_level;
+ } else if (cpi->oxcf.rc_cfg.mode == AOM_VBR) {
+      // TODO(angiebird): Find a more adaptive method to decide
+      // pframe_qindex. Override pframe_qindex in the second pass when
+      // bitrate accuracy is on; we found that setting pframe_qindex this way
+      // makes the tpl stats more stable.
+ *pframe_qindex = 128;
+ }
+ }
+#endif // CONFIG_BITRATE_ACCURACY && CONFIG_THREE_PASS
+ gf_group->q_val[gf_index] = *pframe_qindex;
+ const int true_disp = (int)(tpl_frame->frame_display_index);
+ av1_get_ref_frames(ref_frame_map_pairs, true_disp, cpi, gf_index, 0,
+ remapped_ref_idx);
+ int refresh_mask =
+ av1_get_refresh_frame_flags(cpi, &frame_params, frame_update_type,
+ gf_index, true_disp, ref_frame_map_pairs);
+ int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask);
+
+ if (refresh_frame_map_index < REF_FRAMES &&
+ refresh_frame_map_index != INVALID_IDX) {
+ ref_frame_map_pairs[refresh_frame_map_index].disp_order =
+ AOMMAX(0, true_disp);
+ ref_frame_map_pairs[refresh_frame_map_index].pyr_level =
+ get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp,
+ cpi->ppi->gf_group.max_layer_depth);
+ }
+
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
+ tpl_frame->ref_map_index[i - LAST_FRAME] =
+ ref_picture_map[remapped_ref_idx[i - LAST_FRAME]];
+
+ tpl_frame->ref_map_index[ALTREF_FRAME - LAST_FRAME] = -1;
+ tpl_frame->ref_map_index[LAST3_FRAME - LAST_FRAME] = -1;
+ tpl_frame->ref_map_index[BWDREF_FRAME - LAST_FRAME] = -1;
+ tpl_frame->ref_map_index[ALTREF2_FRAME - LAST_FRAME] = -1;
+
+ if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index;
+
+ ++*tpl_group_frames;
+ ++extend_frame_count;
+ ++frame_display_index;
+ }
+}
+
+void av1_init_tpl_stats(TplParams *const tpl_data) {
+ tpl_data->ready = 0;
+ set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2,
+ &tpl_data->tpl_bsize_1d);
+ for (int frame_idx = 0; frame_idx < MAX_LENGTH_TPL_FRAME_STATS; ++frame_idx) {
+ TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx];
+ tpl_frame->is_valid = 0;
+ }
+ for (int frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
+ TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx];
+ if (tpl_data->tpl_stats_pool[frame_idx] == NULL) continue;
+ memset(tpl_data->tpl_stats_pool[frame_idx], 0,
+ tpl_frame->height * tpl_frame->width *
+ sizeof(*tpl_frame->tpl_stats_ptr));
+ }
+}
+
+int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index) {
+ if (tpl_data->ready == 0) {
+ return 0;
+ }
+ if (gf_frame_index >= MAX_TPL_FRAME_IDX) {
+    // The sub-GOP length exceeds the TPL buffer capacity, so the TPL-related
+    // functions are disabled hereafter.
+ return 0;
+ }
+ return tpl_data->tpl_frame[gf_frame_index].is_valid;
+}
+
+static AOM_INLINE int eval_gop_length(double *beta, int gop_eval) {
+ switch (gop_eval) {
+ case 1:
+ // Allow larger GOP size if the base layer ARF has higher dependency
+ // factor than the intermediate ARF and both ARFs have reasonably high
+ // dependency factors.
+ return (beta[0] >= beta[1] + 0.7) && beta[0] > 3.0;
+ case 2:
+ if ((beta[0] >= beta[1] + 0.4) && beta[0] > 1.6)
+ return 1; // Don't shorten the gf interval
+ else if ((beta[0] < beta[1] + 0.1) || beta[0] <= 1.4)
+ return 0; // Shorten the gf interval
+ else
+ return 2; // Cannot decide the gf interval, so redo the
+ // tpl stats calculation.
+ case 3: return beta[0] > 1.1;
+ default: return 2;
+ }
+}
+
+// TODO(jingning): Restructure av1_rc_pick_q_and_bounds() to narrow down
+// the scope of input arguments.
+void av1_tpl_preload_rc_estimate(AV1_COMP *cpi,
+ const EncodeFrameParams *const frame_params) {
+ AV1_COMMON *cm = &cpi->common;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ int bottom_index, top_index;
+ if (cpi->use_ducky_encode) return;
+
+ cm->current_frame.frame_type = frame_params->frame_type;
+ for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size;
+ ++gf_index) {
+ cm->current_frame.frame_type = gf_group->frame_type[gf_index];
+ cm->show_frame = gf_group->update_type[gf_index] != ARF_UPDATE &&
+ gf_group->update_type[gf_index] != INTNL_ARF_UPDATE;
+ gf_group->q_val[gf_index] = av1_rc_pick_q_and_bounds(
+ cpi, cm->width, cm->height, gf_index, &bottom_index, &top_index);
+ }
+}
+
+static AOM_INLINE int skip_tpl_for_frame(const GF_GROUP *gf_group,
+ int frame_idx, int gop_eval,
+ int approx_gop_eval,
+ int reduce_num_frames) {
+ // When gop_eval is set to 2, tpl stats calculation is done for ARFs from base
+ // layer, (base+1) layer and (base+2) layer. When gop_eval is set to 3,
+ // tpl stats calculation is limited to ARFs from base layer and (base+1)
+ // layer.
+ const int num_arf_layers = (gop_eval == 2) ? 3 : 2;
+ const int gop_length = get_gop_length(gf_group);
+
+ if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE ||
+ gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
+ return 1;
+
+ // When approx_gop_eval = 1, skip tpl stats calculation for higher layer
+ // frames and for frames beyond gop length.
+ if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers ||
+ frame_idx >= gop_length))
+ return 1;
+
+ if (reduce_num_frames && gf_group->update_type[frame_idx] == LF_UPDATE &&
+ frame_idx < gop_length)
+ return 1;
+
+ return 0;
+}
+
+int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval,
+ const EncodeFrameParams *const frame_params) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_tpl_setup_stats_time);
+#endif
+ assert(cpi->gf_frame_index == 0);
+ AV1_COMMON *cm = &cpi->common;
+ MultiThreadInfo *const mt_info = &cpi->mt_info;
+ AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ EncodeFrameParams this_frame_params = *frame_params;
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ int approx_gop_eval = (gop_eval > 1);
+
+ if (cpi->superres_mode != AOM_SUPERRES_NONE) {
+ assert(cpi->superres_mode != AOM_SUPERRES_AUTO);
+ av1_init_tpl_stats(tpl_data);
+ return 0;
+ }
+
+ cm->current_frame.frame_type = frame_params->frame_type;
+ for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size;
+ ++gf_index) {
+ cm->current_frame.frame_type = gf_group->frame_type[gf_index];
+ av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame,
+ gf_group->update_type[gf_index],
+ gf_group->refbuf_state[gf_index], 0);
+
+ memcpy(&cpi->refresh_frame, &this_frame_params.refresh_frame,
+ sizeof(cpi->refresh_frame));
+ }
+
+ int pframe_qindex;
+ int tpl_gf_group_frames;
+ init_gop_frames_for_tpl(cpi, frame_params, gf_group, &tpl_gf_group_frames,
+ &pframe_qindex);
+
+ cpi->ppi->p_rc.base_layer_qp = pframe_qindex;
+
+ av1_init_tpl_stats(tpl_data);
+
+ TplBuffers *tpl_tmp_buffers = &cpi->td.tpl_tmp_buffers;
+ if (!tpl_alloc_temp_buffers(tpl_tmp_buffers, tpl_data->tpl_bsize_1d)) {
+ aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
+ "Error allocating tpl data");
+ }
+
+ tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read_dummy;
+ tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write_dummy;
+
+ av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height,
+ cm->width, cm->height);
+
+ if (frame_params->frame_type == KEY_FRAME) {
+ av1_init_mv_probs(cm);
+ }
+ av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv,
+ cm->features.allow_high_precision_mv, cpi->td.mb.mv_costs);
+
+ const int num_planes =
+ cpi->sf.tpl_sf.use_y_only_rate_distortion ? 1 : av1_num_planes(cm);
+ // As tpl module is called before the setting of speed features at frame
+ // level, turning off this speed feature for the first GF group of the
+ // key-frame interval is done here.
+ int reduce_num_frames =
+ cpi->sf.tpl_sf.reduce_num_frames &&
+ gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE &&
+ gf_group->max_layer_depth > 2;
+  // TPL processing is skipped for frames of type LF_UPDATE when
+  // 'reduce_num_frames' is 1, which affects the r0 calculation. Thus, a
+  // factor to adjust r0 is used. The value of 1.6 corresponds to using ~60%
+  // of the frames in the gf group on average.
+ tpl_data->r0_adjust_factor = reduce_num_frames ? 1.6 : 1.0;
+
+  // Forward pass: compute the tpl stats of each frame in the GF group; the
+  // backward propagation from tpl_gf_group_frames - 1 down to the current
+  // frame follows below.
+ for (int frame_idx = cpi->gf_frame_index; frame_idx < tpl_gf_group_frames;
+ ++frame_idx) {
+ if (skip_tpl_for_frame(gf_group, frame_idx, gop_eval, approx_gop_eval,
+ reduce_num_frames))
+ continue;
+
+ init_mc_flow_dispenser(cpi, frame_idx, pframe_qindex);
+ if (mt_info->num_workers > 1) {
+ tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read;
+ tpl_row_mt->sync_write_ptr = av1_tpl_row_mt_sync_write;
+ av1_mc_flow_dispenser_mt(cpi);
+ } else {
+ mc_flow_dispenser(cpi);
+ }
+#if CONFIG_BITRATE_ACCURACY
+ av1_tpl_txfm_stats_update_abs_coeff_mean(&cpi->td.tpl_txfm_stats);
+ av1_tpl_store_txfm_stats(tpl_data, &cpi->td.tpl_txfm_stats, frame_idx);
+#endif // CONFIG_BITRATE_ACCURACY
+#if CONFIG_RATECTRL_LOG && CONFIG_THREE_PASS && CONFIG_BITRATE_ACCURACY
+ if (cpi->oxcf.pass == AOM_RC_THIRD_PASS) {
+ int frame_coding_idx =
+ av1_vbr_rc_frame_coding_idx(&cpi->vbr_rc_info, frame_idx);
+ rc_log_frame_stats(&cpi->rc_log, frame_coding_idx,
+ &cpi->td.tpl_txfm_stats);
+ }
+#endif // CONFIG_RATECTRL_LOG
+
+ aom_extend_frame_borders(tpl_data->tpl_frame[frame_idx].rec_picture,
+ num_planes);
+ }
+
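+  // Backward propagation: accumulate each frame's dependency stats into its
+  // reference frames, from the last tpl frame down to the current one.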
+ for (int frame_idx = tpl_gf_group_frames - 1;
+ frame_idx >= cpi->gf_frame_index; --frame_idx) {
+ if (skip_tpl_for_frame(gf_group, frame_idx, gop_eval, approx_gop_eval,
+ reduce_num_frames))
+ continue;
+
+ mc_flow_synthesizer(tpl_data, frame_idx, cm->mi_params.mi_rows,
+ cm->mi_params.mi_cols);
+ }
+
+ av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame,
+ gf_group->update_type[cpi->gf_frame_index],
+ gf_group->update_type[cpi->gf_frame_index], 0);
+ cm->current_frame.frame_type = frame_params->frame_type;
+ cm->show_frame = frame_params->show_frame;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  // Record the time if the function will return early below.
+ if (cpi->common.tiles.large_scale || gf_group->max_layer_depth_allowed == 0 ||
+ !gop_eval)
+ end_timing(cpi, av1_tpl_setup_stats_time);
+#endif
+
+ tpl_dealloc_temp_buffers(tpl_tmp_buffers);
+
+ if (!approx_gop_eval) {
+ tpl_data->ready = 1;
+ }
+ if (cpi->common.tiles.large_scale) return 0;
+ if (gf_group->max_layer_depth_allowed == 0) return 1;
+ if (!gop_eval) return 0;
+ assert(gf_group->arf_index >= 0);
+
+ double beta[2] = { 0.0 };
+ const int frame_idx_0 = gf_group->arf_index;
+ const int frame_idx_1 =
+ AOMMIN(tpl_gf_group_frames - 1, gf_group->arf_index + 1);
+ beta[0] = av1_tpl_get_frame_importance(tpl_data, frame_idx_0);
+ beta[1] = av1_tpl_get_frame_importance(tpl_data, frame_idx_1);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_tpl_setup_stats_time);
+#endif
+ return eval_gop_length(beta, gop_eval);
+}
+
+void av1_tpl_rdmult_setup(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int tpl_idx = cpi->gf_frame_index;
+
+ assert(
+ IMPLIES(cpi->ppi->gf_group.size > 0, tpl_idx < cpi->ppi->gf_group.size));
+
+ TplParams *const tpl_data = &cpi->ppi->tpl_data;
+ const TplDepFrame *const tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+
+ if (!tpl_frame->is_valid) return;
+
+ const TplDepStats *const tpl_stats = tpl_frame->tpl_stats_ptr;
+ const int tpl_stride = tpl_frame->stride;
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+
+ const int block_size = BLOCK_16X16;
+ const int num_mi_w = mi_size_wide[block_size];
+ const int num_mi_h = mi_size_high[block_size];
+ const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+ const double c = 1.2;
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+
+ // Loop through each 'block_size' X 'block_size' block.
+ for (int row = 0; row < num_rows; row++) {
+ for (int col = 0; col < num_cols; col++) {
+ double intra_cost = 0.0, mc_dep_cost = 0.0;
+ // Loop through each mi block.
+ for (int mi_row = row * num_mi_h; mi_row < (row + 1) * num_mi_h;
+ mi_row += step) {
+ for (int mi_col = col * num_mi_w; mi_col < (col + 1) * num_mi_w;
+ mi_col += step) {
+ if (mi_row >= cm->mi_params.mi_rows || mi_col >= mi_cols_sr) continue;
+ const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+ mi_row, mi_col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+ int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ intra_cost += (double)(this_stats->recrf_dist << RDDIV_BITS);
+ mc_dep_cost +=
+ (double)(this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+ }
+ }
+ const double rk = intra_cost / mc_dep_cost;
+ const int index = row * num_cols + col;
+ cpi->tpl_rdmult_scaling_factors[index] = rk / cpi->rd.r0 + c;
+ }
+ }
+}
+
+void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x,
+ BLOCK_SIZE sb_size, int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+ cpi->gf_frame_index < cpi->ppi->gf_group.size));
+ const int tpl_idx = cpi->gf_frame_index;
+
+ const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
+ const int layer_depth = AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
+ const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+
+ if (tpl_idx >= MAX_TPL_FRAME_IDX) return;
+ TplDepFrame *tpl_frame = &cpi->ppi->tpl_data.tpl_frame[tpl_idx];
+ if (!tpl_frame->is_valid) return;
+ if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return;
+ if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return;
+
+ const int mi_col_sr =
+ coded_to_superres_mi(mi_col, cm->superres_scale_denominator);
+ const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width);
+ const int sb_mi_width_sr = coded_to_superres_mi(
+ mi_size_wide[sb_size], cm->superres_scale_denominator);
+
+ const int bsize_base = BLOCK_16X16;
+ const int num_mi_w = mi_size_wide[bsize_base];
+ const int num_mi_h = mi_size_high[bsize_base];
+ const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+ const int num_bcols = (sb_mi_width_sr + num_mi_w - 1) / num_mi_w;
+ const int num_brows = (mi_size_high[sb_size] + num_mi_h - 1) / num_mi_h;
+ int row, col;
+
+ double base_block_count = 0.0;
+ double log_sum = 0.0;
+
+  for (row = mi_row / num_mi_h;
+       row < num_rows && row < mi_row / num_mi_h + num_brows; ++row) {
+    for (col = mi_col_sr / num_mi_w;
+         col < num_cols && col < mi_col_sr / num_mi_w + num_bcols; ++col) {
+ const int index = row * num_cols + col;
+ log_sum += log(cpi->tpl_rdmult_scaling_factors[index]);
+ base_block_count += 1.0;
+ }
+ }
+
+ const CommonQuantParams *quant_params = &cm->quant_params;
+
+ const int orig_qindex_rdmult =
+ quant_params->base_qindex + quant_params->y_dc_delta_q;
+ const int orig_rdmult = av1_compute_rd_mult(
+ orig_qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
+ const int new_qindex_rdmult = quant_params->base_qindex +
+ x->rdmult_delta_qindex +
+ quant_params->y_dc_delta_q;
+ const int new_rdmult = av1_compute_rd_mult(
+ new_qindex_rdmult, cm->seq_params->bit_depth,
+ cpi->ppi->gf_group.update_type[cpi->gf_frame_index], layer_depth,
+ boost_index, frame_type, cpi->oxcf.q_cfg.use_fixed_qp_offsets,
+ is_stat_consumption_stage(cpi));
+
+ const double scaling_factor = (double)new_rdmult / (double)orig_rdmult;
+
+ double scale_adj = log(scaling_factor) - log_sum / base_block_count;
+ scale_adj = exp_bounded(scale_adj);
+
+  for (row = mi_row / num_mi_h;
+       row < num_rows && row < mi_row / num_mi_h + num_brows; ++row) {
+    for (col = mi_col_sr / num_mi_w;
+         col < num_cols && col < mi_col_sr / num_mi_w + num_bcols; ++col) {
+ const int index = row * num_cols + col;
+ cpi->ppi->tpl_sb_rdmult_scaling_factors[index] =
+ scale_adj * cpi->tpl_rdmult_scaling_factors[index];
+ }
+ }
+}
+
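+// Entropy, in bits, of a geometric distribution P(k) = (1 - z) * z^k with
+// z = exp(-q_step / b), which models the magnitude of a quantized,
+// exponentially distributed coefficient:
+//   H = -log2(1 - z) - z * log2(z) / (1 - z)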
+double av1_exponential_entropy(double q_step, double b) {
+ b = AOMMAX(b, TPL_EPSILON);
+ double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON);
+ return -log2(1 - z) - z * log2(z) / (1 - z);
+}
+
+double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio) {
+  // The zero bin's size is zero_bin_ratio * q_step; each non-zero bin's size
+  // is q_step.
+ b = AOMMAX(b, TPL_EPSILON);
+ double z = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON);
+ double h = av1_exponential_entropy(q_step, b);
+ double r = -(1 - z) * log2(1 - z) - z * log2(z) + z * (h + 1);
+ return r;
+}
+
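+// Estimates the rate, in bits, of a frame as block_count times the sum of
+// the per-coefficient Laplace entropies (one DC and coeff_num - 1 AC
+// coefficients) of a transform block.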
+double av1_laplace_estimate_frame_rate(int q_index, int block_count,
+ const double *abs_coeff_mean,
+ int coeff_num) {
+ double zero_bin_ratio = 2;
+ double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+ double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+ double est_rate = 0;
+ // dc coeff
+ est_rate += av1_laplace_entropy(dc_q_step, abs_coeff_mean[0], zero_bin_ratio);
+ // ac coeff
+ for (int i = 1; i < coeff_num; ++i) {
+ est_rate +=
+ av1_laplace_entropy(ac_q_step, abs_coeff_mean[i], zero_bin_ratio);
+ }
+ est_rate *= block_count;
+ return est_rate;
+}
+
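+// Estimated bits to code one quantized coefficient under the Laplace model:
+// -log2 of the probability mass of its quantization bin. The leading 1 in
+// the non-zero branch accounts for the sign bit.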
+double av1_estimate_coeff_entropy(double q_step, double b,
+ double zero_bin_ratio, int qcoeff) {
+ b = AOMMAX(b, TPL_EPSILON);
+ int abs_qcoeff = abs(qcoeff);
+ double z0 = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON);
+ if (abs_qcoeff == 0) {
+ double r = -log2(1 - z0);
+ return r;
+ } else {
+ double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON);
+ double r = 1 - log2(z0) - log2(1 - z) - (abs_qcoeff - 1) * log2(z);
+ return r;
+ }
+}
+
+double av1_estimate_txfm_block_entropy(int q_index,
+ const double *abs_coeff_mean,
+ int *qcoeff_arr, int coeff_num) {
+ double zero_bin_ratio = 2;
+ double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+ double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+ double est_rate = 0;
+ // dc coeff
+ est_rate += av1_estimate_coeff_entropy(dc_q_step, abs_coeff_mean[0],
+ zero_bin_ratio, qcoeff_arr[0]);
+ // ac coeff
+ for (int i = 1; i < coeff_num; ++i) {
+ est_rate += av1_estimate_coeff_entropy(ac_q_step, abs_coeff_mean[i],
+ zero_bin_ratio, qcoeff_arr[i]);
+ }
+ return est_rate;
+}
+
+#if CONFIG_RD_COMMAND
+void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command) {
+ FILE *fptr = fopen(filepath, "r");
+ fscanf(fptr, "%d", &rd_command->frame_count);
+ rd_command->frame_index = 0;
+ for (int i = 0; i < rd_command->frame_count; ++i) {
+ int option;
+ fscanf(fptr, "%d", &option);
+ rd_command->option_ls[i] = (RD_OPTION)option;
+ if (option == RD_OPTION_SET_Q) {
+ fscanf(fptr, "%d", &rd_command->q_index_ls[i]);
+ } else if (option == RD_OPTION_SET_Q_RDMULT) {
+ fscanf(fptr, "%d", &rd_command->q_index_ls[i]);
+ fscanf(fptr, "%d", &rd_command->rdmult_ls[i]);
+ }
+ }
+ fclose(fptr);
+}
+#endif // CONFIG_RD_COMMAND
+
+double av1_tpl_get_frame_importance(const TplParams *tpl_data,
+ int gf_frame_index) {
+ const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_frame_index];
+ const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+
+ const int tpl_stride = tpl_frame->stride;
+ double intra_cost_base = 0;
+ double mc_dep_cost_base = 0;
+ double cbcmp_base = 1;
+ const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+
+ for (int row = 0; row < tpl_frame->mi_rows; row += step) {
+ for (int col = 0; col < tpl_frame->mi_cols; col += step) {
+ const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+ row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+ double cbcmp = (double)this_stats->srcrf_dist;
+ const int64_t mc_dep_delta =
+ RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+ this_stats->mc_dep_dist);
+ double dist_scaled = (double)(this_stats->recrf_dist << RDDIV_BITS);
+ dist_scaled = AOMMAX(dist_scaled, 1);
+ intra_cost_base += log(dist_scaled) * cbcmp;
+ mc_dep_cost_base += log(dist_scaled + mc_dep_delta) * cbcmp;
+ cbcmp_base += cbcmp;
+ }
+ }
+ return exp((mc_dep_cost_base - intra_cost_base) / cbcmp_base);
+}
+
+double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index) {
+ if (!av1_tpl_stats_ready(tpl_data, gf_frame_index)) {
+ return 1;
+ }
+ const double frame_importance =
+ av1_tpl_get_frame_importance(tpl_data, gf_frame_index);
+ return sqrt(1 / frame_importance);
+}
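+
+// Consequence of the sqrt mapping above (illustrative): a frame with TPL
+// importance 4, i.e. one whose coded quality propagates strongly, gets
+// qstep_ratio = 0.5 and thus a quantizer step half that of the leaf frames;
+// a frame with importance ~= 1 keeps qstep_ratio ~= 1.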
+
+int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio,
+ aom_bit_depth_t bit_depth) {
+ const double leaf_qstep = av1_dc_quant_QTX(leaf_qindex, 0, bit_depth);
+ const double target_qstep = leaf_qstep * qstep_ratio;
+ int qindex = leaf_qindex;
+ if (qstep_ratio < 1.0) {
+ for (qindex = leaf_qindex; qindex > 0; --qindex) {
+ const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth);
+ if (qstep <= target_qstep) break;
+ }
+ } else {
+ for (qindex = leaf_qindex; qindex <= MAXQ; ++qindex) {
+ const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth);
+ if (qstep >= target_qstep) break;
+ }
+ }
+ return qindex;
+}
+
+int av1_tpl_get_q_index(const TplParams *tpl_data, int gf_frame_index,
+ int leaf_qindex, aom_bit_depth_t bit_depth) {
+ const double qstep_ratio = av1_tpl_get_qstep_ratio(tpl_data, gf_frame_index);
+ return av1_get_q_index_from_qstep_ratio(leaf_qindex, qstep_ratio, bit_depth);
+}
+
+#if CONFIG_BITRATE_ACCURACY
+void av1_vbr_rc_init(VBR_RATECTRL_INFO *vbr_rc_info, double total_bit_budget,
+ int show_frame_count) {
+ av1_zero(*vbr_rc_info);
+ vbr_rc_info->ready = 0;
+ vbr_rc_info->total_bit_budget = total_bit_budget;
+ vbr_rc_info->show_frame_count = show_frame_count;
+ const double scale_factors[FRAME_UPDATE_TYPES] = { 0.94559, 0.94559, 1,
+ 0.94559, 1, 1,
+ 0.94559 };
+
+  // TODO(angiebird): Based on the previous code, only the scale factor 0.94559
+  // will be used in most of the cases with --limit=17. Figure out if the
+  // following scale factors work better.
+ // const double scale_factors[FRAME_UPDATE_TYPES] = { 0.94559, 0.12040, 1,
+ // 1.10199, 1, 1,
+ // 0.16393 };
+
+ const double mv_scale_factors[FRAME_UPDATE_TYPES] = { 3, 3, 3, 3, 3, 3, 3 };
+ memcpy(vbr_rc_info->scale_factors, scale_factors,
+ sizeof(scale_factors[0]) * FRAME_UPDATE_TYPES);
+ memcpy(vbr_rc_info->mv_scale_factors, mv_scale_factors,
+ sizeof(mv_scale_factors[0]) * FRAME_UPDATE_TYPES);
+
+ vbr_rc_reset_gop_data(vbr_rc_info);
+#if CONFIG_THREE_PASS
+ // TODO(angiebird): Explain why we use -1 here
+ vbr_rc_info->cur_gop_idx = -1;
+ vbr_rc_info->gop_count = 0;
+ vbr_rc_info->total_frame_count = 0;
+#endif // CONFIG_THREE_PASS
+}
+
+#if CONFIG_THREE_PASS
+int av1_vbr_rc_frame_coding_idx(const VBR_RATECTRL_INFO *vbr_rc_info,
+ int gf_frame_index) {
+ int gop_idx = vbr_rc_info->cur_gop_idx;
+ int gop_start_idx = vbr_rc_info->gop_start_idx_list[gop_idx];
+ return gop_start_idx + gf_frame_index;
+}
+
+void av1_vbr_rc_append_tpl_info(VBR_RATECTRL_INFO *vbr_rc_info,
+ const TPL_INFO *tpl_info) {
+ int gop_start_idx = vbr_rc_info->total_frame_count;
+ vbr_rc_info->gop_start_idx_list[vbr_rc_info->gop_count] = gop_start_idx;
+ vbr_rc_info->gop_length_list[vbr_rc_info->gop_count] = tpl_info->gf_length;
+ assert(gop_start_idx + tpl_info->gf_length <= VBR_RC_INFO_MAX_FRAMES);
+ for (int i = 0; i < tpl_info->gf_length; ++i) {
+ vbr_rc_info->txfm_stats_list[gop_start_idx + i] =
+ tpl_info->txfm_stats_list[i];
+ vbr_rc_info->qstep_ratio_list[gop_start_idx + i] =
+ tpl_info->qstep_ratio_ls[i];
+ vbr_rc_info->update_type_list[gop_start_idx + i] =
+ tpl_info->update_type_list[i];
+ }
+ vbr_rc_info->total_frame_count += tpl_info->gf_length;
+ vbr_rc_info->gop_count++;
+}
+#endif // CONFIG_THREE_PASS
+
+void av1_vbr_rc_set_gop_bit_budget(VBR_RATECTRL_INFO *vbr_rc_info,
+ int gop_showframe_count) {
+ vbr_rc_info->gop_showframe_count = gop_showframe_count;
+ vbr_rc_info->gop_bit_budget = vbr_rc_info->total_bit_budget *
+ gop_showframe_count /
+ vbr_rc_info->show_frame_count;
+}
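+
+// Example of the proportional split above (hypothetical numbers): with a
+// total budget of 1,000,000 bits over 100 show frames, a GOP containing 16
+// show frames is assigned a 160,000-bit budget, from which
+// av1_vbr_rc_update_q_index_list() later deducts the estimated motion
+// vector bits.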
+
+void av1_vbr_rc_compute_q_indices(int base_q_index, int frame_count,
+ const double *qstep_ratio_list,
+ aom_bit_depth_t bit_depth,
+ int *q_index_list) {
+ for (int i = 0; i < frame_count; ++i) {
+ q_index_list[i] = av1_get_q_index_from_qstep_ratio(
+ base_q_index, qstep_ratio_list[i], bit_depth);
+ }
+}
+
+double av1_vbr_rc_info_estimate_gop_bitrate(
+ int base_q_index, aom_bit_depth_t bit_depth,
+ const double *update_type_scale_factors, int frame_count,
+ const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+ const TplTxfmStats *stats_list, int *q_index_list,
+ double *estimated_bitrate_byframe) {
+ av1_vbr_rc_compute_q_indices(base_q_index, frame_count, qstep_ratio_list,
+ bit_depth, q_index_list);
+ double estimated_gop_bitrate = 0;
+ for (int frame_index = 0; frame_index < frame_count; frame_index++) {
+ const TplTxfmStats *frame_stats = &stats_list[frame_index];
+ double frame_bitrate = 0;
+ if (frame_stats->ready) {
+ int q_index = q_index_list[frame_index];
+
+ frame_bitrate = av1_laplace_estimate_frame_rate(
+ q_index, frame_stats->txfm_block_count, frame_stats->abs_coeff_mean,
+ frame_stats->coeff_num);
+ }
+ FRAME_UPDATE_TYPE update_type = update_type_list[frame_index];
+ estimated_gop_bitrate +=
+ frame_bitrate * update_type_scale_factors[update_type];
+ if (estimated_bitrate_byframe != NULL) {
+ estimated_bitrate_byframe[frame_index] = frame_bitrate;
+ }
+ }
+ return estimated_gop_bitrate;
+}
+
+int av1_vbr_rc_info_estimate_base_q(
+ double bit_budget, aom_bit_depth_t bit_depth,
+ const double *update_type_scale_factors, int frame_count,
+ const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+ const TplTxfmStats *stats_list, int *q_index_list,
+ double *estimated_bitrate_byframe) {
+ int q_max = 255; // Maximum q value.
+ int q_min = 0; // Minimum q value.
+ int q = (q_max + q_min) / 2;
+
+ double q_max_estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+ q_max, bit_depth, update_type_scale_factors, frame_count,
+ update_type_list, qstep_ratio_list, stats_list, q_index_list,
+ estimated_bitrate_byframe);
+
+ double q_min_estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+ q_min, bit_depth, update_type_scale_factors, frame_count,
+ update_type_list, qstep_ratio_list, stats_list, q_index_list,
+ estimated_bitrate_byframe);
+ while (q_min + 1 < q_max) {
+ double estimate = av1_vbr_rc_info_estimate_gop_bitrate(
+ q, bit_depth, update_type_scale_factors, frame_count, update_type_list,
+ qstep_ratio_list, stats_list, q_index_list, estimated_bitrate_byframe);
+ if (estimate > bit_budget) {
+ q_min = q;
+ q_min_estimate = estimate;
+ } else {
+ q_max = q;
+ q_max_estimate = estimate;
+ }
+ q = (q_max + q_min) / 2;
+ }
+ // Pick the estimate that lands closest to the budget.
+ if (fabs(q_max_estimate - bit_budget) < fabs(q_min_estimate - bit_budget)) {
+ q = q_max;
+ } else {
+ q = q_min;
+ }
+  // Re-run the estimate at the chosen q to refresh q_index_list and
+  // estimated_bitrate_byframe.
+ av1_vbr_rc_info_estimate_gop_bitrate(
+ q, bit_depth, update_type_scale_factors, frame_count, update_type_list,
+ qstep_ratio_list, stats_list, q_index_list, estimated_bitrate_byframe);
+ return q;
+}
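+
+// Note on the bisection above: the estimated GOP bitrate is monotonically
+// non-increasing in q, so once the budget lies between the endpoint
+// estimates the loop maintains estimate(q_min) > bit_budget >=
+// estimate(q_max), halves the interval until q_min + 1 == q_max, and then
+// returns the endpoint whose estimate is closest to the budget.
+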
+void av1_vbr_rc_update_q_index_list(VBR_RATECTRL_INFO *vbr_rc_info,
+ const TplParams *tpl_data,
+ const GF_GROUP *gf_group,
+ aom_bit_depth_t bit_depth) {
+ vbr_rc_info->q_index_list_ready = 1;
+ double gop_bit_budget = vbr_rc_info->gop_bit_budget;
+
+ for (int i = 0; i < gf_group->size; i++) {
+ vbr_rc_info->qstep_ratio_list[i] = av1_tpl_get_qstep_ratio(tpl_data, i);
+ }
+
+ double mv_bits = 0;
+ for (int i = 0; i < gf_group->size; i++) {
+ double frame_mv_bits = 0;
+ if (av1_tpl_stats_ready(tpl_data, i)) {
+ TplDepFrame *tpl_frame = &tpl_data->tpl_frame[i];
+ frame_mv_bits = av1_tpl_compute_frame_mv_entropy(
+ tpl_frame, tpl_data->tpl_stats_block_mis_log2);
+      FRAME_UPDATE_TYPE update_type = gf_group->update_type[i];
+      mv_bits += frame_mv_bits * vbr_rc_info->mv_scale_factors[update_type];
+ }
+ }
+
+ mv_bits = AOMMIN(mv_bits, 0.6 * gop_bit_budget);
+ gop_bit_budget -= mv_bits;
+
+ vbr_rc_info->base_q_index = av1_vbr_rc_info_estimate_base_q(
+ gop_bit_budget, bit_depth, vbr_rc_info->scale_factors, gf_group->size,
+ gf_group->update_type, vbr_rc_info->qstep_ratio_list,
+ tpl_data->txfm_stats_list, vbr_rc_info->q_index_list, NULL);
+}
+
+#endif // CONFIG_BITRATE_ACCURACY
+
+// Use upper and left neighbor block as the reference MVs.
+// Compute the minimum difference between current MV and reference MV.
+int_mv av1_compute_mv_difference(const TplDepFrame *tpl_frame, int row, int col,
+ int step, int tpl_stride, int right_shift) {
+ const TplDepStats *tpl_stats =
+ &tpl_frame
+ ->tpl_stats_ptr[av1_tpl_ptr_pos(row, col, tpl_stride, right_shift)];
+ int_mv current_mv = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+ int current_mv_magnitude =
+ abs(current_mv.as_mv.row) + abs(current_mv.as_mv.col);
+
+ // Retrieve the up and left neighbors.
+ int up_error = INT_MAX;
+ int_mv up_mv_diff;
+ if (row - step >= 0) {
+ tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+ row - step, col, tpl_stride, right_shift)];
+ up_mv_diff = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+ up_mv_diff.as_mv.row = current_mv.as_mv.row - up_mv_diff.as_mv.row;
+ up_mv_diff.as_mv.col = current_mv.as_mv.col - up_mv_diff.as_mv.col;
+ up_error = abs(up_mv_diff.as_mv.row) + abs(up_mv_diff.as_mv.col);
+ }
+
+ int left_error = INT_MAX;
+ int_mv left_mv_diff;
+ if (col - step >= 0) {
+ tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+ row, col - step, tpl_stride, right_shift)];
+ left_mv_diff = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+ left_mv_diff.as_mv.row = current_mv.as_mv.row - left_mv_diff.as_mv.row;
+ left_mv_diff.as_mv.col = current_mv.as_mv.col - left_mv_diff.as_mv.col;
+ left_error = abs(left_mv_diff.as_mv.row) + abs(left_mv_diff.as_mv.col);
+ }
+
+ // Return the MV with the minimum distance from current.
+ if (up_error < left_error && up_error < current_mv_magnitude) {
+ return up_mv_diff;
+ } else if (left_error < up_error && left_error < current_mv_magnitude) {
+ return left_mv_diff;
+ }
+ return current_mv;
+}
+
+/* Compute the entropy of motion vectors for a single frame. */
+double av1_tpl_compute_frame_mv_entropy(const TplDepFrame *tpl_frame,
+ uint8_t right_shift) {
+ if (!tpl_frame->is_valid) {
+ return 0;
+ }
+
+ int count_row[500] = { 0 };
+ int count_col[500] = { 0 };
+ int n = 0; // number of MVs to process
+
+ const int tpl_stride = tpl_frame->stride;
+ const int step = 1 << right_shift;
+
+ for (int row = 0; row < tpl_frame->mi_rows; row += step) {
+ for (int col = 0; col < tpl_frame->mi_cols; col += step) {
+ int_mv mv = av1_compute_mv_difference(tpl_frame, row, col, step,
+ tpl_stride, right_shift);
+      count_row[clamp(mv.as_mv.row, 0, 499)] += 1;
+      count_col[clamp(mv.as_mv.col, 0, 499)] += 1;
+ n += 1;
+ }
+ }
+
+ // Estimate the bits used using the entropy formula.
+ double rate_row = 0;
+ double rate_col = 0;
+ for (int i = 0; i < 500; i++) {
+ if (count_row[i] != 0) {
+ double p = count_row[i] / (double)n;
+ rate_row += count_row[i] * -log2(p);
+ }
+ if (count_col[i] != 0) {
+ double p = count_col[i] / (double)n;
+ rate_col += count_col[i] * -log2(p);
+ }
+ }
+
+ return rate_row + rate_col;
+}
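+
+// Sanity check on the entropy estimate above (illustrative): if all n MV
+// differences land in one bin, p = 1 and the estimated rate is 0 bits; if
+// they spread uniformly over k bins, each component costs n * log2(k) bits,
+// the maximum for k occupied bins.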
diff --git a/third_party/aom/av1/encoder/tpl_model.h b/third_party/aom/av1/encoder/tpl_model.h
new file mode 100644
index 0000000000..bcd58216c5
--- /dev/null
+++ b/third_party/aom/av1/encoder/tpl_model.h
@@ -0,0 +1,794 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TPL_MODEL_H_
+#define AOM_AV1_ENCODER_TPL_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\cond */
+
+struct AV1_PRIMARY;
+struct AV1_COMP;
+struct AV1_SEQ_CODING_TOOLS;
+struct EncodeFrameParams;
+struct EncodeFrameInput;
+struct GF_GROUP;
+struct ThreadData;
+struct TPL_INFO;
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+
+#include "av1/common/mv.h"
+#include "av1/common/scale.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/ratectrl.h"
+
+static INLINE BLOCK_SIZE convert_length_to_bsize(int length) {
+ switch (length) {
+ case 64: return BLOCK_64X64;
+ case 32: return BLOCK_32X32;
+ case 16: return BLOCK_16X16;
+ case 8: return BLOCK_8X8;
+ case 4: return BLOCK_4X4;
+ default:
+ assert(0 && "Invalid block size for tpl model");
+ return BLOCK_16X16;
+ }
+}
+
+typedef struct AV1TplRowMultiThreadSync {
+#if CONFIG_MULTITHREAD
+ // Synchronization objects for top-right dependency.
+ pthread_mutex_t *mutex_;
+ pthread_cond_t *cond_;
+#endif
+ // Buffer to store the macroblock whose encoding is complete.
+ // num_finished_cols[i] stores the number of macroblocks which finished
+ // encoding in the ith macroblock row.
+ int *num_finished_cols;
+ // Number of extra macroblocks of the top row to be complete for encoding
+ // of the current macroblock to start. A value of 1 indicates top-right
+ // dependency.
+ int sync_range;
+ // Number of macroblock rows.
+ int rows;
+ // Number of threads processing the current tile.
+ int num_threads_working;
+} AV1TplRowMultiThreadSync;
+
+typedef struct AV1TplRowMultiThreadInfo {
+ // Initialized to false, set to true by the worker thread that encounters an
+ // error in order to abort the processing of other worker threads.
+ bool tpl_mt_exit;
+#if CONFIG_MULTITHREAD
+ // Mutex lock object used for error handling.
+ pthread_mutex_t *mutex_;
+#endif
+ // Row synchronization related function pointers.
+ void (*sync_read_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c);
+ void (*sync_write_ptr)(AV1TplRowMultiThreadSync *tpl_mt_sync, int r, int c,
+ int cols);
+} AV1TplRowMultiThreadInfo;
+
+// TODO(jingning): This needs to be cleaned up next.
+
+// TPL stats buffers are prepared for every frame in the GOP,
+// including (internal) overlays and (internal) arfs.
+// In addition, frames in the lookahead that are outside of the GOP
+// are also used.
+// Thus it should use
+// (gop_length) + (# overlays) + (MAX_LAG_BUFFERS - gop_length) =
+// MAX_LAG_BUFFERS + (# overlays)
+// 2 * MAX_LAG_BUFFERS is therefore a safe estimate.
+// TODO(bohanli): test setting it to 1.5 * MAX_LAG_BUFFERS
+#define MAX_TPL_FRAME_IDX (2 * MAX_LAG_BUFFERS)
+// The first REF_FRAMES + 1 buffers are reserved.
+// tpl_data->tpl_frame starts after REF_FRAMES + 1
+#define MAX_LENGTH_TPL_FRAME_STATS (MAX_TPL_FRAME_IDX + REF_FRAMES + 1)
+#define TPL_DEP_COST_SCALE_LOG2 4
+
+#define TPL_EPSILON 0.0000001
+
+typedef struct TplTxfmStats {
+ int ready; // Whether abs_coeff_mean is ready
+ double abs_coeff_sum[256]; // Assume we are using 16x16 transform block
+ double abs_coeff_mean[256];
+ int txfm_block_count;
+ int coeff_num;
+} TplTxfmStats;
+
+typedef struct {
+ uint8_t *predictor8;
+ int16_t *src_diff;
+ tran_low_t *coeff;
+ tran_low_t *qcoeff;
+ tran_low_t *dqcoeff;
+} TplBuffers;
+
+typedef struct TplDepStats {
+ int64_t srcrf_sse;
+ int64_t srcrf_dist;
+ int64_t recrf_sse;
+ int64_t recrf_dist;
+ int64_t intra_sse;
+ int64_t intra_dist;
+ int64_t cmp_recrf_dist[2];
+ int64_t mc_dep_rate;
+ int64_t mc_dep_dist;
+ int64_t pred_error[INTER_REFS_PER_FRAME];
+ int32_t intra_cost;
+ int32_t inter_cost;
+ int32_t srcrf_rate;
+ int32_t recrf_rate;
+ int32_t intra_rate;
+ int32_t cmp_recrf_rate[2];
+ int_mv mv[INTER_REFS_PER_FRAME];
+ int8_t ref_frame_index[2];
+} TplDepStats;
+
+typedef struct TplDepFrame {
+ uint8_t is_valid;
+ TplDepStats *tpl_stats_ptr;
+ const YV12_BUFFER_CONFIG *gf_picture;
+ YV12_BUFFER_CONFIG *rec_picture;
+ int ref_map_index[REF_FRAMES];
+ int stride;
+ int width;
+ int height;
+ int mi_rows;
+ int mi_cols;
+ int base_rdmult;
+ uint32_t frame_display_index;
+ // When set, SAD metric is used for intra and inter mode decision.
+ int use_pred_sad;
+} TplDepFrame;
+
+/*!\endcond */
+/*!
+ * \brief Params related to temporal dependency model.
+ */
+typedef struct TplParams {
+ /*!
+ * Whether the tpl stats is ready.
+ */
+ int ready;
+
+ /*!
+ * Block granularity of tpl score storage.
+ */
+ uint8_t tpl_stats_block_mis_log2;
+
+ /*!
+ * Tpl motion estimation block 1d size. tpl_bsize_1d >= 16.
+ */
+ uint8_t tpl_bsize_1d;
+
+ /*!
+ * Buffer to store the frame level tpl information for each frame in a gf
+ * group. tpl_stats_buffer[i] stores the tpl information of ith frame in a gf
+ * group
+ */
+ TplDepFrame tpl_stats_buffer[MAX_LENGTH_TPL_FRAME_STATS];
+
+ /*!
+ * Buffer to store tpl stats at block granularity.
+ * tpl_stats_pool[i][j] stores the tpl stats of jth block of ith frame in a gf
+ * group.
+ */
+ TplDepStats *tpl_stats_pool[MAX_LAG_BUFFERS];
+
+ /*!
+ * Pointer to the buffer which stores tpl transform stats per frame.
+ * txfm_stats_list[i] stores the TplTxfmStats of the ith frame in a gf group.
+ * Memory is allocated dynamically for MAX_LENGTH_TPL_FRAME_STATS frames when
+ * tpl is enabled.
+ */
+ TplTxfmStats *txfm_stats_list;
+
+ /*!
+ * Buffer to store tpl reconstructed frame.
+ * tpl_rec_pool[i] stores the reconstructed frame of ith frame in a gf group.
+ */
+ YV12_BUFFER_CONFIG tpl_rec_pool[MAX_LAG_BUFFERS];
+
+ /*!
+ * Pointer to tpl_stats_buffer.
+ */
+ TplDepFrame *tpl_frame;
+
+ /*!
+ * Scale factors for the current frame.
+ */
+ struct scale_factors sf;
+
+ /*!
+ * GF group index of the current frame.
+ */
+ int frame_idx;
+
+ /*!
+ * Array of pointers to the frame buffers holding the source frame.
+ * src_ref_frame[i] stores the pointer to the source frame of the ith
+ * reference frame type.
+ */
+ const YV12_BUFFER_CONFIG *src_ref_frame[INTER_REFS_PER_FRAME];
+
+ /*!
+ * Array of pointers to the frame buffers holding the tpl reconstructed frame.
+ * ref_frame[i] stores the pointer to the tpl reconstructed frame of the ith
+ * reference frame type.
+ */
+ const YV12_BUFFER_CONFIG *ref_frame[INTER_REFS_PER_FRAME];
+
+ /*!
+ * Parameters related to synchronization for top-right dependency in row based
+ * multi-threading of tpl
+ */
+ AV1TplRowMultiThreadSync tpl_mt_sync;
+
+ /*!
+ * Frame border for tpl frame.
+ */
+ int border_in_pixels;
+
+ /*!
+ * Factor to adjust r0 if TPL uses a subset of frames in the gf group.
+ */
+ double r0_adjust_factor;
+} TplParams;
+
+#if CONFIG_BITRATE_ACCURACY || CONFIG_RATECTRL_LOG
+#define VBR_RC_INFO_MAX_FRAMES 500
+#endif // CONFIG_BITRATE_ACCURACY || CONFIG_RATECTRL_LOG
+
+#if CONFIG_BITRATE_ACCURACY
+
+/*!
+ * \brief This structure stores information needed for bitrate accuracy
+ * experiment.
+ */
+typedef struct {
+ int ready;
+ double total_bit_budget; // The total bit budget of the entire video
+ int show_frame_count; // Number of show frames in the entire video
+
+ int gop_showframe_count; // The number of show frames in the current gop
+ double gop_bit_budget; // The bitbudget for the current gop
+ double scale_factors[FRAME_UPDATE_TYPES]; // Scale factors to improve the
+ // budget estimation
+ double mv_scale_factors[FRAME_UPDATE_TYPES]; // Scale factors to improve
+ // MV entropy estimation
+
+ // === Below this line are GOP related data that will be updated per GOP ===
+ int base_q_index; // Stores the base q index.
+ int q_index_list_ready;
+ int q_index_list[VBR_RC_INFO_MAX_FRAMES]; // q indices for the current
+ // GOP
+
+ // Array to store qstep_ratio for each frame in a GOP
+ double qstep_ratio_list[VBR_RC_INFO_MAX_FRAMES];
+
+#if CONFIG_THREE_PASS
+ TplTxfmStats txfm_stats_list[VBR_RC_INFO_MAX_FRAMES];
+ FRAME_UPDATE_TYPE update_type_list[VBR_RC_INFO_MAX_FRAMES];
+ int gop_start_idx_list[VBR_RC_INFO_MAX_FRAMES];
+ int gop_length_list[VBR_RC_INFO_MAX_FRAMES];
+ int cur_gop_idx;
+ int total_frame_count;
+ int gop_count;
+#endif // CONFIG_THREE_PASS
+} VBR_RATECTRL_INFO;
+
+static INLINE void vbr_rc_reset_gop_data(VBR_RATECTRL_INFO *vbr_rc_info) {
+ vbr_rc_info->q_index_list_ready = 0;
+ av1_zero(vbr_rc_info->q_index_list);
+}
+
+void av1_vbr_rc_init(VBR_RATECTRL_INFO *vbr_rc_info, double total_bit_budget,
+ int show_frame_count);
+
+int av1_vbr_rc_frame_coding_idx(const VBR_RATECTRL_INFO *vbr_rc_info,
+ int gf_frame_index);
+
+void av1_vbr_rc_append_tpl_info(VBR_RATECTRL_INFO *vbr_rc_info,
+ const struct TPL_INFO *tpl_info);
+
+void av1_vbr_rc_set_gop_bit_budget(VBR_RATECTRL_INFO *vbr_rc_info,
+ int gop_showframe_count);
+
+void av1_vbr_rc_compute_q_indices(int base_q_index, int frame_count,
+ const double *qstep_ratio_list,
+ aom_bit_depth_t bit_depth, int *q_index_list);
+
+/*!\brief Update q_index_list in vbr_rc_info based on tpl stats
+ *
+ * \param[out] vbr_rc_info Rate control info for BITRATE_ACCURACY
+ * experiment
+ * \param[in] tpl_data TPL struct
+ * \param[in] gf_group GOP struct
+ * \param[in] bit_depth bit depth
+ */
+void av1_vbr_rc_update_q_index_list(VBR_RATECTRL_INFO *vbr_rc_info,
+ const TplParams *tpl_data,
+ const struct GF_GROUP *gf_group,
+ aom_bit_depth_t bit_depth);
+/*!\brief Compute the number of bits needed to encode a GOP
+ *
+ * \param[in] base_q_index base layer q_index
+ * \param[in] bit_depth bit depth
+ * \param[in] update_type_scale_factors array of scale factors for each
+ * update_type
+ * \param[in]       frame_count               size of update_type_list,
+ *                                            qstep_ratio_list, stats_list,
+ *                                            q_index_list and
+ *                                            estimated_bitrate_byframe
+ * \param[in] update_type_list array of update_type, one per frame
+ * \param[in] qstep_ratio_list array of qstep_ratio, one per frame
+ * \param[in] stats_list array of transform stats, one per
+ * frame
+ * \param[out] q_index_list array of q_index, one per frame
+ * \param[out] estimated_bitrate_byframe array to keep track of frame
+ * bitrate
+ *
+ * \return The estimated GOP bitrate.
+ *
+ */
+double av1_vbr_rc_info_estimate_gop_bitrate(
+ int base_q_index, aom_bit_depth_t bit_depth,
+ const double *update_type_scale_factors, int frame_count,
+ const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+ const TplTxfmStats *stats_list, int *q_index_list,
+ double *estimated_bitrate_byframe);
+
+/*!\brief Estimate the optimal base q index for a GOP.
+ *
+ * This function uses a binary search to find base layer q index to
+ * achieve the specified bit budget.
+ *
+ * \param[in] bit_budget target bit budget
+ * \param[in] bit_depth bit depth
+ * \param[in] update_type_scale_factors array of scale factors for each
+ * update_type
+ * \param[in]       frame_count         size of update_type_list, qstep_ratio_list,
+ * stats_list, q_index_list and
+ * estimated_bitrate_byframe
+ * \param[in] update_type_list array of update_type, one per frame
+ * \param[in] qstep_ratio_list array of qstep_ratio, one per frame
+ * \param[in] stats_list array of transform stats, one per frame
+ * \param[out] q_index_list array of q_index, one per frame
+ * \param[out] estimated_bitrate_byframe Array to keep track of frame
+ * bitrate
+ *
+ * \return Returns the optimal base q index to use.
+ */
+int av1_vbr_rc_info_estimate_base_q(
+ double bit_budget, aom_bit_depth_t bit_depth,
+ const double *update_type_scale_factors, int frame_count,
+ const FRAME_UPDATE_TYPE *update_type_list, const double *qstep_ratio_list,
+ const TplTxfmStats *stats_list, int *q_index_list,
+ double *estimated_bitrate_byframe);
+
+#endif // CONFIG_BITRATE_ACCURACY
+
+#if CONFIG_RD_COMMAND
+typedef enum {
+ RD_OPTION_NONE,
+ RD_OPTION_SET_Q,
+ RD_OPTION_SET_Q_RDMULT
+} RD_OPTION;
+
+typedef struct RD_COMMAND {
+ RD_OPTION option_ls[MAX_LENGTH_TPL_FRAME_STATS];
+ int q_index_ls[MAX_LENGTH_TPL_FRAME_STATS];
+ int rdmult_ls[MAX_LENGTH_TPL_FRAME_STATS];
+ int frame_count;
+ int frame_index;
+} RD_COMMAND;
+
+void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command);
+#endif // CONFIG_RD_COMMAND
+
+/*!\brief Allocate buffers used by the tpl model
+ *
+ * \param[in]    ppi             Top-level encoder structure
+ * \param[in]    mi_params       Common mode info parameters
+ * \param[in]    width           Frame width
+ * \param[in]    height          Frame height
+ * \param[in]    byte_alignment  Byte alignment of the planes in the frame
+ *                               buffers
+ * \param[in]    lag_in_frames   Number of lookahead frames
+ */
+void av1_setup_tpl_buffers(struct AV1_PRIMARY *const ppi,
+ CommonModeInfoParams *const mi_params, int width,
+ int height, int byte_alignment, int lag_in_frames);
+
+static AOM_INLINE void tpl_dealloc_temp_buffers(TplBuffers *tpl_tmp_buffers) {
+ aom_free(tpl_tmp_buffers->predictor8);
+ tpl_tmp_buffers->predictor8 = NULL;
+ aom_free(tpl_tmp_buffers->src_diff);
+ tpl_tmp_buffers->src_diff = NULL;
+ aom_free(tpl_tmp_buffers->coeff);
+ tpl_tmp_buffers->coeff = NULL;
+ aom_free(tpl_tmp_buffers->qcoeff);
+ tpl_tmp_buffers->qcoeff = NULL;
+ aom_free(tpl_tmp_buffers->dqcoeff);
+ tpl_tmp_buffers->dqcoeff = NULL;
+}
+
+static AOM_INLINE bool tpl_alloc_temp_buffers(TplBuffers *tpl_tmp_buffers,
+ uint8_t tpl_bsize_1d) {
+ // Number of pixels in a tpl block
+ const int tpl_block_pels = tpl_bsize_1d * tpl_bsize_1d;
+
+ // Allocate temporary buffers used in mode estimation.
+ tpl_tmp_buffers->predictor8 = (uint8_t *)aom_memalign(
+ 32, tpl_block_pels * 2 * sizeof(*tpl_tmp_buffers->predictor8));
+ tpl_tmp_buffers->src_diff = (int16_t *)aom_memalign(
+ 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->src_diff));
+ tpl_tmp_buffers->coeff = (tran_low_t *)aom_memalign(
+ 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->coeff));
+ tpl_tmp_buffers->qcoeff = (tran_low_t *)aom_memalign(
+ 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->qcoeff));
+ tpl_tmp_buffers->dqcoeff = (tran_low_t *)aom_memalign(
+ 32, tpl_block_pels * sizeof(*tpl_tmp_buffers->dqcoeff));
+
+ if (!(tpl_tmp_buffers->predictor8 && tpl_tmp_buffers->src_diff &&
+ tpl_tmp_buffers->coeff && tpl_tmp_buffers->qcoeff &&
+ tpl_tmp_buffers->dqcoeff)) {
+ tpl_dealloc_temp_buffers(tpl_tmp_buffers);
+ return false;
+ }
+ return true;
+}
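+
+// Usage sketch (hypothetical caller, not part of this header's contract):
+// pair the allocation with the deallocation above on every exit path, e.g.
+//
+//   TplBuffers bufs;
+//   if (!tpl_alloc_temp_buffers(&bufs, tpl_data->tpl_bsize_1d)) return;
+//   // ... run TPL mode estimation using bufs ...
+//   tpl_dealloc_temp_buffers(&bufs);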
+
+/*!\brief Implements temporal dependency modelling for a GOP (GF/ARF
+ * group) and selects between 16 and 32 frame GOP structure.
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in] cpi Top - level encoder instance structure
+ * \param[in] gop_eval Flag if it is in the GOP length decision stage
+ * \param[in] frame_params Per frame encoding parameters
+ *
+ * \return Indicates whether or not we should use a longer GOP length.
+ */
+int av1_tpl_setup_stats(struct AV1_COMP *cpi, int gop_eval,
+ const struct EncodeFrameParams *const frame_params);
+
+/*!\cond */
+
+void av1_tpl_preload_rc_estimate(
+ struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params);
+
+int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift);
+
+void av1_init_tpl_stats(TplParams *const tpl_data);
+
+int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index);
+
+void av1_tpl_rdmult_setup(struct AV1_COMP *cpi);
+
+void av1_tpl_rdmult_setup_sb(struct AV1_COMP *cpi, MACROBLOCK *const x,
+ BLOCK_SIZE sb_size, int mi_row, int mi_col);
+
+void av1_mc_flow_dispenser_row(struct AV1_COMP *cpi,
+ TplTxfmStats *tpl_txfm_stats,
+ TplBuffers *tpl_tmp_buffers, MACROBLOCK *x,
+ int mi_row, BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+/*!\brief Compute the entropy of an exponential probability distribution
+ * function (pdf) subjected to uniform quantization.
+ *
+ * pdf(x) = b*exp(-b*x)
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in] q_step quantizer step size
+ * \param[in] b parameter of exponential distribution
+ *
+ * \return entropy cost
+ */
+double av1_exponential_entropy(double q_step, double b);
+
+/*!\brief Compute the entropy of a Laplace probability distribution
+ * function (pdf) subjected to non-uniform quantization.
+ *
+ * pdf(x) = 0.5*b*exp(-0.5*b*|x|)
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in] q_step quantizer step size for non-zero bins
+ * \param[in] b parameter of Laplace distribution
+ * \param[in] zero_bin_ratio zero bin's size is zero_bin_ratio * q_step
+ *
+ * \return entropy cost
+ */
+double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio);
+
+/*!\brief Compute the frame rate using transform block stats
+ *
+ * Assume each position i in the transform block is of Laplace distribution
+ * with mean absolute deviation abs_coeff_mean[i]
+ *
+ * Then we can use av1_laplace_entropy() to compute the expected frame
+ * rate.
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in] q_index quantizer index
+ * \param[in] block_count number of transform blocks
+ * \param[in] abs_coeff_mean array of mean absolute deviation
+ * \param[in] coeff_num number of coefficients per transform block
+ *
+ * \return expected frame rate
+ */
+double av1_laplace_estimate_frame_rate(int q_index, int block_count,
+ const double *abs_coeff_mean,
+ int coeff_num);
+
+/*!\brief Init TplTxfmStats
+ *
+ * \param[in] tpl_txfm_stats a structure for storing transform stats
+ *
+ */
+void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats);
+
+#if CONFIG_BITRATE_ACCURACY
+/*!\brief Accumulate TplTxfmStats
+ *
+ * \param[in]  sub_stats          a structure for storing sub transform stats
+ * \param[out] accumulated_stats  a structure for storing accumulated
+ *                                transform stats
+ *
+ */
+void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats,
+ TplTxfmStats *accumulated_stats);
+
+/*!\brief Record a transform block into TplTxfmStats
+ *
+ * \param[out]   tpl_txfm_stats A structure for storing transform stats
+ * \param[in]    coeff          An array of transform coefficients. Its size
+ *                              should equal tpl_txfm_stats.coeff_num.
+ *
+ */
+void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats,
+ const tran_low_t *coeff);
+
+/*!\brief Update the abs_coeff_mean and ready fields of txfm_stats
+ *
+ * If txfm_block_count > 0, this function will use abs_coeff_sum and
+ * txfm_block_count to compute abs_coeff_mean, and the ready flag will be
+ * set to one.
+ *
+ * \param[in] txfm_stats A structure for storing transform stats
+ */
+void av1_tpl_txfm_stats_update_abs_coeff_mean(TplTxfmStats *txfm_stats);
+#endif // CONFIG_BITRATE_ACCURACY
+
+/*!\brief Estimate coefficient entropy using a Laplace distribution
+ *
+ *\ingroup tpl_modelling
+ *
+ * This function is equivalent to -log2(laplace_prob()), where laplace_prob()
+ * is defined in tpl_model_test.cc
+ *
+ * \param[in]    q_step         quantizer step size without any scaling
+ * \param[in]    b              mean absolute deviation of the Laplace
+ *                              distribution
+ * \param[in]    zero_bin_ratio zero bin's size is zero_bin_ratio * q_step
+ * \param[in]    qcoeff         quantized coefficient
+ *
+ * \return estimated coefficient entropy
+ *
+ */
+double av1_estimate_coeff_entropy(double q_step, double b,
+ double zero_bin_ratio, int qcoeff);
+
+/*!\brief Estimate entropy of a transform block using a Laplace distribution
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in] q_index quantizer index
+ * \param[in] abs_coeff_mean array of mean absolute deviations
+ * \param[in] qcoeff_arr array of quantized coefficients
+ * \param[in] coeff_num number of coefficients per transform block
+ *
+ * \return estimated transform block entropy
+ *
+ */
+double av1_estimate_txfm_block_entropy(int q_index,
+ const double *abs_coeff_mean,
+ int *qcoeff_arr, int coeff_num);
+
+// TODO(angiebird): Add doxygen description here.
+int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist,
+ int64_t srcrf_dist, int pix_num);
+
+/*!\brief Compute the overlap area between two blocks with the same size
+ *
+ *\ingroup tpl_modelling
+ *
+ * If there is no overlap, this function should return zero.
+ *
+ * \param[in] row_a row position of the first block
+ * \param[in] col_a column position of the first block
+ * \param[in] row_b row position of the second block
+ * \param[in] col_b column position of the second block
+ * \param[in] width width shared by the two blocks
+ * \param[in] height height shared by the two blocks
+ *
+ * \return overlap area of the two blocks
+ */
+int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width,
+ int height);
+
+/*!\brief Get current frame's q_index from tpl stats and leaf_qindex
+ *
+ * \param[in] tpl_data TPL struct
+ * \param[in] gf_frame_index current frame index in the GOP
+ * \param[in] leaf_qindex q index of leaf frame
+ * \param[in] bit_depth bit depth
+ *
+ * \return q_index
+ */
+int av1_tpl_get_q_index(const TplParams *tpl_data, int gf_frame_index,
+ int leaf_qindex, aom_bit_depth_t bit_depth);
+
+/*!\brief Compute the frame importance from TPL stats
+ *
+ * \param[in] tpl_data TPL struct
+ * \param[in] gf_frame_index current frame index in the GOP
+ *
+ * \return frame_importance
+ */
+double av1_tpl_get_frame_importance(const TplParams *tpl_data,
+ int gf_frame_index);
+
+/*!\brief Compute the ratio between arf q step and the leaf q step based on
+ * TPL stats
+ *
+ * \param[in] tpl_data TPL struct
+ * \param[in] gf_frame_index current frame index in the GOP
+ *
+ * \return qstep_ratio
+ */
+double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index);
+
+/*!\brief Find a q index whose step size is near qstep_ratio * leaf_qstep
+ *
+ * \param[in] leaf_qindex q index of leaf frame
+ * \param[in]       qstep_ratio      ratio between the target q step and the
+ *                                   leaf q step
+ * \param[in]       bit_depth        bit depth
+ *
+ * \return q_index
+ */
+int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio,
+ aom_bit_depth_t bit_depth);
+
+/*!\brief Improve the motion vector estimation by taking neighbors into
+ * account.
+ *
+ * Use the upper and left neighbor block as the reference MVs.
+ * Compute the minimum difference between current MV and reference MV.
+ *
+ * \param[in] tpl_frame Tpl frame struct
+ * \param[in] row Current row
+ * \param[in] col Current column
+ * \param[in] step Step parameter for av1_tpl_ptr_pos
+ * \param[in] tpl_stride Stride parameter for av1_tpl_ptr_pos
+ * \param[in] right_shift Right shift parameter for
+ * av1_tpl_ptr_pos
+ *
+ * \return The neighbor MV difference with the smaller magnitude, when that
+ *         magnitude is below the current MV's; otherwise the current MV.
+ */
+int_mv av1_compute_mv_difference(const TplDepFrame *tpl_frame, int row, int col,
+ int step, int tpl_stride, int right_shift);
+
+/*!\brief Compute the entropy of motion vectors for a single frame.
+ *
+ * \param[in] tpl_frame TPL frame struct
+ * \param[in] right_shift right shift value for step
+ *
+ * \return Bits used by the motion vectors for one frame.
+ */
+double av1_tpl_compute_frame_mv_entropy(const TplDepFrame *tpl_frame,
+ uint8_t right_shift);
+
+#if CONFIG_RATECTRL_LOG
+typedef struct {
+ int coding_frame_count;
+ int base_q_index;
+
+ // Encode decision
+ int q_index_list[VBR_RC_INFO_MAX_FRAMES];
+ double qstep_ratio_list[VBR_RC_INFO_MAX_FRAMES];
+ FRAME_UPDATE_TYPE update_type_list[VBR_RC_INFO_MAX_FRAMES];
+
+ // Frame stats
+ TplTxfmStats txfm_stats_list[VBR_RC_INFO_MAX_FRAMES];
+
+ // Estimated encode results
+ double est_coeff_rate_list[VBR_RC_INFO_MAX_FRAMES];
+
+ // Actual encode results
+ double act_rate_list[VBR_RC_INFO_MAX_FRAMES];
+ double act_coeff_rate_list[VBR_RC_INFO_MAX_FRAMES];
+} RATECTRL_LOG;
+
+static INLINE void rc_log_init(RATECTRL_LOG *rc_log) { av1_zero(*rc_log); }
+
+static INLINE void rc_log_frame_stats(RATECTRL_LOG *rc_log, int coding_index,
+ const TplTxfmStats *txfm_stats) {
+ rc_log->txfm_stats_list[coding_index] = *txfm_stats;
+}
+
+static INLINE void rc_log_frame_encode_param(RATECTRL_LOG *rc_log,
+ int coding_index,
+ double qstep_ratio, int q_index,
+ FRAME_UPDATE_TYPE update_type) {
+ rc_log->qstep_ratio_list[coding_index] = qstep_ratio;
+ rc_log->q_index_list[coding_index] = q_index;
+ rc_log->update_type_list[coding_index] = update_type;
+ const TplTxfmStats *txfm_stats = &rc_log->txfm_stats_list[coding_index];
+ rc_log->est_coeff_rate_list[coding_index] = 0;
+ if (txfm_stats->ready) {
+ rc_log->est_coeff_rate_list[coding_index] = av1_laplace_estimate_frame_rate(
+ q_index, txfm_stats->txfm_block_count, txfm_stats->abs_coeff_mean,
+ txfm_stats->coeff_num);
+ }
+}
+
+static INLINE void rc_log_frame_entropy(RATECTRL_LOG *rc_log, int coding_index,
+ double act_rate,
+ double act_coeff_rate) {
+ rc_log->act_rate_list[coding_index] = act_rate;
+ rc_log->act_coeff_rate_list[coding_index] = act_coeff_rate;
+}
+
+static INLINE void rc_log_record_chunk_info(RATECTRL_LOG *rc_log,
+ int base_q_index,
+ int coding_frame_count) {
+ rc_log->base_q_index = base_q_index;
+ rc_log->coding_frame_count = coding_frame_count;
+}
+
+static INLINE void rc_log_show(const RATECTRL_LOG *rc_log) {
+ printf("= chunk 1\n");
+ printf("coding_frame_count %d base_q_index %d\n", rc_log->coding_frame_count,
+ rc_log->base_q_index);
+ printf("= frame %d\n", rc_log->coding_frame_count);
+ for (int coding_idx = 0; coding_idx < rc_log->coding_frame_count;
+ coding_idx++) {
+ printf(
+ "coding_idx %d update_type %d q %d qstep_ratio %f est_coeff_rate %f "
+ "act_coeff_rate %f act_rate %f\n",
+ coding_idx, rc_log->update_type_list[coding_idx],
+ rc_log->q_index_list[coding_idx], rc_log->qstep_ratio_list[coding_idx],
+ rc_log->est_coeff_rate_list[coding_idx],
+ rc_log->act_coeff_rate_list[coding_idx],
+ rc_log->act_rate_list[coding_idx]);
+ }
+}
+#endif // CONFIG_RATECTRL_LOG
+
+/*!\endcond */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TPL_MODEL_H_
diff --git a/third_party/aom/av1/encoder/tune_butteraugli.c b/third_party/aom/av1/encoder/tune_butteraugli.c
new file mode 100644
index 0000000000..92fc4b2a92
--- /dev/null
+++ b/third_party/aom/av1/encoder/tune_butteraugli.c
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "av1/encoder/tune_butteraugli.h"
+
+#include "aom_dsp/butteraugli.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder_utils.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/var_based_part.h"
+
+static const int resize_factor = 2;
+
+static void set_mb_butteraugli_rdmult_scaling(AV1_COMP *cpi,
+ const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *recon,
+ const double K) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = cm->seq_params;
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const aom_color_range_t color_range =
+ seq_params->color_range != 0 ? AOM_CR_FULL_RANGE : AOM_CR_STUDIO_RANGE;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int width = source->y_crop_width;
+ const int height = source->y_crop_height;
+ const int ss_x = source->subsampling_x;
+ const int ss_y = source->subsampling_y;
+
+ float *diffmap;
+ CHECK_MEM_ERROR(cm, diffmap, aom_malloc(width * height * sizeof(*diffmap)));
+ if (!aom_calc_butteraugli(source, recon, bit_depth,
+ seq_params->matrix_coefficients, color_range,
+ diffmap)) {
+ aom_internal_error(cm->error, AOM_CODEC_ERROR,
+ "Failed to calculate Butteraugli distances.");
+ }
+
+ const int num_mi_w = mi_size_wide[butteraugli_rdo_bsize] / resize_factor;
+ const int num_mi_h = mi_size_high[butteraugli_rdo_bsize] / resize_factor;
+ const int num_cols =
+ (mi_params->mi_cols / resize_factor + num_mi_w - 1) / num_mi_w;
+ const int num_rows =
+ (mi_params->mi_rows / resize_factor + num_mi_h - 1) / num_mi_h;
+ const int block_w = num_mi_w << 2;
+ const int block_h = num_mi_h << 2;
+ double log_sum = 0.0;
+ double blk_count = 0.0;
+
+ // Loop through each block.
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ const int y_start = row * block_h;
+ const int x_start = col * block_w;
+ float dbutteraugli = 0.0f;
+ float dmse = 0.0f;
+ float px_count = 0.0f;
+
+ // Loop through each pixel.
+ for (int y = y_start; y < y_start + block_h && y < height; y++) {
+ for (int x = x_start; x < x_start + block_w && x < width; x++) {
+ dbutteraugli += powf(diffmap[y * width + x], 12.0f);
+ float px_diff = source->y_buffer[y * source->y_stride + x] -
+ recon->y_buffer[y * recon->y_stride + x];
+ dmse += px_diff * px_diff;
+ px_count += 1.0f;
+ }
+ }
+ const int y_end = AOMMIN((y_start >> ss_y) + (block_h >> ss_y),
+ (height + ss_y) >> ss_y);
+ for (int y = y_start >> ss_y; y < y_end; y++) {
+ const int x_end = AOMMIN((x_start >> ss_x) + (block_w >> ss_x),
+ (width + ss_x) >> ss_x);
+ for (int x = x_start >> ss_x; x < x_end; x++) {
+ const int src_px_index = y * source->uv_stride + x;
+ const int recon_px_index = y * recon->uv_stride + x;
+ const float px_diff_u = (float)(source->u_buffer[src_px_index] -
+ recon->u_buffer[recon_px_index]);
+ const float px_diff_v = (float)(source->v_buffer[src_px_index] -
+ recon->v_buffer[recon_px_index]);
+ dmse += px_diff_u * px_diff_u + px_diff_v * px_diff_v;
+ px_count += 2.0f;
+ }
+ }
+
+ dbutteraugli = powf(dbutteraugli, 1.0f / 12.0f);
+ dmse = dmse / px_count;
+ const float eps = 0.01f;
+ double weight;
+ if (dbutteraugli < eps || dmse < eps) {
+ weight = -1.0;
+ } else {
+ blk_count += 1.0;
+ weight = dmse / dbutteraugli;
+ weight = AOMMIN(weight, 5.0);
+ weight += K;
+ log_sum += log(weight);
+ }
+ cpi->butteraugli_info.rdmult_scaling_factors[index] = weight;
+ }
+ }
+  // log_sum now holds the geometric mean of the weights.
+  log_sum = exp(log_sum / blk_count);
+
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ double *weight = &cpi->butteraugli_info.rdmult_scaling_factors[index];
+ if (*weight <= 0.0) {
+ *weight = 1.0;
+ } else {
+ *weight /= log_sum;
+ }
+ *weight = AOMMIN(*weight, 2.5);
+ *weight = AOMMAX(*weight, 0.4);
+ }
+ }
+
+ aom_free(diffmap);
+}
+
+void av1_set_butteraugli_rdmult(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int *rdmult) {
+ assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI);
+ if (!cpi->butteraugli_info.recon_set) {
+ return;
+ }
+ const AV1_COMMON *const cm = &cpi->common;
+
+ const int num_mi_w = mi_size_wide[butteraugli_rdo_bsize];
+ const int num_mi_h = mi_size_high[butteraugli_rdo_bsize];
+ const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+ const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
+ const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+ double num_of_mi = 0.0;
+ double geom_mean_of_scale = 0.0;
+
+ for (int row = mi_row / num_mi_w;
+ row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
+ for (int col = mi_col / num_mi_h;
+ col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) {
+ const int index = row * num_cols + col;
+ geom_mean_of_scale +=
+ log(cpi->butteraugli_info.rdmult_scaling_factors[index]);
+ num_of_mi += 1.0;
+ }
+ }
+ geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi);
+
+ *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
+ *rdmult = AOMMAX(*rdmult, 0);
+ av1_set_error_per_bit(&x->errorperbit, *rdmult);
+}
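+
+// Illustrative effect of the scaling above (hypothetical numbers): if the
+// butteraugli scaling factors covered by a block average to 0.4, its rdmult
+// drops to 40% of the frame-level value, steering the RD search toward
+// spending bits where butteraugli reports visible distortion; an average
+// near 2.5 has the opposite effect.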
+
+static void copy_plane(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h) {
+ for (int row = 0; row < h; row++) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void copy_img(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+ int width, int height) {
+ copy_plane(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, width,
+ height);
+ const int width_uv = (width + src->subsampling_x) >> src->subsampling_x;
+ const int height_uv = (height + src->subsampling_y) >> src->subsampling_y;
+ copy_plane(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ width_uv, height_uv);
+ copy_plane(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ width_uv, height_uv);
+}
+
+static void zero_plane(uint8_t *dst, int dst_stride, int h) {
+ for (int row = 0; row < h; row++) {
+ memset(dst, 0, dst_stride);
+ dst += dst_stride;
+ }
+}
+
+static void zero_img(YV12_BUFFER_CONFIG *dst) {
+ zero_plane(dst->y_buffer, dst->y_stride, dst->y_height);
+ zero_plane(dst->u_buffer, dst->uv_stride, dst->uv_height);
+ zero_plane(dst->v_buffer, dst->uv_stride, dst->uv_height);
+}
+
+void av1_setup_butteraugli_source(AV1_COMP *cpi) {
+ YV12_BUFFER_CONFIG *const dst = &cpi->butteraugli_info.source;
+ AV1_COMMON *const cm = &cpi->common;
+ const int width = cpi->source->y_crop_width;
+ const int height = cpi->source->y_crop_height;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int ss_x = cpi->source->subsampling_x;
+ const int ss_y = cpi->source->subsampling_y;
+ if (dst->buffer_alloc_sz == 0) {
+ aom_alloc_frame_buffer(
+ dst, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0);
+ }
+ av1_copy_and_extend_frame(cpi->source, dst);
+
+ YV12_BUFFER_CONFIG *const resized_dst = &cpi->butteraugli_info.resized_source;
+ if (resized_dst->buffer_alloc_sz == 0) {
+ aom_alloc_frame_buffer(
+ resized_dst, width / resize_factor, height / resize_factor, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ }
+ if (!av1_resize_and_extend_frame_nonnormative(
+ cpi->source, resized_dst, bit_depth, av1_num_planes(cm))) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffers during resize");
+ }
+
+ zero_img(cpi->source);
+ copy_img(resized_dst, cpi->source, width / resize_factor,
+ height / resize_factor);
+}
+
+void av1_setup_butteraugli_rdmult_and_restore_source(AV1_COMP *cpi, double K) {
+ av1_copy_and_extend_frame(&cpi->butteraugli_info.source, cpi->source);
+ AV1_COMMON *const cm = &cpi->common;
+ const int width = cpi->source->y_crop_width;
+ const int height = cpi->source->y_crop_height;
+ const int ss_x = cpi->source->subsampling_x;
+ const int ss_y = cpi->source->subsampling_y;
+
+ YV12_BUFFER_CONFIG resized_recon;
+ memset(&resized_recon, 0, sizeof(resized_recon));
+ aom_alloc_frame_buffer(
+ &resized_recon, width / resize_factor, height / resize_factor, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ copy_img(&cpi->common.cur_frame->buf, &resized_recon, width / resize_factor,
+ height / resize_factor);
+
+ set_mb_butteraugli_rdmult_scaling(cpi, &cpi->butteraugli_info.resized_source,
+ &resized_recon, K);
+ cpi->butteraugli_info.recon_set = true;
+ aom_free_frame_buffer(&resized_recon);
+}
+
+void av1_setup_butteraugli_rdmult(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const QuantizationCfg *const q_cfg = &oxcf->q_cfg;
+ const int q_index = 96;
+
+ // Setup necessary params for encoding, including frame source, etc.
+ if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi);
+ av1_set_frame_size(cpi, cm->superres_upscaled_width,
+ cm->superres_upscaled_height);
+
+ cpi->source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter,
+ 0, false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels);
+ if (cpi->unscaled_last_source != NULL) {
+ cpi->last_source = av1_realloc_and_scale_if_required(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+ cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels,
+ cpi->image_pyramid_levels);
+ }
+
+ av1_setup_butteraugli_source(cpi);
+ av1_setup_frame(cpi);
+
+ if (cm->seg.enabled) {
+ if (!cm->seg.update_data && cm->prev_frame) {
+ segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ cm->seg.enabled = cm->prev_frame->seg.enabled;
+ } else {
+ av1_calculate_segdata(&cm->seg);
+ }
+ } else {
+ memset(&cm->seg, 0, sizeof(cm->seg));
+ }
+ segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+ cm->cur_frame->seg.enabled = cm->seg.enabled;
+
+ const PARTITION_SEARCH_TYPE partition_search_type =
+ cpi->sf.part_sf.partition_search_type;
+ const BLOCK_SIZE fixed_partition_size = cpi->sf.part_sf.fixed_partition_size;
+ // Enable a quicker pass by uncommenting the following lines:
+ // cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+ // cpi->sf.part_sf.fixed_partition_size = BLOCK_32X32;
+
+ av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q_index,
+ q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
+ av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params->bit_depth);
+
+ av1_set_variance_partition_thresholds(cpi, q_index, 0);
+ av1_encode_frame(cpi);
+
+ av1_setup_butteraugli_rdmult_and_restore_source(cpi, 0.3);
+ cpi->sf.part_sf.partition_search_type = partition_search_type;
+ cpi->sf.part_sf.fixed_partition_size = fixed_partition_size;
+}
diff --git a/third_party/aom/av1/encoder/tune_butteraugli.h b/third_party/aom/av1/encoder/tune_butteraugli.h
new file mode 100644
index 0000000000..bae5d2a882
--- /dev/null
+++ b/third_party/aom/av1/encoder/tune_butteraugli.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_
+#define AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_
+
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/block.h"
+
+typedef struct {
+ // Stores the scaling factors for rdmult when tuning for Butteraugli.
+ // rdmult_scaling_factors[row * num_cols + col] stores the scaling factors for
+ // 4x4 block at (row, col).
+ double *rdmult_scaling_factors;
+ YV12_BUFFER_CONFIG source, resized_source;
+ bool recon_set;
+} TuneButteraugliInfo;
+
+struct AV1_COMP;
+static const BLOCK_SIZE butteraugli_rdo_bsize = BLOCK_16X16;
+
+void av1_set_butteraugli_rdmult(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int *rdmult);
+
+void av1_setup_butteraugli_source(struct AV1_COMP *cpi);
+
+// 'K' is used to balance the rate-distortion distribution between PSNR
+// and Butteraugli.
+void av1_setup_butteraugli_rdmult_and_restore_source(struct AV1_COMP *cpi,
+ double K);
+
+void av1_setup_butteraugli_rdmult(struct AV1_COMP *cpi);
+
+#endif // AOM_AV1_ENCODER_TUNE_BUTTERAUGLI_H_
diff --git a/third_party/aom/av1/encoder/tune_vmaf.c b/third_party/aom/av1/encoder/tune_vmaf.c
new file mode 100644
index 0000000000..4e5ffa387c
--- /dev/null
+++ b/third_party/aom/av1/encoder/tune_vmaf.c
@@ -0,0 +1,1112 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/tune_vmaf.h"
+
+#include "aom_dsp/psnr.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/rdopt.h"
+#include "config/aom_scale_rtcd.h"
+
+static const double kBaselineVmaf = 97.42773;
+
+static double get_layer_value(const double *array, int layer) {
+ while (array[layer] < 0.0 && layer > 0) layer--;
+ return AOMMAX(array[layer], 0.0);
+}
+
+static void motion_search(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *src,
+ const YV12_BUFFER_CONFIG *ref,
+ const BLOCK_SIZE block_size, const int mb_row,
+ const int mb_col, FULLPEL_MV *ref_mv) {
+ // Block information (ONLY Y-plane is used for motion search).
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int y_stride = src->y_stride;
+ assert(y_stride == ref->y_stride);
+ const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+
+ // Save input state.
+ MACROBLOCK *const mb = &cpi->td.mb;
+ MACROBLOCKD *const mbd = &mb->e_mbd;
+ const struct buf_2d ori_src_buf = mb->plane[0].src;
+ const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0];
+
+ // Parameters used for motion search.
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+ FULLPEL_MV_STATS best_mv_stats;
+ const SEARCH_METHODS search_method = NSTEP;
+ const search_site_config *search_site_cfg =
+ cpi->mv_search_params.search_site_cfg[SS_CFG_FPF];
+ const int step_param =
+ av1_init_search_range(AOMMAX(src->y_crop_width, src->y_crop_height));
+
+ // Baseline position for motion search (used for rate distortion comparison).
+ const MV baseline_mv = kZeroMv;
+
+ // Setup.
+ mb->plane[0].src.buf = src->y_buffer + y_offset;
+ mb->plane[0].src.stride = y_stride;
+ mbd->plane[0].pre[0].buf = ref->y_buffer + y_offset;
+ mbd->plane[0].pre[0].stride = y_stride;
+
+ // Unused intermediate results for motion search.
+ int cost_list[5];
+
+ // Do motion search.
+ // Only do full search on the entire block.
+ av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
+ &baseline_mv, *ref_mv, search_site_cfg,
+ search_method,
+ /*fine_search_interval=*/0);
+ av1_full_pixel_search(*ref_mv, &full_ms_params, step_param,
+ cond_cost_list(cpi, cost_list), ref_mv, &best_mv_stats,
+ NULL);
+
+ // Restore input state.
+ mb->plane[0].src = ori_src_buf;
+ mbd->plane[0].pre[0] = ori_pre_buf;
+}
+
+static unsigned int residual_variance(const AV1_COMP *cpi,
+ const YV12_BUFFER_CONFIG *src,
+ const YV12_BUFFER_CONFIG *ref,
+ const BLOCK_SIZE block_size,
+ const int mb_row, const int mb_col,
+ FULLPEL_MV ref_mv, unsigned int *sse) {
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int y_stride = src->y_stride;
+ assert(y_stride == ref->y_stride);
+ const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+ const int mv_offset = ref_mv.row * y_stride + ref_mv.col;
+ const unsigned int var = cpi->ppi->fn_ptr[block_size].vf(
+ ref->y_buffer + y_offset + mv_offset, y_stride, src->y_buffer + y_offset,
+ y_stride, sse);
+ return var;
+}
+
+static double frame_average_variance(const AV1_COMP *const cpi,
+ const YV12_BUFFER_CONFIG *const frame) {
+ const MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ const uint8_t *const y_buffer = frame->y_buffer;
+ const int y_stride = frame->y_stride;
+ const BLOCK_SIZE block_size = BLOCK_64X64;
+
+ const int block_w = mi_size_wide[block_size] * 4;
+ const int block_h = mi_size_high[block_size] * 4;
+ int row, col;
+ double var = 0.0, var_count = 0.0;
+ const int use_hbd = frame->flags & YV12_FLAG_HIGHBITDEPTH;
+
+ // Loop through each block.
+ for (row = 0; row < frame->y_height / block_h; ++row) {
+ for (col = 0; col < frame->y_width / block_w; ++col) {
+ struct buf_2d buf;
+ const int row_offset_y = row * block_h;
+ const int col_offset_y = col * block_w;
+
+ buf.buf = (uint8_t *)y_buffer + row_offset_y * y_stride + col_offset_y;
+ buf.stride = y_stride;
+
+ var += av1_get_perpixel_variance(cpi, xd, &buf, block_size, AOM_PLANE_Y,
+ use_hbd);
+ var_count += 1.0;
+ }
+ }
+ var /= var_count;
+ return var;
+}
+
+static double residual_frame_average_variance(AV1_COMP *cpi,
+ const YV12_BUFFER_CONFIG *src,
+ const YV12_BUFFER_CONFIG *ref,
+ FULLPEL_MV *mvs) {
+ if (ref == NULL) return frame_average_variance(cpi, src);
+ const BLOCK_SIZE block_size = BLOCK_16X16;
+ const int frame_height = src->y_height;
+ const int frame_width = src->y_width;
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int mb_rows = (frame_height + mb_height - 1) / mb_height;
+ const int mb_cols = (frame_width + mb_width - 1) / mb_width;
+ const int num_planes = av1_num_planes(&cpi->common);
+ const int mi_h = mi_size_high_log2[block_size];
+ const int mi_w = mi_size_wide_log2[block_size];
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+ // Save input state.
+ MACROBLOCK *const mb = &cpi->td.mb;
+ MACROBLOCKD *const mbd = &mb->e_mbd;
+ uint8_t *input_buffer[MAX_MB_PLANE];
+ for (int i = 0; i < num_planes; i++) {
+ input_buffer[i] = mbd->plane[i].pre[0].buf;
+ }
+ MB_MODE_INFO **input_mb_mode_info = mbd->mi;
+
+ bool do_motion_search = false;
+ if (mvs == NULL) {
+ do_motion_search = true;
+ CHECK_MEM_ERROR(&cpi->common, mvs,
+ (FULLPEL_MV *)aom_calloc(mb_rows * mb_cols, sizeof(*mvs)));
+ }
+
+ unsigned int variance = 0;
+  // Compute the motion-compensated residual variance block by block.
+ for (int mb_row = 0; mb_row < mb_rows; mb_row++) {
+ av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+ (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+ cpi->oxcf.border_in_pixels);
+ for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
+ av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+ (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
+ cpi->oxcf.border_in_pixels);
+ FULLPEL_MV *ref_mv = &mvs[mb_col + mb_row * mb_cols];
+ if (do_motion_search) {
+ motion_search(cpi, src, ref, block_size, mb_row, mb_col, ref_mv);
+ }
+ unsigned int mv_sse;
+ const unsigned int blk_var = residual_variance(
+ cpi, src, ref, block_size, mb_row, mb_col, *ref_mv, &mv_sse);
+ variance += blk_var;
+ }
+ }
+
+  // Restore input state.
+  for (int i = 0; i < num_planes; i++) {
+    mbd->plane[i].pre[0].buf = input_buffer[i];
+  }
+  mbd->mi = input_mb_mode_info;
+  // Free the motion vectors if they were allocated locally above.
+  if (do_motion_search) aom_free(mvs);
+  return (double)variance / (double)(mb_rows * mb_cols);
+}
+
+// TODO(sdeng): Add the SIMD implementation.
+static AOM_INLINE void highbd_unsharp_rect(const uint16_t *source,
+ int source_stride,
+ const uint16_t *blurred,
+ int blurred_stride, uint16_t *dst,
+ int dst_stride, int w, int h,
+ double amount, int bit_depth) {
+ const int max_value = (1 << bit_depth) - 1;
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ const double val =
+ (double)source[j] + amount * ((double)source[j] - (double)blurred[j]);
+ dst[j] = (uint16_t)clamp((int)(val + 0.5), 0, max_value);
+ }
+ source += source_stride;
+ blurred += blurred_stride;
+ dst += dst_stride;
+ }
+}
+
+static AOM_INLINE void unsharp_rect(const uint8_t *source, int source_stride,
+ const uint8_t *blurred, int blurred_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ double amount) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ const double val =
+ (double)source[j] + amount * ((double)source[j] - (double)blurred[j]);
+ dst[j] = (uint8_t)clamp((int)(val + 0.5), 0, 255);
+ }
+ source += source_stride;
+ blurred += blurred_stride;
+ dst += dst_stride;
+ }
+}
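+// Worked example of the unsharp mask above (illustrative values): with
+// source = 100, blurred = 90 and amount = 0.5, the output is
+//   100 + 0.5 * (100 - 90) = 105,
+// i.e. the high-frequency detail (source - blurred) is amplified on top of
+// the source and the result is clamped to the valid pixel range.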
+
+static AOM_INLINE void unsharp(const AV1_COMP *const cpi,
+ const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *blurred,
+ const YV12_BUFFER_CONFIG *dst, double amount) {
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ if (cpi->common.seq_params->use_highbitdepth) {
+ assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+ assert(blurred->flags & YV12_FLAG_HIGHBITDEPTH);
+ assert(dst->flags & YV12_FLAG_HIGHBITDEPTH);
+ highbd_unsharp_rect(CONVERT_TO_SHORTPTR(source->y_buffer), source->y_stride,
+ CONVERT_TO_SHORTPTR(blurred->y_buffer),
+ blurred->y_stride, CONVERT_TO_SHORTPTR(dst->y_buffer),
+ dst->y_stride, source->y_width, source->y_height,
+ amount, bit_depth);
+ } else {
+ unsharp_rect(source->y_buffer, source->y_stride, blurred->y_buffer,
+ blurred->y_stride, dst->y_buffer, dst->y_stride,
+ source->y_width, source->y_height, amount);
+ }
+}
+
+// 8-tap Gaussian convolution filter with sigma = 1.0. The taps sum to 128
+// (matching the convolution's 1 << FILTER_BITS normalization), and all
+// coefficients must be even.
+DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 0, 8, 30, 52,
+ 30, 8, 0, 0 };
+static AOM_INLINE void gaussian_blur(const int bit_depth,
+ const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dst) {
+ const int block_size = BLOCK_128X128;
+ const int block_w = mi_size_wide[block_size] * 4;
+ const int block_h = mi_size_high[block_size] * 4;
+ const int num_cols = (source->y_width + block_w - 1) / block_w;
+ const int num_rows = (source->y_height + block_h - 1) / block_h;
+ int row, col;
+
+ ConvolveParams conv_params = get_conv_params(0, 0, bit_depth);
+ InterpFilterParams filter = { .filter_ptr = gauss_filter,
+ .taps = 8,
+ .interp_filter = EIGHTTAP_REGULAR };
+
+ for (row = 0; row < num_rows; ++row) {
+ for (col = 0; col < num_cols; ++col) {
+ const int row_offset_y = row * block_h;
+ const int col_offset_y = col * block_w;
+
+ uint8_t *src_buf =
+ source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
+ uint8_t *dst_buf =
+ dst->y_buffer + row_offset_y * dst->y_stride + col_offset_y;
+
+ if (source->flags & YV12_FLAG_HIGHBITDEPTH) {
+ av1_highbd_convolve_2d_sr(
+ CONVERT_TO_SHORTPTR(src_buf), source->y_stride,
+ CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride, block_w, block_h,
+ &filter, &filter, 0, 0, &conv_params, bit_depth);
+ } else {
+ av1_convolve_2d_sr(src_buf, source->y_stride, dst_buf, dst->y_stride,
+ block_w, block_h, &filter, &filter, 0, 0,
+ &conv_params);
+ }
+ }
+ }
+}
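+// Note: the blur is separable; av1_convolve_2d_sr() (and its high-bitdepth
+// variant) applies the same 1-D Gaussian kernel horizontally and vertically
+// via the two &filter arguments above.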
+
+static AOM_INLINE double cal_approx_vmaf(const AV1_COMP *const cpi,
+ double source_variance,
+ YV12_BUFFER_CONFIG *const source,
+ YV12_BUFFER_CONFIG *const sharpened) {
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const bool cal_vmaf_neg =
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+ double new_vmaf;
+
+ aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, sharpened, bit_depth,
+ cal_vmaf_neg, &new_vmaf);
+
+ const double sharpened_var = frame_average_variance(cpi, sharpened);
+ return source_variance / sharpened_var * (new_vmaf - kBaselineVmaf);
+}
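+// Illustrative reading of cal_approx_vmaf() (assumed numbers): if sharpening
+// raises the frame variance from 100 to 125 while lifting VMAF above
+// kBaselineVmaf by 2.0, the returned score is 100 / 125 * 2.0 = 1.6, so VMAF
+// gains are discounted when they come with a large variance increase.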
+
+static double find_best_frame_unsharp_amount_loop(
+ const AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source,
+ YV12_BUFFER_CONFIG *const blurred, YV12_BUFFER_CONFIG *const sharpened,
+ double best_vmaf, const double baseline_variance,
+ const double unsharp_amount_start, const double step_size,
+ const int max_loop_count, const double max_amount) {
+ const double min_amount = 0.0;
+ int loop_count = 0;
+ double approx_vmaf = best_vmaf;
+ double unsharp_amount = unsharp_amount_start;
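+  // Hill climb: keep stepping the unsharp amount while the approximate VMAF
+  // keeps improving, then step back once past the peak.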
+ do {
+ best_vmaf = approx_vmaf;
+ unsharp_amount += step_size;
+ if (unsharp_amount > max_amount || unsharp_amount < min_amount) break;
+ unsharp(cpi, source, blurred, sharpened, unsharp_amount);
+ approx_vmaf = cal_approx_vmaf(cpi, baseline_variance, source, sharpened);
+
+ loop_count++;
+ } while (approx_vmaf > best_vmaf && loop_count < max_loop_count);
+ unsharp_amount =
+ approx_vmaf > best_vmaf ? unsharp_amount : unsharp_amount - step_size;
+ return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount));
+}
+
+static double find_best_frame_unsharp_amount(const AV1_COMP *const cpi,
+ YV12_BUFFER_CONFIG *const source,
+ YV12_BUFFER_CONFIG *const blurred,
+ const double unsharp_amount_start,
+ const double step_size,
+ const int max_loop_count,
+ const double max_filter_amount) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int width = source->y_width;
+ const int height = source->y_height;
+ YV12_BUFFER_CONFIG sharpened;
+ memset(&sharpened, 0, sizeof(sharpened));
+ aom_alloc_frame_buffer(
+ &sharpened, width, height, source->subsampling_x, source->subsampling_y,
+ cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ const double baseline_variance = frame_average_variance(cpi, source);
+ double unsharp_amount;
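+  // If there is no usable previous amount, search upward from 0; otherwise
+  // probe the two amounts around the previous one and hill-climb in the
+  // direction that scores better.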
+ if (unsharp_amount_start <= step_size) {
+ unsharp_amount = find_best_frame_unsharp_amount_loop(
+ cpi, source, blurred, &sharpened, 0.0, baseline_variance, 0.0,
+ step_size, max_loop_count, max_filter_amount);
+ } else {
+ double a0 = unsharp_amount_start - step_size, a1 = unsharp_amount_start;
+ double v0, v1;
+ unsharp(cpi, source, blurred, &sharpened, a0);
+ v0 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened);
+ unsharp(cpi, source, blurred, &sharpened, a1);
+ v1 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened);
+ if (fabs(v0 - v1) < 0.01) {
+ unsharp_amount = a0;
+ } else if (v0 > v1) {
+ unsharp_amount = find_best_frame_unsharp_amount_loop(
+ cpi, source, blurred, &sharpened, v0, baseline_variance, a0,
+ -step_size, max_loop_count, max_filter_amount);
+ } else {
+ unsharp_amount = find_best_frame_unsharp_amount_loop(
+ cpi, source, blurred, &sharpened, v1, baseline_variance, a1,
+ step_size, max_loop_count, max_filter_amount);
+ }
+ }
+
+ aom_free_frame_buffer(&sharpened);
+ return unsharp_amount;
+}
+
+void av1_vmaf_neg_preprocessing(AV1_COMP *const cpi,
+ YV12_BUFFER_CONFIG *const source) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int width = source->y_width;
+ const int height = source->y_height;
+
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int layer_depth =
+ AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+ const double best_frame_unsharp_amount =
+ get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+
+ if (best_frame_unsharp_amount <= 0.0) return;
+
+ YV12_BUFFER_CONFIG blurred;
+ memset(&blurred, 0, sizeof(blurred));
+ aom_alloc_frame_buffer(
+ &blurred, width, height, source->subsampling_x, source->subsampling_y,
+ cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ gaussian_blur(bit_depth, source, &blurred);
+ unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount);
+ aom_free_frame_buffer(&blurred);
+}
+
+void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi,
+ YV12_BUFFER_CONFIG *const source) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int width = source->y_width;
+ const int height = source->y_height;
+
+ YV12_BUFFER_CONFIG source_extended, blurred;
+ memset(&source_extended, 0, sizeof(source_extended));
+ memset(&blurred, 0, sizeof(blurred));
+ aom_alloc_frame_buffer(
+ &source_extended, width, height, source->subsampling_x,
+ source->subsampling_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(
+ &blurred, width, height, source->subsampling_x, source->subsampling_y,
+ cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ av1_copy_and_extend_frame(source, &source_extended);
+ gaussian_blur(bit_depth, &source_extended, &blurred);
+ aom_free_frame_buffer(&source_extended);
+
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int layer_depth =
+ AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+ const double last_frame_unsharp_amount =
+ get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+
+ const double best_frame_unsharp_amount = find_best_frame_unsharp_amount(
+ cpi, source, &blurred, last_frame_unsharp_amount, 0.05, 20, 1.01);
+
+ cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] =
+ best_frame_unsharp_amount;
+
+ unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount);
+ aom_free_frame_buffer(&blurred);
+}
+
+void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi,
+ YV12_BUFFER_CONFIG *const source) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int width = source->y_width;
+ const int height = source->y_height;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int ss_x = source->subsampling_x;
+ const int ss_y = source->subsampling_y;
+
+ YV12_BUFFER_CONFIG source_extended, blurred;
+ memset(&blurred, 0, sizeof(blurred));
+ memset(&source_extended, 0, sizeof(source_extended));
+ aom_alloc_frame_buffer(
+ &blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&source_extended, width, height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ av1_copy_and_extend_frame(source, &source_extended);
+ gaussian_blur(bit_depth, &source_extended, &blurred);
+ aom_free_frame_buffer(&source_extended);
+
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int layer_depth =
+ AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+ const double last_frame_unsharp_amount =
+ get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+
+ const double best_frame_unsharp_amount = find_best_frame_unsharp_amount(
+ cpi, source, &blurred, last_frame_unsharp_amount, 0.05, 20, 1.01);
+
+ cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] =
+ best_frame_unsharp_amount;
+
+ const int block_size = BLOCK_64X64;
+ const int block_w = mi_size_wide[block_size] * 4;
+ const int block_h = mi_size_high[block_size] * 4;
+ const int num_cols = (source->y_width + block_w - 1) / block_w;
+ const int num_rows = (source->y_height + block_h - 1) / block_h;
+ double *best_unsharp_amounts =
+ aom_calloc(num_cols * num_rows, sizeof(*best_unsharp_amounts));
+ if (!best_unsharp_amounts) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating vmaf data");
+ }
+
+ YV12_BUFFER_CONFIG source_block, blurred_block;
+ memset(&source_block, 0, sizeof(source_block));
+ memset(&blurred_block, 0, sizeof(blurred_block));
+ aom_alloc_frame_buffer(&source_block, block_w, block_h, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&blurred_block, block_w, block_h, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int row_offset_y = row * block_h;
+ const int col_offset_y = col * block_w;
+ const int block_width = AOMMIN(width - col_offset_y, block_w);
+ const int block_height = AOMMIN(height - row_offset_y, block_h);
+ const int index = col + row * num_cols;
+
+ if (cm->seq_params->use_highbitdepth) {
+ assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+ assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH);
+ uint16_t *frame_src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
+ row_offset_y * source->y_stride +
+ col_offset_y;
+ uint16_t *frame_blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) +
+ row_offset_y * blurred.y_stride +
+ col_offset_y;
+ uint16_t *blurred_dst = CONVERT_TO_SHORTPTR(blurred_block.y_buffer);
+ uint16_t *src_dst = CONVERT_TO_SHORTPTR(source_block.y_buffer);
+
+ // Copy block from source frame.
+ for (int i = 0; i < block_h; ++i) {
+ for (int j = 0; j < block_w; ++j) {
+ if (i >= block_height || j >= block_width) {
+ src_dst[j] = 0;
+ blurred_dst[j] = 0;
+ } else {
+ src_dst[j] = frame_src_buf[j];
+ blurred_dst[j] = frame_blurred_buf[j];
+ }
+ }
+ frame_src_buf += source->y_stride;
+ frame_blurred_buf += blurred.y_stride;
+ src_dst += source_block.y_stride;
+ blurred_dst += blurred_block.y_stride;
+ }
+ } else {
+ uint8_t *frame_src_buf =
+ source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
+ uint8_t *frame_blurred_buf =
+ blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
+ uint8_t *blurred_dst = blurred_block.y_buffer;
+ uint8_t *src_dst = source_block.y_buffer;
+
+ // Copy block from source frame.
+ for (int i = 0; i < block_h; ++i) {
+ for (int j = 0; j < block_w; ++j) {
+ if (i >= block_height || j >= block_width) {
+ src_dst[j] = 0;
+ blurred_dst[j] = 0;
+ } else {
+ src_dst[j] = frame_src_buf[j];
+ blurred_dst[j] = frame_blurred_buf[j];
+ }
+ }
+ frame_src_buf += source->y_stride;
+ frame_blurred_buf += blurred.y_stride;
+ src_dst += source_block.y_stride;
+ blurred_dst += blurred_block.y_stride;
+ }
+ }
+
+ best_unsharp_amounts[index] = find_best_frame_unsharp_amount(
+ cpi, &source_block, &blurred_block, best_frame_unsharp_amount, 0.1, 3,
+ 1.5);
+ }
+ }
+
+  // Apply the best per-block unsharp amounts to the source frame.
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int row_offset_y = row * block_h;
+ const int col_offset_y = col * block_w;
+ const int block_width = AOMMIN(source->y_width - col_offset_y, block_w);
+ const int block_height = AOMMIN(source->y_height - row_offset_y, block_h);
+ const int index = col + row * num_cols;
+
+ if (cm->seq_params->use_highbitdepth) {
+ assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+ assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH);
+ uint16_t *src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
+ row_offset_y * source->y_stride + col_offset_y;
+ uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) +
+ row_offset_y * blurred.y_stride + col_offset_y;
+ highbd_unsharp_rect(src_buf, source->y_stride, blurred_buf,
+ blurred.y_stride, src_buf, source->y_stride,
+ block_width, block_height,
+ best_unsharp_amounts[index], bit_depth);
+ } else {
+ uint8_t *src_buf =
+ source->y_buffer + row_offset_y * source->y_stride + col_offset_y;
+ uint8_t *blurred_buf =
+ blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
+ unsharp_rect(src_buf, source->y_stride, blurred_buf, blurred.y_stride,
+ src_buf, source->y_stride, block_width, block_height,
+ best_unsharp_amounts[index]);
+ }
+ }
+ }
+
+ aom_free_frame_buffer(&source_block);
+ aom_free_frame_buffer(&blurred_block);
+ aom_free_frame_buffer(&blurred);
+ aom_free(best_unsharp_amounts);
+}
+
+void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ const int y_width = cpi->source->y_width;
+ const int y_height = cpi->source->y_height;
+ const int resized_block_size = BLOCK_32X32;
+ const int resize_factor = 2;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int ss_x = cpi->source->subsampling_x;
+ const int ss_y = cpi->source->subsampling_y;
+
+ YV12_BUFFER_CONFIG resized_source;
+ memset(&resized_source, 0, sizeof(resized_source));
+ aom_alloc_frame_buffer(
+ &resized_source, y_width / resize_factor, y_height / resize_factor, ss_x,
+ ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ if (!av1_resize_and_extend_frame_nonnormative(
+ cpi->source, &resized_source, bit_depth, av1_num_planes(cm))) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating buffers during resize");
+ }
+
+ const int resized_y_width = resized_source.y_width;
+ const int resized_y_height = resized_source.y_height;
+ const int resized_block_w = mi_size_wide[resized_block_size] * 4;
+ const int resized_block_h = mi_size_high[resized_block_size] * 4;
+ const int num_cols =
+ (resized_y_width + resized_block_w - 1) / resized_block_w;
+ const int num_rows =
+ (resized_y_height + resized_block_h - 1) / resized_block_h;
+
+ YV12_BUFFER_CONFIG blurred;
+ memset(&blurred, 0, sizeof(blurred));
+ aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, ss_x,
+ ss_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ gaussian_blur(bit_depth, &resized_source, &blurred);
+
+ YV12_BUFFER_CONFIG recon;
+ memset(&recon, 0, sizeof(recon));
+ aom_alloc_frame_buffer(&recon, resized_y_width, resized_y_height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_yv12_copy_frame(&resized_source, &recon, 1);
+
+ VmafContext *vmaf_context;
+ const bool cal_vmaf_neg =
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+ aom_init_vmaf_context(&vmaf_context, cpi->vmaf_info.vmaf_model, cal_vmaf_neg);
+ unsigned int *sses = aom_calloc(num_rows * num_cols, sizeof(*sses));
+ if (!sses) {
+ aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating vmaf data");
+ }
+
+  // Loop through each 'resized_block_size' block.
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ const int row_offset_y = row * resized_block_h;
+ const int col_offset_y = col * resized_block_w;
+
+ uint8_t *const orig_buf = resized_source.y_buffer +
+ row_offset_y * resized_source.y_stride +
+ col_offset_y;
+ uint8_t *const blurred_buf =
+ blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
+
+ cpi->ppi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride,
+ blurred_buf, blurred.y_stride,
+ &sses[index]);
+
+ uint8_t *const recon_buf =
+ recon.y_buffer + row_offset_y * recon.y_stride + col_offset_y;
+      // Set the recon buf to the blurred block; an unsharp amount of 0.0
+      // reduces the unsharp filter to a plain copy.
+ if (cpi->common.seq_params->use_highbitdepth) {
+ highbd_unsharp_rect(CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride,
+ CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride,
+ CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride,
+ resized_block_w, resized_block_h, 0.0, bit_depth);
+ } else {
+ unsharp_rect(blurred_buf, blurred.y_stride, blurred_buf,
+ blurred.y_stride, recon_buf, recon.y_stride,
+ resized_block_w, resized_block_h, 0.0);
+ }
+
+ aom_read_vmaf_image(vmaf_context, &resized_source, &recon, bit_depth,
+ index);
+
+      // Restore the recon buf by copying the original source block back.
+ if (cpi->common.seq_params->use_highbitdepth) {
+ highbd_unsharp_rect(
+ CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride,
+ CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride,
+ CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride, resized_block_w,
+ resized_block_h, 0.0, bit_depth);
+ } else {
+ unsharp_rect(orig_buf, resized_source.y_stride, orig_buf,
+ resized_source.y_stride, recon_buf, recon.y_stride,
+ resized_block_w, resized_block_h, 0.0);
+ }
+ }
+ }
+ aom_flush_vmaf_context(vmaf_context);
+ for (int row = 0; row < num_rows; ++row) {
+ for (int col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ const double vmaf = aom_calc_vmaf_at_index(
+ vmaf_context, cpi->vmaf_info.vmaf_model, index);
+ const double dvmaf = kBaselineVmaf - vmaf;
+
+ const double mse =
+ (double)sses[index] / (double)(resized_y_width * resized_y_height);
+ double weight;
+ const double eps = 0.01 / (num_rows * num_cols);
+ if (dvmaf < eps || mse < eps) {
+ weight = 1.0;
+ } else {
+ weight = mse / dvmaf;
+ }
+
+      // Normalize the weight with a data-fitted model.
+ weight = 6.0 * (1.0 - exp(-0.05 * weight)) + 0.8;
+ cpi->vmaf_info.rdmult_scaling_factors[index] = weight;
+ }
+ }
+
+ aom_free_frame_buffer(&resized_source);
+ aom_free_frame_buffer(&blurred);
+ aom_close_vmaf_context(vmaf_context);
+ aom_free(sses);
+}
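+// Illustrative numbers for the weight model above (assumed, not from the
+// source): with mse / dvmaf = 20.0, the normalized weight is
+//   6.0 * (1.0 - exp(-0.05 * 20.0)) + 0.8 ~= 6.0 * 0.632 + 0.8 ~= 4.59,
+// so blocks whose blur MSE is large relative to their VMAF drop get a larger
+// rdmult scaling factor.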
+
+void av1_set_vmaf_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col, int *const rdmult) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ const int bsize_base = BLOCK_64X64;
+ const int num_mi_w = mi_size_wide[bsize_base];
+ const int num_mi_h = mi_size_high[bsize_base];
+ const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w;
+ const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h;
+ const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w;
+ const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h;
+ int row, col;
+ double num_of_mi = 0.0;
+ double geom_mean_of_scale = 0.0;
+
+  for (row = mi_row / num_mi_h;
+       row < num_rows && row < mi_row / num_mi_h + num_brows; ++row) {
+    for (col = mi_col / num_mi_w;
+         col < num_cols && col < mi_col / num_mi_w + num_bcols; ++col) {
+ const int index = row * num_cols + col;
+ geom_mean_of_scale += log(cpi->vmaf_info.rdmult_scaling_factors[index]);
+ num_of_mi += 1.0;
+ }
+ }
+ geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi);
+
+ *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
+ *rdmult = AOMMAX(*rdmult, 0);
+ av1_set_error_per_bit(&x->errorperbit, *rdmult);
+}
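+// Example (assumed factors): a block covering scaling factors 1.2 and 0.8
+// gets exp((log(1.2) + log(0.8)) / 2) = sqrt(0.96) ~= 0.98, and rdmult is
+// scaled by that geometric mean before rounding.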
+
+// TODO(sdeng): replace them with the SIMD versions.
+static AOM_INLINE double highbd_image_sad_c(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int w, int h) {
+ double accum = 0.0;
+ int i, j;
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ double img1px = src[i * src_stride + j];
+ double img2px = ref[i * ref_stride + j];
+
+ accum += fabs(img1px - img2px);
+ }
+ }
+
+ return accum / (double)(h * w);
+}
+
+static AOM_INLINE double image_sad_c(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int w,
+ int h) {
+ double accum = 0.0;
+ int i, j;
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ double img1px = src[i * src_stride + j];
+ double img2px = ref[i * ref_stride + j];
+
+ accum += fabs(img1px - img2px);
+ }
+ }
+
+ return accum / (double)(h * w);
+}
+
+static double calc_vmaf_motion_score(const AV1_COMP *const cpi,
+ const AV1_COMMON *const cm,
+ const YV12_BUFFER_CONFIG *const cur,
+ const YV12_BUFFER_CONFIG *const last,
+ const YV12_BUFFER_CONFIG *const next) {
+ const int y_width = cur->y_width;
+ const int y_height = cur->y_height;
+ YV12_BUFFER_CONFIG blurred_cur, blurred_last, blurred_next;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int ss_x = cur->subsampling_x;
+ const int ss_y = cur->subsampling_y;
+
+ memset(&blurred_cur, 0, sizeof(blurred_cur));
+ memset(&blurred_last, 0, sizeof(blurred_last));
+ memset(&blurred_next, 0, sizeof(blurred_next));
+
+ aom_alloc_frame_buffer(&blurred_cur, y_width, y_height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&blurred_last, y_width, y_height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&blurred_next, y_width, y_height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+
+ gaussian_blur(bit_depth, cur, &blurred_cur);
+ gaussian_blur(bit_depth, last, &blurred_last);
+ if (next) gaussian_blur(bit_depth, next, &blurred_next);
+
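+  // motion2 defaults to a large value so that AOMMIN() below returns motion1
+  // when there is no next frame.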
+ double motion1, motion2 = 65536.0;
+ if (cm->seq_params->use_highbitdepth) {
+ assert(blurred_cur.flags & YV12_FLAG_HIGHBITDEPTH);
+ assert(blurred_last.flags & YV12_FLAG_HIGHBITDEPTH);
+ const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8));
+ motion1 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
+ blurred_cur.y_stride,
+ CONVERT_TO_SHORTPTR(blurred_last.y_buffer),
+ blurred_last.y_stride, y_width, y_height) *
+ scale_factor;
+ if (next) {
+ assert(blurred_next.flags & YV12_FLAG_HIGHBITDEPTH);
+ motion2 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer),
+ blurred_cur.y_stride,
+ CONVERT_TO_SHORTPTR(blurred_next.y_buffer),
+ blurred_next.y_stride, y_width, y_height) *
+ scale_factor;
+ }
+ } else {
+ motion1 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride,
+ blurred_last.y_buffer, blurred_last.y_stride, y_width,
+ y_height);
+ if (next) {
+ motion2 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride,
+ blurred_next.y_buffer, blurred_next.y_stride,
+ y_width, y_height);
+ }
+ }
+
+ aom_free_frame_buffer(&blurred_cur);
+ aom_free_frame_buffer(&blurred_last);
+ aom_free_frame_buffer(&blurred_next);
+
+ return AOMMIN(motion1, motion2);
+}
+
+static AOM_INLINE void get_neighbor_frames(const AV1_COMP *const cpi,
+ YV12_BUFFER_CONFIG **last,
+ YV12_BUFFER_CONFIG **next) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const GF_GROUP *gf_group = &cpi->ppi->gf_group;
+ const int src_index =
+ cm->show_frame != 0 ? 0 : gf_group->arf_src_offset[cpi->gf_frame_index];
+ struct lookahead_entry *last_entry = av1_lookahead_peek(
+ cpi->ppi->lookahead, src_index - 1, cpi->compressor_stage);
+ struct lookahead_entry *next_entry = av1_lookahead_peek(
+ cpi->ppi->lookahead, src_index + 1, cpi->compressor_stage);
+ *next = &next_entry->img;
+ *last = cm->show_frame ? cpi->last_source : &last_entry->img;
+}
+
+// Calculates the new qindex from the VMAF motion score, based on the
+// observation that for the same source and distorted frames, the VMAF score
+// increases as the motion score increases.
+int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (cm->current_frame.frame_number == 0 || cpi->oxcf.pass == 1) {
+ return current_qindex;
+ }
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int layer_depth =
+ AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+ const double last_frame_ysse =
+ get_layer_value(cpi->vmaf_info.last_frame_ysse, layer_depth);
+ const double last_frame_vmaf =
+ get_layer_value(cpi->vmaf_info.last_frame_vmaf, layer_depth);
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const double approx_sse = last_frame_ysse / (double)((1 << (bit_depth - 8)) *
+ (1 << (bit_depth - 8)));
+ const double approx_dvmaf = kBaselineVmaf - last_frame_vmaf;
+ const double sse_threshold =
+ 0.01 * cpi->source->y_width * cpi->source->y_height;
+ const double vmaf_threshold = 0.01;
+ if (approx_sse < sse_threshold || approx_dvmaf < vmaf_threshold) {
+ return current_qindex;
+ }
+ YV12_BUFFER_CONFIG *cur_buf = cpi->source;
+ if (cm->show_frame == 0) {
+ const int src_index = gf_group->arf_src_offset[cpi->gf_frame_index];
+ struct lookahead_entry *cur_entry = av1_lookahead_peek(
+ cpi->ppi->lookahead, src_index, cpi->compressor_stage);
+ cur_buf = &cur_entry->img;
+ }
+ assert(cur_buf);
+
+ YV12_BUFFER_CONFIG *next_buf, *last_buf;
+ get_neighbor_frames(cpi, &last_buf, &next_buf);
+ assert(last_buf);
+
+ const double motion =
+ calc_vmaf_motion_score(cpi, cm, cur_buf, last_buf, next_buf);
+
+  // Get dVMAF through a data-fitted model.
+ const double dvmaf = 26.11 * (1.0 - exp(-0.06 * motion));
+ const double dsse = dvmaf * approx_sse / approx_dvmaf;
+
+ // Clamping beta to address VQ issue (aomedia:3170).
+ const double beta = AOMMAX(approx_sse / (dsse + approx_sse), 0.5);
+ const int offset =
+ av1_get_deltaq_offset(cm->seq_params->bit_depth, current_qindex, beta);
+ int qindex = current_qindex + offset;
+
+ qindex = AOMMIN(qindex, MAXQ);
+ qindex = AOMMAX(qindex, MINQ);
+
+ return qindex;
+}
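+// Illustrative walk-through of av1_get_vmaf_base_qindex() (assumed values):
+// with motion = 10, dvmaf = 26.11 * (1 - exp(-0.6)) ~= 11.78. If approx_sse
+// is 1000 and approx_dvmaf is 5, then dsse = 11.78 * 1000 / 5 ~= 2356 and
+// beta = max(1000 / (2356 + 1000), 0.5) = 0.5; a beta below 1 maps to a
+// positive deltaq offset, i.e. a higher qindex for high-motion frames.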
+
+static AOM_INLINE double cal_approx_score(
+ AV1_COMP *const cpi, double src_variance, double new_variance,
+ double src_score, YV12_BUFFER_CONFIG *const src,
+ YV12_BUFFER_CONFIG *const recon_sharpened) {
+ double score;
+ const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+ const bool cal_vmaf_neg =
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+ aom_calc_vmaf(cpi->vmaf_info.vmaf_model, src, recon_sharpened, bit_depth,
+ cal_vmaf_neg, &score);
+ return src_variance / new_variance * (score - src_score);
+}
+
+static double find_best_frame_unsharp_amount_loop_neg(
+ AV1_COMP *const cpi, double src_variance, double base_score,
+ YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const recon,
+ YV12_BUFFER_CONFIG *const ref, YV12_BUFFER_CONFIG *const src_blurred,
+ YV12_BUFFER_CONFIG *const recon_blurred,
+ YV12_BUFFER_CONFIG *const src_sharpened,
+ YV12_BUFFER_CONFIG *const recon_sharpened, FULLPEL_MV *mvs,
+ double best_score, const double unsharp_amount_start,
+ const double step_size, const int max_loop_count, const double max_amount) {
+ const double min_amount = 0.0;
+ int loop_count = 0;
+ double approx_score = best_score;
+ double unsharp_amount = unsharp_amount_start;
+
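+  // Same hill climb as find_best_frame_unsharp_amount_loop(), but scored by
+  // the approximate VMAF of the sharpened recon instead.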
+ do {
+ best_score = approx_score;
+ unsharp_amount += step_size;
+ if (unsharp_amount > max_amount || unsharp_amount < min_amount) break;
+ unsharp(cpi, recon, recon_blurred, recon_sharpened, unsharp_amount);
+ unsharp(cpi, src, src_blurred, src_sharpened, unsharp_amount);
+ const double new_variance =
+ residual_frame_average_variance(cpi, src_sharpened, ref, mvs);
+ approx_score = cal_approx_score(cpi, src_variance, new_variance, base_score,
+ src, recon_sharpened);
+
+ loop_count++;
+ } while (approx_score > best_score && loop_count < max_loop_count);
+ unsharp_amount =
+ approx_score > best_score ? unsharp_amount : unsharp_amount - step_size;
+
+ return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount));
+}
+
+static double find_best_frame_unsharp_amount_neg(
+ AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const src,
+ YV12_BUFFER_CONFIG *const recon, YV12_BUFFER_CONFIG *const ref,
+ double base_score, const double unsharp_amount_start,
+ const double step_size, const int max_loop_count,
+ const double max_filter_amount) {
+ FULLPEL_MV *mvs = NULL;
+ const double src_variance =
+ residual_frame_average_variance(cpi, src, ref, mvs);
+
+ const AV1_COMMON *const cm = &cpi->common;
+ const int width = recon->y_width;
+ const int height = recon->y_height;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const int ss_x = recon->subsampling_x;
+ const int ss_y = recon->subsampling_y;
+
+ YV12_BUFFER_CONFIG src_blurred, recon_blurred, src_sharpened, recon_sharpened;
+ memset(&recon_sharpened, 0, sizeof(recon_sharpened));
+ memset(&src_sharpened, 0, sizeof(src_sharpened));
+ memset(&recon_blurred, 0, sizeof(recon_blurred));
+ memset(&src_blurred, 0, sizeof(src_blurred));
+ aom_alloc_frame_buffer(&recon_sharpened, width, height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&src_sharpened, width, height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(&recon_blurred, width, height, ss_x, ss_y,
+ cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels,
+ cm->features.byte_alignment, 0, 0);
+ aom_alloc_frame_buffer(
+ &src_blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth,
+ cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0);
+
+ gaussian_blur(bit_depth, recon, &recon_blurred);
+ gaussian_blur(bit_depth, src, &src_blurred);
+
+ unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_start);
+ unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_start);
+ const double variance_start =
+ residual_frame_average_variance(cpi, &src_sharpened, ref, mvs);
+ const double score_start = cal_approx_score(
+ cpi, src_variance, variance_start, base_score, src, &recon_sharpened);
+
+ const double unsharp_amount_next = unsharp_amount_start + step_size;
+ unsharp(cpi, recon, &recon_blurred, &recon_sharpened, unsharp_amount_next);
+ unsharp(cpi, src, &src_blurred, &src_sharpened, unsharp_amount_next);
+ const double variance_next =
+ residual_frame_average_variance(cpi, &src_sharpened, ref, mvs);
+ const double score_next = cal_approx_score(cpi, src_variance, variance_next,
+ base_score, src, &recon_sharpened);
+
+ double unsharp_amount;
+ if (score_next > score_start) {
+ unsharp_amount = find_best_frame_unsharp_amount_loop_neg(
+ cpi, src_variance, base_score, src, recon, ref, &src_blurred,
+ &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_next,
+ unsharp_amount_next, step_size, max_loop_count, max_filter_amount);
+ } else {
+ unsharp_amount = find_best_frame_unsharp_amount_loop_neg(
+ cpi, src_variance, base_score, src, recon, ref, &src_blurred,
+ &recon_blurred, &src_sharpened, &recon_sharpened, mvs, score_start,
+ unsharp_amount_start, -step_size, max_loop_count, max_filter_amount);
+ }
+
+ aom_free_frame_buffer(&recon_sharpened);
+ aom_free_frame_buffer(&src_sharpened);
+ aom_free_frame_buffer(&recon_blurred);
+ aom_free_frame_buffer(&src_blurred);
+ aom_free(mvs);
+ return unsharp_amount;
+}
+
+void av1_update_vmaf_curve(AV1_COMP *cpi) {
+ YV12_BUFFER_CONFIG *source = cpi->source;
+ YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
+ const int bit_depth = cpi->td.mb.e_mbd.bd;
+ const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+ const int layer_depth =
+ AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
+ double base_score;
+ const bool cal_vmaf_neg =
+ cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN;
+ aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, recon, bit_depth,
+ cal_vmaf_neg, &base_score);
+ cpi->vmaf_info.last_frame_vmaf[layer_depth] = base_score;
+ if (cpi->common.seq_params->use_highbitdepth) {
+ assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
+ assert(recon->flags & YV12_FLAG_HIGHBITDEPTH);
+ cpi->vmaf_info.last_frame_ysse[layer_depth] =
+ (double)aom_highbd_get_y_sse(source, recon);
+ } else {
+ cpi->vmaf_info.last_frame_ysse[layer_depth] =
+ (double)aom_get_y_sse(source, recon);
+ }
+
+ if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
+ YV12_BUFFER_CONFIG *last, *next;
+ get_neighbor_frames(cpi, &last, &next);
+ double best_unsharp_amount_start =
+ get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth);
+ const int max_loop_count = 5;
+ cpi->vmaf_info.last_frame_unsharp_amount[layer_depth] =
+ find_best_frame_unsharp_amount_neg(cpi, source, recon, last, base_score,
+ best_unsharp_amount_start, 0.025,
+ max_loop_count, 1.01);
+ }
+}
diff --git a/third_party/aom/av1/encoder/tune_vmaf.h b/third_party/aom/av1/encoder/tune_vmaf.h
new file mode 100644
index 0000000000..a04a29e6fe
--- /dev/null
+++ b/third_party/aom/av1/encoder/tune_vmaf.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TUNE_VMAF_H_
+#define AOM_AV1_ENCODER_TUNE_VMAF_H_
+
+#include "aom_dsp/vmaf.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/block.h"
+
+typedef struct {
+ // Stores the scaling factors for rdmult when tuning for VMAF.
+  // rdmult_scaling_factors[row * num_cols + col] stores the scaling factor
+  // for the 64x64 block at (row, col).
+ double *rdmult_scaling_factors;
+
+ // Stores the luma sse of the last frame.
+ double last_frame_ysse[MAX_ARF_LAYERS];
+
+ // Stores the VMAF of the last frame.
+ double last_frame_vmaf[MAX_ARF_LAYERS];
+
+ // Stores the filter strength of the last frame.
+ double last_frame_unsharp_amount[MAX_ARF_LAYERS];
+
+  // Stores the original qindex before scaling.
+ int original_qindex;
+
+  // VMAF model used in VMAF calculations.
+ VmafModel *vmaf_model;
+} TuneVMAFInfo;
+
+struct AV1_COMP;
+
+void av1_vmaf_blk_preprocessing(struct AV1_COMP *cpi,
+ YV12_BUFFER_CONFIG *source);
+
+void av1_vmaf_frame_preprocessing(struct AV1_COMP *cpi,
+ YV12_BUFFER_CONFIG *source);
+
+void av1_vmaf_neg_preprocessing(struct AV1_COMP *cpi,
+ YV12_BUFFER_CONFIG *source);
+
+void av1_set_mb_vmaf_rdmult_scaling(struct AV1_COMP *cpi);
+
+void av1_set_vmaf_rdmult(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col, int *rdmult);
+
+int av1_get_vmaf_base_qindex(const struct AV1_COMP *cpi, int current_qindex);
+
+void av1_update_vmaf_curve(struct AV1_COMP *cpi);
+
+#endif // AOM_AV1_ENCODER_TUNE_VMAF_H_
diff --git a/third_party/aom/av1/encoder/tx_prune_model_weights.h b/third_party/aom/av1/encoder/tx_prune_model_weights.h
new file mode 100644
index 0000000000..aab5e1398d
--- /dev/null
+++ b/third_party/aom/av1/encoder/tx_prune_model_weights.h
@@ -0,0 +1,3422 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*! \file
+ * Contains the details of the ML models used for pruning transform size. This
+ * file is only included by av1/encoder/tx_search.c.
+ */
+#ifndef AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+/***************************CONFIG_NN_V2 (New)********************************/
+#if CONFIG_NN_V2
+// Tx type model for 4x4 block.
+static float av1_tx_type_nn_4x4_hor_layer0_weights[32] = {
+ -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f,
+ 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f,
+ -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f,
+ 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f,
+ 1.35792f, 0.27733f, 0.88660f, -0.68304f,
+};
+
+static float av1_tx_type_nn_4x4_hor_layer0_bias[8] = {
+ 1.38742f, 0.59540f, -1.37622f, 1.92114f,
+ 0.00000f, -0.38998f, -0.32726f, -0.15650f,
+};
+
+static float av1_tx_type_nn_4x4_hor_layer1_weights[32] = {
+ 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f,
+ -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f,
+ -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f,
+ 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f,
+ -0.26782f, -0.65416f, -0.10648f, 0.05568f,
+};
+
+static float av1_tx_type_nn_4x4_hor_layer1_bias[4] = {
+ 4.07177f,
+ 3.26961f,
+ 0.58083f,
+ 1.21199f,
+};
+
+static float av1_tx_type_nn_4x4_hor_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_4x4_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_4x4_hor_layer0_weights, // weights
+ av1_tx_type_nn_4x4_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_4x4_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 8, // num_inputs (!!same as num_outputs of last layer)
+ 4,
+ av1_tx_type_nn_4x4_hor_layer1_weights,
+ av1_tx_type_nn_4x4_hor_layer1_bias,
+ NONE,
+ av1_tx_type_nn_4x4_hor_layer1_out,
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x4_hor_layer1_out, // logits (!!same as last layer output)
+ SOFTMAX_CROSS_ENTROPY,
+};
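+// Shape of the model above, for reference: layer 0 computes
+// h = RELU(W0 * x + b0) with 4 inputs and 8 hidden units; layer 1 computes
+// the 4 logits as W1 * h + b1; SOFTMAX_CROSS_ENTROPY indicates the logits
+// are meant to be normalized with a softmax by the caller (see
+// av1/encoder/ml.h).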
+
+static float av1_tx_type_nn_4x4_ver_layer0_weights[32] = {
+ -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f,
+ 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f,
+ 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f,
+ 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f,
+ -0.06589f, -0.28142f, -0.33118f, 1.72227f,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer0_bias[8] = {
+ -0.33685f, 0.22025f, 0.28140f, 0.56138f,
+ 0.93489f, -1.77048f, 1.34989f, -0.93747f,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer1_weights[32] = {
+ -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f,
+ 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f,
+ -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f,
+ -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f,
+ -0.86315f, -0.53336f, 0.30320f, -1.32331f,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer1_bias[4] = {
+ -1.31519f,
+ -3.26321f,
+ 1.71794f,
+ -1.90778f,
+};
+
+static float av1_tx_type_nn_4x4_ver_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_4x4_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_4x4_ver_layer0_weights, // weights
+ av1_tx_type_nn_4x4_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_4x4_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 8, // num_inputs (!!same as num_outputs of last layer)
+ 4,
+ av1_tx_type_nn_4x4_ver_layer1_weights,
+ av1_tx_type_nn_4x4_ver_layer1_bias,
+ NONE,
+ av1_tx_type_nn_4x4_ver_layer1_out,
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x4_ver_layer1_out, // logits (!!same as last layer output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 4x8 block.
+static float av1_tx_type_nn_4x8_hor_layer0_weights[32] = {
+ 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f,
+ 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f,
+ -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f,
+ -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f,
+ -1.35896f, -1.17121f, 1.68866f, 0.10357f,
+};
+
+static float av1_tx_type_nn_4x8_hor_layer0_bias[8] = {
+ 2.93391f, 0.66831f, -0.21419f, 0.00000f,
+ -0.72878f, 0.15127f, -1.46755f, 0.16658f,
+};
+
+static float av1_tx_type_nn_4x8_hor_layer1_weights[32] = {
+ -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f,
+ -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f,
+ 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f,
+ 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f,
+ -0.50191f, 0.18219f, 1.83664f, -0.75276f,
+};
+
+static float av1_tx_type_nn_4x8_hor_layer1_bias[4] = {
+ -1.17455f,
+ -2.26089f,
+ -1.79863f,
+ -2.26333f,
+};
+
+static float av1_tx_type_nn_4x8_hor_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_4x8_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_4x8_hor_layer0_weights, // weights
+ av1_tx_type_nn_4x8_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_4x8_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 8, // num_inputs (!!same as num_outputs of last layer)
+ 4,
+ av1_tx_type_nn_4x8_hor_layer1_weights,
+ av1_tx_type_nn_4x8_hor_layer1_bias,
+ NONE,
+ av1_tx_type_nn_4x8_hor_layer1_out,
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x8_hor_layer1_out, // logits (!!same as last layer output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer0_weights[128] = {
+ -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f,
+ -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f,
+ -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f,
+ 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f,
+ 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f,
+ 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f,
+ -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f,
+ -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f,
+ 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f,
+ -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f,
+ -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f,
+ -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f,
+ 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f,
+ 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f,
+ -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f,
+ -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f,
+ 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f,
+ -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f,
+ -0.21958f, 0.05970f,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer0_bias[16] = {
+ 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f,
+ 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f,
+ 0.08288f, 0.18195f, -0.79890f, 0.10047f,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer1_weights[64] = {
+ -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f,
+ -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f,
+ -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f,
+ -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f,
+ 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f,
+ 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f,
+ -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f,
+ -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f,
+ -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f,
+ -1.01848f,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer1_bias[4] = {
+ -1.45955f,
+ -2.08949f,
+ -1.24813f,
+ -1.55368f,
+};
+
+static float av1_tx_type_nn_4x8_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_4x8_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_4x8_ver_layer0_weights, // weights
+ av1_tx_type_nn_4x8_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_4x8_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (!!same as num_outputs of last layer)
+ 4,
+ av1_tx_type_nn_4x8_ver_layer1_weights,
+ av1_tx_type_nn_4x8_ver_layer1_bias,
+ NONE,
+ av1_tx_type_nn_4x8_ver_layer1_out,
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x8_ver_layer1_out, // logits (!!same as last layer output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+/******************************************************************************/
+
+// Tx type model for 8x4 block.
+static float av1_tx_type_nn_8x4_hor_layer0_weights[128] = {
+ -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f,
+ 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f,
+ -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f,
+ -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f,
+ -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f,
+ 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f,
+ 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f,
+ -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f,
+ -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f,
+ 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f,
+ 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f,
+ -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f,
+ -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f,
+ 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f,
+ 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f,
+ 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f,
+ -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f,
+ -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f,
+ -1.85523f, 0.92532f,
+};
+
+static float av1_tx_type_nn_8x4_hor_layer0_bias[16] = {
+ 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f,
+ -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f,
+ -0.28958f, -0.32869f, -0.01704f, 0.68171f,
+};
+
+static float av1_tx_type_nn_8x4_hor_layer1_weights[64] = {
+ -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f,
+ -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f,
+ 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f,
+ -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f,
+ 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f,
+ -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f,
+ -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f,
+ 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f,
+ 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f,
+ -1.10654f,
+};
+
+static float av1_tx_type_nn_8x4_hor_layer1_bias[4] = {
+ -0.92861f,
+ -1.45151f,
+ -1.33588f,
+ -4.33853f,
+};
+
+static float av1_tx_type_nn_8x4_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x4_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_8x4_hor_layer0_weights, // weights
+ av1_tx_type_nn_8x4_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x4_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (!!same as num_outputs of last layer)
+ 4,
+ av1_tx_type_nn_8x4_hor_layer1_weights,
+ av1_tx_type_nn_8x4_hor_layer1_bias,
+ NONE,
+ av1_tx_type_nn_8x4_hor_layer1_out,
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x4_hor_layer1_out, // logits (!!same as last layer output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer0_weights[32] = {
+ -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f,
+ -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f,
+ -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f,
+ -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f,
+ 1.66212f, 1.70826f, 1.55182f, 0.12230f,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer0_bias[8] = {
+ 0.10943f, 2.09789f, 2.16578f, 0.15766f,
+ -0.42461f, 0.00000f, 1.22090f, -1.28717f,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer1_weights[32] = {
+ 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f,
+ 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f,
+ 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f,
+ -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f,
+ -1.15005f, -0.39311f, 1.51236f, -1.68973f,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer1_bias[4] = {
+ 1.81013f,
+ 1.10517f,
+ 2.90059f,
+ 0.95391f,
+};
+
+static float av1_tx_type_nn_8x4_ver_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_8x4_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_8x4_ver_layer0_weights, // weights
+ av1_tx_type_nn_8x4_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x4_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 8, // num_inputs (!!same as num_outputs of last layer)
+ 4,
+ av1_tx_type_nn_8x4_ver_layer1_weights,
+ av1_tx_type_nn_8x4_ver_layer1_bias,
+ NONE,
+ av1_tx_type_nn_8x4_ver_layer1_out,
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x4_ver_layer1_out, // logits (!!same as last layer output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 8x8 block.
+static float av1_tx_type_nn_8x8_hor_layer0_weights[128] = {
+ -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f,
+ -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f,
+ 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f,
+ 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f,
+ -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f,
+ -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f,
+ -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f,
+ 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f,
+ 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f,
+ -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f,
+ 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f,
+ -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f,
+ 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f,
+ 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f,
+ 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f,
+ 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f,
+ 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f,
+ 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f,
+ -0.99892f, 1.09823f,
+};
+
+static float av1_tx_type_nn_8x8_hor_layer0_bias[16] = {
+ -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f,
+ -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f,
+ -0.26319f, 2.65579f, -1.30137f, -0.01487f,
+};
+
+static float av1_tx_type_nn_8x8_hor_layer1_weights[64] = {
+ -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f,
+ -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f,
+ 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f,
+ 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f,
+ 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f,
+ -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f,
+ 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f,
+ 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f,
+ 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f,
+ 0.06161f,
+};
+
+static float av1_tx_type_nn_8x8_hor_layer1_bias[4] = {
+ 1.70385f,
+ 1.82373f,
+ 1.78496f,
+ 1.80826f,
+};
+
+static float av1_tx_type_nn_8x8_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x8_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_8x8_hor_layer0_weights, // weights
+ av1_tx_type_nn_8x8_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x8_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_8x8_hor_layer1_weights, // weights
+ av1_tx_type_nn_8x8_hor_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_8x8_hor_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x8_hor_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer0_weights[128] = {
+ -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f,
+ 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f,
+ -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f,
+ -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f,
+ 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f,
+ 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f,
+ 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f,
+ -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f,
+ -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f,
+ 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f,
+ 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f,
+ -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f,
+ 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f,
+ 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f,
+ -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f,
+ 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f,
+ -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f,
+ -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f,
+ -1.29848f, 0.39308f,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer0_bias[16] = {
+ -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f,
+ 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f,
+ 0.83015f, 0.06024f, 1.17180f, 0.65122f,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer1_weights[64] = {
+ -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f,
+ 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f,
+ 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f,
+ 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f,
+ 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f,
+ 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f,
+ 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f,
+ 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f,
+ -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f,
+ -0.41305f,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer1_bias[4] = {
+ 2.14067f,
+ 2.76699f,
+ 2.04233f,
+ 1.34803f,
+};
+
+static float av1_tx_type_nn_8x8_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x8_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_8x8_ver_layer0_weights, // weights
+ av1_tx_type_nn_8x8_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x8_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_8x8_ver_layer1_weights, // weights
+ av1_tx_type_nn_8x8_ver_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_8x8_ver_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x8_ver_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 8x16 block.
+static float av1_tx_type_nn_8x16_hor_layer0_weights[128] = {
+ -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f,
+ 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f,
+ -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f,
+ 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f,
+ -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f,
+ 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f,
+ -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f,
+ 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f,
+ -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f,
+ -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f,
+ 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f,
+ 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f,
+ -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f,
+ 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f,
+ -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f,
+ 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f,
+ 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f,
+ -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f,
+ -0.28136f, 0.42556f,
+};
+
+static float av1_tx_type_nn_8x16_hor_layer0_bias[16] = {
+ 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f,
+ -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f,
+ 1.81560f, -1.02643f, -0.81690f, 0.08302f,
+};
+
+static float av1_tx_type_nn_8x16_hor_layer1_weights[64] = {
+ 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f,
+ -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f,
+ 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f,
+ -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f,
+ 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f,
+ 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f,
+ 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f,
+ 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f,
+ 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f,
+ -1.31243f,
+};
+
+static float av1_tx_type_nn_8x16_hor_layer1_bias[4] = {
+ 0.83359f,
+ 1.06875f,
+ 1.77645f,
+ 1.49570f,
+};
+
+static float av1_tx_type_nn_8x16_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x16_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_8x16_hor_layer0_weights, // weights
+ av1_tx_type_nn_8x16_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x16_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_8x16_hor_layer1_weights, // weights
+ av1_tx_type_nn_8x16_hor_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_8x16_hor_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x16_hor_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer0_weights[128] = {
+ 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f,
+ -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f,
+ -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f,
+ 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f,
+ -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f,
+ 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f,
+ 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f,
+ 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f,
+ -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f,
+ -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f,
+ 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f,
+ 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f,
+ -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f,
+ -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f,
+ -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f,
+ -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f,
+ -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f,
+ 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f,
+ -0.12236f, 0.16075f,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer0_bias[16] = {
+ -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f,
+ -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f,
+ 0.57598f, 0.99819f, 0.75175f, 0.17044f,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer1_weights[64] = {
+ -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f,
+ 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f,
+ -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f,
+ 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f,
+ -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f,
+ -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f,
+ -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f,
+ 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f,
+ 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f,
+ 2.20547f,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer1_bias[4] = {
+ -0.44080f,
+ -1.67455f,
+ -1.46332f,
+ -6.13206f,
+};
+
+static float av1_tx_type_nn_8x16_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_8x16_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_8x16_ver_layer0_weights, // weights
+ av1_tx_type_nn_8x16_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_8x16_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_8x16_ver_layer1_weights, // weights
+ av1_tx_type_nn_8x16_ver_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_8x16_ver_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_8x16_ver_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 16x8 block.
+static float av1_tx_type_nn_16x8_hor_layer0_weights[128] = {
+ 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f,
+ -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f,
+ -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f,
+ 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f,
+ 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f,
+ 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f,
+ 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f,
+ -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f,
+ -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f,
+ -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f,
+ 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f,
+ -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f,
+ -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f,
+ -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f,
+ 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f,
+ -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f,
+ -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f,
+ 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f,
+ -0.36570f, -0.50757f,
+};
+
+static float av1_tx_type_nn_16x8_hor_layer0_bias[16] = {
+ -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f,
+ 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f,
+ -0.12329f, 0.08986f, 1.08117f, -0.00220f,
+};
+
+static float av1_tx_type_nn_16x8_hor_layer1_weights[64] = {
+ 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f,
+ 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f,
+ -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f,
+ -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f,
+ -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f,
+ -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f,
+ 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f,
+ 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f,
+ 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f,
+ -0.23347f,
+};
+
+static float av1_tx_type_nn_16x8_hor_layer1_bias[4] = {
+ 3.57175f,
+ 2.42612f,
+ 3.31259f,
+ 2.08287f,
+};
+
+static float av1_tx_type_nn_16x8_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_16x8_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_16x8_hor_layer0_weights, // weights
+ av1_tx_type_nn_16x8_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_16x8_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_16x8_hor_layer1_weights, // weights
+ av1_tx_type_nn_16x8_hor_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_16x8_hor_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_16x8_hor_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer0_weights[128] = {
+ 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f,
+ 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f,
+ -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f,
+ 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f,
+ 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f,
+ -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f,
+ 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f,
+ -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f,
+ 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f,
+ 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f,
+ 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f,
+ -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f,
+ -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f,
+ -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f,
+ 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f,
+ 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f,
+ -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f,
+ -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f,
+ -0.81945f, -0.41647f,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer0_bias[16] = {
+ 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f,
+ 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f,
+ -0.04510f, 0.48000f, -0.09354f, -0.42422f,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer1_weights[64] = {
+ 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f,
+ -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f,
+ 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f,
+ -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f,
+ -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f,
+ 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f,
+ 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f,
+ -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f,
+ 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f,
+ -0.00873f,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer1_bias[4] = {
+ 3.34981f,
+ 3.74710f,
+ 1.38339f,
+ 0.45176f,
+};
+
+static float av1_tx_type_nn_16x8_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_16x8_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_16x8_ver_layer0_weights, // weights
+ av1_tx_type_nn_16x8_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_16x8_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_16x8_ver_layer1_weights, // weights
+ av1_tx_type_nn_16x8_ver_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_16x8_ver_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_16x8_ver_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 16x16 block.
+static float av1_tx_type_nn_16x16_layer0_weights[128] = {
+ 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f,
+ 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f,
+ -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f,
+ -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f,
+ 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f,
+ 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f,
+ 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f,
+ 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f,
+ -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f,
+ 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f,
+ 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f,
+ 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f,
+ -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f,
+ 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f,
+ 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f,
+ -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f,
+ -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f,
+ 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f,
+ 0.50355f, 0.08592f,
+};
+
+static float av1_tx_type_nn_16x16_layer0_bias[16] = {
+ -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f,
+ -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f,
+ -0.14062f, -0.42120f, 0.94573f, -0.09287f,
+};
+
+static float av1_tx_type_nn_16x16_layer1_weights[64] = {
+ -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f,
+ 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f,
+ 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f,
+ 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f,
+ 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f,
+ 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f,
+ -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f,
+ 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f,
+ -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f,
+ 1.08829f,
+};
+
+static float av1_tx_type_nn_16x16_layer1_bias[4] = {
+ 0.81986f,
+ 1.26865f,
+ 0.11118f,
+ 2.48404f,
+};
+
+static float av1_tx_type_nn_16x16_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_16x16_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x16 = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_16x16_layer0_weights, // weights
+ av1_tx_type_nn_16x16_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_16x16_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_16x16_layer1_weights, // weights
+ av1_tx_type_nn_16x16_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_16x16_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_16x16_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 4x16 block.
+static float av1_tx_type_nn_4x16_hor_layer0_weights[32] = {
+ 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f,
+ 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f,
+ 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f,
+ 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f,
+ -1.74563f, -0.88830f, -1.77603f, 2.15935f,
+};
+
+static float av1_tx_type_nn_4x16_hor_layer0_bias[8] = {
+ -0.36435f, -2.22731f, -0.00837f, -1.34546f,
+ 0.62806f, -0.20675f, 4.91940f, -0.56079f,
+};
+
+static float av1_tx_type_nn_4x16_hor_layer1_weights[32] = {
+ -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f,
+ -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f,
+ 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f,
+ 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f,
+ 1.28413f, -0.30326f, 2.45329f, -0.83335f,
+};
+
+static float av1_tx_type_nn_4x16_hor_layer1_bias[4] = {
+ 2.33198f,
+ 3.36245f,
+ 1.62603f,
+ 2.91056f,
+};
+
+static float av1_tx_type_nn_4x16_hor_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_4x16_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_4x16_hor_layer0_weights, // weights
+ av1_tx_type_nn_4x16_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_4x16_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 8, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_4x16_hor_layer1_weights, // weights
+ av1_tx_type_nn_4x16_hor_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_4x16_hor_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x16_hor_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer0_weights[128] = {
+ 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f,
+ 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f,
+ -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f,
+ -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f,
+ -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f,
+ -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f,
+ 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f,
+ 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f,
+ 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f,
+ -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f,
+ -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f,
+ 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f,
+ 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f,
+ 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f,
+ 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f,
+ -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f,
+ 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f,
+ 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f,
+ -0.27975f, -0.01149f,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer0_bias[16] = {
+ -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f,
+ -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f,
+ -0.32530f, 0.73483f, 0.08322f, -0.23890f,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer1_weights[64] = {
+ 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f,
+ -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f,
+ 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f,
+ -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f,
+ 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f,
+ -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f,
+ 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f,
+ 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f,
+ -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f,
+ -0.56513f,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer1_bias[4] = {
+ 4.60896f,
+ 4.53551f,
+ 4.53124f,
+ 4.27435f,
+};
+
+static float av1_tx_type_nn_4x16_ver_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_4x16_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_4x16_ver_layer0_weights, // weights
+ av1_tx_type_nn_4x16_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_4x16_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_4x16_ver_layer1_weights, // weights
+ av1_tx_type_nn_4x16_ver_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_4x16_ver_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_4x16_ver_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Tx type model for 16x4 block.
+static float av1_tx_type_nn_16x4_hor_layer0_weights[128] = {
+ 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f,
+ 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f,
+ -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f,
+ -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f,
+ -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f,
+ -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f,
+ 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f,
+ 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f,
+ 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f,
+ -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f,
+ 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f,
+ -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f,
+ 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f,
+ -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f,
+ -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f,
+ -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f,
+ 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f,
+ 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f,
+ 0.19055f, -1.56413f,
+};
+
+static float av1_tx_type_nn_16x4_hor_layer0_bias[16] = {
+ -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f,
+ 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f,
+ 1.14048f, 0.33308f, -1.10886f, 0.41184f,
+};
+
+static float av1_tx_type_nn_16x4_hor_layer1_weights[64] = {
+ -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f,
+ 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f,
+ -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f,
+ -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f,
+ 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f,
+ -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f,
+ -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f,
+ 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f,
+ 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f,
+ -0.43819f,
+};
+
+static float av1_tx_type_nn_16x4_hor_layer1_bias[4] = {
+ 2.32575f,
+ 2.75703f,
+ 1.12304f,
+ 2.15567f,
+};
+
+static float av1_tx_type_nn_16x4_hor_layer0_out[16] = { 0 };
+static float av1_tx_type_nn_16x4_hor_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_hor = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 8, // num_inputs
+ 16, // num_outputs
+ av1_tx_type_nn_16x4_hor_layer0_weights, // weights
+ av1_tx_type_nn_16x4_hor_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_16x4_hor_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 16, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_16x4_hor_layer1_weights, // weights
+ av1_tx_type_nn_16x4_hor_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_16x4_hor_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_16x4_hor_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer0_weights[32] = {
+ 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f,
+ 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f,
+ -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f,
+ -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f,
+ -0.17967f, -0.96622f, 0.42635f, -1.04784f,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer0_bias[8] = {
+ -0.52088f, 0.52844f, -1.03655f, -0.30974f,
+ 2.59952f, -1.93604f, 0.00000f, 2.51787f,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer1_weights[32] = {
+ 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f,
+ 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f,
+ 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f,
+ -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f,
+ 1.26814f, -1.93873f, -0.00768f, 1.58309f,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer1_bias[4] = {
+ 2.34713f,
+ 1.68667f,
+ 1.25488f,
+ 1.69812f,
+};
+
+static float av1_tx_type_nn_16x4_ver_layer0_out[8] = { 0 };
+static float av1_tx_type_nn_16x4_ver_layer1_out[4] = { 0 };
+
+static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_ver = {
+ 1, // num_hidden_layers
+ {
+ // fc layer setting
+ {
+ // layer 0
+ 4, // num_inputs
+ 8, // num_outputs
+ av1_tx_type_nn_16x4_ver_layer0_weights, // weights
+ av1_tx_type_nn_16x4_ver_layer0_bias, // bias
+ RELU, // activation
+ av1_tx_type_nn_16x4_ver_layer0_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ {
+ 8, // num_inputs (must equal num_outputs of the previous layer)
+ 4, // num_outputs
+ av1_tx_type_nn_16x4_ver_layer1_weights, // weights
+ av1_tx_type_nn_16x4_ver_layer1_bias, // bias
+ NONE, // activation (linear output layer)
+ av1_tx_type_nn_16x4_ver_layer1_out, // output
+ NULL,
+ NULL,
+ NULL,
+ },
+ },
+ 4, // num_outputs
+ av1_tx_type_nn_16x4_ver_layer1_out, // logits (aliases the final layer's output)
+ SOFTMAX_CROSS_ENTROPY,
+};
+/******************************************************************************/
+
+// Map tx_size to its corresponding neural net model for tx type prediction.
+static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_hor[] = {
+ &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
+};
+
+static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_ver[] = {
+ &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
+};
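Both maps are indexed in TX_SIZE enum order (the per-entry comments spell out that order); a NULL slot means no model was trained for that size, so a caller must check the pointer before predicting. Because every config declares SOFTMAX_CROSS_ENTROPY, the four logits left in a model's layer-1 output buffer are interpreted through a softmax at inference time. The sketch below shows that final step; softmax4_sketch is a hypothetical stand-in for the library's own numerically stable routine.

#include <math.h>

/* Convert the 4 raw logits produced by a tx-type net into probabilities.
 * Subtracting the max logit before expf() keeps the exponentials from
 * overflowing without changing the result. */
static void softmax4_sketch(const float logits[4], float probs[4]) {
  float max_logit = logits[0];
  for (int i = 1; i < 4; ++i)
    if (logits[i] > max_logit) max_logit = logits[i];
  float sum = 0.0f;
  for (int i = 0; i < 4; ++i) {
    probs[i] = expf(logits[i] - max_logit);
    sum += probs[i];
  }
  for (int i = 0; i < 4; ++i) probs[i] /= sum;
}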
+#else
+/******************************CONFIG_NN***************************************/
+// Tx type model for 4x4 block.
+static const float av1_tx_type_nn_weights_4x4_hor_layer0[32] = {
+ -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f,
+ 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f,
+ -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f,
+ 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f,
+ 1.35792f, 0.27733f, 0.88660f, -0.68304f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_hor_layer0[8] = {
+ 1.38742f, 0.59540f, -1.37622f, 1.92114f,
+ 0.00000f, -0.38998f, -0.32726f, -0.15650f,
+};
+
+static const float av1_tx_type_nn_weights_4x4_hor_layer1[32] = {
+ 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f,
+ -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f,
+ -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f,
+ 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f,
+ -0.26782f, -0.65416f, -0.10648f, 0.05568f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_hor_layer1[4] = {
+ 4.07177f,
+ 3.26961f,
+ 0.58083f,
+ 1.21199f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x4_hor = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x4_hor_layer0,
+ av1_tx_type_nn_weights_4x4_hor_layer1 },
+ { av1_tx_type_nn_bias_4x4_hor_layer0, av1_tx_type_nn_bias_4x4_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x4_ver_layer0[32] = {
+ -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f,
+ 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f,
+ 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f,
+ 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f,
+ -0.06589f, -0.28142f, -0.33118f, 1.72227f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_ver_layer0[8] = {
+ -0.33685f, 0.22025f, 0.28140f, 0.56138f,
+ 0.93489f, -1.77048f, 1.34989f, -0.93747f,
+};
+
+static const float av1_tx_type_nn_weights_4x4_ver_layer1[32] = {
+ -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f,
+ 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f,
+ -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f,
+ -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f,
+ -0.86315f, -0.53336f, 0.30320f, -1.32331f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_ver_layer1[4] = {
+ -1.31519f,
+ -3.26321f,
+ 1.71794f,
+ -1.90778f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x4_ver = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x4_ver_layer0,
+ av1_tx_type_nn_weights_4x4_ver_layer1 },
+ { av1_tx_type_nn_bias_4x4_ver_layer0, av1_tx_type_nn_bias_4x4_ver_layer1 }
+};
+/******************************************************************************/
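In this fallback branch the same kind of tables are packaged in the simpler NN_CONFIG layout: num_inputs, num_outputs, num_hidden_layers, the hidden-layer widths, then per-layer weight and bias pointers. The sketch below walks the 4x4 horizontal model above (4 inputs, 8 hidden nodes, 4 outputs), assuming RELU hidden activations as in the V2 configs and the same contiguous per-node weight layout; it mirrors the data layout rather than the encoder's actual prediction routine.

/* Illustrative evaluation of the 4x4 horizontal model defined above. */
static void tx_type_nn_4x4_hor_sketch(const float in[4], float out[4]) {
  float hidden[8];
  for (int node = 0; node < 8; ++node) { /* hidden layer: 4 -> 8, RELU */
    float val = av1_tx_type_nn_bias_4x4_hor_layer0[node];
    for (int i = 0; i < 4; ++i)
      val += av1_tx_type_nn_weights_4x4_hor_layer0[node * 4 + i] * in[i];
    hidden[node] = val < 0.0f ? 0.0f : val;
  }
  for (int node = 0; node < 4; ++node) { /* output layer: 8 -> 4, linear */
    float val = av1_tx_type_nn_bias_4x4_hor_layer1[node];
    for (int i = 0; i < 8; ++i)
      val += av1_tx_type_nn_weights_4x4_hor_layer1[node * 8 + i] * hidden[i];
    out[node] = val; /* raw scores; softmax is applied by the caller */
  }
}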
+
+// Tx type model for 4x8 block.
+static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = {
+ 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f,
+ 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f,
+ -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f,
+ -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f,
+ -1.35896f, -1.17121f, 1.68866f, 0.10357f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = {
+ 2.93391f, 0.66831f, -0.21419f, 0.00000f,
+ -0.72878f, 0.15127f, -1.46755f, 0.16658f,
+};
+
+static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = {
+ -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f,
+ -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f,
+ 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f,
+ 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f,
+ -0.50191f, 0.18219f, 1.83664f, -0.75276f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = {
+ -1.17455f,
+ -2.26089f,
+ -1.79863f,
+ -2.26333f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x8_hor_layer0,
+ av1_tx_type_nn_weights_4x8_hor_layer1 },
+ { av1_tx_type_nn_bias_4x8_hor_layer0, av1_tx_type_nn_bias_4x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = {
+ -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f,
+ -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f,
+ -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f,
+ 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f,
+ 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f,
+ 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f,
+ -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f,
+ -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f,
+ 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f,
+ -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f,
+ -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f,
+ -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f,
+ 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f,
+ 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f,
+ -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f,
+ -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f,
+ 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f,
+ -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f,
+ -0.21958f, 0.05970f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = {
+ 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f,
+ 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f,
+ 0.08288f, 0.18195f, -0.79890f, 0.10047f,
+};
+
+static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = {
+ -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f,
+ -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f,
+ -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f,
+ -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f,
+ 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f,
+ 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f,
+ -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f,
+ -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f,
+ -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f,
+ -1.01848f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = {
+ -1.45955f,
+ -2.08949f,
+ -1.24813f,
+ -1.55368f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x8_ver_layer0,
+ av1_tx_type_nn_weights_4x8_ver_layer1 },
+ { av1_tx_type_nn_bias_4x8_ver_layer0, av1_tx_type_nn_bias_4x8_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 8x4 block.
+static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = {
+ -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f,
+ 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f,
+ -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f,
+ -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f,
+ -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f,
+ 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f,
+ 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f,
+ -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f,
+ -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f,
+ 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f,
+ 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f,
+ -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f,
+ -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f,
+ 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f,
+ 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f,
+ 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f,
+ -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f,
+ -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f,
+ -1.85523f, 0.92532f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = {
+ 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f,
+ -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f,
+ -0.28958f, -0.32869f, -0.01704f, 0.68171f,
+};
+
+static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = {
+ -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f,
+ -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f,
+ 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f,
+ -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f,
+ 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f,
+ -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f,
+ -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f,
+ 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f,
+ 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f,
+ -1.10654f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = {
+ -0.92861f,
+ -1.45151f,
+ -1.33588f,
+ -4.33853f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x4_hor_layer0,
+ av1_tx_type_nn_weights_8x4_hor_layer1 },
+ { av1_tx_type_nn_bias_8x4_hor_layer0, av1_tx_type_nn_bias_8x4_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = {
+ -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f,
+ -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f,
+ -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f,
+ -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f,
+ 1.66212f, 1.70826f, 1.55182f, 0.12230f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = {
+ 0.10943f, 2.09789f, 2.16578f, 0.15766f,
+ -0.42461f, 0.00000f, 1.22090f, -1.28717f,
+};
+
+static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = {
+ 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f,
+ 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f,
+ 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f,
+ -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f,
+ -1.15005f, -0.39311f, 1.51236f, -1.68973f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = {
+ 1.81013f,
+ 1.10517f,
+ 2.90059f,
+ 0.95391f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x4_ver_layer0,
+ av1_tx_type_nn_weights_8x4_ver_layer1 },
+ { av1_tx_type_nn_bias_8x4_ver_layer0, av1_tx_type_nn_bias_8x4_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 8x8 block.
+static const float av1_tx_type_nn_weights_8x8_hor_layer0[128] = {
+ -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f,
+ -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f,
+ 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f,
+ 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f,
+ -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f,
+ -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f,
+ -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f,
+ 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f,
+ 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f,
+ -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f,
+ 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f,
+ -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f,
+ 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f,
+ 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f,
+ 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f,
+ 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f,
+ 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f,
+ 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f,
+ -0.99892f, 1.09823f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_hor_layer0[16] = {
+ -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f,
+ -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f,
+ -0.26319f, 2.65579f, -1.30137f, -0.01487f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_hor_layer1[64] = {
+ -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f,
+ -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f,
+ 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f,
+ 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f,
+ 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f,
+ -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f,
+ 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f,
+ 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f,
+ 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f,
+ 0.06161f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_hor_layer1[4] = {
+ 1.70385f,
+ 1.82373f,
+ 1.78496f,
+ 1.80826f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x8_hor_layer0,
+ av1_tx_type_nn_weights_8x8_hor_layer1 },
+ { av1_tx_type_nn_bias_8x8_hor_layer0, av1_tx_type_nn_bias_8x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x8_ver_layer0[128] = {
+ -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f,
+ 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f,
+ -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f,
+ -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f,
+ 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f,
+ 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f,
+ 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f,
+ -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f,
+ -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f,
+ 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f,
+ 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f,
+ -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f,
+ 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f,
+ 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f,
+ -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f,
+ 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f,
+ -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f,
+ -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f,
+ -1.29848f, 0.39308f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_ver_layer0[16] = {
+ -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f,
+ 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f,
+ 0.83015f, 0.06024f, 1.17180f, 0.65122f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_ver_layer1[64] = {
+ -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f,
+ 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f,
+ 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f,
+ 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f,
+ 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f,
+ 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f,
+ 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f,
+ 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f,
+ -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f,
+ -0.41305f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_ver_layer1[4] = {
+ 2.14067f,
+ 2.76699f,
+ 2.04233f,
+ 1.34803f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x8_ver_layer0,
+ av1_tx_type_nn_weights_8x8_ver_layer1 },
+ { av1_tx_type_nn_bias_8x8_ver_layer0, av1_tx_type_nn_bias_8x8_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 8x16 block.
+static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = {
+ -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f,
+ 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f,
+ -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f,
+ 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f,
+ -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f,
+ 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f,
+ -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f,
+ 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f,
+ -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f,
+ -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f,
+ 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f,
+ 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f,
+ -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f,
+ 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f,
+ -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f,
+ 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f,
+ 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f,
+ -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f,
+ -0.28136f, 0.42556f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = {
+ 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f,
+ -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f,
+ 1.81560f, -1.02643f, -0.81690f, 0.08302f,
+};
+
+static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = {
+ 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f,
+ -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f,
+ 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f,
+ -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f,
+ 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f,
+ 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f,
+ 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f,
+ 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f,
+ 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f,
+ -1.31243f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = {
+ 0.83359f,
+ 1.06875f,
+ 1.77645f,
+ 1.49570f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x16_hor_layer0,
+ av1_tx_type_nn_weights_8x16_hor_layer1 },
+ { av1_tx_type_nn_bias_8x16_hor_layer0, av1_tx_type_nn_bias_8x16_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = {
+ 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f,
+ -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f,
+ -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f,
+ 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f,
+ -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f,
+ 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f,
+ 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f,
+ 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f,
+ -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f,
+ -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f,
+ 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f,
+ 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f,
+ -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f,
+ -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f,
+ -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f,
+ -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f,
+ -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f,
+ 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f,
+ -0.12236f, 0.16075f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = {
+ -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f,
+ -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f,
+ 0.57598f, 0.99819f, 0.75175f, 0.17044f,
+};
+
+static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = {
+ -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f,
+ 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f,
+ -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f,
+ 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f,
+ -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f,
+ -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f,
+ -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f,
+ 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f,
+ 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f,
+ 2.20547f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = {
+ -0.44080f,
+ -1.67455f,
+ -1.46332f,
+ -6.13206f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x16_ver_layer0,
+ av1_tx_type_nn_weights_8x16_ver_layer1 },
+ { av1_tx_type_nn_bias_8x16_ver_layer0, av1_tx_type_nn_bias_8x16_ver_layer1 }
+};
+/******************************************************************************/
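+
+// Illustrative sketch, not upstream code: every NN_CONFIG in this file
+// describes one fully connected hidden layer followed by a linear output
+// layer, with each output node's weights stored contiguously (field names as
+// declared in av1/encoder/ml.h). The helper below is a minimal forward pass
+// under those assumptions; the ReLU hidden activation and the name
+// nn_forward_sketch are illustrative choices, and the encoder's actual
+// inference routine is av1_nn_predict() in av1/encoder/ml.c.
+static void nn_forward_sketch(const NN_CONFIG *cfg, const float *input,
+                              float *output) {
+  float hidden[64];  // Upper bound: the largest hidden layer here is 64 wide.
+  const int nh = cfg->num_hidden_nodes[0];  // Single hidden layer assumed.
+  for (int i = 0; i < nh; ++i) {
+    float acc = cfg->bias[0][i];
+    for (int j = 0; j < cfg->num_inputs; ++j)
+      acc += cfg->weights[0][i * cfg->num_inputs + j] * input[j];
+    hidden[i] = acc > 0.0f ? acc : 0.0f;  // ReLU (assumed).
+  }
+  for (int i = 0; i < cfg->num_outputs; ++i) {
+    float acc = cfg->bias[1][i];
+    for (int j = 0; j < nh; ++j) acc += cfg->weights[1][i * nh + j] * hidden[j];
+    output[i] = acc;  // Raw scores; the caller applies softmax or sigmoid.
+  }
+}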
+
+// Tx type model for 16x8 block.
+static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = {
+ 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f,
+ -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f,
+ -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f,
+ 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f,
+ 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f,
+ 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f,
+ 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f,
+ -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f,
+ -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f,
+ -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f,
+ 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f,
+ -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f,
+ -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f,
+ -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f,
+ 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f,
+ -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f,
+ -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f,
+ 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f,
+ -0.36570f, -0.50757f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = {
+ -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f,
+ 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f,
+ -0.12329f, 0.08986f, 1.08117f, -0.00220f,
+};
+
+static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = {
+ 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f,
+ 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f,
+ -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f,
+ -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f,
+ -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f,
+ -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f,
+ 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f,
+ 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f,
+ 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f,
+ -0.23347f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = {
+ 3.57175f,
+ 2.42612f,
+ 3.31259f,
+ 2.08287f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x8_hor_layer0,
+ av1_tx_type_nn_weights_16x8_hor_layer1 },
+ { av1_tx_type_nn_bias_16x8_hor_layer0, av1_tx_type_nn_bias_16x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = {
+ 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f,
+ 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f,
+ -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f,
+ 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f,
+ 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f,
+ -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f,
+ 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f,
+ -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f,
+ 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f,
+ 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f,
+ 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f,
+ -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f,
+ -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f,
+ -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f,
+ 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f,
+ 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f,
+ -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f,
+ -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f,
+ -0.81945f, -0.41647f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = {
+ 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f,
+ 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f,
+ -0.04510f, 0.48000f, -0.09354f, -0.42422f,
+};
+
+static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = {
+ 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f,
+ -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f,
+ 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f,
+ -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f,
+ -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f,
+ 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f,
+ 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f,
+ -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f,
+ 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f,
+ -0.00873f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = {
+ 3.34981f,
+ 3.74710f,
+ 1.38339f,
+ 0.45176f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x8_ver_layer0,
+ av1_tx_type_nn_weights_16x8_ver_layer1 },
+ { av1_tx_type_nn_bias_16x8_ver_layer0, av1_tx_type_nn_bias_16x8_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 16x16 block.
+static const float av1_tx_type_nn_weights_16x16_layer0[128] = {
+ 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f,
+ 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f,
+ -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f,
+ -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f,
+ 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f,
+ 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f,
+ 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f,
+ 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f,
+ -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f,
+ 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f,
+ 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f,
+ 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f,
+ -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f,
+ 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f,
+ 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f,
+ -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f,
+ -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f,
+ 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f,
+ 0.50355f, 0.08592f,
+};
+
+static const float av1_tx_type_nn_bias_16x16_layer0[16] = {
+ -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f,
+ -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f,
+ -0.14062f, -0.42120f, 0.94573f, -0.09287f,
+};
+
+static const float av1_tx_type_nn_weights_16x16_layer1[64] = {
+ -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f,
+ 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f,
+ 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f,
+ 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f,
+ 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f,
+ 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f,
+ -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f,
+ 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f,
+ -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f,
+ 1.08829f,
+};
+
+static const float av1_tx_type_nn_bias_16x16_layer1[4] = {
+ 0.81986f,
+ 1.26865f,
+ 0.11118f,
+ 2.48404f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x16 = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_type_nn_weights_16x16_layer0,
+ av1_tx_type_nn_weights_16x16_layer1,
+ },
+ {
+ av1_tx_type_nn_bias_16x16_layer0,
+ av1_tx_type_nn_bias_16x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx type model for 4x16 block.
+static const float av1_tx_type_nn_weights_4x16_hor_layer0[32] = {
+ 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f,
+ 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f,
+ 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f,
+ 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f,
+ -1.74563f, -0.88830f, -1.77603f, 2.15935f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_hor_layer0[8] = {
+ -0.36435f, -2.22731f, -0.00837f, -1.34546f,
+ 0.62806f, -0.20675f, 4.91940f, -0.56079f,
+};
+
+static const float av1_tx_type_nn_weights_4x16_hor_layer1[32] = {
+ -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f,
+ -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f,
+ 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f,
+ 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f,
+ 1.28413f, -0.30326f, 2.45329f, -0.83335f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_hor_layer1[4] = {
+ 2.33198f,
+ 3.36245f,
+ 1.62603f,
+ 2.91056f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x16_hor = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x16_hor_layer0,
+ av1_tx_type_nn_weights_4x16_hor_layer1 },
+ { av1_tx_type_nn_bias_4x16_hor_layer0, av1_tx_type_nn_bias_4x16_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x16_ver_layer0[128] = {
+ 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f,
+ 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f,
+ -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f,
+ -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f,
+ -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f,
+ -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f,
+ 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f,
+ 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f,
+ 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f,
+ -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f,
+ -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f,
+ 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f,
+ 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f,
+ 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f,
+ 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f,
+ -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f,
+ 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f,
+ 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f,
+ -0.27975f, -0.01149f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_ver_layer0[16] = {
+ -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f,
+ -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f,
+ -0.32530f, 0.73483f, 0.08322f, -0.23890f,
+};
+
+static const float av1_tx_type_nn_weights_4x16_ver_layer1[64] = {
+ 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f,
+ -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f,
+ 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f,
+ -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f,
+ 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f,
+ -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f,
+ 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f,
+ 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f,
+ -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f,
+ -0.56513f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_ver_layer1[4] = {
+ 4.60896f,
+ 4.53551f,
+ 4.53124f,
+ 4.27435f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x16_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x16_ver_layer0,
+ av1_tx_type_nn_weights_4x16_ver_layer1 },
+ { av1_tx_type_nn_bias_4x16_ver_layer0, av1_tx_type_nn_bias_4x16_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 16x4 block.
+static const float av1_tx_type_nn_weights_16x4_hor_layer0[128] = {
+ 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f,
+ 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f,
+ -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f,
+ -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f,
+ -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f,
+ -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f,
+ 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f,
+ 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f,
+ 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f,
+ -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f,
+ 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f,
+ -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f,
+ 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f,
+ -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f,
+ -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f,
+ -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f,
+ 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f,
+ 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f,
+ 0.19055f, -1.56413f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_hor_layer0[16] = {
+ -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f,
+ 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f,
+ 1.14048f, 0.33308f, -1.10886f, 0.41184f,
+};
+
+static const float av1_tx_type_nn_weights_16x4_hor_layer1[64] = {
+ -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f,
+ 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f,
+ -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f,
+ -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f,
+ 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f,
+ -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f,
+ -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f,
+ 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f,
+ 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f,
+ -0.43819f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_hor_layer1[4] = {
+ 2.32575f,
+ 2.75703f,
+ 1.12304f,
+ 2.15567f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x4_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x4_hor_layer0,
+ av1_tx_type_nn_weights_16x4_hor_layer1 },
+ { av1_tx_type_nn_bias_16x4_hor_layer0, av1_tx_type_nn_bias_16x4_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_16x4_ver_layer0[32] = {
+ 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f,
+ 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f,
+ -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f,
+ -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f,
+ -0.17967f, -0.96622f, 0.42635f, -1.04784f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_ver_layer0[8] = {
+ -0.52088f, 0.52844f, -1.03655f, -0.30974f,
+ 2.59952f, -1.93604f, 0.00000f, 2.51787f,
+};
+
+static const float av1_tx_type_nn_weights_16x4_ver_layer1[32] = {
+ 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f,
+ 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f,
+ 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f,
+ -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f,
+ 1.26814f, -1.93873f, -0.00768f, 1.58309f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_ver_layer1[4] = {
+ 2.34713f,
+ 1.68667f,
+ 1.25488f,
+ 1.69812f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x4_ver = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x4_ver_layer0,
+ av1_tx_type_nn_weights_16x4_ver_layer1 },
+ { av1_tx_type_nn_bias_16x4_ver_layer0, av1_tx_type_nn_bias_16x4_ver_layer1 }
+};
+/******************************************************************************/
+
+// Map each tx_size (indexed in TX_SIZE enum order; see the per-entry
+// comments) to its neural net model for tx type prediction. A NULL entry
+// means no model is available for that transform size.
+static const NN_CONFIG *const av1_tx_type_nnconfig_map_hor[] = {
+ &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
+};
+
+static const NN_CONFIG *const av1_tx_type_nnconfig_map_ver[] = {
+ &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
+};
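+
+// Illustrative sketch, not upstream code: how the two maps above might be
+// consulted. A NULL entry means no trained model, so tx type pruning would
+// simply be skipped for that size. The helper name, the features argument,
+// and the softmax post-processing are assumptions; nn_forward_sketch is the
+// sketch defined after the 8x16 models above, and expf() needs <math.h>.
+static void tx_type_probs_sketch(TX_SIZE tx_size, int is_hor,
+                                 const float *features, float *probs) {
+  const NN_CONFIG *const cfg = is_hor ? av1_tx_type_nnconfig_map_hor[tx_size]
+                                      : av1_tx_type_nnconfig_map_ver[tx_size];
+  if (cfg == NULL) return;  // No model for this tx_size: nothing to prune.
+  float scores[4];  // Every tx type model above has num_outputs == 4.
+  nn_forward_sketch(cfg, features, scores);
+  // Softmax over the four raw scores gives per-class probabilities for the
+  // candidate 1D transform types along this direction.
+  float maxv = scores[0], sum = 0.0f;
+  for (int i = 1; i < 4; ++i) maxv = scores[i] > maxv ? scores[i] : maxv;
+  for (int i = 0; i < 4; ++i) sum += probs[i] = expf(scores[i] - maxv);
+  for (int i = 0; i < 4; ++i) probs[i] /= sum;
+}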
+#endif // CONFIG_NN_V2
+
+// Tx split model for 4x8 block.
+static const float av1_tx_split_nn_weights_4x8_layer0[8 * 16] = {
+ 0.068650f, -0.732073f, -0.040361f, 0.322550f, -0.021123f, 0.212518f,
+ -0.350546f, 0.435987f, -0.111756f, -0.401568f, 0.069548f, -0.313000f,
+ 0.073918f, -0.373805f, -0.775810f, -0.124753f, 0.181094f, -0.602641f,
+ -0.026219f, -0.350112f, 0.020599f, -0.311752f, -0.476482f, -0.669465f,
+ -0.310921f, 0.348869f, -0.115984f, 0.154250f, 0.200485f, -0.016689f,
+ 0.020392f, 0.413810f, 0.634064f, -0.627530f, 0.399178f, -0.012284f,
+ 0.472030f, 0.091087f, -0.706100f, -0.447944f, -0.274226f, 0.445656f,
+ 0.309339f, 0.505522f, 0.038496f, -0.152809f, 0.408684f, -0.068151f,
+ 0.271612f, 0.353233f, -0.150365f, 0.075212f, -0.035096f, 0.346615f,
+ 0.124382f, 0.477072f, 0.216288f, 0.070548f, -0.106362f, 0.681613f,
+ -0.145502f, -0.218631f, -0.099248f, -0.001983f, -0.196819f, -0.969045f,
+ 0.063009f, -0.123053f, 0.104875f, -0.137581f, -0.282933f, -0.003624f,
+ -0.315659f, -0.333523f, -0.503000f, -0.100063f, -0.536711f, -0.059978f,
+ -0.670248f, -0.353762f, 0.181109f, 0.289715f, -0.071206f, 0.261141f,
+ 0.052796f, -0.114554f, -0.139214f, -0.261380f, 0.075984f, -0.647925f,
+ -0.099528f, -0.677814f, 0.015712f, -0.389385f, -0.095622f, -0.165117f,
+ -0.109454f, -0.175240f, -0.393914f, 0.212330f, 0.037822f, 0.248280f,
+ 0.180197f, 0.110493f, -0.525727f, -0.092329f, -0.524029f, -0.407364f,
+ -0.542373f, -0.435626f, -0.912194f, 0.062794f, 0.160433f, 0.741485f,
+ -0.103659f, -0.119327f, -0.055275f, 0.334358f, 0.014713f, 0.046327f,
+ 0.831114f, -0.576682f, 0.354369f, -0.082088f, 0.452331f, 0.039730f,
+ -0.792429f, -0.385862f,
+};
+
+static const float av1_tx_split_nn_bias_4x8_layer0[16] = {
+ 0.238621f, 2.186830f, 1.383035f, -0.867139f, 1.257119f, -0.351571f,
+ -0.240650f, -0.971692f, 2.744843f, 1.116991f, 0.139062f, -0.165332f,
+ 0.262171f, -1.598153f, -1.427340f, -1.602306f,
+};
+
+static const float av1_tx_split_nn_weights_4x8_layer1[16] = {
+ -0.367134f, 1.373058f, -0.897039f, -0.326819f, -0.734030f, -0.290413f,
+ -0.501249f, 0.505321f, -0.537692f, -0.767893f, 0.268697f, 0.278987f,
+ 0.085082f, 0.614986f, 0.847904f, 0.637578f,
+};
+
+static const float av1_tx_split_nn_bias_4x8_layer1[1] = {
+ 0.20586078f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_4x8 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_4x8_layer0,
+ av1_tx_split_nn_weights_4x8_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_4x8_layer0,
+ av1_tx_split_nn_bias_4x8_layer1,
+ },
+};
+/******************************************************************************/
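+
+// Illustrative sketch, not upstream code: each tx split model ends in a
+// single output node (num_outputs == 1). One plausible reading of that raw
+// score is as a logit for "split this block further", mapped to a
+// probability with a sigmoid and compared against a tunable threshold; the
+// encoder's actual decision rule may differ. This reuses the hypothetical
+// nn_forward_sketch helper from earlier (defined under CONFIG_NN_V2 above,
+// so it would need to move outside that guard to be callable here) and
+// expf() from <math.h>.
+static int tx_split_decision_sketch(const NN_CONFIG *cfg,
+                                    const float *features, float thresh) {
+  float score;
+  nn_forward_sketch(cfg, features, &score);
+  const float p_split = 1.0f / (1.0f + expf(-score));  // Sigmoid (assumed).
+  return p_split > thresh;
+}
+// E.g., for a 4x8 block with an 8-float feature vector f (feature extraction
+// itself is defined elsewhere in the encoder):
+//   const int split = tx_split_decision_sketch(&av1_tx_split_nnconfig_4x8,
+//                                              f, 0.5f);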
+
+// Tx split model for 8x8 block.
+static const float av1_tx_split_nn_weights_8x8_layer0[144] = {
+ 0.177983f, -0.938386f, -0.074460f, -0.221843f, -0.073182f, -0.295155f,
+ -0.098202f, -0.279510f, 0.001054f, -0.119319f, -1.835282f, -0.581507f,
+ -1.222222f, -1.049006f, -0.807508f, -0.454252f, -0.774879f, -0.180607f,
+ -0.886976f, -0.231971f, -0.824677f, -0.351872f, -1.323819f, 0.235378f,
+ 0.015331f, -0.341818f, 0.145549f, -0.348362f, 0.147647f, -0.323400f,
+ 0.047558f, -0.553025f, -0.295485f, -0.330368f, -0.530605f, -0.407516f,
+ 0.447740f, 0.782381f, -0.179164f, -0.584675f, -0.052645f, 0.038656f,
+ -0.096783f, 0.038342f, -0.170762f, -0.405844f, -0.552665f, -0.509866f,
+ 0.757204f, -1.296465f, 0.631015f, 0.009265f, 0.646192f, 0.044523f,
+ 0.653161f, 0.033820f, 0.849639f, -0.068555f, -1.036085f, -0.511652f,
+ 0.104693f, -1.458690f, 0.286051f, -0.089800f, 0.381564f, -0.302640f,
+ 0.304465f, -0.268706f, 0.432603f, -0.117914f, -2.070031f, -0.565696f,
+ -0.073027f, -1.783570f, -0.318144f, -0.320990f, -0.343966f, -0.140996f,
+ -0.322977f, -0.232147f, -0.373210f, -0.158266f, -1.922305f, -0.634373f,
+ 0.101894f, -0.221847f, 0.018412f, -0.423887f, -0.266684f, -0.444930f,
+ -0.196237f, 0.106638f, -0.065834f, -0.538401f, -0.280772f, -0.620348f,
+ 1.089957f, -0.799928f, 0.504112f, -0.165763f, 0.578741f, -0.172653f,
+ 0.547316f, -0.143484f, 0.717220f, -0.297190f, -1.237854f, -0.074819f,
+ -0.977304f, -0.484092f, -0.646427f, -0.451443f, -0.612126f, -0.224475f,
+ -0.731608f, -0.257077f, -0.665857f, -0.346742f, -1.216372f, 0.227267f,
+ 0.231249f, -1.693073f, -0.035899f, 0.380845f, -0.058476f, 0.409405f,
+ -0.066679f, 0.406731f, -0.068501f, 0.396748f, 0.639462f, 0.150834f,
+ -0.418659f, -1.421931f, 0.101889f, 0.083573f, 0.129746f, 0.134460f,
+ 0.081185f, 0.127420f, 0.083664f, 0.051096f, 1.361688f, 0.386093f,
+};
+
+static const float av1_tx_split_nn_bias_8x8_layer0[12] = {
+ 4.280443f, 2.218902f, -0.256953f, 3.161431f, 2.082548f, 2.506052f,
+ 2.563224f, 1.421976f, -1.627813f, -1.436085f, 2.297265f, 1.500469f,
+};
+
+static const float av1_tx_split_nn_weights_8x8_layer1[12] = {
+ 1.178833f, -0.428527f, -0.078737f, 0.381434f, -0.466895f, -0.901745f,
+ -0.766968f, -0.356663f, 0.450146f, 0.509370f, -0.356604f, -0.443506f,
+};
+
+static const float av1_tx_split_nn_bias_8x8_layer1[1] = {
+ -0.156294f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_8x8 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 12,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_8x8_layer0,
+ av1_tx_split_nn_weights_8x8_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_8x8_layer0,
+ av1_tx_split_nn_bias_8x8_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 8x16 block.
+static const float av1_tx_split_nn_weights_8x16_layer0[8 * 64] = {
+ 0.374660f, 0.218905f, -0.139779f, 0.212141f, 0.056517f, 0.051114f,
+ 0.042860f, -0.273258f, -0.340809f, 0.138983f, -0.216996f, -0.241519f,
+ -0.123244f, 0.078577f, -0.472273f, -0.194201f, 0.125056f, 0.239761f,
+ -0.332782f, 0.174782f, -0.211400f, -0.129795f, 0.062195f, 0.113176f,
+ -0.008869f, 0.140764f, 0.059833f, 0.163826f, 0.359293f, -0.109797f,
+ -0.022091f, -0.059536f, -0.188226f, 0.179709f, 0.031386f, 0.164790f,
+ 0.214364f, 0.198555f, 0.152262f, -0.242980f, 0.319367f, -0.136902f,
+ 0.046524f, -0.043591f, 0.342178f, -0.011757f, -0.014286f, 0.072871f,
+ -0.278314f, -0.345303f, -0.252103f, -0.107154f, -0.235101f, -0.106739f,
+ -0.120865f, -0.160042f, 0.240028f, 0.112902f, -0.141587f, -0.703012f,
+ -0.136591f, 0.318993f, -0.154417f, -0.054668f, 0.192870f, 0.176166f,
+ -0.029965f, 0.266942f, -0.178384f, 0.038680f, 0.134403f, -0.002426f,
+ 0.534825f, -0.070923f, 0.413281f, 0.418148f, 0.093729f, 0.016454f,
+ 0.305358f, -0.040512f, 0.069904f, -0.227588f, -0.362220f, -0.031604f,
+ -0.394901f, 0.071506f, -0.342833f, -0.142550f, -0.164005f, 0.182600f,
+ 0.213062f, 0.076805f, 0.278758f, 0.125613f, -0.035552f, 0.040971f,
+ 0.182785f, -0.227961f, -0.105413f, -0.074949f, -0.084629f, -0.254767f,
+ 0.114657f, 0.047121f, 0.195902f, 0.264759f, 0.017799f, 0.210230f,
+ 0.150749f, -0.142142f, 0.182494f, -0.142415f, -0.259782f, -0.114830f,
+ -0.198826f, 0.000061f, -0.375668f, -0.276656f, -0.373202f, 0.210298f,
+ 0.422680f, 0.066960f, 0.351106f, -0.209034f, 0.367195f, -0.110274f,
+ 0.115573f, -0.066642f, -0.389673f, -0.260447f, 0.056949f, -0.180425f,
+ 0.069922f, -0.153506f, -0.097053f, -0.111757f, 0.094069f, 0.144837f,
+ -0.052984f, -0.506681f, -0.034474f, 0.279057f, -0.105025f, 0.006656f,
+ -0.125017f, -0.114096f, 0.103153f, -0.117402f, -0.359472f, 0.072534f,
+ 0.110291f, 0.003088f, -0.456897f, 0.038331f, -0.322298f, 0.113942f,
+ -0.119916f, -0.194392f, 0.093167f, 0.193459f, 0.074671f, 0.033602f,
+ 0.004440f, -0.179578f, -0.036637f, -0.216172f, -0.296530f, -0.318992f,
+ 0.319160f, -0.066218f, 0.291246f, 0.181292f, 0.089914f, 0.025273f,
+ 0.303128f, 0.019063f, 0.078545f, -0.396919f, 0.014065f, -0.122121f,
+ 0.037107f, -0.151886f, -0.299392f, -0.172207f, -0.124571f, -0.232553f,
+ 0.102970f, -0.225040f, 0.061059f, -0.258188f, -0.469871f, -0.099607f,
+ -0.061524f, -0.213700f, 0.070237f, -0.289134f, -0.238225f, 0.256403f,
+ -0.119344f, 0.067782f, -0.398983f, -0.123975f, -0.200205f, -0.047038f,
+ 0.026569f, 0.031037f, 0.094302f, -0.101239f, 0.433307f, -0.303612f,
+ 0.088537f, -0.164436f, 0.202471f, -0.048592f, -0.251904f, 0.122577f,
+ -0.309874f, -0.263405f, -0.292503f, 0.216589f, 0.035378f, 0.136599f,
+ -0.145844f, -0.018211f, 0.174084f, -0.449941f, -0.001428f, 0.064134f,
+ 0.039652f, 0.111083f, -0.246076f, -0.204733f, 0.056559f, -0.000123f,
+ 0.104049f, 0.138512f, -0.128309f, 0.087855f, 0.232784f, 0.247138f,
+ 0.162766f, 0.154829f, 0.313605f, -0.164115f, -0.050844f, 0.156549f,
+ 0.185279f, -0.238962f, -0.308281f, -0.179592f, -0.193262f, 0.201670f,
+ -0.203399f, -0.096831f, -0.127867f, 0.310674f, -0.008181f, 0.004078f,
+ -0.211038f, -0.193480f, -0.185639f, -0.150202f, -0.204858f, -0.240758f,
+ 0.114268f, -0.032535f, -0.052403f, -0.234333f, -0.064072f, -0.208444f,
+ -0.352853f, -0.224001f, -0.156330f, 0.215436f, 0.171846f, 0.291849f,
+ 0.108832f, 0.046991f, -0.127801f, 0.032485f, 0.141493f, 0.123319f,
+ -0.057250f, 0.315346f, -0.061317f, -0.465086f, -0.130179f, -0.217841f,
+ -0.239089f, -0.073251f, -0.327718f, 0.054905f, -0.283169f, -0.028900f,
+ 0.071450f, 0.270072f, 0.248891f, 0.088052f, 0.253319f, 0.122808f,
+ 0.175490f, -0.147805f, 0.089169f, -0.045457f, -0.330788f, 0.099791f,
+ -0.137376f, -0.195977f, -0.350942f, -0.284930f, -0.559037f, 0.030504f,
+ 0.162554f, -0.199100f, -0.050453f, -0.131320f, -0.077863f, -0.066253f,
+ -0.379723f, -0.424047f, -0.081182f, -0.252261f, -0.102815f, 0.058240f,
+ -0.182036f, 0.176772f, -0.070823f, 0.216054f, -0.211533f, -0.232992f,
+ 0.279346f, 0.117984f, 0.236674f, 0.126625f, -0.046220f, 0.044919f,
+ 0.278492f, 0.083944f, 0.180512f, 0.217994f, 0.401170f, -0.064417f,
+ 0.011636f, -0.139597f, -0.050020f, -0.268438f, -0.032803f, 0.024908f,
+ -0.085713f, -0.012984f, -0.055192f, -0.338657f, 0.045826f, -0.312849f,
+ -0.023393f, -0.168800f, -0.030886f, -0.131816f, -0.253542f, -0.104812f,
+ -0.354389f, 0.169464f, 0.094151f, -0.217122f, -0.456397f, 0.211478f,
+ 0.219232f, -0.155519f, -0.353700f, -0.264759f, -0.034709f, 0.034409f,
+ -0.148639f, -0.132850f, -0.216791f, -0.118492f, 0.173721f, -0.144181f,
+ 0.335028f, 0.176439f, 0.105980f, 0.169390f, 0.155615f, -0.040618f,
+ -0.176029f, 0.155569f, -0.184833f, -0.171099f, -0.178663f, -0.032051f,
+ -0.434334f, 0.092238f, -0.263103f, 0.061804f, -0.172957f, 0.005962f,
+ -0.100176f, 0.125898f, 0.048092f, -0.088141f, 0.247196f, -0.221601f,
+ -0.114474f, -0.124410f, -0.156393f, -0.181782f, -0.083562f, 0.034937f,
+ 0.403401f, -0.046200f, 0.322259f, 0.219678f, 0.109850f, 0.051837f,
+ 0.196861f, -0.019118f, 0.248818f, -0.137567f, 0.127862f, 0.052293f,
+ 0.298726f, 0.275788f, 0.015344f, 0.058714f, 0.283691f, -0.053794f,
+ -0.123270f, -0.227761f, -0.141744f, -0.268515f, -0.007189f, -0.242117f,
+ -0.252396f, -0.069017f, 0.034803f, -0.003388f, -0.262577f, 0.062115f,
+ -0.298393f, 0.215415f, -0.153615f, 0.289902f, 0.085886f, -0.504290f,
+ 0.077178f, 0.150861f, -0.228848f, -0.261020f, 0.198204f, 0.162113f,
+ 0.346418f, -0.286950f, 0.354756f, -0.226419f, 0.024720f, 0.208037f,
+ 0.107286f, -0.110849f, 0.104415f, -0.207725f, 0.063932f, -0.037748f,
+ -0.167037f, -0.068282f, 0.320815f, -0.051884f, 0.099989f, -0.078388f,
+ 0.127071f, 0.046675f, -0.336571f, -0.273080f, 0.264694f, -0.007352f,
+ -0.093828f, 0.094773f, -0.144434f, 0.091795f, -0.031615f, 0.056914f,
+ 0.064673f, -0.136669f, 0.344734f, 0.225926f, 0.283451f, -0.068354f,
+ 0.030572f, 0.180784f, -0.378047f, -0.092962f, -0.083291f, 0.038970f,
+ 0.052094f, -0.017932f, 0.216302f, -0.184396f, 0.079888f, 0.210406f,
+ -0.020627f, 0.244744f, 0.336972f, -0.182914f, -0.220976f, -0.304225f,
+ -0.330974f, -0.370868f, -0.084935f, -0.136489f, -0.210082f, -0.188088f,
+ -0.408768f, 0.184693f,
+};
+
+static const float av1_tx_split_nn_bias_8x16_layer0[64] = {
+ -0.274107f, 0.445751f, 0.234359f, 0.291593f, 0.163298f, 0.183707f,
+ -0.548839f, -0.190779f, -0.163346f, -0.669028f, 0.399209f, -0.354974f,
+ 0.000000f, -0.254630f, 0.220149f, 0.371104f, 0.789759f, 0.270300f,
+ 0.195126f, -0.206958f, 0.917708f, -0.256232f, 1.131933f, 1.178944f,
+ 0.461270f, 0.246169f, -0.818614f, -0.111986f, 0.759355f, 0.154889f,
+ 0.470299f, -1.025250f, 0.678678f, 0.959346f, -0.164105f, 0.544079f,
+ -0.448733f, 0.649221f, -0.536672f, 0.962758f, -0.256427f, 0.808664f,
+ -0.118694f, 0.684873f, -0.015635f, -0.046469f, 0.075481f, 0.412647f,
+ 0.454456f, -0.107169f, 0.775235f, -0.261629f, -1.194849f, 0.010093f,
+ -0.231289f, 0.658286f, -0.769320f, 0.564545f, 0.482962f, -0.131378f,
+ -0.255844f, -0.078400f, 0.476752f, 0.643001f,
+};
+
+static const float av1_tx_split_nn_weights_8x16_layer1[64] = {
+ -0.145065f, -0.145101f, 0.174786f, 0.196692f, 0.102025f, -0.087735f,
+ 0.386353f, -0.660539f, -0.183940f, 0.490045f, -0.276404f, -0.145669f,
+ 0.209846f, -0.085574f, -0.156821f, -0.377450f, -0.950010f, 0.450709f,
+ -0.108545f, -0.261181f, 1.435606f, -0.176621f, -1.158548f, 2.035680f,
+ 0.218069f, -0.138629f, 0.305958f, -0.277194f, -0.602468f, 0.203873f,
+ 0.120720f, 0.216095f, -0.434502f, -0.579746f, -0.239450f, 0.755529f,
+ 0.545643f, 0.232091f, 0.330169f, 0.988136f, -0.070465f, -0.345584f,
+ -0.162455f, -0.617064f, 0.123881f, -0.201098f, 0.222756f, 0.112932f,
+ 0.048647f, -0.147890f, 0.394584f, -0.262148f, 0.280564f, -0.195432f,
+ -0.047515f, 1.133410f, 0.255415f, -0.299032f, -0.397807f, -0.153246f,
+ -0.256734f, 0.177370f, 0.213522f, -0.530158f,
+};
+
+static const float av1_tx_split_nn_bias_8x16_layer1[1] = {
+ 0.14910713f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_8x16 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_8x16_layer0,
+ av1_tx_split_nn_weights_8x16_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_8x16_layer0,
+ av1_tx_split_nn_bias_8x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 16x16 block.
+static const float av1_tx_split_nn_weights_16x16_layer0[12 * 24] = {
+ -0.177215f, -0.297166f, 0.299924f, 0.207878f, 0.216871f, 0.173264f,
+ 0.295464f, 0.048395f, 0.154731f, 0.305880f, 0.056787f, -0.166617f,
+ 0.115653f, -0.529477f, -0.073995f, -0.211746f, -0.018169f, 0.000788f,
+ -0.024940f, -0.007055f, 0.001392f, 0.021678f, -1.594600f, -0.099593f,
+ 0.332930f, 0.103574f, 0.158249f, 0.182601f, 0.332665f, 0.226207f,
+ -0.139566f, 0.185531f, 0.099074f, -0.185654f, -0.203121f, -0.285678f,
+ -0.313453f, -0.294452f, -0.143707f, -0.031265f, -0.453030f, -0.061874f,
+ -0.066150f, -0.099058f, -0.458879f, 0.127544f, 0.338314f, -0.161350f,
+ 0.030091f, -0.075528f, 0.004320f, 0.353690f, -0.013480f, -0.420402f,
+ -0.004659f, -0.329401f, -0.001745f, 0.227384f, -0.055183f, 0.121405f,
+ 0.160340f, 0.143603f, -0.221813f, 0.079107f, -0.657639f, -0.084348f,
+ -0.303414f, 0.046774f, -0.367679f, 0.060005f, 0.168645f, 0.084421f,
+ -0.133625f, 0.301375f, 0.079412f, -0.419303f, 0.017235f, 0.068637f,
+ 0.018384f, -0.428325f, -0.019753f, 0.149444f, -0.474836f, -0.287162f,
+ 0.198083f, 0.028292f, -0.299092f, -0.005849f, -0.256245f, 0.233277f,
+ -0.217561f, -0.264003f, 0.269411f, 0.207032f, -0.339411f, -0.198431f,
+ -0.028521f, 0.158076f, 0.177116f, 0.345702f, -0.145132f, 0.064623f,
+ -0.090867f, 0.288816f, -0.263198f, -0.071028f, -0.044546f, 0.380017f,
+ -0.014100f, -0.271192f, -0.318559f, 0.129015f, -0.050314f, -0.093355f,
+ -0.578498f, 0.099090f, -0.133080f, -0.029975f, -0.059828f, -0.157765f,
+ -0.321153f, -0.343671f, -0.242959f, 0.128304f, 0.017170f, 0.072787f,
+ -0.475838f, -0.003806f, -0.068615f, 0.150556f, -0.159903f, -0.416513f,
+ 0.218794f, -0.290456f, -0.084569f, -0.170014f, -0.044414f, -0.153069f,
+ -0.077329f, -0.089747f, -0.096526f, 0.537952f, 0.134725f, -0.006469f,
+ -0.323335f, -0.168183f, -0.107163f, -0.139954f, 0.011286f, -0.021712f,
+ -0.513992f, 0.259135f, -0.319808f, 0.077811f, 0.104613f, 0.370571f,
+ 0.185244f, 0.065530f, -0.091098f, -0.573741f, 0.111934f, 0.437417f,
+ -0.123691f, 0.220641f, -0.024783f, -0.149460f, -0.354185f, -0.134127f,
+ 0.038015f, -0.380596f, 0.250980f, 0.142208f, 0.135170f, -0.131129f,
+ -0.357556f, -0.530945f, 0.159672f, -0.147025f, -0.377829f, -0.504508f,
+ -0.492870f, 0.020753f, 0.142818f, 0.025172f, 0.086140f, 0.091283f,
+ 0.087491f, -0.186415f, 0.177785f, -0.195121f, -1.191148f, -0.477102f,
+ 0.023371f, 0.227004f, -0.023502f, -0.242913f, -0.074398f, -0.153480f,
+ 0.162900f, 0.415509f, -0.162565f, -0.131709f, -0.258852f, -0.252027f,
+ -0.080845f, -0.330274f, 0.021874f, 0.232398f, 0.069277f, 0.220567f,
+ -0.024237f, -0.366771f, 0.081673f, -0.429906f, -0.302170f, 0.061045f,
+ 0.352777f, -0.230376f, 0.408153f, 0.064758f, 0.142051f, 0.007219f,
+ 0.622878f, 0.212577f, 0.036489f, 0.081150f, -0.284767f, 0.107763f,
+ -0.529786f, -0.072190f, -0.300421f, -0.287959f, -0.568900f, 0.011547f,
+ -0.131696f, -0.356854f, -0.587962f, -0.026598f, 0.405829f, 0.057565f,
+ 0.414265f, -0.159155f, 0.221456f, 0.146314f, 0.265776f, -0.006516f,
+ 0.473978f, -0.186431f, 0.288672f, -0.060437f, 0.083380f, -0.205641f,
+ 0.360016f, 0.222041f, 0.420011f, 0.024579f, 0.377546f, 0.250380f,
+ -0.069900f, 0.296743f, 0.073532f, -0.243225f, -0.374987f, -0.387288f,
+ -0.237255f, -0.287013f, 0.417831f, -0.252988f, -0.257652f, -0.066775f,
+ -0.253926f, 0.057841f, 0.346133f, -0.157797f, -0.406028f, -0.286893f,
+ 0.274507f, -0.452561f, 0.143381f, -0.097755f, 0.021242f, 0.034561f,
+ 0.044115f, 0.004065f, 0.066729f, 0.043558f, 0.102991f, -0.477574f,
+};
+
+static const float av1_tx_split_nn_bias_16x16_layer0[24] = {
+ -0.479033f, 1.467402f, -0.366291f, 0.372511f, 0.715322f, -0.605500f,
+ 0.176848f, 0.032318f, 0.237429f, -0.046047f, 0.452082f, 0.451805f,
+ -0.822845f, 0.636762f, -0.057350f, 1.163978f, 0.728287f, 0.603654f,
+ -0.245519f, -0.893569f, -1.428185f, 0.808870f, -0.076159f, 1.231976f,
+};
+
+static const float av1_tx_split_nn_weights_16x16_layer1[24] = {
+ -0.176161f, 1.670188f, -0.180755f, -0.321326f, 0.249728f, -0.170504f,
+ -0.538432f, 0.033893f, 0.149842f, 0.404140f, -0.377812f, 0.338838f,
+ -0.176091f, 0.249844f, -0.362533f, 1.412460f, 0.196862f, 0.278194f,
+ -0.140444f, 0.297746f, 0.172533f, 0.116470f, -0.151656f, -0.603250f,
+};
+
+static const float av1_tx_split_nn_bias_16x16_layer1[1] = {
+ 0.184803f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x16 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_16x16_layer0,
+ av1_tx_split_nn_weights_16x16_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_16x16_layer0,
+ av1_tx_split_nn_bias_16x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 32x32 block.
+static const float av1_tx_split_nn_weights_32x32_layer0[12 * 32] = {
+ -0.439303f, 0.004813f, -0.365052f, -0.116868f, -0.356716f, -0.196537f,
+ -0.196770f, -0.076096f, 0.357004f, -0.044909f, -0.112910f, -0.129081f,
+ 0.156725f, -0.386346f, 0.038971f, 0.160696f, 0.204923f, -0.384333f,
+ -0.319546f, 0.028179f, -0.250524f, -0.289669f, -0.284138f, -0.258963f,
+ -0.180854f, -0.000807f, -0.029620f, -0.353134f, 0.212408f, 0.141414f,
+ 0.303016f, 0.098066f, 0.482455f, 0.036069f, -0.166279f, 0.210119f,
+ -0.086337f, -0.023550f, -0.250796f, -0.183945f, -0.393856f, 0.170608f,
+ -0.306403f, 0.026318f, -0.277296f, 0.092684f, -0.033584f, -0.018371f,
+ -0.025043f, -0.257659f, -0.139163f, -0.206949f, -0.190105f, 0.028053f,
+ 0.361851f, -0.364726f, -0.096771f, -0.184166f, -0.433228f, -0.182191f,
+ -0.097051f, 0.259172f, 0.016432f, 0.259358f, 0.145059f, 0.037196f,
+ 0.091581f, -0.219644f, 0.140384f, -0.446837f, -0.234531f, 0.149508f,
+ -0.083429f, 0.186189f, -0.099890f, -0.111277f, 0.495214f, 0.085053f,
+ -0.266613f, -0.051366f, 0.148593f, 0.111875f, 0.077787f, -0.371653f,
+ -0.146157f, -0.229235f, 0.076203f, 0.488975f, 0.096771f, -0.009483f,
+ 0.192985f, 0.246273f, -0.192671f, -0.557890f, -0.292650f, -0.088907f,
+ -0.106892f, -0.329659f, 0.012105f, -0.359326f, 0.170723f, -0.004357f,
+ 0.171593f, -0.478768f, -0.236016f, -0.035077f, 0.133731f, 0.137962f,
+ -0.397926f, -0.155164f, -0.276709f, -0.186602f, -0.258301f, 0.036965f,
+ -0.649359f, 0.127605f, 0.097930f, 0.182775f, -0.313324f, 0.053349f,
+ 0.204203f, -0.222948f, -0.059008f, -0.049759f, -0.056848f, 0.087497f,
+ -0.039987f, -0.055042f, -0.041623f, -0.078424f, -0.317291f, -0.191398f,
+ 0.632147f, 0.221825f, 0.268394f, -0.096357f, 0.442545f, -0.007117f,
+ -0.036125f, 0.000525f, 0.088092f, -0.203653f, 0.086925f, 0.439141f,
+ 0.329889f, -0.370050f, -0.194306f, -0.207430f, 0.132779f, -0.217614f,
+ -0.039444f, -0.053019f, -0.260725f, -0.116563f, -0.271048f, 0.283737f,
+ -0.007300f, 0.062257f, -0.347865f, -0.296767f, -0.359123f, 0.230459f,
+ -0.189117f, -0.087622f, -0.561091f, 0.184182f, -0.044980f, 0.012643f,
+ 0.241672f, 0.050272f, -0.204851f, -0.159285f, -0.064081f, -0.118666f,
+ -0.269471f, 0.231668f, 0.135749f, -0.131162f, 0.062760f, 0.100949f,
+ 0.074967f, -0.056918f, 0.251707f, 0.034098f, 0.341290f, -0.105027f,
+ 0.313246f, -0.092679f, -0.014632f, -0.390967f, 0.136881f, -0.241554f,
+ 0.097674f, 0.110832f, -0.390245f, 0.017654f, -0.506222f, 0.065252f,
+ 0.244834f, -0.171352f, -0.331702f, 0.111043f, 0.125217f, -0.058116f,
+ -0.382595f, -0.052545f, 0.114261f, -0.493617f, 0.243984f, -0.171053f,
+ 0.165009f, -0.063020f, 0.096502f, 0.341339f, -0.013443f, 0.056372f,
+ 0.339284f, 0.398376f, 0.389409f, 0.257252f, 0.517368f, 0.078856f,
+ 0.087716f, -0.171092f, 0.227461f, 0.125307f, -0.054423f, -0.143161f,
+ 0.224041f, -0.086477f, -0.092548f, 0.072392f, -0.061608f, 0.258347f,
+ 0.147033f, -0.478244f, -0.204869f, 0.038552f, -0.144563f, 0.224087f,
+ -0.296705f, 0.153889f, -0.064624f, 0.085265f, -0.103826f, 0.127971f,
+ 0.019965f, 0.111937f, -0.074187f, -0.029518f, -0.127305f, -0.012210f,
+ 0.042714f, 0.070052f, -0.202360f, 0.348144f, -0.132097f, -0.209585f,
+ -0.248286f, -0.065774f, -0.089482f, -0.133226f, 0.325430f, -0.013468f,
+ -0.406090f, -0.144936f, 0.208620f, 0.343445f, -0.059639f, 0.114857f,
+ -0.069431f, -0.218725f, 0.190575f, -0.368101f, 0.030030f, 0.062815f,
+ -0.239369f, -0.537852f, 0.022487f, 0.023038f, 0.190788f, 0.040123f,
+ -0.004304f, 0.060749f, -0.108929f, 0.136796f, -0.542875f, -0.227074f,
+ -0.182244f, 0.082559f, 0.019149f, 0.178854f, 0.120284f, 0.009070f,
+ 0.068268f, -0.544822f, 0.120536f, 0.354028f, -0.119890f, -0.122055f,
+ -0.405335f, 0.122341f, -0.304412f, 0.062405f, -0.302568f, -0.276505f,
+ -0.120915f, -0.221841f, 0.282007f, -0.253971f, 0.059517f, -0.144976f,
+ 0.149391f, -0.047355f, -0.167742f, -0.392333f, -0.041132f, 0.342135f,
+ 0.017485f, 0.021038f, -0.023728f, -0.192181f, -0.103996f, 0.092873f,
+ -0.114365f, -0.397732f, -0.065421f, 0.053084f, 0.035201f, 0.053019f,
+ -0.105377f, -0.039500f, 0.131904f, -0.123911f, -0.390328f, -0.125198f,
+ -0.000126f, 0.014864f, -0.220187f, 0.084056f, -0.492155f, -0.164979f,
+ 0.133592f, 0.121519f, -0.240813f, 0.186680f, 0.118673f, 0.235006f,
+ -0.239894f, -0.185759f, -0.336992f, 0.209620f, -0.298845f, 0.127803f,
+ -0.083992f, 0.194340f, -0.245378f, 0.212308f, 0.142512f, -0.163324f,
+ 0.383495f, 0.291065f, 0.286620f, -0.239957f, 0.225127f, -0.174424f,
+ 0.297231f, -0.045434f, 0.156444f, -0.184273f, -0.204567f, 0.202551f,
+ 0.370019f, -0.073910f, 0.344897f, 0.063100f, 0.338547f, -0.099145f,
+ 0.391863f, -0.214244f, -0.241734f, -0.281851f, -0.035133f, -0.153157f,
+};
+
+static const float av1_tx_split_nn_bias_32x32_layer0[32] = {
+ 0.143343f, -0.021982f, -0.314939f, 0.170867f, -0.081248f, 0.125758f,
+ -0.355762f, 0.279798f, 1.027712f, -0.434660f, 1.072005f, 0.668893f,
+ -0.031216f, -0.528650f, 0.328349f, 0.543645f, -0.188810f, 0.221110f,
+ -1.638637f, 0.058045f, -1.731105f, -0.444284f, 0.513693f, 0.890025f,
+ 0.160288f, 0.393312f, 0.332856f, -0.080767f, 0.299822f, 0.235876f,
+ 0.254942f, -0.017796f,
+};
+
+static const float av1_tx_split_nn_weights_32x32_layer1[32] = {
+ -0.090326f, -0.267553f, -0.026071f, 0.100912f, 0.279137f, 0.079064f,
+ -0.074885f, 0.053804f, 0.736810f, -0.031693f, -0.970514f, 0.174069f,
+ 0.095940f, -0.065047f, 0.052911f, 0.176728f, -0.058274f, 0.148364f,
+ -0.162210f, 0.093875f, -0.367663f, 0.020876f, 0.137280f, -1.099116f,
+ 0.146854f, 0.075590f, 0.228534f, 0.141993f, 0.072143f, 0.101421f,
+ -0.068547f, -0.154148f,
+};
+
+static const float av1_tx_split_nn_bias_32x32_layer1[1] = {
+ 0.316622f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_32x32 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_32x32_layer0,
+ av1_tx_split_nn_weights_32x32_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_32x32_layer0,
+ av1_tx_split_nn_bias_32x32_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 64x64 block.
+static const float av1_tx_split_nn_weights_64x64_layer0[12 * 32] = {
+ -0.006828f, 0.149944f, -0.017614f, -0.044599f, -0.024517f, 0.507698f,
+ 0.001039f, 0.037164f, 0.015091f, -0.306620f, -0.162047f, -0.369440f,
+ 0.396310f, 0.087121f, 0.208609f, -0.083068f, 0.493774f, 0.217682f,
+ 0.377393f, 0.172879f, 0.397422f, 0.078919f, 0.741350f, 0.064169f,
+ -0.099989f, -0.192983f, -0.278230f, -0.310048f, -0.439965f, -0.226698f,
+ -0.436596f, -0.007551f, -0.396721f, 0.153570f, -0.190838f, -0.071869f,
+ 0.048799f, -0.301301f, -0.005015f, 0.500480f, -0.030622f, -0.559095f,
+ -0.032634f, -0.054160f, -0.056979f, -0.456545f, 0.306536f, -0.411323f,
+ -0.005366f, -0.069496f, 0.019990f, 0.327931f, -0.002516f, 0.393190f,
+ 0.001759f, 0.035093f, -0.030302f, -0.528984f, 0.174781f, 0.241462f,
+ -0.415427f, -0.164502f, 0.143065f, -0.122595f, 0.082049f, -0.143346f,
+ 0.055642f, -0.124701f, 0.004050f, -0.216235f, -2.681730f, 0.101658f,
+ 0.381239f, 0.465936f, 0.331154f, 0.301708f, -0.360171f, 0.054886f,
+ -0.118658f, 0.287921f, 0.277859f, 0.203784f, 0.247809f, 0.656924f,
+ -0.354628f, 0.315081f, 0.105108f, -0.510179f, 0.059267f, 0.061386f,
+ 0.076423f, 0.347119f, 0.100134f, 0.028402f, -0.118621f, -0.238689f,
+ 0.080141f, -0.138863f, 0.009009f, -0.100526f, -0.138875f, 0.066992f,
+ 0.005949f, 0.564336f, 0.046994f, 0.004655f, 0.366047f, 0.014695f,
+ -0.146928f, -0.024665f, -0.440357f, -0.109395f, 0.527231f, -0.020925f,
+ -0.227236f, -0.068141f, 0.282009f, 0.040192f, -0.267100f, 0.229228f,
+ 0.133861f, 0.338706f, -0.030178f, -0.040919f, -0.026343f, -0.330338f,
+ -0.066931f, -0.110580f, -0.072056f, 0.599457f, -0.020738f, 0.169200f,
+ 0.836240f, -0.157548f, 0.386273f, 0.002404f, 0.329410f, -0.007020f,
+ 0.351705f, -0.041259f, 0.388861f, 0.003899f, 0.582627f, 0.023572f,
+ 0.409912f, -0.158472f, 0.536383f, 0.525093f, 0.604247f, 0.439159f,
+ 0.692832f, 0.046272f, 0.590367f, -0.082166f, 0.262357f, 0.478671f,
+ 0.031935f, 0.042675f, 0.120002f, 0.398616f, -0.078967f, 0.227986f,
+ -0.044679f, 0.151061f, -0.085564f, 0.220205f, -0.265606f, -0.203623f,
+ 0.204719f, -0.125922f, 0.038544f, -0.269379f, 0.025866f, 0.109967f,
+ 0.019064f, -0.237297f, -0.309746f, -0.329118f, -0.278368f, -0.063859f,
+ 0.278496f, 0.018620f, 0.209971f, 0.296250f, 0.142850f, 0.288689f,
+ 0.137084f, 0.130517f, 0.128171f, -0.155396f, -0.008449f, -0.099845f,
+ 0.173455f, -0.059909f, -0.147318f, 0.102851f, -0.251389f, -0.001448f,
+ 0.103907f, 0.297273f, -0.027846f, 0.028260f, -0.382601f, 0.346695f,
+ -0.601641f, 0.162366f, -0.477495f, -0.042731f, -0.387871f, -0.051791f,
+ -0.401498f, -0.048446f, -0.456270f, -0.062287f, 0.493919f, 0.003008f,
+ 0.099917f, -0.358525f, -0.094903f, -0.022811f, -0.062259f, 0.019455f,
+ -0.050644f, 0.020041f, -0.132912f, -0.061578f, -3.083691f, -0.014961f,
+ -0.129115f, -0.710559f, 0.157213f, -0.844037f, -0.121991f, -0.943386f,
+ -0.231269f, -0.003462f, 0.331478f, -0.132703f, -1.285993f, -0.120957f,
+ -0.373755f, -0.322609f, 0.309059f, -0.131523f, -0.118334f, -0.063805f,
+ -0.104251f, 0.012166f, -0.094699f, -0.283753f, 0.128168f, -0.526929f,
+ -0.050331f, 0.186153f, 0.005913f, -0.221236f, 0.036363f, 0.160909f,
+ -0.001342f, -0.382749f, 0.037820f, 0.281689f, -0.024275f, 0.028854f,
+ 0.318291f, 0.318526f, 0.035778f, 0.034031f, 0.189663f, -0.293367f,
+ 0.082022f, 0.127923f, 0.078866f, -0.081361f, -0.268117f, 0.246675f,
+ 0.248605f, -0.215479f, -0.073084f, 0.496140f, -0.067327f, 0.396237f,
+ -0.120739f, 0.033752f, -0.044120f, -0.218941f, -0.028078f, 0.195132f,
+ -0.040400f, 0.281604f, -0.100471f, 0.415207f, -0.258503f, -0.429749f,
+ 0.150569f, -0.010859f, 0.136448f, 0.026589f, 0.148466f, 0.110764f,
+ 0.380967f, 0.009177f, 0.103075f, 0.116417f, 0.226273f, -0.327746f,
+ 0.169346f, 0.284553f, -0.094986f, 0.312745f, -0.147840f, 0.025062f,
+ -0.494482f, 0.112388f, -0.213962f, 0.107050f, -0.433371f, -0.096276f,
+ -0.244835f, -0.003518f, -0.459148f, -0.145080f, 0.017150f, 0.042846f,
+ -0.237479f, 0.104746f, 0.158677f, 0.358937f, 0.099921f, 0.277109f,
+ 0.012410f, -0.062897f, 0.116130f, 0.255309f, 0.341628f, 0.145002f,
+ -0.429344f, -0.016433f, -0.068985f, 0.285194f, -0.286719f, -0.018298f,
+ -0.179369f, -0.194655f, -0.165380f, 0.026071f, -0.428268f, -0.379929f,
+ -0.727543f, 0.179610f, -0.963979f, -0.042026f, -0.616202f, 0.133401f,
+ -0.784966f, 0.061205f, -0.713357f, 0.129795f, 0.120512f, -0.339545f,
+ 0.353557f, 0.114906f, -0.329813f, -0.209987f, 0.085410f, 0.214313f,
+ -0.122082f, 0.335770f, -0.020937f, 0.202456f, 0.289023f, -0.421186f,
+ 0.337905f, 0.407663f, 0.132771f, 0.071734f, 0.213914f, 0.128595f,
+ 0.302659f, -0.209501f, 0.217756f, 0.253079f, -0.089505f, -0.205614f,
+};
+
+static const float av1_tx_split_nn_bias_64x64_layer0[32] = {
+ 0.296914f, -1.826816f, 0.346130f, 0.969520f, -0.528154f, 1.175862f,
+ -0.075985f, -0.097323f, -0.233059f, 0.004846f, 0.401279f, -2.272435f,
+ 0.086257f, 0.414162f, -0.194786f, -0.233887f, -0.113215f, -2.453546f,
+ 0.861214f, 0.298361f, 0.267397f, -0.158557f, -0.119911f, -0.098134f,
+ -0.339263f, 0.385871f, -0.678123f, 0.263218f, 0.251611f, -1.155773f,
+ -0.365437f, 0.229255f,
+};
+
+static const float av1_tx_split_nn_weights_64x64_layer1[32] = {
+ 0.502104f, -0.708023f, 0.419648f, 1.583418f, 0.419355f, -1.462981f,
+ -0.439623f, 0.405691f, 0.823257f, 0.061654f, 0.750875f, 0.775031f,
+ -0.387909f, 0.447385f, 0.284690f, 0.353262f, -0.224347f, 0.832864f,
+ -1.708491f, -1.042447f, -0.272829f, 0.540640f, 0.310509f, 0.723745f,
+ 0.245592f, -0.218417f, -0.597987f, -0.362301f, 0.702217f, -0.692614f,
+ 0.207812f, 0.513560f,
+};
+
+static const float av1_tx_split_nn_bias_64x64_layer1[1] = { -0.2307045f };
+
+static const NN_CONFIG av1_tx_split_nnconfig_64x64 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_64x64_layer0,
+ av1_tx_split_nn_weights_64x64_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_64x64_layer0,
+ av1_tx_split_nn_bias_64x64_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 4x16 block.
+static const float av1_tx_split_nn_weights_4x16_layer0[8 * 16] = {
+ -1.344184f, -1.454625f, -0.703110f, -0.140570f, -0.841536f, -0.068131f,
+ -2.128968f, -0.655518f, 0.432180f, 0.879752f, -0.222211f, 0.061615f,
+ -0.230969f, 0.569496f, 1.424188f, 0.598063f, -0.436005f, -0.737606f,
+ -0.137875f, -0.085730f, -0.076512f, -0.583101f, -0.937377f, -0.203556f,
+ -0.215797f, -0.015361f, -0.124098f, -0.411917f, 0.340441f, -0.331752f,
+ -0.472607f, -0.097714f, -0.930572f, -1.354713f, -0.550724f, 0.176212f,
+ -0.636060f, 0.183271f, -0.610212f, 0.345895f, -1.100906f, -1.605713f,
+ 0.111888f, -0.140937f, 0.063013f, -0.013315f, -0.273472f, -0.255870f,
+ 1.200328f, 0.274002f, 1.005776f, 0.322392f, 1.222373f, 0.158227f,
+ 0.408810f, 0.145022f, 0.139842f, -1.249412f, 0.286672f, -0.635699f,
+ 0.312562f, -0.495606f, -1.117034f, -0.085107f, -0.097484f, -0.341521f,
+ -0.132199f, -0.863055f, 0.217579f, -1.161425f, -0.302087f, -1.357271f,
+ -0.520724f, -1.211069f, -1.048729f, -0.333087f, -1.171527f, -0.280824f,
+ -2.057684f, -0.228755f, 0.606278f, 0.101198f, -0.314847f, -1.303255f,
+ -0.294964f, 1.301923f, 0.041712f, 0.077593f, -1.152746f, 0.495315f,
+ -0.751566f, 0.230249f, -0.840661f, 0.100731f, 1.346269f, 0.649898f,
+ -1.432258f, -0.456710f, -1.018123f, -0.348559f, -1.225226f, -0.170717f,
+ -0.354072f, 0.068292f, -0.234168f, 0.277503f, 0.179134f, 0.907420f,
+ 0.354626f, -0.627210f, 0.905779f, 0.512612f, 0.161190f, -0.843177f,
+ 0.014953f, -0.354983f, 0.011116f, -0.429598f, -1.017138f, -0.211432f,
+ 0.941840f, -0.281747f, 0.957776f, -0.541914f, 1.041880f, -0.433580f,
+ -1.416451f, -0.166467f,
+};
+
+static const float av1_tx_split_nn_bias_4x16_layer0[16] = {
+ 3.086118f, -3.235095f, 4.830956f, -0.165706f, 0.955031f, 4.055783f,
+ -0.311489f, 4.660205f, -0.576277f, -0.248111f, -0.790519f, -1.686412f,
+ -1.191704f, -3.800073f, 4.121552f, -1.399397f,
+};
+
+static const float av1_tx_split_nn_weights_4x16_layer1[16] = {
+ -0.758677f, 0.388776f, 0.439906f, 0.011390f, -0.084319f, -0.667969f,
+ -0.467316f, -0.875491f, -0.160668f, 0.805292f, 0.114393f, -0.549682f,
+ 0.462109f, 0.343315f, 1.092593f, 0.483152f,
+};
+
+static const float av1_tx_split_nn_bias_4x16_layer1[1] = {
+ 0.8205083f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_4x16 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_4x16_layer0,
+ av1_tx_split_nn_weights_4x16_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_4x16_layer0,
+ av1_tx_split_nn_bias_4x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 16x32 block.
+static const float av1_tx_split_nn_weights_16x32_layer0[8 * 32] = {
+ 0.180713f, 0.033211f, 0.607561f, 0.138642f, 0.637204f, -0.000940f,
+ 0.012630f, 0.358109f, 0.022238f, 0.190418f, 0.079088f, 0.065925f,
+ 0.038242f, 0.162380f, -0.122728f, 0.379382f, -0.303283f, -0.327550f,
+ 0.029120f, -0.284553f, 0.269588f, -0.309805f, -0.241036f, -0.161103f,
+ -0.304887f, 0.239843f, -0.149146f, 0.311234f, -0.073640f, -0.132718f,
+ 0.178901f, 0.474712f, 0.020280f, 0.063685f, -0.609170f, -0.013658f,
+ -0.338074f, 0.250429f, 0.082978f, -0.186315f, -0.788959f, 0.039859f,
+ -0.426461f, -0.001524f, -0.447211f, 0.378102f, 0.315617f, 0.017428f,
+ 0.745494f, -0.219024f, 0.512836f, 0.200522f, 0.680449f, 0.313686f,
+ -0.412569f, -0.132927f, 0.631120f, 0.042735f, 0.336153f, 0.044772f,
+ 0.432606f, 0.175681f, -0.634411f, -0.073509f, -0.040643f, -0.559260f,
+ -0.104034f, -0.570495f, -0.247365f, 0.063256f, -0.582021f, -0.492585f,
+ -0.194955f, -0.207934f, -0.506627f, 0.021743f, -0.416518f, 0.320876f,
+ 0.115889f, 0.149399f, -0.229376f, 0.095505f, 0.115191f, -0.471921f,
+ 0.113068f, 0.343684f, -0.036831f, 0.021240f, 0.295112f, 0.031166f,
+ 0.448201f, -0.132241f, 0.164032f, 0.355572f, 0.072154f, 0.017335f,
+ -0.046113f, 0.178719f, -0.026881f, -0.242590f, 0.055073f, -0.012958f,
+ 0.077904f, 0.351356f, 0.107655f, 0.260568f, -0.080052f, -0.197553f,
+ 0.085763f, 0.263416f, -0.327741f, 0.158855f, 0.056899f, -0.162121f,
+ 0.339518f, -0.571204f, 0.264966f, -0.252214f, -0.202560f, -0.134213f,
+ -0.330188f, 0.009470f, -0.468376f, -0.065240f, -0.307957f, 0.116479f,
+ -0.222238f, -0.458716f, 0.186493f, -0.391415f, 0.118649f, -0.104653f,
+ -0.259958f, -0.332081f, -0.403785f, -0.050147f, -0.573511f, 0.177117f,
+ -0.598358f, 0.164947f, -0.119694f, -0.058520f, 0.203829f, -0.267404f,
+ -0.048202f, -0.600006f, 0.181594f, -0.731805f, 0.146417f, -0.687148f,
+ -1.210525f, -0.450101f, -0.620635f, 0.208825f, -0.611357f, 0.112202f,
+ -0.309468f, -0.323545f, 0.357770f, 0.308061f, 0.553199f, 0.049012f,
+ 0.530093f, -0.208597f, 0.607882f, -0.058120f, -0.527634f, 0.018136f,
+ 0.060753f, 0.118894f, 0.175649f, 0.014731f, 0.428318f, -0.106465f,
+ -0.119077f, 0.080179f, 0.524997f, 0.368286f, 0.528286f, 0.213659f,
+ 0.639286f, 0.195079f, -0.049815f, -0.092008f, -0.302958f, 0.298149f,
+ -0.173870f, -0.145205f, -0.233589f, -0.303368f, 0.141275f, 0.325622f,
+ -0.115293f, 0.155188f, 0.047225f, 0.231050f, -0.167447f, 0.349754f,
+ 0.295544f, -0.319466f, 0.095144f, 0.174612f, -0.194652f, 0.305915f,
+ -0.239008f, -0.037453f, 0.280696f, 0.125850f, 0.749196f, -0.101919f,
+ 0.791808f, -0.236811f, 0.064157f, 0.032865f, -0.225911f, 0.350384f,
+ 0.723183f, -0.103992f, 0.483085f, -0.123992f, 0.602138f, 0.023895f,
+ -0.692601f, -0.118387f, 0.162527f, 0.145178f, -0.184702f, -0.017753f,
+ -0.159436f, 0.124105f, -0.131067f, 0.310275f, 0.151499f, 0.138924f,
+ 0.537459f, 0.263212f, 0.615896f, 0.281255f, 0.021293f, -0.473459f,
+ 0.210145f, -0.056682f, 0.063658f, 0.377254f, -0.314410f, -0.183487f,
+ 0.300384f, 0.328471f, 0.164694f, -0.159272f, -0.160942f, -0.502861f,
+ -0.129147f, 0.045916f, -0.606865f, -0.101378f,
+};
+
+static const float av1_tx_split_nn_bias_16x32_layer0[32] = {
+ 0.051664f, -0.212487f, -0.077596f, -0.818467f, 0.638475f, -0.759937f,
+ 0.157198f, 0.989640f, 1.586035f, 0.431144f, 0.041605f, 0.543085f,
+ 0.498379f, 0.320504f, 0.134233f, 0.670979f, -0.105562f, -1.574879f,
+ 1.261812f, -0.287530f, -1.610592f, 0.730899f, -0.894240f, -0.657790f,
+ 0.270806f, -0.181708f, 0.298578f, 0.817240f, -0.221508f, -0.201771f,
+ -0.294389f, 1.456413f,
+};
+
+static const float av1_tx_split_nn_weights_16x32_layer1[32] = {
+ 1.208914f, 0.324728f, 0.383352f, -0.874321f, 0.172565f, -0.580927f,
+ -0.432927f, 0.433698f, -0.801935f, 0.672028f, 0.563493f, 0.260077f,
+ -0.200557f, -0.121638f, 0.530735f, -0.525196f, 0.281799f, 0.624204f,
+ -0.662775f, -0.230887f, 0.980989f, 0.223437f, -0.790591f, 0.600724f,
+ -0.273445f, 0.427635f, -0.501641f, -0.878390f, 0.234731f, -0.172550f,
+ 0.418904f, 1.792187f,
+};
+
+static const float av1_tx_split_nn_bias_16x32_layer1[1] = {
+ -0.29233751f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x32 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_16x32_layer0,
+ av1_tx_split_nn_weights_16x32_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_16x32_layer0,
+ av1_tx_split_nn_bias_16x32_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 32x64 block.
+static const float av1_tx_split_nn_weights_32x64_layer0[8 * 32] = {
+ 0.031614f, -0.110926f, 0.052418f, -0.702506f, 0.045708f, 0.238329f,
+ -0.021806f, -0.208128f, 0.509745f, -0.293891f, 0.277788f, 0.113937f,
+ 0.741576f, 0.062848f, 0.351878f, 0.212532f, 0.385842f, 0.081517f,
+ 0.398502f, -0.015156f, 0.242616f, 0.214619f, -0.182678f, -0.170546f,
+ 0.110605f, -0.236749f, -0.023831f, -0.285243f, 0.147156f, -0.257639f,
+ 0.341355f, -0.571641f, -0.721797f, 0.139588f, -0.518494f, -0.206526f,
+ -0.570560f, -0.184295f, 0.110271f, 0.210292f, -0.109132f, -0.001080f,
+ 0.129251f, -0.204230f, -0.396312f, -0.183024f, 0.421243f, -0.013154f,
+ 0.222627f, 0.169826f, 0.226037f, 0.218153f, -0.343528f, 0.274906f,
+ -0.156632f, 0.250261f, -0.484020f, 0.019909f, -0.349575f, -0.286643f,
+ -0.507396f, 0.202446f, -0.154110f, -0.292644f, 0.122666f, 0.306963f,
+ 0.424895f, 0.005579f, 0.494094f, -0.079551f, 0.473740f, 0.352414f,
+ -0.356917f, 0.264331f, -0.554487f, 0.119978f, 0.012291f, -0.141641f,
+ -0.254714f, -0.213723f, -0.116701f, -0.011267f, 0.190025f, -0.118501f,
+ 0.305151f, -0.316782f, -0.220801f, -0.308420f, -0.324285f, 0.421329f,
+ -0.177066f, -0.055114f, 0.229698f, -0.199523f, 0.054278f, 0.365020f,
+ -0.060586f, -0.300618f, 0.157563f, -0.064338f, -0.005711f, -0.176991f,
+ -0.424502f, -0.111914f, 0.092608f, 0.126621f, 0.078547f, 0.148008f,
+ 0.024221f, 0.124599f, 0.001343f, 0.059402f, 0.453753f, 0.047102f,
+ 0.242544f, 0.055735f, -0.067451f, -0.170061f, -0.170469f, -0.232173f,
+ 0.214908f, 0.248889f, 0.544348f, -0.084566f, 0.402478f, 0.298031f,
+ 0.099038f, -0.238019f, -0.475085f, -0.070042f, -0.754955f, -0.049095f,
+ -0.783801f, -0.099857f, -0.582008f, -0.055194f, -0.103655f, 0.143689f,
+ 0.100219f, 0.293934f, 0.099271f, -0.036320f, 0.356626f, -0.261445f,
+ 0.879544f, 0.000878f, 0.532920f, -0.093918f, 0.508867f, -0.040215f,
+ -0.789042f, -0.145380f, -0.090040f, -0.066636f, 0.015212f, 0.352989f,
+ -0.058831f, -0.164588f, 0.039890f, 0.122861f, 0.222508f, 0.061217f,
+ 0.466487f, 0.022666f, 0.423777f, -0.002200f, -0.656835f, -0.099760f,
+ -0.520606f, 0.303204f, -0.563620f, -0.160922f, -0.243203f, 0.313354f,
+ -0.336516f, -0.206764f, -0.236040f, 0.325899f, -0.418748f, 0.163205f,
+ -0.476242f, -0.121928f, 0.139178f, -0.157193f, -0.531766f, -0.180202f,
+ -0.485254f, 0.187703f, -0.440072f, 0.137854f, 0.029139f, 0.109530f,
+ -0.078475f, -0.360618f, -0.334672f, -0.350890f, -0.403976f, 0.180336f,
+ -0.304542f, 0.005123f, 0.413995f, 0.314639f, 0.342648f, -0.293264f,
+ 0.358135f, -0.180425f, -0.369530f, -0.048413f, 0.498366f, 0.121875f,
+ 0.270948f, -0.187966f, 0.342503f, 0.174420f, -0.352105f, 0.088080f,
+ 0.008277f, 0.020275f, -0.002381f, 0.504389f, -0.018832f, -0.366047f,
+ -0.090947f, -0.168150f, 0.016184f, -0.328914f, 0.089579f, -0.017349f,
+ 0.005844f, -0.005010f, -1.857514f, -0.282426f, 0.010177f, -0.214727f,
+ -0.182529f, 0.156943f, -0.162032f, -0.472654f, 0.069432f, 0.016901f,
+ -0.767905f, 0.137129f, -0.411463f, 0.049056f, -0.431657f, -0.037641f,
+ 0.785500f, 0.046225f, 0.195831f, 0.245204f, 0.368614f, 0.212261f,
+ 0.440626f, -0.158048f, -0.461031f, -0.146280f,
+};
+
+static const float av1_tx_split_nn_bias_32x64_layer0[32] = {
+ 0.490777f, -1.894238f, 0.621333f, -0.076756f, 0.286298f, 0.286375f,
+ -0.126431f, -0.350034f, -1.017572f, 0.620125f, 0.408128f, 0.238756f,
+ -0.060728f, 0.210912f, 0.043124f, 0.445649f, 0.907025f, 0.360272f,
+ 1.083101f, -0.068952f, 1.062348f, 0.396354f, 0.280075f, 0.501732f,
+ 0.328422f, 0.066241f, 0.474697f, 0.126313f, 0.741206f, 0.314796f,
+ 0.552712f, 0.299410f,
+};
+
+static const float av1_tx_split_nn_weights_32x64_layer1[32] = {
+ 1.033823f, 0.603439f, 0.304591f, -0.279940f, -0.780909f, -0.132801f,
+ 0.154059f, 0.662014f, -0.718368f, 0.198733f, 0.039766f, -0.208516f,
+ -0.104909f, -0.394209f, 0.081617f, 0.365041f, -0.874960f, -0.063315f,
+ -1.189897f, 0.337225f, 0.410893f, 0.307519f, 0.221323f, 0.233895f,
+ 0.469536f, 0.438557f, 0.280144f, 0.422423f, -1.394513f, 0.781900f,
+ 0.352981f, 0.111265f,
+};
+
+static const float av1_tx_split_nn_bias_32x64_layer1[1] = {
+ -0.18160765f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_32x64 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_32x64_layer0,
+ av1_tx_split_nn_weights_32x64_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_32x64_layer0,
+ av1_tx_split_nn_bias_32x64_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 8x32 block.
+static const float av1_tx_split_nn_weights_8x32_layer0[8 * 24] = {
+ -0.687846f, 0.121404f, -0.372905f, 0.126770f, -0.103298f, -0.101650f,
+ -0.148490f, -0.271740f, 0.682915f, -0.079765f, 0.634347f, -0.151503f,
+ 0.287692f, -0.079072f, -0.236948f, 0.065064f, 0.713383f, 0.397123f,
+ 0.553621f, 0.368529f, 0.767663f, -0.046601f, -0.392402f, -0.294822f,
+ -0.292325f, -0.010573f, -0.837945f, 0.050113f, -0.811360f, 0.199162f,
+ 0.150832f, 0.011602f, 0.369694f, -0.225876f, 0.234113f, -0.269808f,
+ 0.303805f, -0.190281f, -0.451136f, 0.209755f, -0.308894f, 0.326956f,
+ 0.313591f, 0.089923f, -0.095754f, 0.390981f, 0.467366f, 0.169670f,
+ 0.853322f, 0.054055f, 0.830319f, -0.121918f, 0.262019f, -0.093526f,
+ 0.385558f, 0.419174f, 0.040198f, -0.347030f, -0.450492f, -0.106764f,
+ 0.487502f, -0.204188f, 0.430374f, -0.116388f, 0.236407f, -0.157376f,
+ 0.732294f, -0.651387f, 0.347446f, 0.342575f, 0.048406f, 0.187657f,
+ 0.434899f, -0.447782f, 0.032728f, -0.071168f, -0.255327f, 0.104174f,
+ 0.095689f, -0.431743f, 0.725694f, 0.031797f, 0.523171f, 0.061801f,
+ 0.469804f, -0.071068f, -0.059024f, -0.211937f, 0.392134f, -0.321490f,
+ 0.366060f, -0.427798f, 0.166771f, 0.299652f, 0.044660f, 0.205142f,
+ 0.039133f, -0.051835f, -0.465475f, 0.216976f, -0.341156f, 0.095358f,
+ 0.230807f, 0.201674f, 0.279266f, -0.713534f, -0.091690f, -0.569708f,
+ -0.119001f, 0.252160f, -1.544578f, -0.284477f, 0.555348f, 0.226471f,
+ 0.347690f, 0.034365f, 0.770835f, -0.241859f, -0.130241f, 0.292936f,
+ 0.396622f, -0.417916f, 0.492224f, 0.125517f, 0.344824f, 0.232172f,
+ -0.432106f, -0.278745f, 0.035069f, -0.307247f, -0.120760f, 0.170950f,
+ 0.433601f, 0.044286f, 0.141463f, -0.041382f, 0.529346f, 0.010868f,
+ -0.323674f, 0.185205f, 0.623459f, 0.232842f, -0.406693f, -0.142944f,
+ 0.222988f, 0.343634f, 0.065401f, 0.002621f, 0.805335f, -0.426926f,
+ 0.279181f, 0.131364f, 0.192339f, -0.402391f, 0.544120f, -0.060618f,
+ 0.467780f, 0.165224f, -0.373131f, 0.002427f, 0.688064f, 0.322317f,
+ 0.259713f, 0.130583f, 0.185032f, -0.189111f, -0.067821f, 0.010875f,
+ 0.644724f, -0.179291f, 0.463222f, 0.155230f, 0.721384f, -0.046019f,
+ 0.438501f, 0.440027f, -0.462090f, -0.002039f, -0.468026f, -0.008890f,
+ -0.328530f, 0.370102f, 0.482531f, 0.043471f, -0.469732f, -0.532663f,
+ 0.122081f, -0.379659f, 0.037219f, -0.519913f, -0.128975f, -0.404365f,
+};
+
+static const float av1_tx_split_nn_bias_8x32_layer0[24] = {
+ -1.198965f, 0.395204f, -0.408627f, -0.021654f, -0.658355f, 0.154525f,
+ -0.288354f, 1.207574f, 0.411608f, 0.964678f, -1.176893f, 1.059006f,
+ -0.472969f, 2.087975f, 1.065536f, 0.595569f, 0.197907f, -0.349938f,
+ 1.013651f, -0.931093f, -0.973595f, -0.459094f, -1.253062f, 1.624782f,
+};
+
+static const float av1_tx_split_nn_weights_8x32_layer1[24] = {
+ 0.815787f, -0.393465f, -0.483427f, -0.565592f, 0.493494f, 0.430229f,
+ -0.507073f, -0.251379f, -0.353418f, -0.495445f, 0.820029f, 0.649146f,
+ -0.487383f, 1.844503f, 0.480324f, -0.982705f, -0.501446f, -0.220584f,
+ 0.334299f, 0.802238f, 0.805838f, -0.487848f, 0.300772f, -1.232857f,
+};
+
+static const float av1_tx_split_nn_bias_8x32_layer1[1] = {
+ 0.13435879f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_8x32 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_8x32_layer0,
+ av1_tx_split_nn_weights_8x32_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_8x32_layer0,
+ av1_tx_split_nn_bias_8x32_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 16x64 block.
+static const float av1_tx_split_nn_weights_16x64_layer0[8 * 16] = {
+ -0.378223f, -0.124216f, -0.514089f, -0.110117f, -0.585801f, -0.094838f,
+ -0.455385f, -0.220254f, -0.504568f, -0.082351f, -0.476420f, -0.253993f,
+ -0.454709f, -0.059461f, 0.210313f, -0.155683f, 0.192968f, -0.127804f,
+ 0.471996f, 0.253377f, 0.472625f, 0.485322f, 0.150560f, 0.164868f,
+ -0.475587f, 0.447559f, -0.455759f, -0.306665f, -0.194866f, -0.283716f,
+ -0.243897f, 0.293020f, -0.308298f, -0.191904f, -0.468568f, 0.014053f,
+ -0.618848f, 0.096273f, -0.444586f, 0.347750f, -0.280643f, -0.062872f,
+ 0.118661f, 0.540099f, 0.104141f, -0.279300f, -0.098721f, -0.173427f,
+ -0.984558f, -0.424559f, -0.411928f, -0.120875f, -0.488999f, -0.050716f,
+ -0.523103f, 0.093620f, -0.930396f, -0.431997f, -1.163297f, 0.190384f,
+ -0.422581f, -0.005354f, 0.450552f, 0.369210f, 0.562484f, 0.679922f,
+ 0.282099f, -0.039075f, 0.404196f, 0.006371f, 0.069679f, -0.196160f,
+ -0.213675f, 0.275187f, -0.104235f, -0.193090f, 0.003116f, -0.252454f,
+ -0.094591f, 0.210439f, -0.137070f, 0.145043f, 0.024558f, 0.121718f,
+ 0.010138f, 0.301651f, -0.377990f, 0.444414f, 0.001845f, -0.095334f,
+ 0.550259f, 0.087603f, 0.792492f, -0.044584f, 0.641706f, -0.328458f,
+ -0.447791f, 0.135376f, 0.356385f, 0.135748f, 0.310370f, 0.293757f,
+ -0.062000f, -0.056368f, 0.343930f, 0.312039f, 0.370763f, 0.452381f,
+ -0.023630f, -0.185909f, 0.422277f, -0.006306f, 0.045166f, 0.423359f,
+ -0.157735f, -0.084901f, 0.219527f, -0.209510f, 0.575057f, 0.249276f,
+ 0.069267f, 0.233898f, -0.229392f, 0.117197f, -0.038551f, 0.293976f,
+ 0.101996f, 0.120878f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer0[16] = {
+ 1.036995f, 0.160249f, 0.100264f, 0.694881f, 0.694677f, 0.128379f,
+ -0.843405f, -0.405515f, 0.104139f, 0.182980f, -0.025472f, 0.901067f,
+ -0.299866f, -0.103079f, -0.190352f, -0.048121f,
+};
+
+static const float av1_tx_split_nn_weights_16x64_layer1[16] = {
+ -1.778868f, 0.174690f, 0.211991f, 0.712138f, 0.589352f, 0.466652f,
+ 1.029146f, -0.490044f, 0.483015f, 0.600215f, -0.577776f, -0.755546f,
+ 0.348337f, -0.205082f, 0.347129f, -0.322277f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer1[1] = {
+ 0.04230947f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x64 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_16x64_layer0,
+ av1_tx_split_nn_weights_16x64_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_16x64_layer0,
+ av1_tx_split_nn_bias_16x64_layer1,
+ },
+};
+/******************************************************************************/
+
+// Map transform size to its corresponding neural net model for tx split
+// prediction.
+static const NN_CONFIG *const av1_tx_split_nnconfig_map[TX_SIZES_ALL] = {
+ NULL, // TX_4X4,
+ &av1_tx_split_nnconfig_8x8, // TX_8X8,
+ &av1_tx_split_nnconfig_16x16, // TX_16X16,
+ &av1_tx_split_nnconfig_32x32, // TX_32X32,
+ &av1_tx_split_nnconfig_64x64, // TX_64X64,
+ &av1_tx_split_nnconfig_4x8, // TX_4X8,
+ &av1_tx_split_nnconfig_4x8, // TX_8X4,
+ &av1_tx_split_nnconfig_8x16, // TX_8X16,
+ &av1_tx_split_nnconfig_8x16, // TX_16X8,
+ &av1_tx_split_nnconfig_16x32, // TX_16X32,
+ &av1_tx_split_nnconfig_16x32, // TX_32X16,
+ &av1_tx_split_nnconfig_32x64, // TX_32X64,
+ &av1_tx_split_nnconfig_32x64, // TX_64X32,
+ &av1_tx_split_nnconfig_4x16, // TX_4X16,
+ &av1_tx_split_nnconfig_4x16, // TX_16X4,
+ &av1_tx_split_nnconfig_8x32, // TX_8X32,
+ &av1_tx_split_nnconfig_8x32, // TX_32X8,
+ &av1_tx_split_nnconfig_16x64, // TX_16X64,
+ &av1_tx_split_nnconfig_16x64, // TX_64X16,
+};
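+
+// Illustrative usage sketch (not part of this header): a caller would
+// typically look up the model for the current tx size and score its features
+// with av1_nn_predict(). The feature and threshold names below are
+// assumptions, not definitions from this file.
+//
+//   const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size];
+//   if (nn_config != NULL) {
+//     float score;
+//     av1_nn_predict(features, nn_config, 1, &score);
+//     if (score > split_thresh) try_tx_split = 1;
+//   }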
+
+#if !CONFIG_REALTIME_ONLY
+#define NUM_INTRA_TX_SPLIT_FEATURES 14
+#define NUM_INTRA_TX_SPLIT_HIDDEN_LAYERS 1
+#define NUM_INTRA_TX_SPLIT_HIDDEN_NODES 16
+// Model to prune intra transform depth for intra 8x8 block.
+static const float av1_intra_tx_split_8x8_mean[NUM_INTRA_TX_SPLIT_FEATURES] = {
+ 0.110706f, 18.901518f, 0.250436f, 13.483487f, 0.118141f,
+ 14.318728f, 0.028409f, 14.257664f, 0.045839f, 15.143358f,
+ 9.702971f, 14.300809f, 6.018646f, 3.682534f,
+};
+
+static const float av1_intra_tx_split_8x8_std[NUM_INTRA_TX_SPLIT_FEATURES] = {
+ 13.750575f, 13.440116f, 14.334330f, 12.236641f, 18.415247f,
+ 12.733355f, 18.309339f, 12.858130f, 23.465142f, 13.447014f,
+ 8.625048f, 10.456774f, 1.185447f, 1.810423f,
+};
+
+static const float av1_intra_tx_split_nn_weights_8x8_layer0
+ [NUM_INTRA_TX_SPLIT_FEATURES * NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = {
+ -0.156142f, -0.753623f, 0.026883f, 0.039188f, -0.035310f, 0.106140f,
+ 0.051622f, 0.077838f, 0.101632f, 0.107278f, 0.232200f, 0.269083f,
+ 0.048966f, -1.553293f, -0.113983f, -0.151248f, -0.067369f, 0.787292f,
+ 0.076651f, -0.802634f, 0.266414f, 1.107563f, -0.068848f, -0.956468f,
+ -0.074920f, -0.192258f, 0.006207f, 0.176196f, -0.493442f, 0.152290f,
+ -0.208874f, -0.014658f, 0.297385f, -0.351695f, 0.246295f, -0.178519f,
+ -0.204191f, 0.049663f, -0.330343f, -0.299754f, 0.246215f, -0.014558f,
+ -0.117611f, 0.206445f, 0.045840f, -0.047563f, -0.049679f, 0.406892f,
+ -0.052307f, -1.513404f, 0.166166f, 0.520760f, -0.143320f, -0.593928f,
+ -0.010533f, 0.250752f, 0.076738f, 0.537512f, -0.082619f, -1.534031f,
+ 0.047109f, 0.634247f, -0.089730f, 0.545534f, -0.022742f, -0.779047f,
+ -0.606358f, -0.199145f, -0.051269f, 0.248784f, 0.327545f, -0.851751f,
+ 0.071739f, 0.035975f, 0.387781f, -0.136427f, -0.284436f, 0.578449f,
+ -0.198276f, 0.579950f, 0.600111f, -0.370164f, -0.215297f, 0.517342f,
+ 0.200061f, -2.507660f, -0.030851f, 0.227315f, -0.078289f, 0.276052f,
+ -0.050281f, 0.251481f, -0.139318f, 0.281175f, 0.226524f, 0.058968f,
+ 0.197436f, 0.517294f, -0.105914f, -1.599567f, 0.064985f, 0.043209f,
+ -0.280038f, 0.126874f, 0.330387f, -0.014407f, 0.031241f, 0.237801f,
+ 0.948959f, -0.253791f, -0.022622f, -0.061430f, 0.265852f, 0.750823f,
+ 0.086606f, 0.853527f, -0.180971f, -1.255744f, -0.152979f, -1.022198f,
+ -0.044708f, 0.506424f, -0.501968f, -0.416863f, -0.012688f, 0.193523f,
+ -0.093698f, 0.430875f, 0.007379f, 0.019278f, 0.080890f, 0.462755f,
+ -0.054326f, -0.157611f, -0.004851f, -1.275676f, -0.060528f, -0.508170f,
+ 0.195429f, -0.023534f, 0.355211f, 0.983561f, -0.122036f, -0.911948f,
+ -0.172280f, -1.135245f, -0.043211f, 0.576456f, -0.075247f, 0.429734f,
+ -0.246309f, -0.355575f, -0.048809f, 0.217113f, 0.078385f, 0.720341f,
+ 0.007070f, 0.144617f, -0.167642f, 0.303056f, -0.031425f, 0.123448f,
+ -0.320530f, 0.164070f, -0.497849f, -0.233918f, -0.032123f, 0.084983f,
+ 0.312216f, 0.062609f, -0.389815f, 0.237593f, 0.000157f, -0.642068f,
+ 0.167898f, 0.495234f, -0.083493f, -0.555971f, 0.124437f, 0.381125f,
+ -0.459219f, 0.047924f, -0.138222f, -2.232816f, 0.127585f, -0.102420f,
+ 0.131598f, 0.036837f, -0.163055f, -0.067429f, -0.078521f, -0.055666f,
+ 1.387057f, 0.400154f, -0.003355f, -0.073627f, -0.305098f, -0.413383f,
+ -0.008266f, -0.038329f, 0.209808f, 0.375777f, 0.037274f, -0.050226f,
+ -0.100576f, 0.237441f, 0.237854f, 0.828296f, 0.001149f, -0.093964f,
+ 0.214051f, -0.031486f, -0.561307f, 0.014540f, 0.169357f, 0.323202f,
+ -0.395334f, -0.038941f, 0.476800f, -0.213122f, -0.287521f, -0.420717f,
+ -0.054142f, -0.102266f,
+ };
+
+static const float
+ av1_intra_tx_split_nn_bias_8x8_layer0[NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = {
+ -1.150850f, -0.236404f, 0.184554f, -0.904162f, -0.949979f, 0.427016f,
+ -0.546867f, -0.611094f, -0.676570f, -0.208959f, -0.286384f, 0.562238f,
+ 0.434197f, -0.746518f, 0.123085f, -0.549836f,
+ };
+
+static const float av1_intra_tx_split_nn_weights_8x8_layer1
+ [NUM_INTRA_TX_SPLIT_HIDDEN_NODES] = {
+ 0.749814f, 0.598172f, 0.375611f, 0.751612f, 0.947538f, -0.282228f,
+ -1.457522f, -1.092290f, 0.738657f, 0.575779f, 0.514823f, -0.560616f,
+ -0.491619f, -1.482014f, 0.524625f, -0.533590f,
+ };
+
+static const float av1_intra_tx_split_nn_bias_8x8_layer1[1] = {
+ -0.488888f,
+};
+
+static const NN_CONFIG av1_intra_tx_split_nnconfig_8x8 = {
+ NUM_INTRA_TX_SPLIT_FEATURES, // num_inputs
+ 1, // num_outputs
+ NUM_INTRA_TX_SPLIT_HIDDEN_LAYERS, // num_hidden_layers
+ {
+ NUM_INTRA_TX_SPLIT_HIDDEN_NODES,
+ }, // num_hidden_nodes
+ {
+ av1_intra_tx_split_nn_weights_8x8_layer0,
+ av1_intra_tx_split_nn_weights_8x8_layer1,
+ },
+ {
+ av1_intra_tx_split_nn_bias_8x8_layer0,
+ av1_intra_tx_split_nn_bias_8x8_layer1,
+ },
+};
+
+static const float av1_intra_tx_prune_nn_thresh_8x8[2] = { -0.405465f,
+ 0.405465f };
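+
+// Note: 0.405465 = ln(1.5). If the network output is interpreted as a logit,
+// these thresholds correspond to sigmoid probabilities of 0.4 and 0.6
+// (1 / (1 + exp(-0.405465)) = 0.6).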
+#endif // !CONFIG_REALTIME_ONLY
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/tx_search.c b/third_party/aom/av1/encoder/tx_search.c
new file mode 100644
index 0000000000..7292c01191
--- /dev/null
+++ b/third_party/aom/av1/encoder/tx_search.c
@@ -0,0 +1,3830 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/cfl.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/common/idct.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/sorting_network.h"
+#include "av1/encoder/tx_prune_model_weights.h"
+#include "av1/encoder/tx_search.h"
+#include "av1/encoder/txb_rdopt.h"
+
+#define PROB_THRESH_OFFSET_TX_TYPE 100
+
+struct rdcost_block_args {
+ const AV1_COMP *cpi;
+ MACROBLOCK *x;
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE];
+ RD_STATS rd_stats;
+ int64_t current_rd;
+ int64_t best_rd;
+ int exit_early;
+ int incomplete_exit;
+ FAST_TX_SEARCH_MODE ftxs_mode;
+ int skip_trellis;
+};
+
+typedef struct {
+ int64_t rd;
+ int txb_entropy_ctx;
+ TX_TYPE tx_type;
+} TxCandidateInfo;
+
+// origin_threshold * 128 / 100
+static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
+ {
+ 64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68,
+ },
+ {
+ 88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68,
+ },
+ {
+ 90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74,
+ },
+};
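+
+// Example of the scaling noted above: a stored value of 64 corresponds to an
+// original threshold of 50 (50 * 128 / 100 = 64), and 88 to 68.75.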
+
+// lookup table for predict_skip_txfm
+// int max_tx_size = max_txsize_rect_lookup[bsize];
+// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16)
+// max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16);
+static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = {
+ TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_8X16, TX_16X8,
+ TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16,
+ TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16, TX_16X4,
+ TX_8X8, TX_8X8, TX_16X16, TX_16X16,
+};
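+
+// E.g. BLOCK_16X32's max rect tx, TX_16X32, is taller than 16, so its entry
+// falls back to TX_16X16; BLOCK_8X32 falls back to its largest square tx,
+// TX_8X8.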
+
+// Look-up table for the square root of the number of pixels in a transform
+// block, rounded up to the next integer (ceiling).
+static const int sqrt_tx_pixels_2d[TX_SIZES_ALL] = { 4, 8, 16, 32, 32, 6, 6,
+ 12, 12, 23, 23, 32, 32, 8,
+ 8, 16, 16, 23, 23 };
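+// E.g. TX_8X16 covers 128 pixels, and ceil(sqrt(128)) = ceil(11.31) = 12. For
+// tx sizes with a 64-sample dimension the table appears to use the effective
+// 32-sample coefficient extent instead: TX_16X64 -> ceil(sqrt(16 * 32)) = 23.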
+
+static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+ const int16_t *diff = x->plane[0].src_diff;
+ const uint32_t hash =
+ av1_get_crc32c_value(&x->txfm_search_info.mb_rd_record->crc_calculator,
+ (uint8_t *)diff, 2 * rows * cols);
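+  // Fold the block size into the low bits so that identical residues at
+  // different block sizes do not collide.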
+ return (hash << 5) + bsize;
+}
+
+static INLINE int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record,
+ const int64_t ref_best_rd,
+ const uint32_t hash) {
+ int32_t match_index = -1;
+ if (ref_best_rd != INT64_MAX) {
+ for (int i = 0; i < mb_rd_record->num; ++i) {
+ const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
+ // If there is a match in the mb_rd_record, fetch the RD decision and
+ // terminate early.
+ if (mb_rd_record->mb_rd_info[index].hash_value == hash) {
+ match_index = index;
+ break;
+ }
+ }
+ }
+ return match_index;
+}
+
+static AOM_INLINE void fetch_mb_rd_info(int n4,
+ const MB_RD_INFO *const mb_rd_info,
+ RD_STATS *const rd_stats,
+ MACROBLOCK *const x) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ mbmi->tx_size = mb_rd_info->tx_size;
+ memcpy(x->txfm_search_info.blk_skip, mb_rd_info->blk_skip,
+ sizeof(mb_rd_info->blk_skip[0]) * n4);
+ av1_copy(mbmi->inter_tx_size, mb_rd_info->inter_tx_size);
+ av1_copy_array(xd->tx_type_map, mb_rd_info->tx_type_map, n4);
+ *rd_stats = mb_rd_info->rd_stats;
+}
+
+int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row,
+ int blk_col, const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize,
+ unsigned int *block_mse_q8) {
+ int visible_rows, visible_cols;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
+ NULL, &visible_cols, &visible_rows);
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int16_t *diff = x->plane[plane].src_diff;
+
+ diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2);
+ uint64_t sse =
+ aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
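+  // If requested, report the per-pixel MSE in Q8 (i.e. scaled by 256).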
+ if (block_mse_q8 != NULL) {
+ if (visible_cols > 0 && visible_rows > 0)
+ *block_mse_q8 =
+ (unsigned int)((256 * sse) / (visible_cols * visible_rows));
+ else
+ *block_mse_q8 = UINT_MAX;
+ }
+ return sse;
+}
+
+// Computes the residual block's SSE and mean on all visible 4x4s in the
+// transform block.
+static INLINE int64_t pixel_diff_stats(
+ MACROBLOCK *x, int plane, int blk_row, int blk_col,
+ const BLOCK_SIZE plane_bsize, const BLOCK_SIZE tx_bsize,
+ unsigned int *block_mse_q8, int64_t *per_px_mean, uint64_t *block_var) {
+ int visible_rows, visible_cols;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
+ NULL, &visible_cols, &visible_rows);
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int16_t *diff = x->plane[plane].src_diff;
+
+ diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2);
+ uint64_t sse = 0;
+ int sum = 0;
+ sse = aom_sum_sse_2d_i16(diff, diff_stride, visible_cols, visible_rows, &sum);
+ if (visible_cols > 0 && visible_rows > 0) {
+ double norm_factor = 1.0 / (visible_cols * visible_rows);
+ int sign_sum = sum > 0 ? 1 : -1;
+ // Conversion to transform domain
+ *per_px_mean = (int64_t)(norm_factor * abs(sum)) << 7;
+ *per_px_mean = sign_sum * (*per_px_mean);
+ *block_mse_q8 = (unsigned int)(norm_factor * (256 * sse));
+ *block_var = (uint64_t)(sse - (uint64_t)(norm_factor * sum * sum));
+ } else {
+ *block_mse_q8 = UINT_MAX;
+ }
+ return sse;
+}
+
+// Uses simple features on top of the DCT coefficients to quickly predict
+// whether the optimal RD decision is to skip encoding the residual.
+// The SSE value is stored in *dist.
+static int predict_skip_txfm(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
+ int reduced_tx_set) {
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
+
+ *dist = av1_pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL);
+
+ const int64_t mse = *dist / bw / bh;
+ // Normalized quantizer takes the transform upscaling factor (8 for tx size
+ // smaller than 32) into account.
+ const int16_t normalized_dc_q = dc_q >> 3;
+ const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8;
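+  // Illustration with hypothetical numbers: dc_q = 800 gives
+  // normalized_dc_q = 100 and mse_thresh = 100 * 100 / 8 = 1250.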
+  // For a faster early skip decision, compare dist against the threshold so
+  // that the quality risk of the skip=1 decision is lower. Otherwise, use mse,
+  // since the fwd_txfm coefficient checks below will take care of quality.
+  // TODO(any): Use dist to return 0 when skip_txfm_level is 1.
+ int64_t pred_err = (txfm_params->skip_txfm_level >= 2) ? *dist : mse;
+ // Predict not to skip when error is larger than threshold.
+ if (pred_err > mse_thresh) return 0;
+  // Otherwise, for aggressive early skip, return skip.
+ else if (txfm_params->skip_txfm_level >= 2)
+ return 1;
+
+ const int max_tx_size = max_predict_sf_tx_size[bsize];
+ const int tx_h = tx_size_high[max_tx_size];
+ const int tx_w = tx_size_wide[max_tx_size];
+ DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]);
+ TxfmParam param;
+ param.tx_type = DCT_DCT;
+ param.tx_size = max_tx_size;
+ param.bd = xd->bd;
+ param.is_hbd = is_cur_buf_hbd(xd);
+ param.lossless = 0;
+ param.tx_set_type = av1_get_ext_tx_set_type(
+ param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
+ const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2);
+ const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize];
+ const int16_t *src_diff = x->plane[0].src_diff;
+ const int n_coeff = tx_w * tx_h;
+ const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+ const uint32_t dc_thresh = max_qcoef_thresh * dc_q;
+ const uint32_t ac_thresh = max_qcoef_thresh * ac_q;
+ for (int row = 0; row < bh; row += tx_h) {
+ for (int col = 0; col < bw; col += tx_w) {
+ av1_fwd_txfm(src_diff + col, coefs, bw, &param);
+ // Operating on TX domain, not pixels; we want the QTX quantizers
+ const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7);
+ if (dc_coef >= dc_thresh) return 0;
+ for (int i = 1; i < n_coeff; ++i) {
+ const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7);
+ if (ac_coef >= ac_thresh) return 0;
+ }
+ }
+ src_diff += tx_h * bw;
+ }
+ return 1;
+}
+
+// Used to set proper context for early termination with skip = 1.
+static AOM_INLINE void set_skip_txfm(MACROBLOCK *x, RD_STATS *rd_stats,
+ BLOCK_SIZE bsize, int64_t dist) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int n4 = bsize_to_num_blk(bsize);
+ const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+ memset(xd->tx_type_map, DCT_DCT, sizeof(xd->tx_type_map[0]) * n4);
+ memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
+ mbmi->tx_size = tx_size;
+ for (int i = 0; i < n4; ++i)
+ set_blk_skip(x->txfm_search_info.blk_skip, 0, i, 1);
+ rd_stats->skip_txfm = 1;
+ if (is_cur_buf_hbd(xd)) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
+ rd_stats->dist = rd_stats->sse = (dist << 4);
+  // Although the decision here is to mark the block as skip based on luma
+  // stats, the block may become non-skip after the chroma RD pass. Moreover,
+  // the intermediate non-skip costs computed by the caller would be incorrect
+  // if the rate were set to zero (i.e., if zero_blk_rate were not accounted
+  // for). Hence an intermediate rate is populated here for coding the luma tx
+  // blocks as skip; the caller then sets the final rate based on the final RD
+  // decision (skip vs. non-skip). The rate populated corresponds to coding
+  // all the tx blocks in the current block with zero_blk_rate, based on the
+  // maximum possible tx size. E.g., for a 128x128 block the rate would be
+  // 4 * zero_blk_rate, where zero_blk_rate corresponds to coding one 64x64 tx
+  // block as 'all zeros'.
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl);
+ ENTROPY_CONTEXT *ta = ctxa;
+ ENTROPY_CONTEXT *tl = ctxl;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(bsize, tx_size, 0, ta, tl, &txb_ctx);
+ const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][PLANE_TYPE_Y]
+ .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+ rd_stats->rate = zero_blk_rate *
+ (block_size_wide[bsize] >> tx_size_wide_log2[tx_size]) *
+ (block_size_high[bsize] >> tx_size_high_log2[tx_size]);
+}
+
+static AOM_INLINE void save_mb_rd_info(int n4, uint32_t hash,
+ const MACROBLOCK *const x,
+ const RD_STATS *const rd_stats,
+ MB_RD_RECORD *mb_rd_record) {
+ int index;
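+  // The records form a ring buffer: append while there is room, otherwise
+  // overwrite the oldest entry and advance index_start.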
+ if (mb_rd_record->num < RD_RECORD_BUFFER_LEN) {
+ index =
+ (mb_rd_record->index_start + mb_rd_record->num) % RD_RECORD_BUFFER_LEN;
+ ++mb_rd_record->num;
+ } else {
+ index = mb_rd_record->index_start;
+ mb_rd_record->index_start =
+ (mb_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
+ }
+ MB_RD_INFO *const mb_rd_info = &mb_rd_record->mb_rd_info[index];
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ mb_rd_info->hash_value = hash;
+ mb_rd_info->tx_size = mbmi->tx_size;
+ memcpy(mb_rd_info->blk_skip, x->txfm_search_info.blk_skip,
+ sizeof(mb_rd_info->blk_skip[0]) * n4);
+ av1_copy(mb_rd_info->inter_tx_size, mbmi->inter_tx_size);
+ av1_copy_array(mb_rd_info->tx_type_map, xd->tx_type_map, n4);
+ mb_rd_info->rd_stats = *rd_stats;
+}
+
+static int get_search_init_depth(int mi_width, int mi_height, int is_inter,
+ const SPEED_FEATURES *sf,
+ int tx_size_search_method) {
+ if (tx_size_search_method == USE_LARGESTALL) return MAX_VARTX_DEPTH;
+
+ if (sf->tx_sf.tx_size_search_lgr_block) {
+ if (mi_width > mi_size_wide[BLOCK_64X64] ||
+ mi_height > mi_size_high[BLOCK_64X64])
+ return MAX_VARTX_DEPTH;
+ }
+
+ if (is_inter) {
+ return (mi_height != mi_width)
+ ? sf->tx_sf.inter_tx_size_search_init_depth_rect
+ : sf->tx_sf.inter_tx_size_search_init_depth_sqr;
+ } else {
+ return (mi_height != mi_width)
+ ? sf->tx_sf.intra_tx_size_search_init_depth_rect
+ : sf->tx_sf.intra_tx_size_search_init_depth_sqr;
+ }
+}
+
+static AOM_INLINE void select_tx_block(
+ const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+ RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
+ int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode);
+
+// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
+// 0: Do not collect any RD stats
+// 1: Collect RD stats for transform units
+// 2: Collect RD stats for partition units
+#if CONFIG_COLLECT_RD_STATS
+
+static AOM_INLINE void get_energy_distribution_fine(
+ const AV1_COMP *cpi, BLOCK_SIZE bsize, const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride, int need_4th, double *hordist,
+ double *verdist) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+ if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) {
+ // Special cases: calculate 'esq' values manually, as we don't have 'vf'
+ // functions for the 16 (very small) sub-blocks of this block.
+ const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3;
+ const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 2 : 3;
+ assert(bw <= 32);
+ assert(bh <= 32);
+ assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15);
+ if (cpi->common.seq_params->use_highbitdepth) {
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+ for (int i = 0; i < bh; ++i)
+ for (int j = 0; j < bw; ++j) {
+ const int index = (j >> w_shift) + ((i >> h_shift) << 2);
+ esq[index] +=
+ (src16[j + i * src_stride] - dst16[j + i * dst_stride]) *
+ (src16[j + i * src_stride] - dst16[j + i * dst_stride]);
+ }
+ } else {
+ for (int i = 0; i < bh; ++i)
+ for (int j = 0; j < bw; ++j) {
+ const int index = (j >> w_shift) + ((i >> h_shift) << 2);
+ esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) *
+ (src[j + i * src_stride] - dst[j + i * dst_stride]);
+ }
+ }
+ } else { // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks.
+ const int f_index =
+ (bsize < BLOCK_SIZES) ? bsize - BLOCK_16X16 : bsize - BLOCK_8X16;
+ assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL);
+ const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index;
+ assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]);
+ assert(block_size_high[bsize] == 4 * block_size_high[subsize]);
+ cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+ dst_stride, &esq[1]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+ dst_stride, &esq[2]);
+ cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[3]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+ dst_stride, &esq[5]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+ dst_stride, &esq[6]);
+ cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[7]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+ dst_stride, &esq[9]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+ dst_stride, &esq[10]);
+ cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[11]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+ dst_stride, &esq[13]);
+ cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+ dst_stride, &esq[14]);
+ cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[15]);
+ }
+
+ double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] +
+ esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] +
+ esq[12] + esq[13] + esq[14] + esq[15];
+ if (total > 0) {
+ const double e_recip = 1.0 / total;
+ hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip;
+ hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip;
+ hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip;
+ if (need_4th) {
+ hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip;
+ }
+ verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip;
+ verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip;
+ verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip;
+ if (need_4th) {
+ verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * e_recip;
+ }
+ } else {
+ hordist[0] = verdist[0] = 0.25;
+ hordist[1] = verdist[1] = 0.25;
+ hordist[2] = verdist[2] = 0.25;
+ if (need_4th) {
+ hordist[3] = verdist[3] = 0.25;
+ }
+ }
+}
+
+static double get_sse_norm(const int16_t *diff, int stride, int w, int h) {
+ double sum = 0.0;
+ for (int j = 0; j < h; ++j) {
+ for (int i = 0; i < w; ++i) {
+ const int err = diff[j * stride + i];
+ sum += err * err;
+ }
+ }
+ assert(w > 0 && h > 0);
+ return sum / (w * h);
+}
+
+static double get_sad_norm(const int16_t *diff, int stride, int w, int h) {
+ double sum = 0.0;
+ for (int j = 0; j < h; ++j) {
+ for (int i = 0; i < w; ++i) {
+ sum += abs(diff[j * stride + i]);
+ }
+ }
+ assert(w > 0 && h > 0);
+ return sum / (w * h);
+}
+
+static AOM_INLINE void get_2x2_normalized_sses_and_sads(
+ const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const uint8_t *const src,
+ int src_stride, const uint8_t *const dst, int dst_stride,
+ const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr,
+ double *const sad_norm_arr) {
+ const BLOCK_SIZE tx_bsize_half =
+ get_partition_subsize(tx_bsize, PARTITION_SPLIT);
+ if (tx_bsize_half == BLOCK_INVALID) { // manually calculate stats
+ const int half_width = block_size_wide[tx_bsize] / 2;
+ const int half_height = block_size_high[tx_bsize] / 2;
+ for (int row = 0; row < 2; ++row) {
+ for (int col = 0; col < 2; ++col) {
+ const int16_t *const this_src_diff =
+ src_diff + row * half_height * diff_stride + col * half_width;
+ if (sse_norm_arr) {
+ sse_norm_arr[row * 2 + col] =
+ get_sse_norm(this_src_diff, diff_stride, half_width, half_height);
+ }
+ if (sad_norm_arr) {
+ sad_norm_arr[row * 2 + col] =
+ get_sad_norm(this_src_diff, diff_stride, half_width, half_height);
+ }
+ }
+ }
+ } else { // use function pointers to calculate stats
+ const int half_width = block_size_wide[tx_bsize_half];
+ const int half_height = block_size_high[tx_bsize_half];
+ const int num_samples_half = half_width * half_height;
+ for (int row = 0; row < 2; ++row) {
+ for (int col = 0; col < 2; ++col) {
+ const uint8_t *const this_src =
+ src + row * half_height * src_stride + col * half_width;
+ const uint8_t *const this_dst =
+ dst + row * half_height * dst_stride + col * half_width;
+
+ if (sse_norm_arr) {
+ unsigned int this_sse;
+ cpi->ppi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst,
+ dst_stride, &this_sse);
+ sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half;
+ }
+
+ if (sad_norm_arr) {
+ const unsigned int this_sad = cpi->ppi->fn_ptr[tx_bsize_half].sdf(
+ this_src, src_stride, this_dst, dst_stride);
+ sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half;
+ }
+ }
+ }
+ }
+}
+
+#if CONFIG_COLLECT_RD_STATS == 1
+static double get_mean(const int16_t *diff, int stride, int w, int h) {
+ double sum = 0.0;
+ for (int j = 0; j < h; ++j) {
+ for (int i = 0; i < w; ++i) {
+ sum += diff[j * stride + i];
+ }
+ }
+ assert(w > 0 && h > 0);
+ return sum / (w * h);
+}
+static AOM_INLINE void PrintTransformUnitStats(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const RD_STATS *const rd_stats,
+ int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ TX_TYPE tx_type, int64_t rd) {
+ if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
+
+ // Generate small sample to restrict output size.
+ static unsigned int seed = 21743;
+ if (lcg_rand16(&seed) % 256 > 0) return;
+
+ const char output_file[] = "tu_stats.txt";
+ FILE *fout = fopen(output_file, "a");
+ if (!fout) return;
+
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int plane = 0;
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int txw = tx_size_wide[tx_size];
+ const int txh = tx_size_high[tx_size];
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+ const int q_step = p->dequant_QTX[1] >> dequant_shift;
+ const int num_samples = txw * txh;
+
+ const double rate_norm = (double)rd_stats->rate / num_samples;
+ const double dist_norm = (double)rd_stats->dist / num_samples;
+
+ fprintf(fout, "%g %g", rate_norm, dist_norm);
+
+ const int src_stride = p->src.stride;
+ const uint8_t *const src =
+ &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2];
+ const int dst_stride = pd->dst.stride;
+ const uint8_t *const dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+ unsigned int sse;
+ cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+ const double sse_norm = (double)sse / num_samples;
+
+ const unsigned int sad =
+ cpi->ppi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride);
+ const double sad_norm = (double)sad / num_samples;
+
+ fprintf(fout, " %g %g", sse_norm, sad_norm);
+
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int16_t *const src_diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2];
+
+ double sse_norm_arr[4], sad_norm_arr[4];
+ get_2x2_normalized_sses_and_sads(cpi, tx_bsize, src, src_stride, dst,
+ dst_stride, src_diff, diff_stride,
+ sse_norm_arr, sad_norm_arr);
+ for (int i = 0; i < 4; ++i) {
+ fprintf(fout, " %g", sse_norm_arr[i]);
+ }
+ for (int i = 0; i < 4; ++i) {
+ fprintf(fout, " %g", sad_norm_arr[i]);
+ }
+
+ const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
+ const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
+
+ fprintf(fout, " %d %d %d %d %d", q_step, tx_size_wide[tx_size],
+ tx_size_high[tx_size], tx_type_1d_row, tx_type_1d_col);
+
+ int model_rate;
+ int64_t model_dist;
+ model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
+ const double model_rate_norm = (double)model_rate / num_samples;
+ const double model_dist_norm = (double)model_dist / num_samples;
+ fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);
+
+ const double mean = get_mean(src_diff, diff_stride, txw, txh);
+ float hor_corr, vert_corr;
+ av1_get_horver_correlation_full(src_diff, diff_stride, txw, txh, &hor_corr,
+ &vert_corr);
+ fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
+
+ double hdist[4] = { 0 }, vdist[4] = { 0 };
+ get_energy_distribution_fine(cpi, tx_bsize, src, src_stride, dst, dst_stride,
+ 1, hdist, vdist);
+ fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
+ hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
+
+ fprintf(fout, " %d %" PRId64, x->rdmult, rd);
+
+ fprintf(fout, "\n");
+ fclose(fout);
+}
+#endif // CONFIG_COLLECT_RD_STATS == 1
+
+#if CONFIG_COLLECT_RD_STATS >= 2
+static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ int64_t total_sse = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
+ unsigned int sse;
+
+ if (plane) continue;
+
+ cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, &sse);
+ total_sse += sse;
+ }
+ total_sse <<= 4;
+ return total_sse;
+}
+
+static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ int64_t sse, int *est_residue_cost,
+ int64_t *est_dist) {
+ const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ if (md->ready) {
+ if (sse < md->dist_mean) {
+ *est_residue_cost = 0;
+ *est_dist = sse;
+ } else {
+ *est_dist = (int64_t)round(md->dist_mean);
+ const double est_ld = md->a * sse + md->b;
+ // Clamp estimated rate cost by INT_MAX / 2.
+ // TODO(angiebird@google.com): find better solution than clamping.
+ if (fabs(est_ld) < 1e-2) {
+ *est_residue_cost = INT_MAX / 2;
+ } else {
+ double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld);
+ if (est_residue_cost_dbl < 0) {
+ *est_residue_cost = 0;
+ } else {
+ *est_residue_cost =
+ (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2);
+ }
+ }
+ if (*est_residue_cost <= 0) {
+ *est_residue_cost = 0;
+ *est_dist = sse;
+ }
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static double get_highbd_diff_mean(const uint8_t *src8, int src_stride,
+ const uint8_t *dst8, int dst_stride, int w,
+ int h) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ double sum = 0.0;
+ for (int j = 0; j < h; ++j) {
+ for (int i = 0; i < w; ++i) {
+ const int diff = src[j * src_stride + i] - dst[j * dst_stride + i];
+ sum += diff;
+ }
+ }
+ assert(w > 0 && h > 0);
+ return sum / (w * h);
+}
+
+static double get_diff_mean(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride, int w, int h) {
+ double sum = 0.0;
+ for (int j = 0; j < h; ++j) {
+ for (int i = 0; i < w; ++i) {
+ const int diff = src[j * src_stride + i] - dst[j * dst_stride + i];
+ sum += diff;
+ }
+ }
+ assert(w > 0 && h > 0);
+ return sum / (w * h);
+}
+
+static AOM_INLINE void PrintPredictionUnitStats(const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data,
+ MACROBLOCK *x,
+ const RD_STATS *const rd_stats,
+ BLOCK_SIZE plane_bsize) {
+ if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
+
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 &&
+ (tile_data == NULL ||
+ !tile_data->inter_mode_rd_models[plane_bsize].ready))
+ return;
+ (void)tile_data;
+ // Generate small sample to restrict output size.
+ static unsigned int seed = 95014;
+
+ if ((lcg_rand16(&seed) % (1 << (14 - num_pels_log2_lookup[plane_bsize]))) !=
+ 1)
+ return;
+
+ const char output_file[] = "pu_stats.txt";
+ FILE *fout = fopen(output_file, "a");
+ if (!fout) return;
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int plane = 0;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const int diff_stride = block_size_wide[plane_bsize];
+ int bw, bh;
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
+ &bh);
+ const int num_samples = bw * bh;
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+ const int q_step = p->dequant_QTX[1] >> dequant_shift;
+ const int shift = (xd->bd - 8);
+
+ const double rate_norm = (double)rd_stats->rate / num_samples;
+ const double dist_norm = (double)rd_stats->dist / num_samples;
+ const double rdcost_norm =
+ (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples;
+
+ fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm);
+
+ const int src_stride = p->src.stride;
+ const uint8_t *const src = p->src.buf;
+ const int dst_stride = pd->dst.stride;
+ const uint8_t *const dst = pd->dst.buf;
+ const int16_t *const src_diff = p->src_diff;
+
+ int64_t sse = calculate_sse(xd, p, pd, bw, bh);
+ const double sse_norm = (double)sse / num_samples;
+
+ const unsigned int sad =
+ cpi->ppi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride);
+ const double sad_norm =
+ (double)sad / (1 << num_pels_log2_lookup[plane_bsize]);
+
+ fprintf(fout, " %g %g", sse_norm, sad_norm);
+
+ double sse_norm_arr[4], sad_norm_arr[4];
+ get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
+ dst_stride, src_diff, diff_stride,
+ sse_norm_arr, sad_norm_arr);
+ if (shift) {
+ for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
+ for (int k = 0; k < 4; ++k) sad_norm_arr[k] /= (1 << shift);
+ }
+ for (int i = 0; i < 4; ++i) {
+ fprintf(fout, " %g", sse_norm_arr[i]);
+ }
+ for (int i = 0; i < 4; ++i) {
+ fprintf(fout, " %g", sad_norm_arr[i]);
+ }
+
+ fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh);
+
+ int model_rate;
+ int64_t model_dist;
+ model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
+ const double model_rdcost_norm =
+ (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples;
+ const double model_rate_norm = (double)model_rate / num_samples;
+ const double model_dist_norm = (double)model_dist / num_samples;
+ fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm,
+ model_rdcost_norm);
+
+ double mean;
+ if (is_cur_buf_hbd(xd)) {
+ mean = get_highbd_diff_mean(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ mean = get_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ bw, bh);
+ }
+ mean /= (1 << shift);
+ float hor_corr, vert_corr;
+ av1_get_horver_correlation_full(src_diff, diff_stride, bw, bh, &hor_corr,
+ &vert_corr);
+ fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
+
+ double hdist[4] = { 0 }, vdist[4] = { 0 };
+ get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst,
+ dst_stride, 1, hdist, vdist);
+ fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
+ hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
+
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) {
+ assert(tile_data->inter_mode_rd_models[plane_bsize].ready);
+ const int64_t overall_sse = get_sse(cpi, x);
+ int est_residue_cost = 0;
+ int64_t est_dist = 0;
+ get_est_rate_dist(tile_data, plane_bsize, overall_sse, &est_residue_cost,
+ &est_dist);
+ const double est_residue_cost_norm = (double)est_residue_cost / num_samples;
+ const double est_dist_norm = (double)est_dist / num_samples;
+ const double est_rdcost_norm =
+ (double)RDCOST(x->rdmult, est_residue_cost, est_dist) / num_samples;
+ fprintf(fout, " %g %g %g", est_residue_cost_norm, est_dist_norm,
+ est_rdcost_norm);
+ }
+
+ fprintf(fout, "\n");
+ fclose(fout);
+}
+#endif // CONFIG_COLLECT_RD_STATS >= 2
+#endif // CONFIG_COLLECT_RD_STATS
+
+static AOM_INLINE void inverse_transform_block_facade(MACROBLOCK *const x,
+ int plane, int block,
+ int blk_row, int blk_col,
+ int eob,
+ int reduced_tx_set) {
+ if (!eob) return;
+ struct macroblock_plane *const p = &x->plane[plane];
+ MACROBLOCKD *const xd = &x->e_mbd;
+ tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+ const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col,
+ tx_size, reduced_tx_set);
+
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+ dst_stride, eob, reduced_tx_set);
+}
+
+static INLINE void recon_intra(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ const TXB_CTX *const txb_ctx, int skip_trellis,
+ TX_TYPE best_tx_type, int do_quant,
+ int *rate_cost, uint16_t best_eob) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int is_inter = is_inter_block(mbmi);
+ if (!is_inter && best_eob &&
+ (blk_row + tx_size_high_unit[tx_size] < mi_size_high[plane_bsize] ||
+ blk_col + tx_size_wide_unit[tx_size] < mi_size_wide[plane_bsize])) {
+    // If the quantized coefficients are already stored in the dqcoeff buffer,
+    // we do not need to do the transform and quantization again.
+ if (do_quant) {
+ TxfmParam txfm_param_intra;
+ QUANT_PARAM quant_param_intra;
+ av1_setup_xform(cm, x, tx_size, best_tx_type, &txfm_param_intra);
+ av1_setup_quant(tx_size, !skip_trellis,
+ skip_trellis
+ ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B
+ : AV1_XFORM_QUANT_FP)
+ : AV1_XFORM_QUANT_FP,
+ cpi->oxcf.q_cfg.quant_b_adapt, &quant_param_intra);
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, best_tx_type,
+ &quant_param_intra);
+ av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
+ &txfm_param_intra, &quant_param_intra);
+ if (quant_param_intra.use_optimize_b) {
+ av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx,
+ rate_cost);
+ }
+ }
+
+ inverse_transform_block_facade(x, plane, block, blk_row, blk_col,
+ x->plane[plane].eobs[block],
+ cm->features.reduced_tx_set_used);
+
+    // This may happen because of a hash collision. The eob stored in the hash
+    // table is non-zero, but the real eob is zero. We need to make sure
+    // tx_type is DCT_DCT in this case.
+ if (plane == 0 && x->plane[plane].eobs[block] == 0 &&
+ best_tx_type != DCT_DCT) {
+ update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+ }
+ }
+}
+
+static unsigned pixel_dist_visible_only(
+ const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src,
+ const int src_stride, const uint8_t *dst, const int dst_stride,
+ const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows,
+ int visible_cols) {
+ unsigned sse;
+
+ if (txb_rows == visible_rows && txb_cols == visible_cols) {
+ cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+ return sse;
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ const MACROBLOCKD *xd = &x->e_mbd;
+ if (is_cur_buf_hbd(xd)) {
+ uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
+ visible_cols, visible_rows);
+ return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2);
+ }
+#else
+ (void)x;
+#endif
+ sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols,
+ visible_rows);
+ return sse;
+}
+
+// Compute the pixel domain distortion from src and dst on all visible 4x4s in
+// the transform block.
+static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
+ int plane, const uint8_t *src, const int src_stride,
+ const uint8_t *dst, const int dst_stride,
+ int blk_row, int blk_col,
+ const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize) {
+ int txb_rows, txb_cols, visible_rows, visible_cols;
+ const MACROBLOCKD *xd = &x->e_mbd;
+
+ get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize,
+ &txb_cols, &txb_rows, &visible_cols, &visible_rows);
+ assert(visible_rows > 0);
+ assert(visible_cols > 0);
+
+ unsigned sse = pixel_dist_visible_only(cpi, x, src, src_stride, dst,
+ dst_stride, tx_bsize, txb_rows,
+ txb_cols, visible_rows, visible_cols);
+
+ return sse;
+}
+
+static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
+ int plane, BLOCK_SIZE plane_bsize,
+ int block, int blk_row, int blk_col,
+ TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const uint16_t eob = p->eobs[block];
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ const int bsw = block_size_wide[tx_bsize];
+ const int bsh = block_size_high[tx_bsize];
+ const int src_stride = x->plane[plane].src.stride;
+ const int dst_stride = xd->plane[plane].dst.stride;
+ // Scale the transform block index to pixel unit.
+ const int src_idx = (blk_row * src_stride + blk_col) << MI_SIZE_LOG2;
+ const int dst_idx = (blk_row * dst_stride + blk_col) << MI_SIZE_LOG2;
+ const uint8_t *src = &x->plane[plane].src.buf[src_idx];
+ const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
+ const tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+
+ assert(cpi != NULL);
+ assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
+
+ uint8_t *recon;
+ DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_cur_buf_hbd(xd)) {
+ recon = CONVERT_TO_BYTEPTR(recon16);
+ aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride,
+ CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw, bsh);
+ } else {
+ recon = (uint8_t *)recon16;
+ aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh);
+ }
+#else
+ recon = (uint8_t *)recon16;
+ aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh);
+#endif
+
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+ cpi->common.features.reduced_tx_set_used);
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon,
+ MAX_TX_SIZE, eob,
+ cpi->common.features.reduced_tx_set_used);
+
+ return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE,
+ blk_row, blk_col, plane_bsize, tx_bsize);
+}
+
+// Pruning thresholds for prune_txk_type() and prune_txk_type_separ().
+static const int prune_factors[5] = { 200, 200, 120, 80, 40 }; // scale 1000
+static const int mul_factors[5] = { 80, 80, 70, 50, 30 }; // scale 100
+
+// Insertion-sorts R-D costs in ascending order, keeping the corresponding
+// entries of txk[] in sync.
+static INLINE void sort_rd(int64_t rds[], int txk[], int len) {
+ int i, j, k;
+
+ for (i = 1; i <= len - 1; ++i) {
+ for (j = 0; j < i; ++j) {
+ if (rds[j] > rds[i]) {
+ int64_t temprd;
+ int tempi;
+
+ temprd = rds[i];
+ tempi = txk[i];
+
+ for (k = i; k > j; k--) {
+ rds[k] = rds[k - 1];
+ txk[k] = txk[k - 1];
+ }
+
+ rds[j] = temprd;
+ txk[j] = tempi;
+ break;
+ }
+ }
+ }
+}
+
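+// Quantization-matrix weighted block error: accumulates the weighted squared
+// error ((coeff - dqcoeff) * w)^2 and the weighted squared source energy
+// (coeff * w)^2, where w is the qmatrix weight at each coefficient's scan
+// position; both sums are scaled down by 2 * AOM_QM_BITS. The weighted
+// coefficient energy is returned via *ssz.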
+static INLINE int64_t av1_block_error_qm(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size,
+ const qm_val_t *qmatrix,
+ const int16_t *scan, int64_t *ssz) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
+
+ for (i = 0; i < block_size; i++) {
+ int64_t weight = qmatrix[scan[i]];
+ int64_t dd = coeff[i] - dqcoeff[i];
+ dd *= weight;
+ int64_t cc = coeff[i];
+ cc *= weight;
+ // The ranges of coeff and dqcoeff are
+ // bd8 : 18 bits (including sign)
+ // bd10: 20 bits (including sign)
+ // bd12: 22 bits (including sign)
+ // As AOM_QM_BITS is 5, the intermediate quantities in the calculation
+ // below should fit in 54 bits, thus no overflow should happen.
+ error += (dd * dd + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS);
+ sqcoeff += (cc * cc + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS);
+ }
+
+ *ssz = sqcoeff;
+ return error;
+}
+
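+// Computes transform-domain distortion and SSE for a transform block,
+// optionally using the quantization-matrix weighted error metric.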
+static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block,
+ TX_SIZE tx_size,
+ const qm_val_t *qmatrix,
+ const int16_t *scan, int64_t *out_dist,
+ int64_t *out_sse) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ // Transform domain distortion computation is more efficient as it does
+ // not involve an inverse transform, but it is less accurate.
+ const int buffer_length = av1_get_max_eob(tx_size);
+ int64_t this_sse;
+ // TX-domain results need to be shifted down to Q2/D10 to match pixel-domain
+ // distortion values, which are in Q2^2.
+ int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *const coeff = p->coeff + block_offset;
+ tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
+#if CONFIG_AV1_HIGHBITDEPTH
+ MACROBLOCKD *const xd = &x->e_mbd;
+ if (is_cur_buf_hbd(xd)) {
+ // TODO(veluca): handle use_qm_dist_metric for HBD too.
+ *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse,
+ xd->bd);
+ } else {
+#endif
+ if (qmatrix == NULL || !x->txfm_search_params.use_qm_dist_metric) {
+ *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
+ } else {
+ *out_dist = av1_block_error_qm(coeff, dqcoeff, buffer_length, qmatrix,
+ scan, &this_sse);
+ }
+#if CONFIG_AV1_HIGHBITDEPTH
+ }
+#endif
+
+ *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
+ *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
+}
+
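+// Separable tx type pruning: the 4 horizontal transforms are evaluated first
+// with the vertical transform fixed to DCT, then the 4 vertical transforms
+// are evaluated with the best horizontal transform found. The per-direction
+// RD costs are combined to rank all 16 2D candidates, and a bit mask of
+// pruned tx types is returned (bit set = pruned).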
+uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize, int *txk_map,
+ int16_t allowed_tx_mask, int prune_factor,
+ const TXB_CTX *const txb_ctx,
+ int reduced_tx_set_used, int64_t ref_best_rd,
+ int num_sel) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ int idx;
+
+ int64_t rds_v[4];
+ int64_t rds_h[4];
+ int idx_v[4] = { 0, 1, 2, 3 };
+ int idx_h[4] = { 0, 1, 2, 3 };
+ int skip_v[4] = { 0 };
+ int skip_h[4] = { 0 };
+ const int idx_map[16] = {
+ DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT,
+ ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST,
+ FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST,
+ H_DCT, H_ADST, H_FLIPADST, IDTX
+ };
+
+ const int sel_pattern_v[16] = {
+ 0, 0, 1, 1, 0, 2, 1, 2, 2, 0, 3, 1, 3, 2, 3, 3
+ };
+ const int sel_pattern_h[16] = {
+ 0, 1, 0, 1, 2, 0, 2, 1, 2, 3, 0, 3, 1, 3, 2, 3
+ };
+
+ QUANT_PARAM quant_param;
+ TxfmParam txfm_param;
+ av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+ av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt,
+ &quant_param);
+ int tx_type;
+ // Use EXT_TX_SET_ALL16 so that tx types outside the ext_tx_set of the
+ // current block can also be tried. This function should only be called
+ // when the square-up transform size is at most 16x16 (see the assert
+ // below).
+ assert(txsize_sqr_up_map[tx_size] <= TX_16X16);
+ txfm_param.tx_set_type = EXT_TX_SET_ALL16;
+
+ int rate_cost = 0;
+ int64_t dist = 0, sse = 0;
+ // Evaluate the horizontal transform candidates with the vertical transform
+ // fixed to DCT.
+ for (idx = 0; idx < 4; ++idx) {
+ tx_type = idx_map[idx];
+ txfm_param.tx_type = tx_type;
+
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+ &quant_param);
+
+ av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+ &quant_param);
+
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param.tx_size, txfm_param.tx_type);
+ dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+ scan_order->scan, &dist, &sse);
+
+ rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
+ txb_ctx, reduced_tx_set_used, 0);
+
+ rds_h[idx] = RDCOST(x->rdmult, rate_cost, dist);
+
+ if ((rds_h[idx] - (rds_h[idx] >> 2)) > ref_best_rd) {
+ skip_h[idx] = 1;
+ }
+ }
+ sort_rd(rds_h, idx_h, 4);
+ for (idx = 1; idx < 4; idx++) {
+ if (rds_h[idx] > rds_h[0] * 1.2) skip_h[idx_h[idx]] = 1;
+ }
+
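+ // If even the best horizontal candidate exceeds the RD budget, prune all
+ // tx types; the caller falls back to DCT_DCT when the allowed mask becomes
+ // empty.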
+ if (skip_h[idx_h[0]]) return (uint16_t)0xFFFF;
+
+ // Evaluate the vertical transform candidates with the best horizontal
+ // transform found above.
+ rds_v[0] = rds_h[0];
+ int start_v = 1, end_v = 4;
+ const int *idx_map_v = idx_map + idx_h[0];
+
+ for (idx = start_v; idx < end_v; ++idx) {
+ tx_type = idx_map_v[idx_v[idx] * 4];
+ txfm_param.tx_type = tx_type;
+
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+ &quant_param);
+
+ av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+ &quant_param);
+
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param.tx_size, txfm_param.tx_type);
+ dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+ scan_order->scan, &dist, &sse);
+
+ rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
+ txb_ctx, reduced_tx_set_used, 0);
+
+ rds_v[idx] = RDCOST(x->rdmult, rate_cost, dist);
+
+ if ((rds_v[idx] - (rds_v[idx] >> 2)) > ref_best_rd) {
+ skip_v[idx] = 1;
+ }
+ }
+ sort_rd(rds_v, idx_v, 4);
+ for (idx = 1; idx < 4; idx++) {
+ if (rds_v[idx] > rds_v[0] * 1.2) skip_v[idx_v[idx]] = 1;
+ }
+
+ // Combine rds_h and rds_v to rank and prune the 2D tx candidates.
+ int i_v, i_h;
+ int64_t rds[16];
+ int num_cand = 0, last = TX_TYPES - 1;
+
+ for (int i = 0; i < 16; i++) {
+ i_v = sel_pattern_v[i];
+ i_h = sel_pattern_h[i];
+ tx_type = idx_map[idx_v[i_v] * 4 + idx_h[i_h]];
+ if (!(allowed_tx_mask & (1 << tx_type)) || skip_h[idx_h[i_h]] ||
+ skip_v[idx_v[i_v]]) {
+ txk_map[last] = tx_type;
+ last--;
+ } else {
+ txk_map[num_cand] = tx_type;
+ rds[num_cand] = rds_v[i_v] + rds_h[i_h];
+ if (rds[num_cand] == 0) rds[num_cand] = 1;
+ num_cand++;
+ }
+ }
+ sort_rd(rds, txk_map, num_cand);
+
+ uint16_t prune = (uint16_t)(~(1 << txk_map[0]));
+ num_sel = AOMMIN(num_sel, num_cand);
+
+ for (int i = 1; i < num_sel; i++) {
+ int64_t factor = 1800 * (rds[i] - rds[0]) / (rds[0]);
+ if (factor < (int64_t)prune_factor)
+ prune &= ~(1 << txk_map[i]);
+ else
+ break;
+ }
+ return prune;
+}
+
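+// Exhaustive counterpart of prune_txk_type_separ() for small candidate sets:
+// every allowed tx type is evaluated with an estimated (Laplacian) rate cost
+// and transform-domain distortion, candidates are sorted by RD cost, and all
+// types whose relative RD gap to the best exceeds prune_factor (per mille)
+// are pruned.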
+uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, int *txk_map,
+ uint16_t allowed_tx_mask, int prune_factor,
+ const TXB_CTX *const txb_ctx, int reduced_tx_set_used) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int tx_type;
+
+ int64_t rds[TX_TYPES];
+
+ int num_cand = 0;
+ int last = TX_TYPES - 1;
+
+ TxfmParam txfm_param;
+ QUANT_PARAM quant_param;
+ av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+ av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.q_cfg.quant_b_adapt,
+ &quant_param);
+
+ for (int idx = 0; idx < TX_TYPES; idx++) {
+ tx_type = idx;
+ int rate_cost = 0;
+ int64_t dist = 0, sse = 0;
+ if (!(allowed_tx_mask & (1 << tx_type))) {
+ txk_map[last] = tx_type;
+ last--;
+ continue;
+ }
+ txfm_param.tx_type = tx_type;
+
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+ &quant_param);
+
+ // do txfm and quantization
+ av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+ &quant_param);
+ // estimate rate cost
+ rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type,
+ txb_ctx, reduced_tx_set_used, 0);
+ // tx domain dist
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param.tx_size, txfm_param.tx_type);
+ dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+ scan_order->scan, &dist, &sse);
+
+ txk_map[num_cand] = tx_type;
+ rds[num_cand] = RDCOST(x->rdmult, rate_cost, dist);
+ if (rds[num_cand] == 0) rds[num_cand] = 1;
+ num_cand++;
+ }
+
+ if (num_cand == 0) return (uint16_t)0xFFFF;
+
+ sort_rd(rds, txk_map, num_cand);
+ uint16_t prune = (uint16_t)(~(1 << txk_map[0]));
+
+ // 0 < prune_factor <= 1000 controls aggressiveness
+ int64_t factor = 0;
+ for (int idx = 1; idx < num_cand; idx++) {
+ factor = 1000 * (rds[idx] - rds[0]) / rds[0];
+ if (factor < (int64_t)prune_factor)
+ prune &= ~(1 << txk_map[idx]);
+ else
+ break;
+ }
+ return prune;
+}
+
+// These thresholds were calibrated to provide a certain number of TX types
+// pruned by the model on average, i.e. selecting a threshold with index i
+// will lead to pruning i+1 TX types on average
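+// (For example, with TX_TYPE_PRUNE_1 on the full 16-type set the
+// aggressiveness resolves to 4, so for TX_8X8 any tx type whose softmax
+// score falls below 0.01697f is pruned, i.e. roughly 5 types on average;
+// the best-scoring type is always kept, see prune_tx_2D().)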
+static const float *prune_2D_adaptive_thresholds[] = {
+ // TX_4X4
+ (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f,
+ 0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f,
+ 0.09778f, 0.11780f },
+ // TX_8X8
+ (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f,
+ 0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f,
+ 0.10803f, 0.14124f },
+ // TX_16X16
+ (float[]){ 0.01404f, 0.02000f, 0.04211f, 0.05164f, 0.05798f, 0.06335f,
+ 0.06897f, 0.07629f, 0.08875f, 0.11169f },
+ // TX_32X32
+ NULL,
+ // TX_64X64
+ NULL,
+ // TX_4X8
+ (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f,
+ 0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f,
+ 0.10168f, 0.12585f },
+ // TX_8X4
+ (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f,
+ 0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f,
+ 0.10583f, 0.13123f },
+ // TX_8X16
+ (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f,
+ 0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f,
+ 0.10730f, 0.14221f },
+ // TX_16X8
+ (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f,
+ 0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f,
+ 0.10339f, 0.13464f },
+ // TX_16X32
+ NULL,
+ // TX_32X16
+ NULL,
+ // TX_32X64
+ NULL,
+ // TX_64X32
+ NULL,
+ // TX_4X16
+ (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f,
+ 0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f,
+ 0.10242f, 0.12878f },
+ // TX_16X4
+ (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f,
+ 0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f,
+ 0.10217f, 0.12610f },
+ // TX_8X32
+ NULL,
+ // TX_32X8
+ NULL,
+ // TX_16X64
+ NULL,
+ // TX_64X16
+ NULL,
+};
+
+static INLINE float get_adaptive_thresholds(
+ TX_SIZE tx_size, TxSetType tx_set_type,
+ TX_TYPE_PRUNE_MODE prune_2d_txfm_mode) {
+ const int prune_aggr_table[5][2] = {
+ { 4, 1 }, { 6, 3 }, { 9, 6 }, { 9, 6 }, { 12, 9 }
+ };
+ int pruning_aggressiveness = 0;
+ if (tx_set_type == EXT_TX_SET_ALL16)
+ pruning_aggressiveness =
+ prune_aggr_table[prune_2d_txfm_mode - TX_TYPE_PRUNE_1][0];
+ else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT)
+ pruning_aggressiveness =
+ prune_aggr_table[prune_2d_txfm_mode - TX_TYPE_PRUNE_1][1];
+
+ return prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness];
+}
+
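+// Computes normalized horizontal and vertical energy distributions of the
+// residual block; these feed the NN models in prune_tx_2D() as 1D features.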
+static AOM_INLINE void get_energy_distribution_finer(const int16_t *diff,
+ int stride, int bw, int bh,
+ float *hordist,
+ float *verdist) {
+ // First compute downscaled block energy values (esq); downscale factors
+ // are defined by w_shift and h_shift.
+ unsigned int esq[256];
+ const int w_shift = bw <= 8 ? 0 : 1;
+ const int h_shift = bh <= 8 ? 0 : 1;
+ const int esq_w = bw >> w_shift;
+ const int esq_h = bh >> h_shift;
+ const int esq_sz = esq_w * esq_h;
+ int i, j;
+ memset(esq, 0, esq_sz * sizeof(esq[0]));
+ if (w_shift) {
+ for (i = 0; i < bh; i++) {
+ unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
+ const int16_t *cur_diff_row = diff + i * stride;
+ for (j = 0; j < bw; j += 2) {
+ cur_esq_row[j >> 1] += (cur_diff_row[j] * cur_diff_row[j] +
+ cur_diff_row[j + 1] * cur_diff_row[j + 1]);
+ }
+ }
+ } else {
+ for (i = 0; i < bh; i++) {
+ unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
+ const int16_t *cur_diff_row = diff + i * stride;
+ for (j = 0; j < bw; j++) {
+ cur_esq_row[j] += cur_diff_row[j] * cur_diff_row[j];
+ }
+ }
+ }
+
+ uint64_t total = 0;
+ for (i = 0; i < esq_sz; i++) total += esq[i];
+
+ // Output hordist and verdist arrays are normalized 1D projections of esq
+ if (total == 0) {
+ float hor_val = 1.0f / esq_w;
+ for (j = 0; j < esq_w - 1; j++) hordist[j] = hor_val;
+ float ver_val = 1.0f / esq_h;
+ for (i = 0; i < esq_h - 1; i++) verdist[i] = ver_val;
+ return;
+ }
+
+ const float e_recip = 1.0f / (float)total;
+ memset(hordist, 0, (esq_w - 1) * sizeof(hordist[0]));
+ memset(verdist, 0, (esq_h - 1) * sizeof(verdist[0]));
+ const unsigned int *cur_esq_row;
+ for (i = 0; i < esq_h - 1; i++) {
+ cur_esq_row = esq + i * esq_w;
+ for (j = 0; j < esq_w - 1; j++) {
+ hordist[j] += (float)cur_esq_row[j];
+ verdist[i] += (float)cur_esq_row[j];
+ }
+ verdist[i] += (float)cur_esq_row[j];
+ }
+ cur_esq_row = esq + i * esq_w;
+ for (j = 0; j < esq_w - 1; j++) hordist[j] += (float)cur_esq_row[j];
+
+ for (j = 0; j < esq_w - 1; j++) hordist[j] *= e_recip;
+ for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip;
+}
+
+static AOM_INLINE bool check_bit_mask(uint16_t mask, int val) {
+ return mask & (1 << val);
+}
+
+static AOM_INLINE void set_bit_mask(uint16_t *mask, int val) {
+ *mask |= (1 << val);
+}
+
+static AOM_INLINE void unset_bit_mask(uint16_t *mask, int val) {
+ *mask &= ~(1 << val);
+}
+
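+// NN-based 2D tx type pruning. Energy-distribution and correlation features
+// of the residual are fed to separate horizontal and vertical NN models; the
+// 16 2D scores are formed as outer products of the 4 per-direction scores,
+// softmaxed, and tx types scoring below an adaptive threshold are removed
+// from allowed_tx_mask (the best-scoring type is always kept). txk_map is
+// reordered so that higher-scoring types come first.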
+static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ int blk_row, int blk_col, TxSetType tx_set_type,
+ TX_TYPE_PRUNE_MODE prune_2d_txfm_mode, int *txk_map,
+ uint16_t *allowed_tx_mask) {
+ // This table is used because the search order is different from the enum
+ // order.
+ static const int tx_type_table_2D[16] = {
+ DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT,
+ ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST,
+ FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST,
+ H_DCT, H_ADST, H_FLIPADST, IDTX
+ };
+ if (tx_set_type != EXT_TX_SET_ALL16 &&
+ tx_set_type != EXT_TX_SET_DTT9_IDTX_1DDCT)
+ return;
+#if CONFIG_NN_V2
+ NN_CONFIG_V2 *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size];
+ NN_CONFIG_V2 *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size];
+#else
+ const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size];
+ const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size];
+#endif
+ if (!nn_config_hor || !nn_config_ver) return; // Model not established yet.
+
+ float hfeatures[16], vfeatures[16];
+ float hscores[4], vscores[4];
+ float scores_2D_raw[16];
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+ const int hfeatures_num = bw <= 8 ? bw : bw / 2;
+ const int vfeatures_num = bh <= 8 ? bh : bh / 2;
+ assert(hfeatures_num <= 16);
+ assert(vfeatures_num <= 16);
+
+ const struct macroblock_plane *const p = &x->plane[0];
+ const int diff_stride = block_size_wide[bsize];
+ const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+ get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures,
+ vfeatures);
+
+ av1_get_horver_correlation_full(diff, diff_stride, bw, bh,
+ &hfeatures[hfeatures_num - 1],
+ &vfeatures[vfeatures_num - 1]);
+
+#if CONFIG_NN_V2
+ av1_nn_predict_v2(hfeatures, nn_config_hor, 0, hscores);
+ av1_nn_predict_v2(vfeatures, nn_config_ver, 0, vscores);
+#else
+ av1_nn_predict(hfeatures, nn_config_hor, 1, hscores);
+ av1_nn_predict(vfeatures, nn_config_ver, 1, vscores);
+#endif
+
+ for (int i = 0; i < 4; i++) {
+ float *cur_scores_2D = scores_2D_raw + i * 4;
+ cur_scores_2D[0] = vscores[i] * hscores[0];
+ cur_scores_2D[1] = vscores[i] * hscores[1];
+ cur_scores_2D[2] = vscores[i] * hscores[2];
+ cur_scores_2D[3] = vscores[i] * hscores[3];
+ }
+
+ assert(TX_TYPES == 16);
+ // This version of the function only works when there are at most 16 classes.
+ // So we will need to change the optimization or use av1_nn_softmax instead if
+ // this ever gets changed.
+ av1_nn_fast_softmax_16(scores_2D_raw, scores_2D_raw);
+
+ const float score_thresh =
+ get_adaptive_thresholds(tx_size, tx_set_type, prune_2d_txfm_mode);
+
+ // Always keep the TX type with the highest score, prune all others with
+ // score below score_thresh.
+ int max_score_i = 0;
+ float max_score = 0.0f;
+ uint16_t allow_bitmask = 0;
+ float sum_score = 0.0;
+ // Calculate the sum of the allowed tx type scores, and populate the allow
+ // bit mask based on score_thresh and allowed_tx_mask.
+ int allow_count = 0;
+ int tx_type_allowed[16] = { TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+ TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+ TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+ TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+ TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+ TX_TYPE_INVALID };
+ float scores_2D[16] = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ };
+ for (int tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) {
+ const int allow_tx_type =
+ check_bit_mask(*allowed_tx_mask, tx_type_table_2D[tx_idx]);
+ if (!allow_tx_type) {
+ continue;
+ }
+ if (scores_2D_raw[tx_idx] > max_score) {
+ max_score = scores_2D_raw[tx_idx];
+ max_score_i = tx_idx;
+ }
+ if (scores_2D_raw[tx_idx] >= score_thresh) {
+ // Set allow mask based on score_thresh
+ set_bit_mask(&allow_bitmask, tx_type_table_2D[tx_idx]);
+
+ // Accumulate score of allowed tx type
+ sum_score += scores_2D_raw[tx_idx];
+
+ scores_2D[allow_count] = scores_2D_raw[tx_idx];
+ tx_type_allowed[allow_count] = tx_type_table_2D[tx_idx];
+ allow_count += 1;
+ }
+ }
+ if (!check_bit_mask(allow_bitmask, tx_type_table_2D[max_score_i])) {
+ // If even the tx_type with max score is pruned, this means that no other
+ // tx_type is feasible. When this happens, we force enable max_score_i and
+ // end the search.
+ set_bit_mask(&allow_bitmask, tx_type_table_2D[max_score_i]);
+ memcpy(txk_map, tx_type_table_2D, sizeof(tx_type_table_2D));
+ *allowed_tx_mask = allow_bitmask;
+ return;
+ }
+
+ // Sort tx type probability of all types
+ if (allow_count <= 8) {
+ av1_sort_fi32_8(scores_2D, tx_type_allowed);
+ } else {
+ av1_sort_fi32_16(scores_2D, tx_type_allowed);
+ }
+
+ // Enable more pruning based on tx type probability and number of allowed tx
+ // types
+ if (prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) {
+ float temp_score = 0.0;
+ float score_ratio = 0.0;
+ int tx_idx, tx_count = 0;
+ const float inv_sum_score = 100 / sum_score;
+ // Get allowed tx types based on sorted probability score and tx count
+ for (tx_idx = 0; tx_idx < allow_count; tx_idx++) {
+ // Stop once the cumulative probability exceeds 30% and at least 2 tx
+ // types are already allowed.
+ if (score_ratio > 30.0 && tx_count >= 2) break;
+
+ assert(check_bit_mask(allow_bitmask, tx_type_allowed[tx_idx]));
+ // Calculate cumulative probability
+ temp_score += scores_2D[tx_idx];
+
+ // Calculate percentage of cumulative probability of allowed tx type
+ score_ratio = temp_score * inv_sum_score;
+ tx_count++;
+ }
+ // Set remaining tx types as pruned
+ for (; tx_idx < allow_count; tx_idx++)
+ unset_bit_mask(&allow_bitmask, tx_type_allowed[tx_idx]);
+ }
+
+ memcpy(txk_map, tx_type_allowed, sizeof(tx_type_table_2D));
+ *allowed_tx_mask = allow_bitmask;
+}
+
+static float get_dev(float mean, double x2_sum, int num) {
+ const float e_x2 = (float)(x2_sum / num);
+ const float diff = e_x2 - mean * mean;
+ const float dev = (diff > 0) ? sqrtf(diff) : 0;
+ return dev;
+}
+
+// Writes the features required by the ML model to predict tx split, based on
+// the mean and standard deviation values of the block and its sub-blocks.
+// Returns the number of elements written to the output array, which is at
+// most 12 currently. Hence the 'features' buffer should be able to
+// accommodate at least 12 elements.
+static AOM_INLINE int get_mean_dev_features(const int16_t *data, int stride,
+ int bw, int bh, float *features) {
+ const int16_t *const data_ptr = &data[0];
+ const int subh = (bh >= bw) ? (bh >> 1) : bh;
+ const int subw = (bw >= bh) ? (bw >> 1) : bw;
+ const int num = bw * bh;
+ const int sub_num = subw * subh;
+ int feature_idx = 2;
+ int total_x_sum = 0;
+ int64_t total_x2_sum = 0;
+ int num_sub_blks = 0;
+ double mean2_sum = 0.0;
+ float dev_sum = 0.0f;
+
+ for (int row = 0; row < bh; row += subh) {
+ for (int col = 0; col < bw; col += subw) {
+ int x_sum;
+ int64_t x2_sum;
+ // TODO(any): Write a SIMD version. Clear registers.
+ aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh,
+ &x_sum, &x2_sum);
+ total_x_sum += x_sum;
+ total_x2_sum += x2_sum;
+
+ const float mean = (float)x_sum / sub_num;
+ const float dev = get_dev(mean, (double)x2_sum, sub_num);
+ features[feature_idx++] = mean;
+ features[feature_idx++] = dev;
+ mean2_sum += (double)(mean * mean);
+ dev_sum += dev;
+ num_sub_blks++;
+ }
+ }
+
+ const float lvl0_mean = (float)total_x_sum / num;
+ features[0] = lvl0_mean;
+ features[1] = get_dev(lvl0_mean, (double)total_x2_sum, num);
+
+ // Deviation of means.
+ features[feature_idx++] = get_dev(lvl0_mean, mean2_sum, num_sub_blks);
+ // Mean of deviations.
+ features[feature_idx++] = dev_sum / num_sub_blks;
+
+ return feature_idx;
+}
+
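+// Runs the tx-split NN model on the mean/deviation features of the residual
+// block. Returns the model score scaled by 10000 and clamped to +/-80000, or
+// -1 when no model exists for this tx size. As used in select_tx_block(),
+// scores below -threshold disable the split search.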
+static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row,
+ int blk_col, TX_SIZE tx_size) {
+ const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size];
+ if (!nn_config) return -1;
+
+ const int diff_stride = block_size_wide[bsize];
+ const int16_t *diff =
+ x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+
+ float features[64] = { 0.0f };
+ get_mean_dev_features(diff, diff_stride, bw, bh, features);
+
+ float score = 0.0f;
+ av1_nn_predict(features, nn_config, 1, &score);
+
+ int int_score = (int)(score * 10000);
+ return clamp(int_score, -80000, 80000);
+}
+
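+// Builds the bit mask of tx types to evaluate for this block (one bit per
+// TX_TYPE; 1 = allow, 0 = skip). Combines lossless, size and encoder-config
+// restrictions with frame-level tx type statistics and the estimated-RD and
+// NN based pruners above. On return, *allowed_txk_types is either TX_TYPES
+// (more than one type allowed) or the single allowed tx type.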
+static INLINE uint16_t
+get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ const TXB_CTX *const txb_ctx, FAST_TX_SEARCH_MODE ftxs_mode,
+ int64_t ref_best_rd, TX_TYPE *allowed_txk_types, int *txk_map) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const int is_inter = is_inter_block(mbmi);
+ const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY;
+ // If txk_allowed == TX_TYPES, more than one tx type is allowed; otherwise
+ // (txk_allowed < TX_TYPES) only that specific tx type is allowed.
+ TX_TYPE txk_allowed = TX_TYPES;
+
+ const FRAME_UPDATE_TYPE update_type =
+ get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+ int use_actual_frame_probs = 1;
+ const int *tx_type_probs;
+#if CONFIG_FPMT_TEST
+ use_actual_frame_probs =
+ (cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE) ? 0 : 1;
+ if (!use_actual_frame_probs) {
+ tx_type_probs =
+ (int *)cpi->ppi->temp_frame_probs.tx_type_probs[update_type][tx_size];
+ }
+#endif
+ if (use_actual_frame_probs) {
+ tx_type_probs = cpi->ppi->frame_probs.tx_type_probs[update_type][tx_size];
+ }
+
+ if ((!is_inter && txfm_params->use_default_intra_tx_type) ||
+ (is_inter && txfm_params->default_inter_tx_type_prob_thresh == 0)) {
+ txk_allowed =
+ get_default_tx_type(0, xd, tx_size, cpi->use_screen_content_tools);
+ } else if (is_inter &&
+ txfm_params->default_inter_tx_type_prob_thresh != INT_MAX) {
+ if (tx_type_probs[DEFAULT_INTER_TX_TYPE] >
+ txfm_params->default_inter_tx_type_prob_thresh) {
+ txk_allowed = DEFAULT_INTER_TX_TYPE;
+ } else {
+ int force_tx_type = 0;
+ int max_prob = 0;
+ const int tx_type_prob_threshold =
+ txfm_params->default_inter_tx_type_prob_thresh +
+ PROB_THRESH_OFFSET_TX_TYPE;
+ for (int i = 1; i < TX_TYPES; i++) { // find maximum probability.
+ if (tx_type_probs[i] > max_prob) {
+ max_prob = tx_type_probs[i];
+ force_tx_type = i;
+ }
+ }
+ if (max_prob > tx_type_prob_threshold) // force tx type with max prob.
+ txk_allowed = force_tx_type;
+ else if (x->rd_model == LOW_TXFM_RD) {
+ if (plane == 0) txk_allowed = DCT_DCT;
+ }
+ }
+ } else if (x->rd_model == LOW_TXFM_RD) {
+ if (plane == 0) txk_allowed = DCT_DCT;
+ }
+
+ const TxSetType tx_set_type = av1_get_ext_tx_set_type(
+ tx_size, is_inter, cm->features.reduced_tx_set_used);
+
+ TX_TYPE uv_tx_type = DCT_DCT;
+ if (plane) {
+ // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y
+ uv_tx_type = txk_allowed =
+ av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size,
+ cm->features.reduced_tx_set_used);
+ }
+ PREDICTION_MODE intra_dir =
+ mbmi->filter_intra_mode_info.use_filter_intra
+ ? fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]
+ : mbmi->mode;
+ uint16_t ext_tx_used_flag =
+ cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset != 0 &&
+ tx_set_type == EXT_TX_SET_DTT4_IDTX_1DDCT
+ ? av1_reduced_intra_tx_used_flag[intra_dir]
+ : av1_ext_tx_used_flag[tx_set_type];
+
+ if (cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset == 2)
+ ext_tx_used_flag &= av1_derived_intra_tx_used_flag[intra_dir];
+
+ if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 ||
+ ext_tx_used_flag == 0x0001 ||
+ (is_inter && cpi->oxcf.txfm_cfg.use_inter_dct_only) ||
+ (!is_inter && cpi->oxcf.txfm_cfg.use_intra_dct_only)) {
+ txk_allowed = DCT_DCT;
+ }
+
+ if (cpi->oxcf.txfm_cfg.enable_flip_idtx == 0)
+ ext_tx_used_flag &= DCT_ADST_TX_MASK;
+
+ uint16_t allowed_tx_mask = 0; // 1: allow; 0: skip.
+ if (txk_allowed < TX_TYPES) {
+ allowed_tx_mask = 1 << txk_allowed;
+ allowed_tx_mask &= ext_tx_used_flag;
+ } else if (fast_tx_search) {
+ allowed_tx_mask = 0x0c01; // V_DCT, H_DCT, DCT_DCT
+ allowed_tx_mask &= ext_tx_used_flag;
+ } else {
+ assert(plane == 0);
+ allowed_tx_mask = ext_tx_used_flag;
+ int num_allowed = 0;
+ int i;
+
+ if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
+ static const int thresh_arr[2][7] = { { 10, 15, 15, 10, 15, 15, 15 },
+ { 10, 17, 17, 10, 17, 17, 17 } };
+ const int thresh =
+ thresh_arr[cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats - 1]
+ [update_type];
+ uint16_t prune = 0;
+ int max_prob = -1;
+ int max_idx = 0;
+ for (i = 0; i < TX_TYPES; i++) {
+ if (tx_type_probs[i] > max_prob && (allowed_tx_mask & (1 << i))) {
+ max_prob = tx_type_probs[i];
+ max_idx = i;
+ }
+ if (tx_type_probs[i] < thresh) prune |= (1 << i);
+ }
+ if ((prune >> max_idx) & 0x01) prune &= ~(1 << max_idx);
+ allowed_tx_mask &= (~prune);
+ }
+ for (i = 0; i < TX_TYPES; i++) {
+ if (allowed_tx_mask & (1 << i)) num_allowed++;
+ }
+ assert(num_allowed > 0);
+
+ if (num_allowed > 2 && cpi->sf.tx_sf.tx_type_search.prune_tx_type_est_rd) {
+ int pf = prune_factors[txfm_params->prune_2d_txfm_mode];
+ int mf = mul_factors[txfm_params->prune_2d_txfm_mode];
+ if (num_allowed <= 7) {
+ const uint16_t prune =
+ prune_txk_type(cpi, x, plane, block, tx_size, blk_row, blk_col,
+ plane_bsize, txk_map, allowed_tx_mask, pf, txb_ctx,
+ cm->features.reduced_tx_set_used);
+ allowed_tx_mask &= (~prune);
+ } else {
+ const int num_sel = (num_allowed * mf + 50) / 100;
+ const uint16_t prune = prune_txk_type_separ(
+ cpi, x, plane, block, tx_size, blk_row, blk_col, plane_bsize,
+ txk_map, allowed_tx_mask, pf, txb_ctx,
+ cm->features.reduced_tx_set_used, ref_best_rd, num_sel);
+
+ allowed_tx_mask &= (~prune);
+ }
+ } else {
+ assert(num_allowed > 0);
+ int allowed_tx_count =
+ (txfm_params->prune_2d_txfm_mode >= TX_TYPE_PRUNE_4) ? 1 : 5;
+ // Reached only when !fast_tx_search, plane == 0, and more than one tx
+ // type is allowed.
+ if (txfm_params->prune_2d_txfm_mode >= TX_TYPE_PRUNE_1 && is_inter &&
+ num_allowed > allowed_tx_count) {
+ prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type,
+ txfm_params->prune_2d_txfm_mode, txk_map, &allowed_tx_mask);
+ }
+ }
+ }
+
+ // Need to have at least one transform type allowed.
+ if (allowed_tx_mask == 0) {
+ txk_allowed = (plane ? uv_tx_type : DCT_DCT);
+ allowed_tx_mask = (1 << txk_allowed);
+ }
+
+ assert(IMPLIES(txk_allowed < TX_TYPES, allowed_tx_mask == 1 << txk_allowed));
+ *allowed_txk_types = txk_allowed;
+ return allowed_tx_mask;
+}
+
+#if CONFIG_RD_DEBUG
+static INLINE void update_txb_coeff_cost(RD_STATS *rd_stats, int plane,
+ int txb_coeff_cost) {
+ rd_stats->txb_coeff_cost[plane] += txb_coeff_cost;
+}
+#endif
+
+static INLINE int cost_coeffs(MACROBLOCK *x, int plane, int block,
+ TX_SIZE tx_size, const TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx,
+ int reduced_tx_set_used) {
+#if TXCOEFF_COST_TIMER
+ struct aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+#endif
+ const int cost = av1_cost_coeffs_txb(x, plane, block, tx_size, tx_type,
+ txb_ctx, reduced_tx_set_used);
+#if TXCOEFF_COST_TIMER
+ AV1_COMMON *tmp_cm = (AV1_COMMON *)&cpi->common;
+ aom_usec_timer_mark(&timer);
+ const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+ tmp_cm->txcoeff_cost_timer += elapsed_time;
+ ++tmp_cm->txcoeff_cost_count;
+#endif
+ return cost;
+}
+
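+// Decides whether trellis coefficient optimization should be skipped for
+// this block by comparing its SATD against a qstep-scaled threshold, and
+// configures quant_param for the chosen quantization path.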
+static int skip_trellis_opt_based_on_satd(MACROBLOCK *x,
+ QUANT_PARAM *quant_param, int plane,
+ int block, TX_SIZE tx_size,
+ int quant_b_adapt, int qstep,
+ unsigned int coeff_opt_satd_threshold,
+ int skip_trellis, int dc_only_blk) {
+ if (skip_trellis || (coeff_opt_satd_threshold == UINT_MAX))
+ return skip_trellis;
+
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *const coeff_ptr = p->coeff + block_offset;
+ const int n_coeffs = av1_get_max_eob(tx_size);
+ const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size));
+ int satd = (dc_only_blk) ? abs(coeff_ptr[0]) : aom_satd(coeff_ptr, n_coeffs);
+ satd = RIGHT_SIGNED_SHIFT(satd, shift);
+ satd >>= (x->e_mbd.bd - 8);
+
+ const int skip_block_trellis =
+ ((uint64_t)satd >
+ (uint64_t)coeff_opt_satd_threshold * qstep * sqrt_tx_pixels_2d[tx_size]);
+
+ av1_setup_quant(
+ tx_size, !skip_block_trellis,
+ skip_block_trellis
+ ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP)
+ : AV1_XFORM_QUANT_FP,
+ quant_b_adapt, quant_param);
+
+ return skip_block_trellis;
+}
+
+// Predict DC only blocks if the residual variance is below a qstep based
+// threshold. For such blocks, transform type search is bypassed.
+static INLINE void predict_dc_only_block(
+ MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ int block, int blk_row, int blk_col, RD_STATS *best_rd_stats,
+ int64_t *block_sse, unsigned int *block_mse_q8, int64_t *per_px_mean,
+ int *dc_only_blk) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+ const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
+ uint64_t block_var = UINT64_MAX;
+ const int dc_qstep = x->plane[plane].dequant_QTX[0] >> 3;
+ *block_sse = pixel_diff_stats(x, plane, blk_row, blk_col, plane_bsize,
+ txsize_to_bsize[tx_size], block_mse_q8,
+ per_px_mean, &block_var);
+ assert((*block_mse_q8) != UINT_MAX);
+ uint64_t var_threshold = (uint64_t)(1.8 * qstep * qstep);
+ if (is_cur_buf_hbd(xd))
+ block_var = ROUND_POWER_OF_TWO(block_var, (xd->bd - 8) * 2);
+
+ if (block_var >= var_threshold) return;
+ const unsigned int predict_dc_level = x->txfm_search_params.predict_dc_level;
+ assert(predict_dc_level != 0);
+
+ // Predict a skip block if the residual mean and variance are less
+ // than qstep based thresholds.
+ if ((llabs(*per_px_mean) * dc_coeff_scale[tx_size]) < (dc_qstep << 12)) {
+ // If the normalized mean of residual block is less than the dc qstep and
+ // the normalized block variance is less than ac qstep, then the block is
+ // assumed to be a skip block and its rdcost is updated accordingly.
+ best_rd_stats->skip_txfm = 1;
+
+ x->plane[plane].eobs[block] = 0;
+
+ if (is_cur_buf_hbd(xd))
+ *block_sse = ROUND_POWER_OF_TWO((*block_sse), (xd->bd - 8) * 2);
+
+ best_rd_stats->dist = (*block_sse) << 4;
+ best_rd_stats->sse = best_rd_stats->dist;
+
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ av1_get_entropy_contexts(plane_bsize, &xd->plane[plane], ctxa, ctxl);
+ ENTROPY_CONTEXT *ta = ctxa;
+ ENTROPY_CONTEXT *tl = ctxl;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ TXB_CTX txb_ctx_tmp;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx_tmp);
+ const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][plane_type]
+ .txb_skip_cost[txb_ctx_tmp.txb_skip_ctx][1];
+ best_rd_stats->rate = zero_blk_rate;
+
+ best_rd_stats->rdcost =
+ RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->sse);
+
+ x->plane[plane].txb_entropy_ctx[block] = 0;
+ } else if (predict_dc_level > 1) {
+ // Predict DC only blocks based on residual variance.
+ // For chroma plane, this prediction is disabled for intra blocks.
+ if ((plane == 0) || (plane > 0 && is_inter_block(mbmi))) *dc_only_blk = 1;
+ }
+}
+
+// Search for the best transform type for a given transform block.
+// This function can be used for both inter and intra, both luma and chroma.
+static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ const TXB_CTX *const txb_ctx,
+ FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis,
+ int64_t ref_best_rd, RD_STATS *best_rd_stats) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ int64_t best_rd = INT64_MAX;
+ uint16_t best_eob = 0;
+ TX_TYPE best_tx_type = DCT_DCT;
+ int rate_cost = 0;
+ struct macroblock_plane *const p = &x->plane[plane];
+ tran_low_t *orig_dqcoeff = p->dqcoeff;
+ tran_low_t *best_dqcoeff = x->dqcoeff_buf;
+ const int tx_type_map_idx =
+ plane ? 0 : blk_row * xd->tx_type_map_stride + blk_col;
+ av1_invalid_rd_stats(best_rd_stats);
+
+ skip_trellis |= !is_trellis_used(cpi->optimize_seg_arr[xd->mi[0]->segment_id],
+ DRY_RUN_NORMAL);
+
+ uint8_t best_txb_ctx = 0;
+ // txk_allowed = TX_TYPES: >1 tx types are allowed
+ // txk_allowed < TX_TYPES: only that specific tx type is allowed.
+ TX_TYPE txk_allowed = TX_TYPES;
+ int txk_map[TX_TYPES] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ };
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+ const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
+
+ const uint8_t txw = tx_size_wide[tx_size];
+ const uint8_t txh = tx_size_high[tx_size];
+ int64_t block_sse;
+ unsigned int block_mse_q8;
+ int dc_only_blk = 0;
+ const bool predict_dc_block =
+ txfm_params->predict_dc_level >= 1 && txw != 64 && txh != 64;
+ int64_t per_px_mean = INT64_MAX;
+ if (predict_dc_block) {
+ predict_dc_only_block(x, plane, plane_bsize, tx_size, block, blk_row,
+ blk_col, best_rd_stats, &block_sse, &block_mse_q8,
+ &per_px_mean, &dc_only_blk);
+ if (best_rd_stats->skip_txfm == 1) {
+ const TX_TYPE tx_type = DCT_DCT;
+ if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type;
+ return;
+ }
+ } else {
+ block_sse = av1_pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
+ txsize_to_bsize[tx_size], &block_mse_q8);
+ assert(block_mse_q8 != UINT_MAX);
+ }
+
+ // Bit mask to indicate which transform types are allowed in the RD search.
+ uint16_t tx_mask;
+
+ // Use DCT_DCT transform for DC only block.
+ if (dc_only_blk || cpi->sf.rt_sf.dct_only_palette_nonrd == 1)
+ tx_mask = 1 << DCT_DCT;
+ else
+ tx_mask = get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, txb_ctx, ftxs_mode, ref_best_rd,
+ &txk_allowed, txk_map);
+ const uint16_t allowed_tx_mask = tx_mask;
+
+ if (is_cur_buf_hbd(xd)) {
+ block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
+ block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
+ }
+ block_sse *= 16;
+ // Use mse / qstep^2 based threshold logic to decide whether to run R-D
+ // optimization of coeffs. For smaller residuals, coeff optimization
+ // tends to be helpful; for larger residuals, it may not be effective.
+ // TODO(any): Experiment with variance and mean based thresholds
+ const int perform_block_coeff_opt =
+ ((uint64_t)block_mse_q8 <=
+ (uint64_t)txfm_params->coeff_opt_thresholds[0] * qstep * qstep);
+ skip_trellis |= !perform_block_coeff_opt;
+
+ // Flag to indicate whether distortion should be calculated in the transform
+ // domain while iterating through the transform type candidates.
+ // Transform domain distortion is accurate for higher residuals.
+ // TODO(any): Experiment with variance and mean based thresholds
+ int use_transform_domain_distortion =
+ (txfm_params->use_transform_domain_distortion > 0) &&
+ (block_mse_q8 >= txfm_params->tx_domain_dist_threshold) &&
+ // A 64-pt transform only preserves half the coefficients.
+ // Therefore transform domain distortion is not valid for these
+ // transform sizes.
+ (txsize_sqr_up_map[tx_size] != TX_64X64) &&
+ // Use pixel domain distortion for DC only blocks
+ !dc_only_blk;
+ // Flag to indicate if an extra calculation of distortion in the pixel domain
+ // should be performed at the end, after the best transform type has been
+ // decided.
+ int calc_pixel_domain_distortion_final =
+ txfm_params->use_transform_domain_distortion == 1 &&
+ use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD;
+ if (calc_pixel_domain_distortion_final &&
+ (txk_allowed < TX_TYPES || allowed_tx_mask == 0x0001))
+ calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0;
+
+ const uint16_t *eobs_ptr = x->plane[plane].eobs;
+
+ TxfmParam txfm_param;
+ QUANT_PARAM quant_param;
+ int skip_trellis_based_on_satd[TX_TYPES] = { 0 };
+ av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+ av1_setup_quant(tx_size, !skip_trellis,
+ skip_trellis ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B
+ : AV1_XFORM_QUANT_FP)
+ : AV1_XFORM_QUANT_FP,
+ cpi->oxcf.q_cfg.quant_b_adapt, &quant_param);
+
+ // Iterate through all transform type candidates.
+ for (int idx = 0; idx < TX_TYPES; ++idx) {
+ const TX_TYPE tx_type = (TX_TYPE)txk_map[idx];
+ if (tx_type == TX_TYPE_INVALID || !check_bit_mask(allowed_tx_mask, tx_type))
+ continue;
+ txfm_param.tx_type = tx_type;
+ if (av1_use_qmatrix(&cm->quant_params, xd, mbmi->segment_id)) {
+ av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+ &quant_param);
+ }
+ if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type;
+ RD_STATS this_rd_stats;
+ av1_invalid_rd_stats(&this_rd_stats);
+
+ if (!dc_only_blk)
+ av1_xform(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param);
+ else
+ av1_xform_dc_only(x, plane, block, &txfm_param, per_px_mean);
+
+ skip_trellis_based_on_satd[tx_type] = skip_trellis_opt_based_on_satd(
+ x, &quant_param, plane, block, tx_size, cpi->oxcf.q_cfg.quant_b_adapt,
+ qstep, txfm_params->coeff_opt_thresholds[1], skip_trellis, dc_only_blk);
+
+ av1_quant(x, plane, block, &txfm_param, &quant_param);
+
+ // Calculate rate cost of quantized coefficients.
+ if (quant_param.use_optimize_b) {
+ // TODO(aomedia:3209): update Trellis quantization to take into account
+ // quantization matrices.
+ av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
+ &rate_cost);
+ } else {
+ rate_cost = cost_coeffs(x, plane, block, tx_size, tx_type, txb_ctx,
+ cm->features.reduced_tx_set_used);
+ }
+
+ // If rd cost based on coeff rate alone is already more than best_rd,
+ // terminate early.
+ if (RDCOST(x->rdmult, rate_cost, 0) > best_rd) continue;
+
+ // Calculate distortion.
+ if (eobs_ptr[block] == 0) {
+ // When eob is 0, pixel domain distortion is more efficient and accurate.
+ this_rd_stats.dist = this_rd_stats.sse = block_sse;
+ } else if (dc_only_blk) {
+ this_rd_stats.sse = block_sse;
+ this_rd_stats.dist = dist_block_px_domain(
+ cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+ } else if (use_transform_domain_distortion) {
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param.tx_size, txfm_param.tx_type);
+ dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+ scan_order->scan, &this_rd_stats.dist,
+ &this_rd_stats.sse);
+ } else {
+ int64_t sse_diff = INT64_MAX;
+ // high_energy threshold assumes that every pixel within a txfm block
+ // has a residue energy of at least 25% of the maximum, i.e. 128 * 128
+ // for 8 bit.
+ const int64_t high_energy_thresh =
+ ((int64_t)128 * 128 * tx_size_2d[tx_size]);
+ const int is_high_energy = (block_sse >= high_energy_thresh);
+ if (tx_size == TX_64X64 || is_high_energy) {
+ // Because 3 out of 4 quadrants of the transform coefficients are forced
+ // to zero, the inverse transform has a tendency to overflow. sse_diff
+ // is effectively the energy of those 3 quadrants; here we use it
+ // to decide if we should do pixel domain distortion. If the energy
+ // is mostly in the first quadrant, an overflow in the inverse
+ // transform is unlikely.
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param.tx_size, txfm_param.tx_type);
+ dist_block_tx_domain(x, plane, block, tx_size, quant_param.qmatrix,
+ scan_order->scan, &this_rd_stats.dist,
+ &this_rd_stats.sse);
+ sse_diff = block_sse - this_rd_stats.sse;
+ }
+ if (tx_size != TX_64X64 || !is_high_energy ||
+ (sse_diff * 2) < this_rd_stats.sse) {
+ const int64_t tx_domain_dist = this_rd_stats.dist;
+ this_rd_stats.dist = dist_block_px_domain(
+ cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+ // For high energy blocks, the pixel domain distortion can occasionally
+ // be artificially low due to clamping at the reconstruction stage, even
+ // when the inverse transform output is hugely different from the
+ // actual residue.
+ if (is_high_energy && this_rd_stats.dist < tx_domain_dist)
+ this_rd_stats.dist = tx_domain_dist;
+ } else {
+ assert(sse_diff < INT64_MAX);
+ this_rd_stats.dist += sse_diff;
+ }
+ this_rd_stats.sse = block_sse;
+ }
+
+ this_rd_stats.rate = rate_cost;
+
+ const int64_t rd =
+ RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+
+ if (rd < best_rd) {
+ best_rd = rd;
+ *best_rd_stats = this_rd_stats;
+ best_tx_type = tx_type;
+ best_txb_ctx = x->plane[plane].txb_entropy_ctx[block];
+ best_eob = x->plane[plane].eobs[block];
+ // Swap dqcoeff buffers
+ tran_low_t *const tmp_dqcoeff = best_dqcoeff;
+ best_dqcoeff = p->dqcoeff;
+ p->dqcoeff = tmp_dqcoeff;
+ }
+
+#if CONFIG_COLLECT_RD_STATS == 1
+ if (plane == 0) {
+ PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col,
+ plane_bsize, tx_size, tx_type, rd);
+ }
+#endif // CONFIG_COLLECT_RD_STATS == 1
+
+#if COLLECT_TX_SIZE_DATA
+ // Generate small sample to restrict output size.
+ static unsigned int seed = 21743;
+ if (lcg_rand16(&seed) % 200 == 0) {
+ FILE *fp = NULL;
+
+ if (within_border) {
+ fp = fopen(av1_tx_size_data_output_file, "a");
+ }
+
+ if (fp) {
+ // Transform info and RD
+ const int txb_w = tx_size_wide[tx_size];
+ const int txb_h = tx_size_high[tx_size];
+
+ // Residue signal.
+ const int diff_stride = block_size_wide[plane_bsize];
+ struct macroblock_plane *const p = &x->plane[plane];
+ const int16_t *src_diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) * 4];
+
+ for (int r = 0; r < txb_h; ++r) {
+ for (int c = 0; c < txb_w; ++c) {
+ fprintf(fp, "%d,", src_diff[c]);
+ }
+ src_diff += diff_stride;
+ }
+
+ fprintf(fp, "%d,%d,%d,%" PRId64, txb_w, txb_h, tx_type, rd);
+ fprintf(fp, "\n");
+ fclose(fp);
+ }
+ }
+#endif // COLLECT_TX_SIZE_DATA
+
+ // If the current best RD cost is much worse than the reference RD cost,
+ // terminate early.
+ if (cpi->sf.tx_sf.adaptive_txb_search_level) {
+ if ((best_rd - (best_rd >> cpi->sf.tx_sf.adaptive_txb_search_level)) >
+ ref_best_rd) {
+ break;
+ }
+ }
+
+ // Terminate transform type search if the block has been quantized to
+ // all zero.
+ if (cpi->sf.tx_sf.tx_type_search.skip_tx_search && !best_eob) break;
+ }
+
+ assert(best_rd != INT64_MAX);
+
+ best_rd_stats->skip_txfm = best_eob == 0;
+ if (plane == 0) update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type);
+ x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx;
+ x->plane[plane].eobs[block] = best_eob;
+ skip_trellis = skip_trellis_based_on_satd[best_tx_type];
+
+ // Point dqcoeff to the quantized coefficients corresponding to the best
+ // transform type, then we can skip transform and quantization, e.g. in the
+ // final pixel domain distortion calculation and recon_intra().
+ p->dqcoeff = best_dqcoeff;
+
+ if (calc_pixel_domain_distortion_final && best_eob) {
+ best_rd_stats->dist = dist_block_px_domain(
+ cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+ best_rd_stats->sse = block_sse;
+ }
+
+ // Intra mode needs decoded pixels such that the next transform block
+ // can use them for prediction.
+ recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ txb_ctx, skip_trellis, best_tx_type, 0, &rate_cost, best_eob);
+ p->dqcoeff = orig_dqcoeff;
+}
+
+// Pick transform type for a luma transform block of tx_size. Note this function
+// is used only for inter-predicted blocks.
+static AOM_INLINE void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x,
+ TX_SIZE tx_size, int blk_row, int blk_col,
+ int block, int plane_bsize, TXB_CTX *txb_ctx,
+ RD_STATS *rd_stats,
+ FAST_TX_SEARCH_MODE ftxs_mode,
+ int64_t ref_rdcost) {
+ assert(is_inter_block(x->e_mbd.mi[0]));
+ RD_STATS this_rd_stats;
+ const int skip_trellis = 0;
+ search_tx_type(cpi, x, 0, block, blk_row, blk_col, plane_bsize, tx_size,
+ txb_ctx, ftxs_mode, skip_trellis, ref_rdcost, &this_rd_stats);
+
+ av1_merge_rd_stats(rd_stats, &this_rd_stats);
+}
+
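+// Evaluates keeping the current block as a single transform block (no
+// split): searches for the best tx type, converts the block to a skip block
+// when that is cheaper in RD terms, and records the resulting cost, entropy
+// context and tx type in *no_split.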
+static AOM_INLINE void try_tx_block_no_split(
+ const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize,
+ const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl,
+ int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd,
+ FAST_TX_SEARCH_MODE ftxs_mode, TxCandidateInfo *no_split) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct macroblock_plane *const p = &x->plane[0];
+ const int bw = mi_size_wide[plane_bsize];
+ const ENTROPY_CONTEXT *const pta = ta + blk_col;
+ const ENTROPY_CONTEXT *const ptl = tl + blk_row;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx);
+ const int zero_blk_rate = x->coeff_costs.coeff_costs[txs_ctx][PLANE_TYPE_Y]
+ .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+ rd_stats->zero_rate = zero_blk_rate;
+ const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
+ mbmi->inter_tx_size[index] = tx_size;
+ tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx,
+ rd_stats, ftxs_mode, ref_best_rd);
+ assert(rd_stats->rate < INT_MAX);
+
+ const int pick_skip_txfm =
+ !xd->lossless[mbmi->segment_id] &&
+ (rd_stats->skip_txfm == 1 ||
+ RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+ RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse));
+ if (pick_skip_txfm) {
+#if CONFIG_RD_DEBUG
+ update_txb_coeff_cost(rd_stats, 0, zero_blk_rate - rd_stats->rate);
+#endif // CONFIG_RD_DEBUG
+ rd_stats->rate = zero_blk_rate;
+ rd_stats->dist = rd_stats->sse;
+ p->eobs[block] = 0;
+ update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+ }
+ rd_stats->skip_txfm = pick_skip_txfm;
+ set_blk_skip(x->txfm_search_info.blk_skip, 0, blk_row * bw + blk_col,
+ pick_skip_txfm);
+
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+ rd_stats->rate += x->mode_costs.txfm_partition_cost[txfm_partition_ctx][0];
+
+ no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ no_split->txb_entropy_ctx = p->txb_entropy_ctx[block];
+ no_split->tx_type =
+ xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col];
+}
+
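+// Evaluates splitting the current block into smaller transform blocks by
+// recursing into select_tx_block() for each sub-block, aborting as soon as
+// the accumulated RD cost exceeds ref_best_rd.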
+static AOM_INLINE void try_tx_block_split(
+ const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+ int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd,
+ FAST_TX_SEARCH_MODE ftxs_mode, RD_STATS *split_rd_stats) {
+ assert(tx_size < TX_SIZES_ALL);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+ const int txb_width = tx_size_wide_unit[tx_size];
+ const int txb_height = tx_size_high_unit[tx_size];
+ // Transform size after splitting current block.
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int sub_txb_width = tx_size_wide_unit[sub_txs];
+ const int sub_txb_height = tx_size_high_unit[sub_txs];
+ const int sub_step = sub_txb_width * sub_txb_height;
+ const int nblks = (txb_height / sub_txb_height) * (txb_width / sub_txb_width);
+ assert(nblks > 0);
+ av1_init_rd_stats(split_rd_stats);
+ split_rd_stats->rate =
+ x->mode_costs.txfm_partition_cost[txfm_partition_ctx][1];
+
+ for (int r = 0, blk_idx = 0; r < txb_height; r += sub_txb_height) {
+ const int offsetr = blk_row + r;
+ if (offsetr >= max_blocks_high) break;
+ for (int c = 0; c < txb_width; c += sub_txb_width, ++blk_idx) {
+ assert(blk_idx < 4);
+ const int offsetc = blk_col + c;
+ if (offsetc >= max_blocks_wide) continue;
+
+ RD_STATS this_rd_stats;
+ int this_cost_valid = 1;
+ select_tx_block(cpi, x, offsetr, offsetc, block, sub_txs, depth + 1,
+ plane_bsize, ta, tl, tx_above, tx_left, &this_rd_stats,
+ no_split_rd / nblks, ref_best_rd - split_rd_stats->rdcost,
+ &this_cost_valid, ftxs_mode);
+ if (!this_cost_valid) {
+ split_rd_stats->rdcost = INT64_MAX;
+ return;
+ }
+ av1_merge_rd_stats(split_rd_stats, &this_rd_stats);
+ split_rd_stats->rdcost =
+ RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
+ if (split_rd_stats->rdcost > ref_best_rd) {
+ split_rd_stats->rdcost = INT64_MAX;
+ return;
+ }
+ block += sub_step;
+ }
+ }
+}
+
+static float get_var(float mean, double x2_sum, int num) {
+ const float e_x2 = (float)(x2_sum / num);
+ const float diff = e_x2 - mean * mean;
+ return diff;
+}
+
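+// Computes the deviation of the sub-block means and the variance of the
+// sub-block variances, with the full block included as an additional sample.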
+static AOM_INLINE void get_blk_var_dev(const int16_t *data, int stride, int bw,
+ int bh, float *dev_of_mean,
+ float *var_of_vars) {
+ const int16_t *const data_ptr = &data[0];
+ const int subh = (bh >= bw) ? (bh >> 1) : bh;
+ const int subw = (bw >= bh) ? (bw >> 1) : bw;
+ const int num = bw * bh;
+ const int sub_num = subw * subh;
+ int total_x_sum = 0;
+ int64_t total_x2_sum = 0;
+ int blk_idx = 0;
+ float var_sum = 0.0f;
+ float mean_sum = 0.0f;
+ double var2_sum = 0.0;
+ double mean2_sum = 0.0;
+
+ for (int row = 0; row < bh; row += subh) {
+ for (int col = 0; col < bw; col += subw) {
+ int x_sum;
+ int64_t x2_sum;
+ aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh,
+ &x_sum, &x2_sum);
+ total_x_sum += x_sum;
+ total_x2_sum += x2_sum;
+
+ const float mean = (float)x_sum / sub_num;
+ const float var = get_var(mean, (double)x2_sum, sub_num);
+ mean_sum += mean;
+ mean2_sum += (double)(mean * mean);
+ var_sum += var;
+ var2_sum += var * var;
+ blk_idx++;
+ }
+ }
+
+ const float lvl0_mean = (float)total_x_sum / num;
+ const float block_var = get_var(lvl0_mean, (double)total_x2_sum, num);
+ mean_sum += lvl0_mean;
+ mean2_sum += (double)(lvl0_mean * lvl0_mean);
+ var_sum += block_var;
+ var2_sum += block_var * block_var;
+ const float av_mean = mean_sum / 5;
+
+ if (blk_idx > 1) {
+ // Deviation of means.
+ *dev_of_mean = get_dev(av_mean, mean2_sum, (blk_idx + 1));
+ // Variance of variances.
+ const float mean_var = var_sum / (blk_idx + 1);
+ *var_of_vars = get_var(mean_var, var2_sum, (blk_idx + 1));
+ }
+}
+
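+// Uses the statistics from get_blk_var_dev() to prune the transform
+// partition search: a homogeneous block (small deviation of means and small
+// variance of variances relative to the quantizer) skips the split search,
+// while a strongly non-uniform block skips the no-split option.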
+static void prune_tx_split_no_split(MACROBLOCK *x, BLOCK_SIZE bsize,
+ int blk_row, int blk_col, TX_SIZE tx_size,
+ int *try_no_split, int *try_split,
+ int pruning_level) {
+ const int diff_stride = block_size_wide[bsize];
+ const int16_t *diff =
+ x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+ float dev_of_means = 0.0f;
+ float var_of_vars = 0.0f;
+
+ // This function calculates the deviation of means and the variance of pixel
+ // variances of the block as well as its sub-blocks.
+ get_blk_var_dev(diff, diff_stride, bw, bh, &dev_of_means, &var_of_vars);
+ const int dc_q = x->plane[0].dequant_QTX[0] >> 3;
+ const int ac_q = x->plane[0].dequant_QTX[1] >> 3;
+ const int no_split_thresh_scales[4] = { 0, 24, 8, 8 };
+ const int no_split_thresh_scale = no_split_thresh_scales[pruning_level];
+ const int split_thresh_scales[4] = { 0, 24, 10, 8 };
+ const int split_thresh_scale = split_thresh_scales[pruning_level];
+
+ if ((dev_of_means <= dc_q) &&
+ (split_thresh_scale * var_of_vars <= ac_q * ac_q)) {
+ *try_split = 0;
+ }
+ if ((dev_of_means > no_split_thresh_scale * dc_q) &&
+ (var_of_vars > no_split_thresh_scale * ac_q * ac_q)) {
+ *try_no_split = 0;
+ }
+}
+
+// Search (recursively) for the best transform partition and type for a given
+// inter-predicted luma block. The selected transform partition will be saved
+// in xd->mi[0], and the corresponding RD stats will be saved in rd_stats.
+static AOM_INLINE void select_tx_block(
+ const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+ RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
+ int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode) {
+ assert(tx_size < TX_SIZES_ALL);
+ av1_init_rd_stats(rd_stats);
+ if (ref_best_rd < 0) {
+ *is_cost_valid = 0;
+ return;
+ }
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ assert(blk_row < max_block_high(xd, plane_bsize, 0) &&
+ blk_col < max_block_wide(xd, plane_bsize, 0));
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
+ mbmi->bsize, tx_size);
+ struct macroblock_plane *const p = &x->plane[0];
+
+ int try_no_split = (cpi->oxcf.txfm_cfg.enable_tx64 ||
+ txsize_sqr_up_map[tx_size] != TX_64X64) &&
+ (cpi->oxcf.txfm_cfg.enable_rect_tx ||
+ tx_size_wide[tx_size] == tx_size_high[tx_size]);
+ int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
+ TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES };
+
+ // Prune tx_split and no-split based on sub-block properties.
+ if (tx_size != TX_4X4 && try_split == 1 && try_no_split == 1 &&
+ cpi->sf.tx_sf.prune_tx_size_level > 0) {
+ prune_tx_split_no_split(x, plane_bsize, blk_row, blk_col, tx_size,
+ &try_no_split, &try_split,
+ cpi->sf.tx_sf.prune_tx_size_level);
+ }
+
+ if (cpi->sf.rt_sf.skip_tx_no_split_var_based_partition) {
+ if (x->try_merge_partition && try_split && p->eobs[block]) try_no_split = 0;
+ }
+
+ // Try using current block as a single transform block without split.
+ if (try_no_split) {
+ try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
+ plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd,
+ ftxs_mode, &no_split);
+
+ // Speed features for early termination.
+ const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level;
+ if (search_level) {
+ if ((no_split.rd - (no_split.rd >> (1 + search_level))) > ref_best_rd) {
+ *is_cost_valid = 0;
+ return;
+ }
+ if (no_split.rd - (no_split.rd >> (2 + search_level)) > prev_level_rd) {
+ try_split = 0;
+ }
+ }
+ if (cpi->sf.tx_sf.txb_split_cap) {
+ if (p->eobs[block] == 0) try_split = 0;
+ }
+ }
+
+ // ML based speed feature to skip searching for split transform blocks.
+ if (x->e_mbd.bd == 8 && try_split &&
+ !(ref_best_rd == INT64_MAX && no_split.rd == INT64_MAX)) {
+ const int threshold = cpi->sf.tx_sf.tx_type_search.ml_tx_split_thresh;
+ if (threshold >= 0) {
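+      // A score far enough below zero means the classifier favors no-split,
+      // so the split search is skipped.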
+ const int split_score =
+ ml_predict_tx_split(x, plane_bsize, blk_row, blk_col, tx_size);
+ if (split_score < -threshold) try_split = 0;
+ }
+ }
+
+ RD_STATS split_rd_stats;
+ split_rd_stats.rdcost = INT64_MAX;
+ // Try splitting current block into smaller transform blocks.
+ if (try_split) {
+ try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
+ plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd,
+ AOMMIN(no_split.rd, ref_best_rd), ftxs_mode,
+ &split_rd_stats);
+ }
+
+ if (no_split.rd < split_rd_stats.rdcost) {
+ ENTROPY_CONTEXT *pta = ta + blk_col;
+ ENTROPY_CONTEXT *ptl = tl + blk_row;
+ p->txb_entropy_ctx[block] = no_split.txb_entropy_ctx;
+ av1_set_txb_context(x, 0, block, tx_size, pta, ptl);
+ txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
+ tx_size);
+ for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) {
+ for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) {
+ const int index =
+ av1_get_txb_size_index(plane_bsize, blk_row + idy, blk_col + idx);
+ mbmi->inter_tx_size[index] = tx_size;
+ }
+ }
+ mbmi->tx_size = tx_size;
+ update_txk_array(xd, blk_row, blk_col, tx_size, no_split.tx_type);
+ const int bw = mi_size_wide[plane_bsize];
+ set_blk_skip(x->txfm_search_info.blk_skip, 0, blk_row * bw + blk_col,
+ rd_stats->skip_txfm);
+ } else {
+ *rd_stats = split_rd_stats;
+ if (split_rd_stats.rdcost == INT64_MAX) *is_cost_valid = 0;
+ }
+}
+
+static AOM_INLINE void choose_largest_tx_size(const AV1_COMP *const cpi,
+ MACROBLOCK *x, RD_STATS *rd_stats,
+ int64_t ref_best_rd,
+ BLOCK_SIZE bs) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ mbmi->tx_size = tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type);
+
+  // If tx64 is not enabled (but rectangular transforms are), clamp each size
+  // to the largest allowed transform of the same shape.
+ if (!cpi->oxcf.txfm_cfg.enable_tx64 && cpi->oxcf.txfm_cfg.enable_rect_tx) {
+ static const TX_SIZE tx_size_max_32[TX_SIZES_ALL] = {
+ TX_4X4, // 4x4 transform
+ TX_8X8, // 8x8 transform
+ TX_16X16, // 16x16 transform
+ TX_32X32, // 32x32 transform
+ TX_32X32, // 64x64 transform
+ TX_4X8, // 4x8 transform
+ TX_8X4, // 8x4 transform
+ TX_8X16, // 8x16 transform
+ TX_16X8, // 16x8 transform
+ TX_16X32, // 16x32 transform
+ TX_32X16, // 32x16 transform
+ TX_32X32, // 32x64 transform
+ TX_32X32, // 64x32 transform
+ TX_4X16, // 4x16 transform
+ TX_16X4, // 16x4 transform
+ TX_8X32, // 8x32 transform
+ TX_32X8, // 32x8 transform
+ TX_16X32, // 16x64 transform
+ TX_32X16, // 64x16 transform
+ };
+ mbmi->tx_size = tx_size_max_32[mbmi->tx_size];
+ } else if (cpi->oxcf.txfm_cfg.enable_tx64 &&
+ !cpi->oxcf.txfm_cfg.enable_rect_tx) {
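+    // Only square transforms are allowed: map each size to the largest
+    // square transform contained within it.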
+ static const TX_SIZE tx_size_max_square[TX_SIZES_ALL] = {
+ TX_4X4, // 4x4 transform
+ TX_8X8, // 8x8 transform
+ TX_16X16, // 16x16 transform
+ TX_32X32, // 32x32 transform
+ TX_64X64, // 64x64 transform
+ TX_4X4, // 4x8 transform
+ TX_4X4, // 8x4 transform
+ TX_8X8, // 8x16 transform
+ TX_8X8, // 16x8 transform
+ TX_16X16, // 16x32 transform
+ TX_16X16, // 32x16 transform
+ TX_32X32, // 32x64 transform
+ TX_32X32, // 64x32 transform
+ TX_4X4, // 4x16 transform
+ TX_4X4, // 16x4 transform
+ TX_8X8, // 8x32 transform
+ TX_8X8, // 32x8 transform
+ TX_16X16, // 16x64 transform
+ TX_16X16, // 64x16 transform
+ };
+ mbmi->tx_size = tx_size_max_square[mbmi->tx_size];
+ } else if (!cpi->oxcf.txfm_cfg.enable_tx64 &&
+ !cpi->oxcf.txfm_cfg.enable_rect_tx) {
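+    // Neither 64-point nor rectangular transforms are allowed: cap square
+    // sizes at 32x32 and map rectangular sizes to the largest contained
+    // square.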
+ static const TX_SIZE tx_size_max_32_square[TX_SIZES_ALL] = {
+ TX_4X4, // 4x4 transform
+ TX_8X8, // 8x8 transform
+ TX_16X16, // 16x16 transform
+ TX_32X32, // 32x32 transform
+ TX_32X32, // 64x64 transform
+ TX_4X4, // 4x8 transform
+ TX_4X4, // 8x4 transform
+ TX_8X8, // 8x16 transform
+ TX_8X8, // 16x8 transform
+ TX_16X16, // 16x32 transform
+ TX_16X16, // 32x16 transform
+ TX_32X32, // 32x64 transform
+ TX_32X32, // 64x32 transform
+ TX_4X4, // 4x16 transform
+ TX_4X4, // 16x4 transform
+ TX_8X8, // 8x32 transform
+ TX_8X8, // 32x8 transform
+ TX_16X16, // 16x64 transform
+ TX_16X16, // 64x16 transform
+ };
+
+ mbmi->tx_size = tx_size_max_32_square[mbmi->tx_size];
+ }
+
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int no_skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+ const int skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+ // Skip RDcost is used only for Inter blocks
+ const int64_t skip_txfm_rd =
+ is_inter_block(mbmi) ? RDCOST(x->rdmult, skip_txfm_rate, 0) : INT64_MAX;
+ const int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_rate, 0);
+ const int skip_trellis = 0;
+ av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd,
+ AOMMIN(no_skip_txfm_rd, skip_txfm_rd), AOM_PLANE_Y, bs,
+ mbmi->tx_size, FTXS_NONE, skip_trellis);
+}
+
+static AOM_INLINE void choose_smallest_tx_size(const AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ RD_STATS *rd_stats,
+ int64_t ref_best_rd,
+ BLOCK_SIZE bs) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ mbmi->tx_size = TX_4X4;
+ // TODO(any) : Pass this_rd based on skip/non-skip cost
+ const int skip_trellis = 0;
+ av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, 0, bs, mbmi->tx_size,
+ FTXS_NONE, skip_trellis);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void ml_predict_intra_tx_depth_prune(MACROBLOCK *x, int blk_row,
+ int blk_col, BLOCK_SIZE bsize,
+ TX_SIZE tx_size) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  // Disable the NN-model-based pruning logic for the following cases:
+  // 1) Lossless coding, as only the 4x4 transform is evaluated in this case.
+  // 2) When the transform and current block sizes do not match, as the
+  // features are obtained over the current block.
+  // 3) When the operating bit-depth is not 8-bit, as the input features are
+  // not scaled according to the bit-depth.
+ if (xd->lossless[mbmi->segment_id] || txsize_to_bsize[tx_size] != bsize ||
+ xd->bd != 8)
+ return;
+
+ // Currently NN model based pruning is supported only when largest transform
+ // size is 8x8
+ if (tx_size != TX_8X8) return;
+
+  // The neural network is a sequential model trained with the SGD optimizer.
+  // The model can be further improved in terms of speed/quality by
+  // considering the following experiments:
+  // 1) Retrain the model with balanced data across different learning rates
+  // and optimizers.
+  // 2) Add features related to the statistics of the top and left pixels,
+  // which capture the accuracy of the reconstructed neighbouring pixels for
+  // the 4x4 blocks numbered 1, 2 and 3 in the 8x8 block, the source variance
+  // of the 4x4 sub-blocks, etc.
+  // 3) Train ML models for transform blocks other than 8x8.
+ const NN_CONFIG *const nn_config = &av1_intra_tx_split_nnconfig_8x8;
+ const float *const intra_tx_prune_thresh = av1_intra_tx_prune_nn_thresh_8x8;
+
+ float features[NUM_INTRA_TX_SPLIT_FEATURES] = { 0.0f };
+ const int diff_stride = block_size_wide[bsize];
+
+ const int16_t *diff = x->plane[0].src_diff + MI_SIZE * blk_row * diff_stride +
+ MI_SIZE * blk_col;
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+
+ int feature_idx = get_mean_dev_features(diff, diff_stride, bw, bh, features);
+
+ features[feature_idx++] = log1pf((float)x->source_variance);
+
+ const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+ const float log_dc_q_square = log1pf((float)(dc_q * dc_q) / 256.0f);
+ features[feature_idx++] = log_dc_q_square;
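+  // The squared DC quantizer step is scaled by 1/256 before the log,
+  // presumably matching the normalization used when the model was trained.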
+ assert(feature_idx == NUM_INTRA_TX_SPLIT_FEATURES);
+ for (int i = 0; i < NUM_INTRA_TX_SPLIT_FEATURES; i++) {
+ features[i] = (features[i] - av1_intra_tx_split_8x8_mean[i]) /
+ av1_intra_tx_split_8x8_std[i];
+ }
+
+ float score;
+ av1_nn_predict(features, nn_config, 1, &score);
+
+ TxfmSearchParams *const txfm_params = &x->txfm_search_params;
+ if (score <= intra_tx_prune_thresh[0])
+ txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_SPLIT;
+ else if (score > intra_tx_prune_thresh[1])
+ txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_LARGEST;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Search for the best uniform transform size and type for the current coding
+// block.
+static AOM_INLINE void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ RD_STATS *rd_stats,
+ int64_t ref_best_rd,
+ BLOCK_SIZE bs) {
+ av1_invalid_rd_stats(rd_stats);
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ TxfmSearchParams *const txfm_params = &x->txfm_search_params;
+ const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs];
+ const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT;
+ int start_tx;
+  // The split depth can be at most MAX_TX_DEPTH, so init_depth controls how
+  // many levels of splitting are allowed during the RD search.
+ int init_depth;
+
+ if (tx_select) {
+ start_tx = max_rect_tx_size;
+ init_depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs],
+ is_inter_block(mbmi), &cpi->sf,
+ txfm_params->tx_size_search_method);
+ if (init_depth == MAX_TX_DEPTH && !cpi->oxcf.txfm_cfg.enable_tx64 &&
+ txsize_sqr_up_map[start_tx] == TX_64X64) {
+ start_tx = sub_tx_size_map[start_tx];
+ }
+ } else {
+ const TX_SIZE chosen_tx_size =
+ tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type);
+ start_tx = chosen_tx_size;
+ init_depth = MAX_TX_DEPTH;
+ }
+
+ const int skip_trellis = 0;
+ uint8_t best_txk_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ TX_SIZE best_tx_size = max_rect_tx_size;
+ int64_t best_rd = INT64_MAX;
+ const int num_blks = bsize_to_num_blk(bs);
+ x->rd_model = FULL_TXFM_RD;
+ int64_t rd[MAX_TX_DEPTH + 1] = { INT64_MAX, INT64_MAX, INT64_MAX };
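+  // rd[] tracks the best RD cost found at each search depth; the three
+  // initializers correspond to MAX_TX_DEPTH == 2.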
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ for (int tx_size = start_tx, depth = init_depth; depth <= MAX_TX_DEPTH;
+ depth++, tx_size = sub_tx_size_map[tx_size]) {
+ if ((!cpi->oxcf.txfm_cfg.enable_tx64 &&
+ txsize_sqr_up_map[tx_size] == TX_64X64) ||
+ (!cpi->oxcf.txfm_cfg.enable_rect_tx &&
+ tx_size_wide[tx_size] != tx_size_high[tx_size])) {
+ continue;
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (txfm_params->nn_prune_depths_for_intra_tx == TX_PRUNE_SPLIT) break;
+
+    // Set the flag to enable evaluation of the NN classifier that prunes
+    // transform depths. As the features are based on the intra residual
+    // information of the largest transform, the NN model is evaluated only
+    // for this case.
+ txfm_params->enable_nn_prune_intra_tx_depths =
+ (cpi->sf.tx_sf.prune_intra_tx_depths_using_nn && tx_size == start_tx);
+#endif
+
+ RD_STATS this_rd_stats;
+ // When the speed feature use_rd_based_breakout_for_intra_tx_search is
+ // enabled, use the known minimum best_rd for early termination.
+ const int64_t rd_thresh =
+ cpi->sf.tx_sf.use_rd_based_breakout_for_intra_tx_search
+ ? AOMMIN(ref_best_rd, best_rd)
+ : ref_best_rd;
+ rd[depth] = av1_uniform_txfm_yrd(cpi, x, &this_rd_stats, rd_thresh, bs,
+ tx_size, FTXS_NONE, skip_trellis);
+ if (rd[depth] < best_rd) {
+ av1_copy_array(best_blk_skip, txfm_info->blk_skip, num_blks);
+ av1_copy_array(best_txk_type_map, xd->tx_type_map, num_blks);
+ best_tx_size = tx_size;
+ best_rd = rd[depth];
+ *rd_stats = this_rd_stats;
+ }
+ if (tx_size == TX_4X4) break;
+ // If we are searching three depths, prune the smallest size depending
+ // on rd results for the first two depths for low contrast blocks.
+ if (depth > init_depth && depth != MAX_TX_DEPTH &&
+ x->source_variance < 256) {
+ if (rd[depth - 1] != INT64_MAX && rd[depth] > rd[depth - 1]) break;
+ }
+ }
+
+ if (rd_stats->rate != INT_MAX) {
+ mbmi->tx_size = best_tx_size;
+ av1_copy_array(xd->tx_type_map, best_txk_type_map, num_blks);
+ av1_copy_array(txfm_info->blk_skip, best_blk_skip, num_blks);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ // Reset the flags to avoid any unintentional evaluation of NN model and
+ // consumption of prune depths.
+ txfm_params->enable_nn_prune_intra_tx_depths = false;
+ txfm_params->nn_prune_depths_for_intra_tx = TX_PRUNE_NONE;
+#endif
+}
+
+// Search for the best transform type for the given transform block in the
+// given plane/channel, and calculate the corresponding RD cost.
+static AOM_INLINE void block_rd_txfm(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct rdcost_block_args *args = arg;
+ if (args->exit_early) {
+ args->incomplete_exit = 1;
+ return;
+ }
+
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int is_inter = is_inter_block(xd->mi[0]);
+ const AV1_COMP *cpi = args->cpi;
+ ENTROPY_CONTEXT *a = args->t_above + blk_col;
+ ENTROPY_CONTEXT *l = args->t_left + blk_row;
+ const AV1_COMMON *cm = &cpi->common;
+ RD_STATS this_rd_stats;
+ av1_init_rd_stats(&this_rd_stats);
+
+ if (!is_inter) {
+ av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
+ av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+#if !CONFIG_REALTIME_ONLY
+ const TxfmSearchParams *const txfm_params = &x->txfm_search_params;
+ if (txfm_params->enable_nn_prune_intra_tx_depths) {
+ ml_predict_intra_tx_depth_prune(x, blk_row, blk_col, plane_bsize,
+ tx_size);
+ if (txfm_params->nn_prune_depths_for_intra_tx == TX_PRUNE_LARGEST) {
+ av1_invalid_rd_stats(&args->rd_stats);
+ args->exit_early = 1;
+ return;
+ }
+ }
+#endif
+ }
+
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+ search_tx_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ &txb_ctx, args->ftxs_mode, args->skip_trellis,
+ args->best_rd - args->current_rd, &this_rd_stats);
+
+ if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
+ assert(!is_inter || plane_bsize < BLOCK_8X8);
+ cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
+ }
+
+#if CONFIG_RD_DEBUG
+ update_txb_coeff_cost(&this_rd_stats, plane, this_rd_stats.rate);
+#endif // CONFIG_RD_DEBUG
+ av1_set_txb_context(x, plane, block, tx_size, a, l);
+
+ const int blk_idx =
+ blk_row * (block_size_wide[plane_bsize] >> MI_SIZE_LOG2) + blk_col;
+
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ if (plane == 0)
+ set_blk_skip(txfm_info->blk_skip, plane, blk_idx,
+ x->plane[plane].eobs[block] == 0);
+ else
+ set_blk_skip(txfm_info->blk_skip, plane, blk_idx, 0);
+
+ int64_t rd;
+ if (is_inter) {
+ const int64_t no_skip_txfm_rd =
+ RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+ const int64_t skip_txfm_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse);
+ rd = AOMMIN(no_skip_txfm_rd, skip_txfm_rd);
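+    // Keep the skip flag only if the block produced no nonzero coefficients.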
+ this_rd_stats.skip_txfm &= !x->plane[plane].eobs[block];
+ } else {
+ // Signal non-skip_txfm for Intra blocks
+ rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+ this_rd_stats.skip_txfm = 0;
+ }
+
+ av1_merge_rd_stats(&args->rd_stats, &this_rd_stats);
+
+ args->current_rd += rd;
+ if (args->current_rd > args->best_rd) args->exit_early = 1;
+}
+
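+// Estimate the RD cost of a luma block quickly: only DCT_DCT is evaluated
+// (see av1_setup_xform() below) and distortion is measured in the transform
+// domain, instead of running the full transform-type search.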
+int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ BLOCK_SIZE bs, TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int is_inter = is_inter_block(mbmi);
+ const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ block_signals_txsize(mbmi->bsize);
+ int tx_size_rate = 0;
+ if (tx_select) {
+ const int ctx = txfm_partition_context(
+ xd->above_txfm_context, xd->left_txfm_context, mbmi->bsize, tx_size);
+ tx_size_rate = mode_costs->txfm_partition_cost[ctx][0];
+ }
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int no_skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][0];
+ const int skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][1];
+ const int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate, 0);
+ const int64_t no_this_rd =
+ RDCOST(x->rdmult, no_skip_txfm_rate + tx_size_rate, 0);
+ mbmi->tx_size = tx_size;
+
+ const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+ const uint8_t txh_unit = tx_size_high_unit[tx_size];
+ const int step = txw_unit * txh_unit;
+ const int max_blocks_wide = max_block_wide(xd, bs, 0);
+ const int max_blocks_high = max_block_high(xd, bs, 0);
+
+ struct rdcost_block_args args;
+ av1_zero(args);
+ args.x = x;
+ args.cpi = cpi;
+ args.best_rd = ref_best_rd;
+ args.current_rd = AOMMIN(no_this_rd, skip_txfm_rd);
+ av1_init_rd_stats(&args.rd_stats);
+ av1_get_entropy_contexts(bs, &xd->plane[0], args.t_above, args.t_left);
+ int i = 0;
+ for (int blk_row = 0; blk_row < max_blocks_high && !args.incomplete_exit;
+ blk_row += txh_unit) {
+ for (int blk_col = 0; blk_col < max_blocks_wide; blk_col += txw_unit) {
+ RD_STATS this_rd_stats;
+ av1_init_rd_stats(&this_rd_stats);
+
+ if (args.exit_early) {
+ args.incomplete_exit = 1;
+ break;
+ }
+
+ ENTROPY_CONTEXT *a = args.t_above + blk_col;
+ ENTROPY_CONTEXT *l = args.t_left + blk_row;
+ TXB_CTX txb_ctx;
+ get_txb_ctx(bs, tx_size, 0, a, l, &txb_ctx);
+
+ TxfmParam txfm_param;
+ QUANT_PARAM quant_param;
+ av1_setup_xform(&cpi->common, x, tx_size, DCT_DCT, &txfm_param);
+ av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, 0, &quant_param);
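+      // Plain quantization (no coefficient-level RD optimization) is used
+      // here, in line with this being a fast estimate.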
+
+ av1_xform(x, 0, i, blk_row, blk_col, bs, &txfm_param);
+ av1_quant(x, 0, i, &txfm_param, &quant_param);
+
+ this_rd_stats.rate =
+ cost_coeffs(x, 0, i, tx_size, txfm_param.tx_type, &txb_ctx, 0);
+
+ const SCAN_ORDER *const scan_order =
+ get_scan(txfm_param.tx_size, txfm_param.tx_type);
+ dist_block_tx_domain(x, 0, i, tx_size, quant_param.qmatrix,
+ scan_order->scan, &this_rd_stats.dist,
+ &this_rd_stats.sse);
+
+ const int64_t no_skip_txfm_rd =
+ RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+ const int64_t skip_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse);
+
+ this_rd_stats.skip_txfm &= !x->plane[0].eobs[i];
+
+ av1_merge_rd_stats(&args.rd_stats, &this_rd_stats);
+ args.current_rd += AOMMIN(no_skip_txfm_rd, skip_rd);
+
+ if (args.current_rd > ref_best_rd) {
+ args.exit_early = 1;
+ break;
+ }
+
+ av1_set_txb_context(x, 0, i, tx_size, a, l);
+ i += step;
+ }
+ }
+
+ if (args.incomplete_exit) av1_invalid_rd_stats(&args.rd_stats);
+
+ *rd_stats = args.rd_stats;
+ if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+ int64_t rd;
+ // rdstats->rate should include all the rate except skip/non-skip cost as the
+ // same is accounted in the caller functions after rd evaluation of all
+ // planes. However the decisions should be done after considering the
+ // skip/non-skip header cost
+ if (rd_stats->skip_txfm && is_inter) {
+ rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+ } else {
+ // Intra blocks are always signalled as non-skip
+ rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate + tx_size_rate,
+ rd_stats->dist);
+ rd_stats->rate += tx_size_rate;
+ }
+ // Check if forcing the block to skip transform leads to smaller RD cost.
+ if (is_inter && !rd_stats->skip_txfm && !xd->lossless[mbmi->segment_id]) {
+ int64_t temp_skip_txfm_rd =
+ RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+ if (temp_skip_txfm_rd <= rd) {
+ rd = temp_skip_txfm_rd;
+ rd_stats->rate = 0;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip_txfm = 1;
+ }
+ }
+
+ return rd;
+}
+
+int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ BLOCK_SIZE bs, TX_SIZE tx_size,
+ FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) {
+ assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const ModeCosts *mode_costs = &x->mode_costs;
+ const int is_inter = is_inter_block(mbmi);
+ const int tx_select = txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ block_signals_txsize(mbmi->bsize);
+ int tx_size_rate = 0;
+ if (tx_select) {
+ const int ctx = txfm_partition_context(
+ xd->above_txfm_context, xd->left_txfm_context, mbmi->bsize, tx_size);
+ tx_size_rate = is_inter ? mode_costs->txfm_partition_cost[ctx][0]
+ : tx_size_cost(x, bs, tx_size);
+ }
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int no_skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][0];
+ const int skip_txfm_rate = mode_costs->skip_txfm_cost[skip_ctx][1];
+ const int64_t skip_txfm_rd =
+ is_inter ? RDCOST(x->rdmult, skip_txfm_rate, 0) : INT64_MAX;
+ const int64_t no_this_rd =
+ RDCOST(x->rdmult, no_skip_txfm_rate + tx_size_rate, 0);
+
+ mbmi->tx_size = tx_size;
+ av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd,
+ AOMMIN(no_this_rd, skip_txfm_rd), AOM_PLANE_Y, bs,
+ tx_size, ftxs_mode, skip_trellis);
+ if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+ int64_t rd;
+ // rdstats->rate should include all the rate except skip/non-skip cost as the
+ // same is accounted in the caller functions after rd evaluation of all
+ // planes. However the decisions should be done after considering the
+ // skip/non-skip header cost
+ if (rd_stats->skip_txfm && is_inter) {
+ rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+ } else {
+ // Intra blocks are always signalled as non-skip
+ rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate + tx_size_rate,
+ rd_stats->dist);
+ rd_stats->rate += tx_size_rate;
+ }
+ // Check if forcing the block to skip transform leads to smaller RD cost.
+ if (is_inter && !rd_stats->skip_txfm && !xd->lossless[mbmi->segment_id]) {
+ int64_t temp_skip_txfm_rd =
+ RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+ if (temp_skip_txfm_rd <= rd) {
+ rd = temp_skip_txfm_rd;
+ rd_stats->rate = 0;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip_txfm = 1;
+ }
+ }
+
+ return rd;
+}
+
+// Search for the best transform type for a luma inter-predicted block, given
+// the transform block partitions.
+// This function is used only when some speed features are enabled.
+static AOM_INLINE void tx_block_yrd(
+ const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int depth,
+ ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx,
+ TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, int64_t ref_best_rd,
+ RD_STATS *rd_stats, FAST_TX_SEARCH_MODE ftxs_mode) {
+ assert(tx_size < TX_SIZES_ALL);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(is_inter_block(mbmi));
+ const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
+ plane_bsize, blk_row, blk_col)];
+ const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
+ mbmi->bsize, tx_size);
+
+ av1_init_rd_stats(rd_stats);
+ if (tx_size == plane_tx_size) {
+ ENTROPY_CONTEXT *ta = above_ctx + blk_col;
+ ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx);
+
+ const int zero_blk_rate =
+ x->coeff_costs.coeff_costs[txs_ctx][get_plane_type(0)]
+ .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+ rd_stats->zero_rate = zero_blk_rate;
+ tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx,
+ rd_stats, ftxs_mode, ref_best_rd);
+ const int mi_width = mi_size_wide[plane_bsize];
+ TxfmSearchInfo *txfm_info = &x->txfm_search_info;
+ if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+ RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
+ rd_stats->skip_txfm == 1) {
+ rd_stats->rate = zero_blk_rate;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip_txfm = 1;
+ set_blk_skip(txfm_info->blk_skip, 0, blk_row * mi_width + blk_col, 1);
+ x->plane[0].eobs[block] = 0;
+ x->plane[0].txb_entropy_ctx[block] = 0;
+ update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+ } else {
+ rd_stats->skip_txfm = 0;
+ set_blk_skip(txfm_info->blk_skip, 0, blk_row * mi_width + blk_col, 0);
+ }
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+ rd_stats->rate += x->mode_costs.txfm_partition_cost[ctx][0];
+ av1_set_txb_context(x, 0, block, tx_size, ta, tl);
+ txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
+ tx_size);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int txb_width = tx_size_wide_unit[sub_txs];
+ const int txb_height = tx_size_high_unit[sub_txs];
+ const int step = txb_height * txb_width;
+ const int row_end =
+ AOMMIN(tx_size_high_unit[tx_size], max_blocks_high - blk_row);
+ const int col_end =
+ AOMMIN(tx_size_wide_unit[tx_size], max_blocks_wide - blk_col);
+ RD_STATS pn_rd_stats;
+ int64_t this_rd = 0;
+ assert(txb_width > 0 && txb_height > 0);
+
+ for (int row = 0; row < row_end; row += txb_height) {
+ const int offsetr = blk_row + row;
+ for (int col = 0; col < col_end; col += txb_width) {
+ const int offsetc = blk_col + col;
+
+ av1_init_rd_stats(&pn_rd_stats);
+ tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize,
+ depth + 1, above_ctx, left_ctx, tx_above, tx_left,
+ ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode);
+ if (pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ this_rd += RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist);
+ block += step;
+ }
+ }
+
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+ rd_stats->rate += x->mode_costs.txfm_partition_cost[ctx][1];
+ }
+}
+
+// Search for the transform type of an inter-predicted luma partition block,
+// with the transform sizes already decided. It's used only when some speed
+// features are enabled.
+// Return value 0: early termination triggered, no valid rd cost available;
+//              1: rd cost values are valid.
+static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) {
+ if (ref_best_rd < 0) {
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
+
+ av1_init_rd_stats(rd_stats);
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ const int step = bw * bh;
+ const int init_depth = get_search_init_depth(
+ mi_width, mi_height, 1, &cpi->sf, txfm_params->tx_size_search_method);
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+ av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+ memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+ memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+
+ int64_t this_rd = 0;
+ for (int idy = 0, block = 0; idy < mi_height; idy += bh) {
+ for (int idx = 0; idx < mi_width; idx += bw) {
+ RD_STATS pn_rd_stats;
+ av1_init_rd_stats(&pn_rd_stats);
+ tx_block_yrd(cpi, x, idy, idx, block, max_tx_size, bsize, init_depth,
+ ctxa, ctxl, tx_above, tx_left, ref_best_rd - this_rd,
+ &pn_rd_stats, ftxs_mode);
+ if (pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ this_rd +=
+ AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
+ RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse));
+ block += step;
+ }
+ }
+
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int no_skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+ const int skip_txfm_rate = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+ const int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_rate, rd_stats->sse);
+ this_rd =
+ RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_rate, rd_stats->dist);
+ if (skip_txfm_rd < this_rd) {
+ this_rd = skip_txfm_rd;
+ rd_stats->rate = 0;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip_txfm = 1;
+ }
+
+  const int is_cost_valid = this_rd <= ref_best_rd;
+ if (!is_cost_valid) {
+ // reset cost value
+ av1_invalid_rd_stats(rd_stats);
+ }
+ return is_cost_valid;
+}
+
+// Search for the best transform size and type for the current inter-predicted
+// luma block with recursive transform block partitioning. The obtained
+// transform selection will be saved in xd->mi[0], the corresponding RD stats
+// will be saved in rd_stats. The returned value is the corresponding RD cost.
+static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ assert(is_inter_block(xd->mi[0]));
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int fast_tx_search = txfm_params->tx_size_search_method > USE_FULL_RD;
+ int64_t rd_thresh = ref_best_rd;
+ if (rd_thresh == 0) {
+ av1_invalid_rd_stats(rd_stats);
+ return INT64_MAX;
+ }
+ if (fast_tx_search && rd_thresh < INT64_MAX) {
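+    // The fast search metric is less accurate, so inflate the threshold by
+    // 12.5% (rd_thresh >> 3), with a guard against int64 overflow.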
+ if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3);
+ }
+ assert(rd_thresh > 0);
+ const FAST_TX_SEARCH_MODE ftxs_mode =
+ fast_tx_search ? FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE;
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ assert(bsize < BLOCK_SIZES_ALL);
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+ av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+ memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+ memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+ const int init_depth = get_search_init_depth(
+ mi_width, mi_height, 1, &cpi->sf, txfm_params->tx_size_search_method);
+ const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ const int step = bw * bh;
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int no_skip_txfm_cost = x->mode_costs.skip_txfm_cost[skip_ctx][0];
+ const int skip_txfm_cost = x->mode_costs.skip_txfm_cost[skip_ctx][1];
+ int64_t skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, 0);
+ int64_t no_skip_txfm_rd = RDCOST(x->rdmult, no_skip_txfm_cost, 0);
+ int block = 0;
+
+ av1_init_rd_stats(rd_stats);
+ for (int idy = 0; idy < max_block_high(xd, bsize, 0); idy += bh) {
+ for (int idx = 0; idx < max_block_wide(xd, bsize, 0); idx += bw) {
+ const int64_t best_rd_sofar =
+ (rd_thresh == INT64_MAX)
+ ? INT64_MAX
+ : (rd_thresh - (AOMMIN(skip_txfm_rd, no_skip_txfm_rd)));
+ int is_cost_valid = 1;
+ RD_STATS pn_rd_stats;
+ // Search for the best transform block size and type for the sub-block.
+ select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, bsize,
+ ctxa, ctxl, tx_above, tx_left, &pn_rd_stats, INT64_MAX,
+ best_rd_sofar, &is_cost_valid, ftxs_mode);
+ if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return INT64_MAX;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ skip_txfm_rd = RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse);
+ no_skip_txfm_rd =
+ RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist);
+ block += step;
+ }
+ }
+
+ if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+ rd_stats->skip_txfm = (skip_txfm_rd <= no_skip_txfm_rd);
+
+  // If fast_tx_search is true, only DCT and 1D DCT were tested in the
+  // select_tx_block() search above. Do a better search for the tx type with
+  // the tx sizes already decided.
+ if (fast_tx_search && cpi->sf.tx_sf.refine_fast_tx_search_results) {
+ if (!inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, FTXS_NONE))
+ return INT64_MAX;
+ }
+
+ int64_t final_rd;
+ if (rd_stats->skip_txfm) {
+ final_rd = RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse);
+ } else {
+ final_rd =
+ RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist);
+ if (!xd->lossless[xd->mi[0]->segment_id]) {
+ final_rd =
+ AOMMIN(final_rd, RDCOST(x->rdmult, skip_txfm_cost, rd_stats->sse));
+ }
+ }
+
+ return final_rd;
+}
+
+// Return 1 to terminate the transform search early. The decision is made by
+// comparing the model-estimated RD cost against the reference RD cost.
+static AOM_INLINE int model_based_tx_search_prune(const AV1_COMP *cpi,
+ MACROBLOCK *x,
+ BLOCK_SIZE bsize,
+ int64_t ref_best_rd) {
+ const int level = cpi->sf.tx_sf.model_based_prune_tx_search_level;
+ assert(level >= 0 && level <= 2);
+ int model_rate;
+ int64_t model_dist;
+ uint8_t model_skip;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE](
+ cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist, &model_skip, NULL,
+ NULL, NULL, NULL);
+ if (model_skip) return 0;
+ const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist);
+ // TODO(debargha, urvang): Improve the model and make the check below
+ // tighter.
+ static const int prune_factor_by8[] = { 3, 5 };
+ const int factor = prune_factor_by8[level - 1];
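+  // E.g. level 1 prunes when model_rd > (8/3) * ref_best_rd, and level 2
+  // when model_rd > (8/5) * ref_best_rd, i.e. higher levels prune more often.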
+ return ((model_rd * factor) >> 3) > ref_best_rd;
+}
+
+void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ assert(is_inter_block(xd->mi[0]));
+
+ av1_invalid_rd_stats(rd_stats);
+
+ // If modeled RD cost is a lot worse than the best so far, terminate early.
+ if (cpi->sf.tx_sf.model_based_prune_tx_search_level &&
+ ref_best_rd != INT64_MAX) {
+ if (model_based_tx_search_prune(cpi, x, bsize, ref_best_rd)) return;
+ }
+
+  // Hashing-based speed feature. If the hash of the prediction residue block
+  // is found in the hash table, use previous search results and terminate
+  // early.
+ uint32_t hash = 0;
+ MB_RD_RECORD *mb_rd_record = NULL;
+ const int mi_row = x->e_mbd.mi_row;
+ const int mi_col = x->e_mbd.mi_col;
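+  // Hash reuse is restricted to blocks fully inside the tile; blocks clipped
+  // at the tile border may be only partially coded, so an identical residue
+  // hash would not guarantee identical RD results.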
+ const int within_border =
+ mi_row >= xd->tile.mi_row_start &&
+ (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) &&
+ mi_col >= xd->tile.mi_col_start &&
+ (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end);
+ const int is_mb_rd_hash_enabled =
+ (within_border && cpi->sf.rd_sf.use_mb_rd_hash);
+ const int n4 = bsize_to_num_blk(bsize);
+ if (is_mb_rd_hash_enabled) {
+ hash = get_block_residue_hash(x, bsize);
+ mb_rd_record = x->txfm_search_info.mb_rd_record;
+ const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
+ if (match_index != -1) {
+ MB_RD_INFO *mb_rd_info = &mb_rd_record->mb_rd_info[match_index];
+ fetch_mb_rd_info(n4, mb_rd_info, rd_stats, x);
+ return;
+ }
+ }
+
+ // If we predict that skip is the optimal RD decision - set the respective
+ // context and terminate early.
+ int64_t dist;
+ if (txfm_params->skip_txfm_level &&
+ predict_skip_txfm(x, bsize, &dist,
+ cpi->common.features.reduced_tx_set_used)) {
+ set_skip_txfm(x, rd_stats, bsize, dist);
+ // Save the RD search results into mb_rd_record.
+ if (is_mb_rd_hash_enabled)
+ save_mb_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+ return;
+ }
+#if CONFIG_SPEED_STATS
+ ++x->txfm_search_info.tx_search_count;
+#endif // CONFIG_SPEED_STATS
+
+ const int64_t rd =
+ select_tx_size_and_type(cpi, x, rd_stats, bsize, ref_best_rd);
+
+ if (rd == INT64_MAX) {
+    // We should always find at least one candidate unless ref_best_rd is less
+    // than INT64_MAX (in which case, all the calls to select_tx_size_and_type
+    // might have failed to find something better)
+ assert(ref_best_rd != INT64_MAX);
+ av1_invalid_rd_stats(rd_stats);
+ return;
+ }
+
+ // Save the RD search results into mb_rd_record.
+ if (is_mb_rd_hash_enabled) {
+ assert(mb_rd_record != NULL);
+ save_mb_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+ }
+}
+
+void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bs,
+ int64_t ref_best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const TxfmSearchParams *tx_params = &x->txfm_search_params;
+ assert(bs == mbmi->bsize);
+ const int is_inter = is_inter_block(mbmi);
+ const int mi_row = xd->mi_row;
+ const int mi_col = xd->mi_col;
+
+ av1_init_rd_stats(rd_stats);
+
+  // Hashing-based speed feature for inter blocks. If the hash of the residue
+ // block is found in the table, use previously saved search results and
+ // terminate early.
+ uint32_t hash = 0;
+ MB_RD_RECORD *mb_rd_record = NULL;
+ const int num_blks = bsize_to_num_blk(bs);
+ if (is_inter && cpi->sf.rd_sf.use_mb_rd_hash) {
+ const int within_border =
+ mi_row >= xd->tile.mi_row_start &&
+ (mi_row + mi_size_high[bs] < xd->tile.mi_row_end) &&
+ mi_col >= xd->tile.mi_col_start &&
+ (mi_col + mi_size_wide[bs] < xd->tile.mi_col_end);
+ if (within_border) {
+ hash = get_block_residue_hash(x, bs);
+ mb_rd_record = x->txfm_search_info.mb_rd_record;
+ const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
+ if (match_index != -1) {
+ MB_RD_INFO *mb_rd_info = &mb_rd_record->mb_rd_info[match_index];
+ fetch_mb_rd_info(num_blks, mb_rd_info, rd_stats, x);
+ return;
+ }
+ }
+ }
+
+ // If we predict that skip is the optimal RD decision - set the respective
+ // context and terminate early.
+ int64_t dist;
+ if (tx_params->skip_txfm_level && is_inter &&
+ !xd->lossless[mbmi->segment_id] &&
+ predict_skip_txfm(x, bs, &dist,
+ cpi->common.features.reduced_tx_set_used)) {
+ // Populate rdstats as per skip decision
+ set_skip_txfm(x, rd_stats, bs, dist);
+ // Save the RD search results into mb_rd_record.
+ if (mb_rd_record) {
+ save_mb_rd_info(num_blks, hash, x, rd_stats, mb_rd_record);
+ }
+ return;
+ }
+
+ if (xd->lossless[mbmi->segment_id]) {
+ // Lossless mode can only pick the smallest (4x4) transform size.
+ choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+ } else if (tx_params->tx_size_search_method == USE_LARGESTALL) {
+ choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+ } else {
+ choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
+ }
+
+ // Save the RD search results into mb_rd_record for possible reuse in future.
+ if (mb_rd_record) {
+ save_mb_rd_info(num_blks, hash, x, rd_stats, mb_rd_record);
+ }
+}
+
+int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
+ BLOCK_SIZE bsize, int64_t ref_best_rd) {
+ av1_init_rd_stats(rd_stats);
+ if (ref_best_rd < 0) return 0;
+ if (!x->e_mbd.is_chroma_ref) return 1;
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U];
+ const int is_inter = is_inter_block(mbmi);
+ int64_t this_rd = 0, skip_txfm_rd = 0;
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+ if (is_inter) {
+ for (int plane = 1; plane < MAX_MB_PLANE; ++plane)
+ av1_subtract_plane(x, plane_bsize, plane);
+ }
+
+ const int skip_trellis = 0;
+ const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
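+  // Both chroma planes share the same transform size, so it is derived once
+  // from the U plane and reused for V in the loop below.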
+ int is_cost_valid = 1;
+ for (int plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ RD_STATS this_rd_stats;
+ int64_t chroma_ref_best_rd = ref_best_rd;
+    // For inter blocks, the refined ref_best_rd is used for early exit.
+    // For intra blocks, even though the current rd crosses ref_best_rd, early
+    // exit is not recommended as the current rd is used for gating subsequent
+    // modes as well (say, for angular modes).
+    // TODO(any): Extend the early exit mechanism for intra modes as well
+ if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && is_inter &&
+ chroma_ref_best_rd != INT64_MAX)
+ chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_txfm_rd);
+ av1_txfm_rd_in_plane(x, cpi, &this_rd_stats, chroma_ref_best_rd, 0, plane,
+ plane_bsize, uv_tx_size, FTXS_NONE, skip_trellis);
+ if (this_rd_stats.rate == INT_MAX) {
+ is_cost_valid = 0;
+ break;
+ }
+ av1_merge_rd_stats(rd_stats, &this_rd_stats);
+ this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ skip_txfm_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
+ if (AOMMIN(this_rd, skip_txfm_rd) > ref_best_rd) {
+ is_cost_valid = 0;
+ break;
+ }
+ }
+
+ if (!is_cost_valid) {
+ // reset cost value
+ av1_invalid_rd_stats(rd_stats);
+ }
+
+ return is_cost_valid;
+}
+
+void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ int64_t current_rd, int plane, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode,
+ int skip_trellis) {
+ assert(IMPLIES(plane == 0, x->e_mbd.mi[0]->tx_size == tx_size));
+
+ if (!cpi->oxcf.txfm_cfg.enable_tx64 &&
+ txsize_sqr_up_map[tx_size] == TX_64X64) {
+ av1_invalid_rd_stats(rd_stats);
+ return;
+ }
+
+ if (current_rd > ref_best_rd) {
+ av1_invalid_rd_stats(rd_stats);
+ return;
+ }
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ struct rdcost_block_args args;
+ av1_zero(args);
+ args.x = x;
+ args.cpi = cpi;
+ args.best_rd = ref_best_rd;
+ args.current_rd = current_rd;
+ args.ftxs_mode = ftxs_mode;
+ args.skip_trellis = skip_trellis;
+ av1_init_rd_stats(&args.rd_stats);
+
+ av1_get_entropy_contexts(plane_bsize, pd, args.t_above, args.t_left);
+ av1_foreach_transformed_block_in_plane(xd, plane_bsize, plane, block_rd_txfm,
+ &args);
+
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_inter = is_inter_block(mbmi);
+ const int invalid_rd = is_inter ? args.incomplete_exit : args.exit_early;
+
+ if (invalid_rd) {
+ av1_invalid_rd_stats(rd_stats);
+ } else {
+ *rd_stats = args.rd_stats;
+ }
+}
+
+int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TxfmSearchParams *txfm_params = &x->txfm_search_params;
+ const int skip_ctx = av1_get_skip_txfm_context(xd);
+ const int skip_txfm_cost[2] = { x->mode_costs.skip_txfm_cost[skip_ctx][0],
+ x->mode_costs.skip_txfm_cost[skip_ctx][1] };
+ const int64_t min_header_rate =
+ mode_rate + AOMMIN(skip_txfm_cost[0], skip_txfm_cost[1]);
+ // Account for minimum skip and non_skip rd.
+ // Eventually either one of them will be added to mode_rate
+ const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0);
+ if (min_header_rd_possible > ref_best_rd) {
+ av1_invalid_rd_stats(rd_stats_y);
+ return 0;
+ }
+
+ const AV1_COMMON *cm = &cpi->common;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0);
+ const int64_t rd_thresh =
+ ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd;
+ av1_init_rd_stats(rd_stats);
+ av1_init_rd_stats(rd_stats_y);
+ rd_stats->rate = mode_rate;
+
+ // cost and distortion
+ av1_subtract_plane(x, bsize, 0);
+ if (txfm_params->tx_mode_search_type == TX_MODE_SELECT &&
+ !xd->lossless[mbmi->segment_id]) {
+ av1_pick_recursive_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
+#if CONFIG_COLLECT_RD_STATS == 2
+ PrintPredictionUnitStats(cpi, tile_data, x, rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 2
+ } else {
+ av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
+ memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+ for (int i = 0; i < xd->height * xd->width; ++i)
+ set_blk_skip(x->txfm_search_info.blk_skip, 0, i, rd_stats_y->skip_txfm);
+ }
+
+ if (rd_stats_y->rate == INT_MAX) return 0;
+
+ av1_merge_rd_stats(rd_stats, rd_stats_y);
+
+ const int64_t non_skip_txfm_rdcosty =
+ RDCOST(x->rdmult, rd_stats->rate + skip_txfm_cost[0], rd_stats->dist);
+ const int64_t skip_txfm_rdcosty =
+ RDCOST(x->rdmult, mode_rate + skip_txfm_cost[1], rd_stats->sse);
+ const int64_t min_rdcosty = AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty);
+ if (min_rdcosty > ref_best_rd) return 0;
+
+ av1_init_rd_stats(rd_stats_uv);
+ const int num_planes = av1_num_planes(cm);
+ if (num_planes > 1) {
+ int64_t ref_best_chroma_rd = ref_best_rd;
+ // Calculate best rd cost possible for chroma
+ if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma &&
+ (ref_best_chroma_rd != INT64_MAX)) {
+ ref_best_chroma_rd = (ref_best_chroma_rd -
+ AOMMIN(non_skip_txfm_rdcosty, skip_txfm_rdcosty));
+ }
+ const int is_cost_valid_uv =
+ av1_txfm_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_chroma_rd);
+ if (!is_cost_valid_uv) return 0;
+ av1_merge_rd_stats(rd_stats, rd_stats_uv);
+ }
+
+ int choose_skip_txfm = rd_stats->skip_txfm;
+ if (!choose_skip_txfm && !xd->lossless[mbmi->segment_id]) {
+ const int64_t rdcost_no_skip_txfm = RDCOST(
+ x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + skip_txfm_cost[0],
+ rd_stats->dist);
+ const int64_t rdcost_skip_txfm =
+ RDCOST(x->rdmult, skip_txfm_cost[1], rd_stats->sse);
+ if (rdcost_no_skip_txfm >= rdcost_skip_txfm) choose_skip_txfm = 1;
+ }
+ if (choose_skip_txfm) {
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ rd_stats->rate = mode_rate + skip_txfm_cost[1];
+ rd_stats->dist = rd_stats->sse;
+ rd_stats_y->dist = rd_stats_y->sse;
+ rd_stats_uv->dist = rd_stats_uv->sse;
+ mbmi->skip_txfm = 1;
+ if (rd_stats->skip_txfm) {
+ const int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (tmprd > ref_best_rd) return 0;
+ }
+ } else {
+ rd_stats->rate += skip_txfm_cost[0];
+ mbmi->skip_txfm = 0;
+ }
+
+ return 1;
+}
diff --git a/third_party/aom/av1/encoder/tx_search.h b/third_party/aom/av1/encoder/tx_search.h
new file mode 100644
index 0000000000..ed95c1cd98
--- /dev/null
+++ b/third_party/aom/av1/encoder/tx_search.h
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_
+#define AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_
+
+#include "av1/common/pred_common.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Set this macro to 1 to collect data about tx size selection.
+#define COLLECT_TX_SIZE_DATA 0
+
+#if COLLECT_TX_SIZE_DATA
+static const char av1_tx_size_data_output_file[] = "tx_size_data.txt";
+#endif
+
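+// Bit flags that reduce the transform search: restrict the candidate set to
+// DCT and 1D DCT, disable trellis coefficient optimization, and/or measure
+// distortion in the transform domain instead of the pixel domain.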
+enum {
+ FTXS_NONE = 0,
+ FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0,
+ FTXS_DISABLE_TRELLIS_OPT = 1 << 1,
+ FTXS_USE_TRANSFORM_DOMAIN = 1 << 2
+} UENUM1BYTE(FAST_TX_SEARCH_MODE);
+
+static AOM_INLINE int tx_size_cost(const MACROBLOCK *const x, BLOCK_SIZE bsize,
+ TX_SIZE tx_size) {
+ assert(bsize == x->e_mbd.mi[0]->bsize);
+ if (x->txfm_search_params.tx_mode_search_type != TX_MODE_SELECT ||
+ !block_signals_txsize(bsize))
+ return 0;
+
+ const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+ const int depth = tx_size_to_depth(tx_size, bsize);
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int tx_size_ctx = get_tx_size_context(xd);
+ return x->mode_costs.tx_size_cost[tx_size_cat][tx_size_ctx][depth];
+}
+
+/*!\brief Compute the pixel domain distortion.
+ *
+ * \ingroup transform_search
+ * Compute the pixel domain distortion from diff on all visible 4x4s in the
+ * transform block.
+ *
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] plane Plane index
+ * \param[in] blk_row Block row index
+ * \param[in] blk_col Block col index
+ * \param[in] plane_bsize Current plane block size
+ * \param[in] tx_bsize Transform size
+ * \param[out]   block_mse_q8    Output block MSE (Q8 precision)
+ * \return An int64_t value that is the block sse.
+ */
+int64_t av1_pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row,
+ int blk_col, const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize,
+ unsigned int *block_mse_q8);
+
+int64_t av1_estimate_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ BLOCK_SIZE bs, TX_SIZE tx_size);
+
+/*!\brief Transform type search for luma macroblock with fixed transform size.
+ *
+ * \ingroup transform_search
+ * Search for the best transform type and return the transform coefficients RD
+ * cost of current luma macroblock with the given uniform transform size.
+ *
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] rd_stats Pointer to struct to keep track of the RD stats
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ * \param[in] bs Size of the current macroblock
+ * \param[in] tx_size The given transform size
+ * \param[in] ftxs_mode Transform search mode specifying desired speed
+ and quality tradeoff
+ * \param[in] skip_trellis Binary flag indicating if trellis optimization
+ should be skipped
+ * \return An int64_t value that is the best RD cost found.
+ */
+int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ BLOCK_SIZE bs, TX_SIZE tx_size,
+ FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis);
+
+/*!\brief Recursive transform size and type search.
+ *
+ * \ingroup transform_search
+ * Search for best transform size and type for luma inter blocks. The transform
+ * block partitioning can be recursive resulting in non-uniform transform sizes.
+ * The best transform size and type, if found, will be saved in the MB_MODE_INFO
+ * structure, and the corresponding RD stats will be saved in rd_stats.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] rd_stats Pointer to struct to keep track of the RD stats
+ * \param[in] bsize Current macroblock size
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ * \remark Nothing is returned. The selected transform size and type will
+ be saved in the MB_MODE_INFO structure
+ */
+void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd);
+
+/*!\brief Uniform transform size and type search.
+ *
+ * \ingroup transform_search
+ * Search for the best transform size and type for the current macroblock,
+ * with the assumption that all the transform blocks have a uniform size
+ * (VP9 style). The selected transform size and type will be saved in the
+ * MB_MODE_INFO structure; the corresponding RD stats will be saved in rd_stats.
+ * This function may be used for both intra and inter predicted blocks.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] rd_stats Pointer to struct to keep track of the RD stats
+ * \param[in] bs Current macroblock size
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ * \remark Nothing is returned. The selected transform size and type will
+ be saved in the MB_MODE_INFO structure
+ */
+void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bs,
+ int64_t ref_best_rd);
+
+/*!\brief Chroma block transform search.
+ *
+ * \ingroup transform_search
+ * Calculate the transform coefficient RD cost for the given chroma macroblock
+ * If the current mode is intra, then this function will compute the predictor.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] rd_stats Pointer to struct to keep track of the RD stats
+ * \param[in] bsize Current macroblock size
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ * \return An integer value is returned. 0: early termination triggered,
+ no valid rd cost available; 1: rd cost values are valid.
+ */
+int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
+ BLOCK_SIZE bsize, int64_t ref_best_rd);
+
+/*!\brief Transform type search with fixed transform size.
+ *
+ * \ingroup transform_search
+ * Search for the best transform type and calculate the transform coefficients
+ * RD cost of the current transform block with the specified (uniform) transform
+ * size and plane. The RD results will be saved in rd_stats.
+ *
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] rd_stats Pointer to struct to keep track of the RD stats
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ * \param[in] current_rd Current RD cost for this block so far
+ * \param[in] plane Plane index
+ * \param[in] plane_bsize Size of the current macroblock considering
+                               sub-sampling
+ * \param[in] tx_size The given transform size
+ * \param[in] ftxs_mode Transform search mode specifying desired speed
+ and quality tradeoff
+ * \param[in] skip_trellis Binary flag indicating if trellis optimization
+ should be skipped
+ *
+ * \remark Nothing is returned. The RD results will be saved in rd_stats.
+ */
+void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ int64_t current_rd, int plane, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode,
+ int skip_trellis);
+
+/*!\brief Recursive transform size and type search.
+ *
+ * \ingroup transform_search
+ * This function combines y and uv planes' transform search processes together
+ * for inter-predicted blocks (including IntraBC), when the prediction is
+ * already generated. It first does subtraction to obtain the prediction error.
+ * Then it calls
+ * av1_pick_recursive_tx_size_type_yrd/av1_pick_uniform_tx_size_type_yrd and
+ * av1_txfm_uvrd sequentially and handles possible early terminations.
+ * The RD metrics are calculated and stored in rd_stats/_y/_uv.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] bsize Current macroblock size
+ * \param[in]    rd_stats       Pointer to struct to keep track of the overall RD
+ stats
+ * \param[in] rd_stats_y Pointer to struct to keep track of the RD
+ stats for the luma plane
+ * \param[in] rd_stats_uv Pointer to struct to keep track of the RD
+ stats for the chroma planes
+ * \param[in] mode_rate Rate cost to encode the prediction mode info. of
+ the current macroblock
+ * \param[in] ref_best_rd Best RD cost seen for this block so far
+ *
+ * \return An integer value is returned indicating if a valid transform
+ candidate is found (1) or not (0).
+ */
+int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_
diff --git a/third_party/aom/av1/encoder/txb_rdopt.c b/third_party/aom/av1/encoder/txb_rdopt.c
new file mode 100644
index 0000000000..e551e8aa12
--- /dev/null
+++ b/third_party/aom/av1/encoder/txb_rdopt.c
@@ -0,0 +1,659 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/txb_rdopt.h"
+#include "av1/encoder/txb_rdopt_utils.h"
+
+#include "av1/common/idct.h"
+
+static INLINE void update_coeff_general(
+ int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size,
+ TX_CLASS tx_class, int bhl, int width, int64_t rdmult, int shift,
+ int dc_sign_ctx, const int16_t *dequant, const int16_t *scan,
+ const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels,
+ const qm_val_t *iqmatrix, const qm_val_t *qmatrix) {
+ const int dqv = get_dqv(dequant, scan[si], iqmatrix);
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const int is_last = si == (eob - 1);
+ const int coeff_ctx = get_lower_levels_ctx_general(
+ is_last, si, bhl, width, levels, ci, tx_size, tx_class);
+ if (qc == 0) {
+ *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ const int sign = (qc < 0) ? 1 : 0;
+ const tran_low_t abs_qc = abs(qc);
+ const tran_low_t tqc = tcoeff[ci];
+ const tran_low_t dqc = dqcoeff[ci];
+ const int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci);
+ const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
+ const int rate =
+ get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx,
+ dc_sign_ctx, txb_costs, bhl, tx_class, levels);
+ const int64_t rd = RDCOST(rdmult, rate, dist);
+
+ tran_low_t qc_low, dqc_low;
+ tran_low_t abs_qc_low;
+ int64_t dist_low, rd_low;
+ int rate_low;
+ if (abs_qc == 1) {
+ abs_qc_low = qc_low = dqc_low = 0;
+ dist_low = dist0;
+ rate_low = txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+ abs_qc_low = abs_qc - 1;
+ dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci);
+ rate_low =
+ get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
+ dc_sign_ctx, txb_costs, bhl, tx_class, levels);
+ }
+
+ rd_low = RDCOST(rdmult, rate_low, dist_low);
+ if (rd_low < rd) {
+ qcoeff[ci] = qc_low;
+ dqcoeff[ci] = dqc_low;
+ levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX);
+ *accu_rate += rate_low;
+ *accu_dist += dist_low - dist0;
+ } else {
+ *accu_rate += rate;
+ *accu_dist += dist - dist0;
+ }
+ }
+}
+
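+// Faster variant for coefficients that are neither the DC nor the last
+// nonzero one. If the dequantized value already undershoots the original
+// coefficient, lowering the level cannot reduce distortion, so the current
+// level is kept; otherwise the lowered level is tried as above. Only the
+// rate is accumulated here.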
+static AOM_FORCE_INLINE void update_coeff_simple(
+ int *accu_rate, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class,
+ int bhl, int64_t rdmult, int shift, const int16_t *dequant,
+ const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs,
+ const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ uint8_t *levels, const qm_val_t *iqmatrix, const qm_val_t *qmatrix) {
+ const int dqv = get_dqv(dequant, scan[si], iqmatrix);
+ (void)eob;
+ // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
+ // and not the last (scan_idx != eob - 1)
+ assert(si != eob - 1);
+ assert(si > 0);
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const int coeff_ctx =
+ get_lower_levels_ctx(levels, ci, bhl, tx_size, tx_class);
+ if (qc == 0) {
+ *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ const tran_low_t abs_qc = abs(qc);
+ const tran_low_t abs_tqc = abs(tcoeff[ci]);
+ const tran_low_t abs_dqc = abs(dqcoeff[ci]);
+ int rate_low = 0;
+ const int rate = get_two_coeff_cost_simple(
+ ci, abs_qc, coeff_ctx, txb_costs, bhl, tx_class, levels, &rate_low);
+ if (abs_dqc < abs_tqc) {
+ *accu_rate += rate;
+ return;
+ }
+
+ const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift, qmatrix, ci);
+ const int64_t rd = RDCOST(rdmult, rate, dist);
+
+ const tran_low_t abs_qc_low = abs_qc - 1;
+ const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
+ const int64_t dist_low =
+ get_coeff_dist(abs_tqc, abs_dqc_low, shift, qmatrix, ci);
+ const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+
+ if (rd_low < rd) {
+ const int sign = (qc < 0) ? 1 : 0;
+ qcoeff[ci] = (-sign ^ abs_qc_low) + sign;
+ dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign;
+ levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX);
+ *accu_rate += rate_low;
+ } else {
+ *accu_rate += rate;
+ }
+ }
+}
+
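+// Same decision as above, but additionally evaluates truncating the block at
+// this position: making this coefficient the new last nonzero one (new eob)
+// and zeroing the coefficients coded after it, whenever that lowers the
+// overall RD cost.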
+static AOM_FORCE_INLINE void update_coeff_eob(
+ int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci,
+ int si, TX_SIZE tx_size, TX_CLASS tx_class, int bhl, int width,
+ int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant,
+ const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs,
+ const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness,
+ const qm_val_t *iqmatrix, const qm_val_t *qmatrix) {
+ const int dqv = get_dqv(dequant, scan[si], iqmatrix);
+ assert(si != *eob - 1);
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const int coeff_ctx =
+ get_lower_levels_ctx(levels, ci, bhl, tx_size, tx_class);
+ if (qc == 0) {
+ *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ int lower_level = 0;
+ const tran_low_t abs_qc = abs(qc);
+ const tran_low_t tqc = tcoeff[ci];
+ const tran_low_t dqc = dqcoeff[ci];
+ const int sign = (qc < 0) ? 1 : 0;
+ const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
+ int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci) - dist0;
+ int rate =
+ get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx,
+ txb_costs, bhl, tx_class, levels);
+ int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);
+
+ tran_low_t qc_low, dqc_low;
+ tran_low_t abs_qc_low;
+ int64_t dist_low, rd_low;
+ int rate_low;
+
+ if (abs_qc == 1) {
+ abs_qc_low = 0;
+ dqc_low = qc_low = 0;
+ dist_low = 0;
+ rate_low = txb_costs->base_cost[coeff_ctx][0];
+ rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist);
+ } else {
+ get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+ abs_qc_low = abs_qc - 1;
+ dist_low = get_coeff_dist(tqc, dqc_low, shift, qmatrix, ci) - dist0;
+ rate_low =
+ get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx,
+ dc_sign_ctx, txb_costs, bhl, tx_class, levels);
+ rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
+ }
+
+ int lower_level_new_eob = 0;
+ const int new_eob = si + 1;
+ const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bhl, width, si);
+ const int new_eob_cost =
+ get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class);
+ int rate_coeff_eob =
+ new_eob_cost + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx_new_eob,
+ dc_sign_ctx, txb_costs, bhl,
+ tx_class);
+ int64_t dist_new_eob = dist;
+ int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob);
+
+ if (abs_qc_low > 0) {
+ const int rate_coeff_eob_low =
+ new_eob_cost + get_coeff_cost_eob(ci, abs_qc_low, sign,
+ coeff_ctx_new_eob, dc_sign_ctx,
+ txb_costs, bhl, tx_class);
+ const int64_t dist_new_eob_low = dist_low;
+ const int64_t rd_new_eob_low =
+ RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low);
+ if (rd_new_eob_low < rd_new_eob) {
+ lower_level_new_eob = 1;
+ rd_new_eob = rd_new_eob_low;
+ rate_coeff_eob = rate_coeff_eob_low;
+ dist_new_eob = dist_new_eob_low;
+ }
+ }
+
+ if (sharpness == 0 || abs_qc > 1) {
+ if (rd_low < rd) {
+ lower_level = 1;
+ rd = rd_low;
+ rate = rate_low;
+ dist = dist_low;
+ }
+ }
+
+ if (sharpness == 0 && rd_new_eob < rd) {
+ for (int ni = 0; ni < *nz_num; ++ni) {
+ int last_ci = nz_ci[ni];
+ levels[get_padded_idx(last_ci, bhl)] = 0;
+ qcoeff[last_ci] = 0;
+ dqcoeff[last_ci] = 0;
+ }
+ *eob = new_eob;
+ *nz_num = 0;
+ *accu_rate = rate_coeff_eob;
+ *accu_dist = dist_new_eob;
+ lower_level = lower_level_new_eob;
+ } else {
+ *accu_rate += rate;
+ *accu_dist += dist;
+ }
+
+ if (lower_level) {
+ qcoeff[ci] = qc_low;
+ dqcoeff[ci] = dqc_low;
+ levels[get_padded_idx(ci, bhl)] = AOMMIN(abs_qc_low, INT8_MAX);
+ }
+ if (qcoeff[ci]) {
+ nz_ci[*nz_num] = ci;
+ ++*nz_num;
+ }
+ }
+}
+
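+// Final all-or-nothing decision: if signaling the transform block as skipped
+// is cheaper in RD terms than coding the remaining nonzero coefficients,
+// zero them out and reset the eob to 0.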
+static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob,
+ int nz_num, int *nz_ci, int64_t rdmult,
+ int skip_cost, int non_skip_cost,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff) {
+ const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist);
+ const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0);
+ if (rd_new_eob < rd) {
+ for (int i = 0; i < nz_num; ++i) {
+ const int ci = nz_ci[i];
+ qcoeff[ci] = 0;
+ dqcoeff[ci] = 0;
+ // no need to set up levels because this is the last step
+ // levels[get_padded_idx(ci, bhl)] = 0;
+ }
+ *accu_rate = 0;
+ *eob = 0;
+ }
+}
+
+// TODO(angiebird): use this function whenever possible
+static int get_tx_type_cost(const MACROBLOCK *x, const MACROBLOCKD *xd,
+ int plane, TX_SIZE tx_size, TX_TYPE tx_type,
+ int reduced_tx_set_used) {
+ if (plane > 0) return 0;
+
+ const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const int is_inter = is_inter_block(mbmi);
+ if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
+ !xd->lossless[xd->mi[0]->segment_id]) {
+ const int ext_tx_set =
+ get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
+ if (is_inter) {
+ if (ext_tx_set > 0)
+ return x->mode_costs
+ .inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type];
+ } else {
+ if (ext_tx_set > 0) {
+ PREDICTION_MODE intra_dir;
+ if (mbmi->filter_intra_mode_info.use_filter_intra)
+ intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info
+ .filter_intra_mode];
+ else
+ intra_dir = mbmi->mode;
+ return x->mode_costs.intra_tx_type_costs[ext_tx_set][square_tx_size]
+ [intra_dir][tx_type];
+ }
+ }
+ }
+ return 0;
+}
+
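+// Level-map trellis optimization over one transform block. The pass below:
+//   1. costs the last nonzero coefficient (si == eob - 1),
+//   2. walks down in reverse scan order with update_coeff_eob while a
+//      shorter eob is still plausible (nz_num <= max_nz_num),
+//   3. optionally applies the all-skip decision (update_skip),
+//   4. finishes the remaining AC coefficients with update_coeff_simple,
+//   5. handles the DC coefficient with update_coeff_general.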
+int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int *rate_cost,
+ int sharpness) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblock_plane *p = &x->plane[plane];
+ const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+ const int16_t *scan = scan_order->scan;
+ const int shift = av1_get_tx_scale(tx_size);
+ int eob = p->eobs[block];
+ const int16_t *dequant = p->dequant_QTX;
+ const qm_val_t *iqmatrix =
+ av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type);
+ const qm_val_t *qmatrix =
+ cpi->oxcf.tune_cfg.dist_metric == AOM_DIST_METRIC_QM_PSNR
+ ? av1_get_qmatrix(&cpi->common.quant_params, xd, plane, tx_size,
+ tx_type)
+ : NULL;
+ const int block_offset = BLOCK_OFFSET(block);
+ tran_low_t *qcoeff = p->qcoeff + block_offset;
+ tran_low_t *dqcoeff = p->dqcoeff + block_offset;
+ const tran_low_t *tcoeff = p->coeff + block_offset;
+ const CoeffCosts *coeff_costs = &x->coeff_costs;
+
+ // This function is not called if eob = 0.
+ assert(eob > 0);
+
+ const AV1_COMMON *cm = &cpi->common;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const int bhl = get_txb_bhl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ assert(height == (1 << bhl));
+ const int is_inter = is_inter_block(mbmi);
+ const LV_MAP_COEFF_COST *txb_costs =
+ &coeff_costs->coeff_costs[txs_ctx][plane_type];
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const LV_MAP_EOB_COST *txb_eob_costs =
+ &coeff_costs->eob_costs[eob_multi_size][plane_type];
+
+ const int rshift = 2;
+
+ const int64_t rdmult =
+ (((int64_t)x->rdmult *
+ (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) +
+ 2) >>
+ rshift;
+
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, height);
+
+ if (eob > 1) av1_txb_init_levels(qcoeff, width, height, levels);
+
+ // TODO(angiebird): check iqmatrix
+
+ const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0];
+ const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+ const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class);
+ int accu_rate = eob_cost;
+ int64_t accu_dist = 0;
+ int si = eob - 1;
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const tran_low_t abs_qc = abs(qc);
+ const int sign = qc < 0;
+ const int max_nz_num = 2;
+ int nz_num = 1;
+ int nz_ci[3] = { ci, 0, 0 };
+ if (abs_qc >= 2) {
+ update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class,
+ bhl, width, rdmult, shift, txb_ctx->dc_sign_ctx,
+ dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
+ levels, iqmatrix, qmatrix);
+ --si;
+ } else {
+ assert(abs_qc == 1);
+ const int coeff_ctx = get_lower_levels_ctx_eob(bhl, width, si);
+ accu_rate +=
+ get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx, txb_ctx->dc_sign_ctx,
+ txb_costs, bhl, tx_class);
+ const tran_low_t tqc = tcoeff[ci];
+ const tran_low_t dqc = dqcoeff[ci];
+ const int64_t dist = get_coeff_dist(tqc, dqc, shift, qmatrix, ci);
+ const int64_t dist0 = get_coeff_dist(tqc, 0, shift, qmatrix, ci);
+ accu_dist += dist - dist0;
+ --si;
+ }
+
+#define UPDATE_COEFF_EOB_CASE(tx_class_literal) \
+ case tx_class_literal: \
+ for (; si >= 0 && nz_num <= max_nz_num; --si) { \
+ update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si, \
+ tx_size, tx_class_literal, bhl, width, \
+ txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
+ txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff, \
+ levels, sharpness, iqmatrix, qmatrix); \
+ } \
+ break
+ switch (tx_class) {
+ UPDATE_COEFF_EOB_CASE(TX_CLASS_2D);
+ UPDATE_COEFF_EOB_CASE(TX_CLASS_HORIZ);
+ UPDATE_COEFF_EOB_CASE(TX_CLASS_VERT);
+#undef UPDATE_COEFF_EOB_CASE
+ default: assert(false);
+ }
+
+ if (si == -1 && nz_num <= max_nz_num && sharpness == 0) {
+ update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost,
+ non_skip_cost, qcoeff, dqcoeff);
+ }
+
+#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal) \
+ case tx_class_literal: \
+ for (; si >= 1; --si) { \
+ update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bhl, \
+ rdmult, shift, dequant, scan, txb_costs, tcoeff, \
+ qcoeff, dqcoeff, levels, iqmatrix, qmatrix); \
+ } \
+ break
+ switch (tx_class) {
+ UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D);
+ UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_HORIZ);
+ UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_VERT);
+#undef UPDATE_COEFF_SIMPLE_CASE
+ default: assert(false);
+ }
+
+ // DC position
+ if (si == 0) {
+ // no need to update accu_dist because it's not used after this point
+ int64_t dummy_dist = 0;
+ update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class,
+ bhl, width, rdmult, shift, txb_ctx->dc_sign_ctx,
+ dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
+ levels, iqmatrix, qmatrix);
+ }
+
+ const int tx_type_cost = get_tx_type_cost(x, xd, plane, tx_size, tx_type,
+ cm->features.reduced_tx_set_used);
+ if (eob == 0)
+ accu_rate += skip_cost;
+ else
+ accu_rate += non_skip_cost + tx_type_cost;
+
+ p->eobs[block] = eob;
+ p->txb_entropy_ctx[block] =
+ av1_get_txb_entropy_context(qcoeff, scan_order, p->eobs[block]);
+
+ *rate_cost = accu_rate;
+ return eob;
+}
+
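+// Exact rate computation for a nonzero transform block: costs the last
+// nonzero coefficient with its eob context, then the intermediate
+// coefficients in reverse scan order, and finally the DC coefficient with
+// its sign context.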
+static AOM_FORCE_INLINE int warehouse_efficients_txb(
+ const MACROBLOCK *x, const int plane, const int block,
+ const TX_SIZE tx_size, const TXB_CTX *const txb_ctx,
+ const struct macroblock_plane *p, const int eob,
+ const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
+ const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
+ int reduced_tx_set_used) {
+ const tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+ const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
+ const int bhl = get_txb_bhl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const int16_t *const scan = scan_order->scan;
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, height);
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const LV_MAP_EOB_COST *const eob_costs =
+ &x->coeff_costs.eob_costs[eob_multi_size][plane_type];
+ int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
+
+ av1_txb_init_levels(qcoeff, width, height, levels);
+
+ cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
+
+ cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
+
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
+
+ const int(*lps_cost)[COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1] =
+ coeff_costs->lps_cost;
+ int c = eob - 1;
+ {
+ const int pos = scan[c];
+ const tran_low_t v = qcoeff[pos];
+ const int sign = AOMSIGN(v);
+ const int level = (v ^ sign) - sign;
+ const int coeff_ctx = coeff_contexts[pos];
+ cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1];
+
+ if (v) {
+ // sign bit cost
+ if (level > NUM_BASE_LEVELS) {
+ const int ctx = get_br_ctx_eob(pos, bhl, tx_class);
+ cost += get_br_cost(level, lps_cost[ctx]);
+ }
+ if (c) {
+ cost += av1_cost_literal(1);
+ } else {
+ const int sign01 = (sign ^ sign) - sign;
+ const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+ cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
+ return cost;
+ }
+ }
+ }
+ const int(*base_cost)[8] = coeff_costs->base_cost;
+ for (c = eob - 2; c >= 1; --c) {
+ const int pos = scan[c];
+ const int coeff_ctx = coeff_contexts[pos];
+ const tran_low_t v = qcoeff[pos];
+ const int level = abs(v);
+ cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
+ if (v) {
+ // sign bit cost
+ cost += av1_cost_literal(1);
+ if (level > NUM_BASE_LEVELS) {
+ const int ctx = get_br_ctx(levels, pos, bhl, tx_class);
+ cost += get_br_cost(level, lps_cost[ctx]);
+ }
+ }
+ }
+ // c == 0 after previous loop
+ {
+ const int pos = scan[c];
+ const tran_low_t v = qcoeff[pos];
+ const int coeff_ctx = coeff_contexts[pos];
+ const int sign = AOMSIGN(v);
+ const int level = (v ^ sign) - sign;
+ cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
+
+ if (v) {
+ // sign bit cost
+ const int sign01 = (sign ^ sign) - sign;
+ const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+ cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
+ if (level > NUM_BASE_LEVELS) {
+ const int ctx = get_br_ctx(levels, pos, bhl, tx_class);
+ cost += get_br_cost(level, lps_cost[ctx]);
+ }
+ }
+ }
+ return cost;
+}
+
+int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
+ const int block, const TX_SIZE tx_size,
+ const TX_TYPE tx_type) {
+ assert(plane == 0);
+
+ int cost = 0;
+ const struct macroblock_plane *p = &x->plane[plane];
+ const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+ const int16_t *scan = scan_order->scan;
+ tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+
+ int eob = p->eobs[block];
+
+ // coeffs
+ int c = eob - 1;
+ // eob
+ {
+ const int pos = scan[c];
+ const tran_low_t v = abs(qcoeff[pos]) - 1;
+ cost += (v << (AV1_PROB_COST_SHIFT + 2));
+ }
+ // other coeffs
+ for (c = eob - 2; c >= 0; c--) {
+ const int pos = scan[c];
+ const tran_low_t v = abs(qcoeff[pos]);
+ const int idx = AOMMIN(v, 14);
+
+ cost += costLUT[idx];
+ }
+
+ // const_term is not applied to the DC coefficient, and loge_par is not
+ // applied to the eob coefficient, so both are scaled by (eob - 1).
+ cost += (const_term + loge_par) * (eob - 1);
+
+ return cost;
+}
+
+static AOM_FORCE_INLINE int warehouse_efficients_txb_laplacian(
+ const MACROBLOCK *x, const int plane, const int block,
+ const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, const int eob,
+ const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
+ const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
+ int reduced_tx_set_used) {
+ const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
+
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const LV_MAP_EOB_COST *const eob_costs =
+ &x->coeff_costs.eob_costs[eob_multi_size][plane_type];
+ int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
+
+ cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
+
+ cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
+
+ cost += av1_cost_coeffs_txb_estimate(x, plane, block, tx_size, tx_type);
+ return cost;
+}
+
+int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
+ const TX_SIZE tx_size, const TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int reduced_tx_set_used) {
+ const struct macroblock_plane *p = &x->plane[plane];
+ const int eob = p->eobs[block];
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const LV_MAP_COEFF_COST *const coeff_costs =
+ &x->coeff_costs.coeff_costs[txs_ctx][plane_type];
+ if (eob == 0) {
+ return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+ }
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+
+ return warehouse_efficients_txb(x, plane, block, tx_size, txb_ctx, p, eob,
+ plane_type, coeff_costs, xd, tx_type,
+ tx_class, reduced_tx_set_used);
+}
+
+int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
+ const int block, const TX_SIZE tx_size,
+ const TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx,
+ const int reduced_tx_set_used,
+ const int adjust_eob) {
+ const struct macroblock_plane *p = &x->plane[plane];
+ int eob = p->eobs[block];
+
+ if (adjust_eob) {
+ const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+ const int16_t *scan = scan_order->scan;
+ tran_low_t *tcoeff = p->coeff + BLOCK_OFFSET(block);
+ tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+ tran_low_t *dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+ update_coeff_eob_fast(&eob, av1_get_tx_scale(tx_size), p->dequant_QTX, scan,
+ tcoeff, qcoeff, dqcoeff);
+ p->eobs[block] = eob;
+ }
+
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const LV_MAP_COEFF_COST *const coeff_costs =
+ &x->coeff_costs.coeff_costs[txs_ctx][plane_type];
+ if (eob == 0) {
+ return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+ }
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+
+ return warehouse_efficients_txb_laplacian(
+ x, plane, block, tx_size, txb_ctx, eob, plane_type, coeff_costs, xd,
+ tx_type, tx_class, reduced_tx_set_used);
+}
diff --git a/third_party/aom/av1/encoder/txb_rdopt.h b/third_party/aom/av1/encoder/txb_rdopt.h
new file mode 100644
index 0000000000..70b322a2e1
--- /dev/null
+++ b/third_party/aom/av1/encoder/txb_rdopt.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TXB_RDOPT_H_
+#define AOM_AV1_ENCODER_TXB_RDOPT_H_
+
+#include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Adjust the magnitude of quantized coefficients to achieve better
+ * rate-distortion (RD) trade-off.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function goes through each coefficient and greedily chooses whether
+ * to lower the coefficient magnitude by 1 based on the RD score.
+ *
+ * The coefficients are processed in reverse scan order.
+ *
+ * Note that the end of block position (eob) may change if the original last
+ * coefficient is lowered to zero.
+ *
+ * \param[in] cpi Top-level encoder structure
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] plane The index of the current plane
+ * \param[in] block The index of the current transform block in the
+ * macroblock
+ * \param[in] tx_size The transform size
+ * \param[in] tx_type The transform type
+ * \param[in] txb_ctx Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[out] rate_cost The entropy cost of coding the transform block
+ * after adjustment of coefficients.
+ * \param[in] sharpness When sharpness > 0, the function will be less
+ * aggressive towards lowering the magnitude of coefficients.
+ * In this way, the transform block will contain more high-frequency
+ * coefficients and therefore will preserve the sharpness of the reconstructed
+ * block.
+ *
+ * \return The new end of block position (eob) after the adjustment.
+ */
+int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int *rate_cost,
+ int sharpness);
+
+/*!\brief Compute the entropy cost of coding coefficients in a transform block.
+ *
+ * \ingroup coefficient_coding
+ *
+ * \param[in] x Pointer to structure holding the data for
+ the current encoding macroblock.
+ * \param[in] plane The index of the current plane.
+ * \param[in] block The index of the current transform block
+ in the
+ * macroblock. It is defined as the number of 4x4 units that have been coded
+ * before the current transform block.
+ * \param[in] tx_size The transform size.
+ * \param[in] tx_type The transform type.
+ * \param[in] txb_ctx Context info for entropy coding transform
+ block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in] reduced_tx_set_used Whether the transform type is chosen from
+ * a reduced set.
+ *
+ * \return The entropy cost of coding the transform block.
+ */
+int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
+ const TX_SIZE tx_size, const TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int reduced_tx_set_used);
+
+/*!\brief Estimate the entropy cost of coding a transform block using Laplacian
+ * distribution.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function computes the entropy costs of the end of block position (eob)
+ * and the transform type (tx_type) precisely.
+ *
+ * It then uses \ref av1_cost_coeffs_txb_estimate to estimate the entropy
+ * costs of the coefficients in the transform block.
+ *
+ * In the end, the function returns the sum of entropy costs of end of block
+ * position (eob), transform type (tx_type) and coefficients.
+ *
+ * Compared to \ref av1_cost_coeffs_txb, this function is much faster but less
+ * accurate.
+ *
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] plane The index of the current plane
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It is defined as the number of 4x4 units that have been coded
+ * before the current transform block
+ * \param[in] tx_size The transform size
+ * \param[in] tx_type The transform type
+ * \param[in] txb_ctx Context info for entropy coding transform block
+ * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
+ * \param[in] reduced_tx_set_used Whether the transform type is chosen from
+ * a reduced set.
+ * \param[in] adjust_eob Whether to adjust the end of block position
+ (eob)
+ * or not.
+ * \return int Estimated entropy cost of coding the transform
+ block.
+ */
+int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
+ const int block, const TX_SIZE tx_size,
+ const TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx,
+ const int reduced_tx_set_used,
+ const int adjust_eob);
+
+/*!\brief Estimate the entropy cost of transform coefficients using Laplacian
+ * distribution.
+ *
+ * \ingroup coefficient_coding
+ *
+ * This function assumes each transform coefficient is of its own Laplacian
+ * distribution and the coefficient is the only observation of the Laplacian
+ * distribution.
+ *
+ * Based on that, each coefficient's coding cost can be estimated by computing
+ * the entropy of the corresponding Laplacian distribution.
+ *
+ * This function then returns the sum of the estimated entropy costs of all
+ * coefficients in the transform block.
+ *
+ * Note that the entropy costs of the end of block position (eob) and the
+ * transform type (tx_type) are not included.
+ *
+ * \param[in] x Pointer to structure holding the data for the
+ current encoding macroblock
+ * \param[in] plane The index of the current plane
+ * \param[in] block The index of the current transform block in the
+ * macroblock. It is defined as the number of 4x4 units that have been coded
+ * before the current transform block
+ * \param[in] tx_size The transform size
+ * \param[in] tx_type The transform type
+ * \return int Estimated entropy cost of coefficients in the
+ * transform block.
+ */
+int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
+ const int block, const TX_SIZE tx_size,
+ const TX_TYPE tx_type);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_TXB_RDOPT_H_
diff --git a/third_party/aom/av1/encoder/txb_rdopt_utils.h b/third_party/aom/av1/encoder/txb_rdopt_utils.h
new file mode 100644
index 0000000000..b9f08aacf0
--- /dev/null
+++ b/third_party/aom/av1/encoder/txb_rdopt_utils.h
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_
+#define AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_
+
+#include "av1/encoder/encodetxb.h"
+
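+// Precomputed Exp-Golomb suffix costs in av1_cost_literal units (512 per
+// bit); entry r holds 512 * (2 * (get_msb(r) + 1) - 1), matching
+// get_golomb_cost below for r in [1, 31].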
+static const int golomb_bits_cost[32] = {
+ 0, 512, 512 * 3, 512 * 3, 512 * 5, 512 * 5, 512 * 5, 512 * 5,
+ 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7,
+ 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9,
+ 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9
+};
+
+static const int golomb_cost_diff[32] = {
+ 0, 512, 512 * 2, 0, 512 * 2, 0, 0, 0, 512 * 2, 0, 0, 0, 0, 0, 0, 0,
+ 512 * 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+// Lookup table of the individual cost of a coefficient by its quantization
+// level, determined from a Laplacian distribution conditioned on the
+// estimated context.
+static const int costLUT[15] = { -1143, 53, 545, 825, 1031,
+ 1209, 1393, 1577, 1763, 1947,
+ 2132, 2317, 2501, 2686, 2871 };
+
+static const int const_term = (1 << AV1_PROB_COST_SHIFT);
+
+static const int loge_par = ((14427 << AV1_PROB_COST_SHIFT) + 5000) / 10000;
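+// Together with costLUT above, these constants model the estimated cost of a
+// single coefficient; av1_cost_coeffs_txb_estimate adds
+// (const_term + loge_par) scaled by (eob - 1) on top of the per-level costLUT
+// costs.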
+
+static INLINE int get_dqv(const int16_t *dequant, int coeff_idx,
+ const qm_val_t *iqmatrix) {
+ int dqv = dequant[!!coeff_idx];
+ if (iqmatrix != NULL)
+ dqv =
+ ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ return dqv;
+}
+
+static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
+ int shift, const qm_val_t *qmatrix,
+ int coeff_idx) {
+ int64_t diff = (tcoeff - dqcoeff) * (1 << shift);
+ if (qmatrix == NULL) {
+ return diff * diff;
+ }
+ // When AOM_DIST_METRIC_QM_PSNR is enabled, this mirrors the rate-distortion
+ // computation done in av1_block_error_qm, improving visual quality.
+ // The maximum value of `shift` is 2, `tcoeff` and `dqcoeff` are at most 22
+ // bits, and AOM_QM_BITS is 5, so `diff` should fit in 29-bits. The
+ // multiplication `diff * diff` then does not risk overflowing.
+ diff *= qmatrix[coeff_idx];
+ const int64_t error =
+ (diff * diff + (1 << (2 * AOM_QM_BITS - 1))) >> (2 * AOM_QM_BITS);
+ return error;
+}
+
+static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs,
+ const LV_MAP_COEFF_COST *txb_costs, TX_CLASS tx_class) {
+ int eob_extra;
+ const int eob_pt = av1_get_eob_pos_token(eob, &eob_extra);
+ int eob_cost = 0;
+ const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+ eob_cost = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1];
+
+ if (av1_eob_offset_bits[eob_pt] > 0) {
+ const int eob_ctx = eob_pt - 3;
+ const int eob_shift = av1_eob_offset_bits[eob_pt] - 1;
+ const int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+ eob_cost += txb_costs->eob_extra_cost[eob_ctx][bit];
+ const int offset_bits = av1_eob_offset_bits[eob_pt];
+ if (offset_bits > 1) eob_cost += av1_cost_literal(offset_bits - 1);
+ }
+ return eob_cost;
+}
+
+static INLINE int get_golomb_cost(int abs_qc) {
+ if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
+ const int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
+ const int length = get_msb(r) + 1;
+ return av1_cost_literal(2 * length - 1);
+ }
+ return 0;
+}
+
+static INLINE int get_br_cost(tran_low_t level, const int *coeff_lps) {
+ const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+ return coeff_lps[base_range] + get_golomb_cost(level);
+}
+
+static INLINE int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps,
+ int *diff) {
+ const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+ int golomb_bits = 0;
+ if (level <= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS)
+ *diff += coeff_lps[base_range + COEFF_BASE_RANGE + 1];
+
+ if (level >= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) {
+ int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
+ if (r < 32) {
+ golomb_bits = golomb_bits_cost[r];
+ *diff += golomb_cost_diff[r];
+ } else {
+ golomb_bits = get_golomb_cost(level);
+ *diff += (r & (r - 1)) == 0 ? 1024 : 0;
+ }
+ }
+
+ return coeff_lps[base_range] + golomb_bits;
+}
+
+static AOM_FORCE_INLINE int get_two_coeff_cost_simple(
+ int ci, tran_low_t abs_qc, int coeff_ctx,
+ const LV_MAP_COEFF_COST *txb_costs, int bhl, TX_CLASS tx_class,
+ const uint8_t *levels, int *cost_low) {
+ // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
+ // and not the last (scan_idx != eob - 1)
+ assert(ci > 0);
+ int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+ int diff = 0;
+ if (abs_qc <= 3) diff = txb_costs->base_cost[coeff_ctx][abs_qc + 4];
+ if (abs_qc) {
+ cost += av1_cost_literal(1);
+ if (abs_qc > NUM_BASE_LEVELS) {
+ const int br_ctx = get_br_ctx(levels, ci, bhl, tx_class);
+ int brcost_diff = 0;
+ cost += get_br_cost_with_diff(abs_qc, txb_costs->lps_cost[br_ctx],
+ &brcost_diff);
+ diff += brcost_diff;
+ }
+ }
+ *cost_low = cost - diff;
+
+ return cost;
+}
+
+static INLINE int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign,
+ int coeff_ctx, int dc_sign_ctx,
+ const LV_MAP_COEFF_COST *txb_costs,
+ int bhl, TX_CLASS tx_class) {
+ int cost = 0;
+ cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
+ if (abs_qc != 0) {
+ if (ci == 0) {
+ cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign];
+ } else {
+ cost += av1_cost_literal(1);
+ }
+ if (abs_qc > NUM_BASE_LEVELS) {
+ int br_ctx;
+ br_ctx = get_br_ctx_eob(ci, bhl, tx_class);
+ cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
+ }
+ }
+ return cost;
+}
+
+static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc,
+ int sign, int coeff_ctx,
+ int dc_sign_ctx,
+ const LV_MAP_COEFF_COST *txb_costs,
+ int bhl, TX_CLASS tx_class,
+ const uint8_t *levels) {
+ int cost = 0;
+ if (is_last) {
+ cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
+ } else {
+ cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+ }
+ if (abs_qc != 0) {
+ if (ci == 0) {
+ cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign];
+ } else {
+ cost += av1_cost_literal(1);
+ }
+ if (abs_qc > NUM_BASE_LEVELS) {
+ int br_ctx;
+ if (is_last)
+ br_ctx = get_br_ctx_eob(ci, bhl, tx_class);
+ else
+ br_ctx = get_br_ctx(levels, ci, bhl, tx_class);
+ cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
+ }
+ }
+ return cost;
+}
+
+static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv,
+ int shift, tran_low_t *qc_low,
+ tran_low_t *dqc_low) {
+ tran_low_t abs_qc_low = abs_qc - 1;
+ *qc_low = (-sign ^ abs_qc_low) + sign;
+ assert((sign ? -abs_qc_low : abs_qc_low) == *qc_low);
+ tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
+ *dqc_low = (-sign ^ abs_dqc_low) + sign;
+ assert((sign ? -abs_dqc_low : abs_dqc_low) == *dqc_low);
+}
+
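+// Fast eob trimming: walk back from the current eob and zero trailing
+// coefficients that either quantized to zero or whose scaled magnitude
+// (abs_coeff << (1 + shift)) falls below an approximate zero-bin of
+// dequant * 198/128, stopping at the first coefficient that is kept.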
+static INLINE void update_coeff_eob_fast(int *eob, int shift,
+ const int16_t *dequant_ptr,
+ const int16_t *scan,
+ const tran_low_t *coeff_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr) {
+ // TODO(sarahparker) make this work for aomqm
+ int eob_out = *eob;
+ int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7),
+ dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) };
+
+ for (int i = *eob - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) {
+ eob_out--;
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ } else {
+ break;
+ }
+ }
+
+ *eob = eob_out;
+}
+#endif // AOM_AV1_ENCODER_TXB_RDOPT_UTILS_H_
diff --git a/third_party/aom/av1/encoder/var_based_part.c b/third_party/aom/av1/encoder/var_based_part.c
new file mode 100644
index 0000000000..f664795153
--- /dev/null
+++ b/third_party/aom/av1/encoder/var_based_part.c
@@ -0,0 +1,1914 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/rdopt_utils.h"
+
+// Possible values for the force_split variable while evaluating
+// variance-based partitioning.
+enum {
+ // Evaluate all partition types
+ PART_EVAL_ALL = 0,
+ // Force PARTITION_SPLIT
+ PART_EVAL_ONLY_SPLIT = 1,
+ // Force PARTITION_NONE
+ PART_EVAL_ONLY_NONE = 2
+} UENUM1BYTE(PART_EVAL_STATUS);
+
+typedef struct {
+ VPVariance *part_variances;
+ VPartVar *split[4];
+} variance_node;
+
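+// Collapse a size-specific variance tree node (VP128x128 down to VP4x4) into
+// a generic variance_node view: the whole-block variances plus the four
+// split children's variances.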
+static AOM_INLINE void tree_to_node(void *data, BLOCK_SIZE bsize,
+ variance_node *node) {
+ node->part_variances = NULL;
+ switch (bsize) {
+ case BLOCK_128X128: {
+ VP128x128 *vt = (VP128x128 *)data;
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx].part_variances.none;
+ break;
+ }
+ case BLOCK_64X64: {
+ VP64x64 *vt = (VP64x64 *)data;
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx].part_variances.none;
+ break;
+ }
+ case BLOCK_32X32: {
+ VP32x32 *vt = (VP32x32 *)data;
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx].part_variances.none;
+ break;
+ }
+ case BLOCK_16X16: {
+ VP16x16 *vt = (VP16x16 *)data;
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx].part_variances.none;
+ break;
+ }
+ case BLOCK_8X8: {
+ VP8x8 *vt = (VP8x8 *)data;
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx].part_variances.none;
+ break;
+ }
+ default: {
+ VP4x4 *vt = (VP4x4 *)data;
+ assert(bsize == BLOCK_4X4);
+ node->part_variances = &vt->part_variances;
+ for (int split_idx = 0; split_idx < 4; split_idx++)
+ node->split[split_idx] = &vt->split[split_idx];
+ break;
+ }
+ }
+}
+
+// Set variance values given sum square error, sum error, count.
+static AOM_INLINE void fill_variance(uint32_t s2, int32_t s, int c,
+ VPartVar *v) {
+ v->sum_square_error = s2;
+ v->sum_error = s;
+ v->log2_count = c;
+}
+
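+// Finalize a VPartVar from its accumulated sums:
+//   variance = 256 * (sum_square_error - sum_error^2 / 2^log2_count)
+//                  / 2^log2_count,
+// i.e. 256 * (E[x^2] - E[x]^2) over the 2^log2_count samples.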
+static AOM_INLINE void get_variance(VPartVar *v) {
+ v->variance =
+ (int)(256 * (v->sum_square_error -
+ (uint32_t)(((int64_t)v->sum_error * v->sum_error) >>
+ v->log2_count)) >>
+ v->log2_count);
+}
+
+static AOM_INLINE void sum_2_variances(const VPartVar *a, const VPartVar *b,
+ VPartVar *r) {
+ assert(a->log2_count == b->log2_count);
+ fill_variance(a->sum_square_error + b->sum_square_error,
+ a->sum_error + b->sum_error, a->log2_count + 1, r);
+}
+
+static AOM_INLINE void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
+ variance_node node;
+ memset(&node, 0, sizeof(node));
+ tree_to_node(data, bsize, &node);
+ sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]);
+ sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]);
+ sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]);
+ sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]);
+ sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1],
+ &node.part_variances->none);
+}
+
+static AOM_INLINE void set_block_size(AV1_COMP *const cpi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ if (cpi->common.mi_params.mi_cols > mi_col &&
+ cpi->common.mi_params.mi_rows > mi_row) {
+ CommonModeInfoParams *mi_params = &cpi->common.mi_params;
+ const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
+ const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col);
+ MB_MODE_INFO *mi = mi_params->mi_grid_base[mi_grid_idx] =
+ &mi_params->mi_alloc[mi_alloc_idx];
+ mi->bsize = bsize;
+ }
+}
+
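+// Try to terminate partitioning at `bsize` for the block at (mi_row, mi_col):
+// returns 1 (writing the chosen size into the mi grid) when force_split
+// allows it and the none or vert/horz variances pass `threshold`, subject to
+// tile-boundary checks; returns 0 to request a further four-way split.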
+static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCKD *const xd,
+ const TileInfo *const tile, void *data,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int64_t threshold, BLOCK_SIZE bsize_min,
+ PART_EVAL_STATUS force_split) {
+ AV1_COMMON *const cm = &cpi->common;
+ variance_node vt;
+ const int block_width = mi_size_wide[bsize];
+ const int block_height = mi_size_high[bsize];
+ int bs_width_check = block_width;
+ int bs_height_check = block_height;
+ int bs_width_vert_check = block_width >> 1;
+ int bs_height_horiz_check = block_height >> 1;
+ // On the right and bottom boundaries we only need to check
+ // whether half the bsize fits, because the boundary is extended
+ // up to 64. So do this check only for sb_size = 64X64.
+ if (cm->seq_params->sb_size == BLOCK_64X64) {
+ if (tile->mi_col_end == cm->mi_params.mi_cols) {
+ bs_width_check = (block_width >> 1) + 1;
+ bs_width_vert_check = (block_width >> 2) + 1;
+ }
+ if (tile->mi_row_end == cm->mi_params.mi_rows) {
+ bs_height_check = (block_height >> 1) + 1;
+ bs_height_horiz_check = (block_height >> 2) + 1;
+ }
+ }
+
+ assert(block_height == block_width);
+ tree_to_node(data, bsize, &vt);
+
+ if (mi_col + bs_width_check <= tile->mi_col_end &&
+ mi_row + bs_height_check <= tile->mi_row_end &&
+ force_split == PART_EVAL_ONLY_NONE) {
+ set_block_size(cpi, mi_row, mi_col, bsize);
+ return 1;
+ }
+ if (force_split == PART_EVAL_ONLY_SPLIT) return 0;
+
+ // For bsize == bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select this
+ // size if the variance is below the threshold; otherwise split is selected.
+ // There is no vert/horiz split check, as there are too few samples for the
+ // variance.
+ if (bsize == bsize_min) {
+ // Variance already computed to set the force_split.
+ if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
+ if (mi_col + bs_width_check <= tile->mi_col_end &&
+ mi_row + bs_height_check <= tile->mi_row_end &&
+ vt.part_variances->none.variance < threshold) {
+ set_block_size(cpi, mi_row, mi_col, bsize);
+ return 1;
+ }
+ return 0;
+ } else if (bsize > bsize_min) {
+ // Variance already computed to set the force_split.
+ if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
+ // For key frame: take split for bsize above 32X32 or very high variance.
+ if (frame_is_intra_only(cm) &&
+ (bsize > BLOCK_32X32 ||
+ vt.part_variances->none.variance > (threshold << 4))) {
+ return 0;
+ }
+ // If variance is low, take the bsize (no split).
+ if (mi_col + bs_width_check <= tile->mi_col_end &&
+ mi_row + bs_height_check <= tile->mi_row_end &&
+ vt.part_variances->none.variance < threshold) {
+ set_block_size(cpi, mi_row, mi_col, bsize);
+ return 1;
+ }
+ // Check vertical split.
+ if (mi_row + bs_height_check <= tile->mi_row_end &&
+ mi_col + bs_width_vert_check <= tile->mi_col_end) {
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT);
+ BLOCK_SIZE plane_bsize =
+ get_plane_block_size(subsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+ get_variance(&vt.part_variances->vert[0]);
+ get_variance(&vt.part_variances->vert[1]);
+ if (vt.part_variances->vert[0].variance < threshold &&
+ vt.part_variances->vert[1].variance < threshold &&
+ plane_bsize < BLOCK_INVALID) {
+ set_block_size(cpi, mi_row, mi_col, subsize);
+ set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize);
+ return 1;
+ }
+ }
+ // Check horizontal split.
+ if (mi_col + bs_width_check <= tile->mi_col_end &&
+ mi_row + bs_height_horiz_check <= tile->mi_row_end) {
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+ BLOCK_SIZE plane_bsize =
+ get_plane_block_size(subsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+ get_variance(&vt.part_variances->horz[0]);
+ get_variance(&vt.part_variances->horz[1]);
+ if (vt.part_variances->horz[0].variance < threshold &&
+ vt.part_variances->horz[1].variance < threshold &&
+ plane_bsize < BLOCK_INVALID) {
+ set_block_size(cpi, mi_row, mi_col, subsize);
+ set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize);
+ return 1;
+ }
+ }
+ return 0;
+ }
+ return 0;
+}
+
+static AOM_INLINE int all_blks_inside(int x16_idx, int y16_idx, int pixels_wide,
+ int pixels_high) {
+ int all_inside = 1;
+ for (int idx = 0; idx < 4; idx++) {
+ all_inside &= ((x16_idx + GET_BLK_IDX_X(idx, 3)) < pixels_wide);
+ all_inside &= ((y16_idx + GET_BLK_IDX_Y(idx, 3)) < pixels_high);
+ }
+ return all_inside;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// TODO(yunqingwang): Perform average of four 8x8 blocks similar to lowbd
+static AOM_INLINE void fill_variance_8x8avg_highbd(
+ const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf,
+ int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int pixels_wide,
+ int pixels_high) {
+ for (int idx = 0; idx < 4; idx++) {
+ const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3);
+ const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3);
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+ int src_avg = aom_highbd_avg_8x8(src_buf + y8_idx * src_stride + x8_idx,
+ src_stride);
+ int dst_avg = aom_highbd_avg_8x8(dst_buf + y8_idx * dst_stride + x8_idx,
+ dst_stride);
+
+ sum = src_avg - dst_avg;
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[idx].part_variances.none);
+ }
+}
+#endif
+
+static AOM_INLINE void fill_variance_8x8avg_lowbd(
+ const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf,
+ int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int pixels_wide,
+ int pixels_high) {
+ unsigned int sse[4] = { 0 };
+ int sum[4] = { 0 };
+
+ if (all_blks_inside(x16_idx, y16_idx, pixels_wide, pixels_high)) {
+ int src_avg[4];
+ int dst_avg[4];
+ aom_avg_8x8_quad(src_buf, src_stride, x16_idx, y16_idx, src_avg);
+ aom_avg_8x8_quad(dst_buf, dst_stride, x16_idx, y16_idx, dst_avg);
+ for (int idx = 0; idx < 4; idx++) {
+ sum[idx] = src_avg[idx] - dst_avg[idx];
+ sse[idx] = sum[idx] * sum[idx];
+ }
+ } else {
+ for (int idx = 0; idx < 4; idx++) {
+ const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3);
+ const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3);
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+ int src_avg =
+ aom_avg_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride);
+ int dst_avg =
+ aom_avg_8x8(dst_buf + y8_idx * dst_stride + x8_idx, dst_stride);
+ sum[idx] = src_avg - dst_avg;
+ sse[idx] = sum[idx] * sum[idx];
+ }
+ }
+ }
+
+ for (int idx = 0; idx < 4; idx++) {
+ fill_variance(sse[idx], sum[idx], 0, &vst->split[idx].part_variances.none);
+ }
+}
+
+// Obtain the parameters required to calculate variance (such as sum, sse,
+// etc.) at the 8x8 sub-block level for a given 16x16 block.
+// The function can be called only when is_key_frame is false since sum is
+// computed between source and reference frames.
+static AOM_INLINE void fill_variance_8x8avg(
+ const uint8_t *src_buf, int src_stride, const uint8_t *dst_buf,
+ int dst_stride, int x16_idx, int y16_idx, VP16x16 *vst, int highbd_flag,
+ int pixels_wide, int pixels_high) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (highbd_flag) {
+ fill_variance_8x8avg_highbd(src_buf, src_stride, dst_buf, dst_stride,
+ x16_idx, y16_idx, vst, pixels_wide,
+ pixels_high);
+ return;
+ }
+#else
+ (void)highbd_flag;
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ fill_variance_8x8avg_lowbd(src_buf, src_stride, dst_buf, dst_stride, x16_idx,
+ y16_idx, vst, pixels_wide, pixels_high);
+}
+
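+// Return the spread (max - min over the four 8x8 sub-blocks) of the per-8x8
+// (max - min) pixel difference between source and reconstruction for a
+// 16x16 block.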
+static int compute_minmax_8x8(const uint8_t *src_buf, int src_stride,
+ const uint8_t *dst_buf, int dst_stride,
+ int x16_idx, int y16_idx,
+#if CONFIG_AV1_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide, int pixels_high) {
+ int minmax_max = 0;
+ int minmax_min = 255;
+ // Loop over the 4 8x8 subblocks.
+ for (int idx = 0; idx < 4; idx++) {
+ const int x8_idx = x16_idx + GET_BLK_IDX_X(idx, 3);
+ const int y8_idx = y16_idx + GET_BLK_IDX_Y(idx, 3);
+ int min = 0;
+ int max = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ aom_highbd_minmax_8x8(
+ src_buf + y8_idx * src_stride + x8_idx, src_stride,
+ dst_buf + y8_idx * dst_stride + x8_idx, dst_stride, &min, &max);
+ } else {
+ aom_minmax_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride,
+ dst_buf + y8_idx * dst_stride + x8_idx, dst_stride, &min,
+ &max);
+ }
+#else
+ aom_minmax_8x8(src_buf + y8_idx * src_stride + x8_idx, src_stride,
+ dst_buf + y8_idx * dst_stride + x8_idx, dst_stride, &min,
+ &max);
+#endif
+ if ((max - min) > minmax_max) minmax_max = (max - min);
+ if ((max - min) < minmax_min) minmax_min = (max - min);
+ }
+ }
+ return (minmax_max - minmax_min);
+}
+
+// Function to compute the average and variance of a 4x4 sub-block.
+// The function can be called only when is_key_frame is true since sum is
+// computed using source frame only.
+static AOM_INLINE void fill_variance_4x4avg(const uint8_t *src_buf,
+ int src_stride, int x8_idx,
+ int y8_idx, VP8x8 *vst,
+#if CONFIG_AV1_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide, int pixels_high,
+ int border_offset_4x4) {
+ for (int idx = 0; idx < 4; idx++) {
+ const int x4_idx = x8_idx + GET_BLK_IDX_X(idx, 2);
+ const int y4_idx = y8_idx + GET_BLK_IDX_Y(idx, 2);
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x4_idx < pixels_wide - border_offset_4x4 &&
+ y4_idx < pixels_high - border_offset_4x4) {
+ int src_avg;
+ int dst_avg = 128;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ src_avg = aom_highbd_avg_4x4(src_buf + y4_idx * src_stride + x4_idx,
+ src_stride);
+ } else {
+ src_avg =
+ aom_avg_4x4(src_buf + y4_idx * src_stride + x4_idx, src_stride);
+ }
+#else
+ src_avg = aom_avg_4x4(src_buf + y4_idx * src_stride + x4_idx, src_stride);
+#endif
+
+ sum = src_avg - dst_avg;
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[idx].part_variances.none);
+ }
+}
+
+// TODO(kyslov) Bring back threshold adjustment based on content state
+static int64_t scale_part_thresh_content(int64_t threshold_base, int speed,
+ int width, int height,
+ int non_reference_frame) {
+ (void)width;
+ (void)height;
+ int64_t threshold = threshold_base;
+ if (non_reference_frame) threshold = (3 * threshold) >> 1;
+ if (speed >= 8) {
+ return (5 * threshold) >> 2;
+ }
+ return threshold;
+}
+
+// Tune the thresholds more or less aggressively to prefer larger partitions.
+static AOM_INLINE void tune_thresh_based_on_qindex(
+ AV1_COMP *cpi, int64_t thresholds[], uint64_t block_sad, int current_qindex,
+ int num_pixels, bool is_segment_id_boosted, int source_sad_nonrd,
+ int lighting_change) {
+ double weight;
+ if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 3) {
+ const int win = 20;
+ if (current_qindex < QINDEX_LARGE_BLOCK_THR - win)
+ weight = 1.0;
+ else if (current_qindex > QINDEX_LARGE_BLOCK_THR + win)
+ weight = 0.0;
+ else
+ weight =
+ 1.0 - (current_qindex - QINDEX_LARGE_BLOCK_THR + win) / (2.0 * win);
+ if (num_pixels > RESOLUTION_480P) {
+ for (int i = 0; i < 4; i++) {
+ thresholds[i] <<= 1;
+ }
+ }
+ if (num_pixels <= RESOLUTION_288P) {
+ thresholds[3] = INT64_MAX;
+ if (is_segment_id_boosted == false) {
+ thresholds[1] <<= 2;
+ thresholds[2] <<= (source_sad_nonrd <= kLowSad) ? 5 : 4;
+ } else {
+ thresholds[1] <<= 1;
+ thresholds[2] <<= 3;
+ }
+ // Allow a split down to 8x8 for superblocks where part of the block has a
+ // moving boundary: allow it for superblocks with source_sad above the
+ // threshold, but avoid very large source_sad or high source content, so
+ // as not to produce too many 8x8 blocks within a superblock.
+ uint64_t avg_source_sad_thresh = 25000;
+ uint64_t block_sad_low = 25000;
+ uint64_t block_sad_high = 50000;
+ if (cpi->svc.temporal_layer_id == 0 &&
+ cpi->svc.number_temporal_layers > 1) {
+ // Increase the sad thresholds for base TL0, as reference/LAST is
+ // 2/4 frames behind (for 2/3 #TL).
+ avg_source_sad_thresh = 40000;
+ block_sad_high = 70000;
+ }
+ if (is_segment_id_boosted == false &&
+ cpi->rc.avg_source_sad < avg_source_sad_thresh &&
+ block_sad > block_sad_low && block_sad < block_sad_high &&
+ !lighting_change) {
+ thresholds[2] = (3 * thresholds[2]) >> 2;
+ thresholds[3] = thresholds[2] << 3;
+ }
+ // Condition the increase of partition thresholds on the segment
+ // and the content. Avoid the increase for superblocks which have
+ // high source sad, unless the whole frame has very high motion
+ // (i.e., cpi->rc.avg_source_sad is very large, in which case all blocks
+ // have high source sad).
+ } else if (num_pixels > RESOLUTION_480P && is_segment_id_boosted == false &&
+ (source_sad_nonrd != kHighSad ||
+ cpi->rc.avg_source_sad > 50000)) {
+ thresholds[0] = (3 * thresholds[0]) >> 1;
+ thresholds[3] = INT64_MAX;
+ if (current_qindex > QINDEX_LARGE_BLOCK_THR) {
+ thresholds[1] =
+ (int)((1 - weight) * (thresholds[1] << 1) + weight * thresholds[1]);
+ thresholds[2] =
+ (int)((1 - weight) * (thresholds[2] << 1) + weight * thresholds[2]);
+ }
+ } else if (current_qindex > QINDEX_LARGE_BLOCK_THR &&
+ is_segment_id_boosted == false &&
+ (source_sad_nonrd != kHighSad ||
+ cpi->rc.avg_source_sad > 50000)) {
+ thresholds[1] =
+ (int)((1 - weight) * (thresholds[1] << 2) + weight * thresholds[1]);
+ thresholds[2] =
+ (int)((1 - weight) * (thresholds[2] << 4) + weight * thresholds[2]);
+ thresholds[3] = INT64_MAX;
+ }
+ } else if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 2) {
+ thresholds[1] <<= (source_sad_nonrd <= kLowSad) ? 2 : 0;
+ thresholds[2] =
+ (source_sad_nonrd <= kLowSad) ? (3 * thresholds[2]) : thresholds[2];
+ } else if (cpi->sf.rt_sf.prefer_large_partition_blocks >= 1) {
+ const int fac = (source_sad_nonrd <= kLowSad) ? 2 : 1;
+ if (current_qindex < QINDEX_LARGE_BLOCK_THR - 45)
+ weight = 1.0;
+ else if (current_qindex > QINDEX_LARGE_BLOCK_THR + 45)
+ weight = 0.0;
+ else
+ weight = 1.0 - (current_qindex - QINDEX_LARGE_BLOCK_THR + 45) / (2.0 * 45);
+ thresholds[1] =
+ (int)((1 - weight) * (thresholds[1] << 1) + weight * thresholds[1]);
+ thresholds[2] =
+ (int)((1 - weight) * (thresholds[2] << 1) + weight * thresholds[2]);
+ thresholds[3] =
+ (int)((1 - weight) * (thresholds[3] << fac) + weight * thresholds[3]);
+ }
+ if (cpi->sf.part_sf.disable_8x8_part_based_on_qidx && (current_qindex < 128))
+ thresholds[3] = INT64_MAX;
+}
+
+static void set_vbp_thresholds_key_frame(AV1_COMP *cpi, int64_t thresholds[],
+ int64_t threshold_base,
+ int threshold_left_shift,
+ int num_pixels) {
+ if (cpi->sf.rt_sf.force_large_partition_blocks_intra) {
+ const int shift_steps =
+ threshold_left_shift - (cpi->oxcf.mode == ALLINTRA ? 7 : 8);
+ assert(shift_steps >= 0);
+ threshold_base <<= shift_steps;
+ }
+ thresholds[0] = threshold_base;
+ thresholds[1] = threshold_base;
+ if (num_pixels < RESOLUTION_720P) {
+ thresholds[2] = threshold_base / 3;
+ thresholds[3] = threshold_base >> 1;
+ } else {
+ int shift_val = 2;
+ if (cpi->sf.rt_sf.force_large_partition_blocks_intra) {
+ shift_val = 0;
+ }
+
+ thresholds[2] = threshold_base >> shift_val;
+ thresholds[3] = threshold_base >> shift_val;
+ }
+ thresholds[4] = threshold_base << 2;
+}
+
+static AOM_INLINE void tune_thresh_based_on_resolution(
+ AV1_COMP *cpi, int64_t thresholds[], int64_t threshold_base,
+ int current_qindex, int source_sad_rd, int num_pixels) {
+ if (num_pixels >= RESOLUTION_720P) thresholds[3] = thresholds[3] << 1;
+ if (num_pixels <= RESOLUTION_288P) {
+ const int qindex_thr[5][2] = {
+ { 200, 220 }, { 140, 170 }, { 120, 150 }, { 200, 210 }, { 170, 220 },
+ };
+ int th_idx = 0;
+ if (cpi->sf.rt_sf.var_part_based_on_qidx >= 1)
+ th_idx =
+ (source_sad_rd <= kLowSad) ? cpi->sf.rt_sf.var_part_based_on_qidx : 0;
+ if (cpi->sf.rt_sf.var_part_based_on_qidx >= 3)
+ th_idx = cpi->sf.rt_sf.var_part_based_on_qidx;
+ const int qindex_low_thr = qindex_thr[th_idx][0];
+ const int qindex_high_thr = qindex_thr[th_idx][1];
+ if (current_qindex >= qindex_high_thr) {
+ threshold_base = (5 * threshold_base) >> 1;
+ thresholds[1] = threshold_base >> 3;
+ thresholds[2] = threshold_base << 2;
+ thresholds[3] = threshold_base << 5;
+ } else if (current_qindex < qindex_low_thr) {
+ thresholds[1] = threshold_base >> 3;
+ thresholds[2] = threshold_base >> 1;
+ thresholds[3] = threshold_base << 3;
+ } else {
+ int64_t qi_diff_low = current_qindex - qindex_low_thr;
+ int64_t qi_diff_high = qindex_high_thr - current_qindex;
+ int64_t threshold_diff = qindex_high_thr - qindex_low_thr;
+ int64_t threshold_base_high = (5 * threshold_base) >> 1;
+
+ threshold_diff = threshold_diff > 0 ? threshold_diff : 1;
+ threshold_base =
+ (qi_diff_low * threshold_base_high + qi_diff_high * threshold_base) /
+ threshold_diff;
+ thresholds[1] = threshold_base >> 3;
+ thresholds[2] = ((qi_diff_low * threshold_base) +
+ qi_diff_high * (threshold_base >> 1)) /
+ threshold_diff;
+ thresholds[3] = ((qi_diff_low * (threshold_base << 5)) +
+ qi_diff_high * (threshold_base << 3)) /
+ threshold_diff;
+ }
+ } else if (num_pixels < RESOLUTION_720P) {
+ thresholds[2] = (5 * threshold_base) >> 2;
+ } else if (num_pixels < RESOLUTION_1080P) {
+ thresholds[2] = threshold_base << 1;
+ } else {
+ // num_pixels >= RESOLUTION_1080P
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) {
+ if (num_pixels < RESOLUTION_1440P) {
+ thresholds[2] = (5 * threshold_base) >> 1;
+ } else {
+ thresholds[2] = (7 * threshold_base) >> 1;
+ }
+ } else {
+ if (cpi->oxcf.speed > 7) {
+ thresholds[2] = 6 * threshold_base;
+ } else {
+ thresholds[2] = 3 * threshold_base;
+ }
+ }
+ }
+}
+
+// Increase the partition thresholds for noisy content. Apply it only for
+// superblocks where the sumdiff is low, as we assume the sumdiff of a
+// superblock whose only change is due to noise will be low (i.e., the noise
+// will average out over a large block).
+static AOM_INLINE int64_t tune_thresh_noisy_content(AV1_COMP *cpi,
+ int64_t threshold_base,
+ int content_lowsumdiff,
+ int num_pixels) {
+ AV1_COMMON *const cm = &cpi->common;
+ int64_t updated_thresh_base = threshold_base;
+ if (cpi->noise_estimate.enabled && content_lowsumdiff &&
+ num_pixels > RESOLUTION_480P && cm->current_frame.frame_number > 60) {
+ NOISE_LEVEL noise_level =
+ av1_noise_estimate_extract_level(&cpi->noise_estimate);
+ if (noise_level == kHigh)
+ updated_thresh_base = (5 * updated_thresh_base) >> 1;
+ else if (noise_level == kMedium &&
+ !cpi->sf.rt_sf.prefer_large_partition_blocks)
+ updated_thresh_base = (5 * updated_thresh_base) >> 2;
+ }
+  // TODO(kyslov) Enable var based partition adjustment on temporal denoising
+#if 0 // CONFIG_AV1_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+ cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow)
+ updated_thresh_base =
+ av1_scale_part_thresh(updated_thresh_base, cpi->denoiser.denoising_level,
+ content_state, cpi->svc.temporal_layer_id);
+ else
+    updated_thresh_base =
+ scale_part_thresh_content(updated_thresh_base, cpi->oxcf.speed, cm->width,
+ cm->height, cpi->ppi->rtc_ref.non_reference_frame);
+#else
+ // Increase base variance threshold based on content_state/sum_diff level.
+ updated_thresh_base = scale_part_thresh_content(
+ updated_thresh_base, cpi->oxcf.speed, cm->width, cm->height,
+ cpi->ppi->rtc_ref.non_reference_frame);
+#endif
+ return updated_thresh_base;
+}
+
+static AOM_INLINE void set_vbp_thresholds(
+ AV1_COMP *cpi, int64_t thresholds[], uint64_t blk_sad, int qindex,
+ int content_lowsumdiff, int source_sad_nonrd, int source_sad_rd,
+ bool is_segment_id_boosted, int lighting_change) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int is_key_frame = frame_is_intra_only(cm);
+ const int threshold_multiplier = is_key_frame ? 120 : 1;
+ const int ac_q = av1_ac_quant_QTX(qindex, 0, cm->seq_params->bit_depth);
+ int64_t threshold_base = (int64_t)(threshold_multiplier * ac_q);
+ const int current_qindex = cm->quant_params.base_qindex;
+ const int threshold_left_shift = cpi->sf.rt_sf.var_part_split_threshold_shift;
+ const int num_pixels = cm->width * cm->height;
+
+ if (is_key_frame) {
+ set_vbp_thresholds_key_frame(cpi, thresholds, threshold_base,
+ threshold_left_shift, num_pixels);
+ return;
+ }
+
+ threshold_base = tune_thresh_noisy_content(cpi, threshold_base,
+ content_lowsumdiff, num_pixels);
+ thresholds[0] = threshold_base >> 1;
+ thresholds[1] = threshold_base;
+ thresholds[3] = threshold_base << threshold_left_shift;
+
+ tune_thresh_based_on_resolution(cpi, thresholds, threshold_base,
+ current_qindex, source_sad_rd, num_pixels);
+
+ tune_thresh_based_on_qindex(cpi, thresholds, blk_sad, current_qindex,
+ num_pixels, is_segment_id_boosted,
+ source_sad_nonrd, lighting_change);
+}
+
+// Set the temporal variance low flag for a 64x64 superblock.
+// Only the first 25 entries in the array are used in this case.
+static AOM_INLINE void set_low_temp_var_flag_64x64(
+ CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info,
+ MACROBLOCKD *xd, VP64x64 *vt, const int64_t thresholds[], int mi_col,
+ int mi_row) {
+ if (xd->mi[0]->bsize == BLOCK_64X64) {
+ if ((vt->part_variances).none.variance < (thresholds[0] >> 1))
+ part_info->variance_low[0] = 1;
+ } else if (xd->mi[0]->bsize == BLOCK_64X32) {
+ for (int part_idx = 0; part_idx < 2; part_idx++) {
+ if (vt->part_variances.horz[part_idx].variance < (thresholds[0] >> 2))
+ part_info->variance_low[part_idx + 1] = 1;
+ }
+ } else if (xd->mi[0]->bsize == BLOCK_32X64) {
+ for (int part_idx = 0; part_idx < 2; part_idx++) {
+ if (vt->part_variances.vert[part_idx].variance < (thresholds[0] >> 2))
+ part_info->variance_low[part_idx + 3] = 1;
+ }
+ } else {
+ static const int idx[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } };
+ for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) {
+ const int idx_str = mi_params->mi_stride * (mi_row + idx[lvl1_idx][0]) +
+ mi_col + idx[lvl1_idx][1];
+ MB_MODE_INFO **this_mi = mi_params->mi_grid_base + idx_str;
+
+ if (mi_params->mi_cols <= mi_col + idx[lvl1_idx][1] ||
+ mi_params->mi_rows <= mi_row + idx[lvl1_idx][0])
+ continue;
+
+ if (*this_mi == NULL) continue;
+
+ if ((*this_mi)->bsize == BLOCK_32X32) {
+ int64_t threshold_32x32 = (5 * thresholds[1]) >> 3;
+ if (vt->split[lvl1_idx].part_variances.none.variance < threshold_32x32)
+ part_info->variance_low[lvl1_idx + 5] = 1;
+ } else {
+ // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
+ // inside.
+ if ((*this_mi)->bsize == BLOCK_16X16 ||
+ (*this_mi)->bsize == BLOCK_32X16 ||
+ (*this_mi)->bsize == BLOCK_16X32) {
+ for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) {
+ if (vt->split[lvl1_idx]
+ .split[lvl2_idx]
+ .part_variances.none.variance < (thresholds[2] >> 8))
+ part_info->variance_low[(lvl1_idx << 2) + lvl2_idx + 9] = 1;
+ }
+ }
+ }
+ }
+ }
+}
+
+static AOM_INLINE void set_low_temp_var_flag_128x128(
+ CommonModeInfoParams *mi_params, PartitionSearchInfo *part_info,
+ MACROBLOCKD *xd, VP128x128 *vt, const int64_t thresholds[], int mi_col,
+ int mi_row) {
+ if (xd->mi[0]->bsize == BLOCK_128X128) {
+ if (vt->part_variances.none.variance < (thresholds[0] >> 1))
+ part_info->variance_low[0] = 1;
+ } else if (xd->mi[0]->bsize == BLOCK_128X64) {
+ for (int part_idx = 0; part_idx < 2; part_idx++) {
+ if (vt->part_variances.horz[part_idx].variance < (thresholds[0] >> 2))
+ part_info->variance_low[part_idx + 1] = 1;
+ }
+ } else if (xd->mi[0]->bsize == BLOCK_64X128) {
+ for (int part_idx = 0; part_idx < 2; part_idx++) {
+ if (vt->part_variances.vert[part_idx].variance < (thresholds[0] >> 2))
+ part_info->variance_low[part_idx + 3] = 1;
+ }
+ } else {
+ static const int idx64[4][2] = {
+ { 0, 0 }, { 0, 16 }, { 16, 0 }, { 16, 16 }
+ };
+ static const int idx32[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } };
+ for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) {
+ const int idx_str = mi_params->mi_stride * (mi_row + idx64[lvl1_idx][0]) +
+ mi_col + idx64[lvl1_idx][1];
+ MB_MODE_INFO **mi_64 = mi_params->mi_grid_base + idx_str;
+ if (*mi_64 == NULL) continue;
+ if (mi_params->mi_cols <= mi_col + idx64[lvl1_idx][1] ||
+ mi_params->mi_rows <= mi_row + idx64[lvl1_idx][0])
+ continue;
+ const int64_t threshold_64x64 = (5 * thresholds[1]) >> 3;
+ if ((*mi_64)->bsize == BLOCK_64X64) {
+ if (vt->split[lvl1_idx].part_variances.none.variance < threshold_64x64)
+ part_info->variance_low[5 + lvl1_idx] = 1;
+ } else if ((*mi_64)->bsize == BLOCK_64X32) {
+ for (int part_idx = 0; part_idx < 2; part_idx++)
+ if (vt->split[lvl1_idx].part_variances.horz[part_idx].variance <
+ (threshold_64x64 >> 1))
+ part_info->variance_low[9 + (lvl1_idx << 1) + part_idx] = 1;
+ } else if ((*mi_64)->bsize == BLOCK_32X64) {
+ for (int part_idx = 0; part_idx < 2; part_idx++)
+ if (vt->split[lvl1_idx].part_variances.vert[part_idx].variance <
+ (threshold_64x64 >> 1))
+ part_info->variance_low[17 + (lvl1_idx << 1) + part_idx] = 1;
+ } else {
+ for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) {
+ const int idx_str1 =
+ mi_params->mi_stride * idx32[lvl2_idx][0] + idx32[lvl2_idx][1];
+ MB_MODE_INFO **mi_32 = mi_params->mi_grid_base + idx_str + idx_str1;
+ if (*mi_32 == NULL) continue;
+
+ if (mi_params->mi_cols <=
+ mi_col + idx64[lvl1_idx][1] + idx32[lvl2_idx][1] ||
+ mi_params->mi_rows <=
+ mi_row + idx64[lvl1_idx][0] + idx32[lvl2_idx][0])
+ continue;
+ const int64_t threshold_32x32 = (5 * thresholds[2]) >> 3;
+ if ((*mi_32)->bsize == BLOCK_32X32) {
+ if (vt->split[lvl1_idx]
+ .split[lvl2_idx]
+ .part_variances.none.variance < threshold_32x32)
+ part_info->variance_low[25 + (lvl1_idx << 2) + lvl2_idx] = 1;
+ } else {
+ // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
+ // inside.
+ if ((*mi_32)->bsize == BLOCK_16X16 ||
+ (*mi_32)->bsize == BLOCK_32X16 ||
+ (*mi_32)->bsize == BLOCK_16X32) {
+ for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) {
+ VPartVar *none_var = &vt->split[lvl1_idx]
+ .split[lvl2_idx]
+ .split[lvl3_idx]
+ .part_variances.none;
+ if (none_var->variance < (thresholds[3] >> 8))
+ part_info->variance_low[41 + (lvl1_idx << 4) +
+ (lvl2_idx << 2) + lvl3_idx] = 1;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static AOM_INLINE void set_low_temp_var_flag(
+ AV1_COMP *cpi, PartitionSearchInfo *part_info, MACROBLOCKD *xd,
+ VP128x128 *vt, int64_t thresholds[], MV_REFERENCE_FRAME ref_frame_partition,
+ int mi_col, int mi_row, const bool is_small_sb) {
+ AV1_COMMON *const cm = &cpi->common;
+  // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected.
+  // If the temporal variance is small, set the variance_low flag for the
+  // block. The variance threshold can be adjusted; the higher it is, the
+  // more aggressive the skipping.
+ if (ref_frame_partition == LAST_FRAME) {
+ if (is_small_sb)
+ set_low_temp_var_flag_64x64(&cm->mi_params, part_info, xd,
+ &(vt->split[0]), thresholds, mi_col, mi_row);
+ else
+ set_low_temp_var_flag_128x128(&cm->mi_params, part_info, xd, vt,
+ thresholds, mi_col, mi_row);
+ }
+}
+
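+// Maps the (row, col) position of a 16x16 block inside a 64x64 superblock to
+// its index in the variance_low[] array (entries 9..24), matching the
+// quad-tree scan order used in set_low_temp_var_flag_64x64(): the 32x32
+// quadrant selects the base index, the 16x16 sub-quadrant the offset.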
+static const int pos_shift_16x16[4][4] = {
+ { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 }
+};
+
+int av1_get_force_skip_low_temp_var_small_sb(const uint8_t *variance_low,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ // Relative indices of MB inside the superblock.
+ const int mi_x = mi_row & 0xF;
+ const int mi_y = mi_col & 0xF;
+ // Relative indices of 16x16 block inside the superblock.
+ const int i = mi_x >> 2;
+ const int j = mi_y >> 2;
+ int force_skip_low_temp_var = 0;
+ // Set force_skip_low_temp_var based on the block size and block offset.
+ switch (bsize) {
+ case BLOCK_64X64: force_skip_low_temp_var = variance_low[0]; break;
+ case BLOCK_64X32:
+ if (!mi_y && !mi_x) {
+ force_skip_low_temp_var = variance_low[1];
+ } else if (!mi_y && mi_x) {
+ force_skip_low_temp_var = variance_low[2];
+ }
+ break;
+ case BLOCK_32X64:
+ if (!mi_y && !mi_x) {
+ force_skip_low_temp_var = variance_low[3];
+ } else if (mi_y && !mi_x) {
+ force_skip_low_temp_var = variance_low[4];
+ }
+ break;
+ case BLOCK_32X32:
+ if (!mi_y && !mi_x) {
+ force_skip_low_temp_var = variance_low[5];
+ } else if (mi_y && !mi_x) {
+ force_skip_low_temp_var = variance_low[6];
+ } else if (!mi_y && mi_x) {
+ force_skip_low_temp_var = variance_low[7];
+ } else if (mi_y && mi_x) {
+ force_skip_low_temp_var = variance_low[8];
+ }
+ break;
+ case BLOCK_32X16:
+ case BLOCK_16X32:
+ case BLOCK_16X16:
+ force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]];
+ break;
+ default: break;
+ }
+
+ return force_skip_low_temp_var;
+}
+
+int av1_get_force_skip_low_temp_var(const uint8_t *variance_low, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ int force_skip_low_temp_var = 0;
+ int x, y;
+ x = (mi_col & 0x1F) >> 4;
+ // y = (mi_row & 0x1F) >> 4;
+ // const int idx64 = (y << 1) + x;
+ y = (mi_row & 0x17) >> 3;
+ const int idx64 = y + x;
+
+ x = (mi_col & 0xF) >> 3;
+ // y = (mi_row & 0xF) >> 3;
+ // const int idx32 = (y << 1) + x;
+ y = (mi_row & 0xB) >> 2;
+ const int idx32 = y + x;
+
+ x = (mi_col & 0x7) >> 2;
+ // y = (mi_row & 0x7) >> 2;
+ // const int idx16 = (y << 1) + x;
+ y = (mi_row & 0x5) >> 1;
+ const int idx16 = y + x;
+ // Set force_skip_low_temp_var based on the block size and block offset.
+ switch (bsize) {
+ case BLOCK_128X128: force_skip_low_temp_var = variance_low[0]; break;
+ case BLOCK_128X64:
+ assert((mi_col & 0x1F) == 0);
+ force_skip_low_temp_var = variance_low[1 + ((mi_row & 0x1F) != 0)];
+ break;
+ case BLOCK_64X128:
+ assert((mi_row & 0x1F) == 0);
+ force_skip_low_temp_var = variance_low[3 + ((mi_col & 0x1F) != 0)];
+ break;
+ case BLOCK_64X64:
+ // Location of this 64x64 block inside the 128x128 superblock
+ force_skip_low_temp_var = variance_low[5 + idx64];
+ break;
+ case BLOCK_64X32:
+ x = (mi_col & 0x1F) >> 4;
+ y = (mi_row & 0x1F) >> 3;
+ /*
+ .---------------.---------------.
+      | x=0,y=0,idx=0 | x=1,y=0,idx=2 |
+ :---------------+---------------:
+ | x=0,y=1,idx=1 | x=1,y=1,idx=3 |
+ :---------------+---------------:
+ | x=0,y=2,idx=4 | x=1,y=2,idx=6 |
+ :---------------+---------------:
+ | x=0,y=3,idx=5 | x=1,y=3,idx=7 |
+ '---------------'---------------'
+ */
+ const int idx64x32 = (x << 1) + (y % 2) + ((y >> 1) << 2);
+ force_skip_low_temp_var = variance_low[9 + idx64x32];
+ break;
+ case BLOCK_32X64:
+ x = (mi_col & 0x1F) >> 3;
+ y = (mi_row & 0x1F) >> 4;
+ const int idx32x64 = (y << 2) + x;
+ force_skip_low_temp_var = variance_low[17 + idx32x64];
+ break;
+ case BLOCK_32X32:
+ force_skip_low_temp_var = variance_low[25 + (idx64 << 2) + idx32];
+ break;
+ case BLOCK_32X16:
+ case BLOCK_16X32:
+ case BLOCK_16X16:
+ force_skip_low_temp_var =
+ variance_low[41 + (idx64 << 4) + (idx32 << 2) + idx16];
+ break;
+ default: break;
+ }
+ return force_skip_low_temp_var;
+}
+
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int qindex,
+ int content_lowsumdiff) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION) {
+ return;
+ } else {
+ set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, 0, qindex,
+ content_lowsumdiff, 0, 0, 0, 0);
+ // The threshold below is not changed locally.
+ cpi->vbp_info.threshold_minmax = 15 + (qindex >> 3);
+ }
+}
+
+static AOM_INLINE void chroma_check(AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, unsigned int y_sad,
+ unsigned int y_sad_g,
+ unsigned int y_sad_alt, bool is_key_frame,
+ bool zero_motion, unsigned int *uv_sad) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd;
+ int shift_upper_limit = 1;
+ int shift_lower_limit = 3;
+ int fac_uv = 6;
+ if (is_key_frame || cpi->oxcf.tool_cfg.enable_monochrome) return;
+
+  // Use a lower threshold (more conservative in setting the color flag) for
+  // higher-resolution non-screen content, which tends to have more camera
+  // noise. Since this may be used to skip compound mode in nonrd pickmode,
+  // which is generally more effective for higher resolutions, it is better
+  // to be more conservative.
+ if (cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) {
+ if (cpi->common.width * cpi->common.height >= RESOLUTION_1080P)
+ fac_uv = 3;
+ else
+ fac_uv = 5;
+ }
+ if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN &&
+ cpi->rc.high_source_sad) {
+ shift_lower_limit = 7;
+ } else if (source_sad_nonrd >= kMedSad && x->source_variance > 500 &&
+ cpi->common.width * cpi->common.height >= 640 * 360) {
+ shift_upper_limit = 2;
+ shift_lower_limit = source_sad_nonrd > kMedSad ? 5 : 4;
+ }
+
+ MB_MODE_INFO *mi = xd->mi[0];
+ const AV1_COMMON *const cm = &cpi->common;
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+ const YV12_BUFFER_CONFIG *yv12_alt = get_ref_frame_yv12_buf(cm, ALTREF_FRAME);
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, LAST_FRAME);
+ struct buf_2d dst;
+ unsigned int uv_sad_g = 0;
+ unsigned int uv_sad_alt = 0;
+
+ for (int plane = AOM_PLANE_U; plane < MAX_MB_PLANE; ++plane) {
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+ if (bs != BLOCK_INVALID) {
+ // For last:
+ if (zero_motion) {
+ if (mi->ref_frame[0] == LAST_FRAME) {
+ uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, pd->pre[0].buf, pd->pre[0].stride);
+ } else {
+ uint8_t *src = (plane == 1) ? yv12->u_buffer : yv12->v_buffer;
+ setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12->uv_crop_width,
+ yv12->uv_crop_height, yv12->uv_stride, xd->mi_row,
+ xd->mi_col, sf, xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y);
+
+ uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, dst.buf, dst.stride);
+ }
+ } else {
+ uv_sad[plane - 1] = cpi->ppi->fn_ptr[bs].sdf(
+ p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride);
+ }
+
+ // For golden:
+ if (y_sad_g != UINT_MAX) {
+ uint8_t *src = (plane == 1) ? yv12_g->u_buffer : yv12_g->v_buffer;
+ setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12_g->uv_crop_width,
+ yv12_g->uv_crop_height, yv12_g->uv_stride, xd->mi_row,
+ xd->mi_col, sf, xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y);
+ uv_sad_g = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, dst.buf,
+ dst.stride);
+ }
+
+ // For altref:
+ if (y_sad_alt != UINT_MAX) {
+ uint8_t *src = (plane == 1) ? yv12_alt->u_buffer : yv12_alt->v_buffer;
+ setup_pred_plane(&dst, xd->mi[0]->bsize, src, yv12_alt->uv_crop_width,
+ yv12_alt->uv_crop_height, yv12_alt->uv_stride,
+ xd->mi_row, xd->mi_col, sf,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y);
+ uv_sad_alt = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride,
+ dst.buf, dst.stride);
+ }
+ }
+
+ if (uv_sad[plane - 1] > (y_sad >> shift_upper_limit))
+ x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 1;
+ else if (uv_sad[plane - 1] < (y_sad >> shift_lower_limit))
+ x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 0;
+ // Borderline case: to be refined at coding block level in nonrd_pickmode,
+ // for coding block size < sb_size.
+ else
+ x->color_sensitivity_sb[COLOR_SENS_IDX(plane)] = 2;
+
+ x->color_sensitivity_sb_g[COLOR_SENS_IDX(plane)] =
+ uv_sad_g > y_sad_g / fac_uv;
+ x->color_sensitivity_sb_alt[COLOR_SENS_IDX(plane)] =
+ uv_sad_alt > y_sad_alt / fac_uv;
+ }
+}
+
+static void fill_variance_tree_leaves(
+ AV1_COMP *cpi, MACROBLOCK *x, VP128x128 *vt, PART_EVAL_STATUS *force_split,
+ int avg_16x16[][4], int maxvar_16x16[][4], int minvar_16x16[][4],
+ int64_t *thresholds, const uint8_t *src_buf, int src_stride,
+ const uint8_t *dst_buf, int dst_stride, bool is_key_frame,
+ const bool is_small_sb) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int num_64x64_blocks = is_small_sb ? 1 : 4;
+ // TODO(kyslov) Bring back compute_minmax_variance with content type detection
+ const int compute_minmax_variance = 0;
+ const int segment_id = xd->mi[0]->segment_id;
+ int pixels_wide = 128, pixels_high = 128;
+ int border_offset_4x4 = 0;
+ int temporal_denoising = cpi->sf.rt_sf.use_rtc_tf;
+ // dst_buf pointer is not used for is_key_frame, so it should be NULL.
+ assert(IMPLIES(is_key_frame, dst_buf == NULL));
+ if (is_small_sb) {
+ pixels_wide = 64;
+ pixels_high = 64;
+ }
+ if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
+ if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
+#if CONFIG_AV1_TEMPORAL_DENOISING
+ temporal_denoising |= cpi->oxcf.noise_sensitivity;
+#endif
+  // If temporal filtering or the temporal denoiser is enabled: since the
+  // source is modified, we need to avoid the 4x4 avg along the superblock
+  // boundary, since the SIMD code will load 8 pixels for the 4x4 avg and so
+  // can access source data outside the superblock (while it is being
+  // modified by the temporal filter). Temporal filtering is never done on
+  // key frames.
+ if (!is_key_frame && temporal_denoising) border_offset_4x4 = 4;
+ for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; blk64_idx++) {
+ const int x64_idx = GET_BLK_IDX_X(blk64_idx, 6);
+ const int y64_idx = GET_BLK_IDX_Y(blk64_idx, 6);
+ const int blk64_scale_idx = blk64_idx << 2;
+ force_split[blk64_idx + 1] = PART_EVAL_ALL;
+
+ for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) {
+ const int x32_idx = x64_idx + GET_BLK_IDX_X(lvl1_idx, 5);
+ const int y32_idx = y64_idx + GET_BLK_IDX_Y(lvl1_idx, 5);
+ const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2;
+ force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ALL;
+ avg_16x16[blk64_idx][lvl1_idx] = 0;
+ maxvar_16x16[blk64_idx][lvl1_idx] = 0;
+ minvar_16x16[blk64_idx][lvl1_idx] = INT_MAX;
+ for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) {
+ const int x16_idx = x32_idx + GET_BLK_IDX_X(lvl2_idx, 4);
+ const int y16_idx = y32_idx + GET_BLK_IDX_Y(lvl2_idx, 4);
+ const int split_index = 21 + lvl1_scale_idx + lvl2_idx;
+ VP16x16 *vst = &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx];
+ force_split[split_index] = PART_EVAL_ALL;
+ if (is_key_frame) {
+ // Go down to 4x4 down-sampling for variance.
+ for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) {
+ const int x8_idx = x16_idx + GET_BLK_IDX_X(lvl3_idx, 3);
+ const int y8_idx = y16_idx + GET_BLK_IDX_Y(lvl3_idx, 3);
+ VP8x8 *vst2 = &vst->split[lvl3_idx];
+ fill_variance_4x4avg(src_buf, src_stride, x8_idx, y8_idx, vst2,
+#if CONFIG_AV1_HIGHBITDEPTH
+ xd->cur_buf->flags,
+#endif
+ pixels_wide, pixels_high, border_offset_4x4);
+ }
+ } else {
+ fill_variance_8x8avg(src_buf, src_stride, dst_buf, dst_stride,
+ x16_idx, y16_idx, vst, is_cur_buf_hbd(xd),
+ pixels_wide, pixels_high);
+
+ fill_variance_tree(vst, BLOCK_16X16);
+ VPartVar *none_var = &vt->split[blk64_idx]
+ .split[lvl1_idx]
+ .split[lvl2_idx]
+ .part_variances.none;
+ get_variance(none_var);
+ const int val_none_var = none_var->variance;
+ avg_16x16[blk64_idx][lvl1_idx] += val_none_var;
+ minvar_16x16[blk64_idx][lvl1_idx] =
+ AOMMIN(minvar_16x16[blk64_idx][lvl1_idx], val_none_var);
+ maxvar_16x16[blk64_idx][lvl1_idx] =
+ AOMMAX(maxvar_16x16[blk64_idx][lvl1_idx], val_none_var);
+ if (val_none_var > thresholds[3]) {
+ // 16X16 variance is above threshold for split, so force split to
+ // 8x8 for this 16x16 block (this also forces splits for upper
+ // levels).
+ force_split[split_index] = PART_EVAL_ONLY_SPLIT;
+ force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT;
+ force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+ } else if (!cyclic_refresh_segment_id_boosted(segment_id) &&
+ compute_minmax_variance && val_none_var > thresholds[2]) {
+ // We have some nominal amount of 16x16 variance (based on average),
+ // compute the minmax over the 8x8 sub-blocks, and if above
+ // threshold, force split to 8x8 block for this 16x16 block.
+ int minmax = compute_minmax_8x8(src_buf, src_stride, dst_buf,
+ dst_stride, x16_idx, y16_idx,
+#if CONFIG_AV1_HIGHBITDEPTH
+ xd->cur_buf->flags,
+#endif
+ pixels_wide, pixels_high);
+ const int thresh_minmax = (int)cpi->vbp_info.threshold_minmax;
+ if (minmax > thresh_minmax) {
+ force_split[split_index] = PART_EVAL_ONLY_SPLIT;
+ force_split[5 + blk64_scale_idx + lvl1_idx] =
+ PART_EVAL_ONLY_SPLIT;
+ force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static AOM_INLINE void set_ref_frame_for_partition(
+ AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+ MV_REFERENCE_FRAME *ref_frame_partition, MB_MODE_INFO *mi,
+ unsigned int *y_sad, unsigned int *y_sad_g, unsigned int *y_sad_alt,
+ const YV12_BUFFER_CONFIG *yv12_g, const YV12_BUFFER_CONFIG *yv12_alt,
+ int mi_row, int mi_col, int num_planes) {
+ AV1_COMMON *const cm = &cpi->common;
+ const bool is_set_golden_ref_frame =
+ *y_sad_g < 0.9 * *y_sad && *y_sad_g < *y_sad_alt;
+ const bool is_set_altref_ref_frame =
+ *y_sad_alt < 0.9 * *y_sad && *y_sad_alt < *y_sad_g;
+
+ if (is_set_golden_ref_frame) {
+ av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+ get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes);
+ mi->ref_frame[0] = GOLDEN_FRAME;
+ mi->mv[0].as_int = 0;
+ *y_sad = *y_sad_g;
+ *ref_frame_partition = GOLDEN_FRAME;
+ x->nonrd_prune_ref_frame_search = 0;
+ x->sb_me_partition = 0;
+ } else if (is_set_altref_ref_frame) {
+ av1_setup_pre_planes(xd, 0, yv12_alt, mi_row, mi_col,
+ get_ref_scale_factors(cm, ALTREF_FRAME), num_planes);
+ mi->ref_frame[0] = ALTREF_FRAME;
+ mi->mv[0].as_int = 0;
+ *y_sad = *y_sad_alt;
+ *ref_frame_partition = ALTREF_FRAME;
+ x->nonrd_prune_ref_frame_search = 0;
+ x->sb_me_partition = 0;
+ } else {
+ *ref_frame_partition = LAST_FRAME;
+ x->nonrd_prune_ref_frame_search =
+ cpi->sf.rt_sf.nonrd_prune_ref_frame_search;
+ }
+}
+
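+// Returns the L1 (Manhattan) distance between two full-pel motion vectors.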
+static AOM_FORCE_INLINE int mv_distance(const FULLPEL_MV *mv0,
+ const FULLPEL_MV *mv1) {
+ return abs(mv0->row - mv1->row) + abs(mv0->col - mv1->col);
+}
+
+static AOM_INLINE void evaluate_neighbour_mvs(AV1_COMP *cpi, MACROBLOCK *x,
+ unsigned int *y_sad,
+ bool is_small_sb,
+ int est_motion) {
+ const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd;
+ // TODO(yunqingwang@google.com): test if this condition works with other
+ // speeds.
+ if (est_motion > 2 && source_sad_nonrd > kMedSad) return;
+
+ MACROBLOCKD *xd = &x->e_mbd;
+ BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
+ MB_MODE_INFO *mi = xd->mi[0];
+
+ unsigned int above_y_sad = UINT_MAX;
+ unsigned int left_y_sad = UINT_MAX;
+ FULLPEL_MV above_mv = kZeroFullMv;
+ FULLPEL_MV left_mv = kZeroFullMv;
+ SubpelMvLimits subpel_mv_limits;
+ const MV dummy_mv = { 0, 0 };
+ av1_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, &dummy_mv);
+
+ // Current best MV
+ FULLPEL_MV best_mv = get_fullmv_from_mv(&mi->mv[0].as_mv);
+ const int multi = (est_motion > 2 && source_sad_nonrd > kLowSad) ? 7 : 8;
+
+ if (xd->up_available) {
+ const MB_MODE_INFO *above_mbmi = xd->above_mbmi;
+ if (above_mbmi->mode >= INTRA_MODE_END &&
+ above_mbmi->ref_frame[0] == LAST_FRAME) {
+ MV temp = above_mbmi->mv[0].as_mv;
+ clamp_mv(&temp, &subpel_mv_limits);
+ above_mv = get_fullmv_from_mv(&temp);
+
+ if (mv_distance(&best_mv, &above_mv) > 0) {
+ uint8_t const *ref_buf =
+ get_buf_from_fullmv(&xd->plane[0].pre[0], &above_mv);
+ above_y_sad = cpi->ppi->fn_ptr[bsize].sdf(
+ x->plane[0].src.buf, x->plane[0].src.stride, ref_buf,
+ xd->plane[0].pre[0].stride);
+ }
+ }
+ }
+ if (xd->left_available) {
+ const MB_MODE_INFO *left_mbmi = xd->left_mbmi;
+ if (left_mbmi->mode >= INTRA_MODE_END &&
+ left_mbmi->ref_frame[0] == LAST_FRAME) {
+ MV temp = left_mbmi->mv[0].as_mv;
+ clamp_mv(&temp, &subpel_mv_limits);
+ left_mv = get_fullmv_from_mv(&temp);
+
+ if (mv_distance(&best_mv, &left_mv) > 0 &&
+ mv_distance(&above_mv, &left_mv) > 0) {
+ uint8_t const *ref_buf =
+ get_buf_from_fullmv(&xd->plane[0].pre[0], &left_mv);
+ left_y_sad = cpi->ppi->fn_ptr[bsize].sdf(
+ x->plane[0].src.buf, x->plane[0].src.stride, ref_buf,
+ xd->plane[0].pre[0].stride);
+ }
+ }
+ }
+
+ if (above_y_sad < ((multi * *y_sad) >> 3) && above_y_sad < left_y_sad) {
+ *y_sad = above_y_sad;
+ mi->mv[0].as_mv = get_mv_from_fullmv(&above_mv);
+ clamp_mv(&mi->mv[0].as_mv, &subpel_mv_limits);
+ }
+ if (left_y_sad < ((multi * *y_sad) >> 3) && left_y_sad < above_y_sad) {
+ *y_sad = left_y_sad;
+ mi->mv[0].as_mv = get_mv_from_fullmv(&left_mv);
+ clamp_mv(&mi->mv[0].as_mv, &subpel_mv_limits);
+ }
+}
+
+static void setup_planes(AV1_COMP *cpi, MACROBLOCK *x, unsigned int *y_sad,
+ unsigned int *y_sad_g, unsigned int *y_sad_alt,
+ unsigned int *y_sad_last,
+ MV_REFERENCE_FRAME *ref_frame_partition,
+ struct scale_factors *sf_no_scale, int mi_row,
+ int mi_col, bool is_small_sb, bool scaled_ref_last) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int num_planes = av1_num_planes(cm);
+ bool scaled_ref_golden = false;
+ bool scaled_ref_alt = false;
+ BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
+ MB_MODE_INFO *mi = xd->mi[0];
+ const YV12_BUFFER_CONFIG *yv12 =
+ scaled_ref_last ? av1_get_scaled_ref_frame(cpi, LAST_FRAME)
+ : get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ assert(yv12 != NULL);
+ const YV12_BUFFER_CONFIG *yv12_g = NULL;
+ const YV12_BUFFER_CONFIG *yv12_alt = NULL;
+  // Check if LAST is a reference. For spatial layers, always use it as the
+  // reference (it may require reference scaling).
+ int use_last_ref = (cpi->ref_frame_flags & AOM_LAST_FLAG) ||
+ cpi->svc.number_spatial_layers > 1;
+ int use_golden_ref = cpi->ref_frame_flags & AOM_GOLD_FLAG;
+ int use_alt_ref = cpi->ppi->rtc_ref.set_ref_frame_config ||
+ cpi->sf.rt_sf.use_nonrd_altref_frame ||
+ (cpi->sf.rt_sf.use_comp_ref_nonrd &&
+ cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 1);
+
+ // For 1 spatial layer: GOLDEN is another temporal reference.
+ // Check if it should be used as reference for partitioning.
+ if (cpi->svc.number_spatial_layers == 1 && use_golden_ref &&
+ (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) {
+ yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
+ if (yv12_g && (yv12_g->y_crop_height != cm->height ||
+ yv12_g->y_crop_width != cm->width)) {
+ yv12_g = av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+ scaled_ref_golden = true;
+ }
+ if (yv12_g && yv12_g != yv12) {
+ av1_setup_pre_planes(
+ xd, 0, yv12_g, mi_row, mi_col,
+ scaled_ref_golden ? NULL : get_ref_scale_factors(cm, GOLDEN_FRAME),
+ num_planes);
+ *y_sad_g = cpi->ppi->fn_ptr[bsize].sdf(
+ x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride,
+ xd->plane[AOM_PLANE_Y].pre[0].buf,
+ xd->plane[AOM_PLANE_Y].pre[0].stride);
+ }
+ }
+
+ // For 1 spatial layer: ALTREF is another temporal reference.
+ // Check if it should be used as reference for partitioning.
+ if (cpi->svc.number_spatial_layers == 1 && use_alt_ref &&
+ (cpi->ref_frame_flags & AOM_ALT_FLAG) &&
+ (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) {
+ yv12_alt = get_ref_frame_yv12_buf(cm, ALTREF_FRAME);
+ if (yv12_alt && (yv12_alt->y_crop_height != cm->height ||
+ yv12_alt->y_crop_width != cm->width)) {
+ yv12_alt = av1_get_scaled_ref_frame(cpi, ALTREF_FRAME);
+ scaled_ref_alt = true;
+ }
+ if (yv12_alt && yv12_alt != yv12) {
+ av1_setup_pre_planes(
+ xd, 0, yv12_alt, mi_row, mi_col,
+ scaled_ref_alt ? NULL : get_ref_scale_factors(cm, ALTREF_FRAME),
+ num_planes);
+ *y_sad_alt = cpi->ppi->fn_ptr[bsize].sdf(
+ x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride,
+ xd->plane[AOM_PLANE_Y].pre[0].buf,
+ xd->plane[AOM_PLANE_Y].pre[0].stride);
+ }
+ }
+
+ if (use_last_ref) {
+ const int source_sad_nonrd = x->content_state_sb.source_sad_nonrd;
+ av1_setup_pre_planes(
+ xd, 0, yv12, mi_row, mi_col,
+ scaled_ref_last ? NULL : get_ref_scale_factors(cm, LAST_FRAME),
+ num_planes);
+ mi->ref_frame[0] = LAST_FRAME;
+ mi->ref_frame[1] = NONE_FRAME;
+ mi->bsize = cm->seq_params->sb_size;
+ mi->mv[0].as_int = 0;
+ mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+
+ int est_motion = cpi->sf.rt_sf.estimate_motion_for_var_based_partition;
+ // TODO(b/290596301): Look into adjusting this condition.
+    // There is a regression on color content when
+    // estimate_motion_for_var_based_partition = 3 and motion is high,
+    // so for now force it to 2 based on the superblock sad.
+ if (est_motion > 2 && source_sad_nonrd > kMedSad) est_motion = 2;
+
+ if (est_motion == 1 || est_motion == 2) {
+ if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
+        // For screen, only do int_pro_motion when the spatial variance is
+        // above a threshold and the motion level is above LowSad.
+ if (x->source_variance > 100 && source_sad_nonrd > kLowSad) {
+ int is_screen = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN;
+ int me_search_size_col =
+ is_screen ? 96 : block_size_wide[cm->seq_params->sb_size] >> 1;
+          // For screen, use a larger search size in the row direction to
+          // capture vertical scroll, which can be larger motion.
+ int me_search_size_row =
+ is_screen ? 192 : block_size_high[cm->seq_params->sb_size] >> 1;
+ unsigned int y_sad_zero;
+ *y_sad = av1_int_pro_motion_estimation(
+ cpi, x, cm->seq_params->sb_size, mi_row, mi_col, &kZeroMv,
+ &y_sad_zero, me_search_size_col, me_search_size_row);
+          // The logic below selects whether the motion estimated in
+ // int_pro_motion() will be used in nonrd_pickmode. Only do this
+ // for screen for now.
+ if (is_screen) {
+ unsigned int thresh_sad =
+ (cm->seq_params->sb_size == BLOCK_128X128) ? 50000 : 20000;
+ if (*y_sad < (y_sad_zero >> 1) && *y_sad < thresh_sad) {
+ x->sb_me_partition = 1;
+ x->sb_me_mv.as_int = mi->mv[0].as_int;
+ } else {
+ x->sb_me_partition = 0;
+ // Fall back to using zero motion.
+ *y_sad = y_sad_zero;
+ mi->mv[0].as_int = 0;
+ }
+ }
+ }
+ }
+ }
+
+ if (*y_sad == UINT_MAX) {
+ *y_sad = cpi->ppi->fn_ptr[bsize].sdf(
+ x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride,
+ xd->plane[AOM_PLANE_Y].pre[0].buf,
+ xd->plane[AOM_PLANE_Y].pre[0].stride);
+ }
+
+    // Evaluate if the neighbours' MVs give better predictions. The zero MV is
+    // tested already, so only non-zero MVs are tested here. Here the
+    // neighbour blocks are the first blocks above and to the left of this
+    // superblock.
+ if (est_motion >= 2 && (xd->up_available || xd->left_available))
+ evaluate_neighbour_mvs(cpi, x, y_sad, is_small_sb, est_motion);
+
+ *y_sad_last = *y_sad;
+ }
+
+  // Pick the ref frame for partitioning; use the golden or altref frame only
+  // if its sad is lower, with a bias to LAST by a factor of 0.9.
+ set_ref_frame_for_partition(cpi, x, xd, ref_frame_partition, mi, y_sad,
+ y_sad_g, y_sad_alt, yv12_g, yv12_alt, mi_row,
+ mi_col, num_planes);
+
+ // Only calculate the predictor for non-zero MV.
+ if (mi->mv[0].as_int != 0) {
+ if (!scaled_ref_last) {
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+ } else {
+ xd->block_ref_scale_factors[0] = sf_no_scale;
+ xd->block_ref_scale_factors[1] = sf_no_scale;
+ }
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL,
+ cm->seq_params->sb_size, AOM_PLANE_Y,
+ num_planes - 1);
+ }
+}
+
+// Decides whether to split or merge a 16x16 partition block in variance based
+// partitioning based on the 8x8 sub-block variances.
+static AOM_INLINE PART_EVAL_STATUS get_part_eval_based_on_sub_blk_var(
+ VP16x16 *var_16x16_info, int64_t threshold16) {
+ int max_8x8_var = 0, min_8x8_var = INT_MAX;
+ for (int split_idx = 0; split_idx < 4; split_idx++) {
+ get_variance(&var_16x16_info->split[split_idx].part_variances.none);
+ int this_8x8_var =
+ var_16x16_info->split[split_idx].part_variances.none.variance;
+ max_8x8_var = AOMMAX(this_8x8_var, max_8x8_var);
+ min_8x8_var = AOMMIN(this_8x8_var, min_8x8_var);
+ }
+ // If the difference between maximum and minimum sub-block variances is high,
+ // then only evaluate PARTITION_SPLIT for the 16x16 block. Otherwise, evaluate
+ // only PARTITION_NONE. The shift factor for threshold16 has been derived
+ // empirically.
+ return ((max_8x8_var - min_8x8_var) > (threshold16 << 2))
+ ? PART_EVAL_ONLY_SPLIT
+ : PART_EVAL_ONLY_NONE;
+}
+
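+// Maps the set_zeromv_skip_based_on_source_sad speed-feature level to a
+// source-SAD cutoff: 1 -> kZeroSad, 2 -> kVeryLowSad, 3 (and above) ->
+// kLowSad; level 0 disables the skip.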
+static AOM_INLINE bool is_set_force_zeromv_skip_based_on_src_sad(
+ int set_zeromv_skip_based_on_source_sad, SOURCE_SAD source_sad_nonrd) {
+ if (set_zeromv_skip_based_on_source_sad == 0) return false;
+
+ if (set_zeromv_skip_based_on_source_sad >= 3)
+ return source_sad_nonrd <= kLowSad;
+ else if (set_zeromv_skip_based_on_source_sad >= 2)
+ return source_sad_nonrd <= kVeryLowSad;
+ else if (set_zeromv_skip_based_on_source_sad >= 1)
+ return source_sad_nonrd == kZeroSad;
+
+ return false;
+}
+
+static AOM_INLINE bool set_force_zeromv_skip_for_sb(
+ AV1_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, VP128x128 *vt,
+ unsigned int *uv_sad, int mi_row, int mi_col, unsigned int y_sad,
+ BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (!is_set_force_zeromv_skip_based_on_src_sad(
+ cpi->sf.rt_sf.set_zeromv_skip_based_on_source_sad,
+ x->content_state_sb.source_sad_nonrd))
+ return false;
+ int shift = cpi->sf.rt_sf.increase_source_sad_thresh ? 1 : 0;
+ const int block_width = mi_size_wide[cm->seq_params->sb_size];
+ const int block_height = mi_size_high[cm->seq_params->sb_size];
+ const unsigned int thresh_exit_part_y =
+ cpi->zeromv_skip_thresh_exit_part[bsize] << shift;
+ unsigned int thresh_exit_part_uv =
+ CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y) << shift;
+  // Be more aggressive in the UV threshold if source_sad >= VeryLowSad
+  // to suppress visual artifacts caused by the speed feature:
+  // set_zeromv_skip_based_on_source_sad = 2. For now only for
+  // part_early_exit_zeromv = 1.
+ if (x->content_state_sb.source_sad_nonrd >= kVeryLowSad &&
+ cpi->sf.rt_sf.part_early_exit_zeromv == 1)
+ thresh_exit_part_uv = thresh_exit_part_uv >> 3;
+ if (mi_col + block_width <= tile->mi_col_end &&
+ mi_row + block_height <= tile->mi_row_end && y_sad < thresh_exit_part_y &&
+ uv_sad[0] < thresh_exit_part_uv && uv_sad[1] < thresh_exit_part_uv) {
+ set_block_size(cpi, mi_row, mi_col, bsize);
+ x->force_zeromv_skip_for_sb = 1;
+ aom_free(vt);
+ // Partition shape is set here at SB level.
+ // Exit needs to happen from av1_choose_var_based_partitioning().
+ return true;
+ } else if (x->content_state_sb.source_sad_nonrd == kZeroSad &&
+ cpi->sf.rt_sf.part_early_exit_zeromv >= 2)
+ x->force_zeromv_skip_for_sb = 2;
+ return false;
+}
+
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+ ThreadData *td, MACROBLOCK *x, int mi_row,
+ int mi_col) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, choose_var_based_partitioning_time);
+#endif
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int64_t *const vbp_thresholds = cpi->vbp_info.thresholds;
+ PART_EVAL_STATUS force_split[85];
+ int avg_64x64;
+ int max_var_32x32[4];
+ int min_var_32x32[4];
+ int var_32x32;
+ int var_64x64;
+ int min_var_64x64 = INT_MAX;
+ int max_var_64x64 = 0;
+ int avg_16x16[4][4];
+ int maxvar_16x16[4][4];
+ int minvar_16x16[4][4];
+ const uint8_t *src_buf;
+ const uint8_t *dst_buf;
+ int dst_stride;
+ unsigned int uv_sad[MAX_MB_PLANE - 1];
+ NOISE_LEVEL noise_level = kLow;
+ bool is_zero_motion = true;
+ bool scaled_ref_last = false;
+ struct scale_factors sf_no_scale;
+ av1_setup_scale_factors_for_frame(&sf_no_scale, cm->width, cm->height,
+ cm->width, cm->height);
+
+ bool is_key_frame =
+ (frame_is_intra_only(cm) ||
+ (cpi->ppi->use_svc &&
+ cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
+
+ assert(cm->seq_params->sb_size == BLOCK_64X64 ||
+ cm->seq_params->sb_size == BLOCK_128X128);
+ const bool is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
+ const int num_64x64_blocks = is_small_sb ? 1 : 4;
+
+ unsigned int y_sad = UINT_MAX;
+ unsigned int y_sad_g = UINT_MAX;
+ unsigned int y_sad_alt = UINT_MAX;
+ unsigned int y_sad_last = UINT_MAX;
+ BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
+
+ // Ref frame used in partitioning.
+ MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
+
+ int64_t thresholds[5] = { vbp_thresholds[0], vbp_thresholds[1],
+ vbp_thresholds[2], vbp_thresholds[3],
+ vbp_thresholds[4] };
+
+ const int segment_id = xd->mi[0]->segment_id;
+ uint64_t blk_sad = 0;
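+  // cpi->src_sad_blk_64x64 stores source SAD per 64x64 block; it is indexed
+  // on a 64x64 grid regardless of the superblock size (hence mib_size >> 1
+  // for 128x128 superblocks).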
+ if (cpi->src_sad_blk_64x64 != NULL &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+ const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+ ? (cm->seq_params->mib_size >> 1)
+ : cm->seq_params->mib_size;
+ const int sb_cols =
+ (cm->mi_params.mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
+ const int sbi_col = mi_col / sb_size_by_mb;
+ const int sbi_row = mi_row / sb_size_by_mb;
+ blk_sad = cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols];
+ }
+
+ const bool is_segment_id_boosted =
+ cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+ cyclic_refresh_segment_id_boosted(segment_id);
+ const int qindex =
+ is_segment_id_boosted
+ ? av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex)
+ : cm->quant_params.base_qindex;
+ set_vbp_thresholds(
+ cpi, thresholds, blk_sad, qindex, x->content_state_sb.low_sumdiff,
+ x->content_state_sb.source_sad_nonrd, x->content_state_sb.source_sad_rd,
+ is_segment_id_boosted, x->content_state_sb.lighting_change);
+
+ src_buf = x->plane[AOM_PLANE_Y].src.buf;
+ int src_stride = x->plane[AOM_PLANE_Y].src.stride;
+
+  // Index for force_split: 0 for the whole superblock (128x128), 1-4 for the
+  // 64x64 blocks, 5-20 for the 32x32 blocks, and 21-84 for the 16x16 blocks.
+ force_split[0] = PART_EVAL_ALL;
+ memset(x->part_search_info.variance_low, 0,
+ sizeof(x->part_search_info.variance_low));
+
+  // Check if the LAST frame is NULL, and if so, treat this frame
+  // as a key frame for the purpose of the superblock partitioning.
+  // LAST == NULL can happen in cases where enhancement spatial layers are
+  // enabled dynamically and the only reference is the spatial one (GOLDEN).
+  // If the LAST frame has a different resolution: set the scaled_ref_last
+  // flag and check if ref_scaled is NULL.
+ if (!frame_is_intra_only(cm)) {
+ const YV12_BUFFER_CONFIG *ref = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ if (ref == NULL) {
+ is_key_frame = true;
+ } else if (ref->y_crop_height != cm->height ||
+ ref->y_crop_width != cm->width) {
+ scaled_ref_last = true;
+ const YV12_BUFFER_CONFIG *ref_scaled =
+ av1_get_scaled_ref_frame(cpi, LAST_FRAME);
+ if (ref_scaled == NULL) is_key_frame = true;
+ }
+ }
+
+ x->source_variance = UINT_MAX;
+  // For nonrd_pickmode: compute source_variance, only for superblocks with
+  // some motion for now. This input can then be used to bias the partitioning
+  // or the chroma_check.
+ if (cpi->sf.rt_sf.use_nonrd_pick_mode &&
+ x->content_state_sb.source_sad_nonrd > kLowSad)
+ x->source_variance = av1_get_perpixel_variance_facade(
+ cpi, xd, &x->plane[0].src, cm->seq_params->sb_size, AOM_PLANE_Y);
+
+ if (!is_key_frame) {
+ setup_planes(cpi, x, &y_sad, &y_sad_g, &y_sad_alt, &y_sad_last,
+ &ref_frame_partition, &sf_no_scale, mi_row, mi_col,
+ is_small_sb, scaled_ref_last);
+
+ MB_MODE_INFO *mi = xd->mi[0];
+ // Use reference SB directly for zero mv.
+ if (mi->mv[0].as_int != 0) {
+ dst_buf = xd->plane[AOM_PLANE_Y].dst.buf;
+ dst_stride = xd->plane[AOM_PLANE_Y].dst.stride;
+ is_zero_motion = false;
+ } else {
+ dst_buf = xd->plane[AOM_PLANE_Y].pre[0].buf;
+ dst_stride = xd->plane[AOM_PLANE_Y].pre[0].stride;
+ }
+ } else {
+ dst_buf = NULL;
+ dst_stride = 0;
+ }
+
+ // check and set the color sensitivity of sb.
+ av1_zero(uv_sad);
+ chroma_check(cpi, x, bsize, y_sad_last, y_sad_g, y_sad_alt, is_key_frame,
+ is_zero_motion, uv_sad);
+
+ x->force_zeromv_skip_for_sb = 0;
+
+ VP128x128 *vt;
+ AOM_CHECK_MEM_ERROR(xd->error_info, vt, aom_malloc(sizeof(*vt)));
+ vt->split = td->vt64x64;
+
+  // If the superblock is completely static (zero source sad) and the y_sad
+  // (relative to the LAST ref) is very small, take the sb_size partition and
+  // exit, and force zeromv_last skip mode for nonrd_pickmode.
+  // Only do this on the base segment (so the QP-boosted segment, if applied,
+  // can still continue cleaning/ramping up the quality).
+  // A condition on the color uv_sad is also added.
+ if (!is_key_frame && cpi->sf.rt_sf.part_early_exit_zeromv &&
+ cpi->rc.frames_since_key > 30 && segment_id == CR_SEGMENT_ID_BASE &&
+ ref_frame_partition == LAST_FRAME && xd->mi[0]->mv[0].as_int == 0) {
+ // Exit here, if zero mv skip flag is set at SB level.
+ if (set_force_zeromv_skip_for_sb(cpi, x, tile, vt, uv_sad, mi_row, mi_col,
+ y_sad, bsize))
+ return 0;
+ }
+
+ if (cpi->noise_estimate.enabled)
+ noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate);
+
+ // Fill in the entire tree of 8x8 (for inter frames) or 4x4 (for key frames)
+ // variances for splits.
+ fill_variance_tree_leaves(cpi, x, vt, force_split, avg_16x16, maxvar_16x16,
+ minvar_16x16, thresholds, src_buf, src_stride,
+ dst_buf, dst_stride, is_key_frame, is_small_sb);
+
+ avg_64x64 = 0;
+ for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; ++blk64_idx) {
+ max_var_32x32[blk64_idx] = 0;
+ min_var_32x32[blk64_idx] = INT_MAX;
+ const int blk64_scale_idx = blk64_idx << 2;
+ for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) {
+ const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2;
+ for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) {
+ if (!is_key_frame) continue;
+ VP16x16 *vtemp = &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx];
+ for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++)
+ fill_variance_tree(&vtemp->split[lvl3_idx], BLOCK_8X8);
+ fill_variance_tree(vtemp, BLOCK_16X16);
+ // If variance of this 16x16 block is above the threshold, force block
+ // to split. This also forces a split on the upper levels.
+ get_variance(&vtemp->part_variances.none);
+ if (vtemp->part_variances.none.variance > thresholds[3]) {
+ const int split_index = 21 + lvl1_scale_idx + lvl2_idx;
+ force_split[split_index] =
+ cpi->sf.rt_sf.vbp_prune_16x16_split_using_min_max_sub_blk_var
+ ? get_part_eval_based_on_sub_blk_var(vtemp, thresholds[3])
+ : PART_EVAL_ONLY_SPLIT;
+ force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT;
+ force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+ }
+ }
+ fill_variance_tree(&vt->split[blk64_idx].split[lvl1_idx], BLOCK_32X32);
+      // If the variance of this 32x32 block is above the threshold, or if
+      // it's above (some threshold of) the average variance over the
+      // sub-16x16 blocks, then force this block to split. This also forces a
+      // split on the upper (64x64) level.
+ uint64_t frame_sad_thresh = 20000;
+ const int is_360p_or_smaller = cm->width * cm->height <= RESOLUTION_360P;
+ if (cpi->svc.number_temporal_layers > 2 &&
+ cpi->svc.temporal_layer_id == 0)
+ frame_sad_thresh = frame_sad_thresh << 1;
+ if (force_split[5 + blk64_scale_idx + lvl1_idx] == PART_EVAL_ALL) {
+ get_variance(&vt->split[blk64_idx].split[lvl1_idx].part_variances.none);
+ var_32x32 =
+ vt->split[blk64_idx].split[lvl1_idx].part_variances.none.variance;
+ max_var_32x32[blk64_idx] = AOMMAX(var_32x32, max_var_32x32[blk64_idx]);
+ min_var_32x32[blk64_idx] = AOMMIN(var_32x32, min_var_32x32[blk64_idx]);
+ const int max_min_var_16X16_diff = (maxvar_16x16[blk64_idx][lvl1_idx] -
+ minvar_16x16[blk64_idx][lvl1_idx]);
+
+ if (var_32x32 > thresholds[2] ||
+ (!is_key_frame && var_32x32 > (thresholds[2] >> 1) &&
+ var_32x32 > (avg_16x16[blk64_idx][lvl1_idx] >> 1))) {
+ force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT;
+ force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+ } else if (!is_key_frame && is_360p_or_smaller &&
+ ((max_min_var_16X16_diff > (thresholds[2] >> 1) &&
+ maxvar_16x16[blk64_idx][lvl1_idx] > thresholds[2]) ||
+ (cpi->sf.rt_sf.prefer_large_partition_blocks &&
+ x->content_state_sb.source_sad_nonrd > kLowSad &&
+ cpi->rc.frame_source_sad < frame_sad_thresh &&
+ maxvar_16x16[blk64_idx][lvl1_idx] > (thresholds[2] >> 4) &&
+ maxvar_16x16[blk64_idx][lvl1_idx] >
+ (minvar_16x16[blk64_idx][lvl1_idx] << 2)))) {
+ force_split[5 + blk64_scale_idx + lvl1_idx] = PART_EVAL_ONLY_SPLIT;
+ force_split[blk64_idx + 1] = PART_EVAL_ONLY_SPLIT;
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+ }
+ }
+ }
+ if (force_split[1 + blk64_idx] == PART_EVAL_ALL) {
+ fill_variance_tree(&vt->split[blk64_idx], BLOCK_64X64);
+ get_variance(&vt->split[blk64_idx].part_variances.none);
+ var_64x64 = vt->split[blk64_idx].part_variances.none.variance;
+ max_var_64x64 = AOMMAX(var_64x64, max_var_64x64);
+ min_var_64x64 = AOMMIN(var_64x64, min_var_64x64);
+      // If the difference of the max-min variances of the sub-blocks, or the
+      // max variance of a sub-block, is above some threshold, then force
+      // this block to split. Only check this for noise level >= medium, if
+      // the encoder is in SVC, or if we already prefer large partition
+      // blocks.
+ const int max_min_var_32x32_diff =
+ max_var_32x32[blk64_idx] - min_var_32x32[blk64_idx];
+ const int check_max_var = max_var_32x32[blk64_idx] > thresholds[1] >> 1;
+ const bool check_noise_lvl = noise_level >= kMedium ||
+ cpi->ppi->use_svc ||
+ cpi->sf.rt_sf.prefer_large_partition_blocks;
+ const int64_t set_threshold = 3 * (thresholds[1] >> 3);
+
+ if (!is_key_frame && max_min_var_32x32_diff > set_threshold &&
+ check_max_var && check_noise_lvl) {
+ force_split[1 + blk64_idx] = PART_EVAL_ONLY_SPLIT;
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+ }
+ avg_64x64 += var_64x64;
+ }
+ if (is_small_sb) force_split[0] = PART_EVAL_ONLY_SPLIT;
+ }
+
+ if (force_split[0] == PART_EVAL_ALL) {
+ fill_variance_tree(vt, BLOCK_128X128);
+ get_variance(&vt->part_variances.none);
+ const int set_avg_64x64 = (9 * avg_64x64) >> 5;
+ if (!is_key_frame && vt->part_variances.none.variance > set_avg_64x64)
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+
+ if (!is_key_frame &&
+ (max_var_64x64 - min_var_64x64) > 3 * (thresholds[0] >> 3) &&
+ max_var_64x64 > thresholds[0] >> 1)
+ force_split[0] = PART_EVAL_ONLY_SPLIT;
+ }
+
+ if (mi_col + 32 > tile->mi_col_end || mi_row + 32 > tile->mi_row_end ||
+ !set_vt_partitioning(cpi, xd, tile, vt, BLOCK_128X128, mi_row, mi_col,
+ thresholds[0], BLOCK_16X16, force_split[0])) {
+ for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; ++blk64_idx) {
+ const int x64_idx = GET_BLK_IDX_X(blk64_idx, 4);
+ const int y64_idx = GET_BLK_IDX_Y(blk64_idx, 4);
+ const int blk64_scale_idx = blk64_idx << 2;
+
+ // Now go through the entire structure, splitting every block size until
+ // we get to one that's got a variance lower than our threshold.
+ if (set_vt_partitioning(cpi, xd, tile, &vt->split[blk64_idx], BLOCK_64X64,
+ mi_row + y64_idx, mi_col + x64_idx, thresholds[1],
+ BLOCK_16X16, force_split[1 + blk64_idx]))
+ continue;
+ for (int lvl1_idx = 0; lvl1_idx < 4; ++lvl1_idx) {
+ const int x32_idx = GET_BLK_IDX_X(lvl1_idx, 3);
+ const int y32_idx = GET_BLK_IDX_Y(lvl1_idx, 3);
+ const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2;
+ if (set_vt_partitioning(
+ cpi, xd, tile, &vt->split[blk64_idx].split[lvl1_idx],
+ BLOCK_32X32, (mi_row + y64_idx + y32_idx),
+ (mi_col + x64_idx + x32_idx), thresholds[2], BLOCK_16X16,
+ force_split[5 + blk64_scale_idx + lvl1_idx]))
+ continue;
+ for (int lvl2_idx = 0; lvl2_idx < 4; ++lvl2_idx) {
+ const int x16_idx = GET_BLK_IDX_X(lvl2_idx, 2);
+ const int y16_idx = GET_BLK_IDX_Y(lvl2_idx, 2);
+ const int split_index = 21 + lvl1_scale_idx + lvl2_idx;
+ VP16x16 *vtemp =
+ &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx];
+ if (set_vt_partitioning(cpi, xd, tile, vtemp, BLOCK_16X16,
+ mi_row + y64_idx + y32_idx + y16_idx,
+ mi_col + x64_idx + x32_idx + x16_idx,
+ thresholds[3], BLOCK_8X8,
+ force_split[split_index]))
+ continue;
+ for (int lvl3_idx = 0; lvl3_idx < 4; ++lvl3_idx) {
+ const int x8_idx = GET_BLK_IDX_X(lvl3_idx, 1);
+ const int y8_idx = GET_BLK_IDX_Y(lvl3_idx, 1);
+ set_block_size(cpi, (mi_row + y64_idx + y32_idx + y16_idx + y8_idx),
+ (mi_col + x64_idx + x32_idx + x16_idx + x8_idx),
+ BLOCK_8X8);
+ }
+ }
+ }
+ }
+ }
+
+ if (cpi->sf.rt_sf.short_circuit_low_temp_var) {
+ set_low_temp_var_flag(cpi, &x->part_search_info, xd, vt, thresholds,
+ ref_frame_partition, mi_col, mi_row, is_small_sb);
+ }
+
+ aom_free(vt);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, choose_var_based_partitioning_time);
+#endif
+ return 0;
+}
diff --git a/third_party/aom/av1/encoder/var_based_part.h b/third_party/aom/av1/encoder/var_based_part.h
new file mode 100644
index 0000000000..f912458307
--- /dev/null
+++ b/third_party/aom/av1/encoder/var_based_part.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_VAR_BASED_PART_H_
+#define AOM_AV1_ENCODER_VAR_BASED_PART_H_
+
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/encoder.h"
+
+// Calculate block index x and y from split level and index
+#define GET_BLK_IDX_X(idx, level) (((idx) & (0x01)) << (level))
+#define GET_BLK_IDX_Y(idx, level) (((idx) >> (0x01)) << (level))
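+// For a raster quadrant index idx in 0..3, bit 0 selects the column and bit 1
+// selects the row; level sets the unit. E.g., GET_BLK_IDX_X(3, 4) = 16 and
+// GET_BLK_IDX_Y(3, 4) = 16: the bottom-right 64x64 quadrant of a 128x128
+// superblock starts 16 MI units right of and below the superblock origin.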
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define QINDEX_LARGE_BLOCK_THR \
+ 100 // Use increased thresholds for midres for speed 9 when qindex is above
+ // this threshold
+
+#define CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part) \
+ ((3 * (thresh_exit_part)) >> 2)
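+// i.e., the chroma exit threshold is 3/4 of the luma exit threshold.
+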
+/*!\brief Set the thresholds for variance based partition.
+ *
+ * Set the variance split thresholds for the following block sizes:
+ * 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32,
+ * 3 - vbp_threshold_16x16. 4 - vbp_threshold_8x8 (to split to 4x4 partition)
+ * is currently only used on key frames. The thresholds are based on Q,
+ * resolution, noise level, and content state.
+ *
+ * \ingroup variance_partition
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] q q index
+ * \param[in] content_lowsumdiff Low sumdiff flag for superblock
+ *
+ * \remark Returns the set of thresholds in \c cpi->vbp_info.thresholds.
+ */
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
+ int content_lowsumdiff);
+
+/*!\brief Variance based partition selection.
+ *
+ * Select the partitioning based on the variance of the residual signal,
+ * where the residual is generated as the difference between the source and
+ * the prediction. The prediction is the reconstructed LAST or reconstructed
+ * GOLDEN, whichever has the lower y sad. For LAST, an option exists (speed
+ * feature) to use motion compensation based on superblock motion via
+ * int_pro_motion_estimation. For key frames the reference is a fixed 128
+ * level, so the variance is the source variance. The variance is computed
+ * for downsampled inputs (8x8 or 4x4 downsampled), and selection is done
+ * top-down via a set of partition thresholds, defined for each block level
+ * and set based on Q, resolution, noise level, and content state.
+ *
+ * \ingroup variance_partition
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in] cpi Top level encoder structure
+ * \param[in] tile Pointer to TileInfo
+ * \param[in] td Pointer to ThreadData
+ * \param[in] x Pointer to MACROBLOCK
 * \param[in]   mi_row        Row coordinate of the superblock in a step
 *                            size of MI_SIZE
 * \param[in]   mi_col        Column coordinate of the superblock in a step
 *                            size of MI_SIZE
+ *
+ * \return Returns the partition in \c xd->mi[0]->sb_type. Also sets the low
+ * temporal variance flag and the color sensitivity flag (both used in
+ * nonrd_pickmode).
+ */
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+ ThreadData *td, MACROBLOCK *x, int mi_row,
+ int mi_col);
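+
+// A minimal usage sketch (hypothetical caller; assumes cpi, tile, td, and x
+// have already been set up by the encoder, and that the thresholds are
+// refreshed once per frame before per-superblock partitioning):
+//   av1_set_variance_partition_thresholds(cpi, base_qindex,
+//                                         /*content_lowsumdiff=*/0);
+//   for (mi_row = 0; mi_row < mi_rows; mi_row += sb_step)
+//     for (mi_col = 0; mi_col < mi_cols; mi_col += sb_step)
+//       av1_choose_var_based_partitioning(cpi, tile, td, x, mi_row, mi_col);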
+
+// Read out the block's temporal variance for the 64x64 SB case.
+int av1_get_force_skip_low_temp_var_small_sb(const uint8_t *variance_low,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+// Read out the block's temporal variance for the 128x128 SB case.
+int av1_get_force_skip_low_temp_var(const uint8_t *variance_low, int mi_row,
+ int mi_col, BLOCK_SIZE bsize);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_VAR_BASED_PART_H_
diff --git a/third_party/aom/av1/encoder/wedge_utils.c b/third_party/aom/av1/encoder/wedge_utils.c
new file mode 100644
index 0000000000..40670178d7
--- /dev/null
+++ b/third_party/aom/av1/encoder/wedge_utils.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+
+#include "aom_ports/mem.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * Computes the SSE of a compound predictor constructed from 2 fundamental
+ * predictors p0 and p1 using blending with a mask.
+ *
+ * r1: Residuals of p1.
+ * (source - p1)
+ * d: Difference of p1 and p0.
+ * (p1 - p0)
+ * m: The blending mask
+ * N: Number of pixels
+ *
+ * 'r1', 'd', and 'm' are contiguous.
+ *
+ * Computes:
+ * Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
+ * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
+ * where r0 is (source - p0) and r1 is (source - p1), which in turn is
+ * equivalent to:
+ * Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
+ * which is the SSE of the residuals of the compound predictor scaled up by
+ * MAX_MASK_VALUE**2.
+ *
+ * Note that we clamp the partial term in the loop to 16 bits signed. This is
+ * to facilitate an equivalent SIMD implementation. It should have no effect
+ * if residuals fit in 16 - WEDGE_WEIGHT_BITS (= 10) signed bits, which
+ * always holds for 8 bit input; on real input it should hold practically
+ * always, as residuals are expected to be small.
+ */
+uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ uint64_t csse = 0;
+ int i;
+
+ for (i = 0; i < N; i++) {
+ int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
+ t = clamp(t, INT16_MIN, INT16_MAX);
+ csse += t * t;
+ }
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
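+
+#if 0
+// Reference sketch (kept out of the build; not part of libaom): the same
+// quantity computed directly from the residual vectors r0 and r1, without
+// the 16-bit clamp the function above applies for SIMD equivalence.
+static uint64_t wedge_sse_reference(const int16_t *r0, const int16_t *r1,
+                                    const uint8_t *m, int N) {
+  uint64_t sse = 0;
+  for (int i = 0; i < N; i++) {
+    // Blend the two residuals with the mask; the sum is the compound
+    // residual scaled up by MAX_MASK_VALUE.
+    const int64_t t =
+        (int64_t)m[i] * r0[i] + (int64_t)(MAX_MASK_VALUE - m[i]) * r1[i];
+    sse += (uint64_t)(t * t);
+  }
+  return ROUND_POWER_OF_TWO(sse, 2 * WEDGE_WEIGHT_BITS);
+}
+#endif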
+
+/**
+ * Choose the mask sign for a compound predictor.
+ *
+ * ds: Difference of the squares of the residuals.
+ * r0**2 - r1**2
+ * m: The blending mask
+ * N: Number of pixels
+ * limit: Pre-computed threshold value.
+ * MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * 'ds' and 'm' are contiguous.
+ *
+ * Returns true if the negated mask has lower SSE compared to the positive
+ * mask. Computation is based on:
+ * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2)
+ * >
+ * Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2)
+ *
+ * which can be simplified to:
+ *
+ * Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * The right hand side does not depend on the mask, and needs to be passed as
+ * the 'limit' parameter.
+ *
+ * After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left
+ * hand side is simply a scalar product between an int16_t and uint8_t vector.
+ *
+ * Note that for efficiency, ds is stored in 16 bits. Since real input
+ * residuals are small, this should not cause a noticeable issue.
+ */
+int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int64_t acc = 0;
+
+ do {
+ acc += *ds++ * *m++;
+ } while (--N);
+
+ return acc > limit;
+}
+
+/**
+ * Compute the element-wise difference of the squares of 2 arrays.
+ *
+ * d: Difference of the squares of the inputs: a**2 - b**2
+ * a: First input array
+ * b: Second input array
+ * N: Number of elements
+ *
+ * 'd', 'a', and 'b' are contiguous.
+ *
+ * The result is saturated to signed 16 bits.
+ */
+void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ int i;
+
+ for (i = 0; i < N; i++)
+ d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX);
+}
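+
+#if 0
+// Illustrative sketch (kept out of the build): how the helpers above combine
+// to pick a wedge sign. Per the comment on av1_wedge_sign_from_residuals_c,
+// the caller precomputes `limit` as MAX_MASK_VALUE/2 * (sum(r0**2) -
+// sum(r1**2)); the helper name here is hypothetical.
+static int8_t choose_wedge_sign_sketch(const int16_t *r0, const int16_t *r1,
+                                       const uint8_t *m, int16_t *ds, int N) {
+  int64_t sum0 = 0, sum1 = 0;
+  for (int i = 0; i < N; i++) {
+    sum0 += r0[i] * r0[i];
+    sum1 += r1[i] * r1[i];
+  }
+  const int64_t limit = (MAX_MASK_VALUE / 2) * (sum0 - sum1);
+  // ds holds the saturated per-pixel r0^2 - r1^2 values.
+  av1_wedge_compute_delta_squares_c(ds, r0, r1, N);
+  return av1_wedge_sign_from_residuals_c(ds, m, N, limit);
+}
+#endif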
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
new file mode 100644
index 0000000000..494b0fdf15
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -0,0 +1,1409 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+
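+// 32-point forward DCT on four 32-bit lanes per __m128i. Inputs are paired
+// symmetrically (row 0 with row 31, 1 with 30, ...) by walking startidx up
+// and endidx down in units of `stride`, so the stage-1 butterflies need no
+// gather or shuffle.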
+void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+ const int stride) {
+ __m128i buf0[32];
+ __m128i buf1[32];
+ const int32_t *cospi;
+
+ int startidx = 0 * stride;
+ int endidx = 31 * stride;
+ // stage 0
+ // stage 1
+ buf1[0] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[31] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[1] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[30] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[2] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[29] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[3] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[28] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[4] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[27] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[5] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[26] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[6] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[25] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[7] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[24] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[8] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[23] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[9] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[22] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[10] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[21] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[11] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[20] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[12] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[19] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[13] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[18] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[14] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[17] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += stride;
+ endidx -= stride;
+ buf1[15] = _mm_add_epi32(input[startidx], input[endidx]);
+ buf1[16] = _mm_sub_epi32(input[startidx], input[endidx]);
+
+ // stage 2
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[15]);
+ buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[14]);
+ buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]);
+ buf0[2] = _mm_add_epi32(buf1[2], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]);
+ buf0[3] = _mm_add_epi32(buf1[3], buf1[12]);
+ buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]);
+ buf0[4] = _mm_add_epi32(buf1[4], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]);
+ buf0[5] = _mm_add_epi32(buf1[5], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]);
+ buf0[6] = _mm_add_epi32(buf1[6], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]);
+ buf0[7] = _mm_add_epi32(buf1[7], buf1[8]);
+ buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ cospi = cospi_arr(cos_bit);
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
+ buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
+ buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
+ buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]);
+ buf1[18] = _mm_add_epi32(buf0[18], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]);
+ buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[24]);
+ buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]);
+ buf1[29] = _mm_add_epi32(buf0[29], buf0[26]);
+ buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[27]);
+
+ // stage 4
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
+ buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
+ buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
+ buf0[4] = buf1[4];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[7] = buf1[7];
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
+ buf0[9] = _mm_add_epi32(buf1[9], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]);
+ buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[12]);
+ buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]);
+ buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ cospi = cospi_arr(cos_bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
+ cos_bit);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
+ buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
+ buf1[8] = buf0[8];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
+ buf1[14], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]);
+ buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[20]);
+ buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]);
+ buf1[22] = _mm_add_epi32(buf0[22], buf0[21]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]);
+ buf1[25] = _mm_add_epi32(buf0[25], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]);
+ buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[28]);
+ buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[29]);
+
+ // stage 6
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
+ buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
+ buf0[11] = _mm_add_epi32(buf1[11], buf1[10]);
+ buf0[12] = _mm_add_epi32(buf1[12], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]);
+ buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
+ buf0[16] = buf1[16];
+ btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ cospi = cospi_arr(cos_bit);
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
+ buf1[14], cos_bit);
+ btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
+ buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
+ buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[18]);
+ buf1[20] = _mm_add_epi32(buf0[20], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]);
+ buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[22]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]);
+ buf1[27] = _mm_add_epi32(buf0[27], buf0[26]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]);
+ buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[30]);
+
+ // stage 8
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
+ buf0[31], cos_bit);
+ btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+
+ startidx = 0 * stride;
+ endidx = 31 * stride;
+ // stage 9
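+  // The outputs are written in bit-reversed index order (0, 16, 8, 24, ...),
+  // filling the destination from both ends at once.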
+ output[startidx] = buf0[0];
+ output[endidx] = buf0[31];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[16];
+ output[endidx] = buf0[15];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[8];
+ output[endidx] = buf0[23];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[24];
+ output[endidx] = buf0[7];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[4];
+ output[endidx] = buf0[27];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[20];
+ output[endidx] = buf0[11];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[12];
+ output[endidx] = buf0[19];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[28];
+ output[endidx] = buf0[3];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[2];
+ output[endidx] = buf0[29];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[18];
+ output[endidx] = buf0[13];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[10];
+ output[endidx] = buf0[21];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[26];
+ output[endidx] = buf0[5];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[6];
+ output[endidx] = buf0[25];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[22];
+ output[endidx] = buf0[9];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[14];
+ output[endidx] = buf0[17];
+ startidx += stride;
+ endidx -= stride;
+ output[startidx] = buf0[30];
+ output[endidx] = buf0[1];
+}
+
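+// 4-point forward ADST on four 32-bit lanes. With txfm_size == num_per_128
+// == 4, col_num is 1, so the column loop body runs exactly once.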
+void av1_fadst4_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 4;
+ const int num_per_128 = 4;
+ const int32_t *cospi;
+ __m128i buf0[4];
+ __m128i buf1[4];
+ int col_num = txfm_size / num_per_128;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+ // stage 0;
+ int j;
+ for (j = 0; j < 4; ++j) {
+ buf0[j] = input[j * col_num + col];
+ }
+
+ // stage 1
+ buf1[0] = buf0[3];
+ buf1[1] = buf0[0];
+ buf1[2] = buf0[1];
+ buf1[3] = buf0[2];
+
+ // stage 2
+ cospi = cospi_arr(cos_bit);
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
+ cos_bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
+ buf0[3], cos_bit);
+
+ // stage 3
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+ buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+ buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+
+ // stage 4
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+ buf0[3], cos_bit);
+
+ // stage 5
+ buf1[0] = buf0[0];
+ buf1[1] = _mm_sub_epi32(_mm_setzero_si128(), buf0[2]);
+ buf1[2] = buf0[3];
+ buf1[3] = _mm_sub_epi32(_mm_setzero_si128(), buf0[1]);
+
+ for (j = 0; j < 4; ++j) {
+ output[j * col_num + col] = buf1[j];
+ }
+ }
+}
+
+void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
+ const int instride, const int outstride) {
+ const int32_t *cospi = cospi_arr(cos_bit);
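+  // 1 << (cos_bit - 1) is the rounding offset applied before each
+  // butterfly's right shift by cos_bit, giving round-to-nearest behavior.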
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32 = _mm_set1_epi32(-cospi[32]);
+ __m128i cospi_p32 = _mm_set1_epi32(cospi[32]);
+ __m128i cospi_m16 = _mm_set1_epi32(-cospi[16]);
+ __m128i cospi_p48 = _mm_set1_epi32(cospi[48]);
+ __m128i cospi_m48 = _mm_set1_epi32(-cospi[48]);
+ __m128i cospi_p16 = _mm_set1_epi32(cospi[16]);
+ __m128i cospi_m08 = _mm_set1_epi32(-cospi[8]);
+ __m128i cospi_p56 = _mm_set1_epi32(cospi[56]);
+ __m128i cospi_m56 = _mm_set1_epi32(-cospi[56]);
+ __m128i cospi_m40 = _mm_set1_epi32(-cospi[40]);
+ __m128i cospi_p24 = _mm_set1_epi32(cospi[24]);
+ __m128i cospi_m24 = _mm_set1_epi32(-cospi[24]);
+ __m128i cospi_p08 = _mm_set1_epi32(cospi[8]);
+ __m128i cospi_p40 = _mm_set1_epi32(cospi[40]);
+ __m128i cospi_p60 = _mm_set1_epi32(cospi[60]);
+ __m128i cospi_p04 = _mm_set1_epi32(cospi[4]);
+ __m128i cospi_p28 = _mm_set1_epi32(cospi[28]);
+ __m128i cospi_p36 = _mm_set1_epi32(cospi[36]);
+ __m128i cospi_p44 = _mm_set1_epi32(cospi[44]);
+ __m128i cospi_p20 = _mm_set1_epi32(cospi[20]);
+ __m128i cospi_p12 = _mm_set1_epi32(cospi[12]);
+ __m128i cospi_p52 = _mm_set1_epi32(cospi[52]);
+ __m128i cospi_m04 = _mm_set1_epi32(-cospi[4]);
+ __m128i cospi_m60 = _mm_set1_epi32(-cospi[60]);
+ __m128i cospi_m36 = _mm_set1_epi32(-cospi[36]);
+ __m128i cospi_m28 = _mm_set1_epi32(-cospi[28]);
+ __m128i cospi_m20 = _mm_set1_epi32(-cospi[20]);
+ __m128i cospi_m44 = _mm_set1_epi32(-cospi[44]);
+ __m128i cospi_m52 = _mm_set1_epi32(-cospi[52]);
+ __m128i cospi_m12 = _mm_set1_epi32(-cospi[12]);
+ __m128i cospi_p62 = _mm_set1_epi32(cospi[62]);
+ __m128i cospi_p02 = _mm_set1_epi32(cospi[2]);
+ __m128i cospi_p30 = _mm_set1_epi32(cospi[30]);
+ __m128i cospi_p34 = _mm_set1_epi32(cospi[34]);
+ __m128i cospi_p46 = _mm_set1_epi32(cospi[46]);
+ __m128i cospi_p18 = _mm_set1_epi32(cospi[18]);
+ __m128i cospi_p14 = _mm_set1_epi32(cospi[14]);
+ __m128i cospi_p50 = _mm_set1_epi32(cospi[50]);
+ __m128i cospi_p54 = _mm_set1_epi32(cospi[54]);
+ __m128i cospi_p10 = _mm_set1_epi32(cospi[10]);
+ __m128i cospi_p22 = _mm_set1_epi32(cospi[22]);
+ __m128i cospi_p42 = _mm_set1_epi32(cospi[42]);
+ __m128i cospi_p38 = _mm_set1_epi32(cospi[38]);
+ __m128i cospi_p26 = _mm_set1_epi32(cospi[26]);
+ __m128i cospi_p06 = _mm_set1_epi32(cospi[6]);
+ __m128i cospi_p58 = _mm_set1_epi32(cospi[58]);
+ __m128i cospi_p63 = _mm_set1_epi32(cospi[63]);
+ __m128i cospi_p01 = _mm_set1_epi32(cospi[1]);
+ __m128i cospi_p31 = _mm_set1_epi32(cospi[31]);
+ __m128i cospi_p33 = _mm_set1_epi32(cospi[33]);
+ __m128i cospi_p47 = _mm_set1_epi32(cospi[47]);
+ __m128i cospi_p17 = _mm_set1_epi32(cospi[17]);
+ __m128i cospi_p15 = _mm_set1_epi32(cospi[15]);
+ __m128i cospi_p49 = _mm_set1_epi32(cospi[49]);
+ __m128i cospi_p55 = _mm_set1_epi32(cospi[55]);
+ __m128i cospi_p09 = _mm_set1_epi32(cospi[9]);
+ __m128i cospi_p23 = _mm_set1_epi32(cospi[23]);
+ __m128i cospi_p41 = _mm_set1_epi32(cospi[41]);
+ __m128i cospi_p39 = _mm_set1_epi32(cospi[39]);
+ __m128i cospi_p25 = _mm_set1_epi32(cospi[25]);
+ __m128i cospi_p07 = _mm_set1_epi32(cospi[7]);
+ __m128i cospi_p57 = _mm_set1_epi32(cospi[57]);
+ __m128i cospi_p59 = _mm_set1_epi32(cospi[59]);
+ __m128i cospi_p05 = _mm_set1_epi32(cospi[5]);
+ __m128i cospi_p27 = _mm_set1_epi32(cospi[27]);
+ __m128i cospi_p37 = _mm_set1_epi32(cospi[37]);
+ __m128i cospi_p43 = _mm_set1_epi32(cospi[43]);
+ __m128i cospi_p21 = _mm_set1_epi32(cospi[21]);
+ __m128i cospi_p11 = _mm_set1_epi32(cospi[11]);
+ __m128i cospi_p53 = _mm_set1_epi32(cospi[53]);
+ __m128i cospi_p51 = _mm_set1_epi32(cospi[51]);
+ __m128i cospi_p13 = _mm_set1_epi32(cospi[13]);
+ __m128i cospi_p19 = _mm_set1_epi32(cospi[19]);
+ __m128i cospi_p45 = _mm_set1_epi32(cospi[45]);
+ __m128i cospi_p35 = _mm_set1_epi32(cospi[35]);
+ __m128i cospi_p29 = _mm_set1_epi32(cospi[29]);
+ __m128i cospi_p03 = _mm_set1_epi32(cospi[3]);
+ __m128i cospi_p61 = _mm_set1_epi32(cospi[61]);
+
+ int startidx = 0 * instride;
+ int endidx = 63 * instride;
+ // stage 1
+ __m128i x1[64];
+ x1[0] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[63] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[1] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[62] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[2] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[61] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[3] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[60] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[4] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[59] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[5] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[58] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[6] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[57] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[7] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[56] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[8] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[55] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[9] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[54] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[10] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[53] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[11] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[52] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[12] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[51] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[13] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[50] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[14] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[49] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[15] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[48] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[16] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[47] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[17] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[46] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[18] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[45] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[19] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[44] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[20] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[43] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[21] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[42] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[22] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[41] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[23] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[40] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[24] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[39] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[25] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[38] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[26] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[37] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[27] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[36] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[28] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[35] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[29] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[34] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[30] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[33] = _mm_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[31] = _mm_add_epi32(input[startidx], input[endidx]);
+ x1[32] = _mm_sub_epi32(input[startidx], input[endidx]);
+
+ // stage 2
+ __m128i x2[64];
+ x2[0] = _mm_add_epi32(x1[0], x1[31]);
+ x2[31] = _mm_sub_epi32(x1[0], x1[31]);
+ x2[1] = _mm_add_epi32(x1[1], x1[30]);
+ x2[30] = _mm_sub_epi32(x1[1], x1[30]);
+ x2[2] = _mm_add_epi32(x1[2], x1[29]);
+ x2[29] = _mm_sub_epi32(x1[2], x1[29]);
+ x2[3] = _mm_add_epi32(x1[3], x1[28]);
+ x2[28] = _mm_sub_epi32(x1[3], x1[28]);
+ x2[4] = _mm_add_epi32(x1[4], x1[27]);
+ x2[27] = _mm_sub_epi32(x1[4], x1[27]);
+ x2[5] = _mm_add_epi32(x1[5], x1[26]);
+ x2[26] = _mm_sub_epi32(x1[5], x1[26]);
+ x2[6] = _mm_add_epi32(x1[6], x1[25]);
+ x2[25] = _mm_sub_epi32(x1[6], x1[25]);
+ x2[7] = _mm_add_epi32(x1[7], x1[24]);
+ x2[24] = _mm_sub_epi32(x1[7], x1[24]);
+ x2[8] = _mm_add_epi32(x1[8], x1[23]);
+ x2[23] = _mm_sub_epi32(x1[8], x1[23]);
+ x2[9] = _mm_add_epi32(x1[9], x1[22]);
+ x2[22] = _mm_sub_epi32(x1[9], x1[22]);
+ x2[10] = _mm_add_epi32(x1[10], x1[21]);
+ x2[21] = _mm_sub_epi32(x1[10], x1[21]);
+ x2[11] = _mm_add_epi32(x1[11], x1[20]);
+ x2[20] = _mm_sub_epi32(x1[11], x1[20]);
+ x2[12] = _mm_add_epi32(x1[12], x1[19]);
+ x2[19] = _mm_sub_epi32(x1[12], x1[19]);
+ x2[13] = _mm_add_epi32(x1[13], x1[18]);
+ x2[18] = _mm_sub_epi32(x1[13], x1[18]);
+ x2[14] = _mm_add_epi32(x1[14], x1[17]);
+ x2[17] = _mm_sub_epi32(x1[14], x1[17]);
+ x2[15] = _mm_add_epi32(x1[15], x1[16]);
+ x2[16] = _mm_sub_epi32(x1[15], x1[16]);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48],
+ __rounding, cos_bit);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+
+ // stage 3
+ __m128i x3[64];
+ x3[0] = _mm_add_epi32(x2[0], x2[15]);
+ x3[15] = _mm_sub_epi32(x2[0], x2[15]);
+ x3[1] = _mm_add_epi32(x2[1], x2[14]);
+ x3[14] = _mm_sub_epi32(x2[1], x2[14]);
+ x3[2] = _mm_add_epi32(x2[2], x2[13]);
+ x3[13] = _mm_sub_epi32(x2[2], x2[13]);
+ x3[3] = _mm_add_epi32(x2[3], x2[12]);
+ x3[12] = _mm_sub_epi32(x2[3], x2[12]);
+ x3[4] = _mm_add_epi32(x2[4], x2[11]);
+ x3[11] = _mm_sub_epi32(x2[4], x2[11]);
+ x3[5] = _mm_add_epi32(x2[5], x2[10]);
+ x3[10] = _mm_sub_epi32(x2[5], x2[10]);
+ x3[6] = _mm_add_epi32(x2[6], x2[9]);
+ x3[9] = _mm_sub_epi32(x2[6], x2[9]);
+ x3[7] = _mm_add_epi32(x2[7], x2[8]);
+ x3[8] = _mm_sub_epi32(x2[7], x2[8]);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24],
+ __rounding, cos_bit);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ x3[32] = _mm_add_epi32(x2[32], x2[47]);
+ x3[47] = _mm_sub_epi32(x2[32], x2[47]);
+ x3[33] = _mm_add_epi32(x2[33], x2[46]);
+ x3[46] = _mm_sub_epi32(x2[33], x2[46]);
+ x3[34] = _mm_add_epi32(x2[34], x2[45]);
+ x3[45] = _mm_sub_epi32(x2[34], x2[45]);
+ x3[35] = _mm_add_epi32(x2[35], x2[44]);
+ x3[44] = _mm_sub_epi32(x2[35], x2[44]);
+ x3[36] = _mm_add_epi32(x2[36], x2[43]);
+ x3[43] = _mm_sub_epi32(x2[36], x2[43]);
+ x3[37] = _mm_add_epi32(x2[37], x2[42]);
+ x3[42] = _mm_sub_epi32(x2[37], x2[42]);
+ x3[38] = _mm_add_epi32(x2[38], x2[41]);
+ x3[41] = _mm_sub_epi32(x2[38], x2[41]);
+ x3[39] = _mm_add_epi32(x2[39], x2[40]);
+ x3[40] = _mm_sub_epi32(x2[39], x2[40]);
+ x3[48] = _mm_sub_epi32(x2[63], x2[48]);
+ x3[63] = _mm_add_epi32(x2[63], x2[48]);
+ x3[49] = _mm_sub_epi32(x2[62], x2[49]);
+ x3[62] = _mm_add_epi32(x2[62], x2[49]);
+ x3[50] = _mm_sub_epi32(x2[61], x2[50]);
+ x3[61] = _mm_add_epi32(x2[61], x2[50]);
+ x3[51] = _mm_sub_epi32(x2[60], x2[51]);
+ x3[60] = _mm_add_epi32(x2[60], x2[51]);
+ x3[52] = _mm_sub_epi32(x2[59], x2[52]);
+ x3[59] = _mm_add_epi32(x2[59], x2[52]);
+ x3[53] = _mm_sub_epi32(x2[58], x2[53]);
+ x3[58] = _mm_add_epi32(x2[58], x2[53]);
+ x3[54] = _mm_sub_epi32(x2[57], x2[54]);
+ x3[57] = _mm_add_epi32(x2[57], x2[54]);
+ x3[55] = _mm_sub_epi32(x2[56], x2[55]);
+ x3[56] = _mm_add_epi32(x2[56], x2[55]);
+
+ // stage 4
+ __m128i x4[64];
+ x4[0] = _mm_add_epi32(x3[0], x3[7]);
+ x4[7] = _mm_sub_epi32(x3[0], x3[7]);
+ x4[1] = _mm_add_epi32(x3[1], x3[6]);
+ x4[6] = _mm_sub_epi32(x3[1], x3[6]);
+ x4[2] = _mm_add_epi32(x3[2], x3[5]);
+ x4[5] = _mm_sub_epi32(x3[2], x3[5]);
+ x4[3] = _mm_add_epi32(x3[3], x3[4]);
+ x4[4] = _mm_sub_epi32(x3[3], x3[4]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12],
+ __rounding, cos_bit);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ x4[16] = _mm_add_epi32(x3[16], x3[23]);
+ x4[23] = _mm_sub_epi32(x3[16], x3[23]);
+ x4[17] = _mm_add_epi32(x3[17], x3[22]);
+ x4[22] = _mm_sub_epi32(x3[17], x3[22]);
+ x4[18] = _mm_add_epi32(x3[18], x3[21]);
+ x4[21] = _mm_sub_epi32(x3[18], x3[21]);
+ x4[19] = _mm_add_epi32(x3[19], x3[20]);
+ x4[20] = _mm_sub_epi32(x3[19], x3[20]);
+ x4[24] = _mm_sub_epi32(x3[31], x3[24]);
+ x4[31] = _mm_add_epi32(x3[31], x3[24]);
+ x4[25] = _mm_sub_epi32(x3[30], x3[25]);
+ x4[30] = _mm_add_epi32(x3[30], x3[25]);
+ x4[26] = _mm_sub_epi32(x3[29], x3[26]);
+ x4[29] = _mm_add_epi32(x3[29], x3[26]);
+ x4[27] = _mm_sub_epi32(x3[28], x3[27]);
+ x4[28] = _mm_add_epi32(x3[28], x3[27]);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52],
+ __rounding, cos_bit);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+
+ // stage 5
+ __m128i x5[64];
+ x5[0] = _mm_add_epi32(x4[0], x4[3]);
+ x5[3] = _mm_sub_epi32(x4[0], x4[3]);
+ x5[1] = _mm_add_epi32(x4[1], x4[2]);
+ x5[2] = _mm_sub_epi32(x4[1], x4[2]);
+ x5[4] = x4[4];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6],
+ __rounding, cos_bit);
+ x5[7] = x4[7];
+ x5[8] = _mm_add_epi32(x4[8], x4[11]);
+ x5[11] = _mm_sub_epi32(x4[8], x4[11]);
+ x5[9] = _mm_add_epi32(x4[9], x4[10]);
+ x5[10] = _mm_sub_epi32(x4[9], x4[10]);
+ x5[12] = _mm_sub_epi32(x4[15], x4[12]);
+ x5[15] = _mm_add_epi32(x4[15], x4[12]);
+ x5[13] = _mm_sub_epi32(x4[14], x4[13]);
+ x5[14] = _mm_add_epi32(x4[14], x4[13]);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26],
+ __rounding, cos_bit);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ x5[32] = _mm_add_epi32(x4[32], x4[39]);
+ x5[39] = _mm_sub_epi32(x4[32], x4[39]);
+ x5[33] = _mm_add_epi32(x4[33], x4[38]);
+ x5[38] = _mm_sub_epi32(x4[33], x4[38]);
+ x5[34] = _mm_add_epi32(x4[34], x4[37]);
+ x5[37] = _mm_sub_epi32(x4[34], x4[37]);
+ x5[35] = _mm_add_epi32(x4[35], x4[36]);
+ x5[36] = _mm_sub_epi32(x4[35], x4[36]);
+ x5[40] = _mm_sub_epi32(x4[47], x4[40]);
+ x5[47] = _mm_add_epi32(x4[47], x4[40]);
+ x5[41] = _mm_sub_epi32(x4[46], x4[41]);
+ x5[46] = _mm_add_epi32(x4[46], x4[41]);
+ x5[42] = _mm_sub_epi32(x4[45], x4[42]);
+ x5[45] = _mm_add_epi32(x4[45], x4[42]);
+ x5[43] = _mm_sub_epi32(x4[44], x4[43]);
+ x5[44] = _mm_add_epi32(x4[44], x4[43]);
+ x5[48] = _mm_add_epi32(x4[48], x4[55]);
+ x5[55] = _mm_sub_epi32(x4[48], x4[55]);
+ x5[49] = _mm_add_epi32(x4[49], x4[54]);
+ x5[54] = _mm_sub_epi32(x4[49], x4[54]);
+ x5[50] = _mm_add_epi32(x4[50], x4[53]);
+ x5[53] = _mm_sub_epi32(x4[50], x4[53]);
+ x5[51] = _mm_add_epi32(x4[51], x4[52]);
+ x5[52] = _mm_sub_epi32(x4[51], x4[52]);
+ x5[56] = _mm_sub_epi32(x4[63], x4[56]);
+ x5[63] = _mm_add_epi32(x4[63], x4[56]);
+ x5[57] = _mm_sub_epi32(x4[62], x4[57]);
+ x5[62] = _mm_add_epi32(x4[62], x4[57]);
+ x5[58] = _mm_sub_epi32(x4[61], x4[58]);
+ x5[61] = _mm_add_epi32(x4[61], x4[58]);
+ x5[59] = _mm_sub_epi32(x4[60], x4[59]);
+ x5[60] = _mm_add_epi32(x4[60], x4[59]);
+
+ // stage 6
+ __m128i x6[64];
+ btf_32_type0_sse4_1_new(cospi_p32, cospi_p32, x5[0], x5[1], x6[0], x6[1],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p48, cospi_p16, x5[2], x5[3], x6[2], x6[3],
+ __rounding, cos_bit);
+ x6[4] = _mm_add_epi32(x5[4], x5[5]);
+ x6[5] = _mm_sub_epi32(x5[4], x5[5]);
+ x6[6] = _mm_sub_epi32(x5[7], x5[6]);
+ x6[7] = _mm_add_epi32(x5[7], x5[6]);
+ x6[8] = x5[8];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13],
+ __rounding, cos_bit);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ x6[16] = _mm_add_epi32(x5[16], x5[19]);
+ x6[19] = _mm_sub_epi32(x5[16], x5[19]);
+ x6[17] = _mm_add_epi32(x5[17], x5[18]);
+ x6[18] = _mm_sub_epi32(x5[17], x5[18]);
+ x6[20] = _mm_sub_epi32(x5[23], x5[20]);
+ x6[23] = _mm_add_epi32(x5[23], x5[20]);
+ x6[21] = _mm_sub_epi32(x5[22], x5[21]);
+ x6[22] = _mm_add_epi32(x5[22], x5[21]);
+ x6[24] = _mm_add_epi32(x5[24], x5[27]);
+ x6[27] = _mm_sub_epi32(x5[24], x5[27]);
+ x6[25] = _mm_add_epi32(x5[25], x5[26]);
+ x6[26] = _mm_sub_epi32(x5[25], x5[26]);
+ x6[28] = _mm_sub_epi32(x5[31], x5[28]);
+ x6[31] = _mm_add_epi32(x5[31], x5[28]);
+ x6[29] = _mm_sub_epi32(x5[30], x5[29]);
+ x6[30] = _mm_add_epi32(x5[30], x5[29]);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58],
+ __rounding, cos_bit);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50],
+ __rounding, cos_bit);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+
+ // stage 7
+ __m128i x7[64];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ btf_32_type1_sse4_1_new(cospi_p56, cospi_p08, x6[4], x6[7], x7[4], x7[7],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p24, cospi_p40, x6[5], x6[6], x7[5], x7[6],
+ __rounding, cos_bit);
+ x7[8] = _mm_add_epi32(x6[8], x6[9]);
+ x7[9] = _mm_sub_epi32(x6[8], x6[9]);
+ x7[10] = _mm_sub_epi32(x6[11], x6[10]);
+ x7[11] = _mm_add_epi32(x6[11], x6[10]);
+ x7[12] = _mm_add_epi32(x6[12], x6[13]);
+ x7[13] = _mm_sub_epi32(x6[12], x6[13]);
+ x7[14] = _mm_sub_epi32(x6[15], x6[14]);
+ x7[15] = _mm_add_epi32(x6[15], x6[14]);
+ x7[16] = x6[16];
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29],
+ __rounding, cos_bit);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25],
+ __rounding, cos_bit);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ x7[32] = _mm_add_epi32(x6[32], x6[35]);
+ x7[35] = _mm_sub_epi32(x6[32], x6[35]);
+ x7[33] = _mm_add_epi32(x6[33], x6[34]);
+ x7[34] = _mm_sub_epi32(x6[33], x6[34]);
+ x7[36] = _mm_sub_epi32(x6[39], x6[36]);
+ x7[39] = _mm_add_epi32(x6[39], x6[36]);
+ x7[37] = _mm_sub_epi32(x6[38], x6[37]);
+ x7[38] = _mm_add_epi32(x6[38], x6[37]);
+ x7[40] = _mm_add_epi32(x6[40], x6[43]);
+ x7[43] = _mm_sub_epi32(x6[40], x6[43]);
+ x7[41] = _mm_add_epi32(x6[41], x6[42]);
+ x7[42] = _mm_sub_epi32(x6[41], x6[42]);
+ x7[44] = _mm_sub_epi32(x6[47], x6[44]);
+ x7[47] = _mm_add_epi32(x6[47], x6[44]);
+ x7[45] = _mm_sub_epi32(x6[46], x6[45]);
+ x7[46] = _mm_add_epi32(x6[46], x6[45]);
+ x7[48] = _mm_add_epi32(x6[48], x6[51]);
+ x7[51] = _mm_sub_epi32(x6[48], x6[51]);
+ x7[49] = _mm_add_epi32(x6[49], x6[50]);
+ x7[50] = _mm_sub_epi32(x6[49], x6[50]);
+ x7[52] = _mm_sub_epi32(x6[55], x6[52]);
+ x7[55] = _mm_add_epi32(x6[55], x6[52]);
+ x7[53] = _mm_sub_epi32(x6[54], x6[53]);
+ x7[54] = _mm_add_epi32(x6[54], x6[53]);
+ x7[56] = _mm_add_epi32(x6[56], x6[59]);
+ x7[59] = _mm_sub_epi32(x6[56], x6[59]);
+ x7[57] = _mm_add_epi32(x6[57], x6[58]);
+ x7[58] = _mm_sub_epi32(x6[57], x6[58]);
+ x7[60] = _mm_sub_epi32(x6[63], x6[60]);
+ x7[63] = _mm_add_epi32(x6[63], x6[60]);
+ x7[61] = _mm_sub_epi32(x6[62], x6[61]);
+ x7[62] = _mm_add_epi32(x6[62], x6[61]);
+
+ // stage 8
+ __m128i x8[64];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ btf_32_type1_sse4_1_new(cospi_p60, cospi_p04, x7[8], x7[15], x8[8], x8[15],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p28, cospi_p36, x7[9], x7[14], x8[9], x8[14],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p44, cospi_p20, x7[10], x7[13], x8[10], x8[13],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p12, cospi_p52, x7[11], x7[12], x8[11], x8[12],
+ __rounding, cos_bit);
+ x8[16] = _mm_add_epi32(x7[16], x7[17]);
+ x8[17] = _mm_sub_epi32(x7[16], x7[17]);
+ x8[18] = _mm_sub_epi32(x7[19], x7[18]);
+ x8[19] = _mm_add_epi32(x7[19], x7[18]);
+ x8[20] = _mm_add_epi32(x7[20], x7[21]);
+ x8[21] = _mm_sub_epi32(x7[20], x7[21]);
+ x8[22] = _mm_sub_epi32(x7[23], x7[22]);
+ x8[23] = _mm_add_epi32(x7[23], x7[22]);
+ x8[24] = _mm_add_epi32(x7[24], x7[25]);
+ x8[25] = _mm_sub_epi32(x7[24], x7[25]);
+ x8[26] = _mm_sub_epi32(x7[27], x7[26]);
+ x8[27] = _mm_add_epi32(x7[27], x7[26]);
+ x8[28] = _mm_add_epi32(x7[28], x7[29]);
+ x8[29] = _mm_sub_epi32(x7[28], x7[29]);
+ x8[30] = _mm_sub_epi32(x7[31], x7[30]);
+ x8[31] = _mm_add_epi32(x7[31], x7[30]);
+ x8[32] = x7[32];
+ btf_32_type0_sse4_1_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61],
+ __rounding, cos_bit);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ btf_32_type0_sse4_1_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57],
+ __rounding, cos_bit);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ btf_32_type0_sse4_1_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53],
+ __rounding, cos_bit);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ btf_32_type0_sse4_1_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49],
+ __rounding, cos_bit);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+
+ // stage 9
+ __m128i x9[64];
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ btf_32_type1_sse4_1_new(cospi_p62, cospi_p02, x8[16], x8[31], x9[16], x9[31],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p30, cospi_p34, x8[17], x8[30], x9[17], x9[30],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p46, cospi_p18, x8[18], x8[29], x9[18], x9[29],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p14, cospi_p50, x8[19], x8[28], x9[19], x9[28],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p54, cospi_p10, x8[20], x8[27], x9[20], x9[27],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p22, cospi_p42, x8[21], x8[26], x9[21], x9[26],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p38, cospi_p26, x8[22], x8[25], x9[22], x9[25],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p06, cospi_p58, x8[23], x8[24], x9[23], x9[24],
+ __rounding, cos_bit);
+ x9[32] = _mm_add_epi32(x8[32], x8[33]);
+ x9[33] = _mm_sub_epi32(x8[32], x8[33]);
+ x9[34] = _mm_sub_epi32(x8[35], x8[34]);
+ x9[35] = _mm_add_epi32(x8[35], x8[34]);
+ x9[36] = _mm_add_epi32(x8[36], x8[37]);
+ x9[37] = _mm_sub_epi32(x8[36], x8[37]);
+ x9[38] = _mm_sub_epi32(x8[39], x8[38]);
+ x9[39] = _mm_add_epi32(x8[39], x8[38]);
+ x9[40] = _mm_add_epi32(x8[40], x8[41]);
+ x9[41] = _mm_sub_epi32(x8[40], x8[41]);
+ x9[42] = _mm_sub_epi32(x8[43], x8[42]);
+ x9[43] = _mm_add_epi32(x8[43], x8[42]);
+ x9[44] = _mm_add_epi32(x8[44], x8[45]);
+ x9[45] = _mm_sub_epi32(x8[44], x8[45]);
+ x9[46] = _mm_sub_epi32(x8[47], x8[46]);
+ x9[47] = _mm_add_epi32(x8[47], x8[46]);
+ x9[48] = _mm_add_epi32(x8[48], x8[49]);
+ x9[49] = _mm_sub_epi32(x8[48], x8[49]);
+ x9[50] = _mm_sub_epi32(x8[51], x8[50]);
+ x9[51] = _mm_add_epi32(x8[51], x8[50]);
+ x9[52] = _mm_add_epi32(x8[52], x8[53]);
+ x9[53] = _mm_sub_epi32(x8[52], x8[53]);
+ x9[54] = _mm_sub_epi32(x8[55], x8[54]);
+ x9[55] = _mm_add_epi32(x8[55], x8[54]);
+ x9[56] = _mm_add_epi32(x8[56], x8[57]);
+ x9[57] = _mm_sub_epi32(x8[56], x8[57]);
+ x9[58] = _mm_sub_epi32(x8[59], x8[58]);
+ x9[59] = _mm_add_epi32(x8[59], x8[58]);
+ x9[60] = _mm_add_epi32(x8[60], x8[61]);
+ x9[61] = _mm_sub_epi32(x8[60], x8[61]);
+ x9[62] = _mm_sub_epi32(x8[63], x8[62]);
+ x9[63] = _mm_add_epi32(x8[63], x8[62]);
+
+ // stage 10
+ __m128i x10[64];
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ btf_32_type1_sse4_1_new(cospi_p63, cospi_p01, x9[32], x9[63], x10[32],
+ x10[63], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p31, cospi_p33, x9[33], x9[62], x10[33],
+ x10[62], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p47, cospi_p17, x9[34], x9[61], x10[34],
+ x10[61], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p15, cospi_p49, x9[35], x9[60], x10[35],
+ x10[60], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p55, cospi_p09, x9[36], x9[59], x10[36],
+ x10[59], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p23, cospi_p41, x9[37], x9[58], x10[37],
+ x10[58], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p39, cospi_p25, x9[38], x9[57], x10[38],
+ x10[57], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p07, cospi_p57, x9[39], x9[56], x10[39],
+ x10[56], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p59, cospi_p05, x9[40], x9[55], x10[40],
+ x10[55], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p27, cospi_p37, x9[41], x9[54], x10[41],
+ x10[54], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p43, cospi_p21, x9[42], x9[53], x10[42],
+ x10[53], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p11, cospi_p53, x9[43], x9[52], x10[43],
+ x10[52], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p51, cospi_p13, x9[44], x9[51], x10[44],
+ x10[51], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p19, cospi_p45, x9[45], x9[50], x10[45],
+ x10[50], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p35, cospi_p29, x9[46], x9[49], x10[46],
+ x10[49], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47],
+ x10[48], __rounding, cos_bit);
+
+ startidx = 0 * outstride;
+ endidx = 63 * outstride;
+ // stage 11
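+  // As in the 32-point transform, outputs land in bit-reversed index order
+  // (0, 32, 16, 48, ...), written from both ends of the destination.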
+ output[startidx] = x10[0];
+ output[endidx] = x10[63];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[32];
+ output[endidx] = x10[31];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[16];
+ output[endidx] = x10[47];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[48];
+ output[endidx] = x10[15];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[8];
+ output[endidx] = x10[55];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[40];
+ output[endidx] = x10[23];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[24];
+ output[endidx] = x10[39];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[56];
+ output[endidx] = x10[7];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[4];
+ output[endidx] = x10[59];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[36];
+ output[endidx] = x10[27];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[20];
+ output[endidx] = x10[43];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[52];
+ output[endidx] = x10[11];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[12];
+ output[endidx] = x10[51];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[44];
+ output[endidx] = x10[19];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[28];
+ output[endidx] = x10[35];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[60];
+ output[endidx] = x10[3];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[2];
+ output[endidx] = x10[61];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[34];
+ output[endidx] = x10[29];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[18];
+ output[endidx] = x10[45];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[50];
+ output[endidx] = x10[13];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[10];
+ output[endidx] = x10[53];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[42];
+ output[endidx] = x10[21];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[26];
+ output[endidx] = x10[37];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[58];
+ output[endidx] = x10[5];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[6];
+ output[endidx] = x10[57];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[38];
+ output[endidx] = x10[25];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[22];
+ output[endidx] = x10[41];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[54];
+ output[endidx] = x10[9];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[14];
+ output[endidx] = x10[49];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[46];
+ output[endidx] = x10[17];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[30];
+ output[endidx] = x10[33];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x10[62];
+ output[endidx] = x10[1];
+}
+
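+// Identity "transform" for a length-32 column: each coefficient is scaled by
+// 4 (left shift by 2), the standard scaling of the length-32 identity
+// transform in the C path.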
+void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+ const int col_num) {
+ (void)cos_bit;
+ for (int i = 0; i < 32; i++) {
+ output[i * col_num] = _mm_slli_epi32(input[i * col_num], 2);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
new file mode 100644
index 0000000000..b143df3523
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
@@ -0,0 +1,3010 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/x86/av1_fwd_txfm_avx2.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+
+ // stage 1
+ __m256i x1[16];
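+  // Each btf_16_adds_subs_out_avx2 emits the saturating 16-bit sum and
+  // difference of a mirrored input pair: x1[i] = in[i] + in[15-i],
+  // x1[15-i] = in[i] - in[15-i].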
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]);
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]);
+
+ // stage 2
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+
+ // stage 5
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+
+ // stage 7
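+  // Store the results in bit-reversed index order: output[k] = x1[bitrev4(k)]
+  // (e.g. output[1] = x1[8], output[3] = x1[12]); the same pattern closes
+  // each of the forward DCTs in this file.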
+ output[0] = x1[0];
+ output[1] = x1[8];
+ output[2] = x1[4];
+ output[3] = x1[12];
+ output[4] = x1[2];
+ output[5] = x1[10];
+ output[6] = x1[6];
+ output[7] = x1[14];
+ output[8] = x1[1];
+ output[9] = x1[9];
+ output[10] = x1[5];
+ output[11] = x1[13];
+ output[12] = x1[3];
+ output[13] = x1[11];
+ output[14] = x1[7];
+ output[15] = x1[15];
+}
+
+static INLINE void fdct16x32_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
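+  // 32-point forward DCT, computed on sixteen 16-bit columns per __m256i.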
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
+ __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
+ __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
+ __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
+ __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
+ __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
+ __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
+ __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
+ __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
+ __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
+ __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
+ __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
+ __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
+ __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
+ __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
+ __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
+
+ // stage 1
+ __m256i x1[32];
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]);
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]);
+ btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]);
+ btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]);
+ btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]);
+ btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]);
+ btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]);
+ btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]);
+ btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]);
+ btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]);
+
+ // stage 2
+ btf_16_adds_subs_avx2(&x1[0], &x1[15]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[18], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[29], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[27]);
+
+ // stage 4
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
+
+ // stage 5
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[22], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[25], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[29]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
+
+ // stage 7
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[20], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[27], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[30]);
+
+ // stage 8
+ btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 9
+ output[0] = x1[0];
+ output[1] = x1[16];
+ output[2] = x1[8];
+ output[3] = x1[24];
+ output[4] = x1[4];
+ output[5] = x1[20];
+ output[6] = x1[12];
+ output[7] = x1[28];
+ output[8] = x1[2];
+ output[9] = x1[18];
+ output[10] = x1[10];
+ output[11] = x1[26];
+ output[12] = x1[6];
+ output[13] = x1[22];
+ output[14] = x1[14];
+ output[15] = x1[30];
+ output[16] = x1[1];
+ output[17] = x1[17];
+ output[18] = x1[9];
+ output[19] = x1[25];
+ output[20] = x1[5];
+ output[21] = x1[21];
+ output[22] = x1[13];
+ output[23] = x1[29];
+ output[24] = x1[3];
+ output[25] = x1[19];
+ output[26] = x1[11];
+ output[27] = x1[27];
+ output[28] = x1[7];
+ output[29] = x1[23];
+ output[30] = x1[15];
+ output[31] = x1[31];
+}
+
+static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
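+  // 64-point forward DCT, again on sixteen 16-bit columns per __m256i.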
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
+ __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+ __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
+ __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+ __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
+ __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
+ __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
+ __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
+ __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
+ __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
+ __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
+ __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
+ __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
+ __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
+ __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
+ __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
+ __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
+ __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
+ __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
+ __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
+ __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]);
+ __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]);
+ __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]);
+ __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]);
+ __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]);
+ __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]);
+ __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]);
+ __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]);
+ __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]);
+ __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]);
+ __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]);
+ __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]);
+ __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]);
+ __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]);
+ __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]);
+ __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]);
+ __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]);
+ __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]);
+ __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]);
+ __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]);
+ __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]);
+ __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]);
+ __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]);
+ __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]);
+ __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]);
+ __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]);
+ __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]);
+ __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]);
+ __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]);
+ __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]);
+ __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]);
+ __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]);
+
+ // stage 1
+ __m256i x1[64];
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]);
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]);
+ btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]);
+ btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]);
+ btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]);
+ btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]);
+ btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]);
+ btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]);
+ btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]);
+ btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]);
+ btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]);
+ btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]);
+ btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]);
+ btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]);
+ btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]);
+ btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]);
+ btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]);
+ btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]);
+ btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]);
+ btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]);
+ btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]);
+ btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]);
+ btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]);
+ btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]);
+ btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]);
+ btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]);
+
+ // stage 2
+ btf_16_adds_subs_avx2(&x1[0], &x1[31]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[30]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[10], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[13], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[16]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[15]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[47]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[46]);
+ btf_16_adds_subs_avx2(&x1[34], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[44]);
+ btf_16_adds_subs_avx2(&x1[36], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[37], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[38], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[40]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[48]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[49]);
+ btf_16_adds_subs_avx2(&x1[61], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[51]);
+ btf_16_adds_subs_avx2(&x1[59], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[58], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[57], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[55]);
+
+ // stage 4
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[18], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[29], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[27]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit);
+
+ // stage 5
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[39]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[38]);
+ btf_16_adds_subs_avx2(&x1[34], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[36]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[40]);
+ btf_16_adds_subs_avx2(&x1[46], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[45], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[44], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[55]);
+ btf_16_adds_subs_avx2(&x1[49], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[50], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[51], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[56]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[57]);
+ btf_16_adds_subs_avx2(&x1[61], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[59]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[22], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[25], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[29]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit);
+
+ // stage 7
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[35]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[34]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[36]);
+ btf_16_adds_subs_avx2(&x1[38], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[40], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[41], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[44]);
+ btf_16_adds_subs_avx2(&x1[46], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[51]);
+ btf_16_adds_subs_avx2(&x1[49], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[55], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[54], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[59]);
+ btf_16_adds_subs_avx2(&x1[57], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[60]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[61]);
+
+ // stage 8
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[20], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[27], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[30]);
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit);
+
+ // stage 9
+ btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[33]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[34]);
+ btf_16_adds_subs_avx2(&x1[36], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[38]);
+ btf_16_adds_subs_avx2(&x1[40], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[43], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[44], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[46]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[49]);
+ btf_16_adds_subs_avx2(&x1[51], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[52], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[55], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[57]);
+ btf_16_adds_subs_avx2(&x1[59], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[61]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[62]);
+
+ // stage 10
+ btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 11
+ output[0] = x1[0];
+ output[1] = x1[32];
+ output[2] = x1[16];
+ output[3] = x1[48];
+ output[4] = x1[8];
+ output[5] = x1[40];
+ output[6] = x1[24];
+ output[7] = x1[56];
+ output[8] = x1[4];
+ output[9] = x1[36];
+ output[10] = x1[20];
+ output[11] = x1[52];
+ output[12] = x1[12];
+ output[13] = x1[44];
+ output[14] = x1[28];
+ output[15] = x1[60];
+ output[16] = x1[2];
+ output[17] = x1[34];
+ output[18] = x1[18];
+ output[19] = x1[50];
+ output[20] = x1[10];
+ output[21] = x1[42];
+ output[22] = x1[26];
+ output[23] = x1[58];
+ output[24] = x1[6];
+ output[25] = x1[38];
+ output[26] = x1[22];
+ output[27] = x1[54];
+ output[28] = x1[14];
+ output[29] = x1[46];
+ output[30] = x1[30];
+ output[31] = x1[62];
+ output[32] = x1[1];
+ output[33] = x1[33];
+ output[34] = x1[17];
+ output[35] = x1[49];
+ output[36] = x1[9];
+ output[37] = x1[41];
+ output[38] = x1[25];
+ output[39] = x1[57];
+ output[40] = x1[5];
+ output[41] = x1[37];
+ output[42] = x1[21];
+ output[43] = x1[53];
+ output[44] = x1[13];
+ output[45] = x1[45];
+ output[46] = x1[29];
+ output[47] = x1[61];
+ output[48] = x1[3];
+ output[49] = x1[35];
+ output[50] = x1[19];
+ output[51] = x1[51];
+ output[52] = x1[11];
+ output[53] = x1[43];
+ output[54] = x1[27];
+ output[55] = x1[59];
+ output[56] = x1[7];
+ output[57] = x1[39];
+ output[58] = x1[23];
+ output[59] = x1[55];
+ output[60] = x1[15];
+ output[61] = x1[47];
+ output[62] = x1[31];
+ output[63] = x1[63];
+}
+
+static INLINE void fdct32_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ __m256i x1[32];
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
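+  // 32-point forward DCT in 32-bit precision (eight int32 lanes per
+  // __m256i), using the btf_32_* helpers that take scalar cospi arguments.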
+ // stage 0
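+  // (stage 0 is a pass-through in the scalar reference and is skipped here)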
+ // stage 1
+ btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]);
+ btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]);
+ btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]);
+ btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]);
+ btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]);
+ btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]);
+ btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]);
+ btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]);
+ btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]);
+ btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]);
+ btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]);
+ btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]);
+ btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]);
+ btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]);
+ btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]);
+ btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]);
+
+ // stage 2
+ btf_32_add_sub_avx2(&x1[0], &x1[15]);
+ btf_32_add_sub_avx2(&x1[1], &x1[14]);
+ btf_32_add_sub_avx2(&x1[2], &x1[13]);
+ btf_32_add_sub_avx2(&x1[3], &x1[12]);
+ btf_32_add_sub_avx2(&x1[4], &x1[11]);
+ btf_32_add_sub_avx2(&x1[5], &x1[10]);
+ btf_32_add_sub_avx2(&x1[6], &x1[9]);
+ btf_32_add_sub_avx2(&x1[7], &x1[8]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 3
+ btf_32_add_sub_avx2(&x1[0], &x1[7]);
+ btf_32_add_sub_avx2(&x1[1], &x1[6]);
+ btf_32_add_sub_avx2(&x1[2], &x1[5]);
+ btf_32_add_sub_avx2(&x1[3], &x1[4]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[23]);
+ btf_32_add_sub_avx2(&x1[17], &x1[22]);
+ btf_32_add_sub_avx2(&x1[18], &x1[21]);
+ btf_32_add_sub_avx2(&x1[19], &x1[20]);
+ btf_32_add_sub_avx2(&x1[31], &x1[24]);
+ btf_32_add_sub_avx2(&x1[30], &x1[25]);
+ btf_32_add_sub_avx2(&x1[29], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[27]);
+
+ // stage 4
+ btf_32_add_sub_avx2(&x1[0], &x1[3]);
+ btf_32_add_sub_avx2(&x1[1], &x1[2]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[11]);
+ btf_32_add_sub_avx2(&x1[9], &x1[10]);
+ btf_32_add_sub_avx2(&x1[15], &x1[12]);
+ btf_32_add_sub_avx2(&x1[14], &x1[13]);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit);
+
+ // stage 5
+ btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit);
+ btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[4], &x1[5]);
+ btf_32_add_sub_avx2(&x1[7], &x1[6]);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[19]);
+ btf_32_add_sub_avx2(&x1[17], &x1[18]);
+ btf_32_add_sub_avx2(&x1[23], &x1[20]);
+ btf_32_add_sub_avx2(&x1[22], &x1[21]);
+ btf_32_add_sub_avx2(&x1[24], &x1[27]);
+ btf_32_add_sub_avx2(&x1[25], &x1[26]);
+ btf_32_add_sub_avx2(&x1[31], &x1[28]);
+ btf_32_add_sub_avx2(&x1[30], &x1[29]);
+
+ // stage 6
+ btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit);
+ btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[9]);
+ btf_32_add_sub_avx2(&x1[11], &x1[10]);
+ btf_32_add_sub_avx2(&x1[12], &x1[13]);
+ btf_32_add_sub_avx2(&x1[15], &x1[14]);
+ btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit);
+
+ // stage 7
+ btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit);
+ btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[17]);
+ btf_32_add_sub_avx2(&x1[19], &x1[18]);
+ btf_32_add_sub_avx2(&x1[20], &x1[21]);
+ btf_32_add_sub_avx2(&x1[23], &x1[22]);
+ btf_32_add_sub_avx2(&x1[24], &x1[25]);
+ btf_32_add_sub_avx2(&x1[27], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[29]);
+ btf_32_add_sub_avx2(&x1[31], &x1[30]);
+
+ // stage 8
+ btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit);
+ btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 9
+ output[0] = x1[0];
+ output[1] = x1[16];
+ output[2] = x1[8];
+ output[3] = x1[24];
+ output[4] = x1[4];
+ output[5] = x1[20];
+ output[6] = x1[12];
+ output[7] = x1[28];
+ output[8] = x1[2];
+ output[9] = x1[18];
+ output[10] = x1[10];
+ output[11] = x1[26];
+ output[12] = x1[6];
+ output[13] = x1[22];
+ output[14] = x1[14];
+ output[15] = x1[30];
+ output[16] = x1[1];
+ output[17] = x1[17];
+ output[18] = x1[9];
+ output[19] = x1[25];
+ output[20] = x1[5];
+ output[21] = x1[21];
+ output[22] = x1[13];
+ output[23] = x1[29];
+ output[24] = x1[3];
+ output[25] = x1[19];
+ output[26] = x1[11];
+ output[27] = x1[27];
+ output[28] = x1[7];
+ output[29] = x1[23];
+ output[30] = x1[15];
+ output[31] = x1[31];
+}
+
+static INLINE void fdct64_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
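+  // All cosine-table constants are broadcast to vectors once up front; the
+  // *_new butterfly variants below consume these vector operands directly.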
+
+ __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
+ __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
+ __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
+ __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
+ __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
+ __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
+ __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
+ __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
+ __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
+ __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
+ __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
+ __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
+ __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
+ __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
+ __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
+ __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
+ __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
+ __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
+ __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
+ __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
+ __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
+ __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
+ __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
+ __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
+ __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
+ __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
+ __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
+ __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
+ __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
+ __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
+ __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
+ __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
+ __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
+ __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
+ __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
+ __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
+ __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
+ __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
+ __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
+ __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
+ __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
+ __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
+ __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
+ __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
+ __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
+ __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
+ __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
+ __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
+ __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
+ __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
+ __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
+ __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
+ __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
+ __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
+ __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
+ __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
+ __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
+ __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
+ __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
+ __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
+ __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
+ __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
+ __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
+ __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
+ __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
+ __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
+ __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
+ __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
+ __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
+ __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
+ __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
+ __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
+ __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
+ __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
+ __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
+ __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
+ __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
+ __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
+
+ // stage 1
+ __m256i x1[64];
+ btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]);
+ btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]);
+ btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]);
+ btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]);
+ btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]);
+ btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]);
+ btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]);
+ btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]);
+ btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]);
+ btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]);
+ btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]);
+ btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]);
+ btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]);
+ btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]);
+ btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]);
+ btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]);
+ btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]);
+ btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]);
+ btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]);
+ btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]);
+ btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]);
+ btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]);
+ btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]);
+ btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]);
+ btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]);
+ btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]);
+ btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]);
+ btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]);
+ btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]);
+ btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]);
+ btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]);
+ btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]);
+
+ // stage 2
+ btf_32_add_sub_avx2(&x1[0], &x1[31]);
+ btf_32_add_sub_avx2(&x1[1], &x1[30]);
+ btf_32_add_sub_avx2(&x1[2], &x1[29]);
+ btf_32_add_sub_avx2(&x1[3], &x1[28]);
+ btf_32_add_sub_avx2(&x1[4], &x1[27]);
+ btf_32_add_sub_avx2(&x1[5], &x1[26]);
+ btf_32_add_sub_avx2(&x1[6], &x1[25]);
+ btf_32_add_sub_avx2(&x1[7], &x1[24]);
+ btf_32_add_sub_avx2(&x1[8], &x1[23]);
+ btf_32_add_sub_avx2(&x1[9], &x1[22]);
+ btf_32_add_sub_avx2(&x1[10], &x1[21]);
+ btf_32_add_sub_avx2(&x1[11], &x1[20]);
+ btf_32_add_sub_avx2(&x1[12], &x1[19]);
+ btf_32_add_sub_avx2(&x1[13], &x1[18]);
+ btf_32_add_sub_avx2(&x1[14], &x1[17]);
+ btf_32_add_sub_avx2(&x1[15], &x1[16]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 3
+ btf_32_add_sub_avx2(&x1[0], &x1[15]);
+ btf_32_add_sub_avx2(&x1[1], &x1[14]);
+ btf_32_add_sub_avx2(&x1[2], &x1[13]);
+ btf_32_add_sub_avx2(&x1[3], &x1[12]);
+ btf_32_add_sub_avx2(&x1[4], &x1[11]);
+ btf_32_add_sub_avx2(&x1[5], &x1[10]);
+ btf_32_add_sub_avx2(&x1[6], &x1[9]);
+ btf_32_add_sub_avx2(&x1[7], &x1[8]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[47]);
+ btf_32_add_sub_avx2(&x1[33], &x1[46]);
+ btf_32_add_sub_avx2(&x1[34], &x1[45]);
+ btf_32_add_sub_avx2(&x1[35], &x1[44]);
+ btf_32_add_sub_avx2(&x1[36], &x1[43]);
+ btf_32_add_sub_avx2(&x1[37], &x1[42]);
+ btf_32_add_sub_avx2(&x1[38], &x1[41]);
+ btf_32_add_sub_avx2(&x1[39], &x1[40]);
+ btf_32_add_sub_avx2(&x1[63], &x1[48]);
+ btf_32_add_sub_avx2(&x1[62], &x1[49]);
+ btf_32_add_sub_avx2(&x1[61], &x1[50]);
+ btf_32_add_sub_avx2(&x1[60], &x1[51]);
+ btf_32_add_sub_avx2(&x1[59], &x1[52]);
+ btf_32_add_sub_avx2(&x1[58], &x1[53]);
+ btf_32_add_sub_avx2(&x1[57], &x1[54]);
+ btf_32_add_sub_avx2(&x1[56], &x1[55]);
+
+ // stage 4
+ btf_32_add_sub_avx2(&x1[0], &x1[7]);
+ btf_32_add_sub_avx2(&x1[1], &x1[6]);
+ btf_32_add_sub_avx2(&x1[2], &x1[5]);
+ btf_32_add_sub_avx2(&x1[3], &x1[4]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[23]);
+ btf_32_add_sub_avx2(&x1[17], &x1[22]);
+ btf_32_add_sub_avx2(&x1[18], &x1[21]);
+ btf_32_add_sub_avx2(&x1[19], &x1[20]);
+ btf_32_add_sub_avx2(&x1[31], &x1[24]);
+ btf_32_add_sub_avx2(&x1[30], &x1[25]);
+ btf_32_add_sub_avx2(&x1[29], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[27]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit);
+
+ // stage 5
+ btf_32_add_sub_avx2(&x1[0], &x1[3]);
+ btf_32_add_sub_avx2(&x1[1], &x1[2]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[11]);
+ btf_32_add_sub_avx2(&x1[9], &x1[10]);
+ btf_32_add_sub_avx2(&x1[15], &x1[12]);
+ btf_32_add_sub_avx2(&x1[14], &x1[13]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[39]);
+ btf_32_add_sub_avx2(&x1[33], &x1[38]);
+ btf_32_add_sub_avx2(&x1[34], &x1[37]);
+ btf_32_add_sub_avx2(&x1[35], &x1[36]);
+ btf_32_add_sub_avx2(&x1[47], &x1[40]);
+ btf_32_add_sub_avx2(&x1[46], &x1[41]);
+ btf_32_add_sub_avx2(&x1[45], &x1[42]);
+ btf_32_add_sub_avx2(&x1[44], &x1[43]);
+ btf_32_add_sub_avx2(&x1[48], &x1[55]);
+ btf_32_add_sub_avx2(&x1[49], &x1[54]);
+ btf_32_add_sub_avx2(&x1[50], &x1[53]);
+ btf_32_add_sub_avx2(&x1[51], &x1[52]);
+ btf_32_add_sub_avx2(&x1[63], &x1[56]);
+ btf_32_add_sub_avx2(&x1[62], &x1[57]);
+ btf_32_add_sub_avx2(&x1[61], &x1[58]);
+ btf_32_add_sub_avx2(&x1[60], &x1[59]);
+
+ // stage 6
+ btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[4], &x1[5]);
+ btf_32_add_sub_avx2(&x1[7], &x1[6]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[19]);
+ btf_32_add_sub_avx2(&x1[17], &x1[18]);
+ btf_32_add_sub_avx2(&x1[23], &x1[20]);
+ btf_32_add_sub_avx2(&x1[22], &x1[21]);
+ btf_32_add_sub_avx2(&x1[24], &x1[27]);
+ btf_32_add_sub_avx2(&x1[25], &x1[26]);
+ btf_32_add_sub_avx2(&x1[31], &x1[28]);
+ btf_32_add_sub_avx2(&x1[30], &x1[29]);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit);
+
+ // stage 7
+ btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[9]);
+ btf_32_add_sub_avx2(&x1[11], &x1[10]);
+ btf_32_add_sub_avx2(&x1[12], &x1[13]);
+ btf_32_add_sub_avx2(&x1[15], &x1[14]);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[35]);
+ btf_32_add_sub_avx2(&x1[33], &x1[34]);
+ btf_32_add_sub_avx2(&x1[39], &x1[36]);
+ btf_32_add_sub_avx2(&x1[38], &x1[37]);
+ btf_32_add_sub_avx2(&x1[40], &x1[43]);
+ btf_32_add_sub_avx2(&x1[41], &x1[42]);
+ btf_32_add_sub_avx2(&x1[47], &x1[44]);
+ btf_32_add_sub_avx2(&x1[46], &x1[45]);
+ btf_32_add_sub_avx2(&x1[48], &x1[51]);
+ btf_32_add_sub_avx2(&x1[49], &x1[50]);
+ btf_32_add_sub_avx2(&x1[55], &x1[52]);
+ btf_32_add_sub_avx2(&x1[54], &x1[53]);
+ btf_32_add_sub_avx2(&x1[56], &x1[59]);
+ btf_32_add_sub_avx2(&x1[57], &x1[58]);
+ btf_32_add_sub_avx2(&x1[63], &x1[60]);
+ btf_32_add_sub_avx2(&x1[62], &x1[61]);
+
+ // stage 8
+ btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[17]);
+ btf_32_add_sub_avx2(&x1[19], &x1[18]);
+ btf_32_add_sub_avx2(&x1[20], &x1[21]);
+ btf_32_add_sub_avx2(&x1[23], &x1[22]);
+ btf_32_add_sub_avx2(&x1[24], &x1[25]);
+ btf_32_add_sub_avx2(&x1[27], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[29]);
+ btf_32_add_sub_avx2(&x1[31], &x1[30]);
+ btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit);
+
+ // stage 9
+ btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[33]);
+ btf_32_add_sub_avx2(&x1[35], &x1[34]);
+ btf_32_add_sub_avx2(&x1[36], &x1[37]);
+ btf_32_add_sub_avx2(&x1[39], &x1[38]);
+ btf_32_add_sub_avx2(&x1[40], &x1[41]);
+ btf_32_add_sub_avx2(&x1[43], &x1[42]);
+ btf_32_add_sub_avx2(&x1[44], &x1[45]);
+ btf_32_add_sub_avx2(&x1[47], &x1[46]);
+ btf_32_add_sub_avx2(&x1[48], &x1[49]);
+ btf_32_add_sub_avx2(&x1[51], &x1[50]);
+ btf_32_add_sub_avx2(&x1[52], &x1[53]);
+ btf_32_add_sub_avx2(&x1[55], &x1[54]);
+ btf_32_add_sub_avx2(&x1[56], &x1[57]);
+ btf_32_add_sub_avx2(&x1[59], &x1[58]);
+ btf_32_add_sub_avx2(&x1[60], &x1[61]);
+ btf_32_add_sub_avx2(&x1[63], &x1[62]);
+
+ // stage 10
+ btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 11
+ output[0] = x1[0];
+ output[1] = x1[32];
+ output[2] = x1[16];
+ output[3] = x1[48];
+ output[4] = x1[8];
+ output[5] = x1[40];
+ output[6] = x1[24];
+ output[7] = x1[56];
+ output[8] = x1[4];
+ output[9] = x1[36];
+ output[10] = x1[20];
+ output[11] = x1[52];
+ output[12] = x1[12];
+ output[13] = x1[44];
+ output[14] = x1[28];
+ output[15] = x1[60];
+ output[16] = x1[2];
+ output[17] = x1[34];
+ output[18] = x1[18];
+ output[19] = x1[50];
+ output[20] = x1[10];
+ output[21] = x1[42];
+ output[22] = x1[26];
+ output[23] = x1[58];
+ output[24] = x1[6];
+ output[25] = x1[38];
+ output[26] = x1[22];
+ output[27] = x1[54];
+ output[28] = x1[14];
+ output[29] = x1[46];
+ output[30] = x1[30];
+ output[31] = x1[62];
+ output[32] = x1[1];
+ output[33] = x1[33];
+ output[34] = x1[17];
+ output[35] = x1[49];
+ output[36] = x1[9];
+ output[37] = x1[41];
+ output[38] = x1[25];
+ output[39] = x1[57];
+ output[40] = x1[5];
+ output[41] = x1[37];
+ output[42] = x1[21];
+ output[43] = x1[53];
+ output[44] = x1[13];
+ output[45] = x1[45];
+ output[46] = x1[29];
+ output[47] = x1[61];
+ output[48] = x1[3];
+ output[49] = x1[35];
+ output[50] = x1[19];
+ output[51] = x1[51];
+ output[52] = x1[11];
+ output[53] = x1[43];
+ output[54] = x1[27];
+ output[55] = x1[59];
+ output[56] = x1[7];
+ output[57] = x1[39];
+ output[58] = x1[23];
+ output[59] = x1[55];
+ output[60] = x1[15];
+ output[61] = x1[47];
+ output[62] = x1[31];
+ output[63] = x1[63];
+}
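+// The stage-11 write-out above is the 6-bit bit-reversal permutation that
+// reorders the butterfly results into natural frequency order: output[k] =
+// x1[bitrev6(k)]. A minimal scalar sketch (bitrev6 is a hypothetical helper,
+// not part of libaom):
+#if 0
+static int bitrev6(int k) {
+  int r = 0;
+  for (int b = 0; b < 6; ++b) r |= ((k >> b) & 1) << (5 - b);
+  return r;
+}
+// e.g. bitrev6(1) == 32, bitrev6(2) == 16, bitrev6(3) == 48, matching the
+// assignments above.
+#endif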
+
+static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __zero = _mm256_setzero_si256();
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+ __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+ __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+ __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
+ __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
+ __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
+ __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
+ __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
+ __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
+ __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
+ __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
+ __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
+ __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
+ __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
+ __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
+ __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
+ __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
+ __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
+ __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
+ __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
+ __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
+
+ // stage 1
+ __m256i x1[16];
+ x1[0] = input[0];
+ x1[1] = _mm256_subs_epi16(__zero, input[15]);
+ x1[2] = _mm256_subs_epi16(__zero, input[7]);
+ x1[3] = input[8];
+ x1[4] = _mm256_subs_epi16(__zero, input[3]);
+ x1[5] = input[12];
+ x1[6] = input[4];
+ x1[7] = _mm256_subs_epi16(__zero, input[11]);
+ x1[8] = _mm256_subs_epi16(__zero, input[1]);
+ x1[9] = input[14];
+ x1[10] = input[6];
+ x1[11] = _mm256_subs_epi16(__zero, input[9]);
+ x1[12] = input[2];
+ x1[13] = _mm256_subs_epi16(__zero, input[13]);
+ x1[14] = _mm256_subs_epi16(__zero, input[5]);
+ x1[15] = input[10];
+
+ // stage 2
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[2]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[13], &x1[15]);
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 5
+ btf_16_adds_subs_avx2(&x1[0], &x1[4]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[10], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[15]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 7
+ btf_16_adds_subs_avx2(&x1[0], &x1[8]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[15]);
+
+ // stage 8
+ btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 9
+ output[0] = x1[1];
+ output[1] = x1[14];
+ output[2] = x1[3];
+ output[3] = x1[12];
+ output[4] = x1[5];
+ output[5] = x1[10];
+ output[6] = x1[7];
+ output[7] = x1[8];
+ output[8] = x1[9];
+ output[9] = x1[6];
+ output[10] = x1[11];
+ output[11] = x1[4];
+ output[12] = x1[13];
+ output[13] = x1[2];
+ output[14] = x1[15];
+ output[15] = x1[0];
+}
+
+static INLINE void fidentity16x16_new_avx2(const __m256i *input,
+ __m256i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i one = _mm256_set1_epi16(1);
+
+ for (int i = 0; i < 16; ++i) {
+ const __m256i a_lo = _mm256_unpacklo_epi16(input[i], one);
+ const __m256i a_hi = _mm256_unpackhi_epi16(input[i], one);
+ const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
+ const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
+ output[i] = _mm256_packs_epi32(b_lo, b_hi);
+ }
+}
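+// The 16-point identity transform scales every sample by 2*sqrt(2) in fixed
+// point; interleaving the samples with the constant 1 is what lets
+// scale_round_avx2 (defined in the AVX2 transform headers) fold the rounding
+// offset into the same multiply-add before packing back to 16 bits. A scalar
+// sketch of the per-sample arithmetic (assuming NewSqrt2 == 5793 and
+// NewSqrt2Bits == 12 from av1_txfm.h):
+#if 0
+static int16_t identity16_scale(int16_t v) {
+  const int32_t two_sqrt2_q12 = 2 * 5793;  // 2 * sqrt(2) in Q12
+  // The SIMD version saturates via _mm256_packs_epi32; this sketch truncates.
+  return (int16_t)((v * two_sqrt2_q12 + (1 << 11)) >> 12);
+}
+#endif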
+
+static INLINE void fidentity16x32_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ for (int i = 0; i < 32; ++i) {
+ output[i] = _mm256_slli_epi16(input[i], 2);
+ }
+}
+
+static INLINE void store_output_32bit_w16(int32_t *const out,
+ const __m256i *const in1,
+ const __m256i *const in2,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ _mm256_store_si256((__m256i *)(out + stride * i), in1[i]);
+ _mm256_store_si256((__m256i *)(out + stride * i + 8), in2[i]);
+ }
+}
+
+// Store 16 16-bit values per row, sign extending them to 32 bits.
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
+ int32_t *out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ _mm256_store_si256((__m256i *)(out),
+ _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i])));
+ _mm256_store_si256(
+ (__m256i *)(out + 8),
+ _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1)));
+ out += stride;
+ }
+}
+
+static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a,
+ int32_t *const b) {
+ const __m256i one = _mm256_set1_epi16(1);
+  const __m256i a_reorder = _mm256_permute4x64_epi64(a, 0xd8);
+  const __m256i a_lo = _mm256_unpacklo_epi16(a_reorder, one);
+  const __m256i a_hi = _mm256_unpackhi_epi16(a_reorder, one);
+ const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2);
+ const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2);
+ _mm256_store_si256((__m256i *)b, b_lo);
+ _mm256_store_si256((__m256i *)(b + 8), b_hi);
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w16_avx2(
+ const __m256i *const in, int32_t *const out, const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_rect_16bit_to_32bit_avx2(in[i], out + i * stride);
+ }
+}
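+// Rectangular (2:1 aspect) transforms need a sqrt(2) fixed-point gain factor
+// to compensate the non-square normalization; the two store_rect helpers
+// above fold it into the widening store as v * NewSqrt2 >> NewSqrt2Bits. A
+// scalar sketch of that step (assuming NewSqrt2 == 5793 and NewSqrt2Bits ==
+// 12 from av1_txfm.h):
+#if 0
+static int32_t rect_scale_round(int16_t v) {
+  return (v * 5793 + (1 << 11)) >> 12;  // v * sqrt(2) in Q12, rounded
+}
+#endif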
+
+typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
+ int8_t cos_bit);
+
+static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = {
+ fdct16x32_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity16x32_avx2, // IDTX
+ fdct16x32_avx2, // V_DCT
+ fidentity16x32_avx2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = {
+ fdct16x32_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity16x32_avx2, // IDTX
+ fidentity16x32_avx2, // V_DCT
+ fdct16x32_avx2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
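+// NULL entries mark combinations with no 32-point 1-D kernel: AV1 defines
+// ADST only for 4-, 8- and 16-point transforms, so the 32-tall paths are
+// reachable only for the DCT- and identity-based tx_types above. A defensive
+// dispatch sketch (illustrative only):
+#if 0
+const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
+assert(col_txfm != NULL && row_txfm != NULL);  // tx_type must be supported
+#endif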
+
+static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = {
+ fdct16x16_new_avx2, // DCT_DCT
+ fadst16x16_new_avx2, // ADST_DCT
+ fdct16x16_new_avx2, // DCT_ADST
+ fadst16x16_new_avx2, // ADST_ADST
+ fadst16x16_new_avx2, // FLIPADST_DCT
+ fdct16x16_new_avx2, // DCT_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_FLIPADST
+ fadst16x16_new_avx2, // ADST_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_ADST
+ fidentity16x16_new_avx2, // IDTX
+ fdct16x16_new_avx2, // V_DCT
+ fidentity16x16_new_avx2, // H_DCT
+ fadst16x16_new_avx2, // V_ADST
+ fidentity16x16_new_avx2, // H_ADST
+ fadst16x16_new_avx2, // V_FLIPADST
+ fidentity16x16_new_avx2 // H_FLIPADST
+};
+
+static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = {
+ fdct16x16_new_avx2, // DCT_DCT
+ fdct16x16_new_avx2, // ADST_DCT
+ fadst16x16_new_avx2, // DCT_ADST
+ fadst16x16_new_avx2, // ADST_ADST
+ fdct16x16_new_avx2, // FLIPADST_DCT
+ fadst16x16_new_avx2, // DCT_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_FLIPADST
+ fadst16x16_new_avx2, // ADST_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_ADST
+ fidentity16x16_new_avx2, // IDTX
+ fidentity16x16_new_avx2, // V_DCT
+ fdct16x16_new_avx2, // H_DCT
+ fidentity16x16_new_avx2, // V_ADST
+ fadst16x16_new_avx2, // H_ADST
+ fidentity16x16_new_avx2, // V_FLIPADST
+ fadst16x16_new_avx2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fadst8x8_new_sse2, // ADST_DCT
+ fdct8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fadst8x8_new_sse2, // FLIPADST_DCT
+ fdct8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fdct8x8_new_sse2, // V_DCT
+ fidentity8x8_new_sse2, // H_DCT
+ fadst8x8_new_sse2, // V_ADST
+ fidentity8x8_new_sse2, // H_ADST
+ fadst8x8_new_sse2, // V_FLIPADST
+ fidentity8x8_new_sse2, // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fdct8x8_new_sse2, // ADST_DCT
+ fadst8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fdct8x8_new_sse2, // FLIPADST_DCT
+ fadst8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fidentity8x8_new_sse2, // V_DCT
+ fdct8x8_new_sse2, // H_DCT
+ fidentity8x8_new_sse2, // V_ADST
+ fadst8x8_new_sse2, // H_ADST
+ fidentity8x8_new_sse2, // V_FLIPADST
+ fadst8x8_new_sse2 // H_FLIPADST
+};
+
+static INLINE void load_buffer_and_round_shift(const int16_t *in, int stride,
+ __m128i *out, int bit) {
+ out[0] = _mm_load_si128((const __m128i *)(in + 0 * stride));
+ out[1] = _mm_load_si128((const __m128i *)(in + 1 * stride));
+ out[2] = _mm_load_si128((const __m128i *)(in + 2 * stride));
+ out[3] = _mm_load_si128((const __m128i *)(in + 3 * stride));
+ out[4] = _mm_load_si128((const __m128i *)(in + 4 * stride));
+ out[5] = _mm_load_si128((const __m128i *)(in + 5 * stride));
+ out[6] = _mm_load_si128((const __m128i *)(in + 6 * stride));
+ out[7] = _mm_load_si128((const __m128i *)(in + 7 * stride));
+ out[0] = _mm_slli_epi16(out[0], bit);
+ out[1] = _mm_slli_epi16(out[1], bit);
+ out[2] = _mm_slli_epi16(out[2], bit);
+ out[3] = _mm_slli_epi16(out[3], bit);
+ out[4] = _mm_slli_epi16(out[4], bit);
+ out[5] = _mm_slli_epi16(out[5], bit);
+ out[6] = _mm_slli_epi16(out[6], bit);
+ out[7] = _mm_slli_epi16(out[7], bit);
+}
+
+static INLINE void load_buffer_and_flip_round_shift(const int16_t *in,
+ int stride, __m128i *out,
+ int bit) {
+ out[7] = load_16bit_to_16bit(in + 0 * stride);
+ out[6] = load_16bit_to_16bit(in + 1 * stride);
+ out[5] = load_16bit_to_16bit(in + 2 * stride);
+ out[4] = load_16bit_to_16bit(in + 3 * stride);
+ out[3] = load_16bit_to_16bit(in + 4 * stride);
+ out[2] = load_16bit_to_16bit(in + 5 * stride);
+ out[1] = load_16bit_to_16bit(in + 6 * stride);
+ out[0] = load_16bit_to_16bit(in + 7 * stride);
+ out[7] = _mm_slli_epi16(out[7], bit);
+ out[6] = _mm_slli_epi16(out[6], bit);
+ out[5] = _mm_slli_epi16(out[5], bit);
+ out[4] = _mm_slli_epi16(out[4], bit);
+ out[3] = _mm_slli_epi16(out[3], bit);
+ out[2] = _mm_slli_epi16(out[2], bit);
+ out[1] = _mm_slli_epi16(out[1], bit);
+ out[0] = _mm_slli_epi16(out[0], bit);
+}
+
+#define TRANSPOSE_8X8_AVX2() \
+ { \
+    /* Unpack 16 bit elements resulting in: */                                \
+    /* aa0: 00 10 01 11 02 12 03 13 | 40 50 41 51 42 52 43 53*/               \
+ /* aa1: 04 14 05 15 06 16 07 17 | 44 54 45 55 46 56 47 57*/ \
+ /* aa2: 20 30 21 31 22 32 23 33 | 60 70 61 71 62 72 63 73*/ \
+ /* aa3: 24 34 25 35 26 36 27 37 | 64 74 65 75 66 76 67 77*/ \
+ const __m256i aa0 = _mm256_unpacklo_epi16(b0, b1); \
+ const __m256i aa1 = _mm256_unpackhi_epi16(b0, b1); \
+ const __m256i aa2 = _mm256_unpacklo_epi16(b2, b3); \
+ const __m256i aa3 = _mm256_unpackhi_epi16(b2, b3); \
+ /* Unpack 32 bit elements resulting in: */ \
+ /* bb0: 00 10 20 30 01 11 21 31 | 40 50 60 70 41 51 61 71*/ \
+ /* bb1: 02 12 22 32 03 13 23 33 | 42 52 62 72 43 53 63 73*/ \
+ /* bb2: 04 14 24 34 05 15 25 35 | 44 54 64 74 45 55 65 75*/ \
+    /* bb3: 06 16 26 36 07 17 27 37 | 46 56 66 76 47 57 67 77*/               \
+ const __m256i bb0 = _mm256_unpacklo_epi32(aa0, aa2); \
+ const __m256i bb1 = _mm256_unpackhi_epi32(aa0, aa2); \
+ const __m256i bb2 = _mm256_unpacklo_epi32(aa1, aa3); \
+ const __m256i bb3 = _mm256_unpackhi_epi32(aa1, aa3); \
+    /* Permute 64 bit elements (0xd8) resulting in: */                        \
+    /* c0: 00 10 20 30 40 50 60 70 | 01 11 21 31 41 51 61 71*/                \
+    /* c1: 02 12 22 32 42 52 62 72 | 03 13 23 33 43 53 63 73*/                \
+    /* c2: 04 14 24 34 44 54 64 74 | 05 15 25 35 45 55 65 75*/                \
+    /* c3: 06 16 26 36 46 56 66 76 | 07 17 27 37 47 57 67 77*/                \
+ c0 = _mm256_permute4x64_epi64(bb0, 0xd8); \
+ c1 = _mm256_permute4x64_epi64(bb1, 0xd8); \
+ c2 = _mm256_permute4x64_epi64(bb2, 0xd8); \
+ c3 = _mm256_permute4x64_epi64(bb3, 0xd8); \
+ }
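+// The macro above is the standard unpack-based transpose of an 8x8 block of
+// 16-bit values held as two rows per 256-bit register; the trailing
+// permute4x64(0xd8) swaps the middle 64-bit lanes so each transposed row
+// lands contiguously. Scalar equivalent (illustrative only):
+#if 0
+static void transpose_8x8_c(const int16_t in[8][8], int16_t out[8][8]) {
+  for (int i = 0; i < 8; ++i)
+    for (int j = 0; j < 8; ++j) out[j][i] = in[i][j];
+}
+#endif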
+
+static INLINE void transpose_round_shift_flip_8x8(__m128i *const in,
+ __m128i *const out, int bit) {
+ __m256i c0, c1, c2, c3;
+ bit = -bit;
+ const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));
+ const __m256i s04 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
+ const __m256i s15 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
+ const __m256i s26 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
+ const __m256i s37 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
+
+ const __m256i a0 = _mm256_adds_epi16(s04, rounding);
+ const __m256i a1 = _mm256_adds_epi16(s15, rounding);
+ const __m256i a2 = _mm256_adds_epi16(s26, rounding);
+ const __m256i a3 = _mm256_adds_epi16(s37, rounding);
+
+ // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47
+ // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57
+ // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67
+ // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77
+ const __m256i b0 = _mm256_srai_epi16(a0, bit);
+ const __m256i b1 = _mm256_srai_epi16(a1, bit);
+ const __m256i b2 = _mm256_srai_epi16(a2, bit);
+ const __m256i b3 = _mm256_srai_epi16(a3, bit);
+
+ TRANSPOSE_8X8_AVX2()
+
+  // Extract the two 128-bit halves of each row, resulting in:
+ // out[7]: 00 10 20 30 40 50 60 70
+ // out[6]: 01 11 21 31 41 51 61 71
+ // out[5]: 02 12 22 32 42 52 62 72
+ // out[4]: 03 13 23 33 43 53 63 73
+ // out[3]: 04 14 24 34 44 54 64 74
+ // out[2]: 05 15 25 35 45 55 65 75
+ // out[1]: 06 16 26 36 46 56 66 76
+ // out[0]: 07 17 27 37 47 57 67 77
+ out[7] = _mm256_castsi256_si128(c0);
+ out[6] = _mm256_extractf128_si256(c0, 1);
+ out[5] = _mm256_castsi256_si128(c1);
+ out[4] = _mm256_extractf128_si256(c1, 1);
+ out[3] = _mm256_castsi256_si128(c2);
+ out[2] = _mm256_extractf128_si256(c2, 1);
+ out[1] = _mm256_castsi256_si128(c3);
+ out[0] = _mm256_extractf128_si256(c3, 1);
+}
+
+static INLINE void transpose_round_shift_8x8(__m128i *const in,
+ __m128i *const out, int bit) {
+ __m256i c0, c1, c2, c3;
+ bit = -bit;
+ const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));
+ const __m256i s04 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
+ const __m256i s15 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
+ const __m256i s26 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
+ const __m256i s37 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
+
+ const __m256i a0 = _mm256_adds_epi16(s04, rounding);
+ const __m256i a1 = _mm256_adds_epi16(s15, rounding);
+ const __m256i a2 = _mm256_adds_epi16(s26, rounding);
+ const __m256i a3 = _mm256_adds_epi16(s37, rounding);
+
+ // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47
+ // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57
+ // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67
+ // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77
+ const __m256i b0 = _mm256_srai_epi16(a0, bit);
+ const __m256i b1 = _mm256_srai_epi16(a1, bit);
+ const __m256i b2 = _mm256_srai_epi16(a2, bit);
+ const __m256i b3 = _mm256_srai_epi16(a3, bit);
+
+ TRANSPOSE_8X8_AVX2()
+  // Extract the two 128-bit halves of each row, resulting in:
+ // out[7]: 00 10 20 30 40 50 60 70
+ // out[6]: 01 11 21 31 41 51 61 71
+ // out[5]: 02 12 22 32 42 52 62 72
+ // out[4]: 03 13 23 33 43 53 63 73
+ // out[3]: 04 14 24 34 44 54 64 74
+ // out[2]: 05 15 25 35 45 55 65 75
+ // out[1]: 06 16 26 36 46 56 66 76
+ // out[0]: 07 17 27 37 47 57 67 77
+ out[0] = _mm256_castsi256_si128(c0);
+ out[1] = _mm256_extractf128_si256(c0, 1);
+ out[2] = _mm256_castsi256_si128(c1);
+ out[3] = _mm256_extractf128_si256(c1, 1);
+ out[4] = _mm256_castsi256_si128(c2);
+ out[5] = _mm256_extractf128_si256(c2, 1);
+ out[6] = _mm256_castsi256_si128(c3);
+ out[7] = _mm256_extractf128_si256(c3, 1);
+}
+
+static INLINE void store_buffer_16bit_to_32bit_w8_avx2(const __m128i *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ _mm256_store_si256((__m256i *)(out + i * stride),
+ _mm256_cvtepi16_epi32(in[i]));
+ }
+}
+
+static void av1_lowbd_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[8], buf1[8], *buf;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
+ const int txw_idx = get_txw_idx(TX_8X8);
+ const int txh_idx = get_txh_idx(TX_8X8);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  // The shift-sign check is skipped while round shifting because shift[0] is
+  // assumed to always be positive.
+ assert(shift[0] > 0);
+ if (ud_flip)
+ load_buffer_and_flip_round_shift(input, stride, buf0, shift[0]);
+ else
+ load_buffer_and_round_shift(input, stride, buf0, shift[0]);
+
+ col_txfm(buf0, buf0, cos_bit_col);
+  // The shift-sign check is skipped while round shifting because shift[1] is
+  // assumed to always be negative.
+ assert(shift[1] < 0);
+
+ if (lr_flip) {
+ transpose_round_shift_flip_8x8(buf0, buf1, shift[1]);
+ } else {
+ transpose_round_shift_8x8(buf0, buf1, shift[1]);
+ }
+
+ buf = buf1;
+ row_txfm(buf, buf, cos_bit_row);
+
+  // The round-and-shift step is skipped here because shift[2] is assumed to
+  // always be zero.
+ assert(shift[2] == 0);
+ store_buffer_16bit_to_32bit_w8_avx2(buf, output, 8, 8);
+}
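+// Every lowbd 2-D wrapper in this file follows the same pipeline: load (with
+// an optional up/down flip) and shift by shift[0], run the column 1-D
+// transform, transpose while round-shifting by shift[1] (with an optional
+// left/right flip), run the row 1-D transform, then store with shift[2]. A
+// self-contained scalar model of the data flow, using the 8-point identity
+// (x -> 2x) as both 1-D kernels and assuming the TX_8X8 shifts {2, -1, 0}:
+#if 0
+static void fwd_idtx2d_8x8_model(const int16_t *input, int32_t *output,
+                                 int stride) {
+  int32_t col[8][8], row[8][8];
+  for (int r = 0; r < 8; ++r)
+    for (int c = 0; c < 8; ++c)
+      col[r][c] = (input[r * stride + c] << 2) * 2;  // shift[0], col kernel
+  for (int r = 0; r < 8; ++r)
+    for (int c = 0; c < 8; ++c)
+      row[c][r] = (col[r][c] + 1) >> 1;              // transpose, shift[1]
+  for (int r = 0; r < 8; ++r)
+    for (int c = 0; c < 8; ++c)
+      output[r * 8 + c] = row[r][c] * 2;             // row kernel, shift[2]==0
+}
+#endif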
+
+static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_16X16;
+ __m256i buf0[16], buf1[16];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ const int32_t i = 0;
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ }
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
+
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width);
+}
+
+static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_32X32;
+ __m256i buf0[32], buf1[128];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
+ height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ }
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i);
+ transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width);
+ }
+}
+
+static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X64;
+ __m256i buf0[64], buf1[256];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+ __m256i bufA[64];
+ __m256i bufB[64];
+ __m128i *buf = (__m128i *)(buf1 + width * i);
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
+ bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
+ }
+ fdct64_new_avx2(bufA, bufA, cos_bit_row);
+ fdct64_new_avx2(bufB, bufB, cos_bit_row);
+ round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);
+ round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);
+ store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
+ }
+}
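+// For 64-point dimensions AV1 retains only the 32 lowest-frequency
+// coefficients in each direction, which is why the row pass above covers
+// AOMMIN(2, height_div16) strips and the result is stored as a 32x32 block
+// with stride 32. A sketch of how the packed output is addressed
+// (illustrative only):
+#if 0
+static int32_t coeff_64x64(const int32_t *output, int r, int c) {
+  // Coefficients with r >= 32 or c >= 32 are implicitly zero.
+  return (r < 32 && c < 32) ? output[r * 32 + c] : 0;
+}
+#endif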
+
+static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_16X32;
+ __m256i buf0[32], buf1[32];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height);
+ }
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ transpose_16bit_16x16_avx2(buf0, buf1);
+ transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16);
+
+ for (int i = 0; i < 2; i++) {
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height,
+ width);
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m256i buf0[32], buf1[64];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
+ const int txw_idx = get_txw_idx(TX_32X16);
+ const int txh_idx = get_txh_idx(TX_32X16);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 16;
+ const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
+ height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ }
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
+ }
+
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, height, width);
+}
+
+static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_64X32;
+ __m256i buf0[64], buf1[256];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div16); ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+ assert(tx_type == DCT_DCT);
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+ __m256i bufA[64];
+ __m256i bufB[64];
+ __m128i *buf = (__m128i *)(buf1 + width * i);
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
+ bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
+ }
+ fdct64_new_avx2(bufA, bufA, cos_bit_row);
+ fdct64_new_avx2(bufB, bufB, cos_bit_row);
+ round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
+ round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
+
+ store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_32X64;
+ __m256i buf0[64], buf1[256];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+ __m256i bufA[32];
+ __m256i bufB[32];
+ __m128i *buf = (__m128i *)(buf1 + width * i);
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
+ bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
+ }
+ fdct32_avx2(bufA, bufA, cos_bit_row);
+ fdct32_avx2(bufB, bufB, cos_bit_row);
+ round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
+ round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
+
+ store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
+ }
+}
+
+static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_16X64;
+ __m256i buf0[64], buf1[64];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const transform_1d_avx2 row_txfm = fdct16x16_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < height_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+ __m256i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w16_avx2(buf, output + width * i, 32, width);
+ }
+}
+
+static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X16;
+ __m256i buf0[64], buf1[64];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x16_new_avx2;
+ const transform_1d_avx2 row_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < height_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < height_div16; i++) {
+ __m256i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * i, 16, 32);
+ }
+ // Zero out the bottom 16x32 area.
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
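+// A scalar equivalent of the memset above (illustrative only): of the
+// 16x64 = 1024 output entries, only the first half is computed, and the
+// remaining high-frequency half is cleared.
+#if 0
+for (int k = 16 * 32; k < 16 * 64; ++k) output[k] = 0;
+#endif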
+
+static INLINE void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0,
+ __m256i *in1, __m128i *out0, __m128i *out1,
+ __m128i *out2, __m128i *out3,
+ const __m256i *__rounding, int8_t *cos_bit) {
+ __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
+ __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
+ __m256i u0 = _mm256_madd_epi16(t0, *w0);
+ __m256i u1 = _mm256_madd_epi16(t1, *w0);
+ __m256i v0 = _mm256_madd_epi16(t0, *w1);
+ __m256i v1 = _mm256_madd_epi16(t1, *w1);
+
+ __m256i a0 = _mm256_add_epi32(u0, *__rounding);
+ __m256i a1 = _mm256_add_epi32(u1, *__rounding);
+ __m256i b0 = _mm256_add_epi32(v0, *__rounding);
+ __m256i b1 = _mm256_add_epi32(v1, *__rounding);
+
+ __m256i c0 = _mm256_srai_epi32(a0, *cos_bit);
+ __m256i c1 = _mm256_srai_epi32(a1, *cos_bit);
+ __m256i d0 = _mm256_srai_epi32(b0, *cos_bit);
+ __m256i d1 = _mm256_srai_epi32(b1, *cos_bit);
+
+ __m256i temp0 = _mm256_packs_epi32(c0, c1);
+ __m256i temp1 = _mm256_packs_epi32(d0, d1);
+
+ *out0 = _mm256_castsi256_si128(temp0);
+ *out1 = _mm256_castsi256_si128(temp1);
+ *out2 = _mm256_extracti128_si256(temp0, 0x01);
+ *out3 = _mm256_extracti128_si256(temp1, 0x01);
+}
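+// btf_16_avx2 is the 16-bit butterfly kernel: interleaving in0/in1 lets one
+// madd per weight vector compute a full 2-point rotation, which is then
+// rounded, shifted and saturated back to 16 bits. Scalar model with weight
+// pairs w0 = (a, b) and w1 = (c, d) (illustrative only; the SIMD version
+// saturates via _mm256_packs_epi32):
+#if 0
+static void btf_16_c(int32_t a, int32_t b, int32_t c, int32_t d, int16_t in0,
+                     int16_t in1, int16_t *out0, int16_t *out1,
+                     int8_t cos_bit) {
+  const int32_t r = 1 << (cos_bit - 1);
+  *out0 = (int16_t)((a * in0 + b * in1 + r) >> cos_bit);  // madd with w0
+  *out1 = (int16_t)((c * in0 + d * in1 + r) >> cos_bit);  // madd with w1
+}
+#endif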
+
+static INLINE void fdct8x8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+
+ // stage 1
+ __m256i x1[8];
+ x1[0] = _mm256_adds_epi16(input[0], input[7]);
+ x1[7] = _mm256_subs_epi16(input[0], input[7]);
+ x1[1] = _mm256_adds_epi16(input[1], input[6]);
+ x1[6] = _mm256_subs_epi16(input[1], input[6]);
+ x1[2] = _mm256_adds_epi16(input[2], input[5]);
+ x1[5] = _mm256_subs_epi16(input[2], input[5]);
+ x1[3] = _mm256_adds_epi16(input[3], input[4]);
+ x1[4] = _mm256_subs_epi16(input[3], input[4]);
+
+ // stage 2
+ __m256i x2[8];
+ x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm256_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm256_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding,
+ cos_bit);
+ x2[5] = x1[5];
+ x2[6] = x1[6];
+ x2[7] = x1[7];
+
+ // stage 3
+ __m256i x3[8];
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding,
+ cos_bit);
+ x3[0] = x2[0];
+ x3[1] = x2[1];
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding,
+ cos_bit);
+ x3[2] = x2[2];
+ x3[3] = x2[3];
+ x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm256_subs_epi16(x2[7], x2[6]);
+ x3[7] = _mm256_adds_epi16(x2[7], x2[6]);
+
+ // stage 4
+ __m256i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding,
+ cos_bit);
+ x4[4] = x3[4];
+ x4[7] = x3[7];
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], __rounding,
+ cos_bit);
+ x4[5] = x3[5];
+ x4[6] = x3[6];
+ // stage 5
+ output[0] = x4[0];
+ output[1] = x4[4];
+ output[2] = x4[2];
+ output[3] = x4[6];
+ output[4] = x4[1];
+ output[5] = x4[5];
+ output[6] = x4[3];
+ output[7] = x4[7];
+}
+
+static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __zero = _mm256_setzero_si256();
+ const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+ __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+ __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+ __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+ __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+ __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+ __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+ __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+ __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+
+ // stage 1
+ __m256i x1[8];
+ x1[0] = input[0];
+ x1[1] = _mm256_subs_epi16(__zero, input[7]);
+ x1[2] = _mm256_subs_epi16(__zero, input[3]);
+ x1[3] = input[4];
+ x1[4] = _mm256_subs_epi16(__zero, input[1]);
+ x1[5] = input[6];
+ x1[6] = input[2];
+ x1[7] = _mm256_subs_epi16(__zero, input[5]);
+
+ // stage 2
+ __m256i x2[8];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding,
+ cos_bit);
+ x2[2] = x1[2];
+ x2[3] = x1[3];
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], __rounding,
+ cos_bit);
+ x2[6] = x1[6];
+ x2[7] = x1[7];
+
+ // stage 3
+ __m256i x3[8];
+ x3[0] = _mm256_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm256_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm256_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm256_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm256_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm256_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm256_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm256_subs_epi16(x2[5], x2[7]);
+
+ // stage 4
+ __m256i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding,
+ cos_bit);
+ x4[4] = x3[4];
+ x4[5] = x3[5];
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding,
+ cos_bit);
+ x4[6] = x3[6];
+ x4[7] = x3[7];
+
+ // stage 5
+ __m256i x5[8];
+ x5[0] = _mm256_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm256_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm256_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm256_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm256_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm256_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm256_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm256_subs_epi16(x4[3], x4[7]);
+
+ // stage 6
+ __m256i x6[8];
+ btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding,
+ cos_bit);
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding,
+ cos_bit);
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding,
+ cos_bit);
+ x6[4] = x5[4];
+ x6[5] = x5[5];
+ btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding,
+ cos_bit);
+ x6[6] = x5[6];
+ x6[7] = x5[7];
+
+ // stage 7
+ output[0] = x6[1];
+ output[1] = x6[6];
+ output[2] = x6[3];
+ output[3] = x6[4];
+ output[4] = x6[5];
+ output[5] = x6[2];
+ output[6] = x6[7];
+ output[7] = x6[0];
+}
+
+static INLINE void fidentity8x8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+
+ output[0] = _mm256_adds_epi16(input[0], input[0]);
+ output[1] = _mm256_adds_epi16(input[1], input[1]);
+ output[2] = _mm256_adds_epi16(input[2], input[2]);
+ output[3] = _mm256_adds_epi16(input[3], input[3]);
+ output[4] = _mm256_adds_epi16(input[4], input[4]);
+ output[5] = _mm256_adds_epi16(input[5], input[5]);
+ output[6] = _mm256_adds_epi16(input[6], input[6]);
+ output[7] = _mm256_adds_epi16(input[7], input[7]);
+}
+
+static INLINE void fdct8x16_new_avx2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ __m128i temp0, temp1, temp2, temp3;
+ __m256i in0, in1;
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+
+ __m256i cospi_arr[12];
+
+ cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32),
+ cospi_m32_p32, 0x1);
+ cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+ cospi_p32_p32, 0x1);
+ cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+ cospi_p48_p16, 0x1);
+ cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+ cospi_m16_p48, 0x1);
+ cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48),
+ cospi_m48_m16, 0x1);
+ cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16),
+ cospi_m16_p48, 0x1);
+ cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08),
+ cospi_p24_p40, 0x1);
+ cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56),
+ cospi_m40_p24, 0x1);
+ cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04),
+ cospi_p28_p36, 0x1);
+ cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60),
+ cospi_m36_p28, 0x1);
+ cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20),
+ cospi_p12_p52, 0x1);
+ cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44),
+ cospi_m52_p12, 0x1);
+
+ __m256i x[8];
+ x[0] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1);
+ x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14],
+ 0x1);
+ x[2] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1);
+ x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12],
+ 0x1);
+ x[4] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1);
+ x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11],
+ 0x1);
+ x[6] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1);
+ x[7] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1);
+
+ // stage 1
+ __m256i x1[8];
+ x1[0] = _mm256_adds_epi16(x[0], x[1]);
+ x1[7] = _mm256_subs_epi16(x[0], x[1]);
+ x1[1] = _mm256_adds_epi16(x[2], x[3]);
+ x1[6] = _mm256_subs_epi16(x[2], x[3]);
+ x1[2] = _mm256_adds_epi16(x[4], x[5]);
+ x1[5] = _mm256_subs_epi16(x[4], x[5]);
+ x1[3] = _mm256_adds_epi16(x[6], x[7]);
+ x1[4] = _mm256_subs_epi16(x[6], x[7]);
+
+ // stage 2
+ __m256i x2[8];
+ x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
+ x2[7] = _mm256_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
+ x2[6] = _mm256_subs_epi16(x1[1], x1[2]);
+ x2[2] = x1[4];
+ x2[3] = x1[7];
+ btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &x1[5], &x1[6], &temp0, &temp1,
+ &temp2, &temp3, &__rounding_256, &cos_bit);
+ x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1);
+ x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
+
+ // stage 3
+ __m256i x3[8];
+ x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e);
+ x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
+ x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
+ x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]),
+ _mm256_extractf128_si256(x2[7], 0x01), temp0, temp1);
+ x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1);
+ x3[3] = _mm256_adds_epi16(x2[2], x2[4]);
+ x3[4] = _mm256_subs_epi16(x2[2], x2[4]);
+ x3[5] = _mm256_adds_epi16(x2[3], x2[5]);
+ x3[6] = _mm256_subs_epi16(x2[3], x2[5]);
+
+ // stage 4
+ __m256i x4[8];
+ x4[0] = _mm256_blend_epi32(x3[0], x3[1], 0xf0);
+ x4[1] = _mm256_permute2f128_si256(x3[0], x3[1], 0x21);
+ btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &x4[0], &x4[1], &output[0],
+ &output[8], &output[4], &output[12], &__rounding_256, &cos_bit);
+ x4[2] = _mm256_adds_epi16(x3[2], x3[7]);
+ x4[3] = _mm256_subs_epi16(x3[2], x3[7]);
+ x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20);
+ x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20);
+ in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31);
+ in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31);
+ btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+
+ x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1);
+ x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
+
+ // stage 5
+ __m256i x5[4];
+ in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31);
+ in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20);
+ btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &output[2], &output[14],
+ &output[10], &output[6], &__rounding_256, &cos_bit);
+ x5[0] = _mm256_adds_epi16(x4[4], x4[6]);
+ x5[1] = _mm256_subs_epi16(x4[4], x4[6]);
+ x5[2] = _mm256_adds_epi16(x4[5], x4[7]);
+ x5[3] = _mm256_subs_epi16(x4[5], x4[7]);
+
+ // stage 6
+ in0 = _mm256_permute2f128_si256(x5[0], x5[1], 0x20);
+ in1 = _mm256_permute2f128_si256(x5[2], x5[3], 0x31);
+ btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &output[1], &output[15],
+ &output[9], &output[7], &__rounding_256, &cos_bit);
+ in0 = _mm256_permute2f128_si256(x5[1], x5[0], 0x31);
+ in1 = _mm256_permute2f128_si256(x5[3], x5[2], 0x20);
+ btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &output[5],
+ &output[11], &output[13], &output[3], &__rounding_256, &cos_bit);
+}
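+// fdct8x16_new_avx2 packs two different 8-wide __m128i rows into each
+// 256-bit register and gives each 128-bit lane its own cospi weights, so a
+// single btf_16_avx2 call performs two distinct butterflies at once. The
+// packing step in isolation (illustrative only):
+#if 0
+static __m256i pack_two_lanes(__m128i lo, __m128i hi) {
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 0x1);
+}
+#endif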
+
+static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __zero = _mm256_setzero_si256();
+ const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
+ __m256i in0, in1;
+ __m128i temp0, temp1, temp2, temp3;
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+ __m256i cospi_arr[20];
+
+ cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+ cospi_p32_p32, 0x1);
+ cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+ cospi_p32_m32, 0x1);
+ cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+ cospi_p32_p32, 0x1);
+ cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+ cospi_p32_m32, 0x1);
+ cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
+ cospi_m48_p16, 0x1);
+ cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
+ cospi_p16_p48, 0x1);
+ cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
+ cospi_m48_p16, 0x1);
+ cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
+ cospi_p16_p48, 0x1);
+ cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
+ cospi_p40_p24, 0x1);
+ cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08),
+ cospi_p24_m40, 0x1);
+ cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08),
+ cospi_m24_p40, 0x1);
+ cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
+ cospi_p40_p24, 0x1);
+ cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62),
+ cospi_p10_p54, 0x1);
+ cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02),
+ cospi_p54_m10, 0x1);
+ cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46),
+ cospi_p26_p38, 0x1);
+ cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18),
+ cospi_p38_m26, 0x1);
+ cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30),
+ cospi_p42_p22, 0x1);
+ cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34),
+ cospi_p22_m42, 0x1);
+ cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14),
+ cospi_p58_p06, 0x1);
+ cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50),
+ cospi_p06_m58, 0x1);
+
+ __m256i x[8];
+ x[0] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1);
+ x[1] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1);
+ x[2] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1);
+ x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14],
+ 0x1);
+ x[4] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1);
+ x[5] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1);
+ x[6] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1);
+ x[7] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1);
+
+ // stage 1
+ __m256i x1[8];
+ x1[0] = x[0];
+ x1[1] = _mm256_subs_epi16(__zero, x[7]);
+ x1[2] = x[2];
+ x1[3] = _mm256_subs_epi16(__zero, x[5]);
+ x1[4] = _mm256_subs_epi16(__zero, x[4]);
+ x1[5] = x[3];
+ x1[6] = _mm256_subs_epi16(__zero, x[6]);
+ x1[7] = x[1];
+
+ // stage 2
+ __m256i x2[8];
+ x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0);
+ x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0);
+ x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0);
+ x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0);
+ in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0);
+ in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0);
+ btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+ in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21);
+ in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21);
+ btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+ // stage 3
+ __m256i x3[8];
+ x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
+ x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
+ x3[2] = _mm256_adds_epi16(x2[3], x2[2]);
+ x3[3] = _mm256_subs_epi16(x2[3], x2[2]);
+ x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm256_adds_epi16(x2[7], x2[6]);
+ x3[7] = _mm256_subs_epi16(x2[7], x2[6]);
+
+ // stage 4
+ __m256i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[4] = x3[4];
+ x4[5] = x3[5];
+ in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20);
+ in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31);
+ btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+ in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20);
+ in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31);
+ btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+ // stage 5
+ __m256i x5[8];
+ x5[0] = _mm256_adds_epi16(x4[0], x4[2]);
+ x5[1] = _mm256_subs_epi16(x4[0], x4[2]);
+ x5[2] = _mm256_adds_epi16(x4[1], x4[3]);
+ x5[3] = _mm256_subs_epi16(x4[1], x4[3]);
+ x5[4] = _mm256_adds_epi16(x4[4], x4[6]);
+ x5[5] = _mm256_subs_epi16(x4[4], x4[6]);
+ x5[6] = _mm256_adds_epi16(x4[5], x4[7]);
+ x5[7] = _mm256_subs_epi16(x4[5], x4[7]);
+
+ // stage 6
+ __m256i x6[8];
+ x6[0] = x5[0];
+ x6[1] = x5[2];
+ x6[2] = x5[1];
+ x6[3] = x5[3];
+ in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20);
+ in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31);
+ btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+ in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20);
+ in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31);
+ btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &temp0, &temp1,
+ &temp2, &temp3, &__rounding_256, &cos_bit);
+ x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+ // stage 7
+ __m256i x7[8];
+ x7[0] = _mm256_adds_epi16(x6[0], x6[4]);
+ x7[1] = _mm256_subs_epi16(x6[0], x6[4]);
+ x7[2] = _mm256_adds_epi16(x6[1], x6[5]);
+ x7[3] = _mm256_subs_epi16(x6[1], x6[5]);
+ x7[4] = _mm256_adds_epi16(x6[2], x6[6]);
+ x7[5] = _mm256_subs_epi16(x6[2], x6[6]);
+ x7[6] = _mm256_adds_epi16(x6[3], x6[7]);
+ x7[7] = _mm256_subs_epi16(x6[3], x6[7]);
+
+ // stage 8
+ in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20);
+ in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31);
+ btf_16_avx2(&cospi_arr[12], &cospi_arr[13], &in0, &in1, &output[15],
+ &output[0], &output[13], &output[2], &__rounding_256, &cos_bit);
+ in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20);
+ in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31);
+ btf_16_avx2(&cospi_arr[14], &cospi_arr[15], &in0, &in1, &output[11],
+ &output[4], &output[9], &output[6], &__rounding_256, &cos_bit);
+ in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20);
+ in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31);
+ btf_16_avx2(&cospi_arr[16], &cospi_arr[17], &in0, &in1, &output[7],
+ &output[8], &output[5], &output[10], &__rounding_256, &cos_bit);
+ in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20);
+ in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31);
+ btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3],
+ &output[12], &output[1], &output[14], &__rounding_256, &cos_bit);
+}
+
+static INLINE void fidentity8x16_new_avx2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i one = _mm256_set1_epi16(1);
+ __m256i temp;
+ for (int i = 0; i < 16; i += 2) {
+ temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]),
+ input[i + 1], 0x1);
+ const __m256i a_lo = _mm256_unpacklo_epi16(temp, one);
+ const __m256i a_hi = _mm256_unpackhi_epi16(temp, one);
+ const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
+ const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
+ temp = _mm256_packs_epi32(b_lo, b_hi);
+ output[i] = _mm256_castsi256_si128(temp);
+ output[i + 1] = _mm256_extractf128_si256(temp, 0x1);
+ }
+}
+
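+// 1D kernel tables for the rectangular 8x16 and 16x8 paths, indexed by
+// TX_TYPE. The first half of the 2D type name selects the column (vertical)
+// kernel and the second half the row (horizontal) kernel; FLIPADST reuses the
+// ADST kernels, with the flips applied at load/store time via get_flip_cfg().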
+static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = {
+ fdct8x8_new_avx2, // DCT_DCT
+ fdct8x8_new_avx2, // ADST_DCT
+ fadst8x8_new_avx2, // DCT_ADST
+ fadst8x8_new_avx2, // ADST_ADST
+ fdct8x8_new_avx2, // FLIPADST_DCT
+ fadst8x8_new_avx2, // DCT_FLIPADST
+ fadst8x8_new_avx2, // FLIPADST_FLIPADST
+ fadst8x8_new_avx2, // ADST_FLIPADST
+ fadst8x8_new_avx2, // FLIPADST_ADST
+ fidentity8x8_new_avx2, // IDTX
+ fidentity8x8_new_avx2, // V_DCT
+ fdct8x8_new_avx2, // H_DCT
+ fidentity8x8_new_avx2, // V_ADST
+ fadst8x8_new_avx2, // H_ADST
+ fidentity8x8_new_avx2, // V_FLIPADST
+ fadst8x8_new_avx2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_new_avx2, // DCT_DCT
+ fadst8x16_new_avx2, // ADST_DCT
+ fdct8x16_new_avx2, // DCT_ADST
+ fadst8x16_new_avx2, // ADST_ADST
+ fadst8x16_new_avx2, // FLIPADST_DCT
+ fdct8x16_new_avx2, // DCT_FLIPADST
+ fadst8x16_new_avx2, // FLIPADST_FLIPADST
+ fadst8x16_new_avx2, // ADST_FLIPADST
+ fadst8x16_new_avx2, // FLIPADST_ADST
+ fidentity8x16_new_avx2, // IDTX
+ fdct8x16_new_avx2, // V_DCT
+ fidentity8x16_new_avx2, // H_DCT
+ fadst8x16_new_avx2, // V_ADST
+ fidentity8x16_new_avx2, // H_ADST
+ fadst8x16_new_avx2, // V_FLIPADST
+ fidentity8x16_new_avx2 // H_FLIPADST
+};
+
+static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = {
+ fdct8x8_new_avx2, // DCT_DCT
+ fadst8x8_new_avx2, // ADST_DCT
+ fdct8x8_new_avx2, // DCT_ADST
+ fadst8x8_new_avx2, // ADST_ADST
+ fadst8x8_new_avx2, // FLIPADST_DCT
+ fdct8x8_new_avx2, // DCT_FLIPADST
+ fadst8x8_new_avx2, // FLIPADST_FLIPADST
+ fadst8x8_new_avx2, // ADST_FLIPADST
+ fadst8x8_new_avx2, // FLIPADST_ADST
+ fidentity8x8_new_avx2, // IDTX
+ fdct8x8_new_avx2, // V_DCT
+ fidentity8x8_new_avx2, // H_DCT
+ fadst8x8_new_avx2, // V_ADST
+ fidentity8x8_new_avx2, // H_ADST
+ fadst8x8_new_avx2, // V_FLIPADST
+ fidentity8x8_new_avx2, // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = {
+ fdct8x16_new_avx2, // DCT_DCT
+ fdct8x16_new_avx2, // ADST_DCT
+ fadst8x16_new_avx2, // DCT_ADST
+ fadst8x16_new_avx2, // ADST_ADST
+ fdct8x16_new_avx2, // FLIPADST_DCT
+ fadst8x16_new_avx2, // DCT_FLIPADST
+ fadst8x16_new_avx2, // FLIPADST_FLIPADST
+ fadst8x16_new_avx2, // ADST_FLIPADST
+ fadst8x16_new_avx2, // FLIPADST_ADST
+ fidentity8x16_new_avx2, // IDTX
+ fidentity8x16_new_avx2, // V_DCT
+ fdct8x16_new_avx2, // H_DCT
+ fidentity8x16_new_avx2, // V_ADST
+ fadst8x16_new_avx2, // H_ADST
+ fidentity8x16_new_avx2, // V_FLIPADST
+ fadst8x16_new_avx2 // H_FLIPADST
+};
+
+static void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ __m256i buf2[8];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
+ const int txw_idx = get_txw_idx(TX_8X16);
+ const int txh_idx = get_txh_idx(TX_8X16);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+
+ __m128i *bufl, *bufu;
+ if (lr_flip) {
+ bufl = buf0;
+ bufu = buf0 + 8;
+ flip_buf_sse2(buf1 + width * 0, bufl, width);
+ flip_buf_sse2(buf1 + width * 1, bufu, width);
+ } else {
+ bufl = buf1 + width * 0;
+ bufu = buf1 + width * 1;
+ }
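+  // Pack the two 8-row halves side by side so the 8-point row transform runs
+  // on all 16 rows at once in 256-bit registers.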
+ pack_reg(bufl, bufu, buf2);
+ row_txfm(buf2, buf2, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf2, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w16_avx2(buf2, output, height, width);
+}
+
+static void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ __m256i buf2[8];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
+ const int txw_idx = get_txw_idx(TX_16X8);
+ const int txh_idx = get_txh_idx(TX_16X8);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 8;
+ const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type];
+ __m128i *buf;
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height);
+ load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height);
+ load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height);
+ }
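+  // Pack the left and right 8-column halves into 256-bit registers so the
+  // 8-point column transform covers all 16 columns at once.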
+ pack_reg(buf0, &buf0[8], buf2);
+ round_shift_16bit_w16_avx2(buf2, height, shift[0]);
+ col_txfm(buf2, buf2, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf2, height, shift[1]);
+ transpose_16bit_16x8_avx2(buf2, buf2);
+ extract_reg(buf2, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width);
+}
+
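+// Per-TX_SIZE dispatch table: sizes without a dedicated AVX2 kernel fall
+// back to the SSE2 implementations.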
+static const FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+ av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
+ av1_lowbd_fwd_txfm2d_8x8_avx2, // 8x8 transform
+ lowbd_fwd_txfm2d_16x16_avx2, // 16x16 transform
+ lowbd_fwd_txfm2d_32x32_avx2, // 32x32 transform
+ lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform
+ av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
+ av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
+ lowbd_fwd_txfm2d_8x16_avx2, // 8x16 transform
+ lowbd_fwd_txfm2d_16x8_avx2, // 16x8 transform
+ lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform
+ lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform
+ lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform
+ lowbd_fwd_txfm2d_64x32_avx2, // 64x32 transform
+ av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
+ av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
+ av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
+ av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
+ lowbd_fwd_txfm2d_16x64_avx2, // 16x64 transform
+ lowbd_fwd_txfm2d_64x16_avx2, // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
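+  // Lossless 4x4 blocks use the Walsh-Hadamard transform, which only the C
+  // path implements.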
+ if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
+ av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+ } else {
+ fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
new file mode 100644
index 0000000000..825da8d7b4
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+
+static INLINE void int16_array_with_stride_to_int32_array_without_stride(
+ const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
+ int r, c;
+ for (r = 0; r < txfm1d_size; r++) {
+ for (c = 0; c < txfm1d_size; c++) {
+ output[r * txfm1d_size + c] = (int32_t)input[r * stride + c];
+ }
+ }
+}
+
+static INLINE void store_output_32bit_w8(int32_t *const out,
+ const __m128i *const in1,
+ const __m128i *const in2,
+ const int stride, const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ _mm_store_si128((__m128i *)(out + stride * i), in1[i]);
+ _mm_store_si128((__m128i *)(out + stride * i + 4), in2[i]);
+ }
+}
+
+typedef void (*TxfmFuncSSE2)(__m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+static void fdct32_sse4_1(__m128i *input, __m128i *output, const int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int txfm_size = 32;
+ const int num_per_128 = 4;
+ int col_num = txfm_size / num_per_128;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+ av1_fdct32_sse4_1((input + col), (output + col), cos_bit, col_num);
+ }
+}
+
+static void fdct64_new_sse4_1(__m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 64;
+ const int num_per_128 = 4;
+ int col_num = txfm_size / num_per_128;
+ (void)stage_range;
+ for (int col = 0; col < col_num; col++) {
+ av1_fdct64_sse4_1((input + col), (output + col), cos_bit, col_num, col_num);
+ }
+}
+
+static void idtx32x32_sse4_1(__m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range) {
+ (void)stage_range;
+
+ for (int i = 0; i < 8; i++) {
+ av1_idtx32_sse4_1(&input[i * 32], &output[i * 32], cos_bit, 1);
+ }
+}
+
+static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+ switch (txfm_type) {
+ case TXFM_TYPE_DCT32: return fdct32_sse4_1;
+ case TXFM_TYPE_DCT64: return fdct64_new_sse4_1;
+ case TXFM_TYPE_IDENTITY32: return idtx32x32_sse4_1;
+ default: assert(0);
+ }
+ return NULL;
+}
+
+static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
+ const int stride,
+ const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf) {
+  // TODO(sarahparker): This does not currently support rectangular transforms
+  // and will break unless txfm_size is split into separate row and column
+  // sizes. Rectangular transforms use the C code only, so this is fine for
+  // now; it will be corrected once SSE implementations of the rectangular
+  // transforms exist.
+ assert(cfg->tx_size < TX_SIZES);
+ const int txfm_size = tx_size_wide[cfg->tx_size];
+ const int8_t *shift = cfg->shift;
+ const int8_t *stage_range_col = cfg->stage_range_col;
+ const int8_t *stage_range_row = cfg->stage_range_row;
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+ __m128i *buf_128 = (__m128i *)txfm_buf;
+ __m128i *out_128 = (__m128i *)output;
+ int num_per_128 = 4;
+ int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+
+ int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
+ txfm_size);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
+ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+ transpose_32(txfm_size, out_128, buf_128);
+ txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
+ av1_round_shift_array_32_sse4_1(out_128, out_128, txfm2d_size_128, -shift[2]);
+}
+
+static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input,
+ int32_t *output, const int stride,
+ const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf) {
+ assert(cfg->tx_size < TX_SIZES);
+ const int txfm_size = tx_size_wide[cfg->tx_size];
+ const int8_t *shift = cfg->shift;
+ const int8_t *stage_range_col = cfg->stage_range_col;
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ __m128i *buf_128 = (__m128i *)txfm_buf;
+ __m128i *out_128 = (__m128i *)output;
+
+ const int num_per_128 = 4;
+ int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+ int col_num = txfm_size / num_per_128;
+
+ int16_array_with_stride_to_int32_array_without_stride(input, stride, output,
+ txfm_size);
+  // Column-wise transform.
+ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+ transpose_32(txfm_size, out_128, buf_128);
+
+  // Row-wise transform: only half of the columns are processed, because just
+  // the top-left 32x32 block of a 64x64 transform's coefficients is retained.
+ for (int col = 0; col < (col_num >> 1); col++) {
+ av1_fdct64_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row, col_num,
+ (col_num >> 1));
+ }
+
+ txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1);
+ av1_round_shift_array_32_sse4_1(out_128, out_128, txfm2d_size_128, -shift[2]);
+}
+
+void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);
+ (void)bd;
+ fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf);
+}
+
+void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
+ (void)bd;
+ fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf);
+}
+
+static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X64;
+ __m128i buf0[64], buf1[512];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
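+  // Column pass over all 64 columns; only the first 32 transposed rows
+  // (AOMMIN(4, height_div8) groups of 8) feed the row pass, since the
+  // high-frequency half of a 64-point transform is not coded.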
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[64];
+ __m128i bufB[64];
+ __m128i *buf = buf1 + width * i;
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+ av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
+ av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
+ av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+
+ store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+ }
+}
+
+static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X32;
+ __m128i buf0[64], buf1[256];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[64];
+ __m128i bufB[64];
+ __m128i *buf = buf1 + width * i;
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+ av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
+
+ store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_32X64;
+ __m128i buf0[64], buf1[256];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[32];
+ __m128i bufB[32];
+ __m128i *buf = buf1 + width * i;
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct32_sse4_1(bufA, bufA, cos_bit_row, 1);
+ av1_fdct32_sse4_1(bufB, bufB, cos_bit_row, 1);
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
+
+ store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+ }
+}
+
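+// Per-TX_SIZE dispatch table: only the 64-point sizes have dedicated SSE4.1
+// kernels; all other sizes reuse the SSE2 implementations.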
+static const FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+ av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
+ av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform
+ av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform
+ av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform
+ lowbd_fwd_txfm2d_64x64_sse4_1, // 64x64 transform
+ av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
+ av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
+ av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform
+ av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform
+ av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform
+ av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform
+ lowbd_fwd_txfm2d_32x64_sse4_1, // 32x64 transform
+ lowbd_fwd_txfm2d_64x32_sse4_1, // 64x32 transform
+ av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
+ av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
+ av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
+ av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
+ av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform
+ av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
+ if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
+ av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+ } else {
+ fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
new file mode 100644
index 0000000000..aaad76e5ae
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#include <immintrin.h>
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
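+// Per 32-bit lane, with _r assumed to hold the usual 1 << (cos_bit - 1)
+// rounding constant, this is equivalent to the plain C sketch:
+//   out0 = (in0 * w0 + in1 * w1 + (1 << (cos_bit - 1))) >> cos_bit;
+//   out1 = (in0 * w1 - in1 * w0 + (1 << (cos_bit - 1))) >> cos_bit;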
+static INLINE void btf_32_avx2_type0(const int32_t w0, const int32_t w1,
+ __m256i *in0, __m256i *in1,
+ const __m256i _r, const int32_t cos_bit) {
+ __m256i _in0 = *in0;
+ __m256i _in1 = *in1;
+ const __m256i ww0 = _mm256_set1_epi32(w0);
+ const __m256i ww1 = _mm256_set1_epi32(w1);
+ const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+ const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+ __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+ temp0 = _mm256_add_epi32(temp0, _r);
+ *in0 = _mm256_srai_epi32(temp0, cos_bit);
+ const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+ const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+ __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0);
+ temp1 = _mm256_add_epi32(temp1, _r);
+ *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
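+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1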
+static INLINE void btf_32_avx2_type1(const int32_t w0, const int32_t w1,
+ __m256i *in0, __m256i *in1,
+ const __m256i _r, const int32_t cos_bit) {
+ __m256i _in0 = *in0;
+ __m256i _in1 = *in1;
+ const __m256i ww0 = _mm256_set1_epi32(w0);
+ const __m256i ww1 = _mm256_set1_epi32(w1);
+ const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+ const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+ __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+ temp0 = _mm256_add_epi32(temp0, _r);
+ *in0 = _mm256_srai_epi32(temp0, cos_bit);
+ const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+ const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+ __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1);
+ temp1 = _mm256_add_epi32(temp1, _r);
+ *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+static INLINE void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1,
+ __m256i *in0, __m256i *in1,
+ const __m256i _r,
+ const int32_t cos_bit) {
+ __m256i _in0 = *in0;
+ __m256i _in1 = *in1;
+ const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+ const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+ __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+ temp0 = _mm256_add_epi32(temp0, _r);
+ *in0 = _mm256_srai_epi32(temp0, cos_bit);
+ const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+ const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+ __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0);
+ temp1 = _mm256_add_epi32(temp1, _r);
+ *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1,
+ __m256i *in0, __m256i *in1,
+ const __m256i _r,
+ const int32_t cos_bit) {
+ __m256i _in0 = *in0;
+ __m256i _in1 = *in1;
+ const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+ const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+ __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+ temp0 = _mm256_add_epi32(temp0, _r);
+ *in0 = _mm256_srai_epi32(temp0, cos_bit);
+ const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+ const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+ __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1);
+ temp1 = _mm256_add_epi32(temp1, _r);
+ *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c
new file mode 100644
index 0000000000..a4def754b0
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -0,0 +1,2673 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+
+// TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible).
+
+static void fdct4x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ __m128i u[4], v[4];
+
+ u[0] = _mm_unpacklo_epi16(input[0], input[1]);
+ u[1] = _mm_unpacklo_epi16(input[3], input[2]);
+
+ v[0] = _mm_add_epi16(u[0], u[1]);
+ v[1] = _mm_sub_epi16(u[0], u[1]);
+
+  u[0] = _mm_madd_epi16(v[0], cospi_p32_p32); // DCT output row 0
+  u[1] = _mm_madd_epi16(v[0], cospi_p32_m32); // DCT output row 2
+  u[2] = _mm_madd_epi16(v[1], cospi_p16_p48); // DCT output row 1
+  u[3] = _mm_madd_epi16(v[1], cospi_p48_m16); // DCT output row 3
+
+ v[0] = _mm_add_epi32(u[0], __rounding);
+ v[1] = _mm_add_epi32(u[1], __rounding);
+ v[2] = _mm_add_epi32(u[2], __rounding);
+ v[3] = _mm_add_epi32(u[3], __rounding);
+ u[0] = _mm_srai_epi32(v[0], cos_bit);
+ u[1] = _mm_srai_epi32(v[1], cos_bit);
+ u[2] = _mm_srai_epi32(v[2], cos_bit);
+ u[3] = _mm_srai_epi32(v[3], cos_bit);
+
+ output[0] = _mm_packs_epi32(u[0], u[1]);
+ output[1] = _mm_packs_epi32(u[2], u[3]);
+ output[2] = _mm_srli_si128(output[0], 8);
+ output[3] = _mm_srli_si128(output[1], 8);
+}
+
+static void fdct8x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+
+ // stage 1
+ __m128i x1[4];
+ x1[0] = _mm_adds_epi16(input[0], input[3]);
+ x1[3] = _mm_subs_epi16(input[0], input[3]);
+ x1[1] = _mm_adds_epi16(input[1], input[2]);
+ x1[2] = _mm_subs_epi16(input[1], input[2]);
+
+ // stage 2
+ __m128i x2[4];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x2[0], x2[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x1[2], x1[3], x2[2], x2[3]);
+
+ // stage 3
+ output[0] = x2[0];
+ output[1] = x2[2];
+ output[2] = x2[1];
+ output[3] = x2[3];
+}
+
+static void fdct4x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = _mm_adds_epi16(input[0], input[7]);
+ x1[7] = _mm_subs_epi16(input[0], input[7]);
+ x1[1] = _mm_adds_epi16(input[1], input[6]);
+ x1[6] = _mm_subs_epi16(input[1], input[6]);
+ x1[2] = _mm_adds_epi16(input[2], input[5]);
+ x1[5] = _mm_subs_epi16(input[2], input[5]);
+ x1[3] = _mm_adds_epi16(input[3], input[4]);
+ x1[4] = _mm_subs_epi16(input[3], input[4]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = _mm_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_w4_sse2(&cospi_m32_p32, &cospi_p32_p32, __rounding, cos_bit, &x1[5],
+ &x1[6], &x2[5], &x2[6]);
+ x2[7] = x1[7];
+
+ // stage 3
+ __m128i x3[8];
+ btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x2[0],
+ &x2[1], &x3[0], &x3[1]);
+ btf_16_w4_sse2(&cospi_p48_p16, &cospi_m16_p48, __rounding, cos_bit, &x2[2],
+ &x2[3], &x3[2], &x3[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm_subs_epi16(x2[7], x2[6]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[6]);
+
+ // stage 4
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w4_sse2(&cospi_p56_p08, &cospi_m08_p56, __rounding, cos_bit, &x3[4],
+ &x3[7], &x4[4], &x4[7]);
+ btf_16_w4_sse2(&cospi_p24_p40, &cospi_m40_p24, __rounding, cos_bit, &x3[5],
+ &x3[6], &x4[5], &x4[6]);
+
+ // stage 5
+ output[0] = x4[0];
+ output[1] = x4[4];
+ output[2] = x4[2];
+ output[3] = x4[6];
+ output[4] = x4[1];
+ output[5] = x4[5];
+ output[6] = x4[3];
+ output[7] = x4[7];
+}
+
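+// The fdct8xN kernels below apply a 1D N-point forward DCT to eight columns
+// at a time: each __m128i holds one row of eight int16 samples.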
+static void fdct8x16_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+
+ // stage 1
+ __m128i x1[16];
+ x1[0] = _mm_adds_epi16(input[0], input[15]);
+ x1[15] = _mm_subs_epi16(input[0], input[15]);
+ x1[1] = _mm_adds_epi16(input[1], input[14]);
+ x1[14] = _mm_subs_epi16(input[1], input[14]);
+ x1[2] = _mm_adds_epi16(input[2], input[13]);
+ x1[13] = _mm_subs_epi16(input[2], input[13]);
+ x1[3] = _mm_adds_epi16(input[3], input[12]);
+ x1[12] = _mm_subs_epi16(input[3], input[12]);
+ x1[4] = _mm_adds_epi16(input[4], input[11]);
+ x1[11] = _mm_subs_epi16(input[4], input[11]);
+ x1[5] = _mm_adds_epi16(input[5], input[10]);
+ x1[10] = _mm_subs_epi16(input[5], input[10]);
+ x1[6] = _mm_adds_epi16(input[6], input[9]);
+ x1[9] = _mm_subs_epi16(input[6], input[9]);
+ x1[7] = _mm_adds_epi16(input[7], input[8]);
+ x1[8] = _mm_subs_epi16(input[7], input[8]);
+
+ // stage 2
+ __m128i x2[16];
+ x2[0] = _mm_adds_epi16(x1[0], x1[7]);
+ x2[7] = _mm_subs_epi16(x1[0], x1[7]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[6]);
+ x2[6] = _mm_subs_epi16(x1[1], x1[6]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[5]);
+ x2[5] = _mm_subs_epi16(x1[2], x1[5]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[4]);
+ x2[4] = _mm_subs_epi16(x1[3], x1[4]);
+ x2[8] = x1[8];
+ x2[9] = x1[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[10], x1[13], x2[10], x2[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[11], x1[12], x2[11], x2[12]);
+ x2[14] = x1[14];
+ x2[15] = x1[15];
+
+ // stage 3
+ __m128i x3[16];
+ x3[0] = _mm_adds_epi16(x2[0], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[0], x2[3]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[1], x2[2]);
+ x3[4] = x2[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[5], x2[6], x3[5], x3[6]);
+ x3[7] = x2[7];
+ x3[8] = _mm_adds_epi16(x2[8], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[8], x2[11]);
+ x3[9] = _mm_adds_epi16(x2[9], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[9], x2[10]);
+ x3[12] = _mm_subs_epi16(x2[15], x2[12]);
+ x3[15] = _mm_adds_epi16(x2[15], x2[12]);
+ x3[13] = _mm_subs_epi16(x2[14], x2[13]);
+ x3[14] = _mm_adds_epi16(x2[14], x2[13]);
+
+ // stage 4
+ __m128i x4[16];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x3[0], x3[1], x4[0], x4[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x3[2], x3[3], x4[2], x4[3]);
+ x4[4] = _mm_adds_epi16(x3[4], x3[5]);
+ x4[5] = _mm_subs_epi16(x3[4], x3[5]);
+ x4[6] = _mm_subs_epi16(x3[7], x3[6]);
+ x4[7] = _mm_adds_epi16(x3[7], x3[6]);
+ x4[8] = x3[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[9], x3[14], x4[9], x4[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[10], x3[13], x4[10], x4[13]);
+ x4[11] = x3[11];
+ x4[12] = x3[12];
+ x4[15] = x3[15];
+
+ // stage 5
+ __m128i x5[16];
+ x5[0] = x4[0];
+ x5[1] = x4[1];
+ x5[2] = x4[2];
+ x5[3] = x4[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x4[4], x4[7], x5[4], x5[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x4[5], x4[6], x5[5], x5[6]);
+ x5[8] = _mm_adds_epi16(x4[8], x4[9]);
+ x5[9] = _mm_subs_epi16(x4[8], x4[9]);
+ x5[10] = _mm_subs_epi16(x4[11], x4[10]);
+ x5[11] = _mm_adds_epi16(x4[11], x4[10]);
+ x5[12] = _mm_adds_epi16(x4[12], x4[13]);
+ x5[13] = _mm_subs_epi16(x4[12], x4[13]);
+ x5[14] = _mm_subs_epi16(x4[15], x4[14]);
+ x5[15] = _mm_adds_epi16(x4[15], x4[14]);
+
+ // stage 6
+ __m128i x6[16];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ x6[4] = x5[4];
+ x6[5] = x5[5];
+ x6[6] = x5[6];
+ x6[7] = x5[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x5[8], x5[15], x6[8], x6[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x5[9], x5[14], x6[9], x6[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x5[10], x5[13], x6[10], x6[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x5[11], x5[12], x6[11], x6[12]);
+
+ // stage 7
+ output[0] = x6[0];
+ output[1] = x6[8];
+ output[2] = x6[4];
+ output[3] = x6[12];
+ output[4] = x6[2];
+ output[5] = x6[10];
+ output[6] = x6[6];
+ output[7] = x6[14];
+ output[8] = x6[1];
+ output[9] = x6[9];
+ output[10] = x6[5];
+ output[11] = x6[13];
+ output[12] = x6[3];
+ output[13] = x6[11];
+ output[14] = x6[7];
+ output[15] = x6[15];
+}
+
+void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]);
+ __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]);
+ __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]);
+ __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]);
+ __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]);
+ __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]);
+ __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]);
+ __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]);
+ __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]);
+ __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]);
+ __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]);
+ __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]);
+ __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]);
+ __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]);
+ __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]);
+ __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]);
+
+ // stage 1
+ __m128i x1[32];
+ x1[0] = _mm_adds_epi16(input[0], input[31]);
+ x1[31] = _mm_subs_epi16(input[0], input[31]);
+ x1[1] = _mm_adds_epi16(input[1], input[30]);
+ x1[30] = _mm_subs_epi16(input[1], input[30]);
+ x1[2] = _mm_adds_epi16(input[2], input[29]);
+ x1[29] = _mm_subs_epi16(input[2], input[29]);
+ x1[3] = _mm_adds_epi16(input[3], input[28]);
+ x1[28] = _mm_subs_epi16(input[3], input[28]);
+ x1[4] = _mm_adds_epi16(input[4], input[27]);
+ x1[27] = _mm_subs_epi16(input[4], input[27]);
+ x1[5] = _mm_adds_epi16(input[5], input[26]);
+ x1[26] = _mm_subs_epi16(input[5], input[26]);
+ x1[6] = _mm_adds_epi16(input[6], input[25]);
+ x1[25] = _mm_subs_epi16(input[6], input[25]);
+ x1[7] = _mm_adds_epi16(input[7], input[24]);
+ x1[24] = _mm_subs_epi16(input[7], input[24]);
+ x1[8] = _mm_adds_epi16(input[8], input[23]);
+ x1[23] = _mm_subs_epi16(input[8], input[23]);
+ x1[9] = _mm_adds_epi16(input[9], input[22]);
+ x1[22] = _mm_subs_epi16(input[9], input[22]);
+ x1[10] = _mm_adds_epi16(input[10], input[21]);
+ x1[21] = _mm_subs_epi16(input[10], input[21]);
+ x1[11] = _mm_adds_epi16(input[11], input[20]);
+ x1[20] = _mm_subs_epi16(input[11], input[20]);
+ x1[12] = _mm_adds_epi16(input[12], input[19]);
+ x1[19] = _mm_subs_epi16(input[12], input[19]);
+ x1[13] = _mm_adds_epi16(input[13], input[18]);
+ x1[18] = _mm_subs_epi16(input[13], input[18]);
+ x1[14] = _mm_adds_epi16(input[14], input[17]);
+ x1[17] = _mm_subs_epi16(input[14], input[17]);
+ x1[15] = _mm_adds_epi16(input[15], input[16]);
+ x1[16] = _mm_subs_epi16(input[15], input[16]);
+
+ // stage 2
+ __m128i x2[32];
+ x2[0] = _mm_adds_epi16(x1[0], x1[15]);
+ x2[15] = _mm_subs_epi16(x1[0], x1[15]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[14]);
+ x2[14] = _mm_subs_epi16(x1[1], x1[14]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[13]);
+ x2[13] = _mm_subs_epi16(x1[2], x1[13]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[12]);
+ x2[12] = _mm_subs_epi16(x1[3], x1[12]);
+ x2[4] = _mm_adds_epi16(x1[4], x1[11]);
+ x2[11] = _mm_subs_epi16(x1[4], x1[11]);
+ x2[5] = _mm_adds_epi16(x1[5], x1[10]);
+ x2[10] = _mm_subs_epi16(x1[5], x1[10]);
+ x2[6] = _mm_adds_epi16(x1[6], x1[9]);
+ x2[9] = _mm_subs_epi16(x1[6], x1[9]);
+ x2[7] = _mm_adds_epi16(x1[7], x1[8]);
+ x2[8] = _mm_subs_epi16(x1[7], x1[8]);
+ x2[16] = x1[16];
+ x2[17] = x1[17];
+ x2[18] = x1[18];
+ x2[19] = x1[19];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[20], x1[27], x2[20], x2[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[21], x1[26], x2[21], x2[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[22], x1[25], x2[22], x2[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[23], x1[24], x2[23], x2[24]);
+ x2[28] = x1[28];
+ x2[29] = x1[29];
+ x2[30] = x1[30];
+ x2[31] = x1[31];
+
+ // stage 3
+ __m128i x3[32];
+ x3[0] = _mm_adds_epi16(x2[0], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[0], x2[7]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[1], x2[6]);
+ x3[2] = _mm_adds_epi16(x2[2], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[2], x2[5]);
+ x3[3] = _mm_adds_epi16(x2[3], x2[4]);
+ x3[4] = _mm_subs_epi16(x2[3], x2[4]);
+ x3[8] = x2[8];
+ x3[9] = x2[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[10], x2[13], x3[10], x3[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[11], x2[12], x3[11], x3[12]);
+ x3[14] = x2[14];
+ x3[15] = x2[15];
+ x3[16] = _mm_adds_epi16(x2[16], x2[23]);
+ x3[23] = _mm_subs_epi16(x2[16], x2[23]);
+ x3[17] = _mm_adds_epi16(x2[17], x2[22]);
+ x3[22] = _mm_subs_epi16(x2[17], x2[22]);
+ x3[18] = _mm_adds_epi16(x2[18], x2[21]);
+ x3[21] = _mm_subs_epi16(x2[18], x2[21]);
+ x3[19] = _mm_adds_epi16(x2[19], x2[20]);
+ x3[20] = _mm_subs_epi16(x2[19], x2[20]);
+ x3[24] = _mm_subs_epi16(x2[31], x2[24]);
+ x3[31] = _mm_adds_epi16(x2[31], x2[24]);
+ x3[25] = _mm_subs_epi16(x2[30], x2[25]);
+ x3[30] = _mm_adds_epi16(x2[30], x2[25]);
+ x3[26] = _mm_subs_epi16(x2[29], x2[26]);
+ x3[29] = _mm_adds_epi16(x2[29], x2[26]);
+ x3[27] = _mm_subs_epi16(x2[28], x2[27]);
+ x3[28] = _mm_adds_epi16(x2[28], x2[27]);
+
+ // stage 4
+ __m128i x4[32];
+ x4[0] = _mm_adds_epi16(x3[0], x3[3]);
+ x4[3] = _mm_subs_epi16(x3[0], x3[3]);
+ x4[1] = _mm_adds_epi16(x3[1], x3[2]);
+ x4[2] = _mm_subs_epi16(x3[1], x3[2]);
+ x4[4] = x3[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]);
+ x4[7] = x3[7];
+ x4[8] = _mm_adds_epi16(x3[8], x3[11]);
+ x4[11] = _mm_subs_epi16(x3[8], x3[11]);
+ x4[9] = _mm_adds_epi16(x3[9], x3[10]);
+ x4[10] = _mm_subs_epi16(x3[9], x3[10]);
+ x4[12] = _mm_subs_epi16(x3[15], x3[12]);
+ x4[15] = _mm_adds_epi16(x3[15], x3[12]);
+ x4[13] = _mm_subs_epi16(x3[14], x3[13]);
+ x4[14] = _mm_adds_epi16(x3[14], x3[13]);
+ x4[16] = x3[16];
+ x4[17] = x3[17];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[18], x3[29], x4[18], x4[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[19], x3[28], x4[19], x4[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[20], x3[27], x4[20], x4[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[21], x3[26], x4[21], x4[26]);
+ x4[22] = x3[22];
+ x4[23] = x3[23];
+ x4[24] = x3[24];
+ x4[25] = x3[25];
+ x4[30] = x3[30];
+ x4[31] = x3[31];
+
+ // stage 5
+ __m128i x5[32];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x4[0], x4[1], x5[0], x5[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x4[2], x4[3], x5[2], x5[3]);
+ x5[4] = _mm_adds_epi16(x4[4], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[4], x4[5]);
+ x5[6] = _mm_subs_epi16(x4[7], x4[6]);
+ x5[7] = _mm_adds_epi16(x4[7], x4[6]);
+ x5[8] = x4[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[9], x4[14], x5[9], x5[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[10], x4[13], x5[10], x5[13]);
+ x5[11] = x4[11];
+ x5[12] = x4[12];
+ x5[15] = x4[15];
+ x5[16] = _mm_adds_epi16(x4[16], x4[19]);
+ x5[19] = _mm_subs_epi16(x4[16], x4[19]);
+ x5[17] = _mm_adds_epi16(x4[17], x4[18]);
+ x5[18] = _mm_subs_epi16(x4[17], x4[18]);
+ x5[20] = _mm_subs_epi16(x4[23], x4[20]);
+ x5[23] = _mm_adds_epi16(x4[23], x4[20]);
+ x5[21] = _mm_subs_epi16(x4[22], x4[21]);
+ x5[22] = _mm_adds_epi16(x4[22], x4[21]);
+ x5[24] = _mm_adds_epi16(x4[24], x4[27]);
+ x5[27] = _mm_subs_epi16(x4[24], x4[27]);
+ x5[25] = _mm_adds_epi16(x4[25], x4[26]);
+ x5[26] = _mm_subs_epi16(x4[25], x4[26]);
+ x5[28] = _mm_subs_epi16(x4[31], x4[28]);
+ x5[31] = _mm_adds_epi16(x4[31], x4[28]);
+ x5[29] = _mm_subs_epi16(x4[30], x4[29]);
+ x5[30] = _mm_adds_epi16(x4[30], x4[29]);
+
+ // stage 6
+ __m128i x6[32];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x5[4], x5[7], x6[4], x6[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x5[5], x5[6], x6[5], x6[6]);
+ x6[8] = _mm_adds_epi16(x5[8], x5[9]);
+ x6[9] = _mm_subs_epi16(x5[8], x5[9]);
+ x6[10] = _mm_subs_epi16(x5[11], x5[10]);
+ x6[11] = _mm_adds_epi16(x5[11], x5[10]);
+ x6[12] = _mm_adds_epi16(x5[12], x5[13]);
+ x6[13] = _mm_subs_epi16(x5[12], x5[13]);
+ x6[14] = _mm_subs_epi16(x5[15], x5[14]);
+ x6[15] = _mm_adds_epi16(x5[15], x5[14]);
+ x6[16] = x5[16];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[17], x5[30], x6[17], x6[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[18], x5[29], x6[18], x6[29]);
+ x6[19] = x5[19];
+ x6[20] = x5[20];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[21], x5[26], x6[21], x6[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[22], x5[25], x6[22], x6[25]);
+ x6[23] = x5[23];
+ x6[24] = x5[24];
+ x6[27] = x5[27];
+ x6[28] = x5[28];
+ x6[31] = x5[31];
+
+ // stage 7
+ __m128i x7[32];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ x7[4] = x6[4];
+ x7[5] = x6[5];
+ x7[6] = x6[6];
+ x7[7] = x6[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x6[8], x6[15], x7[8], x7[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x6[9], x6[14], x7[9], x7[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x6[10], x6[13], x7[10], x7[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x6[11], x6[12], x7[11], x7[12]);
+ x7[16] = _mm_adds_epi16(x6[16], x6[17]);
+ x7[17] = _mm_subs_epi16(x6[16], x6[17]);
+ x7[18] = _mm_subs_epi16(x6[19], x6[18]);
+ x7[19] = _mm_adds_epi16(x6[19], x6[18]);
+ x7[20] = _mm_adds_epi16(x6[20], x6[21]);
+ x7[21] = _mm_subs_epi16(x6[20], x6[21]);
+ x7[22] = _mm_subs_epi16(x6[23], x6[22]);
+ x7[23] = _mm_adds_epi16(x6[23], x6[22]);
+ x7[24] = _mm_adds_epi16(x6[24], x6[25]);
+ x7[25] = _mm_subs_epi16(x6[24], x6[25]);
+ x7[26] = _mm_subs_epi16(x6[27], x6[26]);
+ x7[27] = _mm_adds_epi16(x6[27], x6[26]);
+ x7[28] = _mm_adds_epi16(x6[28], x6[29]);
+ x7[29] = _mm_subs_epi16(x6[28], x6[29]);
+ x7[30] = _mm_subs_epi16(x6[31], x6[30]);
+ x7[31] = _mm_adds_epi16(x6[31], x6[30]);
+
+ // stage 8
+ __m128i x8[32];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ x8[8] = x7[8];
+ x8[9] = x7[9];
+ x8[10] = x7[10];
+ x8[11] = x7[11];
+ x8[12] = x7[12];
+ x8[13] = x7[13];
+ x8[14] = x7[14];
+ x8[15] = x7[15];
+ btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x7[16], x7[31], x8[16], x8[31]);
+ btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x7[17], x7[30], x8[17], x8[30]);
+ btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x7[18], x7[29], x8[18], x8[29]);
+ btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x7[19], x7[28], x8[19], x8[28]);
+ btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x7[20], x7[27], x8[20], x8[27]);
+ btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x7[21], x7[26], x8[21], x8[26]);
+ btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x7[22], x7[25], x8[22], x8[25]);
+ btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x7[23], x7[24], x8[23], x8[24]);
+
+ // stage 9
+ output[0] = x8[0];
+ output[1] = x8[16];
+ output[2] = x8[8];
+ output[3] = x8[24];
+ output[4] = x8[4];
+ output[5] = x8[20];
+ output[6] = x8[12];
+ output[7] = x8[28];
+ output[8] = x8[2];
+ output[9] = x8[18];
+ output[10] = x8[10];
+ output[11] = x8[26];
+ output[12] = x8[6];
+ output[13] = x8[22];
+ output[14] = x8[14];
+ output[15] = x8[30];
+ output[16] = x8[1];
+ output[17] = x8[17];
+ output[18] = x8[9];
+ output[19] = x8[25];
+ output[20] = x8[5];
+ output[21] = x8[21];
+ output[22] = x8[13];
+ output[23] = x8[29];
+ output[24] = x8[3];
+ output[25] = x8[19];
+ output[26] = x8[11];
+ output[27] = x8[27];
+ output[28] = x8[7];
+ output[29] = x8[23];
+ output[30] = x8[15];
+ output[31] = x8[31];
+}
+
+void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
+ __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+ __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
+ __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+ __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]);
+ __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]);
+ __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]);
+ __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]);
+ __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]);
+ __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]);
+ __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]);
+ __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]);
+ __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]);
+ __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]);
+ __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]);
+ __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]);
+ __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]);
+ __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]);
+ __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]);
+ __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]);
+ __m128i cospi_p63_p01 = pair_set_epi16(cospi[63], cospi[1]);
+ __m128i cospi_m01_p63 = pair_set_epi16(-cospi[1], cospi[63]);
+ __m128i cospi_p31_p33 = pair_set_epi16(cospi[31], cospi[33]);
+ __m128i cospi_m33_p31 = pair_set_epi16(-cospi[33], cospi[31]);
+ __m128i cospi_p47_p17 = pair_set_epi16(cospi[47], cospi[17]);
+ __m128i cospi_m17_p47 = pair_set_epi16(-cospi[17], cospi[47]);
+ __m128i cospi_p15_p49 = pair_set_epi16(cospi[15], cospi[49]);
+ __m128i cospi_m49_p15 = pair_set_epi16(-cospi[49], cospi[15]);
+ __m128i cospi_p55_p09 = pair_set_epi16(cospi[55], cospi[9]);
+ __m128i cospi_m09_p55 = pair_set_epi16(-cospi[9], cospi[55]);
+ __m128i cospi_p23_p41 = pair_set_epi16(cospi[23], cospi[41]);
+ __m128i cospi_m41_p23 = pair_set_epi16(-cospi[41], cospi[23]);
+ __m128i cospi_p39_p25 = pair_set_epi16(cospi[39], cospi[25]);
+ __m128i cospi_m25_p39 = pair_set_epi16(-cospi[25], cospi[39]);
+ __m128i cospi_p07_p57 = pair_set_epi16(cospi[7], cospi[57]);
+ __m128i cospi_m57_p07 = pair_set_epi16(-cospi[57], cospi[7]);
+ __m128i cospi_p59_p05 = pair_set_epi16(cospi[59], cospi[5]);
+ __m128i cospi_m05_p59 = pair_set_epi16(-cospi[5], cospi[59]);
+ __m128i cospi_p27_p37 = pair_set_epi16(cospi[27], cospi[37]);
+ __m128i cospi_m37_p27 = pair_set_epi16(-cospi[37], cospi[27]);
+ __m128i cospi_p43_p21 = pair_set_epi16(cospi[43], cospi[21]);
+ __m128i cospi_m21_p43 = pair_set_epi16(-cospi[21], cospi[43]);
+ __m128i cospi_p11_p53 = pair_set_epi16(cospi[11], cospi[53]);
+ __m128i cospi_m53_p11 = pair_set_epi16(-cospi[53], cospi[11]);
+ __m128i cospi_p51_p13 = pair_set_epi16(cospi[51], cospi[13]);
+ __m128i cospi_m13_p51 = pair_set_epi16(-cospi[13], cospi[51]);
+ __m128i cospi_p19_p45 = pair_set_epi16(cospi[19], cospi[45]);
+ __m128i cospi_m45_p19 = pair_set_epi16(-cospi[45], cospi[19]);
+ __m128i cospi_p35_p29 = pair_set_epi16(cospi[35], cospi[29]);
+ __m128i cospi_m29_p35 = pair_set_epi16(-cospi[29], cospi[35]);
+ __m128i cospi_p03_p61 = pair_set_epi16(cospi[3], cospi[61]);
+ __m128i cospi_m61_p03 = pair_set_epi16(-cospi[61], cospi[3]);
+
+ // stage 1
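+  // Fold the input about its midpoint: the sums x1[i] = in[i] + in[63-i]
+  // feed the even-indexed outputs, the differences the odd-indexed ones.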
+ __m128i x1[64];
+ x1[0] = _mm_adds_epi16(input[0], input[63]);
+ x1[63] = _mm_subs_epi16(input[0], input[63]);
+ x1[1] = _mm_adds_epi16(input[1], input[62]);
+ x1[62] = _mm_subs_epi16(input[1], input[62]);
+ x1[2] = _mm_adds_epi16(input[2], input[61]);
+ x1[61] = _mm_subs_epi16(input[2], input[61]);
+ x1[3] = _mm_adds_epi16(input[3], input[60]);
+ x1[60] = _mm_subs_epi16(input[3], input[60]);
+ x1[4] = _mm_adds_epi16(input[4], input[59]);
+ x1[59] = _mm_subs_epi16(input[4], input[59]);
+ x1[5] = _mm_adds_epi16(input[5], input[58]);
+ x1[58] = _mm_subs_epi16(input[5], input[58]);
+ x1[6] = _mm_adds_epi16(input[6], input[57]);
+ x1[57] = _mm_subs_epi16(input[6], input[57]);
+ x1[7] = _mm_adds_epi16(input[7], input[56]);
+ x1[56] = _mm_subs_epi16(input[7], input[56]);
+ x1[8] = _mm_adds_epi16(input[8], input[55]);
+ x1[55] = _mm_subs_epi16(input[8], input[55]);
+ x1[9] = _mm_adds_epi16(input[9], input[54]);
+ x1[54] = _mm_subs_epi16(input[9], input[54]);
+ x1[10] = _mm_adds_epi16(input[10], input[53]);
+ x1[53] = _mm_subs_epi16(input[10], input[53]);
+ x1[11] = _mm_adds_epi16(input[11], input[52]);
+ x1[52] = _mm_subs_epi16(input[11], input[52]);
+ x1[12] = _mm_adds_epi16(input[12], input[51]);
+ x1[51] = _mm_subs_epi16(input[12], input[51]);
+ x1[13] = _mm_adds_epi16(input[13], input[50]);
+ x1[50] = _mm_subs_epi16(input[13], input[50]);
+ x1[14] = _mm_adds_epi16(input[14], input[49]);
+ x1[49] = _mm_subs_epi16(input[14], input[49]);
+ x1[15] = _mm_adds_epi16(input[15], input[48]);
+ x1[48] = _mm_subs_epi16(input[15], input[48]);
+ x1[16] = _mm_adds_epi16(input[16], input[47]);
+ x1[47] = _mm_subs_epi16(input[16], input[47]);
+ x1[17] = _mm_adds_epi16(input[17], input[46]);
+ x1[46] = _mm_subs_epi16(input[17], input[46]);
+ x1[18] = _mm_adds_epi16(input[18], input[45]);
+ x1[45] = _mm_subs_epi16(input[18], input[45]);
+ x1[19] = _mm_adds_epi16(input[19], input[44]);
+ x1[44] = _mm_subs_epi16(input[19], input[44]);
+ x1[20] = _mm_adds_epi16(input[20], input[43]);
+ x1[43] = _mm_subs_epi16(input[20], input[43]);
+ x1[21] = _mm_adds_epi16(input[21], input[42]);
+ x1[42] = _mm_subs_epi16(input[21], input[42]);
+ x1[22] = _mm_adds_epi16(input[22], input[41]);
+ x1[41] = _mm_subs_epi16(input[22], input[41]);
+ x1[23] = _mm_adds_epi16(input[23], input[40]);
+ x1[40] = _mm_subs_epi16(input[23], input[40]);
+ x1[24] = _mm_adds_epi16(input[24], input[39]);
+ x1[39] = _mm_subs_epi16(input[24], input[39]);
+ x1[25] = _mm_adds_epi16(input[25], input[38]);
+ x1[38] = _mm_subs_epi16(input[25], input[38]);
+ x1[26] = _mm_adds_epi16(input[26], input[37]);
+ x1[37] = _mm_subs_epi16(input[26], input[37]);
+ x1[27] = _mm_adds_epi16(input[27], input[36]);
+ x1[36] = _mm_subs_epi16(input[27], input[36]);
+ x1[28] = _mm_adds_epi16(input[28], input[35]);
+ x1[35] = _mm_subs_epi16(input[28], input[35]);
+ x1[29] = _mm_adds_epi16(input[29], input[34]);
+ x1[34] = _mm_subs_epi16(input[29], input[34]);
+ x1[30] = _mm_adds_epi16(input[30], input[33]);
+ x1[33] = _mm_subs_epi16(input[30], input[33]);
+ x1[31] = _mm_adds_epi16(input[31], input[32]);
+ x1[32] = _mm_subs_epi16(input[31], input[32]);
+
+ // stage 2
+ __m128i x2[64];
+ x2[0] = _mm_adds_epi16(x1[0], x1[31]);
+ x2[31] = _mm_subs_epi16(x1[0], x1[31]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[30]);
+ x2[30] = _mm_subs_epi16(x1[1], x1[30]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[29]);
+ x2[29] = _mm_subs_epi16(x1[2], x1[29]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[28]);
+ x2[28] = _mm_subs_epi16(x1[3], x1[28]);
+ x2[4] = _mm_adds_epi16(x1[4], x1[27]);
+ x2[27] = _mm_subs_epi16(x1[4], x1[27]);
+ x2[5] = _mm_adds_epi16(x1[5], x1[26]);
+ x2[26] = _mm_subs_epi16(x1[5], x1[26]);
+ x2[6] = _mm_adds_epi16(x1[6], x1[25]);
+ x2[25] = _mm_subs_epi16(x1[6], x1[25]);
+ x2[7] = _mm_adds_epi16(x1[7], x1[24]);
+ x2[24] = _mm_subs_epi16(x1[7], x1[24]);
+ x2[8] = _mm_adds_epi16(x1[8], x1[23]);
+ x2[23] = _mm_subs_epi16(x1[8], x1[23]);
+ x2[9] = _mm_adds_epi16(x1[9], x1[22]);
+ x2[22] = _mm_subs_epi16(x1[9], x1[22]);
+ x2[10] = _mm_adds_epi16(x1[10], x1[21]);
+ x2[21] = _mm_subs_epi16(x1[10], x1[21]);
+ x2[11] = _mm_adds_epi16(x1[11], x1[20]);
+ x2[20] = _mm_subs_epi16(x1[11], x1[20]);
+ x2[12] = _mm_adds_epi16(x1[12], x1[19]);
+ x2[19] = _mm_subs_epi16(x1[12], x1[19]);
+ x2[13] = _mm_adds_epi16(x1[13], x1[18]);
+ x2[18] = _mm_subs_epi16(x1[13], x1[18]);
+ x2[14] = _mm_adds_epi16(x1[14], x1[17]);
+ x2[17] = _mm_subs_epi16(x1[14], x1[17]);
+ x2[15] = _mm_adds_epi16(x1[15], x1[16]);
+ x2[16] = _mm_subs_epi16(x1[15], x1[16]);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
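+  // btf_16_sse2(w0, w1, in0, in1, out0, out1) is the 16-bit butterfly
+  // rotation: with w0 = pair_set_epi16(a, b), it computes
+  // out0 = (in0 * a + in1 * b + __rounding) >> cos_bit per lane via
+  // _mm_madd_epi16, and out1 likewise with w1.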
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[40], x1[55], x2[40], x2[55]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[41], x1[54], x2[41], x2[54]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[42], x1[53], x2[42], x2[53]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[43], x1[52], x2[43], x2[52]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[44], x1[51], x2[44], x2[51]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[45], x1[50], x2[45], x2[50]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[46], x1[49], x2[46], x2[49]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[47], x1[48], x2[47], x2[48]);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+
+ // stage 3
+ __m128i x3[64];
+ x3[0] = _mm_adds_epi16(x2[0], x2[15]);
+ x3[15] = _mm_subs_epi16(x2[0], x2[15]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[14]);
+ x3[14] = _mm_subs_epi16(x2[1], x2[14]);
+ x3[2] = _mm_adds_epi16(x2[2], x2[13]);
+ x3[13] = _mm_subs_epi16(x2[2], x2[13]);
+ x3[3] = _mm_adds_epi16(x2[3], x2[12]);
+ x3[12] = _mm_subs_epi16(x2[3], x2[12]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[4], x2[11]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[5], x2[10]);
+ x3[6] = _mm_adds_epi16(x2[6], x2[9]);
+ x3[9] = _mm_subs_epi16(x2[6], x2[9]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[8]);
+ x3[8] = _mm_subs_epi16(x2[7], x2[8]);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[20], x2[27], x3[20], x3[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[21], x2[26], x3[21], x3[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[22], x2[25], x3[22], x3[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[23], x2[24], x3[23], x3[24]);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ x3[32] = _mm_adds_epi16(x2[32], x2[47]);
+ x3[47] = _mm_subs_epi16(x2[32], x2[47]);
+ x3[33] = _mm_adds_epi16(x2[33], x2[46]);
+ x3[46] = _mm_subs_epi16(x2[33], x2[46]);
+ x3[34] = _mm_adds_epi16(x2[34], x2[45]);
+ x3[45] = _mm_subs_epi16(x2[34], x2[45]);
+ x3[35] = _mm_adds_epi16(x2[35], x2[44]);
+ x3[44] = _mm_subs_epi16(x2[35], x2[44]);
+ x3[36] = _mm_adds_epi16(x2[36], x2[43]);
+ x3[43] = _mm_subs_epi16(x2[36], x2[43]);
+ x3[37] = _mm_adds_epi16(x2[37], x2[42]);
+ x3[42] = _mm_subs_epi16(x2[37], x2[42]);
+ x3[38] = _mm_adds_epi16(x2[38], x2[41]);
+ x3[41] = _mm_subs_epi16(x2[38], x2[41]);
+ x3[39] = _mm_adds_epi16(x2[39], x2[40]);
+ x3[40] = _mm_subs_epi16(x2[39], x2[40]);
+ x3[48] = _mm_subs_epi16(x2[63], x2[48]);
+ x3[63] = _mm_adds_epi16(x2[63], x2[48]);
+ x3[49] = _mm_subs_epi16(x2[62], x2[49]);
+ x3[62] = _mm_adds_epi16(x2[62], x2[49]);
+ x3[50] = _mm_subs_epi16(x2[61], x2[50]);
+ x3[61] = _mm_adds_epi16(x2[61], x2[50]);
+ x3[51] = _mm_subs_epi16(x2[60], x2[51]);
+ x3[60] = _mm_adds_epi16(x2[60], x2[51]);
+ x3[52] = _mm_subs_epi16(x2[59], x2[52]);
+ x3[59] = _mm_adds_epi16(x2[59], x2[52]);
+ x3[53] = _mm_subs_epi16(x2[58], x2[53]);
+ x3[58] = _mm_adds_epi16(x2[58], x2[53]);
+ x3[54] = _mm_subs_epi16(x2[57], x2[54]);
+ x3[57] = _mm_adds_epi16(x2[57], x2[54]);
+ x3[55] = _mm_subs_epi16(x2[56], x2[55]);
+ x3[56] = _mm_adds_epi16(x2[56], x2[55]);
+
+ // stage 4
+ __m128i x4[64];
+ x4[0] = _mm_adds_epi16(x3[0], x3[7]);
+ x4[7] = _mm_subs_epi16(x3[0], x3[7]);
+ x4[1] = _mm_adds_epi16(x3[1], x3[6]);
+ x4[6] = _mm_subs_epi16(x3[1], x3[6]);
+ x4[2] = _mm_adds_epi16(x3[2], x3[5]);
+ x4[5] = _mm_subs_epi16(x3[2], x3[5]);
+ x4[3] = _mm_adds_epi16(x3[3], x3[4]);
+ x4[4] = _mm_subs_epi16(x3[3], x3[4]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[10], x3[13], x4[10], x4[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[11], x3[12], x4[11], x4[12]);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ x4[16] = _mm_adds_epi16(x3[16], x3[23]);
+ x4[23] = _mm_subs_epi16(x3[16], x3[23]);
+ x4[17] = _mm_adds_epi16(x3[17], x3[22]);
+ x4[22] = _mm_subs_epi16(x3[17], x3[22]);
+ x4[18] = _mm_adds_epi16(x3[18], x3[21]);
+ x4[21] = _mm_subs_epi16(x3[18], x3[21]);
+ x4[19] = _mm_adds_epi16(x3[19], x3[20]);
+ x4[20] = _mm_subs_epi16(x3[19], x3[20]);
+ x4[24] = _mm_subs_epi16(x3[31], x3[24]);
+ x4[31] = _mm_adds_epi16(x3[31], x3[24]);
+ x4[25] = _mm_subs_epi16(x3[30], x3[25]);
+ x4[30] = _mm_adds_epi16(x3[30], x3[25]);
+ x4[26] = _mm_subs_epi16(x3[29], x3[26]);
+ x4[29] = _mm_adds_epi16(x3[29], x3[26]);
+ x4[27] = _mm_subs_epi16(x3[28], x3[27]);
+ x4[28] = _mm_adds_epi16(x3[28], x3[27]);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[36], x3[59], x4[36], x4[59]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[37], x3[58], x4[37], x4[58]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[38], x3[57], x4[38], x4[57]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[39], x3[56], x4[39], x4[56]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[40], x3[55], x4[40], x4[55]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[41], x3[54], x4[41], x4[54]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[42], x3[53], x4[42], x4[53]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[43], x3[52], x4[43], x4[52]);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+
+ // stage 5
+ __m128i x5[64];
+ x5[0] = _mm_adds_epi16(x4[0], x4[3]);
+ x5[3] = _mm_subs_epi16(x4[0], x4[3]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[2]);
+ x5[2] = _mm_subs_epi16(x4[1], x4[2]);
+ x5[4] = x4[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x4[5], x4[6], x5[5], x5[6]);
+ x5[7] = x4[7];
+ x5[8] = _mm_adds_epi16(x4[8], x4[11]);
+ x5[11] = _mm_subs_epi16(x4[8], x4[11]);
+ x5[9] = _mm_adds_epi16(x4[9], x4[10]);
+ x5[10] = _mm_subs_epi16(x4[9], x4[10]);
+ x5[12] = _mm_subs_epi16(x4[15], x4[12]);
+ x5[15] = _mm_adds_epi16(x4[15], x4[12]);
+ x5[13] = _mm_subs_epi16(x4[14], x4[13]);
+ x5[14] = _mm_adds_epi16(x4[14], x4[13]);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[18], x4[29], x5[18], x5[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[19], x4[28], x5[19], x5[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[20], x4[27], x5[20], x5[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[21], x4[26], x5[21], x5[26]);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ x5[32] = _mm_adds_epi16(x4[32], x4[39]);
+ x5[39] = _mm_subs_epi16(x4[32], x4[39]);
+ x5[33] = _mm_adds_epi16(x4[33], x4[38]);
+ x5[38] = _mm_subs_epi16(x4[33], x4[38]);
+ x5[34] = _mm_adds_epi16(x4[34], x4[37]);
+ x5[37] = _mm_subs_epi16(x4[34], x4[37]);
+ x5[35] = _mm_adds_epi16(x4[35], x4[36]);
+ x5[36] = _mm_subs_epi16(x4[35], x4[36]);
+ x5[40] = _mm_subs_epi16(x4[47], x4[40]);
+ x5[47] = _mm_adds_epi16(x4[47], x4[40]);
+ x5[41] = _mm_subs_epi16(x4[46], x4[41]);
+ x5[46] = _mm_adds_epi16(x4[46], x4[41]);
+ x5[42] = _mm_subs_epi16(x4[45], x4[42]);
+ x5[45] = _mm_adds_epi16(x4[45], x4[42]);
+ x5[43] = _mm_subs_epi16(x4[44], x4[43]);
+ x5[44] = _mm_adds_epi16(x4[44], x4[43]);
+ x5[48] = _mm_adds_epi16(x4[48], x4[55]);
+ x5[55] = _mm_subs_epi16(x4[48], x4[55]);
+ x5[49] = _mm_adds_epi16(x4[49], x4[54]);
+ x5[54] = _mm_subs_epi16(x4[49], x4[54]);
+ x5[50] = _mm_adds_epi16(x4[50], x4[53]);
+ x5[53] = _mm_subs_epi16(x4[50], x4[53]);
+ x5[51] = _mm_adds_epi16(x4[51], x4[52]);
+ x5[52] = _mm_subs_epi16(x4[51], x4[52]);
+ x5[56] = _mm_subs_epi16(x4[63], x4[56]);
+ x5[63] = _mm_adds_epi16(x4[63], x4[56]);
+ x5[57] = _mm_subs_epi16(x4[62], x4[57]);
+ x5[62] = _mm_adds_epi16(x4[62], x4[57]);
+ x5[58] = _mm_subs_epi16(x4[61], x4[58]);
+ x5[61] = _mm_adds_epi16(x4[61], x4[58]);
+ x5[59] = _mm_subs_epi16(x4[60], x4[59]);
+ x5[60] = _mm_adds_epi16(x4[60], x4[59]);
+
+ // stage 6
+ __m128i x6[64];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x5[0], x5[1], x6[0], x6[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x5[2], x5[3], x6[2], x6[3]);
+ x6[4] = _mm_adds_epi16(x5[4], x5[5]);
+ x6[5] = _mm_subs_epi16(x5[4], x5[5]);
+ x6[6] = _mm_subs_epi16(x5[7], x5[6]);
+ x6[7] = _mm_adds_epi16(x5[7], x5[6]);
+ x6[8] = x5[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x5[9], x5[14], x6[9], x6[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x5[10], x5[13], x6[10], x6[13]);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ x6[16] = _mm_adds_epi16(x5[16], x5[19]);
+ x6[19] = _mm_subs_epi16(x5[16], x5[19]);
+ x6[17] = _mm_adds_epi16(x5[17], x5[18]);
+ x6[18] = _mm_subs_epi16(x5[17], x5[18]);
+ x6[20] = _mm_subs_epi16(x5[23], x5[20]);
+ x6[23] = _mm_adds_epi16(x5[23], x5[20]);
+ x6[21] = _mm_subs_epi16(x5[22], x5[21]);
+ x6[22] = _mm_adds_epi16(x5[22], x5[21]);
+ x6[24] = _mm_adds_epi16(x5[24], x5[27]);
+ x6[27] = _mm_subs_epi16(x5[24], x5[27]);
+ x6[25] = _mm_adds_epi16(x5[25], x5[26]);
+ x6[26] = _mm_subs_epi16(x5[25], x5[26]);
+ x6[28] = _mm_subs_epi16(x5[31], x5[28]);
+ x6[31] = _mm_adds_epi16(x5[31], x5[28]);
+ x6[29] = _mm_subs_epi16(x5[30], x5[29]);
+ x6[30] = _mm_adds_epi16(x5[30], x5[29]);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[34], x5[61], x6[34], x6[61]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[35], x5[60], x6[35], x6[60]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[36], x5[59], x6[36], x6[59]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[37], x5[58], x6[37], x6[58]);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[42], x5[53], x6[42], x6[53]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[43], x5[52], x6[43], x6[52]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[44], x5[51], x6[44], x6[51]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[45], x5[50], x6[45], x6[50]);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+
+ // stage 7
+ __m128i x7[64];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x6[4], x6[7], x7[4], x7[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x6[5], x6[6], x7[5], x7[6]);
+ x7[8] = _mm_adds_epi16(x6[8], x6[9]);
+ x7[9] = _mm_subs_epi16(x6[8], x6[9]);
+ x7[10] = _mm_subs_epi16(x6[11], x6[10]);
+ x7[11] = _mm_adds_epi16(x6[11], x6[10]);
+ x7[12] = _mm_adds_epi16(x6[12], x6[13]);
+ x7[13] = _mm_subs_epi16(x6[12], x6[13]);
+ x7[14] = _mm_subs_epi16(x6[15], x6[14]);
+ x7[15] = _mm_adds_epi16(x6[15], x6[14]);
+ x7[16] = x6[16];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x6[17], x6[30], x7[17], x7[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x6[18], x6[29], x7[18], x7[29]);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x6[21], x6[26], x7[21], x7[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x6[22], x6[25], x7[22], x7[25]);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ x7[32] = _mm_adds_epi16(x6[32], x6[35]);
+ x7[35] = _mm_subs_epi16(x6[32], x6[35]);
+ x7[33] = _mm_adds_epi16(x6[33], x6[34]);
+ x7[34] = _mm_subs_epi16(x6[33], x6[34]);
+ x7[36] = _mm_subs_epi16(x6[39], x6[36]);
+ x7[39] = _mm_adds_epi16(x6[39], x6[36]);
+ x7[37] = _mm_subs_epi16(x6[38], x6[37]);
+ x7[38] = _mm_adds_epi16(x6[38], x6[37]);
+ x7[40] = _mm_adds_epi16(x6[40], x6[43]);
+ x7[43] = _mm_subs_epi16(x6[40], x6[43]);
+ x7[41] = _mm_adds_epi16(x6[41], x6[42]);
+ x7[42] = _mm_subs_epi16(x6[41], x6[42]);
+ x7[44] = _mm_subs_epi16(x6[47], x6[44]);
+ x7[47] = _mm_adds_epi16(x6[47], x6[44]);
+ x7[45] = _mm_subs_epi16(x6[46], x6[45]);
+ x7[46] = _mm_adds_epi16(x6[46], x6[45]);
+ x7[48] = _mm_adds_epi16(x6[48], x6[51]);
+ x7[51] = _mm_subs_epi16(x6[48], x6[51]);
+ x7[49] = _mm_adds_epi16(x6[49], x6[50]);
+ x7[50] = _mm_subs_epi16(x6[49], x6[50]);
+ x7[52] = _mm_subs_epi16(x6[55], x6[52]);
+ x7[55] = _mm_adds_epi16(x6[55], x6[52]);
+ x7[53] = _mm_subs_epi16(x6[54], x6[53]);
+ x7[54] = _mm_adds_epi16(x6[54], x6[53]);
+ x7[56] = _mm_adds_epi16(x6[56], x6[59]);
+ x7[59] = _mm_subs_epi16(x6[56], x6[59]);
+ x7[57] = _mm_adds_epi16(x6[57], x6[58]);
+ x7[58] = _mm_subs_epi16(x6[57], x6[58]);
+ x7[60] = _mm_subs_epi16(x6[63], x6[60]);
+ x7[63] = _mm_adds_epi16(x6[63], x6[60]);
+ x7[61] = _mm_subs_epi16(x6[62], x6[61]);
+ x7[62] = _mm_adds_epi16(x6[62], x6[61]);
+
+ // stage 8
+ __m128i x8[64];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x7[8], x7[15], x8[8], x8[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x7[9], x7[14], x8[9], x8[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x7[10], x7[13], x8[10], x8[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x7[11], x7[12], x8[11], x8[12]);
+ x8[16] = _mm_adds_epi16(x7[16], x7[17]);
+ x8[17] = _mm_subs_epi16(x7[16], x7[17]);
+ x8[18] = _mm_subs_epi16(x7[19], x7[18]);
+ x8[19] = _mm_adds_epi16(x7[19], x7[18]);
+ x8[20] = _mm_adds_epi16(x7[20], x7[21]);
+ x8[21] = _mm_subs_epi16(x7[20], x7[21]);
+ x8[22] = _mm_subs_epi16(x7[23], x7[22]);
+ x8[23] = _mm_adds_epi16(x7[23], x7[22]);
+ x8[24] = _mm_adds_epi16(x7[24], x7[25]);
+ x8[25] = _mm_subs_epi16(x7[24], x7[25]);
+ x8[26] = _mm_subs_epi16(x7[27], x7[26]);
+ x8[27] = _mm_adds_epi16(x7[27], x7[26]);
+ x8[28] = _mm_adds_epi16(x7[28], x7[29]);
+ x8[29] = _mm_subs_epi16(x7[28], x7[29]);
+ x8[30] = _mm_subs_epi16(x7[31], x7[30]);
+ x8[31] = _mm_adds_epi16(x7[31], x7[30]);
+ x8[32] = x7[32];
+ btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x7[33], x7[62], x8[33], x8[62]);
+ btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x7[34], x7[61], x8[34], x8[61]);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x7[37], x7[58], x8[37], x8[58]);
+ btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x7[38], x7[57], x8[38], x8[57]);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x7[41], x7[54], x8[41], x8[54]);
+ btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x7[42], x7[53], x8[42], x8[53]);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x7[45], x7[50], x8[45], x8[50]);
+ btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x7[46], x7[49], x8[46], x8[49]);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+
+ // stage 9
+ __m128i x9[64];
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x8[16], x8[31], x9[16], x9[31]);
+ btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x8[17], x8[30], x9[17], x9[30]);
+ btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x8[18], x8[29], x9[18], x9[29]);
+ btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x8[19], x8[28], x9[19], x9[28]);
+ btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x8[20], x8[27], x9[20], x9[27]);
+ btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x8[21], x8[26], x9[21], x9[26]);
+ btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x8[22], x8[25], x9[22], x9[25]);
+ btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x8[23], x8[24], x9[23], x9[24]);
+ x9[32] = _mm_adds_epi16(x8[32], x8[33]);
+ x9[33] = _mm_subs_epi16(x8[32], x8[33]);
+ x9[34] = _mm_subs_epi16(x8[35], x8[34]);
+ x9[35] = _mm_adds_epi16(x8[35], x8[34]);
+ x9[36] = _mm_adds_epi16(x8[36], x8[37]);
+ x9[37] = _mm_subs_epi16(x8[36], x8[37]);
+ x9[38] = _mm_subs_epi16(x8[39], x8[38]);
+ x9[39] = _mm_adds_epi16(x8[39], x8[38]);
+ x9[40] = _mm_adds_epi16(x8[40], x8[41]);
+ x9[41] = _mm_subs_epi16(x8[40], x8[41]);
+ x9[42] = _mm_subs_epi16(x8[43], x8[42]);
+ x9[43] = _mm_adds_epi16(x8[43], x8[42]);
+ x9[44] = _mm_adds_epi16(x8[44], x8[45]);
+ x9[45] = _mm_subs_epi16(x8[44], x8[45]);
+ x9[46] = _mm_subs_epi16(x8[47], x8[46]);
+ x9[47] = _mm_adds_epi16(x8[47], x8[46]);
+ x9[48] = _mm_adds_epi16(x8[48], x8[49]);
+ x9[49] = _mm_subs_epi16(x8[48], x8[49]);
+ x9[50] = _mm_subs_epi16(x8[51], x8[50]);
+ x9[51] = _mm_adds_epi16(x8[51], x8[50]);
+ x9[52] = _mm_adds_epi16(x8[52], x8[53]);
+ x9[53] = _mm_subs_epi16(x8[52], x8[53]);
+ x9[54] = _mm_subs_epi16(x8[55], x8[54]);
+ x9[55] = _mm_adds_epi16(x8[55], x8[54]);
+ x9[56] = _mm_adds_epi16(x8[56], x8[57]);
+ x9[57] = _mm_subs_epi16(x8[56], x8[57]);
+ x9[58] = _mm_subs_epi16(x8[59], x8[58]);
+ x9[59] = _mm_adds_epi16(x8[59], x8[58]);
+ x9[60] = _mm_adds_epi16(x8[60], x8[61]);
+ x9[61] = _mm_subs_epi16(x8[60], x8[61]);
+ x9[62] = _mm_subs_epi16(x8[63], x8[62]);
+ x9[63] = _mm_adds_epi16(x8[63], x8[62]);
+
+ // stage 10
+ __m128i x10[64];
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ btf_16_sse2(cospi_p63_p01, cospi_m01_p63, x9[32], x9[63], x10[32], x10[63]);
+ btf_16_sse2(cospi_p31_p33, cospi_m33_p31, x9[33], x9[62], x10[33], x10[62]);
+ btf_16_sse2(cospi_p47_p17, cospi_m17_p47, x9[34], x9[61], x10[34], x10[61]);
+ btf_16_sse2(cospi_p15_p49, cospi_m49_p15, x9[35], x9[60], x10[35], x10[60]);
+ btf_16_sse2(cospi_p55_p09, cospi_m09_p55, x9[36], x9[59], x10[36], x10[59]);
+ btf_16_sse2(cospi_p23_p41, cospi_m41_p23, x9[37], x9[58], x10[37], x10[58]);
+ btf_16_sse2(cospi_p39_p25, cospi_m25_p39, x9[38], x9[57], x10[38], x10[57]);
+ btf_16_sse2(cospi_p07_p57, cospi_m57_p07, x9[39], x9[56], x10[39], x10[56]);
+ btf_16_sse2(cospi_p59_p05, cospi_m05_p59, x9[40], x9[55], x10[40], x10[55]);
+ btf_16_sse2(cospi_p27_p37, cospi_m37_p27, x9[41], x9[54], x10[41], x10[54]);
+ btf_16_sse2(cospi_p43_p21, cospi_m21_p43, x9[42], x9[53], x10[42], x10[53]);
+ btf_16_sse2(cospi_p11_p53, cospi_m53_p11, x9[43], x9[52], x10[43], x10[52]);
+ btf_16_sse2(cospi_p51_p13, cospi_m13_p51, x9[44], x9[51], x10[44], x10[51]);
+ btf_16_sse2(cospi_p19_p45, cospi_m45_p19, x9[45], x9[50], x10[45], x10[50]);
+ btf_16_sse2(cospi_p35_p29, cospi_m29_p35, x9[46], x9[49], x10[46], x10[49]);
+ btf_16_sse2(cospi_p03_p61, cospi_m61_p03, x9[47], x9[48], x10[47], x10[48]);
+
+ // stage 11
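+  // The 64 results are written in bit-reversed index order:
+  // output[k] = x10[bitrev6(k)]; e.g. output[4] = x10[8] because the
+  // 6-bit reversal of 4 (000100) is 001000 = 8.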
+ output[0] = x10[0];
+ output[1] = x10[32];
+ output[2] = x10[16];
+ output[3] = x10[48];
+ output[4] = x10[8];
+ output[5] = x10[40];
+ output[6] = x10[24];
+ output[7] = x10[56];
+ output[8] = x10[4];
+ output[9] = x10[36];
+ output[10] = x10[20];
+ output[11] = x10[52];
+ output[12] = x10[12];
+ output[13] = x10[44];
+ output[14] = x10[28];
+ output[15] = x10[60];
+ output[16] = x10[2];
+ output[17] = x10[34];
+ output[18] = x10[18];
+ output[19] = x10[50];
+ output[20] = x10[10];
+ output[21] = x10[42];
+ output[22] = x10[26];
+ output[23] = x10[58];
+ output[24] = x10[6];
+ output[25] = x10[38];
+ output[26] = x10[22];
+ output[27] = x10[54];
+ output[28] = x10[14];
+ output[29] = x10[46];
+ output[30] = x10[30];
+ output[31] = x10[62];
+ output[32] = x10[1];
+ output[33] = x10[33];
+ output[34] = x10[17];
+ output[35] = x10[49];
+ output[36] = x10[9];
+ output[37] = x10[41];
+ output[38] = x10[25];
+ output[39] = x10[57];
+ output[40] = x10[5];
+ output[41] = x10[37];
+ output[42] = x10[21];
+ output[43] = x10[53];
+ output[44] = x10[13];
+ output[45] = x10[45];
+ output[46] = x10[29];
+ output[47] = x10[61];
+ output[48] = x10[3];
+ output[49] = x10[35];
+ output[50] = x10[19];
+ output[51] = x10[51];
+ output[52] = x10[11];
+ output[53] = x10[43];
+ output[54] = x10[27];
+ output[55] = x10[59];
+ output[56] = x10[7];
+ output[57] = x10[39];
+ output[58] = x10[23];
+ output[59] = x10[55];
+ output[60] = x10[15];
+ output[61] = x10[47];
+ output[62] = x10[31];
+ output[63] = x10[63];
+}
+
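+// 4-point forward ADST on four columns; only the low four 16-bit lanes of
+// each register carry data. The sinpi[] table from sinpi_arr() holds the
+// fixed-point sin(i * pi / 9) terms of the AV1 ADST4 basis (with the basis
+// normalization folded into the constants).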
+static void fadst4x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *sinpi = sinpi_arr(cos_bit);
+ const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]);
+ const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]);
+ const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
+ const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
+ const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ const __m128i in7 = _mm_add_epi16(input[0], input[1]);
+ __m128i u[8], v[8];
+
+ u[0] = _mm_unpacklo_epi16(input[0], input[1]);
+ u[1] = _mm_unpacklo_epi16(input[2], input[3]);
+ u[2] = _mm_unpacklo_epi16(in7, __zero);
+ u[3] = _mm_unpacklo_epi16(input[2], __zero);
+ u[4] = _mm_unpacklo_epi16(input[3], __zero);
+
+ v[0] = _mm_madd_epi16(u[0], sinpi_p01_p02); // s0 + s2
+ v[1] = _mm_madd_epi16(u[1], sinpi_p03_p04); // s4 + s5
+ v[2] = _mm_madd_epi16(u[2], sinpi_p03_p03); // x1
+ v[3] = _mm_madd_epi16(u[0], sinpi_p04_m01); // s1 - s3
+ v[4] = _mm_madd_epi16(u[1], sinpi_m03_p02); // -s4 + s6
+ v[5] = _mm_madd_epi16(u[3], sinpi_p03_p03); // s4
+ v[6] = _mm_madd_epi16(u[4], sinpi_p03_p03);
+
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = _mm_sub_epi32(v[2], v[6]);
+ u[2] = _mm_add_epi32(v[3], v[4]);
+ u[3] = _mm_sub_epi32(u[2], u[0]);
+ u[4] = _mm_slli_epi32(v[5], 2);
+ u[5] = _mm_sub_epi32(u[4], v[5]);
+ u[6] = _mm_add_epi32(u[3], u[5]);
+
+ v[0] = _mm_add_epi32(u[0], __rounding);
+ v[1] = _mm_add_epi32(u[1], __rounding);
+ v[2] = _mm_add_epi32(u[2], __rounding);
+ v[3] = _mm_add_epi32(u[6], __rounding);
+
+ u[0] = _mm_srai_epi32(v[0], cos_bit);
+ u[1] = _mm_srai_epi32(v[1], cos_bit);
+ u[2] = _mm_srai_epi32(v[2], cos_bit);
+ u[3] = _mm_srai_epi32(v[3], cos_bit);
+
+ output[0] = _mm_packs_epi32(u[0], u[2]);
+ output[1] = _mm_packs_epi32(u[1], u[3]);
+ output[2] = _mm_srli_si128(output[0], 8);
+ output[3] = _mm_srli_si128(output[1], 8);
+}
+
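+// 8-point forward ADST on 4-wide columns: sign flips in stage 1, cospi
+// rotations through btf_16_w4_sse2 (the 4-lane variant of btf_16_sse2),
+// and the ADST output permutation in stage 7.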
+static void fadst4x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[7]);
+ x1[2] = _mm_subs_epi16(__zero, input[3]);
+ x1[3] = input[4];
+ x1[4] = _mm_subs_epi16(__zero, input[1]);
+ x1[5] = input[6];
+ x1[6] = input[2];
+ x1[7] = _mm_subs_epi16(__zero, input[5]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[2],
+ &x1[3], &x2[2], &x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[6],
+ &x1[7], &x2[6], &x2[7]);
+
+ // stage 3
+ __m128i x3[8];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+
+ // stage 4
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w4_sse2(&cospi_p16_p48, &cospi_p48_m16, __rounding, cos_bit, &x3[4],
+ &x3[5], &x4[4], &x4[5]);
+ btf_16_w4_sse2(&cospi_m48_p16, &cospi_p16_p48, __rounding, cos_bit, &x3[6],
+ &x3[7], &x4[6], &x4[7]);
+
+ // stage 5
+ __m128i x5[8];
+ x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+
+ // stage 6
+ __m128i x6[8];
+ btf_16_w4_sse2(&cospi_p04_p60, &cospi_p60_m04, __rounding, cos_bit, &x5[0],
+ &x5[1], &x6[0], &x6[1]);
+ btf_16_w4_sse2(&cospi_p20_p44, &cospi_p44_m20, __rounding, cos_bit, &x5[2],
+ &x5[3], &x6[2], &x6[3]);
+ btf_16_w4_sse2(&cospi_p36_p28, &cospi_p28_m36, __rounding, cos_bit, &x5[4],
+ &x5[5], &x6[4], &x6[5]);
+ btf_16_w4_sse2(&cospi_p52_p12, &cospi_p12_m52, __rounding, cos_bit, &x5[6],
+ &x5[7], &x6[6], &x6[7]);
+
+ // stage 7
+ output[0] = x6[1];
+ output[1] = x6[6];
+ output[2] = x6[3];
+ output[3] = x6[4];
+ output[4] = x6[5];
+ output[5] = x6[2];
+ output[6] = x6[7];
+ output[7] = x6[0];
+}
+
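+// Same ADST4 arithmetic as fadst4x4_new_sse2, but across all eight
+// columns: each product is formed twice, once for the low and once for
+// the high half of every register (the _lo/_hi pairs).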
+static void fadst8x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *sinpi = sinpi_arr(cos_bit);
+ const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]);
+ const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]);
+ const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
+ const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
+ const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ const __m128i in7 = _mm_add_epi16(input[0], input[1]);
+ __m128i u_lo[8], u_hi[8], v_lo[8], v_hi[8];
+
+ u_lo[0] = _mm_unpacklo_epi16(input[0], input[1]);
+ u_hi[0] = _mm_unpackhi_epi16(input[0], input[1]);
+ u_lo[1] = _mm_unpacklo_epi16(input[2], input[3]);
+ u_hi[1] = _mm_unpackhi_epi16(input[2], input[3]);
+ u_lo[2] = _mm_unpacklo_epi16(in7, __zero);
+ u_hi[2] = _mm_unpackhi_epi16(in7, __zero);
+ u_lo[3] = _mm_unpacklo_epi16(input[2], __zero);
+ u_hi[3] = _mm_unpackhi_epi16(input[2], __zero);
+ u_lo[4] = _mm_unpacklo_epi16(input[3], __zero);
+ u_hi[4] = _mm_unpackhi_epi16(input[3], __zero);
+
+ v_lo[0] = _mm_madd_epi16(u_lo[0], sinpi_p01_p02); // s0 + s2
+ v_hi[0] = _mm_madd_epi16(u_hi[0], sinpi_p01_p02); // s0 + s2
+ v_lo[1] = _mm_madd_epi16(u_lo[1], sinpi_p03_p04); // s4 + s5
+ v_hi[1] = _mm_madd_epi16(u_hi[1], sinpi_p03_p04); // s4 + s5
+ v_lo[2] = _mm_madd_epi16(u_lo[2], sinpi_p03_p03); // x1
+ v_hi[2] = _mm_madd_epi16(u_hi[2], sinpi_p03_p03); // x1
+ v_lo[3] = _mm_madd_epi16(u_lo[0], sinpi_p04_m01); // s1 - s3
+ v_hi[3] = _mm_madd_epi16(u_hi[0], sinpi_p04_m01); // s1 - s3
+ v_lo[4] = _mm_madd_epi16(u_lo[1], sinpi_m03_p02); // -s4 + s6
+ v_hi[4] = _mm_madd_epi16(u_hi[1], sinpi_m03_p02); // -s4 + s6
+ v_lo[5] = _mm_madd_epi16(u_lo[3], sinpi_p03_p03); // s4
+ v_hi[5] = _mm_madd_epi16(u_hi[3], sinpi_p03_p03); // s4
+ v_lo[6] = _mm_madd_epi16(u_lo[4], sinpi_p03_p03);
+ v_hi[6] = _mm_madd_epi16(u_hi[4], sinpi_p03_p03);
+
+ u_lo[0] = _mm_add_epi32(v_lo[0], v_lo[1]);
+ u_hi[0] = _mm_add_epi32(v_hi[0], v_hi[1]);
+ u_lo[1] = _mm_sub_epi32(v_lo[2], v_lo[6]);
+ u_hi[1] = _mm_sub_epi32(v_hi[2], v_hi[6]);
+ u_lo[2] = _mm_add_epi32(v_lo[3], v_lo[4]);
+ u_hi[2] = _mm_add_epi32(v_hi[3], v_hi[4]);
+ u_lo[3] = _mm_sub_epi32(u_lo[2], u_lo[0]);
+ u_hi[3] = _mm_sub_epi32(u_hi[2], u_hi[0]);
+ u_lo[4] = _mm_slli_epi32(v_lo[5], 2);
+ u_hi[4] = _mm_slli_epi32(v_hi[5], 2);
+ u_lo[5] = _mm_sub_epi32(u_lo[4], v_lo[5]);
+ u_hi[5] = _mm_sub_epi32(u_hi[4], v_hi[5]);
+ u_lo[6] = _mm_add_epi32(u_lo[3], u_lo[5]);
+ u_hi[6] = _mm_add_epi32(u_hi[3], u_hi[5]);
+
+ v_lo[0] = _mm_add_epi32(u_lo[0], __rounding);
+ v_hi[0] = _mm_add_epi32(u_hi[0], __rounding);
+ v_lo[1] = _mm_add_epi32(u_lo[1], __rounding);
+ v_hi[1] = _mm_add_epi32(u_hi[1], __rounding);
+ v_lo[2] = _mm_add_epi32(u_lo[2], __rounding);
+ v_hi[2] = _mm_add_epi32(u_hi[2], __rounding);
+ v_lo[3] = _mm_add_epi32(u_lo[6], __rounding);
+ v_hi[3] = _mm_add_epi32(u_hi[6], __rounding);
+
+ u_lo[0] = _mm_srai_epi32(v_lo[0], cos_bit);
+ u_hi[0] = _mm_srai_epi32(v_hi[0], cos_bit);
+ u_lo[1] = _mm_srai_epi32(v_lo[1], cos_bit);
+ u_hi[1] = _mm_srai_epi32(v_hi[1], cos_bit);
+ u_lo[2] = _mm_srai_epi32(v_lo[2], cos_bit);
+ u_hi[2] = _mm_srai_epi32(v_hi[2], cos_bit);
+ u_lo[3] = _mm_srai_epi32(v_lo[3], cos_bit);
+ u_hi[3] = _mm_srai_epi32(v_hi[3], cos_bit);
+
+ output[0] = _mm_packs_epi32(u_lo[0], u_hi[0]);
+ output[1] = _mm_packs_epi32(u_lo[1], u_hi[1]);
+ output[2] = _mm_packs_epi32(u_lo[2], u_hi[2]);
+ output[3] = _mm_packs_epi32(u_lo[3], u_hi[3]);
+}
+
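+// 16-point forward ADST on 8 columns: sign flips in stage 1, then
+// alternating rotation and butterfly stages, ending with the ADST output
+// permutation in stage 9.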
+static void fadst8x16_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+ // stage 1
+ __m128i x1[16];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[15]);
+ x1[2] = _mm_subs_epi16(__zero, input[7]);
+ x1[3] = input[8];
+ x1[4] = _mm_subs_epi16(__zero, input[3]);
+ x1[5] = input[12];
+ x1[6] = input[4];
+ x1[7] = _mm_subs_epi16(__zero, input[11]);
+ x1[8] = _mm_subs_epi16(__zero, input[1]);
+ x1[9] = input[14];
+ x1[10] = input[6];
+ x1[11] = _mm_subs_epi16(__zero, input[9]);
+ x1[12] = input[2];
+ x1[13] = _mm_subs_epi16(__zero, input[13]);
+ x1[14] = _mm_subs_epi16(__zero, input[5]);
+ x1[15] = input[10];
+
+ // stage 2
+ __m128i x2[16];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]);
+ x2[8] = x1[8];
+ x2[9] = x1[9];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x2[10], x2[11]);
+ x2[12] = x1[12];
+ x2[13] = x1[13];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x2[14], x2[15]);
+
+ // stage 3
+ __m128i x3[16];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+ x3[8] = _mm_adds_epi16(x2[8], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[8], x2[10]);
+ x3[9] = _mm_adds_epi16(x2[9], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[9], x2[11]);
+ x3[12] = _mm_adds_epi16(x2[12], x2[14]);
+ x3[14] = _mm_subs_epi16(x2[12], x2[14]);
+ x3[13] = _mm_adds_epi16(x2[13], x2[15]);
+ x3[15] = _mm_subs_epi16(x2[13], x2[15]);
+
+ // stage 4
+ __m128i x4[16];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ x4[10] = x3[10];
+ x4[11] = x3[11];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[12], x3[13], x4[12], x4[13]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[14], x3[15], x4[14], x4[15]);
+
+ // stage 5
+ __m128i x5[16];
+ x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+ x5[8] = _mm_adds_epi16(x4[8], x4[12]);
+ x5[12] = _mm_subs_epi16(x4[8], x4[12]);
+ x5[9] = _mm_adds_epi16(x4[9], x4[13]);
+ x5[13] = _mm_subs_epi16(x4[9], x4[13]);
+ x5[10] = _mm_adds_epi16(x4[10], x4[14]);
+ x5[14] = _mm_subs_epi16(x4[10], x4[14]);
+ x5[11] = _mm_adds_epi16(x4[11], x4[15]);
+ x5[15] = _mm_subs_epi16(x4[11], x4[15]);
+
+ // stage 6
+ __m128i x6[16];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ x6[4] = x5[4];
+ x6[5] = x5[5];
+ x6[6] = x5[6];
+ x6[7] = x5[7];
+ btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x5[8], x5[9], x6[8], x6[9]);
+ btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x5[10], x5[11], x6[10], x6[11]);
+ btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x5[12], x5[13], x6[12], x6[13]);
+ btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x5[14], x5[15], x6[14], x6[15]);
+
+ // stage 7
+ __m128i x7[16];
+ x7[0] = _mm_adds_epi16(x6[0], x6[8]);
+ x7[8] = _mm_subs_epi16(x6[0], x6[8]);
+ x7[1] = _mm_adds_epi16(x6[1], x6[9]);
+ x7[9] = _mm_subs_epi16(x6[1], x6[9]);
+ x7[2] = _mm_adds_epi16(x6[2], x6[10]);
+ x7[10] = _mm_subs_epi16(x6[2], x6[10]);
+ x7[3] = _mm_adds_epi16(x6[3], x6[11]);
+ x7[11] = _mm_subs_epi16(x6[3], x6[11]);
+ x7[4] = _mm_adds_epi16(x6[4], x6[12]);
+ x7[12] = _mm_subs_epi16(x6[4], x6[12]);
+ x7[5] = _mm_adds_epi16(x6[5], x6[13]);
+ x7[13] = _mm_subs_epi16(x6[5], x6[13]);
+ x7[6] = _mm_adds_epi16(x6[6], x6[14]);
+ x7[14] = _mm_subs_epi16(x6[6], x6[14]);
+ x7[7] = _mm_adds_epi16(x6[7], x6[15]);
+ x7[15] = _mm_subs_epi16(x6[7], x6[15]);
+
+ // stage 8
+ __m128i x8[16];
+ btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x7[0], x7[1], x8[0], x8[1]);
+ btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x7[2], x7[3], x8[2], x8[3]);
+ btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x7[4], x7[5], x8[4], x8[5]);
+ btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x7[6], x7[7], x8[6], x8[7]);
+ btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x7[8], x7[9], x8[8], x8[9]);
+ btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x7[10], x7[11], x8[10], x8[11]);
+ btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x7[12], x7[13], x8[12], x8[13]);
+ btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x7[14], x7[15], x8[14], x8[15]);
+
+ // stage 9
+ output[0] = x8[1];
+ output[1] = x8[14];
+ output[2] = x8[3];
+ output[3] = x8[12];
+ output[4] = x8[5];
+ output[5] = x8[10];
+ output[6] = x8[7];
+ output[7] = x8[8];
+ output[8] = x8[9];
+ output[9] = x8[6];
+ output[10] = x8[11];
+ output[11] = x8[4];
+ output[12] = x8[13];
+ output[13] = x8[2];
+ output[14] = x8[15];
+ output[15] = x8[0];
+}
+
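+// Per-TX_TYPE dispatch tables for the 1-D column and row kernels. FLIPADST
+// reuses the plain ADST kernel because the vertical or horizontal flip is
+// applied to the buffers before the kernel runs (see get_flip_cfg in the
+// 2-D drivers below), and the IDTX/V_*/H_* types substitute an identity
+// kernel on the axis that is not transformed.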
+static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_new_sse2, // DCT_DCT
+ fadst4x4_new_sse2, // ADST_DCT
+ fdct4x4_new_sse2, // DCT_ADST
+ fadst4x4_new_sse2, // ADST_ADST
+ fadst4x4_new_sse2, // FLIPADST_DCT
+ fdct4x4_new_sse2, // DCT_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_FLIPADST
+ fadst4x4_new_sse2, // ADST_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_ADST
+ fidentity4x4_new_sse2, // IDTX
+ fdct4x4_new_sse2, // V_DCT
+ fidentity4x4_new_sse2, // H_DCT
+ fadst4x4_new_sse2, // V_ADST
+ fidentity4x4_new_sse2, // H_ADST
+ fadst4x4_new_sse2, // V_FLIPADST
+ fidentity4x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_new_sse2, // DCT_DCT
+ fdct4x4_new_sse2, // ADST_DCT
+ fadst4x4_new_sse2, // DCT_ADST
+ fadst4x4_new_sse2, // ADST_ADST
+ fdct4x4_new_sse2, // FLIPADST_DCT
+ fadst4x4_new_sse2, // DCT_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_FLIPADST
+ fadst4x4_new_sse2, // ADST_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_ADST
+ fidentity4x4_new_sse2, // IDTX
+ fidentity4x4_new_sse2, // V_DCT
+ fdct4x4_new_sse2, // H_DCT
+ fidentity4x4_new_sse2, // V_ADST
+ fadst4x4_new_sse2, // H_ADST
+ fidentity4x4_new_sse2, // V_FLIPADST
+ fadst4x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_new_sse2, // DCT_DCT
+ fadst4x8_new_sse2, // ADST_DCT
+ fdct4x8_new_sse2, // DCT_ADST
+ fadst4x8_new_sse2, // ADST_ADST
+ fadst4x8_new_sse2, // FLIPADST_DCT
+ fdct4x8_new_sse2, // DCT_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_FLIPADST
+ fadst4x8_new_sse2, // ADST_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fdct4x8_new_sse2, // V_DCT
+ fidentity8x8_new_sse2, // H_DCT
+ fadst4x8_new_sse2, // V_ADST
+ fidentity8x8_new_sse2, // H_ADST
+ fadst4x8_new_sse2, // V_FLIPADST
+ fidentity8x8_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_new_sse2, // DCT_DCT
+ fdct8x4_new_sse2, // ADST_DCT
+ fadst8x4_new_sse2, // DCT_ADST
+ fadst8x4_new_sse2, // ADST_ADST
+ fdct8x4_new_sse2, // FLIPADST_DCT
+ fadst8x4_new_sse2, // DCT_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_FLIPADST
+ fadst8x4_new_sse2, // ADST_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_ADST
+ fidentity8x4_new_sse2, // IDTX
+ fidentity8x4_new_sse2, // V_DCT
+ fdct8x4_new_sse2, // H_DCT
+ fidentity8x4_new_sse2, // V_ADST
+ fadst8x4_new_sse2, // H_ADST
+ fidentity8x4_new_sse2, // V_FLIPADST
+ fadst8x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_new_sse2, // DCT_DCT
+ fadst8x4_new_sse2, // ADST_DCT
+ fdct8x4_new_sse2, // DCT_ADST
+ fadst8x4_new_sse2, // ADST_ADST
+ fadst8x4_new_sse2, // FLIPADST_DCT
+ fdct8x4_new_sse2, // DCT_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_FLIPADST
+ fadst8x4_new_sse2, // ADST_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_ADST
+ fidentity8x4_new_sse2, // IDTX
+ fdct8x4_new_sse2, // V_DCT
+ fidentity8x4_new_sse2, // H_DCT
+ fadst8x4_new_sse2, // V_ADST
+ fidentity8x4_new_sse2, // H_ADST
+ fadst8x4_new_sse2, // V_FLIPADST
+ fidentity8x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_new_sse2, // DCT_DCT
+ fdct4x8_new_sse2, // ADST_DCT
+ fadst4x8_new_sse2, // DCT_ADST
+ fadst4x8_new_sse2, // ADST_ADST
+ fdct4x8_new_sse2, // FLIPADST_DCT
+ fadst4x8_new_sse2, // DCT_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_FLIPADST
+ fadst4x8_new_sse2, // ADST_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fidentity8x8_new_sse2, // V_DCT
+ fdct4x8_new_sse2, // H_DCT
+ fidentity8x8_new_sse2, // V_ADST
+ fadst4x8_new_sse2, // H_ADST
+ fidentity8x8_new_sse2, // V_FLIPADST
+ fadst4x8_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fadst8x8_new_sse2, // ADST_DCT
+ fdct8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fadst8x8_new_sse2, // FLIPADST_DCT
+ fdct8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fdct8x8_new_sse2, // V_DCT
+ fidentity8x8_new_sse2, // H_DCT
+ fadst8x8_new_sse2, // V_ADST
+ fidentity8x8_new_sse2, // H_ADST
+ fadst8x8_new_sse2, // V_FLIPADST
+ fidentity8x8_new_sse2, // H_FLIPADST
+  fidentity8x8_new_sse2  // H_FLIPADST
+
+static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fdct8x8_new_sse2, // ADST_DCT
+ fadst8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fdct8x8_new_sse2, // FLIPADST_DCT
+ fadst8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fidentity8x8_new_sse2, // V_DCT
+ fdct8x8_new_sse2, // H_DCT
+ fidentity8x8_new_sse2, // V_ADST
+ fadst8x8_new_sse2, // H_ADST
+ fidentity8x8_new_sse2, // V_FLIPADST
+ fadst8x8_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_new_sse2, // DCT_DCT
+ fadst8x16_new_sse2, // ADST_DCT
+ fdct8x16_new_sse2, // DCT_ADST
+ fadst8x16_new_sse2, // ADST_ADST
+ fadst8x16_new_sse2, // FLIPADST_DCT
+ fdct8x16_new_sse2, // DCT_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_FLIPADST
+ fadst8x16_new_sse2, // ADST_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_ADST
+ fidentity8x16_new_sse2, // IDTX
+ fdct8x16_new_sse2, // V_DCT
+ fidentity8x16_new_sse2, // H_DCT
+ fadst8x16_new_sse2, // V_ADST
+ fidentity8x16_new_sse2, // H_ADST
+ fadst8x16_new_sse2, // V_FLIPADST
+ fidentity8x16_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_new_sse2, // DCT_DCT
+ fdct8x16_new_sse2, // ADST_DCT
+ fadst8x16_new_sse2, // DCT_ADST
+ fadst8x16_new_sse2, // ADST_ADST
+ fdct8x16_new_sse2, // FLIPADST_DCT
+ fadst8x16_new_sse2, // DCT_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_FLIPADST
+ fadst8x16_new_sse2, // ADST_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_ADST
+ fidentity8x16_new_sse2, // IDTX
+ fidentity8x16_new_sse2, // V_DCT
+ fdct8x16_new_sse2, // H_DCT
+ fidentity8x16_new_sse2, // V_ADST
+ fadst8x16_new_sse2, // H_ADST
+ fidentity8x16_new_sse2, // V_FLIPADST
+ fadst8x16_new_sse2 // H_FLIPADST
+};
+
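+// AV1 defines no 32-point ADST, so only DCT and identity are reachable at
+// this length; the remaining TX_TYPE slots stay NULL and are never selected.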
+static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = {
+ av1_fdct8x32_new_sse2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_new_sse2, // IDTX
+ fidentity8x32_new_sse2, // V_DCT
+ av1_fdct8x32_new_sse2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
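+// The 2-D drivers below share one pipeline: load the 16-bit residual
+// (vertically flipped when the column type is FLIPADST), round-shift by
+// shift[0], run the column kernel, shift by shift[1], transpose, optionally
+// flip left/right, run the row kernel, shift by shift[2], and widen into
+// the int32_t coefficient output. A hypothetical call (buffer names
+// illustrative only), for an 8-bit block of residuals at src_diff with row
+// stride diff_stride:
+//   av1_lowbd_fwd_txfm2d_4x4_sse2(src_diff, coeff, diff_stride, DCT_DCT, 8);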
+void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[4], buf1[4], *buf;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
+ const int txw_idx = get_txw_idx(TX_4X4);
+ const int txh_idx = get_txh_idx(TX_4X4);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 4;
+ const int height = 4;
+ const transform_1d_sse2 col_txfm = col_txfm4x4_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm4x4_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_4x4(buf0, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w4(buf, output, height, width);
+}
+
+void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[8], buf1[8], *buf;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
+ const int txw_idx = get_txw_idx(TX_4X8);
+ const int txh_idx = get_txh_idx(TX_4X8);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 4;
+ const int height = 8;
+ const transform_1d_sse2 col_txfm = col_txfm4x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_4x8(buf0, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width);
+}
+
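+// 16-high blocks transpose in two 8-column batches and run the 4-point row
+// kernel once per batch. 4x16 is a 4:1 shape, so the plain store_buffer_*
+// helper is used rather than the rectangular-scaled store_rect_* one.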
+void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
+ const int txw_idx = get_txw_idx(TX_4X16);
+ const int txh_idx = get_txh_idx(TX_4X16);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 4;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_4x8(buf0, buf1);
+ transpose_16bit_4x8(buf0 + 8, buf1 + 8);
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + 8 * i, buf, width);
+ } else {
+ buf = buf1 + 8 * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[8], buf1[8], *buf;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
+ const int txw_idx = get_txw_idx(TX_8X4);
+ const int txh_idx = get_txh_idx(TX_8X4);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 4;
+ const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm4x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip)
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ else
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w4(buf, output, height, width);
+}
+
+void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[8], buf1[8], *buf;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
+ const int txw_idx = get_txw_idx(TX_8X8);
+ const int txh_idx = get_txh_idx(TX_8X8);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 8;
+ const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip)
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ else
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output, height, width);
+}
+
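+// 2:1 rectangular shapes (4x8, 8x4, 8x16, 16x8, ...) store through
+// store_rect_*, which appears to fold the NewSqrt2 (~sqrt(2)) rectangular
+// scaling into the widening store.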
+void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
+ const int txw_idx = get_txw_idx(TX_8X16);
+ const int txh_idx = get_txh_idx(TX_8X16);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[32];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
+ const int txw_idx = get_txw_idx(TX_8X32);
+ const int txh_idx = get_txh_idx(TX_8X32);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 32;
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+ transpose_16bit_8x8(buf0 + 16, buf1 + 16);
+ transpose_16bit_8x8(buf0 + 24, buf1 + 24);
+
+ for (int i = 0; i < 4; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
+ const int txw_idx = get_txw_idx(TX_16X4);
+ const int txh_idx = get_txh_idx(TX_16X4);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 4;
+ const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+ __m128i *buf;
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x4(buf0, buf1 + 8 * i);
+ }
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w4(buf, output, height, width);
+}
+
+void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
+ const int txw_idx = get_txw_idx(TX_16X8);
+ const int txh_idx = get_txh_idx(TX_16X8);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 8;
+ const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+ __m128i *buf;
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 8 * i);
+ }
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width);
+}
+
+void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[32];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
+ const int txw_idx = get_txw_idx(TX_16X16);
+ const int txh_idx = get_txh_idx(TX_16X16);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[64];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
+ const int txw_idx = get_txw_idx(TX_16X32);
+ const int txh_idx = get_txh_idx(TX_16X32);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 32;
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 4; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+ } else {
+ av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[32];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
+ const int txw_idx = get_txw_idx(TX_32X8);
+ const int txh_idx = get_txh_idx(TX_32X8);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 8;
+ const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 4; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+ }
+
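+    // height == 8, so a single 8-row pass covers the whole row transform.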
+ for (int i = 0; i < 1; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+ } else {
+    av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[64];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
+ const int txw_idx = get_txw_idx(TX_32X16);
+ const int txh_idx = get_txh_idx(TX_32X16);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 4; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+ } else {
+ av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[128];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X32];
+ const int txw_idx = get_txw_idx(TX_32X32);
+ const int txh_idx = get_txh_idx(TX_32X32);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 32;
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 4; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 4; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, height, width);
+ }
+ } else {
+ av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
+ }
+}
+
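+// AV1 keeps only the 32 lowest-frequency coefficients along a 64-point
+// dimension, so the 64x16 and 16x64 kernels below run the 64-point 1-D
+// transform but store a 32-coefficient extent, and only DCT_DCT is legal
+// at these sizes, hence the asserts.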
+void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X16;
+ __m128i buf0[64], buf1[128];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = fdct8x16_new_sse2;
+ const transform_1d_sse2 row_txfm = av1_fdct8x64_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < height_div8; ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < height_div8; i++) {
+ __m128i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 16, 32);
+ }
+ // Zero out the bottom 16x32 area.
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
+void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_16X64;
+ __m128i buf0[64], buf1[128];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2;
+ const transform_1d_sse2 row_txfm = fdct8x16_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < height_div8; ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 32, 16);
+ }
+}
+
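+// Per-size dispatch table. NULL entries (64x64, 32x64 and 64x32) have no
+// lowbd SSE2 kernel and are routed to the C implementation by
+// av1_lowbd_fwd_txfm_sse2() below.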
+static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+ av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
+ av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform
+ av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform
+ av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform
+ NULL, // 64x64 transform
+ av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
+ av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
+ av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform
+ av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform
+ av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform
+ av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
+ av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
+ av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
+ av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
+ av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform
+ av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform
+};
+
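+// Lossless 4x4 blocks use the Walsh-Hadamard transform rather than the DCT
+// family implemented here, so that case also takes the C path.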
+void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
+
+ if ((fwd_txfm2d_func == NULL) ||
+ (txfm_param->lossless && txfm_param->tx_size == TX_4X4))
+ av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+ else
+ fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
new file mode 100644
index 0000000000..3cb869a8fe
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+
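+// The identity (IDTX) 1-D transforms only rescale their input: the N-point
+// identity scales by sqrt(N/2), i.e. sqrt(2), 2, 2*sqrt(2) and 4 for
+// N = 4, 8, 16, 32. The irrational factors are applied with the Q12
+// fixed-point constant NewSqrt2 (~= sqrt(2)) via scale_round_sse2(); the
+// power-of-two factors use plain adds/shifts.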
+static INLINE void fidentity4x4_new_sse2(const __m128i *const input,
+ __m128i *const output,
+ const int8_t cos_bit) {
+ (void)cos_bit;
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 4; ++i) {
+ const __m128i a = _mm_unpacklo_epi16(input[i], one);
+ const __m128i b = scale_round_sse2(a, NewSqrt2);
+ output[i] = _mm_packs_epi32(b, b);
+ }
+}
+
+static INLINE void fidentity8x4_new_sse2(const __m128i *const input,
+ __m128i *const output,
+ const int8_t cos_bit) {
+ (void)cos_bit;
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 4; ++i) {
+ const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+ const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+ const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
+ const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
+ output[i] = _mm_packs_epi32(b_lo, b_hi);
+ }
+}
+
+static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+
+ output[0] = _mm_adds_epi16(input[0], input[0]);
+ output[1] = _mm_adds_epi16(input[1], input[1]);
+ output[2] = _mm_adds_epi16(input[2], input[2]);
+ output[3] = _mm_adds_epi16(input[3], input[3]);
+ output[4] = _mm_adds_epi16(input[4], input[4]);
+ output[5] = _mm_adds_epi16(input[5], input[5]);
+ output[6] = _mm_adds_epi16(input[6], input[6]);
+ output[7] = _mm_adds_epi16(input[7], input[7]);
+}
+
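+// 8-point DCT as a staged butterfly network. With w0 = pair_set_epi16(a, b)
+// and w1 = pair_set_epi16(c, d), the btf_16_sse2() macro computes, roughly,
+//   out0[i] = round(a * in0[i] + b * in1[i], cos_bit)
+//   out1[i] = round(c * in0[i] + d * in1[i], cos_bit)
+// and it reads the enclosing __rounding and cos_bit values, which is why
+// those appear unused in the function bodies here.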
+static INLINE void fdct8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = _mm_adds_epi16(input[0], input[7]);
+ x1[7] = _mm_subs_epi16(input[0], input[7]);
+ x1[1] = _mm_adds_epi16(input[1], input[6]);
+ x1[6] = _mm_subs_epi16(input[1], input[6]);
+ x1[2] = _mm_adds_epi16(input[2], input[5]);
+ x1[5] = _mm_subs_epi16(input[2], input[5]);
+ x1[3] = _mm_adds_epi16(input[3], input[4]);
+ x1[4] = _mm_subs_epi16(input[3], input[4]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = _mm_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]);
+ x2[7] = x1[7];
+
+ // stage 3
+ __m128i x3[8];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm_subs_epi16(x2[7], x2[6]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[6]);
+
+ // stage 4 and 5
+ output[0] = x3[0];
+ output[4] = x3[1];
+ output[2] = x3[2];
+ output[6] = x3[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], output[1], output[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], output[5], output[3]);
+}
+
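+// 8-point ADST built from the same btf_16_sse2() rotations: stage 1 only
+// reorders inputs and flips signs, and stages 5-7 are fused so the closing
+// rotations are applied in place on the interleaved outputs.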
+static INLINE void fadst8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[7]);
+ x1[2] = _mm_subs_epi16(__zero, input[3]);
+ x1[3] = input[4];
+ x1[4] = _mm_subs_epi16(__zero, input[1]);
+ x1[5] = input[6];
+ x1[6] = input[2];
+ x1[7] = _mm_subs_epi16(__zero, input[5]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]);
+
+ // stage 3
+ __m128i x3[8];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+
+ // stage 4
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]);
+
+ // stage 5, 6 and 7
+ output[7] = _mm_adds_epi16(x4[0], x4[4]);
+ output[3] = _mm_subs_epi16(x4[0], x4[4]);
+ output[0] = _mm_adds_epi16(x4[1], x4[5]);
+ output[4] = _mm_subs_epi16(x4[1], x4[5]);
+ output[5] = _mm_adds_epi16(x4[2], x4[6]);
+ output[1] = _mm_subs_epi16(x4[2], x4[6]);
+ output[2] = _mm_adds_epi16(x4[3], x4[7]);
+ output[6] = _mm_subs_epi16(x4[3], x4[7]);
+
+ btf_16_sse2(cospi_p04_p60, cospi_p60_m04, output[7], output[0], output[7],
+ output[0]);
+ btf_16_sse2(cospi_p20_p44, cospi_p44_m20, output[5], output[2], output[5],
+ output[2]);
+ btf_16_sse2(cospi_p36_p28, cospi_p28_m36, output[3], output[4], output[3],
+ output[4]);
+ btf_16_sse2(cospi_p52_p12, cospi_p12_m52, output[1], output[6], output[1],
+ output[6]);
+}
+
+static INLINE void fidentity8x16_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 16; ++i) {
+ const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+ const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+ const __m128i b_lo = scale_round_sse2(a_lo, 2 * NewSqrt2);
+ const __m128i b_hi = scale_round_sse2(a_hi, 2 * NewSqrt2);
+ output[i] = _mm_packs_epi32(b_lo, b_hi);
+ }
+}
+
+static INLINE void fidentity8x32_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ for (int i = 0; i < 32; ++i) {
+ output[i] = _mm_slli_epi16(input[i], 2);
+ }
+}
+
+static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = {
+ av1_fdct8x32_new_sse2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_new_sse2, // IDTX
+ av1_fdct8x32_new_sse2, // V_DCT
+ fidentity8x32_new_sse2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
new file mode 100644
index 0000000000..b58911fcb2
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc = _mm_unpacklo_epi16(*p, zero);
+ const __m128i ac = _mm_unpackhi_epi16(*p, zero);
+ *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static INLINE void update_qp(__m256i *qp) {
+ qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+ qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+ qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *dequant_ptr, int log_scale,
+ __m256i *qp) {
+ __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ if (log_scale) {
+ const __m128i round_scale = _mm_set1_epi16(1 << (15 - log_scale));
+ round = _mm_mulhrs_epi16(round, round_scale);
+ }
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+ init_one_qp(&round, &qp[0]);
+ init_one_qp(&quant, &qp[1]);
+ init_one_qp(&dequant, &qp[2]);
+}
+
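+// Quantize 8 32-bit coefficients per call. A per-lane scalar sketch, for
+// reference only:
+//   q  = ((int64_t)(abs(c) + round) * quant) >> (16 - log_scale);
+//   if ((abs(c) << (1 + log_scale)) < dequant) q = 0;  // below threshold
+//   dq = (q * dequant) >> log_scale;
+// with the sign of c restored on both q and dq. The 32-bit multiply is
+// split across even/odd 64-bit lanes because AVX2 only provides the
+// even-lane widening multiply _mm256_mul_epi32().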
+static INLINE void quantize(const __m256i *qp, __m256i *c,
+ const int16_t *iscan_ptr, int log_scale,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i abs_coeff = _mm256_abs_epi32(*c);
+ __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);
+
+ __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
+ __m256i q_hi = _mm256_srli_epi64(q, 32);
+ const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
+ q_hi = _mm256_mul_epi32(q_hi, qp_hi);
+ q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
+ q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
+ q_hi = _mm256_slli_epi64(q_hi, 32);
+ q = _mm256_or_si256(q_lo, q_hi);
+ const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
+ const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
+ q = _mm256_andnot_si256(mask, q);
+
+ __m256i dq = _mm256_mullo_epi32(q, qp[2]);
+ dq = _mm256_srai_epi32(dq, log_scale);
+ q = _mm256_sign_epi32(q, *c);
+ dq = _mm256_sign_epi32(dq, *c);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
+ const __m128i zr = _mm_setzero_si128();
+ const __m128i lo = _mm_unpacklo_epi16(isc, zr);
+ const __m128i hi = _mm_unpackhi_epi16(isc, zr);
+ const __m256i iscan =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
+ const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
+ __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
+ cur_eob = _mm256_and_si256(cur_eob, nz);
+ *eob = _mm256_max_epi32(cur_eob, *eob);
+}
+
+void av1_highbd_quantize_fp_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale) {
+ (void)scan;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ const unsigned int step = 8;
+ __m256i qp[3], coeff;
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp);
+ coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+ while (n_coeffs > 0) {
+ coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+ {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+ const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+ _mm256_extractf128_si256(eob, 1));
+ *eob_ptr = _mm_extract_epi16(final_eob, 0);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
new file mode 100644
index 0000000000..40b3b460b6
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <stdint.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Coefficient quantization phase 1
+// param[0-3] : round/quant/dequant/dequant-threshold constants
+static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
+ const int shift, const int scale,
+ __m128i *qcoeff, __m128i *dquan,
+ __m128i *sign) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+
+ *sign = _mm_cmplt_epi32(*coeff, zero);
+ *sign = _mm_or_si128(*sign, one);
+ *coeff = _mm_abs_epi32(*coeff);
+
+ qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
+ qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
+ qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);
+
+ qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
+ qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
+ dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
+ dquan[0] = _mm_srli_epi64(dquan[0], scale);
+ const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale);
+ qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]);
+}
+
+// Coefficient quantization phase 2
+static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
+ const __m128i *sign,
+ const __m128i *param, const int shift,
+ const int scale, tran_low_t *qAddr,
+ tran_low_t *dqAddr) {
+ __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
+ __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
+
+ qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
+ qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
+ dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
+ dquan[1] = _mm_srli_epi64(dquan[1], scale);
+
+ // combine L&H
+ qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
+ qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
+
+ qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
+ qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
+
+ dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
+ dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
+
+ dquan[0] = _mm_and_si128(dquan[0], mask0H);
+ dquan[1] = _mm_and_si128(dquan[1], mask0L);
+
+ qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
+ dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
+
+ qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
+ dquan[0] = _mm_sign_epi32(dquan[0], *sign);
+ qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]);
+ dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]);
+ _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
+ _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
+}
+
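+// eob tracking: lanes holding a nonzero quantized coefficient contribute
+// iscan + 1 (subtracting the all-ones compare mask adds 1), a running
+// per-lane maximum is kept, and get_accumulated_eob() reduces it to a
+// scalar at the end.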
+static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
+ __m128i *eob) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, iscanIdx;
+ const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
+ const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
+ __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero);
+ __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero);
+
+ nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero);
+ nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero);
+
+ mask = _mm_packs_epi32(nz_flag0, nz_flag1);
+ iscanIdx = _mm_loadu_si128((__m128i const *)iscan);
+ iscanIdx = _mm_sub_epi16(iscanIdx, mask);
+ iscanIdx = _mm_and_si128(iscanIdx, mask);
+ *eob = _mm_max_epi16(*eob, iscanIdx);
+}
+
+static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
+ __m128i eob_shuffled;
+ uint16_t eobValue;
+ eob_shuffled = _mm_shuffle_epi32(*eob, 0xe);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eobValue = _mm_extract_epi16(*eob, 0);
+ return eobValue;
+}
+
+void av1_highbd_quantize_fp_sse4_1(
+ const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale) {
+ __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign;
+ __m128i eob = _mm_setzero_si128();
+ const tran_low_t *src = coeff_ptr;
+ tran_low_t *quanAddr = qcoeff_ptr;
+ tran_low_t *dquanAddr = dqcoeff_ptr;
+ const int shift = 16 - log_scale;
+ const int coeff_stride = 4;
+ const int quan_stride = coeff_stride;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+
+ memset(quanAddr, 0, count * sizeof(quanAddr[0]));
+ memset(dquanAddr, 0, count * sizeof(dquanAddr[0]));
+
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+ const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+
+ qparam[0] = _mm_set_epi32(round1, round1, round1, round0);
+ qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]);
+ qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]);
+ qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1],
+ dequant_ptr[0]);
+
+ // DC and first 3 AC
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+
+ // update round/quan/dquan for AC
+ qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
+ qparam[1] = xx_set1_64_from_32i(quant_ptr[1]);
+ qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]);
+ qparam[3] = _mm_set1_epi32(dequant_ptr[1]);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+ quanAddr, dquanAddr);
+
+ // next 4 AC
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+ quanAddr + quan_stride, dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+
+ // loop for the rest of AC
+ while (count > 0) {
+ src += coeff_stride << 1;
+ quanAddr += quan_stride << 1;
+ dquanAddr += quan_stride << 1;
+ iscan += quan_stride << 1;
+
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr, dquanAddr);
+
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr + quan_stride,
+ dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+ }
+ *eob_ptr = get_accumulated_eob(&eob);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_k_means_avx2.c b/third_party/aom/av1/encoder/x86/av1_k_means_avx2.c
new file mode 100644
index 0000000000..52ddc66437
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_k_means_avx2.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h> // AVX2
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static int64_t k_means_horizontal_sum_avx2(__m256i a) {
+ const __m128i low = _mm256_castsi256_si128(a);
+ const __m128i high = _mm256_extracti128_si256(a, 1);
+ const __m128i sum = _mm_add_epi64(low, high);
+ const __m128i sum_high = _mm_unpackhi_epi64(sum, sum);
+ int64_t res;
+ _mm_storel_epi64((__m128i *)&res, _mm_add_epi64(sum, sum_high));
+ return res;
+}
+
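+// Assign each 1-D sample to its nearest centroid, 16 samples per iteration
+// (callers are expected to pad n accordingly). A running per-lane minimum
+// of |x - c| and the index of its centroid are maintained with a
+// compare-and-blend (andnot/and/or); the distance is squared with madd
+// only when the caller requests the total distortion.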
+void av1_calc_indices_dim1_avx2(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *total_dist, int n,
+ int k) {
+ const __m256i v_zero = _mm256_setzero_si256();
+ __m256i sum = _mm256_setzero_si256();
+ __m256i cents[PALETTE_MAX_SIZE];
+ for (int j = 0; j < k; ++j) {
+ cents[j] = _mm256_set1_epi16(centroids[j]);
+ }
+
+ for (int i = 0; i < n; i += 16) {
+ const __m256i in = _mm256_loadu_si256((__m256i *)data);
+ __m256i ind = _mm256_setzero_si256();
+ // Compute the distance to the first centroid.
+ __m256i d1 = _mm256_sub_epi16(in, cents[0]);
+ __m256i dist_min = _mm256_abs_epi16(d1);
+
+ for (int j = 1; j < k; ++j) {
+ // Compute the distance to the centroid.
+ d1 = _mm256_sub_epi16(in, cents[j]);
+ const __m256i dist = _mm256_abs_epi16(d1);
+ // Compare to the minimal one.
+ const __m256i cmp = _mm256_cmpgt_epi16(dist_min, dist);
+ dist_min = _mm256_min_epi16(dist_min, dist);
+ const __m256i ind1 = _mm256_set1_epi16(j);
+ ind = _mm256_or_si256(_mm256_andnot_si256(cmp, ind),
+ _mm256_and_si256(cmp, ind1));
+ }
+
+ const __m256i p1 = _mm256_packus_epi16(ind, v_zero);
+ const __m256i px = _mm256_permute4x64_epi64(p1, 0x58);
+ const __m128i d2 = _mm256_extracti128_si256(px, 0);
+
+ _mm_storeu_si128((__m128i *)indices, d2);
+
+ if (total_dist) {
+ // Square, convert to 32 bit and add together.
+ dist_min = _mm256_madd_epi16(dist_min, dist_min);
+ // Convert to 64 bit and add to sum.
+ const __m256i dist1 = _mm256_unpacklo_epi32(dist_min, v_zero);
+ const __m256i dist2 = _mm256_unpackhi_epi32(dist_min, v_zero);
+ sum = _mm256_add_epi64(sum, dist1);
+ sum = _mm256_add_epi64(sum, dist2);
+ }
+
+ indices += 16;
+ data += 16;
+ }
+ if (total_dist) {
+ *total_dist = k_means_horizontal_sum_avx2(sum);
+ }
+}
+
+void av1_calc_indices_dim2_avx2(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *total_dist, int n,
+ int k) {
+ const __m256i v_zero = _mm256_setzero_si256();
+ const __m256i permute = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
+ __m256i sum = _mm256_setzero_si256();
+ __m256i ind[2];
+ __m256i cents[PALETTE_MAX_SIZE];
+ for (int j = 0; j < k; ++j) {
+ const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1];
+ cents[j] = _mm256_set_epi16(cy, cx, cy, cx, cy, cx, cy, cx, cy, cx, cy, cx,
+ cy, cx, cy, cx);
+ }
+
+ for (int i = 0; i < n; i += 16) {
+ for (int l = 0; l < 2; ++l) {
+ const __m256i in = _mm256_loadu_si256((__m256i *)data);
+ ind[l] = _mm256_setzero_si256();
+ // Compute the distance to the first centroid.
+ __m256i d1 = _mm256_sub_epi16(in, cents[0]);
+ __m256i dist_min = _mm256_madd_epi16(d1, d1);
+
+ for (int j = 1; j < k; ++j) {
+ // Compute the distance to the centroid.
+ d1 = _mm256_sub_epi16(in, cents[j]);
+ const __m256i dist = _mm256_madd_epi16(d1, d1);
+ // Compare to the minimal one.
+ const __m256i cmp = _mm256_cmpgt_epi32(dist_min, dist);
+ dist_min = _mm256_min_epi32(dist_min, dist);
+ const __m256i ind1 = _mm256_set1_epi32(j);
+ ind[l] = _mm256_or_si256(_mm256_andnot_si256(cmp, ind[l]),
+ _mm256_and_si256(cmp, ind1));
+ }
+ if (total_dist) {
+ // Convert to 64 bit and add to sum.
+ const __m256i dist1 = _mm256_unpacklo_epi32(dist_min, v_zero);
+ const __m256i dist2 = _mm256_unpackhi_epi32(dist_min, v_zero);
+ sum = _mm256_add_epi64(sum, dist1);
+ sum = _mm256_add_epi64(sum, dist2);
+ }
+ data += 16;
+ }
+ // Cast to 8 bit and store.
+ const __m256i d2 = _mm256_packus_epi32(ind[0], ind[1]);
+ const __m256i d3 = _mm256_packus_epi16(d2, v_zero);
+ const __m256i d4 = _mm256_permutevar8x32_epi32(d3, permute);
+ const __m128i d5 = _mm256_extracti128_si256(d4, 0);
+ _mm_storeu_si128((__m128i *)indices, d5);
+ indices += 16;
+ }
+ if (total_dist) {
+ *total_dist = k_means_horizontal_sum_avx2(sum);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_k_means_sse2.c b/third_party/aom/av1/encoder/x86/av1_k_means_sse2.c
new file mode 100644
index 0000000000..6c75822350
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_k_means_sse2.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static int64_t k_means_horizontal_sum_sse2(__m128i a) {
+ const __m128i sum1 = _mm_unpackhi_epi64(a, a);
+ const __m128i sum2 = _mm_add_epi64(a, sum1);
+ int64_t res;
+ _mm_storel_epi64((__m128i *)&res, sum2);
+ return res;
+}
+
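+// SSE2 variants of the assignment kernels, 8 samples per iteration.
+// _mm_abs_epi16() requires SSSE3, so |in - c| is computed here as
+// max(in - c, c - in) instead.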
+void av1_calc_indices_dim1_sse2(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *total_dist, int n,
+ int k) {
+ const __m128i v_zero = _mm_setzero_si128();
+ __m128i sum = _mm_setzero_si128();
+ __m128i cents[PALETTE_MAX_SIZE];
+ for (int j = 0; j < k; ++j) {
+ cents[j] = _mm_set1_epi16(centroids[j]);
+ }
+
+ for (int i = 0; i < n; i += 8) {
+ const __m128i in = _mm_loadu_si128((__m128i *)data);
+ __m128i ind = _mm_setzero_si128();
+ // Compute the distance to the first centroid.
+ __m128i d1 = _mm_sub_epi16(in, cents[0]);
+ __m128i d2 = _mm_sub_epi16(cents[0], in);
+ __m128i dist_min = _mm_max_epi16(d1, d2);
+
+ for (int j = 1; j < k; ++j) {
+ // Compute the distance to the centroid.
+ d1 = _mm_sub_epi16(in, cents[j]);
+ d2 = _mm_sub_epi16(cents[j], in);
+ const __m128i dist = _mm_max_epi16(d1, d2);
+ // Compare to the minimal one.
+ const __m128i cmp = _mm_cmpgt_epi16(dist_min, dist);
+ dist_min = _mm_min_epi16(dist_min, dist);
+ const __m128i ind1 = _mm_set1_epi16(j);
+ ind = _mm_or_si128(_mm_andnot_si128(cmp, ind), _mm_and_si128(cmp, ind1));
+ }
+ if (total_dist) {
+ // Square, convert to 32 bit and add together.
+ dist_min = _mm_madd_epi16(dist_min, dist_min);
+ // Convert to 64 bit and add to sum.
+ const __m128i dist1 = _mm_unpacklo_epi32(dist_min, v_zero);
+ const __m128i dist2 = _mm_unpackhi_epi32(dist_min, v_zero);
+ sum = _mm_add_epi64(sum, dist1);
+ sum = _mm_add_epi64(sum, dist2);
+ }
+ __m128i p2 = _mm_packus_epi16(ind, v_zero);
+ _mm_storel_epi64((__m128i *)indices, p2);
+ indices += 8;
+ data += 8;
+ }
+ if (total_dist) {
+ *total_dist = k_means_horizontal_sum_sse2(sum);
+ }
+}
+
+void av1_calc_indices_dim2_sse2(const int16_t *data, const int16_t *centroids,
+ uint8_t *indices, int64_t *total_dist, int n,
+ int k) {
+ const __m128i v_zero = _mm_setzero_si128();
+ __m128i sum = _mm_setzero_si128();
+ __m128i ind[2];
+ __m128i cents[PALETTE_MAX_SIZE];
+ for (int j = 0; j < k; ++j) {
+ const int16_t cx = centroids[2 * j], cy = centroids[2 * j + 1];
+ cents[j] = _mm_set_epi16(cy, cx, cy, cx, cy, cx, cy, cx);
+ }
+
+ for (int i = 0; i < n; i += 8) {
+ for (int l = 0; l < 2; ++l) {
+ const __m128i in = _mm_loadu_si128((__m128i *)data);
+ ind[l] = _mm_setzero_si128();
+ // Compute the distance to the first centroid.
+ __m128i d1 = _mm_sub_epi16(in, cents[0]);
+ __m128i dist_min = _mm_madd_epi16(d1, d1);
+
+ for (int j = 1; j < k; ++j) {
+ // Compute the distance to the centroid.
+ d1 = _mm_sub_epi16(in, cents[j]);
+ const __m128i dist = _mm_madd_epi16(d1, d1);
+ // Compare to the minimal one.
+ const __m128i cmp = _mm_cmpgt_epi32(dist_min, dist);
+ const __m128i dist1 = _mm_andnot_si128(cmp, dist_min);
+ const __m128i dist2 = _mm_and_si128(cmp, dist);
+ dist_min = _mm_or_si128(dist1, dist2);
+ const __m128i ind1 = _mm_set1_epi32(j);
+ ind[l] = _mm_or_si128(_mm_andnot_si128(cmp, ind[l]),
+ _mm_and_si128(cmp, ind1));
+ }
+ if (total_dist) {
+ // Convert to 64 bit and add to sum.
+ const __m128i dist1 = _mm_unpacklo_epi32(dist_min, v_zero);
+ const __m128i dist2 = _mm_unpackhi_epi32(dist_min, v_zero);
+ sum = _mm_add_epi64(sum, dist1);
+ sum = _mm_add_epi64(sum, dist2);
+ }
+ data += 8;
+ }
+ // Cast to 8 bit and store.
+ const __m128i d2 = _mm_packus_epi16(ind[0], ind[1]);
+ const __m128i d3 = _mm_packus_epi16(d2, v_zero);
+ _mm_storel_epi64((__m128i *)indices, d3);
+ indices += 8;
+ }
+ if (total_dist) {
+ *total_dist = k_means_horizontal_sum_sse2(sum);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
new file mode 100644
index 0000000000..75c5172f85
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void write_zero(tran_low_t *qcoeff) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
+}
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ const __m128i ac = _mm_unpackhi_epi64(*p, *p);
+ *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(*p), ac, 1);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *dequant_ptr, int log_scale,
+ __m256i *thr, __m256i *qp) {
+ __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+ if (log_scale > 0) {
+ const __m128i rnd = _mm_set1_epi16((int16_t)1 << (log_scale - 1));
+ round = _mm_add_epi16(round, rnd);
+ round = _mm_srai_epi16(round, log_scale);
+ }
+
+ init_one_qp(&round, &qp[0]);
+ init_one_qp(&quant, &qp[1]);
+
+ if (log_scale == 1) {
+ qp[1] = _mm256_slli_epi16(qp[1], log_scale);
+ }
+
+ init_one_qp(&dequant, &qp[2]);
+ *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+ // calculating the zbin mask.
+ *thr = _mm256_sub_epi16(*thr, _mm256_set1_epi16(1));
+}
+
+static INLINE void update_qp(__m256i *thr, __m256i *qp) {
+ qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+ qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+ qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+ *thr = _mm256_permute2x128_si256(*thr, *thr, 0x11);
+}
+
+static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+ const __m256i coeff1 = _mm256_load_si256((__m256i *)coeff_ptr);
+ const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+ return _mm256_packs_epi32(coeff1, coeff2);
+}
+
+static INLINE void store_coefficients_avx2(__m256i coeff_vals,
+ tran_low_t *coeff_ptr) {
+ __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
+ __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
+ __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
+ _mm256_store_si256((__m256i *)coeff_ptr, coeff_vals_lo);
+ _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
+}
+
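+// Horizontal maximum of the 16 per-lane eob candidates: _mm_minpos_epu16()
+// only finds a minimum, so the maximum is recovered as INT16_MAX minus the
+// minimum of the complemented values.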
+static INLINE uint16_t quant_gather_eob(__m256i eob) {
+ const __m128i eob_lo = _mm256_castsi256_si128(eob);
+ const __m128i eob_hi = _mm256_extractf128_si256(eob, 1);
+ __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi);
+ eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s);
+ eob_s = _mm_minpos_epu16(eob_s);
+ return INT16_MAX - _mm_extract_epi16(eob_s, 0);
+}
+
+static INLINE int16_t accumulate_eob256(__m256i eob256) {
+ const __m128i eob_lo = _mm256_castsi256_si128(eob256);
+ const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1);
+ __m128i eob = _mm_max_epi16(eob_lo, eob_hi);
+ __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);
+}
+
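+// The low-precision ("lp") path quantizes int16_t coefficients entirely in
+// 16-bit lanes, 16 at a time. A per-lane scalar sketch, for reference only:
+//   abs_q   = (int16_t)(((abs(c) + round) * quant) >> 16);
+//   qcoeff  = c < 0 ? -abs_q : abs_q;
+//   dqcoeff = qcoeff * dequant;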
+static AOM_FORCE_INLINE void quantize_lp_16_first(
+ const int16_t *coeff_ptr, const int16_t *iscan_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, __m256i *round256, __m256i *quant256,
+ __m256i *dequant256, __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round256);
+ const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant256);
+ const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+ const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant256);
+ const __m256i nz_mask =
+ _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+
+ _mm256_storeu_si256((__m256i *)qcoeff_ptr, qcoeff);
+ _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dqcoeff);
+
+ const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+ const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, nz_mask);
+ const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, nz_mask);
+ *eob = _mm256_max_epi16(*eob, nz_iscan);
+}
+
+static AOM_FORCE_INLINE void quantize_lp_16(
+ const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *iscan_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, __m256i *round256,
+ __m256i *quant256, __m256i *dequant256, __m256i *eob) {
+ const __m256i coeff =
+ _mm256_loadu_si256((const __m256i *)(coeff_ptr + n_coeffs));
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round256);
+ const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant256);
+ const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+ const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant256);
+ const __m256i nz_mask =
+ _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+
+ _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff);
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), dqcoeff);
+
+ const __m256i iscan =
+ _mm256_loadu_si256((const __m256i *)(iscan_ptr + n_coeffs));
+ const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, nz_mask);
+ const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, nz_mask);
+ *eob = _mm256_max_epi16(*eob, nz_iscan);
+}
+
+void av1_quantize_lp_avx2(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+ __m256i eob256 = _mm256_setzero_si256();
+
+ // Setup global values.
+ __m256i round256 =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ __m256i quant256 =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ __m256i dequant256 =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+
+ // Populate upper AC values.
+ round256 = _mm256_permute4x64_epi64(round256, 0x54);
+ quant256 = _mm256_permute4x64_epi64(quant256, 0x54);
+ dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54);
+
+ // Process DC and the first 15 AC coeffs.
+ quantize_lp_16_first(coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &round256,
+ &quant256, &dequant256, &eob256);
+
+ if (n_coeffs > 16) {
+    // Overwrite the DC constants with the AC constants.
+ dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
+ quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
+ round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
+
+ // AC only loop.
+ for (int idx = 16; idx < n_coeffs; idx += 16) {
+ quantize_lp_16(coeff_ptr, idx, iscan, qcoeff_ptr, dqcoeff_ptr, &round256,
+ &quant256, &dequant256, &eob256);
+ }
+ }
+
+ *eob_ptr = accumulate_eob256(eob256);
+}
+
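+// load_coefficients_avx2() packs the two 128-bit halves, leaving the 16
+// coefficients in lane order 0-3, 8-11, 4-7, 12-15; the 0xD8 permute below
+// applies the same reordering to iscan so the lanes stay matched.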
+static AOM_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
+ __m256i v_eobmax,
+ __m256i v_mask) {
+ const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
+ const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8);
+ const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask);
+ const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask);
+ return _mm256_max_epi16(v_eobmax, v_nz_iscan);
+}
+
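+// If no |coeff| in the group of 16 exceeds the threshold precomputed in
+// init_qp(), every coefficient quantizes to zero, so the multiplies and the
+// eob update are skipped and zeros are stored directly.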
+static AOM_FORCE_INLINE void quantize_fp_16(
+ const __m256i *thr, const __m256i *qp, const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ __m256i *eob) {
+ const __m256i coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, qp[0]);
+ const __m256i abs_q = _mm256_mulhi_epi16(tmp_rnd, qp[1]);
+ const __m256i q = _mm256_sign_epi16(abs_q, coeff);
+ const __m256i dq = _mm256_mullo_epi16(q, qp[2]);
+ const __m256i nz_mask = _mm256_cmpgt_epi16(abs_q, _mm256_setzero_si256());
+
+ store_coefficients_avx2(q, qcoeff_ptr);
+ store_coefficients_avx2(dq, dqcoeff_ptr);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ } else {
+ write_zero(qcoeff_ptr);
+ write_zero(dqcoeff_ptr);
+ }
+}
+
+void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
+ const int log_scale = 0;
+ const int step = 16;
+ __m256i qp[3], thr;
+ __m256i eob = _mm256_setzero_si256();
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+
+ quantize_fp_16(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(&thr, qp);
+
+ while (n_coeffs > 0) {
+ quantize_fp_16(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+ &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+ }
+ *eob_ptr = quant_gather_eob(eob);
+}
+
+static AOM_FORCE_INLINE void quantize_fp_32x32(
+ const __m256i *thr, const __m256i *qp, const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ __m256i *eob) {
+ const __m256i coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, qp[0]);
+ const __m256i abs_q = _mm256_mulhi_epu16(tmp_rnd, qp[1]);
+ const __m256i q = _mm256_sign_epi16(abs_q, coeff);
+ const __m256i abs_dq =
+ _mm256_srli_epi16(_mm256_mullo_epi16(abs_q, qp[2]), 1);
+ const __m256i nz_mask = _mm256_cmpgt_epi16(abs_q, _mm256_setzero_si256());
+ const __m256i dq = _mm256_sign_epi16(abs_dq, coeff);
+
+ store_coefficients_avx2(q, qcoeff_ptr);
+ store_coefficients_avx2(dq, dqcoeff_ptr);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ } else {
+ write_zero(qcoeff_ptr);
+ write_zero(dqcoeff_ptr);
+ }
+}
+
+void av1_quantize_fp_32x32_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
+ const int log_scale = 1;
+ const unsigned int step = 16;
+ __m256i qp[3], thr;
+ __m256i eob = _mm256_setzero_si256();
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+
+ quantize_fp_32x32(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+ &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(&thr, qp);
+
+ while (n_coeffs > 0) {
+ quantize_fp_32x32(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+ &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+ }
+ *eob_ptr = quant_gather_eob(eob);
+}
+
+static INLINE void quantize_fp_64x64(const __m256i *thr, const __m256i *qp,
+ const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, __m256i *eob) {
+ const __m256i coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(abs_coeff, qp[0]), mask);
+ const __m256i qh = _mm256_slli_epi16(_mm256_mulhi_epi16(tmp_rnd, qp[1]), 2);
+ const __m256i ql =
+ _mm256_srli_epi16(_mm256_mullo_epi16(tmp_rnd, qp[1]), 14);
+ const __m256i abs_q = _mm256_or_si256(qh, ql);
+ const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(abs_q, qp[2]), 14);
+ const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(abs_q, qp[2]), 2);
+ const __m256i abs_dq = _mm256_or_si256(dqh, dql);
+ const __m256i q = _mm256_sign_epi16(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi16(abs_dq, coeff);
+ // Check the signed q/dq value here instead of the absolute value. When
+ // dequant equals 4, the dequant threshold (*thr) becomes 0 after being
+ // scaled down by (1 + log_scale). See init_qp(). When *thr is 0 and the
+ // abs_coeff is 0, the nzflag will be set. As a result, the eob will be
+ // incorrectly calculated. The psign instruction corrects the error by
+ // zeroing out q/dq if coeff is zero.
+ const __m256i z_mask = _mm256_cmpeq_epi16(dq, _mm256_setzero_si256());
+ const __m256i nz_mask = _mm256_cmpeq_epi16(z_mask, _mm256_setzero_si256());
+
+ store_coefficients_avx2(q, qcoeff_ptr);
+ store_coefficients_avx2(dq, dqcoeff_ptr);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ } else {
+ write_zero(qcoeff_ptr);
+ write_zero(dqcoeff_ptr);
+ }
+}
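
The qh/ql pairs above assemble shifts of a full 32-bit product from its 16-bit halves: for log_scale == 2 the quantized value is (tmp_rnd * quant) >> 14 and the dequantized magnitude is (abs_q * dequant) >> 2. A minimal scalar sketch of that recombination (illustrative only; operands are assumed to be in [0, 1 << 15) so signed and unsigned high halves agree):

#include <assert.h>
#include <stdint.h>

// (a * b) >> shift, rebuilt from the high and low 16-bit product halves,
// mirroring the slli/srli/or pattern in quantize_fp_64x64 above.
static uint16_t mul_shift_from_halves(uint16_t a, uint16_t b, int shift) {
  const uint32_t prod = (uint32_t)a * b;
  const uint16_t hi = (uint16_t)(prod >> 16);  // _mm256_mulhi_epi16
  const uint16_t lo = (uint16_t)prod;          // _mm256_mullo_epi16
  return (uint16_t)(((uint32_t)hi << (16 - shift)) | (lo >> shift));
}

static void check_mul_shift(void) {
  // e.g. (300 * 5000) >> 14 == 91, rebuilt from the two halves.
  assert(mul_shift_from_halves(300, 5000, 14) == ((300u * 5000u) >> 14));
}

Rebuilding the shift this way avoids widening to 32-bit lanes, so all 16 coefficients stay in a single 256-bit register.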
+
+void av1_quantize_fp_64x64_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
+ const int log_scale = 2;
+ const unsigned int step = 16;
+ __m256i qp[3], thr;
+ __m256i eob = _mm256_setzero_si256();
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+
+ quantize_fp_64x64(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+ &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(&thr, qp);
+
+ while (n_coeffs > 0) {
+ quantize_fp_64x64(&thr, qp, coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
+ &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+ }
+ *eob_ptr = quant_gather_eob(eob);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
new file mode 100644
index 0000000000..b533894015
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+ __m128i *c0, __m128i *c1) {
+ const tran_low_t *addr = coeff + offset;
+ if (sizeof(tran_low_t) == 4) {
+ const __m128i x0 = _mm_load_si128((const __m128i *)addr);
+ const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1);
+ const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2);
+ const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3);
+ *c0 = _mm_packs_epi32(x0, x1);
+ *c1 = _mm_packs_epi32(x2, x3);
+ } else {
+ *c0 = _mm_load_si128((const __m128i *)addr);
+ *c1 = _mm_load_si128((const __m128i *)addr + 1);
+ }
+}
+
+static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1,
+ tran_low_t *qcoeff, intptr_t offset) {
+ tran_low_t *addr = qcoeff + offset;
+ if (sizeof(tran_low_t) == 4) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero);
+ __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits);
+ __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits);
+ _mm_store_si128((__m128i *)addr, y0);
+ _mm_store_si128((__m128i *)addr + 1, y1);
+
+ sign_bits = _mm_cmplt_epi16(*qc1, zero);
+ y0 = _mm_unpacklo_epi16(*qc1, sign_bits);
+ y1 = _mm_unpackhi_epi16(*qc1, sign_bits);
+ _mm_store_si128((__m128i *)addr + 2, y0);
+ _mm_store_si128((__m128i *)addr + 3, y1);
+ } else {
+ _mm_store_si128((__m128i *)addr, *qc0);
+ _mm_store_si128((__m128i *)addr + 1, *qc1);
+ }
+}
+
+static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) {
+ const __m128i zero = _mm_setzero_si128();
+ tran_low_t *addr = qcoeff + offset;
+ if (sizeof(tran_low_t) == 4) {
+ _mm_store_si128((__m128i *)addr, zero);
+ _mm_store_si128((__m128i *)addr + 1, zero);
+ _mm_store_si128((__m128i *)addr + 2, zero);
+ _mm_store_si128((__m128i *)addr + 3, zero);
+ } else {
+ _mm_store_si128((__m128i *)addr, zero);
+ _mm_store_si128((__m128i *)addr + 1, zero);
+ }
+}
+
+static INLINE void quantize(const int16_t *iscan_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const __m128i *round0, const __m128i *round1,
+ const __m128i *quant0, const __m128i *quant1,
+ const __m128i *dequant0, const __m128i *dequant1,
+ const __m128i *thr0, const __m128i *thr1,
+ __m128i *eob) {
+ __m128i coeff0, coeff1;
+ // Do DC and first 15 AC
+ read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
+
+ // Poor man's sign extract
+ const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0),
+ _mm_cmpeq_epi16(qcoeff0, *thr0));
+ const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1),
+ _mm_cmpeq_epi16(qcoeff1, *thr1));
+ const int nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1);
+
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, *round1);
+ const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0);
+ const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0);
+ coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1);
+
+ write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs);
+
+ const __m128i zero = _mm_setzero_si128();
+ // Scan for eob
+ const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ const __m128i iscan0 =
+ _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ const __m128i iscan1 =
+ _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0);
+ const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1);
+ const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0);
+ const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1);
+ const __m128i eob2 = _mm_max_epi16(eob0, eob1);
+ *eob = _mm_max_epi16(*eob, eob2);
+ } else {
+ write_zero(qcoeff_ptr, n_coeffs);
+ write_zero(dqcoeff_ptr, n_coeffs);
+ }
+}
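
The "poor man's sign extract" above is the branch-free absolute-value idiom: with s = x >> 15 (arithmetic shift, so s is 0 or -1), abs(x) = (x ^ s) - s, and the same xor/sub pair restores the sign afterwards. A scalar sketch (illustrative only; assumes arithmetic right shift of negative values, which the SIMD psraw guarantees):

#include <stdint.h>

// Branch-free abs and sign restore, as done lane-wise in quantize() above.
static int16_t abs16(int16_t x) {
  const int16_t s = (int16_t)(x >> 15);  // 0 if x >= 0, -1 if x < 0
  return (int16_t)((x ^ s) - s);         // flip bits and add 1 when negative
}

static int16_t apply_sign16(int16_t mag, int16_t s /* 0 or -1 */) {
  return (int16_t)((mag ^ s) - s);       // negates mag when s == -1
}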
+
+void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
+ coeff_ptr += n_coeffs;
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+
+ const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr);
+ const __m128i round1 = _mm_unpackhi_epi64(round0, round0);
+ const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr);
+ const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0);
+ const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr);
+ const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0);
+ const __m128i thr0 = _mm_srai_epi16(dequant0, 1);
+ const __m128i thr1 = _mm_srai_epi16(dequant1, 1);
+ __m128i eob = _mm_setzero_si128();
+
+ quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0,
+ &round1, &quant0, &quant1, &dequant0, &dequant1, &thr0, &thr1, &eob);
+
+ n_coeffs += 8 * 2;
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1,
+ &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1,
+ &eob);
+ n_coeffs += 8 * 2;
+ }
+
+ // Accumulate EOB
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
+ }
+}
+
+static INLINE void quantize_lp(const int16_t *iscan_ptr,
+ const int16_t *coeff_ptr, intptr_t n_coeffs,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const __m128i *round0, const __m128i *round1,
+ const __m128i *quant0, const __m128i *quant1,
+ const __m128i *dequant0, const __m128i *dequant1,
+ __m128i *eob) {
+ const int16_t *read = coeff_ptr + n_coeffs;
+ __m128i coeff0 = _mm_load_si128((const __m128i *)read);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)read + 1);
+
+ // Poor man's sign extract
+ const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, *round1);
+ const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0);
+ const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ int16_t *addr = qcoeff_ptr + n_coeffs;
+ _mm_store_si128((__m128i *)addr, qcoeff0);
+ _mm_store_si128((__m128i *)addr + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0);
+ coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1);
+
+ addr = dqcoeff_ptr + n_coeffs;
+ _mm_store_si128((__m128i *)addr, coeff0);
+ _mm_store_si128((__m128i *)addr + 1, coeff1);
+
+ const __m128i zero = _mm_setzero_si128();
+ // Scan for eob
+ const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+
+ const __m128i iscan0 =
+ _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ const __m128i iscan1 =
+ _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+
+ // Add one to convert from indices to counts
+ const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0);
+ const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1);
+ const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0);
+ const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1);
+ const __m128i eob2 = _mm_max_epi16(eob0, eob1);
+ *eob = _mm_max_epi16(*eob, eob2);
+}
+
+void av1_quantize_lp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+ coeff_ptr += n_coeffs;
+ iscan += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+
+ // Setup global values
+ const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr);
+ const __m128i round1 = _mm_unpackhi_epi64(round0, round0);
+ const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr);
+ const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0);
+ const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr);
+ const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0);
+ __m128i eob = _mm_setzero_si128();
+
+ // DC and first 15 AC
+ quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0,
+ &round1, &quant0, &quant1, &dequant0, &dequant1, &eob);
+ n_coeffs += 8 * 2;
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1,
+ &round1, &quant1, &quant1, &dequant1, &dequant1, &eob);
+ n_coeffs += 8 * 2;
+ }
+
+ // Accumulate EOB
+ *eob_ptr = accumulate_eob(eob);
+}
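
Both SSE2 kernels in this file use the same counted-pointer idiom: every pointer is advanced one-past-the-end, n_coeffs is negated, and loads index with the negative offset, so a single increment both steps the loop and provides the termination test. A minimal sketch of the pattern (illustrative only):

#include <stddef.h>

// Negative-index loop: one counter is both the array offset and the loop
// condition, as in av1_quantize_fp_sse2/av1_quantize_lp_sse2 above.
static void copy_all(const short *src, short *dst, ptrdiff_t n) {
  src += n;  // point one-past-the-end
  dst += n;
  n = -n;    // counts up from -count toward 0
  while (n < 0) {
    dst[n] = src[n];  // src[n] addresses base[count + n]
    n += 1;           // the SIMD versions step by 16 (two 8-lane vectors)
  }
}

The payoff is one fewer live register in the hot loop: the same add that advances the offset produces the flags the loop branch consumes.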
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm
new file mode 100644
index 0000000000..ad4ae274e2
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm
@@ -0,0 +1,204 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro QUANTIZE_FP 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, \
+ eob, scan, iscan
+ cmp dword skipm, 0
+ jne .blank
+
+ ; actual quantize loop - setup pointers, rounders, etc.
+ movifnidn coeffq, coeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, dequantmp
+ movifnidn zbinq, zbinmp
+ movifnidn roundq, roundmp
+ movifnidn quantq, quantmp
+ mova m1, [roundq] ; m1 = round
+ mova m2, [quantq] ; m2 = quant
+%ifidn %1, fp_32x32
+ pcmpeqw m5, m5
+ psrlw m5, 15
+ paddw m1, m5
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
+ mova m3, [r2q] ; m3 = dequant
+ mov r3, qcoeffmp
+ mov r4, dqcoeffmp
+ mov r5, iscanmp
+%ifidn %1, fp_32x32
+ psllw m2, 1
+%endif
+ pxor m5, m5 ; m5 = dedicated zero
+
+ lea coeffq, [ coeffq+ncoeffq*2]
+ lea r5q, [ r5q+ncoeffq*2]
+ lea r3q, [ r3q+ncoeffq*2]
+ lea r4q, [r4q+ncoeffq*2]
+ neg ncoeffq
+
+ ; get DC and first 15 AC coeffs
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpeqw m7, m7
+
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ mova [r3q+ncoeffq*2+ 0], m8
+ mova [r3q+ncoeffq*2+16], m13
+%ifidn %1, fp_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
+ pmullw m8, m3 ; r4[i] = r3[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; r4[i] = r3[i] * q
+%ifidn %1, fp_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+ psrlw m0, m3, 2
+%else
+ psrlw m0, m3, 1
+%endif
+ mova [r4q+ncoeffq*2+ 0], m8
+ mova [r4q+ncoeffq*2+16], m13
+ pcmpeqw m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m7 ; m11 = scan[i] + 1
+ pandn m8, m6 ; m8 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jz .accumulate_eob
+
+.ac_only_loop:
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+
+ pcmpgtw m7, m6, m0
+ pcmpgtw m12, m11, m0
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
+
+ or r6, r2
+ jz .skip_iter
+
+ pcmpeqw m7, m7
+
+ paddsw m6, m1 ; m6 += round
+ paddsw m11, m1 ; m11 += round
+ pmulhw m14, m6, m2 ; m14 = m6*q>>16
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ psignw m14, m9 ; m14 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ mova [r3q+ncoeffq*2+ 0], m14
+ mova [r3q+ncoeffq*2+16], m13
+%ifidn %1, fp_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
+ pmullw m14, m3 ; r4[i] = r3[i] * q
+ pmullw m13, m3 ; r4[i] = r3[i] * q
+%ifidn %1, fp_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
+ mova [r4q+ncoeffq*2+ 0], m14
+ mova [r4q+ncoeffq*2+16], m13
+ pcmpeqw m14, m5 ; m14 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m7 ; m11 = scan[i] + 1
+ pandn m14, m6 ; m14 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m14
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+ jmp .accumulate_eob
+.skip_iter:
+ mova [r3q+ncoeffq*2+ 0], m5
+ mova [r3q+ncoeffq*2+16], m5
+ mova [r4q+ncoeffq*2+ 0], m5
+ mova [r4q+ncoeffq*2+16], m5
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+.accumulate_eob:
+ ; horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ pextrw r6, m8, 0
+ mov [r2], r6
+ RET
+
+ ; skip-block, i.e. just write all zeroes
+.blank:
+ mov r0, dqcoeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, qcoeffmp
+ mov r3, eobmp
+
+ lea r0q, [r0q+ncoeffq*2]
+ lea r2q, [r2q+ncoeffq*2]
+ neg ncoeffq
+ pxor m7, m7
+.blank_loop:
+ mova [r0q+ncoeffq*2+ 0], m7
+ mova [r0q+ncoeffq*2+16], m7
+ mova [r2q+ncoeffq*2+ 0], m7
+ mova [r2q+ncoeffq*2+16], m7
+ add ncoeffq, mmsize
+ jl .blank_loop
+ mov word [r3q], 0
+ RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FP fp, 7
+QUANTIZE_FP fp_32x32, 7
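
For the fp_32x32 variant the macro prescales the constants instead of shifting every coefficient: round becomes (round + 1) >> 1 and quant is doubled (psllw m2, 1), which together implement the log_scale == 1 quantizer, and the dequantized magnitude is halved with psrlw before the sign is restored. A scalar sketch of one coefficient under those prescaled constants (illustrative only; the SIMD adds saturate and the shifts truncate to 16 bits):

#include <stdint.h>

// log_scale == 1 quantize step with prescaled constants, mirroring the
// QUANTIZE_FP fp_32x32 macro above. Nonnegative abs_coeff assumed.
static int16_t quantize_fp_32x32_sketch(int16_t abs_coeff, int16_t round,
                                        int16_t quant, int16_t dequant,
                                        int16_t *abs_dq) {
  const int round_s = (round + 1) >> 1;  // pre-halved round
  const int quant_s = quant << 1;        // pre-doubled quant (psllw m2, 1),
                                         // assumed to still fit in int16
  const int16_t abs_q =
      (int16_t)(((abs_coeff + round_s) * quant_s) >> 16);  // pmulhw
  *abs_dq = (int16_t)((uint16_t)(abs_q * dequant) >> 1);   // pmullw + psrlw
  return abs_q;
}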
diff --git a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
new file mode 100644
index 0000000000..618758105a
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
@@ -0,0 +1,222 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s
+ paddusw xmm14, xmm4 ; sum_r
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0
+ punpckhdq xmm2,xmm0
+ paddq %1,xmm2
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0
+ punpckhqdq xmm2,xmm0
+ paddq %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with words
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0
+ punpckhwd xmm1,xmm0
+ paddd %1, xmm1
+ SUM_ACROSS_Q %1
+%endmacro
+
+SECTION .text
+
+;void av1_ssim_parms_16x16_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parameter passing through a structure. We probably don't need the
+; pxors (the calling app will initialize to 0), could easily fit everything in
+; SSE2 without too much hassle, and can probably do better estimates with
+; psadw or pavgb. At this point this is just meant to be a first pass for
+; calculating all the params needed for 16x16 SSIM so we can play with dssim
+; as distortion in the mode selection code.
+globalsym(av1_ssim_parms_16x16_sse2)
+sym(av1_ssim_parms_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void av1_ssim_parms_8x8_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parameter passing through a structure. We probably don't need the
+; pxors (the calling app will initialize to 0), could easily fit everything in
+; SSE2 without too much hassle, and can probably do better estimates with
+; psadw or pavgb. At this point this is just meant to be a first pass for
+; calculating all the params needed for 8x8 SSIM so we can play with dssim
+; as distortion in the mode selection code.
+globalsym(av1_ssim_parms_8x8_sse2)
+sym(av1_ssim_parms_8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
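
Both routines above accumulate the same five sums over the block. A scalar reference for the 8x8 case (illustrative only, following the prototype in the comments):

// Scalar reference for av1_ssim_parms_8x8: the five sums SSIM needs.
static void ssim_parms_8x8_sketch(const unsigned char *s, int sp,
                                  const unsigned char *r, int rp,
                                  unsigned long *sum_s, unsigned long *sum_r,
                                  unsigned long *sum_sq_s,
                                  unsigned long *sum_sq_r,
                                  unsigned long *sum_sxr) {
  *sum_s = *sum_r = *sum_sq_s = *sum_sq_r = *sum_sxr = 0;
  for (int i = 0; i < 8; ++i, s += sp, r += rp) {
    for (int j = 0; j < 8; ++j) {
      *sum_s += s[j];
      *sum_r += r[j];
      *sum_sq_s += (unsigned long)s[j] * s[j];
      *sum_sq_r += (unsigned long)r[j] * r[j];
      *sum_sxr += (unsigned long)s[j] * r[j];
    }
  }
}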
diff --git a/third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c b/third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c
new file mode 100644
index 0000000000..830f40ecb0
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_temporal_denoiser_sse2.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/av1_temporal_denoiser.h"
+
+// Compute the sum of all pixel differences of this MB.
+static INLINE int sum_diff_16x1(__m128i acc_diff) {
+ const __m128i k_1 = _mm_set1_epi16(1);
+ const __m128i acc_diff_lo =
+ _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+ const __m128i acc_diff_hi =
+ _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+ const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
+ const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
+ const __m128i hgfe_dcba =
+ _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
+ const __m128i hgfedcba =
+ _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
+ return _mm_cvtsi128_si32(hgfedcba);
+}
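
The unpack-with-self plus arithmetic-shift pair above sign-extends each byte to 16 bits before the horizontal reduction; the whole function is equivalent to this scalar sketch (illustrative only):

// Scalar equivalent of sum_diff_16x1: sum of 16 signed byte lanes.
static int sum_diff_16x1_sketch(const signed char acc[16]) {
  int sum = 0;
  for (int i = 0; i < 16; ++i) sum += acc[i];  // each lane is in [-128, 127]
  return sum;
}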
+
+// Denoise a 16x1 vector.
+static INLINE __m128i av1_denoiser_16x1_sse2(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const __m128i *k_0, const __m128i *k_4, const __m128i *k_8,
+ const __m128i *k_16, const __m128i *l3, const __m128i *l32,
+ const __m128i *l21, __m128i acc_diff) {
+ // Calculate differences
+ const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+ const __m128i v_mc_running_avg_y =
+ _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+ __m128i v_running_avg_y;
+ const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+ const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+ // Obtain the sign. FF if diff is negative.
+ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
+  // Clamp the absolute difference to 16 so it can be used to build the
+  // masks. Doing this allows us to use _mm_cmpgt_epi8, which operates on
+  // signed bytes.
+ const __m128i clamped_absdiff =
+ _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
+ // Get masks for l2 l1 and l0 adjustments.
+ const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
+ const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
+ const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
+ // Get adjustments for l2, l1, and l0.
+ __m128i adj2 = _mm_and_si128(mask2, *l32);
+ const __m128i adj1 = _mm_and_si128(mask1, *l21);
+ const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+ __m128i adj, padj, nadj;
+
+ // Combine the adjustments and get absolute adjustments.
+ adj2 = _mm_add_epi8(adj2, adj1);
+ adj = _mm_sub_epi8(*l3, adj2);
+ adj = _mm_andnot_si128(mask0, adj);
+ adj = _mm_or_si128(adj, adj0);
+
+ // Restore the sign and get positive and negative adjustments.
+ padj = _mm_andnot_si128(diff_sign, adj);
+ nadj = _mm_and_si128(diff_sign, adj);
+
+ // Calculate filtered value.
+ v_running_avg_y = _mm_adds_epu8(v_sig, padj);
+ v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
+ _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+ // Adjustments <=7, and each element in acc_diff can fit in signed
+ // char.
+ acc_diff = _mm_adds_epi8(acc_diff, padj);
+ acc_diff = _mm_subs_epi8(acc_diff, nadj);
+ return acc_diff;
+}
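
Per pixel, the vector code above picks an adjustment from the clamped |diff| using three thresholds; a scalar sketch of the decision (illustrative only; l3/l32/l21 are the level constants set up by the callers below, and the first threshold is 4 + shift_inc in the SIMD version):

#include <stdlib.h>

// Scalar sketch of one lane of av1_denoiser_16x1_sse2.
static unsigned char denoise_pixel_sketch(unsigned char sig,
                                          unsigned char mc_avg, int l3,
                                          int l32, int l21) {
  const int diff = (int)mc_avg - (int)sig;
  const int a = abs(diff) > 16 ? 16 : abs(diff);  // clamped |diff|
  int adj;
  if (a < 4 /* + shift_inc */) {
    adj = a;               // level 0: follow the difference exactly
  } else if (a < 8) {
    adj = l3 - l32 - l21;  // level 1
  } else if (a < 16) {
    adj = l3 - l32;        // level 2
  } else {
    adj = l3;              // level 3
  }
  const int out = diff >= 0 ? sig + adj : sig - adj;  // saturating in SIMD
  return (unsigned char)(out < 0 ? 0 : (out > 255 ? 255 : out));
}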
+
+// Denoise a 16x1 vector with a weaker filter.
+static INLINE __m128i av1_denoiser_adj_16x1_sse2(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const __m128i k_0, const __m128i k_delta, __m128i acc_diff) {
+ __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
+ // Calculate differences.
+ const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+ const __m128i v_mc_running_avg_y =
+ _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+ const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+ const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+ // Obtain the sign. FF if diff is negative.
+ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+ // Clamp absolute difference to delta to get the adjustment.
+ const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+ // Restore the sign and get positive and negative adjustments.
+ __m128i padj, nadj;
+ padj = _mm_andnot_si128(diff_sign, adj);
+ nadj = _mm_and_si128(diff_sign, adj);
+ // Calculate filtered value.
+ v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
+ v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
+ _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+ // Accumulate the adjustments.
+ acc_diff = _mm_subs_epi8(acc_diff, padj);
+ acc_diff = _mm_adds_epi8(acc_diff, nadj);
+ return acc_diff;
+}
+
+// Denoise 8x8 and 8x16 blocks.
+static int av1_denoiser_NxM_sse2_small(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride,
+ uint8_t *running_avg_y, int avg_y_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude, int width) {
+ int sum_diff_thresh, r, sum_diff = 0;
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+ __m128i acc_diff = _mm_setzero_si128();
+ const __m128i k_0 = _mm_setzero_si128();
+ const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+ const __m128i k_8 = _mm_set1_epi8(8);
+ const __m128i k_16 = _mm_set1_epi8(16);
+ // Modify each level's adjustment according to motion_magnitude.
+ const __m128i l3 = _mm_set1_epi8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
+ // Difference between level 3 and level 2 is 2.
+ const __m128i l32 = _mm_set1_epi8(2);
+ // Difference between level 2 and level 1 is 1.
+ const __m128i l21 = _mm_set1_epi8(1);
+ const int b_height = block_size_high[bs] >> 1;
+
+ for (r = 0; r < b_height; ++r) {
+ memcpy(sig_buffer[r], sig, width);
+ memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+ memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+ memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
+ width);
+ memcpy(running_buffer[r], running_avg_y, width);
+ memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+ acc_diff = av1_denoiser_16x1_sse2(sig_buffer[r], mc_running_buffer[r],
+ running_buffer[r], &k_0, &k_4, &k_8,
+ &k_16, &l3, &l32, &l21, acc_diff);
+ memcpy(running_avg_y, running_buffer[r], width);
+ memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
+ // Update pointers for next iteration.
+ sig += (sig_stride << 1);
+ mc_running_avg_y += (mc_avg_y_stride << 1);
+ running_avg_y += (avg_y_stride << 1);
+ }
+
+ {
+ sum_diff = sum_diff_16x1(acc_diff);
+ sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising),
+      // check if we can still apply some (weaker) temporal filtering to
+      // this block, which would otherwise not be denoised at all. The
+      // simplest approach is to apply an additional adjustment to
+      // running_avg_y to bring it closer to sig. The adjustment is capped
+      // by a maximum delta, and chosen such that in most cases the
+      // resulting sum_diff will be within the acceptable range given by
+      // sum_diff_thresh.
+
+ // The delta is set by the excess of absolute pixel diff over the
+ // threshold.
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const __m128i k_delta = _mm_set1_epi8(delta);
+ running_avg_y -= avg_y_stride * (b_height << 1);
+ for (r = 0; r < b_height; ++r) {
+ acc_diff = av1_denoiser_adj_16x1_sse2(
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_0,
+ k_delta, acc_diff);
+ memcpy(running_avg_y, running_buffer[r], width);
+ memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width,
+ width);
+ // Update pointers for next iteration.
+ running_avg_y += (avg_y_stride << 1);
+ }
+ sum_diff = sum_diff_16x1(acc_diff);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+ return FILTER_BLOCK;
+}
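
The delta above scales the overshoot down by the block's pixel count; a worked example under an assumed threshold (the real value comes from total_adj_strong_thresh()):

// Worked example (threshold value assumed for illustration):
// 8x16 block -> 128 pixels, num_pels_log2_lookup[bs] == 7.
// sum_diff = 900, sum_diff_thresh = 640:
//   delta = ((900 - 640) >> 7) + 1 = (260 >> 7) + 1 = 2 + 1 = 3
// delta < 4, so the weaker av1_denoiser_adj_16x1_sse2() pass still runs
// with k_delta = 3 instead of copying the block outright.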
+
+// Denoise 16x16 to 128x128 blocks.
+static int av1_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride,
+ uint8_t *running_avg_y, int avg_y_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude) {
+ int sum_diff_thresh, r, c, sum_diff = 0;
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ __m128i acc_diff[8][8];
+ const __m128i k_0 = _mm_setzero_si128();
+ const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+ const __m128i k_8 = _mm_set1_epi8(8);
+ const __m128i k_16 = _mm_set1_epi8(16);
+ // Modify each level's adjustment according to motion_magnitude.
+ const __m128i l3 = _mm_set1_epi8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
+ // Difference between level 3 and level 2 is 2.
+ const __m128i l32 = _mm_set1_epi8(2);
+ // Difference between level 2 and level 1 is 1.
+ const __m128i l21 = _mm_set1_epi8(1);
+ const int b_width = block_size_wide[bs];
+ const int b_height = block_size_high[bs];
+ const int b_width_shift4 = b_width >> 4;
+
+ for (r = 0; r < 8; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ acc_diff[c][r] = _mm_setzero_si128();
+ }
+ }
+
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ acc_diff[c][r >> 4] = av1_denoiser_16x1_sse2(
+ sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, &k_8, &k_16, &l3,
+ &l32, &l21, acc_diff[c][r >> 4]);
+ // Update pointers for next iteration.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
+ }
+ }
+
+ // Update pointers for next iteration.
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+
+ {
+ sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const __m128i k_delta = _mm_set1_epi8(delta);
+ sig -= sig_stride * b_height;
+ mc_running_avg_y -= mc_avg_y_stride * b_height;
+ running_avg_y -= avg_y_stride * b_height;
+ sum_diff = 0;
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ acc_diff[c][r >> 4] =
+ av1_denoiser_adj_16x1_sse2(sig, mc_running_avg_y, running_avg_y,
+ k_0, k_delta, acc_diff[c][r >> 4]);
+ // Update pointers for next iteration.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
+ }
+ }
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+ return FILTER_BLOCK;
+}
+
+int av1_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_avg, int mc_avg_stride,
+ uint8_t *avg, int avg_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude) {
+  // Block sizes are checked in order of frequency so that the most common
+  // cases terminate early.
+ if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
+ bs == BLOCK_128X128 || bs == BLOCK_128X64 || bs == BLOCK_64X128 ||
+ bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
+ bs == BLOCK_32X64 || bs == BLOCK_64X32) {
+ return av1_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride,
+ avg, avg_stride, increase_denoising, bs,
+ motion_magnitude);
+ } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+ return av1_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride,
+ avg, avg_stride, increase_denoising, bs,
+ motion_magnitude, 8);
+ } else {
+ return COPY_BLOCK;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
new file mode 100644
index 0000000000..7a0f32898b
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+#define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+
+#include <smmintrin.h>
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct8_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct16_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+ const int stride);
+void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
+ const int instride, const int outstride);
+void av1_fadst4_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst8_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst16_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idct4_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct8_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct16_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct32_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct64_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst8_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst16_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+ const int col_num);
+
+static INLINE void transpose_32_4x4(int stride, const __m128i *input,
+ __m128i *output) {
+ __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
+ __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
+ __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
+ __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
+
+ output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+ output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+ output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+ output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+// The entire input block can be represented by a grid of 4x4 blocks, and
+// each 4x4 block can be represented by 4 vertical __m128i registers.
+// We first transpose each 4x4 block internally,
+// then transpose the grid.
+static INLINE void transpose_32(int txfm_size, const __m128i *input,
+ __m128i *output) {
+ const int num_per_128 = 4;
+ const int row_size = txfm_size;
+ const int col_size = txfm_size / num_per_128;
+ int r, c;
+
+ // transpose each 4x4 block internally
+ for (r = 0; r < row_size; r += 4) {
+ for (c = 0; c < col_size; c++) {
+ transpose_32_4x4(col_size, &input[r * col_size + c],
+ &output[c * 4 * col_size + r / 4]);
+ }
+ }
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ const __m128i ww0 = _mm_set1_epi32(w0); \
+ const __m128i ww1 = _mm_set1_epi32(w1); \
+ const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = av1_round_shift_32_sse4_1(out0, bit); \
+ const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in0_w1, in1_w0); \
+ out1 = av1_round_shift_32_sse4_1(out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+ do { \
+ const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = _mm_add_epi32(out0, r); \
+ out0 = _mm_srai_epi32(out0, bit); \
+ const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in0_w1, in1_w0); \
+ out1 = _mm_add_epi32(out1, r); \
+ out1 = _mm_srai_epi32(out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+ do { \
+ btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit); \
+ } while (0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
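
The btf macros above are the standard rounded rotation butterfly; in scalar form the type0 variant is (illustrative only):

#include <stdint.h>

// Scalar form of btf_32_sse4_1_type0: a rounded planar rotation.
static void btf_32_type0_sketch(int32_t w0, int32_t w1, int32_t in0,
                                int32_t in1, int32_t *out0, int32_t *out1,
                                int bit) {
  const int64_t r = (int64_t)1 << (bit - 1);  // rounding offset
  *out0 = (int32_t)(((int64_t)in0 * w0 + (int64_t)in1 * w1 + r) >> bit);
  *out1 = (int32_t)(((int64_t)in0 * w1 - (int64_t)in1 * w0 + r) >> bit);
}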
diff --git a/third_party/aom/av1/encoder/x86/cnn_avx2.c b/third_party/aom/av1/encoder/x86/cnn_avx2.c
new file mode 100644
index 0000000000..ee93b3d5a0
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/cnn_avx2.c
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/encoder/cnn.h"
+
+// This mask rearranges source pixels in the order shown below.
+// shuffle_src_layer0[0][8]: applied on source pixels 0 to 7.
+// shuffle_src_layer0[1][8]: applied on source pixels 7 to 14.
+// This shuffling is needed to process 3 5x5 blocks which need
+// source pixels in the following order.
+// 1st 5x5 block: source pixels needed are 0 to 4,
+// 2nd 5x5 block: source pixels needed are 4 to 8,
+// 3rd 5x5 block: source pixels needed are 8 to 12.
+// Source pixels are loaded as shown below.
+// load_src0 : 0, 1, 2, 3, 4, 5, 6, 7
+// load_src1 : 7, 8, 9, 10, 11, 12, 13, 14
+// After applying the masks, the source pixels will be in the order:
+// load_src0 : 0, 1, 2, 3, 4, 4, 5, 6
+//   consists of the 5 pixels needed for the 1st 5x5 block and the
+//   first 3 pixels needed for the 2nd 5x5 block.
+// load_src1 : 7, 8, 8, 9, 10, 11, 12, x
+//   consists of the last 2 pixels needed for the 2nd 5x5 block and the
+//   5 pixels needed for the 3rd 5x5 block.
+DECLARE_ALIGNED(32, static const uint32_t,
+ shuffle_src_layer0[2][8]) = { { 0, 1, 2, 3, 4, 4, 5, 6 },
+ { 0, 1, 1, 2, 3, 4, 5, 0 } };
+
+// This mask rearranges the weights to match the shuffled source pixel order.
+DECLARE_ALIGNED(32, static const uint32_t,
+ shuffle_weight_layer0[2][8]) = { { 0, 1, 2, 3, 4, 0, 1, 2 },
+ { 3, 4, 0, 1, 2, 3, 4, 0 } };
+
+// Shuffle mask used to rearrange weights corresponding to layer 1 and layer 2.
+// For layer 1 and layer 2, convolution happens at 2x2 as filter_width and
+// filter_height are equal to 2, so the weights are rearranged in the
+// order shown below to match the source pixels. This mask effectively
+// replicates the weights across a width of 2.
+DECLARE_ALIGNED(32, static const uint32_t,
+ shuffle_weight_layer_1_and_2[2][8]) = {
+ { 0, 1, 0, 1, 0, 1, 0, 1 }, { 2, 3, 2, 3, 2, 3, 2, 3 }
+};
+
+// After the stages of multiplication and accumulation, the output values
+// in the register will be out of order. To store the register into the
+// output buffer in the proper order, the following mask is applied to the
+// output register.
+DECLARE_ALIGNED(32, static const uint32_t,
+ shuffle_output_layer_1_and_2[8]) = { 0, 1, 4, 5, 2, 3, 6, 7 };
+
+// Load weights needed for layer 0 (for 5x5 block processing),
+// and fill the registers appropriately to match source pixel mapping.
+static INLINE void prepare_weights_for_5x5_convolve(
+ const float *layer_config_weights, int off, float weight[5][8],
+ const int cstep, __m256 *shuffle_weight, const __m256i weight_mask_0,
+ const __m256i weight_mask_1) {
+ for (int row = 0; row < 5; ++row) {
+ for (int col = 0; col < 5; ++col) {
+ weight[row][col] = layer_config_weights[off];
+ off += cstep;
+ }
+ }
+ shuffle_weight[0] = _mm256_loadu_ps(weight[0]);
+ shuffle_weight[1] = _mm256_loadu_ps(weight[1]);
+ shuffle_weight[2] = _mm256_loadu_ps(weight[2]);
+ shuffle_weight[3] = _mm256_loadu_ps(weight[3]);
+ shuffle_weight[4] = _mm256_loadu_ps(weight[4]);
+
+ shuffle_weight[0] =
+ _mm256_permutevar8x32_ps(shuffle_weight[0], weight_mask_0);
+ shuffle_weight[1] =
+ _mm256_permutevar8x32_ps(shuffle_weight[1], weight_mask_0);
+ shuffle_weight[2] =
+ _mm256_permutevar8x32_ps(shuffle_weight[2], weight_mask_0);
+ shuffle_weight[3] =
+ _mm256_permutevar8x32_ps(shuffle_weight[3], weight_mask_0);
+ shuffle_weight[4] =
+ _mm256_permutevar8x32_ps(shuffle_weight[4], weight_mask_0);
+ shuffle_weight[5] =
+ _mm256_permutevar8x32_ps(shuffle_weight[0], weight_mask_1);
+ shuffle_weight[6] =
+ _mm256_permutevar8x32_ps(shuffle_weight[1], weight_mask_1);
+ shuffle_weight[7] =
+ _mm256_permutevar8x32_ps(shuffle_weight[2], weight_mask_1);
+ shuffle_weight[8] =
+ _mm256_permutevar8x32_ps(shuffle_weight[3], weight_mask_1);
+ shuffle_weight[9] =
+ _mm256_permutevar8x32_ps(shuffle_weight[4], weight_mask_1);
+}
+
+// For each row, loads source pixels 0 to 7 (load_src_0) and 7 to 14
+// (load_src_1), then arranges them appropriately to process 3 blocks.
+#define PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS() \
+ do { \
+ for (int row = 0; row < 5; row++) { \
+ load_src_0 = _mm256_loadu_ps(input_ptr); \
+ load_src_1 = _mm256_loadu_ps(input_ptr + 7); \
+ load_src_0 = _mm256_permutevar8x32_ps(load_src_0, block0_1); \
+ load_src_1 = _mm256_permutevar8x32_ps(load_src_1, block1_2); \
+ load_src_0 = _mm256_mul_ps(load_src_0, shuffle_weight[0 + row]); \
+ load_src_1 = _mm256_mul_ps(load_src_1, shuffle_weight[5 + row]); \
+ accum_src_0 = _mm256_add_ps(load_src_0, accum_src_0); \
+ accum_src_1 = _mm256_add_ps(load_src_1, accum_src_1); \
+ input_ptr += in_stride; \
+ } \
+ } while (0)
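
In scalar terms the macro accumulates three overlapping 5x5 dot products, one per output block, with the blocks' left edges skip_width == 4 pixels apart; a sketch of the equivalent loop (illustrative only):

// Scalar equivalent of one PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS invocation:
// three 5x5 blocks whose left edges are 4 pixels apart.
static void convolve_3_5x5_blocks_sketch(const float *in, int in_stride,
                                         const float weight[5][8],
                                         float out[3]) {
  for (int b = 0; b < 3; ++b) {
    float sum = 0.0f;
    for (int r = 0; r < 5; ++r) {
      for (int c = 0; c < 5; ++c) {
        sum += in[r * in_stride + b * 4 + c] * weight[r][c];
      }
    }
    out[b] += sum;
  }
}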
+
+// Load masks needed for shuffling of output and weights.
+static INLINE void load_shuffle_masks_for_2x2_convolve(__m256i *output_mask,
+ __m256i *weight_mask) {
+ // Load shuffle buffer needed to sort the output.
+ *output_mask =
+ _mm256_load_si256((const __m256i *)shuffle_output_layer_1_and_2);
+
+ // Load shuffle buffers needed for weight.
+ weight_mask[0] =
+ _mm256_load_si256((const __m256i *)shuffle_weight_layer_1_and_2[0]);
+ weight_mask[1] =
+ _mm256_load_si256((const __m256i *)shuffle_weight_layer_1_and_2[1]);
+}
+
+// Load weights needed for layer 1 and 2 (for 2x2 block processing),
+// and fill the registers appropriately to match source pixel mapping.
+static INLINE void prepare_weights_for_2x2_convolve(
+ const float *layer_config_weights, int off, const int cstep,
+ __m256 *shuffle_weight, __m256i *weight_mask) {
+ // Weights needed for 2x2 block.
+ float weight[4] = { 0 };
+ for (int i = 0; i < 4; ++i) {
+ weight[i] = layer_config_weights[off];
+ off += cstep;
+ }
+
+ const __m256 weight_vec = _mm256_castps128_ps256(_mm_loadu_ps(weight));
+ shuffle_weight[0] = _mm256_permutevar8x32_ps(weight_vec, weight_mask[0]);
+ shuffle_weight[1] = _mm256_permutevar8x32_ps(weight_vec, weight_mask[1]);
+}
+
+// Do convolution of one 5x5 block.
+#define PERFORM_CONVOLVE_FOR_1_5X5_BLOCK(w, accum0, in_stride) \
+ do { \
+ __m128 load_src[5]; \
+ load_src[0] = _mm_loadu_ps(input_ptr); \
+ last_column_sum += input_ptr[4] * weight[0][4]; \
+ input_ptr += in_stride; \
+ load_src[1] = _mm_loadu_ps(input_ptr); \
+ last_column_sum += input_ptr[4] * weight[1][4]; \
+ input_ptr += in_stride; \
+ load_src[2] = _mm_loadu_ps(input_ptr); \
+ last_column_sum += input_ptr[4] * weight[2][4]; \
+ input_ptr += in_stride; \
+ load_src[3] = _mm_loadu_ps(input_ptr); \
+ last_column_sum += input_ptr[4] * weight[3][4]; \
+ input_ptr += in_stride; \
+ load_src[4] = _mm_loadu_ps(input_ptr); \
+ last_column_sum += input_ptr[4] * weight[4][4]; \
+ \
+ load_src[0] = _mm_mul_ps(load_src[0], _mm256_castps256_ps128(w[0])); \
+ load_src[1] = _mm_mul_ps(load_src[1], _mm256_castps256_ps128(w[1])); \
+ load_src[2] = _mm_mul_ps(load_src[2], _mm256_castps256_ps128(w[2])); \
+ load_src[3] = _mm_mul_ps(load_src[3], _mm256_castps256_ps128(w[3])); \
+ load_src[4] = _mm_mul_ps(load_src[4], _mm256_castps256_ps128(w[4])); \
+ \
+ accum0 = _mm_add_ps(load_src[0], accum0); \
+ load_src[1] = _mm_add_ps(load_src[1], load_src[2]); \
+ load_src[3] = _mm_add_ps(load_src[3], load_src[4]); \
+ load_src[1] = _mm_add_ps(load_src[1], load_src[3]); \
+ accum0 = _mm_add_ps(accum0, load_src[1]); \
+ } while (0)
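
Because an __m128 holds only 4 floats, the macro above multiplies columns 0 to 3 with vector ops and folds column 4 into last_column_sum in scalar code; per row the split looks like this sketch (illustrative only):

// One row of a 5-wide dot product: 4 lanes vectorized, 1 scalar tail.
static float row5_dot_sketch(const float in[5], const float w[5]) {
  float vec = 0.0f;
  for (int c = 0; c < 4; ++c) vec += in[c] * w[c];  // the _mm_mul_ps part
  return vec + in[4] * w[4];                        // the last_column_sum part
}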
+
+// Do convolution on 8 horizontal 2x2 blocks.
+static INLINE void perform_convolve_for_8h_2x2_blocks(
+ const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum,
+ __m256i shuffle_output_mask) {
+ __m256 load_src[4];
+ // Load input into source registers.
+ load_src[0] = _mm256_loadu_ps(input_ptr);
+ load_src[1] = _mm256_loadu_ps(input_ptr + 8);
+ load_src[2] = _mm256_loadu_ps(input_ptr + in_stride);
+ load_src[3] = _mm256_loadu_ps(input_ptr + in_stride + 8);
+
+ // Multiply the loaded input with corresponding weights.
+ load_src[0] = _mm256_mul_ps(load_src[0], weight[0]);
+ load_src[1] = _mm256_mul_ps(load_src[1], weight[0]);
+ load_src[2] = _mm256_mul_ps(load_src[2], weight[1]);
+ load_src[3] = _mm256_mul_ps(load_src[3], weight[1]);
+
+ // Accumulate across 2x2 blocks.
+ load_src[0] = _mm256_add_ps(load_src[0], load_src[2]);
+ load_src[1] = _mm256_add_ps(load_src[1], load_src[3]);
+ load_src[0] = _mm256_hadd_ps(load_src[0], load_src[1]);
+
+ // Sort the output in order to store into output buffer.
+ load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask);
+ *out_accum = _mm256_add_ps(*out_accum, load_src[0]);
+}
+
+// Do convolution on 8 (4 horizontal x 2 vertical) 2x2 blocks.
+static INLINE void perform_convolve_for_4hx2v_2x2_blocks(
+ const float *input_ptr, int in_stride, __m256 *weight, __m256 *out_accum,
+ __m256i shuffle_output_mask) {
+ __m256 load_src[4];
+ // Load input into source registers.
+ load_src[0] = _mm256_loadu_ps(input_ptr);
+ load_src[1] = _mm256_loadu_ps(input_ptr + in_stride);
+ load_src[2] = _mm256_loadu_ps(input_ptr + (in_stride * 2));
+ load_src[3] = _mm256_loadu_ps(input_ptr + (in_stride * 3));
+
+ // Multiply the loaded input with corresponding weights.
+ load_src[0] = _mm256_mul_ps(load_src[0], weight[0]);
+ load_src[1] = _mm256_mul_ps(load_src[1], weight[1]);
+ load_src[2] = _mm256_mul_ps(load_src[2], weight[0]);
+ load_src[3] = _mm256_mul_ps(load_src[3], weight[1]);
+
+ // Accumulate across 2x2 blocks.
+ load_src[0] = _mm256_add_ps(load_src[0], load_src[1]);
+ load_src[2] = _mm256_add_ps(load_src[2], load_src[3]);
+ load_src[0] = _mm256_hadd_ps(load_src[0], load_src[2]);
+
+ // Sort the output in order to store into output buffer.
+ load_src[0] = _mm256_permutevar8x32_ps(load_src[0], shuffle_output_mask);
+ *out_accum = _mm256_add_ps(*out_accum, load_src[0]);
+}
+
+// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c() for the case
+// where filter_width and filter_height are equal to 5.
+// The CNN convolve configuration is taken from
+// av1_intra_mode_cnn_partition_cnn_config; with the parameters set for each
+// layer, the current encoder always takes the no_maxpool_padding_valid path.
+// For layer 0, filter_width and filter_height are set to 5, so convolution
+// is performed on 5x5 blocks.
+static void cnn_convolve_no_maxpool_padding_valid_5x5_avx2(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ int start_idx, const int cstep, const int channel_step) {
+ const int kFilterWidth = 5;
+ const int kFilterHeight = 5;
+ const int kSkipWidth = 4;
+ const int kSkipHeight = 4;
+ assert(layer_config->filter_width == kFilterWidth &&
+ layer_config->filter_height == kFilterHeight);
+ assert(layer_config->skip_width == kSkipWidth &&
+ layer_config->skip_height == kSkipHeight);
+
+ // Load shuffle buffers needed for source.
+ const __m256i block0_1 =
+ _mm256_load_si256((const __m256i *)shuffle_src_layer0[0]);
+ const __m256i block1_2 =
+ _mm256_load_si256((const __m256i *)shuffle_src_layer0[1]);
+
+ // Load shuffle buffers needed for weight.
+ const __m256i weight_mask_0 =
+ _mm256_load_si256((const __m256i *)shuffle_weight_layer0[0]);
+ const __m256i weight_mask_1 =
+ _mm256_load_si256((const __m256i *)shuffle_weight_layer0[1]);
+
+  // Width by which the input pointer must advance for the next iteration of
+  // processing 3 5x5 blocks.
+ const int kSkipWidthForNextIter = kSkipWidth * 3;
+
+  // Minimum width required to process 3 5x5 blocks at a time:
+  // min width = 2 * skip_width + filter_width (= 2 * 4 + 5 = 13 here).
+  // skip_width is the horizontal distance between consecutive block
+  // convolutions, and filter_width is the number of pixels the filter spans.
+ const int kMinWidthFor3_5x5Blocks = (kSkipWidth * 2) + kFilterWidth;
+ for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+ const float out_ch_bias = layer_config->bias[i];
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ __m256 shuffle_weight[10];
+
+      // The weights needed are 5x5; the array is padded to 5x8 for SIMD
+      // loads.
+ float weight[5][8] = { { 0 } };
+ int off = k * layer_config->out_channels + i;
+
+      // In layer 0, the convolution is performed on 5x5 blocks.
+      // The weights for a 5x5 block are the same for every block position
+      // within an in-channel, so they are loaded only once per in-channel.
+ prepare_weights_for_5x5_convolve(layer_config->weights, off, weight,
+ cstep, shuffle_weight, weight_mask_0,
+ weight_mask_1);
+
+ for (int h = 0, u = 0; h < in_height - kFilterHeight + 1;
+ h += kSkipHeight, ++u) {
+ const int out_h = u * out_stride;
+ int v = 0;
+ int w = 0;
+ int rem_width = in_width;
+ // Processing 3 5x5 blocks at a time, if sufficient width is present.
+ while (rem_width >= kMinWidthFor3_5x5Blocks) {
+ __m256 load_src_0, load_src_1;
+ __m256 accum_src_0 = _mm256_setzero_ps();
+ __m256 accum_src_1 = _mm256_setzero_ps();
+ const float *input_ptr = &input[k][h * in_stride + w];
+ PERFORM_CONVOLVE_FOR_3_5X5_BLOCKS();
+
+ // Accumulate across column.
+ __m256 accum = _mm256_hadd_ps(accum_src_0, accum_src_1);
+ __m128 tmp_reg_0 = _mm256_extractf128_ps(accum_src_0, 1);
+ __m128 tmp_reg_1 = _mm256_extractf128_ps(accum_src_1, 1);
+
+ __m128 accum_l = _mm256_castps256_ps128(accum);
+ __m128 accum_h = _mm256_extractf128_ps(accum, 1);
+
+ __m128 tmp_reg_2 = _mm_add_ps(accum_l, tmp_reg_0);
+ __m128 tmp_reg_3 = _mm_add_ps(tmp_reg_0, accum_h);
+ __m128 tmp_reg_4 = _mm_add_ps(tmp_reg_1, accum_h);
+
+ // 1st 5x5 block output.
+ output[i][out_h + v] =
+ out_ch_bias + _mm_cvtss_f32(tmp_reg_2) +
+ _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 1));
+
+ // 2nd 5x5 block output.
+ output[i][out_h + v + 1] =
+ out_ch_bias +
+ _mm_cvtss_f32(_mm_shuffle_ps(tmp_reg_3, tmp_reg_3, 1)) +
+ _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 2));
+
+ // 3rd 5x5 block output.
+ output[i][out_h + v + 2] =
+ out_ch_bias +
+ _mm_cvtss_f32(_mm_shuffle_ps(tmp_reg_4, tmp_reg_4, 2)) +
+ _mm_cvtss_f32(_mm_shuffle_ps(accum_l, accum_l, 3));
+
+ v += 3;
+ w += kSkipWidthForNextIter;
+ rem_width -= kSkipWidthForNextIter;
+ }
+
+          // Process the remaining blocks one 5x5 block at a time.
+ while (rem_width >= kFilterWidth) {
+ float last_column_sum = 0;
+ __m128 accum = _mm_setzero_ps();
+ const float *input_ptr = &input[k][h * in_stride + w];
+ PERFORM_CONVOLVE_FOR_1_5X5_BLOCK(shuffle_weight, accum, in_stride);
+
+ // Accumulate across column.
+ accum = _mm_hadd_ps(accum, accum);
+ output[i][out_h + v] = out_ch_bias + last_column_sum +
+ _mm_cvtss_f32(accum) +
+ _mm_cvtss_f32(_mm_shuffle_ps(accum, accum, 1));
+
+ v += 1;
+ w += kSkipWidth;
+ rem_width -= kSkipWidth;
+ }
+ }
+ }
+ }
+}
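+
+// For reference, a scalar sketch of what the kernel above computes for one
+// output point (u, v) (illustrative pseudo-C mirroring
+// av1_cnn_convolve_no_maxpool_padding_valid_c, assuming row-major tap order
+// and cstep == in_channels * out_channels; weights/bias stand for
+// layer_config->weights/bias):
+//   float sum = bias[i];
+//   for (int l = 0; l < 5; ++l)
+//     for (int m = 0; m < 5; ++m)
+//       sum += input[k][(h + l) * in_stride + w + m] *
+//              weights[k * out_channels + i + (l * 5 + m) * cstep];
+//   output[i][u * out_stride + v] = sum;
+// For layer 0 in_channels is 1, so the 'k' loop runs exactly once per output.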
+
+// AVX2 implementation for layer 1.
+static INLINE void cnn_convolve_no_maxpool_padding_valid_layer1_avx2(
+ const float **input, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ int start_idx, const int cstep, const int channel_step) {
+ __m256i weight_mask[2];
+ __m256i shuffle_output_mask;
+ load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask);
+
+ const int kInHeight = 16;
+ const int kFilterHeight = 2;
+ const int kSkipHeight = 2;
+ for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+ __m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]);
+    // The out_accum registers store the 2x2 convolve outputs (computed over
+    // the input block), accumulated across the in-channels. Each iteration of
+    // the 'h' loop processes 8 horizontal 2x2 blocks and stores them in the
+    // corresponding out_accum register: the input is 16x16, so there are
+    // 64 2x2 blocks in total and 8 out_accum registers suffice to hold the
+    // outputs. Hence the 'j' and 'h' loops below run over the number of
+    // out_accum registers.
+ __m256 out_accum[8];
+ for (int j = 0; j < 8; ++j) out_accum[j] = bias_reg;
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ __m256 shuffle_weight[2];
+ int off = k * layer_config->out_channels + i;
+      // In layer 1, the convolution is performed on 2x2 blocks.
+      // The weights for a 2x2 block are the same for every block position
+      // within an in-channel, so they are loaded only once per in-channel.
+ prepare_weights_for_2x2_convolve(layer_config->weights, off, cstep,
+ shuffle_weight, weight_mask);
+
+ for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1;
+ h += kSkipHeight, ++u) {
+ const float *input_ptr = &input[k][h * in_stride];
+ perform_convolve_for_8h_2x2_blocks(input_ptr, in_stride, shuffle_weight,
+ &out_accum[u], shuffle_output_mask);
+ }
+ }
+ // Store output of layer 1.
+ for (int j = 0; j < 8; ++j) {
+ _mm256_storeu_ps(&output[i][j * out_stride], out_accum[j]);
+ }
+ }
+}
+
+// AVX2 implementation for layer 2.
+static INLINE void cnn_convolve_no_maxpool_padding_valid_layer2_avx2(
+ const float **input, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ int start_idx, const int cstep, const int channel_step) {
+ __m256i weight_mask[2];
+ __m256i shuffle_output_mask;
+ load_shuffle_masks_for_2x2_convolve(&shuffle_output_mask, weight_mask);
+
+ const int kInHeight = 8;
+ const int kFilterHeight = 2;
+ const int kSkipHeight = 2;
+ for (int i = start_idx; i < layer_config->out_channels; i += channel_step) {
+ __m256 bias_reg = _mm256_set1_ps(layer_config->bias[i]);
+    // The out_accum registers store the 2x2 convolve outputs (computed over
+    // the input block), accumulated across the in-channels. Each iteration of
+    // the 'h' loop processes 8 (4 horizontal x 2 vertical) 2x2 blocks and
+    // stores them in the corresponding out_accum register: the input is 8x8,
+    // so there are 16 2x2 blocks in total and 2 out_accum registers suffice
+    // to hold the outputs. Hence the 'j' and 'h' loops below run over the
+    // number of out_accum registers.
+ __m256 out_accum[2];
+
+    // Height by which the input pointer must advance for the next iteration
+    // when processing 2 2x2 blocks vertically.
+ const int kSkipHeightForNextIter = kSkipHeight * 2;
+ for (int j = 0; j < 2; ++j) out_accum[j] = bias_reg;
+ for (int k = 0; k < layer_config->in_channels; ++k) {
+ __m256 shuffle_weight[2];
+ int off = k * layer_config->out_channels + i;
+      // In layer 2, the convolution is performed on 2x2 blocks.
+      // The weights for a 2x2 block are the same for every block position
+      // within an in-channel, so they are loaded only once per in-channel.
+ prepare_weights_for_2x2_convolve(layer_config->weights, off, cstep,
+ shuffle_weight, weight_mask);
+
+ for (int h = 0, u = 0; h < kInHeight - kFilterHeight + 1;
+ h += kSkipHeightForNextIter, ++u) {
+ const float *input_ptr = &input[k][h * in_stride];
+ perform_convolve_for_4hx2v_2x2_blocks(input_ptr, in_stride,
+ shuffle_weight, &out_accum[u],
+ shuffle_output_mask);
+ }
+ }
+ // Store output of layer 2.
+ for (int j = 0; j < 2; ++j) {
+ _mm256_storeu_ps(&output[i][j * out_stride * 2], out_accum[j]);
+ }
+ }
+}
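+
+// Note: each out_accum register above holds two 4-wide output rows, so each
+// store writes rows 2j and 2j+1 with a single 8-float store; this assumes
+// out_stride is 4 (the 4x4 output of layer 2).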
+
+// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c() for the case
+// where filter_width and filter_height are equal to 2.
+// In the layer config set by av1_intra_mode_cnn_partition_cnn_config,
+// filter_width and filter_height are 2 for every layer >= 1, so those layers
+// convolve on 2x2 blocks.
+void cnn_convolve_no_maxpool_padding_valid_2x2_avx2(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *const layer_config, float **output, int out_stride,
+ int start_idx, const int cstep, const int channel_step) {
+ assert(layer_config->filter_width == 2 && layer_config->filter_height == 2);
+ assert(layer_config->skip_width == 2 && layer_config->skip_height == 2);
+
+ if (in_width == 16 && in_height == 16) {
+ // This case of in_width and in_height equal to 16 corresponds to layer 1.
+ // The output size of this layer is 8x8.
+ cnn_convolve_no_maxpool_padding_valid_layer1_avx2(
+ input, in_stride, layer_config, output, out_stride, start_idx, cstep,
+ channel_step);
+ } else if (in_width == 8 && in_height == 8) {
+ // This case of in_width and in_height equal to 8 corresponds to layer 2.
+ // The output size of this layer is 4x4.
+ cnn_convolve_no_maxpool_padding_valid_layer2_avx2(
+ input, in_stride, layer_config, output, out_stride, start_idx, cstep,
+ channel_step);
+ } else {
+    // For layers 3 and 4, the input sizes are 4x4 and 2x2 respectively.
+    // SIMD is unlikely to pay off at these sizes, which is why the C path is
+    // used for layer >= 3.
+ av1_cnn_convolve_no_maxpool_padding_valid_c(
+ input, in_width, in_height, in_stride, layer_config, output, out_stride,
+ start_idx, cstep, channel_step);
+ }
+}
+
+// AVX2 variant of av1_cnn_convolve_no_maxpool_padding_valid_c().
+// In the current encoder, av1_cnn_convolve() is called for a block size of
+// 64x64 and uses the layer config values set by
+// av1_intra_mode_cnn_partition_cnn_config. The table below summarizes each
+// layer's config parameters.
+// Layer_Number in_size out_size filter_wd filter_ht skip_wd skip_ht
+// 0 64x64 16x16 5 5 4 4
+// 1 16x16 8x8 2 2 2 2
+// 2 8x8 4x4 2 2 2 2
+// 3 4x4 2x2 2 2 2 2
+// 4 2x2 1x1 2 2 2 2
+// Here,
+// filter_wd = filter_width and filter_ht = filter_height,
+// skip_wd = skip_width and skip_ht = skip_height.
+void av1_cnn_convolve_no_maxpool_padding_valid_avx2(
+ const float **input, int in_width, int in_height, int in_stride,
+ const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride,
+ int start_idx, int cstep, int channel_step) {
+ if (layer_config->filter_width == 5 && layer_config->filter_height == 5 &&
+ layer_config->skip_width == 4 && layer_config->skip_height == 4) {
+ cnn_convolve_no_maxpool_padding_valid_5x5_avx2(
+ input, in_width, in_height, in_stride, layer_config, output, out_stride,
+ start_idx, cstep, channel_step);
+ } else if (layer_config->filter_width == 2 &&
+ layer_config->filter_height == 2 &&
+ layer_config->skip_width == 2 && layer_config->skip_height == 2) {
+ cnn_convolve_no_maxpool_padding_valid_2x2_avx2(
+ input, in_width, in_height, in_stride, layer_config, output, out_stride,
+ start_idx, cstep, channel_step);
+ } else {
+ av1_cnn_convolve_no_maxpool_padding_valid_c(
+ input, in_width, in_height, in_stride, layer_config, output, out_stride,
+ start_idx, cstep, channel_step);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/dct_sse2.asm b/third_party/aom/av1/encoder/x86/dct_sse2.asm
new file mode 100644
index 0000000000..b185548184
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/dct_sse2.asm
@@ -0,0 +1,82 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+ paddw m0, m1
+ movq m4, m0
+ psubw m3, m2
+ psubw m4, m3
+ psraw m4, 1
+ movq m5, m4
+ psubw m5, m1 ;b1
+ psubw m4, m2 ;c1
+ psubw m0, m4
+ paddw m3, m5
+ ; m0 a0
+ SWAP 1, 4 ; m1 c1
+ SWAP 2, 3 ; m2 d1
+ SWAP 3, 5 ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+ ; 00 01 02 03
+ ; 10 11 12 13
+ ; 20 21 22 23
+ ; 30 31 32 33
+ punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13
+ punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33
+ mova m1, m0
+ punpckldq m0, m2 ; 00 10 20 30 01 11 21 31
+ punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33
+%endmacro
+
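+; fwht4x4 is the 4x4 forward Walsh-Hadamard transform. Note that the output
+; type is tran_low_t (32 bits), which is why the 16-bit results are
+; sign-extended to doublewords before the final stores.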
+INIT_XMM sse2
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+ lea r3q, [inputq + strideq*4]
+ movq m0, [inputq] ;a1
+ movq m1, [inputq + strideq*2] ;b1
+ movq m2, [r3q] ;c1
+ movq m3, [r3q + strideq*2] ;d1
+
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+ SWAP 1, 2
+ psrldq m1, m0, 8
+ psrldq m3, m2, 8
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+
+ psllw m0, 2
+ psllw m1, 2
+
+ ; sign extension
+ mova m2, m0
+ mova m3, m1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
+ punpckhwd m2, m2
+ punpckhwd m3, m3
+ psrad m0, 16
+ psrad m1, 16
+ psrad m2, 16
+ psrad m3, 16
+ mova [outputq], m0
+ mova [outputq + 16], m2
+ mova [outputq + 32], m1
+ mova [outputq + 48], m3
+
+ RET
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_avx2.c b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c
new file mode 100644
index 0000000000..9627f75930
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+#include <immintrin.h> /* AVX2 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
+ const int stride = height + TX_PAD_HOR;
+ const __m256i y_zeros = _mm256_setzero_si256();
+
+ const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+ uint8_t *bottom_buf_end = levels + (width + TX_PAD_BOTTOM) * stride;
+ uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31));
+
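+  // Rounding bottom_len up to a multiple of 32 may start the zeroing inside
+  // the last real column; this is harmless because the level-filling loops
+  // below rewrite every real column.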
+ do {
+ yy_storeu_256(bottom_buf, y_zeros);
+ bottom_buf += 32;
+ } while (bottom_buf < bottom_buf_end);
+
+ int i = 0;
+ uint8_t *ls = levels;
+ const tran_low_t *cf = coeff;
+ if (height == 4) {
+ do {
+ const __m256i c0 = yy_loadu_256(cf);
+ const __m256i c1 = yy_loadu_256(cf + 8);
+ const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1));
+ const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros);
+ const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8);
+ const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8);
+ yy_storeu_256(ls, res);
+ ls += 32;
+ cf += 16;
+ i += 4;
+ } while (i < width);
+ } else if (height == 8) {
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ const __m128i res0 = _mm256_castsi256_si128(res);
+ const __m128i res1 = _mm256_extracti128_si256(res, 1);
+ xx_storel_64(ls, res0);
+ *(int32_t *)(ls + height) = 0;
+ xx_storel_64(ls + stride, _mm_srli_si128(res0, 8));
+ *(int32_t *)(ls + height + stride) = 0;
+ xx_storel_64(ls + stride * 2, res1);
+ *(int32_t *)(ls + height + stride * 2) = 0;
+ xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8));
+ *(int32_t *)(ls + height + stride * 3) = 0;
+ cf += 32;
+ ls += stride << 2;
+ i += 4;
+ } while (i < width);
+ } else if (height == 16) {
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ xx_storeu_128(ls, _mm256_castsi256_si128(res));
+ xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1));
+ cf += 32;
+ *(int32_t *)(ls + height) = 0;
+ *(int32_t *)(ls + stride + height) = 0;
+ ls += stride << 1;
+ i += 2;
+ } while (i < width);
+ } else {
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ yy_storeu_256(ls, res);
+ cf += 32;
+ *(int32_t *)(ls + height) = 0;
+ ls += stride;
+ i += 1;
+ } while (i < width);
+ }
+}
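+
+// Layout note (a worked example, assuming TX_PAD_HOR == 4 as defined in
+// txb_common.h): for height == 16 the stride is 20, column c of the
+// transform block starts at levels[c * 20], and the 4 horizontal pad bytes
+// after each column are the ones cleared by the
+// `*(int32_t *)(ls + height) = 0` stores above.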
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse2.c b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c
new file mode 100644
index 0000000000..d23a688747
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+
+static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ __m128i *const level) {
+ level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride);
+ level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride);
+ level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride);
+ level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride);
+ level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ __m128i *const level) {
+ level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride);
+ level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride);
+ level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride);
+ level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride);
+ level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ __m128i *const level) {
+ level[0] = _mm_loadu_si128((__m128i *)(src + 1));
+ level[1] = _mm_loadu_si128((__m128i *)(src + stride));
+ level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0]));
+ level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1]));
+ level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2]));
+}
+
+static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) {
+ const __m128i const_3 = _mm_set1_epi8(3);
+ const __m128i const_4 = _mm_set1_epi8(4);
+ __m128i count;
+
+ count = _mm_min_epu8(level[0], const_3);
+ level[1] = _mm_min_epu8(level[1], const_3);
+ level[2] = _mm_min_epu8(level[2], const_3);
+ level[3] = _mm_min_epu8(level[3], const_3);
+ level[4] = _mm_min_epu8(level[4], const_3);
+ count = _mm_add_epi8(count, level[1]);
+ count = _mm_add_epi8(count, level[2]);
+ count = _mm_add_epi8(count, level[3]);
+ count = _mm_add_epi8(count, level[4]);
+ count = _mm_avg_epu8(count, _mm_setzero_si128());
+ count = _mm_min_epu8(count, const_4);
+ return count;
+}
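+
+// The kernel above computes, per coefficient,
+//   min((sum of the 5 clamped neighbor levels + 1) >> 1, 4),
+// since _mm_avg_epu8(x, 0) is (x + 1) >> 1. For example, neighbor levels
+// {3, 2, 1, 0, 5} clamp to {3, 2, 1, 0, 3}, sum to 9, and give
+// min((9 + 1) >> 1, 4) = min(5, 4) = 4.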
+
+static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ int8_t *const coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(21);
+ __m128i pos_to_offset =
+ (width == 4)
+ ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21)
+ : _mm_setr_epi8(0, 16, 16, 16, 16, 16, 16, 16, 6, 6, 21, 21, 6, 21,
+ 21, 21);
+ __m128i count;
+ __m128i level[5];
+ int8_t *cc = coeff_contexts;
+ int col = width;
+
+ assert(!(width % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)cc, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 4 * stride;
+ cc += 16;
+ col -= 4;
+ } while (col);
+
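+  // The DC coefficient (position 0) always uses context 0, so overwrite
+  // whatever the vectorized pass stored there.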
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int col = width;
+
+ assert(!(width % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 4 * stride;
+ coeff_contexts += 16;
+ col -= 4;
+ } while (col);
+}
+
+static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int col = width;
+
+ assert(!(width % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 4 * stride;
+ coeff_contexts += 16;
+ col -= 4;
+ } while (col);
+}
+
+static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ int8_t *cc = coeff_contexts;
+ int col = width;
+ __m128i count;
+ __m128i level[5];
+ __m128i pos_to_offset[3];
+
+ assert(!(width % 2));
+
+ if (width == 8) {
+ pos_to_offset[0] =
+ _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
+ 21, 21, 21, 21, 21);
+ } else if (width < 8) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 11, 6, 6, 21, 21, 21, 21, 11, 11, 6, 21,
+ 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(11, 11, 21, 21, 21, 21, 21, 21, 11, 11, 21,
+ 21, 21, 21, 21, 21);
+ } else {
+ pos_to_offset[0] = _mm_setr_epi8(0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 16, 16, 16, 16, 16);
+ pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
+ 21, 21, 21, 21, 21);
+ }
+ pos_to_offset[2] = _mm_set1_epi8(21);
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)cc, count);
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ levels += 2 * stride;
+ cc += 16;
+ col -= 2;
+ } while (col);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ const __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ int col = width;
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(width % 2));
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 2 * stride;
+ coeff_contexts += 16;
+ col -= 2;
+ } while (col);
+}
+
+static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
+ const int width,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5);
+ int col = width;
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(width % 2));
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 2 * stride;
+ coeff_contexts += 16;
+ col -= 2;
+ } while (col);
+}
+
+static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
+ const int real_width,
+ const int real_height,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = height + TX_PAD_HOR;
+ int8_t *cc = coeff_contexts;
+ int col = width;
+ __m128i pos_to_offset[5];
+ __m128i pos_to_offset_large[3];
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(height % 16));
+
+ pos_to_offset_large[2] = _mm_set1_epi8(21);
+ if (real_width == real_height) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
+ pos_to_offset_large[2];
+ } else if (real_width < real_height) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(11, 11, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8(
+ 11, 11, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
+ pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
+ } else { // real_width > real_height
+ pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8(
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
+ pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[4] = pos_to_offset_large[2];
+ pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(16);
+ }
+
+ do {
+ int h = height;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)cc, count);
+ levels += 16;
+ cc += 16;
+ h -= 16;
+ pos_to_offset[0] = pos_to_offset_large[0];
+ } while (h);
+
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ pos_to_offset[2] = pos_to_offset[3];
+ pos_to_offset[3] = pos_to_offset[4];
+ pos_to_offset_large[0] = pos_to_offset_large[1];
+ pos_to_offset_large[1] = pos_to_offset_large[2];
+ levels += TX_PAD_HOR;
+ } while (--col);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = height + TX_PAD_HOR;
+ const __m128i pos_to_offset_large =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int col = width;
+
+ assert(!(height % 16));
+
+ do {
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ int h = height;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 16;
+ coeff_contexts += 16;
+ h -= 16;
+ } while (h);
+
+ levels += TX_PAD_HOR;
+ } while (--col);
+}
+
+static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = height + TX_PAD_HOR;
+ __m128i pos_to_offset[3];
+ __m128i count;
+ __m128i level[5];
+ int col = width;
+
+ assert(!(height % 16));
+
+ pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0);
+ pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5);
+ pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+
+ do {
+ int h = height;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 16;
+ coeff_contexts += 16;
+ h -= 16;
+ } while (h);
+
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ levels += TX_PAD_HOR;
+ } while (--col);
+}
+
+// Note: levels[] must be in the range [0, 127], inclusive.
+void av1_get_nz_map_contexts_sse2(const uint8_t *const levels,
+ const int16_t *const scan, const uint16_t eob,
+ const TX_SIZE tx_size,
+ const TX_CLASS tx_class,
+ int8_t *const coeff_contexts) {
+ const int last_idx = eob - 1;
+ if (!last_idx) {
+ coeff_contexts[0] = 0;
+ return;
+ }
+
+ const int real_width = tx_size_wide[tx_size];
+ const int real_height = tx_size_high[tx_size];
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const int stride = height + TX_PAD_HOR;
+ ptrdiff_t offsets[3];
+
+ /* coeff_contexts must be 16 byte aligned. */
+ assert(!((intptr_t)coeff_contexts & 0xf));
+
+ if (tx_class == TX_CLASS_2D) {
+ offsets[0] = 0 * stride + 2;
+ offsets[1] = 1 * stride + 1;
+ offsets[2] = 2 * stride + 0;
+
+    if (height == 4) {
+      get_4_nz_map_contexts_2d(levels, width, offsets, coeff_contexts);
+    } else if (height == 8) {
+      get_8_coeff_contexts_2d(levels, width, offsets, coeff_contexts);
+    } else {  // height >= 16
+      get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+                                offsets, coeff_contexts);
+    }
+ } else if (tx_class == TX_CLASS_HORIZ) {
+ offsets[0] = 2 * stride;
+ offsets[1] = 3 * stride;
+ offsets[2] = 4 * stride;
+ if (height == 4) {
+ get_4_nz_map_contexts_hor(levels, width, offsets, coeff_contexts);
+ } else if (height == 8) {
+ get_8_coeff_contexts_hor(levels, width, offsets, coeff_contexts);
+ } else {
+ get_16n_coeff_contexts_hor(levels, width, height, offsets,
+ coeff_contexts);
+ }
+ } else { // TX_CLASS_VERT
+ offsets[0] = 2;
+ offsets[1] = 3;
+ offsets[2] = 4;
+ if (height == 4) {
+ get_4_nz_map_contexts_ver(levels, width, offsets, coeff_contexts);
+ } else if (height == 8) {
+ get_8_coeff_contexts_ver(levels, width, offsets, coeff_contexts);
+ } else {
+ get_16n_coeff_contexts_ver(levels, width, height, offsets,
+ coeff_contexts);
+ }
+ }
+
+ const int bhl = get_txb_bhl(tx_size);
+ const int pos = scan[last_idx];
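+  // (width << bhl) equals width * height, the number of coefficients in the
+  // block, so the last significant coefficient is classified by whether it
+  // lies in the first 1/8, the first 1/4, or the remainder of the scan
+  // (e.g. for a 16x16 block: last_idx <= 32 -> 1, <= 64 -> 2, else 3).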
+ if (last_idx <= (width << bhl) / 8)
+ coeff_contexts[pos] = 1;
+ else if (last_idx <= (width << bhl) / 4)
+ coeff_contexts[pos] = 2;
+ else
+ coeff_contexts[pos] = 3;
+}
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
new file mode 100644
index 0000000000..72bd8e3411
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
+ const int stride = height + TX_PAD_HOR;
+ const __m128i zeros = _mm_setzero_si128();
+
+ const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+ uint8_t *bottom_buf = levels + stride * width;
+ uint8_t *bottom_buf_end = bottom_buf + bottom_len;
+ do {
+ _mm_storeu_si128((__m128i *)(bottom_buf), zeros);
+ bottom_buf += 16;
+ } while (bottom_buf < bottom_buf_end);
+
+ int i = 0;
+ uint8_t *ls = levels;
+ const tran_low_t *cf = coeff;
+ if (height == 4) {
+ do {
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
+ const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+ const __m128i absAB = _mm_abs_epi16(coeffAB);
+ const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+ const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
+ xx_storeu_128(ls, lsAB);
+ ls += (stride << 1);
+ cf += (height << 1);
+ i += 2;
+ } while (i < width);
+ } else if (height == 8) {
+ do {
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
+ const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+ const __m128i absAB = _mm_abs_epi16(coeffAB);
+ const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+ xx_storeu_128(ls, absAB8);
+ ls += stride;
+ cf += height;
+ i += 1;
+ } while (i < width);
+ } else {
+ do {
+ int j = 0;
+ do {
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
+ const __m128i coeffC = xx_loadu_128(cf + 8);
+ const __m128i coeffD = xx_loadu_128(cf + 12);
+ const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+ const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
+ const __m128i absAB = _mm_abs_epi16(coeffAB);
+ const __m128i absCD = _mm_abs_epi16(coeffCD);
+ const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
+ xx_storeu_128(ls + j, absABCD);
+ j += 16;
+ cf += 16;
+ } while (j < height);
+ *(int32_t *)(ls + height) = 0;
+ ls += stride;
+ i += 1;
+ } while (i < width);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
new file mode 100644
index 0000000000..57725d1795
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+ __m256i *c) {
+ const tran_low_t *addr = coeff + offset;
+
+ if (sizeof(tran_low_t) == 4) {
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
+ const __m256i y = _mm256_packs_epi32(x0, x1);
+ *c = _mm256_permute4x64_epi64(y, 0xD8);
+ } else {
+ *c = _mm256_loadu_si256((const __m256i *)addr);
+ }
+}
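+
+// Note on the 32-bit branch above: _mm256_packs_epi32(x0, x1) packs per
+// 128-bit lane, producing 64-bit groups in the order {x0.lo, x1.lo, x0.hi,
+// x1.hi}, and _mm256_permute4x64_epi64(y, 0xD8) (lane order {0, 2, 1, 3})
+// restores the original coefficient order.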
+
+static INLINE void av1_block_error_num_coeff16_avx2(const int16_t *coeff,
+ const int16_t *dqcoeff,
+ __m256i *sse_256) {
+ const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff);
+ // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15
+ const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+ // r0 r1 r2 r3 r4 r5 r6 r7
+ const __m256i error = _mm256_madd_epi16(diff, diff);
+ // r0+r1 r2+r3 | r0+r1 r2+r3 | r4+r5 r6+r7 | r4+r5 r6+r7
+ const __m256i error_hi = _mm256_hadd_epi32(error, error);
+ // r0+r1 | r2+r3 | r4+r5 | r6+r7
+ *sse_256 = _mm256_unpacklo_epi32(error_hi, _mm256_setzero_si256());
+}
+
+static INLINE void av1_block_error_num_coeff32_avx2(const int16_t *coeff,
+ const int16_t *dqcoeff,
+ __m256i *sse_256) {
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff);
+ const __m256i _coeff_1 = _mm256_loadu_si256((const __m256i *)(coeff + 16));
+ const __m256i _dqcoeff_1 =
+ _mm256_loadu_si256((const __m256i *)(dqcoeff + 16));
+
+ // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15
+ const __m256i diff_0 = _mm256_sub_epi16(_dqcoeff_0, _coeff_0);
+ const __m256i diff_1 = _mm256_sub_epi16(_dqcoeff_1, _coeff_1);
+
+ // r0 r1 r2 r3 r4 r5 r6 r7
+ const __m256i error_0 = _mm256_madd_epi16(diff_0, diff_0);
+ const __m256i error_1 = _mm256_madd_epi16(diff_1, diff_1);
+ const __m256i err_final_0 = _mm256_add_epi32(error_0, error_1);
+
+ // For extreme input values, the accumulation needs to happen in 64 bit
+ // precision to avoid any overflow.
+ const __m256i exp0_error_lo = _mm256_unpacklo_epi32(err_final_0, zero);
+ const __m256i exp0_error_hi = _mm256_unpackhi_epi32(err_final_0, zero);
+ const __m256i sum_temp_0 = _mm256_add_epi64(exp0_error_hi, exp0_error_lo);
+ *sse_256 = _mm256_add_epi64(*sse_256, sum_temp_0);
+}
+
+static INLINE void av1_block_error_num_coeff64_avx2(const int16_t *coeff,
+ const int16_t *dqcoeff,
+ __m256i *sse_256,
+ intptr_t num_coeff) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int i = 0; i < num_coeff; i += 64) {
+ // Load 64 elements for coeff and dqcoeff.
+ const __m256i _coeff_0 = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i _dqcoeff_0 = _mm256_loadu_si256((const __m256i *)dqcoeff);
+ const __m256i _coeff_1 = _mm256_loadu_si256((const __m256i *)(coeff + 16));
+ const __m256i _dqcoeff_1 =
+ _mm256_loadu_si256((const __m256i *)(dqcoeff + 16));
+ const __m256i _coeff_2 = _mm256_loadu_si256((const __m256i *)(coeff + 32));
+ const __m256i _dqcoeff_2 =
+ _mm256_loadu_si256((const __m256i *)(dqcoeff + 32));
+ const __m256i _coeff_3 = _mm256_loadu_si256((const __m256i *)(coeff + 48));
+ const __m256i _dqcoeff_3 =
+ _mm256_loadu_si256((const __m256i *)(dqcoeff + 48));
+
+ // d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15
+ const __m256i diff_0 = _mm256_sub_epi16(_dqcoeff_0, _coeff_0);
+ const __m256i diff_1 = _mm256_sub_epi16(_dqcoeff_1, _coeff_1);
+ const __m256i diff_2 = _mm256_sub_epi16(_dqcoeff_2, _coeff_2);
+ const __m256i diff_3 = _mm256_sub_epi16(_dqcoeff_3, _coeff_3);
+
+ // r0 r1 r2 r3 r4 r5 r6 r7
+ const __m256i error_0 = _mm256_madd_epi16(diff_0, diff_0);
+ const __m256i error_1 = _mm256_madd_epi16(diff_1, diff_1);
+ const __m256i error_2 = _mm256_madd_epi16(diff_2, diff_2);
+ const __m256i error_3 = _mm256_madd_epi16(diff_3, diff_3);
+ // r00 r01 r02 r03 r04 r05 r06 r07
+ const __m256i err_final_0 = _mm256_add_epi32(error_0, error_1);
+ // r10 r11 r12 r13 r14 r15 r16 r17
+ const __m256i err_final_1 = _mm256_add_epi32(error_2, error_3);
+
+ // For extreme input values, the accumulation needs to happen in 64 bit
+ // precision to avoid any overflow. r00 r01 r04 r05
+ const __m256i exp0_error_lo = _mm256_unpacklo_epi32(err_final_0, zero);
+ // r02 r03 r06 r07
+ const __m256i exp0_error_hi = _mm256_unpackhi_epi32(err_final_0, zero);
+ // r10 r11 r14 r15
+ const __m256i exp1_error_lo = _mm256_unpacklo_epi32(err_final_1, zero);
+ // r12 r13 r16 r17
+ const __m256i exp1_error_hi = _mm256_unpackhi_epi32(err_final_1, zero);
+
+ const __m256i sum_temp_0 = _mm256_add_epi64(exp0_error_hi, exp0_error_lo);
+ const __m256i sum_temp_1 = _mm256_add_epi64(exp1_error_hi, exp1_error_lo);
+ const __m256i sse_256_temp = _mm256_add_epi64(sum_temp_1, sum_temp_0);
+ *sse_256 = _mm256_add_epi64(*sse_256, sse_256_temp);
+ coeff += 64;
+ dqcoeff += 64;
+ }
+}
+
+int64_t av1_block_error_lp_avx2(const int16_t *coeff, const int16_t *dqcoeff,
+ intptr_t num_coeff) {
+ assert(num_coeff % 16 == 0);
+ __m256i sse_256 = _mm256_setzero_si256();
+ int64_t sse;
+
+ if (num_coeff == 16)
+ av1_block_error_num_coeff16_avx2(coeff, dqcoeff, &sse_256);
+ else if (num_coeff == 32)
+ av1_block_error_num_coeff32_avx2(coeff, dqcoeff, &sse_256);
+ else
+ av1_block_error_num_coeff64_avx2(coeff, dqcoeff, &sse_256, num_coeff);
+
+ // Save the higher 64 bit of each 128 bit lane.
+ const __m256i sse_hi = _mm256_srli_si256(sse_256, 8);
+ // Add the higher 64 bit to the low 64 bit.
+ sse_256 = _mm256_add_epi64(sse_256, sse_hi);
+ // Accumulate the sse_256 register to get final sse
+ const __m128i sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
+ _mm256_extractf128_si256(sse_256, 1));
+
+ // Store the results.
+ _mm_storel_epi64((__m128i *)&sse, sse_128);
+ return sse;
+}
+
+int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
+ __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
+ __m256i sse_reg_64hi, ssz_reg_64hi;
+ __m128i sse_reg128, ssz_reg128;
+ int64_t sse;
+ int i;
+ const __m256i zero_reg = _mm256_setzero_si256();
+
+  // Initialize the sse and ssz accumulator registers to zero.
+ sse_reg = _mm256_setzero_si256();
+ ssz_reg = _mm256_setzero_si256();
+
+ for (i = 0; i < block_size; i += 16) {
+ // load 32 bytes from coeff and dqcoeff
+ read_coeff(coeff, i, &coeff_reg);
+ read_coeff(dqcoeff, i, &dqcoeff_reg);
+ // dqcoeff - coeff
+ dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
+ // madd (dqcoeff - coeff)
+ dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
+ // madd coeff
+ coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
+ // expand each double word of madd (dqcoeff - coeff) to quad word
+ exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
+ exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
+ // expand each double word of madd (coeff) to quad word
+ exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
+ exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
+ // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
+ sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
+ ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
+ sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
+ ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
+ }
+ // save the higher 64 bit of each 128 bit lane
+ sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
+ ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
+ // add the higher 64 bit to the low 64 bit
+ sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
+ ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
+
+ // add each 64 bit from each of the 128 bit lane of the 256 bit
+ sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
+ _mm256_extractf128_si256(sse_reg, 1));
+
+ ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
+ _mm256_extractf128_si256(ssz_reg, 1));
+
+ // store the results
+ _mm_storel_epi64((__m128i *)(&sse), sse_reg128);
+
+ _mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
+ _mm256_zeroupper();
+ return sse;
+}
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_sse2.c b/third_party/aom/av1/encoder/x86/error_intrin_sse2.c
new file mode 100644
index 0000000000..61f65c623f
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_intrin_sse2.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static AOM_INLINE __m128i reduce_sum_epi64(__m128i reg) {
+ __m128i reg_hi = _mm_srli_si128(reg, 8);
+ reg = _mm_add_epi64(reg, reg_hi);
+
+ return reg;
+}
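+
+// reduce_sum_epi64() folds [lo, hi] into [lo + hi, hi]; callers read the
+// final sum from the low 64 bits of the returned register.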
+
+int64_t av1_block_error_lp_sse2(const int16_t *coeff, const int16_t *dqcoeff,
+ intptr_t block_size) {
+ assert(block_size % 16 == 0);
+ assert(block_size >= 16);
+
+ const __m128i zero = _mm_setzero_si128();
+ __m128i accum_0 = zero;
+ __m128i accum_1 = zero;
+
+ for (int i = 0; i < block_size; i += 16) {
+    // Load 16 elements each for coeff and dqcoeff.
+ const __m128i _coeff_0 = _mm_loadu_si128((const __m128i *)coeff);
+ const __m128i _coeff_1 = _mm_loadu_si128((const __m128i *)(coeff + 8));
+ const __m128i _dqcoeff_0 = _mm_loadu_si128((const __m128i *)dqcoeff);
+ const __m128i _dqcoeff_1 = _mm_loadu_si128((const __m128i *)(dqcoeff + 8));
+ // Compute the diff
+ const __m128i diff_0 = _mm_sub_epi16(_dqcoeff_0, _coeff_0);
+ const __m128i diff_1 = _mm_sub_epi16(_dqcoeff_1, _coeff_1);
+ // Compute the error
+ const __m128i error_0 = _mm_madd_epi16(diff_0, diff_0);
+ const __m128i error_1 = _mm_madd_epi16(diff_1, diff_1);
+
+ const __m128i error_lo_0 = _mm_unpacklo_epi32(error_0, zero);
+ const __m128i error_lo_1 = _mm_unpacklo_epi32(error_1, zero);
+ const __m128i error_hi_0 = _mm_unpackhi_epi32(error_0, zero);
+ const __m128i error_hi_1 = _mm_unpackhi_epi32(error_1, zero);
+
+ // Accumulate
+ accum_0 = _mm_add_epi64(accum_0, error_lo_0);
+ accum_1 = _mm_add_epi64(accum_1, error_lo_1);
+ accum_0 = _mm_add_epi64(accum_0, error_hi_0);
+ accum_1 = _mm_add_epi64(accum_1, error_hi_1);
+
+ // Advance
+ coeff += 16;
+ dqcoeff += 16;
+ }
+
+ __m128i accum = _mm_add_epi64(accum_0, accum_1);
+ // Reduce sum the register
+ accum = reduce_sum_epi64(accum);
+
+ // Store the results.
+#if AOM_ARCH_X86_64
+ return _mm_cvtsi128_si64(accum);
+#else
+ int64_t result;
+ _mm_storel_epi64((__m128i *)&result, accum);
+ return result;
+#endif // AOM_ARCH_X86_64
+}
diff --git a/third_party/aom/av1/encoder/x86/error_sse2.asm b/third_party/aom/av1/encoder/x86/error_sse2.asm
new file mode 100644
index 0000000000..6407c106ab
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_sse2.asm
@@ -0,0 +1,88 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+; Increment %1 by sizeof() tran_low_t * %2.
+%macro INCREMENT_ELEMENTS_TRAN_LOW 2
+ lea %1, [%1 + %2 * 4]
+%endmacro
+
+; Load %2 + %3 into m%1.
+; %3 is the offset in elements, not bytes.
+; tran_low_t is 32 bits in AV1, so load two 128-bit chunks of 32-bit values
+; and pack them down to 16 bits.
+%macro LOAD_TRAN_LOW 3
+ mova m%1, [%2 + (%3) * 4]
+ packssdw m%1, [%2 + (%3) * 4 + 16]
+%endmacro
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; int64_t av1_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
+; int64_t *ssz)
+
+INIT_XMM sse2
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+ pxor m4, m4 ; sse accumulator
+ pxor m6, m6 ; ssz accumulator
+ pxor m5, m5 ; dedicated zero register
+.loop:
+ LOAD_TRAN_LOW 2, uqcq, 0
+ LOAD_TRAN_LOW 0, dqcq, 0
+ LOAD_TRAN_LOW 3, uqcq, 8
+ LOAD_TRAN_LOW 1, dqcq, 8
+ INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
+ INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
+ sub sizeq, 16
+ psubw m0, m2
+ psubw m1, m3
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+ paddd m0, m1
+ paddd m2, m3
+ ; accumulate in 64bit
+ punpckldq m7, m0, m5
+ punpckhdq m0, m5
+ paddq m4, m7
+ punpckldq m7, m2, m5
+ paddq m4, m0
+ punpckhdq m2, m5
+ paddq m6, m7
+ paddq m6, m2
+ jg .loop
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4
+ movhlps m7, m6
+ paddq m4, m5
+ paddq m6, m7
+%if AOM_ARCH_X86_64
+ movq rax, m4
+ movq [sszq], m6
+%else
+ mov eax, sszm
+ pshufd m5, m4, 0x1
+ movq [eax], m6
+ movd eax, m4
+ movd edx, m5
+%endif
+ RET
diff --git a/third_party/aom/av1/encoder/x86/hash_sse42.c b/third_party/aom/av1/encoder/x86/hash_sse42.c
new file mode 100644
index 0000000000..ebe75310e9
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/hash_sse42.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+// Byte-boundary alignment issues
+#define ALIGN_SIZE 8
+#define ALIGN_MASK (ALIGN_SIZE - 1)
+
+#define CALC_CRC(op, crc, type, buf, len) \
+ while ((len) >= sizeof(type)) { \
+ (crc) = op((crc), *(type *)(buf)); \
+ (len) -= sizeof(type); \
+ buf += sizeof(type); \
+ }
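+
+// For example, with len == 13 on x86-64: CALC_CRC consumes one uint64_t
+// (8 bytes), then one uint32_t (4 bytes), then one uint8_t, leaving len == 0.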
+
+/**
+ * Calculates the 32-bit CRC (CRC-32C) for the input buffer.
+ * The polynomial is 0x11EDC6F41.
+ * @return A 32-bit unsigned integer representing the CRC
+ */
+uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p,
+ size_t len) {
+ (void)crc_calculator;
+ const uint8_t *buf = p;
+ uint32_t crc = 0xFFFFFFFF;
+
+ // Align the input to the word boundary
+ for (; (len > 0) && ((intptr_t)buf & ALIGN_MASK); len--, buf++) {
+ crc = _mm_crc32_u8(crc, *buf);
+ }
+
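+  // The pointer is now 8-byte aligned: fold 8 bytes at a time on x86-64,
+  // then mop up the tail with 4-, 2- and 1-byte steps.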
+#ifdef __x86_64__
+ uint64_t crc64 = crc;
+ CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len)
+ crc = (uint32_t)crc64;
+#endif
+ CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len)
+ CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len)
+ CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len)
+ return (crc ^ 0xFFFFFFFF);
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c
new file mode 100644
index 0000000000..340307cb3e
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_avx2.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include "aom/aom_integer.h"
+#include "av1/common/common.h"
+#include "config/av1_rtcd.h"
+
+int64_t av1_highbd_block_error_avx2(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz,
+ int bps) {
+ int i;
+ int64_t temp1[8];
+ int64_t error = 0, sqcoeff = 0;
+ const int shift = 2 * (bps - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
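+  // Coefficients at bit depth bps carry a 2^(bps - 8) scale relative to
+  // 8-bit, so the squared sums are rescaled at the end by a rounded right
+  // shift of 2 * (bps - 8) bits (e.g. bps == 10: shift == 4, rounding == 8).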
+
+ for (i = 0; i < block_size; i += 16) {
+ __m256i mm256_coeff = _mm256_loadu_si256((__m256i *)(coeff + i));
+ __m256i mm256_coeff2 = _mm256_loadu_si256((__m256i *)(coeff + i + 8));
+ __m256i mm256_dqcoeff = _mm256_loadu_si256((__m256i *)(dqcoeff + i));
+ __m256i mm256_dqcoeff2 = _mm256_loadu_si256((__m256i *)(dqcoeff + i + 8));
+
+ __m256i diff1 = _mm256_sub_epi32(mm256_coeff, mm256_dqcoeff);
+ __m256i diff2 = _mm256_sub_epi32(mm256_coeff2, mm256_dqcoeff2);
+ __m256i diff1h = _mm256_srli_epi64(diff1, 32);
+ __m256i diff2h = _mm256_srli_epi64(diff2, 32);
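+    // _mm256_mul_epi32 multiplies only the even 32-bit lanes, so the 32-bit
+    // shifts above move the odd lanes into even position; the two products
+    // per register square all eight elements with 64-bit results.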
+ __m256i res = _mm256_mul_epi32(diff1, diff1);
+ __m256i res1 = _mm256_mul_epi32(diff1h, diff1h);
+ __m256i res2 = _mm256_mul_epi32(diff2, diff2);
+ __m256i res3 = _mm256_mul_epi32(diff2h, diff2h);
+ __m256i res_diff = _mm256_add_epi64(_mm256_add_epi64(res, res1),
+ _mm256_add_epi64(res2, res3));
+ __m256i mm256_coeffh = _mm256_srli_epi64(mm256_coeff, 32);
+ __m256i mm256_coeffh2 = _mm256_srli_epi64(mm256_coeff2, 32);
+ res = _mm256_mul_epi32(mm256_coeff, mm256_coeff);
+ res1 = _mm256_mul_epi32(mm256_coeffh, mm256_coeffh);
+ res2 = _mm256_mul_epi32(mm256_coeff2, mm256_coeff2);
+ res3 = _mm256_mul_epi32(mm256_coeffh2, mm256_coeffh2);
+ __m256i res_sqcoeff = _mm256_add_epi64(_mm256_add_epi64(res, res1),
+ _mm256_add_epi64(res2, res3));
+ _mm256_storeu_si256((__m256i *)temp1, res_diff);
+ _mm256_storeu_si256((__m256i *)temp1 + 1, res_sqcoeff);
+
+ error += temp1[0] + temp1[1] + temp1[2] + temp1[3];
+ sqcoeff += temp1[4] + temp1[5] + temp1[6] + temp1[7];
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
new file mode 100644
index 0000000000..b0b2757568
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "av1/common/common.h"
+#include "config/av1_rtcd.h"
+
+int64_t av1_highbd_block_error_sse2(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz,
+ int bps) {
+ int i, j, test;
+ uint32_t temp[4];
+ __m128i max, min, cmp0, cmp1, cmp2, cmp3;
+ int64_t error = 0, sqcoeff = 0;
+ const int shift = 2 * (bps - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+ for (i = 0; i < block_size; i += 8) {
+ // Load the data into xmm registers
+ __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i));
+ __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4));
+ __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i));
+ __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4));
+    // Check if any values require more than 15 bits
+ max = _mm_set1_epi32(0x3fff);
+ min = _mm_set1_epi32((int)0xffffc000);
+ cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
+ _mm_cmplt_epi32(mm_coeff, min));
+ cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
+ _mm_cmplt_epi32(mm_coeff2, min));
+ cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
+ _mm_cmplt_epi32(mm_dqcoeff, min));
+ cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
+ _mm_cmplt_epi32(mm_dqcoeff2, min));
+ test = _mm_movemask_epi8(
+ _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));
+
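+    // If every value fits in [-0x4000, 0x3fff], the 16-bit pack/madd path
+    // below is exact; otherwise fall back to the scalar 64-bit loop.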
+ if (!test) {
+ __m128i mm_diff, error_sse2, sqcoeff_sse2;
+ mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
+ mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
+ mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
+ error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
+ sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
+ _mm_storeu_si128((__m128i *)temp, error_sse2);
+ error = error + temp[0] + temp[1] + temp[2] + temp[3];
+ _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2);
+ sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
+ } else {
+ for (j = 0; j < 8; j++) {
+ const int64_t diff = coeff[i + j] - dqcoeff[i + j];
+ error += diff * diff;
+ sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
+ }
+ }
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c
new file mode 100644
index 0000000000..9cdf21fc7c
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_avx2.c
@@ -0,0 +1,3132 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <immintrin.h> /*AVX2*/
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m256i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ __m128i out1[8];
+ if (!flipud) {
+ out1[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ out1[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ out1[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ out1[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ out1[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ out1[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ out1[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ out1[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ } else {
+ out1[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ out1[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ out1[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ out1[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ out1[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ out1[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ out1[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ out1[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ }
+ if (!fliplr) {
+ out[0] = _mm256_cvtepi16_epi32(out1[0]);
+ out[1] = _mm256_cvtepi16_epi32(out1[1]);
+ out[2] = _mm256_cvtepi16_epi32(out1[2]);
+ out[3] = _mm256_cvtepi16_epi32(out1[3]);
+ out[4] = _mm256_cvtepi16_epi32(out1[4]);
+ out[5] = _mm256_cvtepi16_epi32(out1[5]);
+ out[6] = _mm256_cvtepi16_epi32(out1[6]);
+ out[7] = _mm256_cvtepi16_epi32(out1[7]);
+
+ } else {
+ out[0] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[0]));
+ out[1] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[1]));
+ out[2] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[2]));
+ out[3] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[3]));
+ out[4] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[4]));
+ out[5] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[5]));
+ out[6] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[6]));
+ out[7] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[7]));
+ }
+ out[0] = _mm256_slli_epi32(out[0], shift);
+ out[1] = _mm256_slli_epi32(out[1], shift);
+ out[2] = _mm256_slli_epi32(out[2], shift);
+ out[3] = _mm256_slli_epi32(out[3], shift);
+ out[4] = _mm256_slli_epi32(out[4], shift);
+ out[5] = _mm256_slli_epi32(out[5], shift);
+ out[6] = _mm256_slli_epi32(out[6], shift);
+ out[7] = _mm256_slli_epi32(out[7], shift);
+}
+static INLINE void col_txfm_8x8_rounding(__m256i *in, int shift) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+
+ in[0] = _mm256_add_epi32(in[0], rounding);
+ in[1] = _mm256_add_epi32(in[1], rounding);
+ in[2] = _mm256_add_epi32(in[2], rounding);
+ in[3] = _mm256_add_epi32(in[3], rounding);
+ in[4] = _mm256_add_epi32(in[4], rounding);
+ in[5] = _mm256_add_epi32(in[5], rounding);
+ in[6] = _mm256_add_epi32(in[6], rounding);
+ in[7] = _mm256_add_epi32(in[7], rounding);
+
+ in[0] = _mm256_srai_epi32(in[0], shift);
+ in[1] = _mm256_srai_epi32(in[1], shift);
+ in[2] = _mm256_srai_epi32(in[2], shift);
+ in[3] = _mm256_srai_epi32(in[3], shift);
+ in[4] = _mm256_srai_epi32(in[4], shift);
+ in[5] = _mm256_srai_epi32(in[5], shift);
+ in[6] = _mm256_srai_epi32(in[6], shift);
+ in[7] = _mm256_srai_epi32(in[7], shift);
+}
+static INLINE void load_buffer_8x16_avx2(const int16_t *input, __m256i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ const int16_t *topL = input;
+ const int16_t *botL = input + 8 * stride;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ }
+ load_buffer_8x8_avx2(topL, out, stride, flipud, fliplr, shift);
+ load_buffer_8x8_avx2(botL, out + 8, stride, flipud, fliplr, shift);
+}
+static INLINE void load_buffer_16xn_avx2(const int16_t *input, __m256i *out,
+ int stride, int height, int outstride,
+ int flipud, int fliplr) {
+ __m256i out1[64];
+ if (!flipud) {
+ for (int i = 0; i < height; i++) {
+ out1[i] = _mm256_loadu_si256((const __m256i *)(input + i * stride));
+ }
+ } else {
+ for (int i = 0; i < height; i++) {
+ out1[(height - 1) - i] =
+ _mm256_loadu_si256((const __m256i *)(input + i * stride));
+ }
+ }
+ if (!fliplr) {
+ for (int i = 0; i < height; i++) {
+ out[i * outstride] =
+ _mm256_cvtepi16_epi32(_mm256_castsi256_si128(out1[i]));
+ out[i * outstride + 1] =
+ _mm256_cvtepi16_epi32(_mm256_extractf128_si256(out1[i], 1));
+ }
+ } else {
+ for (int i = 0; i < height; i++) {
+ out[i * outstride + 1] = _mm256_cvtepi16_epi32(
+ mm_reverse_epi16(_mm256_castsi256_si128(out1[i])));
+ out[i * outstride + 0] = _mm256_cvtepi16_epi32(
+ mm_reverse_epi16(_mm256_extractf128_si256(out1[i], 1)));
+ }
+ }
+}
+
+static void fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out,
+ const int instride,
+ const int outstride) {
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i x0, x1;
+
+ u0 = _mm256_unpacklo_epi32(in[0 * instride], in[1 * instride]);
+ u1 = _mm256_unpackhi_epi32(in[0 * instride], in[1 * instride]);
+
+ u2 = _mm256_unpacklo_epi32(in[2 * instride], in[3 * instride]);
+ u3 = _mm256_unpackhi_epi32(in[2 * instride], in[3 * instride]);
+
+ u4 = _mm256_unpacklo_epi32(in[4 * instride], in[5 * instride]);
+ u5 = _mm256_unpackhi_epi32(in[4 * instride], in[5 * instride]);
+
+ u6 = _mm256_unpacklo_epi32(in[6 * instride], in[7 * instride]);
+ u7 = _mm256_unpackhi_epi32(in[6 * instride], in[7 * instride]);
+
+ x0 = _mm256_unpacklo_epi64(u0, u2);
+ x1 = _mm256_unpacklo_epi64(u4, u6);
+ out[0 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[4 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u0, u2);
+ x1 = _mm256_unpackhi_epi64(u4, u6);
+ out[1 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[5 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpacklo_epi64(u1, u3);
+ x1 = _mm256_unpacklo_epi64(u5, u7);
+ out[2 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[6 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u1, u3);
+ x1 = _mm256_unpackhi_epi64(u5, u7);
+ out[3 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[7 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
+}
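+// Rounded shift helper: a negative `bit` applies a rounding right shift by
+// -bit (adding 1 << (-bit - 1) first); a positive `bit` is a plain left
+// shift. `stride` selects every stride-th register in `in`.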
+static INLINE void round_shift_32_8xn_avx2(__m256i *in, int size, int bit,
+ int stride) {
+ if (bit < 0) {
+ bit = -bit;
+ __m256i round = _mm256_set1_epi32(1 << (bit - 1));
+ for (int i = 0; i < size; ++i) {
+ in[stride * i] = _mm256_add_epi32(in[stride * i], round);
+ in[stride * i] = _mm256_srai_epi32(in[stride * i], bit);
+ }
+ } else if (bit > 0) {
+ for (int i = 0; i < size; ++i) {
+ in[stride * i] = _mm256_slli_epi32(in[stride * i], bit);
+ }
+ }
+}
+static INLINE void store_buffer_avx2(const __m256i *const in, int32_t *out,
+ const int stride, const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ _mm256_store_si256((__m256i *)(out), in[i]);
+ out += stride;
+ }
+}
+static INLINE void fwd_txfm_transpose_16x16_avx2(const __m256i *in,
+ __m256i *out) {
+ fwd_txfm_transpose_8x8_avx2(&in[0], &out[0], 2, 2);
+ fwd_txfm_transpose_8x8_avx2(&in[1], &out[16], 2, 2);
+ fwd_txfm_transpose_8x8_avx2(&in[16], &out[1], 2, 2);
+ fwd_txfm_transpose_8x8_avx2(&in[17], &out[17], 2, 2);
+}
+
+static INLINE __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0,
+ const __m256i *w1, const __m256i *n1,
+ const __m256i *rounding, int bit) {
+ __m256i x, y;
+
+ x = _mm256_mullo_epi32(*w0, *n0);
+ y = _mm256_mullo_epi32(*w1, *n1);
+ x = _mm256_add_epi32(x, y);
+ x = _mm256_add_epi32(x, *rounding);
+ x = _mm256_srai_epi32(x, bit);
+ return x;
+}
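+// av1_half_btf_avx2 computes one half-butterfly per 32-bit lane:
+//   (*w0 * *n0 + *w1 * *n1 + (1 << (bit - 1))) >> bit
+// i.e. a rounded fixed-point rotation step shared by the transforms below.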
+#define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ const __m256i ww0 = _mm256_set1_epi32(w0); \
+ const __m256i ww1 = _mm256_set1_epi32(w1); \
+ const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \
+ const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \
+ out0 = _mm256_add_epi32(in0_w0, in1_w1); \
+ round_shift_32_8xn_avx2(&out0, 1, -bit, 1); \
+ const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \
+ const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \
+ out1 = _mm256_sub_epi32(in0_w1, in1_w0); \
+ round_shift_32_8xn_avx2(&out1, 1, -bit, 1); \
+ } while (0)
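+
+// btf_32_avx2_type0 is a full butterfly: out0 = w0*in0 + w1*in1 and
+// out1 = w1*in0 - w0*in1, each rounded and right-shifted by `bit` (hence
+// the -bit passed to round_shift_32_8xn_avx2).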
+
+#define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+ do { \
+ const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \
+ const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \
+ out0 = _mm256_add_epi32(in0_w0, in1_w1); \
+ out0 = _mm256_add_epi32(out0, r); \
+ out0 = _mm256_srai_epi32(out0, bit); \
+ const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \
+ const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \
+ out1 = _mm256_sub_epi32(in0_w1, in1_w0); \
+ out1 = _mm256_add_epi32(out1, r); \
+ out1 = _mm256_srai_epi32(out1, bit); \
+ } while (0)
+
+typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out,
+ const int8_t cos_bit, int instride,
+ int outstride);
+static void fdct8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+ const int col_num, const int outstride) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ __m256i u[8], v[8];
+ for (int col = 0; col < col_num; ++col) {
+ u[0] = _mm256_add_epi32(in[0 * col_num + col], in[7 * col_num + col]);
+ v[7] = _mm256_sub_epi32(in[0 * col_num + col], in[7 * col_num + col]);
+ u[1] = _mm256_add_epi32(in[1 * col_num + col], in[6 * col_num + col]);
+ u[6] = _mm256_sub_epi32(in[1 * col_num + col], in[6 * col_num + col]);
+ u[2] = _mm256_add_epi32(in[2 * col_num + col], in[5 * col_num + col]);
+ u[5] = _mm256_sub_epi32(in[2 * col_num + col], in[5 * col_num + col]);
+ u[3] = _mm256_add_epi32(in[3 * col_num + col], in[4 * col_num + col]);
+ v[4] = _mm256_sub_epi32(in[3 * col_num + col], in[4 * col_num + col]);
+ v[0] = _mm256_add_epi32(u[0], u[3]);
+ v[3] = _mm256_sub_epi32(u[0], u[3]);
+ v[1] = _mm256_add_epi32(u[1], u[2]);
+ v[2] = _mm256_sub_epi32(u[1], u[2]);
+
+ v[5] = _mm256_mullo_epi32(u[5], cospim32);
+ v[6] = _mm256_mullo_epi32(u[6], cospi32);
+ v[5] = _mm256_add_epi32(v[5], v[6]);
+ v[5] = _mm256_add_epi32(v[5], rnding);
+ v[5] = _mm256_srai_epi32(v[5], bit);
+
+ u[0] = _mm256_mullo_epi32(u[5], cospi32);
+ v[6] = _mm256_mullo_epi32(u[6], cospim32);
+ v[6] = _mm256_sub_epi32(u[0], v[6]);
+ v[6] = _mm256_add_epi32(v[6], rnding);
+ v[6] = _mm256_srai_epi32(v[6], bit);
+
+ // stage 3
+ // type 0
+ v[0] = _mm256_mullo_epi32(v[0], cospi32);
+ v[1] = _mm256_mullo_epi32(v[1], cospi32);
+ u[0] = _mm256_add_epi32(v[0], v[1]);
+ u[0] = _mm256_add_epi32(u[0], rnding);
+ u[0] = _mm256_srai_epi32(u[0], bit);
+
+ u[1] = _mm256_sub_epi32(v[0], v[1]);
+ u[1] = _mm256_add_epi32(u[1], rnding);
+ u[1] = _mm256_srai_epi32(u[1], bit);
+
+ // type 1
+ v[0] = _mm256_mullo_epi32(v[2], cospi48);
+ v[1] = _mm256_mullo_epi32(v[3], cospi16);
+ u[2] = _mm256_add_epi32(v[0], v[1]);
+ u[2] = _mm256_add_epi32(u[2], rnding);
+ u[2] = _mm256_srai_epi32(u[2], bit);
+
+ v[0] = _mm256_mullo_epi32(v[2], cospi16);
+ v[1] = _mm256_mullo_epi32(v[3], cospi48);
+ u[3] = _mm256_sub_epi32(v[1], v[0]);
+ u[3] = _mm256_add_epi32(u[3], rnding);
+ u[3] = _mm256_srai_epi32(u[3], bit);
+
+ u[4] = _mm256_add_epi32(v[4], v[5]);
+ u[5] = _mm256_sub_epi32(v[4], v[5]);
+ u[6] = _mm256_sub_epi32(v[7], v[6]);
+ u[7] = _mm256_add_epi32(v[7], v[6]);
+
+ // stage 4
+ // stage 5
+ v[0] = _mm256_mullo_epi32(u[4], cospi56);
+ v[1] = _mm256_mullo_epi32(u[7], cospi8);
+ v[0] = _mm256_add_epi32(v[0], v[1]);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ out[1 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[4]
+
+ v[0] = _mm256_mullo_epi32(u[4], cospi8);
+ v[1] = _mm256_mullo_epi32(u[7], cospi56);
+ v[0] = _mm256_sub_epi32(v[1], v[0]);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ out[7 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[7]
+
+ v[0] = _mm256_mullo_epi32(u[5], cospi24);
+ v[1] = _mm256_mullo_epi32(u[6], cospi40);
+ v[0] = _mm256_add_epi32(v[0], v[1]);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ out[5 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[5]
+
+ v[0] = _mm256_mullo_epi32(u[5], cospi40);
+ v[1] = _mm256_mullo_epi32(u[6], cospi24);
+ v[0] = _mm256_sub_epi32(v[1], v[0]);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ out[3 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[6]
+
+ out[0 * outstride + col] = u[0]; // buf0[0]
+ out[4 * outstride + col] = u[1]; // buf0[1]
+ out[2 * outstride + col] = u[2]; // buf0[2]
+ out[6 * outstride + col] = u[3]; // buf0[3]
+ }
+}
+static void fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                        const int col_num, const int outstride) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m256i x, y;
+ for (int col = 0; col < col_num; ++col) {
+ u0 = in[0 * col_num + col];
+ u1 = _mm256_sub_epi32(zero, in[7 * col_num + col]);
+ u2 = _mm256_sub_epi32(zero, in[3 * col_num + col]);
+ u3 = in[4 * col_num + col];
+ u4 = _mm256_sub_epi32(zero, in[1 * col_num + col]);
+ u5 = in[6 * col_num + col];
+ u6 = in[2 * col_num + col];
+ u7 = _mm256_sub_epi32(zero, in[5 * col_num + col]);
+
+ // stage 2
+ v0 = u0;
+ v1 = u1;
+
+ x = _mm256_mullo_epi32(u2, cospi32);
+ y = _mm256_mullo_epi32(u3, cospi32);
+ v2 = _mm256_add_epi32(x, y);
+ v2 = _mm256_add_epi32(v2, rnding);
+ v2 = _mm256_srai_epi32(v2, bit);
+
+ v3 = _mm256_sub_epi32(x, y);
+ v3 = _mm256_add_epi32(v3, rnding);
+ v3 = _mm256_srai_epi32(v3, bit);
+
+ v4 = u4;
+ v5 = u5;
+
+ x = _mm256_mullo_epi32(u6, cospi32);
+ y = _mm256_mullo_epi32(u7, cospi32);
+ v6 = _mm256_add_epi32(x, y);
+ v6 = _mm256_add_epi32(v6, rnding);
+ v6 = _mm256_srai_epi32(v6, bit);
+
+ v7 = _mm256_sub_epi32(x, y);
+ v7 = _mm256_add_epi32(v7, rnding);
+ v7 = _mm256_srai_epi32(v7, bit);
+
+ // stage 3
+ u0 = _mm256_add_epi32(v0, v2);
+ u1 = _mm256_add_epi32(v1, v3);
+ u2 = _mm256_sub_epi32(v0, v2);
+ u3 = _mm256_sub_epi32(v1, v3);
+ u4 = _mm256_add_epi32(v4, v6);
+ u5 = _mm256_add_epi32(v5, v7);
+ u6 = _mm256_sub_epi32(v4, v6);
+ u7 = _mm256_sub_epi32(v5, v7);
+
+ // stage 4
+ v0 = u0;
+ v1 = u1;
+ v2 = u2;
+ v3 = u3;
+
+ x = _mm256_mullo_epi32(u4, cospi16);
+ y = _mm256_mullo_epi32(u5, cospi48);
+ v4 = _mm256_add_epi32(x, y);
+ v4 = _mm256_add_epi32(v4, rnding);
+ v4 = _mm256_srai_epi32(v4, bit);
+
+ x = _mm256_mullo_epi32(u4, cospi48);
+ y = _mm256_mullo_epi32(u5, cospim16);
+ v5 = _mm256_add_epi32(x, y);
+ v5 = _mm256_add_epi32(v5, rnding);
+ v5 = _mm256_srai_epi32(v5, bit);
+
+ x = _mm256_mullo_epi32(u6, cospim48);
+ y = _mm256_mullo_epi32(u7, cospi16);
+ v6 = _mm256_add_epi32(x, y);
+ v6 = _mm256_add_epi32(v6, rnding);
+ v6 = _mm256_srai_epi32(v6, bit);
+
+ x = _mm256_mullo_epi32(u6, cospi16);
+ y = _mm256_mullo_epi32(u7, cospi48);
+ v7 = _mm256_add_epi32(x, y);
+ v7 = _mm256_add_epi32(v7, rnding);
+ v7 = _mm256_srai_epi32(v7, bit);
+
+ // stage 5
+ u0 = _mm256_add_epi32(v0, v4);
+ u1 = _mm256_add_epi32(v1, v5);
+ u2 = _mm256_add_epi32(v2, v6);
+ u3 = _mm256_add_epi32(v3, v7);
+ u4 = _mm256_sub_epi32(v0, v4);
+ u5 = _mm256_sub_epi32(v1, v5);
+ u6 = _mm256_sub_epi32(v2, v6);
+ u7 = _mm256_sub_epi32(v3, v7);
+
+ // stage 6
+ x = _mm256_mullo_epi32(u0, cospi4);
+ y = _mm256_mullo_epi32(u1, cospi60);
+ v0 = _mm256_add_epi32(x, y);
+ v0 = _mm256_add_epi32(v0, rnding);
+ v0 = _mm256_srai_epi32(v0, bit);
+
+ x = _mm256_mullo_epi32(u0, cospi60);
+ y = _mm256_mullo_epi32(u1, cospim4);
+ v1 = _mm256_add_epi32(x, y);
+ v1 = _mm256_add_epi32(v1, rnding);
+ v1 = _mm256_srai_epi32(v1, bit);
+
+ x = _mm256_mullo_epi32(u2, cospi20);
+ y = _mm256_mullo_epi32(u3, cospi44);
+ v2 = _mm256_add_epi32(x, y);
+ v2 = _mm256_add_epi32(v2, rnding);
+ v2 = _mm256_srai_epi32(v2, bit);
+
+ x = _mm256_mullo_epi32(u2, cospi44);
+ y = _mm256_mullo_epi32(u3, cospim20);
+ v3 = _mm256_add_epi32(x, y);
+ v3 = _mm256_add_epi32(v3, rnding);
+ v3 = _mm256_srai_epi32(v3, bit);
+
+ x = _mm256_mullo_epi32(u4, cospi36);
+ y = _mm256_mullo_epi32(u5, cospi28);
+ v4 = _mm256_add_epi32(x, y);
+ v4 = _mm256_add_epi32(v4, rnding);
+ v4 = _mm256_srai_epi32(v4, bit);
+
+ x = _mm256_mullo_epi32(u4, cospi28);
+ y = _mm256_mullo_epi32(u5, cospim36);
+ v5 = _mm256_add_epi32(x, y);
+ v5 = _mm256_add_epi32(v5, rnding);
+ v5 = _mm256_srai_epi32(v5, bit);
+
+ x = _mm256_mullo_epi32(u6, cospi52);
+ y = _mm256_mullo_epi32(u7, cospi12);
+ v6 = _mm256_add_epi32(x, y);
+ v6 = _mm256_add_epi32(v6, rnding);
+ v6 = _mm256_srai_epi32(v6, bit);
+
+ x = _mm256_mullo_epi32(u6, cospi12);
+ y = _mm256_mullo_epi32(u7, cospim52);
+ v7 = _mm256_add_epi32(x, y);
+ v7 = _mm256_add_epi32(v7, rnding);
+ v7 = _mm256_srai_epi32(v7, bit);
+
+ // stage 7
+    out[0 * outstride + col] = v1;
+    out[1 * outstride + col] = v6;
+    out[2 * outstride + col] = v3;
+    out[3 * outstride + col] = v4;
+    out[4 * outstride + col] = v5;
+    out[5 * outstride + col] = v2;
+    out[6 * outstride + col] = v7;
+    out[7 * outstride + col] = v0;
+ }
+}
+static void idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit, int col_num,
+ int outstride) {
+ (void)bit;
+ (void)outstride;
+ int num_iters = 8 * col_num;
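+  // The 8-point identity transform scales by exactly 2; x + x avoids a
+  // multiply.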
+ for (int i = 0; i < num_iters; i += 8) {
+ out[i] = _mm256_add_epi32(in[i], in[i]);
+ out[i + 1] = _mm256_add_epi32(in[i + 1], in[i + 1]);
+ out[i + 2] = _mm256_add_epi32(in[i + 2], in[i + 2]);
+ out[i + 3] = _mm256_add_epi32(in[i + 3], in[i + 3]);
+ out[i + 4] = _mm256_add_epi32(in[i + 4], in[i + 4]);
+ out[i + 5] = _mm256_add_epi32(in[i + 5], in[i + 5]);
+ out[i + 6] = _mm256_add_epi32(in[i + 6], in[i + 6]);
+ out[i + 7] = _mm256_add_epi32(in[i + 7], in[i + 7]);
+ }
+}
+void av1_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m256i in[8], out[8];
+ const TX_SIZE tx_size = TX_8X8;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int width = tx_size_wide[tx_size];
+ const int width_div8 = (width >> 3);
+
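+  // 2-D pipeline for every tx_type below: load (applying shift[0]), column
+  // transform, rounding by -shift[1], transpose, row transform, store.
+  // shift[2] is zero for TX_8X8, so no final rounding pass appears here.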
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case ADST_DCT:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case DCT_ADST:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case ADST_ADST:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
+ fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x8_avx2(input, in, stride, 1, 1, shift[0]);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case IDTX:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case V_DCT:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case H_DCT:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case V_ADST:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case H_ADST:
+ load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case V_FLIPADST:
+ load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ case H_FLIPADST:
+ load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
+ idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 8);
+ break;
+ default: assert(0);
+ }
+ (void)bd;
+}
+
+static void fdct16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+ const int col_num, const int outstride) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ __m256i u[16], v[16], x;
+ int col;
+
+  // Transform col_num register-wide columns (eight 32-bit lanes each)
+ for (col = 0; col < col_num; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[1] = _mm256_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[2] = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[3] = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[4] = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[11] = _mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[5] = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[6] = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[9] = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[7] = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+ u[8] = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+
+ // stage 2
+ v[0] = _mm256_add_epi32(u[0], u[7]);
+ v[7] = _mm256_sub_epi32(u[0], u[7]);
+ v[1] = _mm256_add_epi32(u[1], u[6]);
+ v[6] = _mm256_sub_epi32(u[1], u[6]);
+ v[2] = _mm256_add_epi32(u[2], u[5]);
+ v[5] = _mm256_sub_epi32(u[2], u[5]);
+ v[3] = _mm256_add_epi32(u[3], u[4]);
+ v[4] = _mm256_sub_epi32(u[3], u[4]);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ v[10] = _mm256_mullo_epi32(u[10], cospim32);
+ x = _mm256_mullo_epi32(u[13], cospi32);
+ v[10] = _mm256_add_epi32(v[10], x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[13] = _mm256_mullo_epi32(u[10], cospi32);
+ x = _mm256_mullo_epi32(u[13], cospim32);
+ v[13] = _mm256_sub_epi32(v[13], x);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[11] = _mm256_mullo_epi32(u[11], cospim32);
+ x = _mm256_mullo_epi32(u[12], cospi32);
+ v[11] = _mm256_add_epi32(v[11], x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = _mm256_mullo_epi32(u[11], cospi32);
+ x = _mm256_mullo_epi32(u[12], cospim32);
+ v[12] = _mm256_sub_epi32(v[12], x);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 3
+ u[0] = _mm256_add_epi32(v[0], v[3]);
+ u[3] = _mm256_sub_epi32(v[0], v[3]);
+ u[1] = _mm256_add_epi32(v[1], v[2]);
+ u[2] = _mm256_sub_epi32(v[1], v[2]);
+ u[4] = v[4];
+
+ u[5] = _mm256_mullo_epi32(v[5], cospim32);
+ x = _mm256_mullo_epi32(v[6], cospi32);
+ u[5] = _mm256_add_epi32(u[5], x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_mullo_epi32(v[5], cospi32);
+ x = _mm256_mullo_epi32(v[6], cospim32);
+ u[6] = _mm256_sub_epi32(u[6], x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ u[8] = _mm256_add_epi32(v[8], v[11]);
+ u[11] = _mm256_sub_epi32(v[8], v[11]);
+ u[9] = _mm256_add_epi32(v[9], v[10]);
+ u[10] = _mm256_sub_epi32(v[9], v[10]);
+ u[12] = _mm256_sub_epi32(v[15], v[12]);
+ u[15] = _mm256_add_epi32(v[15], v[12]);
+ u[13] = _mm256_sub_epi32(v[14], v[13]);
+ u[14] = _mm256_add_epi32(v[14], v[13]);
+
+ // stage 4
+ u[0] = _mm256_mullo_epi32(u[0], cospi32);
+ u[1] = _mm256_mullo_epi32(u[1], cospi32);
+ v[0] = _mm256_add_epi32(u[0], u[1]);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ v[0] = _mm256_srai_epi32(v[0], bit);
+
+ v[1] = _mm256_sub_epi32(u[0], u[1]);
+ v[1] = _mm256_add_epi32(v[1], rnding);
+ v[1] = _mm256_srai_epi32(v[1], bit);
+
+ v[2] = _mm256_mullo_epi32(u[2], cospi48);
+ x = _mm256_mullo_epi32(u[3], cospi16);
+ v[2] = _mm256_add_epi32(v[2], x);
+ v[2] = _mm256_add_epi32(v[2], rnding);
+ v[2] = _mm256_srai_epi32(v[2], bit);
+
+ v[3] = _mm256_mullo_epi32(u[2], cospi16);
+ x = _mm256_mullo_epi32(u[3], cospi48);
+ v[3] = _mm256_sub_epi32(x, v[3]);
+ v[3] = _mm256_add_epi32(v[3], rnding);
+ v[3] = _mm256_srai_epi32(v[3], bit);
+
+ v[4] = _mm256_add_epi32(u[4], u[5]);
+ v[5] = _mm256_sub_epi32(u[4], u[5]);
+ v[6] = _mm256_sub_epi32(u[7], u[6]);
+ v[7] = _mm256_add_epi32(u[7], u[6]);
+ v[8] = u[8];
+
+ v[9] = _mm256_mullo_epi32(u[9], cospim16);
+ x = _mm256_mullo_epi32(u[14], cospi48);
+ v[9] = _mm256_add_epi32(v[9], x);
+ v[9] = _mm256_add_epi32(v[9], rnding);
+ v[9] = _mm256_srai_epi32(v[9], bit);
+
+ v[14] = _mm256_mullo_epi32(u[9], cospi48);
+ x = _mm256_mullo_epi32(u[14], cospim16);
+ v[14] = _mm256_sub_epi32(v[14], x);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[10] = _mm256_mullo_epi32(u[10], cospim48);
+ x = _mm256_mullo_epi32(u[13], cospim16);
+ v[10] = _mm256_add_epi32(v[10], x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[13] = _mm256_mullo_epi32(u[10], cospim16);
+ x = _mm256_mullo_epi32(u[13], cospim48);
+ v[13] = _mm256_sub_epi32(v[13], x);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[11] = u[11];
+ v[12] = u[12];
+ v[15] = u[15];
+
+ // stage 5
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm256_mullo_epi32(v[4], cospi56);
+ x = _mm256_mullo_epi32(v[7], cospi8);
+ u[4] = _mm256_add_epi32(u[4], x);
+ u[4] = _mm256_add_epi32(u[4], rnding);
+ u[4] = _mm256_srai_epi32(u[4], bit);
+
+ u[7] = _mm256_mullo_epi32(v[4], cospi8);
+ x = _mm256_mullo_epi32(v[7], cospi56);
+ u[7] = _mm256_sub_epi32(x, u[7]);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ u[5] = _mm256_mullo_epi32(v[5], cospi24);
+ x = _mm256_mullo_epi32(v[6], cospi40);
+ u[5] = _mm256_add_epi32(u[5], x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_mullo_epi32(v[5], cospi40);
+ x = _mm256_mullo_epi32(v[6], cospi24);
+ u[6] = _mm256_sub_epi32(x, u[6]);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[8] = _mm256_add_epi32(v[8], v[9]);
+ u[9] = _mm256_sub_epi32(v[8], v[9]);
+ u[10] = _mm256_sub_epi32(v[11], v[10]);
+ u[11] = _mm256_add_epi32(v[11], v[10]);
+ u[12] = _mm256_add_epi32(v[12], v[13]);
+ u[13] = _mm256_sub_epi32(v[12], v[13]);
+ u[14] = _mm256_sub_epi32(v[15], v[14]);
+ u[15] = _mm256_add_epi32(v[15], v[14]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm256_mullo_epi32(u[8], cospi60);
+ x = _mm256_mullo_epi32(u[15], cospi4);
+ v[8] = _mm256_add_epi32(v[8], x);
+ v[8] = _mm256_add_epi32(v[8], rnding);
+ v[8] = _mm256_srai_epi32(v[8], bit);
+
+ v[15] = _mm256_mullo_epi32(u[8], cospi4);
+ x = _mm256_mullo_epi32(u[15], cospi60);
+ v[15] = _mm256_sub_epi32(x, v[15]);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ v[9] = _mm256_mullo_epi32(u[9], cospi28);
+ x = _mm256_mullo_epi32(u[14], cospi36);
+ v[9] = _mm256_add_epi32(v[9], x);
+ v[9] = _mm256_add_epi32(v[9], rnding);
+ v[9] = _mm256_srai_epi32(v[9], bit);
+
+ v[14] = _mm256_mullo_epi32(u[9], cospi36);
+ x = _mm256_mullo_epi32(u[14], cospi28);
+ v[14] = _mm256_sub_epi32(x, v[14]);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[10] = _mm256_mullo_epi32(u[10], cospi44);
+ x = _mm256_mullo_epi32(u[13], cospi20);
+ v[10] = _mm256_add_epi32(v[10], x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[13] = _mm256_mullo_epi32(u[10], cospi20);
+ x = _mm256_mullo_epi32(u[13], cospi44);
+ v[13] = _mm256_sub_epi32(x, v[13]);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[11] = _mm256_mullo_epi32(u[11], cospi12);
+ x = _mm256_mullo_epi32(u[12], cospi52);
+ v[11] = _mm256_add_epi32(v[11], x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = _mm256_mullo_epi32(u[11], cospi52);
+ x = _mm256_mullo_epi32(u[12], cospi12);
+ v[12] = _mm256_sub_epi32(x, v[12]);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+
+ out[0 * outstride + col] = v[0];
+ out[1 * outstride + col] = v[8];
+ out[2 * outstride + col] = v[4];
+ out[3 * outstride + col] = v[12];
+ out[4 * outstride + col] = v[2];
+ out[5 * outstride + col] = v[10];
+ out[6 * outstride + col] = v[6];
+ out[7 * outstride + col] = v[14];
+ out[8 * outstride + col] = v[1];
+ out[9 * outstride + col] = v[9];
+ out[10 * outstride + col] = v[5];
+ out[11 * outstride + col] = v[13];
+ out[12 * outstride + col] = v[3];
+ out[13 * outstride + col] = v[11];
+ out[14 * outstride + col] = v[7];
+ out[15 * outstride + col] = v[15];
+ }
+}
+static void fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+ const int num_cols, const int outstride) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
+ const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const __m256i zero = _mm256_setzero_si256();
+
+ __m256i u[16], v[16], x, y;
+ int col;
+
+ for (col = 0; col < num_cols; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = in[0 * num_cols + col];
+ u[1] = _mm256_sub_epi32(zero, in[15 * num_cols + col]);
+ u[2] = _mm256_sub_epi32(zero, in[7 * num_cols + col]);
+ u[3] = in[8 * num_cols + col];
+ u[4] = _mm256_sub_epi32(zero, in[3 * num_cols + col]);
+ u[5] = in[12 * num_cols + col];
+ u[6] = in[4 * num_cols + col];
+ u[7] = _mm256_sub_epi32(zero, in[11 * num_cols + col]);
+ u[8] = _mm256_sub_epi32(zero, in[1 * num_cols + col]);
+ u[9] = in[14 * num_cols + col];
+ u[10] = in[6 * num_cols + col];
+ u[11] = _mm256_sub_epi32(zero, in[9 * num_cols + col]);
+ u[12] = in[2 * num_cols + col];
+ u[13] = _mm256_sub_epi32(zero, in[13 * num_cols + col]);
+ u[14] = _mm256_sub_epi32(zero, in[5 * num_cols + col]);
+ u[15] = in[10 * num_cols + col];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+
+ x = _mm256_mullo_epi32(u[2], cospi32);
+ y = _mm256_mullo_epi32(u[3], cospi32);
+ v[2] = _mm256_add_epi32(x, y);
+ v[2] = _mm256_add_epi32(v[2], rnding);
+ v[2] = _mm256_srai_epi32(v[2], bit);
+
+ v[3] = _mm256_sub_epi32(x, y);
+ v[3] = _mm256_add_epi32(v[3], rnding);
+ v[3] = _mm256_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ x = _mm256_mullo_epi32(u[6], cospi32);
+ y = _mm256_mullo_epi32(u[7], cospi32);
+ v[6] = _mm256_add_epi32(x, y);
+ v[6] = _mm256_add_epi32(v[6], rnding);
+ v[6] = _mm256_srai_epi32(v[6], bit);
+
+ v[7] = _mm256_sub_epi32(x, y);
+ v[7] = _mm256_add_epi32(v[7], rnding);
+ v[7] = _mm256_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm256_mullo_epi32(u[10], cospi32);
+ y = _mm256_mullo_epi32(u[11], cospi32);
+ v[10] = _mm256_add_epi32(x, y);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[11] = _mm256_sub_epi32(x, y);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ x = _mm256_mullo_epi32(u[14], cospi32);
+ y = _mm256_mullo_epi32(u[15], cospi32);
+ v[14] = _mm256_add_epi32(x, y);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[15] = _mm256_sub_epi32(x, y);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ // stage 3
+ u[0] = _mm256_add_epi32(v[0], v[2]);
+ u[1] = _mm256_add_epi32(v[1], v[3]);
+ u[2] = _mm256_sub_epi32(v[0], v[2]);
+ u[3] = _mm256_sub_epi32(v[1], v[3]);
+ u[4] = _mm256_add_epi32(v[4], v[6]);
+ u[5] = _mm256_add_epi32(v[5], v[7]);
+ u[6] = _mm256_sub_epi32(v[4], v[6]);
+ u[7] = _mm256_sub_epi32(v[5], v[7]);
+ u[8] = _mm256_add_epi32(v[8], v[10]);
+ u[9] = _mm256_add_epi32(v[9], v[11]);
+ u[10] = _mm256_sub_epi32(v[8], v[10]);
+ u[11] = _mm256_sub_epi32(v[9], v[11]);
+ u[12] = _mm256_add_epi32(v[12], v[14]);
+ u[13] = _mm256_add_epi32(v[13], v[15]);
+ u[14] = _mm256_sub_epi32(v[12], v[14]);
+ u[15] = _mm256_sub_epi32(v[13], v[15]);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = av1_half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
+ v[5] = av1_half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
+ v[6] = av1_half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
+ v[7] = av1_half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+ v[12] = av1_half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
+ v[13] =
+ av1_half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
+ v[14] =
+ av1_half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
+ v[15] = av1_half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
+
+ // stage 5
+ u[0] = _mm256_add_epi32(v[0], v[4]);
+ u[1] = _mm256_add_epi32(v[1], v[5]);
+ u[2] = _mm256_add_epi32(v[2], v[6]);
+ u[3] = _mm256_add_epi32(v[3], v[7]);
+ u[4] = _mm256_sub_epi32(v[0], v[4]);
+ u[5] = _mm256_sub_epi32(v[1], v[5]);
+ u[6] = _mm256_sub_epi32(v[2], v[6]);
+ u[7] = _mm256_sub_epi32(v[3], v[7]);
+ u[8] = _mm256_add_epi32(v[8], v[12]);
+ u[9] = _mm256_add_epi32(v[9], v[13]);
+ u[10] = _mm256_add_epi32(v[10], v[14]);
+ u[11] = _mm256_add_epi32(v[11], v[15]);
+ u[12] = _mm256_sub_epi32(v[8], v[12]);
+ u[13] = _mm256_sub_epi32(v[9], v[13]);
+ u[14] = _mm256_sub_epi32(v[10], v[14]);
+ u[15] = _mm256_sub_epi32(v[11], v[15]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ v[8] = av1_half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
+ v[9] = av1_half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
+ v[10] = av1_half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
+ v[11] =
+ av1_half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
+ v[12] = av1_half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
+ v[13] = av1_half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
+ v[14] =
+ av1_half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
+ v[15] = av1_half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
+
+ // stage 7
+ u[0] = _mm256_add_epi32(v[0], v[8]);
+ u[1] = _mm256_add_epi32(v[1], v[9]);
+ u[2] = _mm256_add_epi32(v[2], v[10]);
+ u[3] = _mm256_add_epi32(v[3], v[11]);
+ u[4] = _mm256_add_epi32(v[4], v[12]);
+ u[5] = _mm256_add_epi32(v[5], v[13]);
+ u[6] = _mm256_add_epi32(v[6], v[14]);
+ u[7] = _mm256_add_epi32(v[7], v[15]);
+ u[8] = _mm256_sub_epi32(v[0], v[8]);
+ u[9] = _mm256_sub_epi32(v[1], v[9]);
+ u[10] = _mm256_sub_epi32(v[2], v[10]);
+ u[11] = _mm256_sub_epi32(v[3], v[11]);
+ u[12] = _mm256_sub_epi32(v[4], v[12]);
+ u[13] = _mm256_sub_epi32(v[5], v[13]);
+ u[14] = _mm256_sub_epi32(v[6], v[14]);
+ u[15] = _mm256_sub_epi32(v[7], v[15]);
+
+ // stage 8
+ v[0] = av1_half_btf_avx2(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
+ v[1] = av1_half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
+ v[2] = av1_half_btf_avx2(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
+ v[3] = av1_half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
+ v[4] = av1_half_btf_avx2(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
+ v[5] = av1_half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
+ v[6] = av1_half_btf_avx2(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
+ v[7] = av1_half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
+ v[8] = av1_half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
+ v[9] = av1_half_btf_avx2(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
+ v[10] = av1_half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
+ v[11] =
+ av1_half_btf_avx2(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
+ v[12] = av1_half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
+ v[13] =
+ av1_half_btf_avx2(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
+ v[14] = av1_half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
+ v[15] = av1_half_btf_avx2(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
+
+ // stage 9
+ out[0 * outstride + col] = v[1];
+ out[1 * outstride + col] = v[14];
+ out[2 * outstride + col] = v[3];
+ out[3 * outstride + col] = v[12];
+ out[4 * outstride + col] = v[5];
+ out[5 * outstride + col] = v[10];
+ out[6 * outstride + col] = v[7];
+ out[7 * outstride + col] = v[8];
+ out[8 * outstride + col] = v[9];
+ out[9 * outstride + col] = v[6];
+ out[10 * outstride + col] = v[11];
+ out[11 * outstride + col] = v[4];
+ out[12 * outstride + col] = v[13];
+ out[13 * outstride + col] = v[2];
+ out[14 * outstride + col] = v[15];
+ out[15 * outstride + col] = v[0];
+ }
+}
+static void idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+ int col_num, const int outstride) {
+ (void)bit;
+ (void)outstride;
+ __m256i fact = _mm256_set1_epi32(2 * NewSqrt2);
+ __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1));
+ __m256i a_low;
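+  // The 16-point identity transform scales by 2 * sqrt(2), applied in
+  // Q(NewSqrt2Bits) fixed point: (x * 2 * NewSqrt2 + offset) >> NewSqrt2Bits.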
+
+ int num_iters = 16 * col_num;
+ for (int i = 0; i < num_iters; i++) {
+ a_low = _mm256_mullo_epi32(in[i], fact);
+ a_low = _mm256_add_epi32(a_low, offset);
+ out[i] = _mm256_srai_epi32(a_low, NewSqrt2Bits);
+ }
+}
+static const transform_1d_avx2 col_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16_avx2, // DCT_DCT
+ fadst16_avx2, // ADST_DCT
+ fdct16_avx2, // DCT_ADST
+ fadst16_avx2, // ADST_ADST
+ fadst16_avx2, // FLIPADST_DCT
+ fdct16_avx2, // DCT_FLIPADST
+ fadst16_avx2, // FLIPADST_FLIPADST
+ fadst16_avx2, // ADST_FLIPADST
+ fadst16_avx2, // FLIPADST_ADST
+ idtx16_avx2, // IDTX
+ fdct16_avx2, // V_DCT
+ idtx16_avx2, // H_DCT
+ fadst16_avx2, // V_ADST
+ idtx16_avx2, // H_ADST
+ fadst16_avx2, // V_FLIPADST
+ idtx16_avx2 // H_FLIPADST
+};
+static const transform_1d_avx2 row_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8_avx2, // DCT_DCT
+ fdct8_avx2, // ADST_DCT
+ fadst8_avx2, // DCT_ADST
+ fadst8_avx2, // ADST_ADST
+ fdct8_avx2, // FLIPADST_DCT
+ fadst8_avx2, // DCT_FLIPADST
+ fadst8_avx2, // FLIPADST_FLIPADST
+ fadst8_avx2, // ADST_FLIPADST
+ fadst8_avx2, // FLIPADST_ADST
+ idtx8_avx2, // IDTX
+ idtx8_avx2, // V_DCT
+ fdct8_avx2, // H_DCT
+ idtx8_avx2, // V_ADST
+ fadst8_avx2, // H_ADST
+ idtx8_avx2, // V_FLIPADST
+ fadst8_avx2 // H_FLIPADST
+};
+void av1_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m256i in[16], out[16];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
+ const int txw_idx = get_txw_idx(TX_8X16);
+ const int txh_idx = get_txh_idx(TX_8X16);
+ const transform_1d_avx2 col_txfm = col_highbd_txfm8x16_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_highbd_txfm8x8_arr[tx_type];
+ const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
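+  // 8x16 is rectangular: after the row transform,
+  // round_shift_rect_array_32_avx2 folds the NewSqrt2 (sqrt(2) in Q12 fixed
+  // point) rectangle-scaling factor into the final -shift[2] rounding.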
+ load_buffer_8x16_avx2(input, in, stride, ud_flip, lr_flip, shift[0]);
+ col_txfm(in, out, bit, 1, 1);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ col_txfm_8x8_rounding(&out[8], -shift[1]);
+ fwd_txfm_transpose_8x8_avx2(out, in, 1, 2);
+ fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2);
+ row_txfm(in, out, bit, 2, 2);
+ round_shift_rect_array_32_avx2(out, in, 16, -shift[2], NewSqrt2);
+ store_buffer_avx2(in, coeff, 8, 16);
+ (void)bd;
+}
+static const transform_1d_avx2 col_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8_avx2, // DCT_DCT
+ fadst8_avx2, // ADST_DCT
+ fdct8_avx2, // DCT_ADST
+ fadst8_avx2, // ADST_ADST
+ fadst8_avx2, // FLIPADST_DCT
+ fdct8_avx2, // DCT_FLIPADST
+ fadst8_avx2, // FLIPADST_FLIPADST
+ fadst8_avx2, // ADST_FLIPADST
+ fadst8_avx2, // FLIPADST_ADST
+ idtx8_avx2, // IDTX
+ fdct8_avx2, // V_DCT
+ idtx8_avx2, // H_DCT
+ fadst8_avx2, // V_ADST
+ idtx8_avx2, // H_ADST
+ fadst8_avx2, // V_FLIPADST
+ idtx8_avx2 // H_FLIPADST
+};
+static const transform_1d_avx2 row_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16_avx2, // DCT_DCT
+ fdct16_avx2, // ADST_DCT
+ fadst16_avx2, // DCT_ADST
+ fadst16_avx2, // ADST_ADST
+ fdct16_avx2, // FLIPADST_DCT
+ fadst16_avx2, // DCT_FLIPADST
+ fadst16_avx2, // FLIPADST_FLIPADST
+ fadst16_avx2, // ADST_FLIPADST
+ fadst16_avx2, // FLIPADST_ADST
+ idtx16_avx2, // IDTX
+ idtx16_avx2, // V_DCT
+ fdct16_avx2, // H_DCT
+ idtx16_avx2, // V_ADST
+ fadst16_avx2, // H_ADST
+ idtx16_avx2, // V_FLIPADST
+ fadst16_avx2 // H_FLIPADST
+};
+void av1_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m256i in[16], out[16];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
+ const int txw_idx = get_txw_idx(TX_16X8);
+ const int txh_idx = get_txh_idx(TX_16X8);
+ const transform_1d_avx2 col_txfm = col_highbd_txfm8x8_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_highbd_txfm8x16_arr[tx_type];
+ const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ load_buffer_16xn_avx2(input, in, stride, 8, 2, ud_flip, lr_flip);
+ round_shift_32_8xn_avx2(in, 16, shift[0], 1);
+ col_txfm(in, out, bit, 2, 2);
+ round_shift_32_8xn_avx2(out, 16, shift[1], 1);
+ fwd_txfm_transpose_8x8_avx2(out, in, 2, 1);
+ fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1);
+ row_txfm(in, out, bit, 1, 1);
+ round_shift_rect_array_32_avx2(out, out, 16, -shift[2], NewSqrt2);
+ store_buffer_avx2(out, coeff, 8, 16);
+ (void)bd;
+}
+void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m256i in[32], out[32];
+ const TX_SIZE tx_size = TX_16X16;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const int width_div8 = (width >> 3);
+ const int width_div16 = (width >> 4);
+ const int size = (height << 1);
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case ADST_DCT:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case DCT_ADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case ADST_ADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 1);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case IDTX:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case V_DCT:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case H_DCT:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case V_ADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case H_ADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case V_FLIPADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ case H_FLIPADST:
+ load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
+ round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ fwd_txfm_transpose_16x16_avx2(out, in);
+ fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ store_buffer_avx2(out, coeff, 8, 32);
+ break;
+ default: assert(0);
+ }
+ (void)bd;
+}
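+// 32-point forward DCT as a nine-stage butterfly network. instride and
+// outstride let the same kernel walk either column-major or row-major
+// data, so it serves both passes of the 2-D transform.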
+static INLINE void fdct32_avx2(__m256i *input, __m256i *output,
+ const int8_t cos_bit, const int instride,
+ const int outstride) {
+ __m256i buf0[32];
+ __m256i buf1[32];
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int startidx = 0 * instride;
+ int endidx = 31 * instride;
+ // stage 0 (no-op)
+ // stage 1
+ buf1[0] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[31] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[1] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[30] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[2] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[29] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[3] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[28] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[4] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[27] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[5] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[26] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[6] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[25] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[7] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[24] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[8] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[23] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[9] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[22] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[10] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[21] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[11] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[20] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[12] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[19] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[13] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[18] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[14] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[17] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[15] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[16] = _mm256_sub_epi32(input[startidx], input[endidx]);
+
+ // stage 2
+ buf0[0] = _mm256_add_epi32(buf1[0], buf1[15]);
+ buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]);
+ buf0[1] = _mm256_add_epi32(buf1[1], buf1[14]);
+ buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]);
+ buf0[2] = _mm256_add_epi32(buf1[2], buf1[13]);
+ buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]);
+ buf0[3] = _mm256_add_epi32(buf1[3], buf1[12]);
+ buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]);
+ buf0[4] = _mm256_add_epi32(buf1[4], buf1[11]);
+ buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]);
+ buf0[5] = _mm256_add_epi32(buf1[5], buf1[10]);
+ buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]);
+ buf0[6] = _mm256_add_epi32(buf1[6], buf1[9]);
+ buf0[9] = _mm256_sub_epi32(buf1[6], buf1[9]);
+ buf0[7] = _mm256_add_epi32(buf1[7], buf1[8]);
+ buf0[8] = _mm256_sub_epi32(buf1[7], buf1[8]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]);
+ buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]);
+ buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]);
+ buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]);
+ buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]);
+ buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]);
+ buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]);
+ buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]);
+ buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]);
+ buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]);
+ buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]);
+ buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]);
+ buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]);
+ buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]);
+ buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]);
+ buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]);
+ buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]);
+ buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]);
+ buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]);
+ buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]);
+ buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]);
+ buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]);
+ buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]);
+
+ // stage 4
+ buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]);
+ buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]);
+ buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]);
+ buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]);
+ buf0[4] = buf1[4];
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[7] = buf1[7];
+ buf0[8] = _mm256_add_epi32(buf1[8], buf1[11]);
+ buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]);
+ buf0[9] = _mm256_add_epi32(buf1[9], buf1[10]);
+ buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]);
+ buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]);
+ buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]);
+ buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]);
+ buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ btf_32_avx2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_avx2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ btf_32_avx2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+ cos_bit);
+ btf_32_avx2_type0(cospi[16], cospi[48], buf0[3], buf0[2], buf1[2], buf1[3],
+ cos_bit);
+ buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]);
+ buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]);
+ buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]);
+ buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]);
+ buf1[8] = buf0[8];
+ btf_32_avx2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
+ cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]);
+ buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]);
+ buf1[17] = _mm256_add_epi32(buf0[17], buf0[18]);
+ buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]);
+ buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]);
+ buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]);
+ buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]);
+ buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]);
+ buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]);
+ buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]);
+ buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]);
+ buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]);
+ buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]);
+ buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]);
+ buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]);
+ buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]);
+
+ // stage 6
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_avx2_type0(cospi[8], cospi[56], buf1[7], buf1[4], buf0[4], buf0[7],
+ cos_bit);
+ btf_32_avx2_type0(cospi[40], cospi[24], buf1[6], buf1[5], buf0[5], buf0[6],
+ cos_bit);
+ buf0[8] = _mm256_add_epi32(buf1[8], buf1[9]);
+ buf0[9] = _mm256_sub_epi32(buf1[8], buf1[9]);
+ buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]);
+ buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]);
+ buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]);
+ buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]);
+ buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]);
+ buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]);
+ buf0[16] = buf1[16];
+ btf_32_avx2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_avx2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ btf_32_avx2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_avx2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ btf_32_avx2_type0(cospi[4], cospi[60], buf0[15], buf0[8], buf1[8], buf1[15],
+ cos_bit);
+ btf_32_avx2_type0(cospi[36], cospi[28], buf0[14], buf0[9], buf1[9], buf1[14],
+ cos_bit);
+ btf_32_avx2_type0(cospi[20], cospi[44], buf0[13], buf0[10], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_avx2_type0(cospi[52], cospi[12], buf0[12], buf0[11], buf1[11],
+ buf1[12], cos_bit);
+ buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]);
+ buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]);
+ buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]);
+ buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]);
+ buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]);
+ buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]);
+ buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]);
+ buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]);
+ buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]);
+ buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]);
+ buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]);
+ buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]);
+ buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]);
+ buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]);
+ buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]);
+ buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]);
+
+ // stage 8
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_avx2_type0(cospi[2], cospi[62], buf1[31], buf1[16], buf0[16], buf0[31],
+ cos_bit);
+ btf_32_avx2_type0(cospi[34], cospi[30], buf1[30], buf1[17], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_avx2_type0(cospi[18], cospi[46], buf1[29], buf1[18], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_avx2_type0(cospi[50], cospi[14], buf1[28], buf1[19], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_avx2_type0(cospi[10], cospi[54], buf1[27], buf1[20], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_avx2_type0(cospi[42], cospi[22], buf1[26], buf1[21], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_avx2_type0(cospi[26], cospi[38], buf1[25], buf1[22], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_avx2_type0(cospi[58], cospi[6], buf1[24], buf1[23], buf0[23], buf0[24],
+ cos_bit);
+
+ startidx = 0 * outstride;
+ endidx = 31 * outstride;
+ // stage 9
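+ // The butterfly results are emitted in bit-reversed index order:
+ // output[k] = buf0[bitrev5(k)].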
+ output[startidx] = buf0[0];
+ output[endidx] = buf0[31];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[16];
+ output[endidx] = buf0[15];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[8];
+ output[endidx] = buf0[23];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[24];
+ output[endidx] = buf0[7];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[4];
+ output[endidx] = buf0[27];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[20];
+ output[endidx] = buf0[11];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[12];
+ output[endidx] = buf0[19];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[28];
+ output[endidx] = buf0[3];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[2];
+ output[endidx] = buf0[29];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[18];
+ output[endidx] = buf0[13];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[10];
+ output[endidx] = buf0[21];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[26];
+ output[endidx] = buf0[5];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[6];
+ output[endidx] = buf0[25];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[22];
+ output[endidx] = buf0[9];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[14];
+ output[endidx] = buf0[17];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[30];
+ output[endidx] = buf0[1];
+}
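+// Identity transform for 32x32: the 32-point identity scale factor is
+// exactly 4, so it reduces to a left shift by 2 with no rounding needed.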
+static INLINE void idtx32x32_avx2(__m256i *input, __m256i *output,
+ const int8_t cos_bit, int instride,
+ int outstride) {
+ (void)cos_bit;
+ for (int i = 0; i < 32; i += 8) {
+ output[i * outstride] = _mm256_slli_epi32(input[i * instride], 2);
+ output[(i + 1) * outstride] =
+ _mm256_slli_epi32(input[(i + 1) * instride], 2);
+ output[(i + 2) * outstride] =
+ _mm256_slli_epi32(input[(i + 2) * instride], 2);
+ output[(i + 3) * outstride] =
+ _mm256_slli_epi32(input[(i + 3) * instride], 2);
+ output[(i + 4) * outstride] =
+ _mm256_slli_epi32(input[(i + 4) * instride], 2);
+ output[(i + 5) * outstride] =
+ _mm256_slli_epi32(input[(i + 5) * instride], 2);
+ output[(i + 6) * outstride] =
+ _mm256_slli_epi32(input[(i + 6) * instride], 2);
+ output[(i + 7) * outstride] =
+ _mm256_slli_epi32(input[(i + 7) * instride], 2);
+ }
+}
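+// Only DCT_DCT and IDTX are reachable at 32x32; every other TX_TYPE entry
+// is NULL.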
+static const transform_1d_avx2 col_txfm8x32_arr[TX_TYPES] = {
+ fdct32_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ idtx32x32_avx2, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+static const transform_1d_avx2 row_txfm8x32_arr[TX_TYPES] = {
+ fdct32_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ idtx32x32_avx2, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m256i buf0[128], buf1[128];
+ const TX_SIZE tx_size = TX_32X32;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm8x32_arr[tx_type];
+ const int width_div16 = (width >> 4);
+ const int width_div8 = (width >> 3);
+
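+ // Column pass: process the block in 16-sample-wide strips (two 8-lane
+ // vectors per row), applying the pre- and post-transform shifts in place.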
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16xn_avx2(input + (i << 4), &buf0[(i << 1)], stride, height,
+ width_div8, 0, 0);
+ round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[0], width_div8);
+ round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8);
+ col_txfm(&buf0[(i << 1)], &buf0[(i << 1)], cos_bit_col, width_div8,
+ width_div8);
+ col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[1], width_div8);
+ round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8);
+ }
+
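+ // Transpose 8x8 tiles from buf0 into buf1 so the row transform operates
+ // on contiguous vectors.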
+ for (int r = 0; r < height; r += 8) {
+ for (int c = 0; c < width_div8; c++) {
+ fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
+ &buf1[c * 8 * width_div8 + (r >> 3)],
+ width_div8, width_div8);
+ }
+ }
+
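+ // Row pass over the transposed strips, followed by the final shift[2].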
+ for (int i = 0; i < width_div16; i++) {
+ row_txfm(&buf1[(i << 1)], &buf1[(i << 1)], cos_bit_row, width_div8,
+ width_div8);
+ row_txfm(&buf1[(i << 1) + 1], &buf1[(i << 1) + 1], cos_bit_row, width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(&buf1[(i << 1)], height, shift[2], width_div8);
+ round_shift_32_8xn_avx2(&buf1[(i << 1) + 1], height, shift[2], width_div8);
+ }
+
+ store_buffer_avx2(buf1, output, 8, 128);
+}
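+// The 64-point forward DCT below is split into one helper per butterfly
+// stage: each helper consumes the previous stage's x array and produces
+// the next, with the shared broadcast cosine constants passed in from the
+// caller.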
+static INLINE void fdct64_stage2_avx2(__m256i *x1, __m256i *x2,
+ __m256i *cospi_m32, __m256i *cospi_p32,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ x2[0] = _mm256_add_epi32(x1[0], x1[31]);
+ x2[31] = _mm256_sub_epi32(x1[0], x1[31]);
+ x2[1] = _mm256_add_epi32(x1[1], x1[30]);
+ x2[30] = _mm256_sub_epi32(x1[1], x1[30]);
+ x2[2] = _mm256_add_epi32(x1[2], x1[29]);
+ x2[29] = _mm256_sub_epi32(x1[2], x1[29]);
+ x2[3] = _mm256_add_epi32(x1[3], x1[28]);
+ x2[28] = _mm256_sub_epi32(x1[3], x1[28]);
+ x2[4] = _mm256_add_epi32(x1[4], x1[27]);
+ x2[27] = _mm256_sub_epi32(x1[4], x1[27]);
+ x2[5] = _mm256_add_epi32(x1[5], x1[26]);
+ x2[26] = _mm256_sub_epi32(x1[5], x1[26]);
+ x2[6] = _mm256_add_epi32(x1[6], x1[25]);
+ x2[25] = _mm256_sub_epi32(x1[6], x1[25]);
+ x2[7] = _mm256_add_epi32(x1[7], x1[24]);
+ x2[24] = _mm256_sub_epi32(x1[7], x1[24]);
+ x2[8] = _mm256_add_epi32(x1[8], x1[23]);
+ x2[23] = _mm256_sub_epi32(x1[8], x1[23]);
+ x2[9] = _mm256_add_epi32(x1[9], x1[22]);
+ x2[22] = _mm256_sub_epi32(x1[9], x1[22]);
+ x2[10] = _mm256_add_epi32(x1[10], x1[21]);
+ x2[21] = _mm256_sub_epi32(x1[10], x1[21]);
+ x2[11] = _mm256_add_epi32(x1[11], x1[20]);
+ x2[20] = _mm256_sub_epi32(x1[11], x1[20]);
+ x2[12] = _mm256_add_epi32(x1[12], x1[19]);
+ x2[19] = _mm256_sub_epi32(x1[12], x1[19]);
+ x2[13] = _mm256_add_epi32(x1[13], x1[18]);
+ x2[18] = _mm256_sub_epi32(x1[13], x1[18]);
+ x2[14] = _mm256_add_epi32(x1[14], x1[17]);
+ x2[17] = _mm256_sub_epi32(x1[14], x1[17]);
+ x2[15] = _mm256_add_epi32(x1[15], x1[16]);
+ x2[16] = _mm256_sub_epi32(x1[15], x1[16]);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[40], x1[55], x2[40], x2[55],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[41], x1[54], x2[41], x2[54],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[42], x1[53], x2[42], x2[53],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[43], x1[52], x2[43], x2[52],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[44], x1[51], x2[44], x2[51],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[45], x1[50], x2[45], x2[50],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[46], x1[49], x2[46], x2[49],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[47], x1[48], x2[47], x2[48],
+ *__rounding, cos_bit);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+}
+static INLINE void fdct64_stage3_avx2(__m256i *x2, __m256i *x3,
+ __m256i *cospi_m32, __m256i *cospi_p32,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ x3[0] = _mm256_add_epi32(x2[0], x2[15]);
+ x3[15] = _mm256_sub_epi32(x2[0], x2[15]);
+ x3[1] = _mm256_add_epi32(x2[1], x2[14]);
+ x3[14] = _mm256_sub_epi32(x2[1], x2[14]);
+ x3[2] = _mm256_add_epi32(x2[2], x2[13]);
+ x3[13] = _mm256_sub_epi32(x2[2], x2[13]);
+ x3[3] = _mm256_add_epi32(x2[3], x2[12]);
+ x3[12] = _mm256_sub_epi32(x2[3], x2[12]);
+ x3[4] = _mm256_add_epi32(x2[4], x2[11]);
+ x3[11] = _mm256_sub_epi32(x2[4], x2[11]);
+ x3[5] = _mm256_add_epi32(x2[5], x2[10]);
+ x3[10] = _mm256_sub_epi32(x2[5], x2[10]);
+ x3[6] = _mm256_add_epi32(x2[6], x2[9]);
+ x3[9] = _mm256_sub_epi32(x2[6], x2[9]);
+ x3[7] = _mm256_add_epi32(x2[7], x2[8]);
+ x3[8] = _mm256_sub_epi32(x2[7], x2[8]);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[20], x2[27], x3[20], x3[27],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[21], x2[26], x3[21], x3[26],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[22], x2[25], x3[22], x3[25],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[23], x2[24], x3[23], x3[24],
+ *__rounding, cos_bit);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ x3[32] = _mm256_add_epi32(x2[32], x2[47]);
+ x3[47] = _mm256_sub_epi32(x2[32], x2[47]);
+ x3[33] = _mm256_add_epi32(x2[33], x2[46]);
+ x3[46] = _mm256_sub_epi32(x2[33], x2[46]);
+ x3[34] = _mm256_add_epi32(x2[34], x2[45]);
+ x3[45] = _mm256_sub_epi32(x2[34], x2[45]);
+ x3[35] = _mm256_add_epi32(x2[35], x2[44]);
+ x3[44] = _mm256_sub_epi32(x2[35], x2[44]);
+ x3[36] = _mm256_add_epi32(x2[36], x2[43]);
+ x3[43] = _mm256_sub_epi32(x2[36], x2[43]);
+ x3[37] = _mm256_add_epi32(x2[37], x2[42]);
+ x3[42] = _mm256_sub_epi32(x2[37], x2[42]);
+ x3[38] = _mm256_add_epi32(x2[38], x2[41]);
+ x3[41] = _mm256_sub_epi32(x2[38], x2[41]);
+ x3[39] = _mm256_add_epi32(x2[39], x2[40]);
+ x3[40] = _mm256_sub_epi32(x2[39], x2[40]);
+ x3[48] = _mm256_sub_epi32(x2[63], x2[48]);
+ x3[63] = _mm256_add_epi32(x2[63], x2[48]);
+ x3[49] = _mm256_sub_epi32(x2[62], x2[49]);
+ x3[62] = _mm256_add_epi32(x2[62], x2[49]);
+ x3[50] = _mm256_sub_epi32(x2[61], x2[50]);
+ x3[61] = _mm256_add_epi32(x2[61], x2[50]);
+ x3[51] = _mm256_sub_epi32(x2[60], x2[51]);
+ x3[60] = _mm256_add_epi32(x2[60], x2[51]);
+ x3[52] = _mm256_sub_epi32(x2[59], x2[52]);
+ x3[59] = _mm256_add_epi32(x2[59], x2[52]);
+ x3[53] = _mm256_sub_epi32(x2[58], x2[53]);
+ x3[58] = _mm256_add_epi32(x2[58], x2[53]);
+ x3[54] = _mm256_sub_epi32(x2[57], x2[54]);
+ x3[57] = _mm256_add_epi32(x2[57], x2[54]);
+ x3[55] = _mm256_sub_epi32(x2[56], x2[55]);
+ x3[56] = _mm256_add_epi32(x2[56], x2[55]);
+}
+static INLINE void fdct64_stage4_avx2(__m256i *x3, __m256i *x4,
+ __m256i *cospi_m32, __m256i *cospi_p32,
+ __m256i *cospi_m16, __m256i *cospi_p48,
+ __m256i *cospi_m48,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ x4[0] = _mm256_add_epi32(x3[0], x3[7]);
+ x4[7] = _mm256_sub_epi32(x3[0], x3[7]);
+ x4[1] = _mm256_add_epi32(x3[1], x3[6]);
+ x4[6] = _mm256_sub_epi32(x3[1], x3[6]);
+ x4[2] = _mm256_add_epi32(x3[2], x3[5]);
+ x4[5] = _mm256_sub_epi32(x3[2], x3[5]);
+ x4[3] = _mm256_add_epi32(x3[3], x3[4]);
+ x4[4] = _mm256_sub_epi32(x3[3], x3[4]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[10], x3[13], x4[10], x4[13],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[11], x3[12], x4[11], x4[12],
+ *__rounding, cos_bit);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ x4[16] = _mm256_add_epi32(x3[16], x3[23]);
+ x4[23] = _mm256_sub_epi32(x3[16], x3[23]);
+ x4[17] = _mm256_add_epi32(x3[17], x3[22]);
+ x4[22] = _mm256_sub_epi32(x3[17], x3[22]);
+ x4[18] = _mm256_add_epi32(x3[18], x3[21]);
+ x4[21] = _mm256_sub_epi32(x3[18], x3[21]);
+ x4[19] = _mm256_add_epi32(x3[19], x3[20]);
+ x4[20] = _mm256_sub_epi32(x3[19], x3[20]);
+ x4[24] = _mm256_sub_epi32(x3[31], x3[24]);
+ x4[31] = _mm256_add_epi32(x3[31], x3[24]);
+ x4[25] = _mm256_sub_epi32(x3[30], x3[25]);
+ x4[30] = _mm256_add_epi32(x3[30], x3[25]);
+ x4[26] = _mm256_sub_epi32(x3[29], x3[26]);
+ x4[29] = _mm256_add_epi32(x3[29], x3[26]);
+ x4[27] = _mm256_sub_epi32(x3[28], x3[27]);
+ x4[28] = _mm256_add_epi32(x3[28], x3[27]);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[36], x3[59], x4[36], x4[59],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[37], x3[58], x4[37], x4[58],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[38], x3[57], x4[38], x4[57],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[39], x3[56], x4[39], x4[56],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[40], x3[55], x4[40], x4[55],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[41], x3[54], x4[41], x4[54],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[42], x3[53], x4[42], x4[53],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[43], x3[52], x4[43], x4[52],
+ *__rounding, cos_bit);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+}
+static INLINE void fdct64_stage5_avx2(__m256i *x4, __m256i *x5,
+ __m256i *cospi_m32, __m256i *cospi_p32,
+ __m256i *cospi_m16, __m256i *cospi_p48,
+ __m256i *cospi_m48,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ x5[0] = _mm256_add_epi32(x4[0], x4[3]);
+ x5[3] = _mm256_sub_epi32(x4[0], x4[3]);
+ x5[1] = _mm256_add_epi32(x4[1], x4[2]);
+ x5[2] = _mm256_sub_epi32(x4[1], x4[2]);
+ x5[4] = x4[4];
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x4[5], x4[6], x5[5], x5[6],
+ *__rounding, cos_bit);
+ x5[7] = x4[7];
+ x5[8] = _mm256_add_epi32(x4[8], x4[11]);
+ x5[11] = _mm256_sub_epi32(x4[8], x4[11]);
+ x5[9] = _mm256_add_epi32(x4[9], x4[10]);
+ x5[10] = _mm256_sub_epi32(x4[9], x4[10]);
+ x5[12] = _mm256_sub_epi32(x4[15], x4[12]);
+ x5[15] = _mm256_add_epi32(x4[15], x4[12]);
+ x5[13] = _mm256_sub_epi32(x4[14], x4[13]);
+ x5[14] = _mm256_add_epi32(x4[14], x4[13]);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[18], x4[29], x5[18], x5[29],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[19], x4[28], x5[19], x5[28],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[20], x4[27], x5[20], x5[27],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[21], x4[26], x5[21], x5[26],
+ *__rounding, cos_bit);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ x5[32] = _mm256_add_epi32(x4[32], x4[39]);
+ x5[39] = _mm256_sub_epi32(x4[32], x4[39]);
+ x5[33] = _mm256_add_epi32(x4[33], x4[38]);
+ x5[38] = _mm256_sub_epi32(x4[33], x4[38]);
+ x5[34] = _mm256_add_epi32(x4[34], x4[37]);
+ x5[37] = _mm256_sub_epi32(x4[34], x4[37]);
+ x5[35] = _mm256_add_epi32(x4[35], x4[36]);
+ x5[36] = _mm256_sub_epi32(x4[35], x4[36]);
+ x5[40] = _mm256_sub_epi32(x4[47], x4[40]);
+ x5[47] = _mm256_add_epi32(x4[47], x4[40]);
+ x5[41] = _mm256_sub_epi32(x4[46], x4[41]);
+ x5[46] = _mm256_add_epi32(x4[46], x4[41]);
+ x5[42] = _mm256_sub_epi32(x4[45], x4[42]);
+ x5[45] = _mm256_add_epi32(x4[45], x4[42]);
+ x5[43] = _mm256_sub_epi32(x4[44], x4[43]);
+ x5[44] = _mm256_add_epi32(x4[44], x4[43]);
+ x5[48] = _mm256_add_epi32(x4[48], x4[55]);
+ x5[55] = _mm256_sub_epi32(x4[48], x4[55]);
+ x5[49] = _mm256_add_epi32(x4[49], x4[54]);
+ x5[54] = _mm256_sub_epi32(x4[49], x4[54]);
+ x5[50] = _mm256_add_epi32(x4[50], x4[53]);
+ x5[53] = _mm256_sub_epi32(x4[50], x4[53]);
+ x5[51] = _mm256_add_epi32(x4[51], x4[52]);
+ x5[52] = _mm256_sub_epi32(x4[51], x4[52]);
+ x5[56] = _mm256_sub_epi32(x4[63], x4[56]);
+ x5[63] = _mm256_add_epi32(x4[63], x4[56]);
+ x5[57] = _mm256_sub_epi32(x4[62], x4[57]);
+ x5[62] = _mm256_add_epi32(x4[62], x4[57]);
+ x5[58] = _mm256_sub_epi32(x4[61], x4[58]);
+ x5[61] = _mm256_add_epi32(x4[61], x4[58]);
+ x5[59] = _mm256_sub_epi32(x4[60], x4[59]);
+ x5[60] = _mm256_add_epi32(x4[60], x4[59]);
+}
+static INLINE void fdct64_stage6_avx2(
+ __m256i *x5, __m256i *x6, __m256i *cospi_p16, __m256i *cospi_p32,
+ __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48,
+ __m256i *cospi_m08, __m256i *cospi_p56, __m256i *cospi_m56,
+ __m256i *cospi_m40, __m256i *cospi_p24, __m256i *cospi_m24,
+ const __m256i *__rounding, int8_t cos_bit) {
+ btf_32_type0_avx2_new(*cospi_p32, *cospi_p32, x5[0], x5[1], x6[0], x6[1],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_p16, *cospi_p48, x5[3], x5[2], x6[2], x6[3],
+ *__rounding, cos_bit);
+ x6[4] = _mm256_add_epi32(x5[4], x5[5]);
+ x6[5] = _mm256_sub_epi32(x5[4], x5[5]);
+ x6[6] = _mm256_sub_epi32(x5[7], x5[6]);
+ x6[7] = _mm256_add_epi32(x5[7], x5[6]);
+ x6[8] = x5[8];
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x5[9], x5[14], x6[9], x6[14],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x5[10], x5[13], x6[10], x6[13],
+ *__rounding, cos_bit);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ x6[16] = _mm256_add_epi32(x5[16], x5[19]);
+ x6[19] = _mm256_sub_epi32(x5[16], x5[19]);
+ x6[17] = _mm256_add_epi32(x5[17], x5[18]);
+ x6[18] = _mm256_sub_epi32(x5[17], x5[18]);
+ x6[20] = _mm256_sub_epi32(x5[23], x5[20]);
+ x6[23] = _mm256_add_epi32(x5[23], x5[20]);
+ x6[21] = _mm256_sub_epi32(x5[22], x5[21]);
+ x6[22] = _mm256_add_epi32(x5[22], x5[21]);
+ x6[24] = _mm256_add_epi32(x5[24], x5[27]);
+ x6[27] = _mm256_sub_epi32(x5[24], x5[27]);
+ x6[25] = _mm256_add_epi32(x5[25], x5[26]);
+ x6[26] = _mm256_sub_epi32(x5[25], x5[26]);
+ x6[28] = _mm256_sub_epi32(x5[31], x5[28]);
+ x6[31] = _mm256_add_epi32(x5[31], x5[28]);
+ x6[29] = _mm256_sub_epi32(x5[30], x5[29]);
+ x6[30] = _mm256_add_epi32(x5[30], x5[29]);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[34], x5[61], x6[34], x6[61],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[35], x5[60], x6[35], x6[60],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[36], x5[59], x6[36], x6[59],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[37], x5[58], x6[37], x6[58],
+ *__rounding, cos_bit);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[42], x5[53], x6[42], x6[53],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[43], x5[52], x6[43], x6[52],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[44], x5[51], x6[44], x6[51],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[45], x5[50], x6[45], x6[50],
+ *__rounding, cos_bit);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+}
+static INLINE void fdct64_stage7_avx2(__m256i *x6, __m256i *x7,
+ __m256i *cospi_p08, __m256i *cospi_p56,
+ __m256i *cospi_p40, __m256i *cospi_p24,
+ __m256i *cospi_m08, __m256i *cospi_m56,
+ __m256i *cospi_m40, __m256i *cospi_m24,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ btf_32_type0_avx2_new(*cospi_p08, *cospi_p56, x6[7], x6[4], x7[4], x7[7],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_p40, *cospi_p24, x6[6], x6[5], x7[5], x7[6],
+ *__rounding, cos_bit);
+ x7[8] = _mm256_add_epi32(x6[8], x6[9]);
+ x7[9] = _mm256_sub_epi32(x6[8], x6[9]);
+ x7[10] = _mm256_sub_epi32(x6[11], x6[10]);
+ x7[11] = _mm256_add_epi32(x6[11], x6[10]);
+ x7[12] = _mm256_add_epi32(x6[12], x6[13]);
+ x7[13] = _mm256_sub_epi32(x6[12], x6[13]);
+ x7[14] = _mm256_sub_epi32(x6[15], x6[14]);
+ x7[15] = _mm256_add_epi32(x6[15], x6[14]);
+ x7[16] = x6[16];
+ btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x6[17], x6[30], x7[17], x7[30],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x6[18], x6[29], x7[18], x7[29],
+ *__rounding, cos_bit);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x6[21], x6[26], x7[21], x7[26],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x6[22], x6[25], x7[22], x7[25],
+ *__rounding, cos_bit);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ x7[32] = _mm256_add_epi32(x6[32], x6[35]);
+ x7[35] = _mm256_sub_epi32(x6[32], x6[35]);
+ x7[33] = _mm256_add_epi32(x6[33], x6[34]);
+ x7[34] = _mm256_sub_epi32(x6[33], x6[34]);
+ x7[36] = _mm256_sub_epi32(x6[39], x6[36]);
+ x7[39] = _mm256_add_epi32(x6[39], x6[36]);
+ x7[37] = _mm256_sub_epi32(x6[38], x6[37]);
+ x7[38] = _mm256_add_epi32(x6[38], x6[37]);
+ x7[40] = _mm256_add_epi32(x6[40], x6[43]);
+ x7[43] = _mm256_sub_epi32(x6[40], x6[43]);
+ x7[41] = _mm256_add_epi32(x6[41], x6[42]);
+ x7[42] = _mm256_sub_epi32(x6[41], x6[42]);
+ x7[44] = _mm256_sub_epi32(x6[47], x6[44]);
+ x7[47] = _mm256_add_epi32(x6[47], x6[44]);
+ x7[45] = _mm256_sub_epi32(x6[46], x6[45]);
+ x7[46] = _mm256_add_epi32(x6[46], x6[45]);
+ x7[48] = _mm256_add_epi32(x6[48], x6[51]);
+ x7[51] = _mm256_sub_epi32(x6[48], x6[51]);
+ x7[49] = _mm256_add_epi32(x6[49], x6[50]);
+ x7[50] = _mm256_sub_epi32(x6[49], x6[50]);
+ x7[52] = _mm256_sub_epi32(x6[55], x6[52]);
+ x7[55] = _mm256_add_epi32(x6[55], x6[52]);
+ x7[53] = _mm256_sub_epi32(x6[54], x6[53]);
+ x7[54] = _mm256_add_epi32(x6[54], x6[53]);
+ x7[56] = _mm256_add_epi32(x6[56], x6[59]);
+ x7[59] = _mm256_sub_epi32(x6[56], x6[59]);
+ x7[57] = _mm256_add_epi32(x6[57], x6[58]);
+ x7[58] = _mm256_sub_epi32(x6[57], x6[58]);
+ x7[60] = _mm256_sub_epi32(x6[63], x6[60]);
+ x7[63] = _mm256_add_epi32(x6[63], x6[60]);
+ x7[61] = _mm256_sub_epi32(x6[62], x6[61]);
+ x7[62] = _mm256_add_epi32(x6[62], x6[61]);
+}
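+// Unlike stages 2-7, which receive their broadcast cosine constants from
+// the caller, stages 8-10 build their own wider set from the cospi table.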
+static INLINE void fdct64_stage8_avx2(__m256i *x7, __m256i *x8,
+ const int32_t *cospi,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
+ __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
+ __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
+ __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
+ __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
+ __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
+ __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
+ __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
+ __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
+ __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
+ __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
+ __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
+ __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
+ __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
+ __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
+ __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
+
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+
+ btf_32_type0_avx2_new(cospi_p04, cospi_p60, x7[15], x7[8], x8[8], x8[15],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p36, cospi_p28, x7[14], x7[9], x8[9], x8[14],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p20, cospi_p44, x7[13], x7[10], x8[10], x8[13],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p52, cospi_p12, x7[12], x7[11], x8[11], x8[12],
+ *__rounding, cos_bit);
+ x8[16] = _mm256_add_epi32(x7[16], x7[17]);
+ x8[17] = _mm256_sub_epi32(x7[16], x7[17]);
+ x8[18] = _mm256_sub_epi32(x7[19], x7[18]);
+ x8[19] = _mm256_add_epi32(x7[19], x7[18]);
+ x8[20] = _mm256_add_epi32(x7[20], x7[21]);
+ x8[21] = _mm256_sub_epi32(x7[20], x7[21]);
+ x8[22] = _mm256_sub_epi32(x7[23], x7[22]);
+ x8[23] = _mm256_add_epi32(x7[23], x7[22]);
+ x8[24] = _mm256_add_epi32(x7[24], x7[25]);
+ x8[25] = _mm256_sub_epi32(x7[24], x7[25]);
+ x8[26] = _mm256_sub_epi32(x7[27], x7[26]);
+ x8[27] = _mm256_add_epi32(x7[27], x7[26]);
+ x8[28] = _mm256_add_epi32(x7[28], x7[29]);
+ x8[29] = _mm256_sub_epi32(x7[28], x7[29]);
+ x8[30] = _mm256_sub_epi32(x7[31], x7[30]);
+ x8[31] = _mm256_add_epi32(x7[31], x7[30]);
+ x8[32] = x7[32];
+ btf_32_type0_avx2_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61],
+ *__rounding, cos_bit);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ btf_32_type0_avx2_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57],
+ *__rounding, cos_bit);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ btf_32_type0_avx2_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53],
+ *__rounding, cos_bit);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ btf_32_type0_avx2_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49],
+ *__rounding, cos_bit);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+}
+static INLINE void fdct64_stage9_avx2(__m256i *x8, __m256i *x9,
+ const int32_t *cospi,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
+ __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
+ __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
+ __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
+ __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
+ __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
+ __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
+ __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
+ __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
+ __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
+ __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
+ __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
+ __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
+ __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
+ __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
+ __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
+
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ btf_32_type0_avx2_new(cospi_p02, cospi_p62, x8[31], x8[16], x9[16], x9[31],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p34, cospi_p30, x8[30], x8[17], x9[17], x9[30],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p18, cospi_p46, x8[29], x8[18], x9[18], x9[29],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p50, cospi_p14, x8[28], x8[19], x9[19], x9[28],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p10, cospi_p54, x8[27], x8[20], x9[20], x9[27],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p42, cospi_p22, x8[26], x8[21], x9[21], x9[26],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p26, cospi_p38, x8[25], x8[22], x9[22], x9[25],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p58, cospi_p06, x8[24], x8[23], x9[23], x9[24],
+ *__rounding, cos_bit);
+ x9[32] = _mm256_add_epi32(x8[32], x8[33]);
+ x9[33] = _mm256_sub_epi32(x8[32], x8[33]);
+ x9[34] = _mm256_sub_epi32(x8[35], x8[34]);
+ x9[35] = _mm256_add_epi32(x8[35], x8[34]);
+ x9[36] = _mm256_add_epi32(x8[36], x8[37]);
+ x9[37] = _mm256_sub_epi32(x8[36], x8[37]);
+ x9[38] = _mm256_sub_epi32(x8[39], x8[38]);
+ x9[39] = _mm256_add_epi32(x8[39], x8[38]);
+ x9[40] = _mm256_add_epi32(x8[40], x8[41]);
+ x9[41] = _mm256_sub_epi32(x8[40], x8[41]);
+ x9[42] = _mm256_sub_epi32(x8[43], x8[42]);
+ x9[43] = _mm256_add_epi32(x8[43], x8[42]);
+ x9[44] = _mm256_add_epi32(x8[44], x8[45]);
+ x9[45] = _mm256_sub_epi32(x8[44], x8[45]);
+ x9[46] = _mm256_sub_epi32(x8[47], x8[46]);
+ x9[47] = _mm256_add_epi32(x8[47], x8[46]);
+ x9[48] = _mm256_add_epi32(x8[48], x8[49]);
+ x9[49] = _mm256_sub_epi32(x8[48], x8[49]);
+ x9[50] = _mm256_sub_epi32(x8[51], x8[50]);
+ x9[51] = _mm256_add_epi32(x8[51], x8[50]);
+ x9[52] = _mm256_add_epi32(x8[52], x8[53]);
+ x9[53] = _mm256_sub_epi32(x8[52], x8[53]);
+ x9[54] = _mm256_sub_epi32(x8[55], x8[54]);
+ x9[55] = _mm256_add_epi32(x8[55], x8[54]);
+ x9[56] = _mm256_add_epi32(x8[56], x8[57]);
+ x9[57] = _mm256_sub_epi32(x8[56], x8[57]);
+ x9[58] = _mm256_sub_epi32(x8[59], x8[58]);
+ x9[59] = _mm256_add_epi32(x8[59], x8[58]);
+ x9[60] = _mm256_add_epi32(x8[60], x8[61]);
+ x9[61] = _mm256_sub_epi32(x8[60], x8[61]);
+ x9[62] = _mm256_sub_epi32(x8[63], x8[62]);
+ x9[63] = _mm256_add_epi32(x8[63], x8[62]);
+}
+static INLINE void fdct64_stage10_avx2(__m256i *x9, __m256i *x10,
+ const int32_t *cospi,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
+ __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
+ __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
+ __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
+ __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
+ __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
+ __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
+ __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
+ __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
+ __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
+ __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
+ __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
+ __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
+ __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
+ __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
+ __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
+ __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
+ __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
+ __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
+ __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
+ __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
+ __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
+ __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
+ __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
+ __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
+ __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
+ __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
+ __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
+ __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
+ __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
+ __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
+ __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
+
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ btf_32_type0_avx2_new(cospi_p01, cospi_p63, x9[63], x9[32], x10[32], x10[63],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p33, cospi_p31, x9[62], x9[33], x10[33], x10[62],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p17, cospi_p47, x9[61], x9[34], x10[34], x10[61],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p49, cospi_p15, x9[60], x9[35], x10[35], x10[60],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p09, cospi_p55, x9[59], x9[36], x10[36], x10[59],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p41, cospi_p23, x9[58], x9[37], x10[37], x10[58],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p25, cospi_p39, x9[57], x9[38], x10[38], x10[57],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p57, cospi_p07, x9[56], x9[39], x10[39], x10[56],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p05, cospi_p59, x9[55], x9[40], x10[40], x10[55],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p37, cospi_p27, x9[54], x9[41], x10[41], x10[54],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p21, cospi_p43, x9[53], x9[42], x10[42], x10[53],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p53, cospi_p11, x9[52], x9[43], x10[43], x10[52],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p13, cospi_p51, x9[51], x9[44], x10[44], x10[51],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p45, cospi_p19, x9[50], x9[45], x10[45], x10[50],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p29, cospi_p35, x9[49], x9[46], x10[46], x10[49],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p61, cospi_p03, x9[48], x9[47], x10[47], x10[48],
+ *__rounding, cos_bit);
+}
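+// Top-level 64-point forward DCT. Stage 1 folds input[i] against
+// input[63 - i] using running start/end indices; the remaining stages are
+// delegated to the helpers above.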
+static void fdct64_avx2(__m256i *input, __m256i *output, int8_t cos_bit,
+ const int instride, const int outstride) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
+ __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
+ __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
+ __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
+ __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
+ __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
+ __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
+ __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
+ __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
+ __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
+ __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
+ __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
+ __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
+ __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
+ __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
+
+ int startidx = 0 * instride;
+ int endidx = 63 * instride;
+ // stage 1
+ __m256i x1[64];
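+ // The unrolled statements below are equivalent to:
+ //   for (int i = 0; i < 32; ++i) {
+ //     x1[i]      = _mm256_add_epi32(input[i * instride], input[(63 - i) * instride]);
+ //     x1[63 - i] = _mm256_sub_epi32(input[i * instride], input[(63 - i) * instride]);
+ //   }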
+ x1[0] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[63] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[1] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[62] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[2] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[61] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[3] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[60] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[4] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[59] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[5] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[58] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[6] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[57] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[7] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[56] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[8] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[55] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[9] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[54] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[10] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[53] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[11] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[52] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[12] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[51] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[13] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[50] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[14] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[49] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[15] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[48] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[16] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[47] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[17] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[46] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[18] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[45] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[19] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[44] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[20] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[43] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[21] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[42] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[22] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[41] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[23] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[40] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[24] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[39] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[25] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[38] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[26] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[37] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[27] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[36] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[28] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[35] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[29] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[34] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[30] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[33] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[31] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[32] = _mm256_sub_epi32(input[startidx], input[endidx]);
+
+ // stage 2
+ __m256i x2[64];
+ fdct64_stage2_avx2(x1, x2, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
+ // stage 3
+ fdct64_stage3_avx2(x2, x1, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
+ // stage 4
+ fdct64_stage4_avx2(x1, x2, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
+ &cospi_m48, &__rounding, cos_bit);
+ // stage 5
+ fdct64_stage5_avx2(x2, x1, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
+ &cospi_m48, &__rounding, cos_bit);
+ // stage 6
+ fdct64_stage6_avx2(x1, x2, &cospi_p16, &cospi_p32, &cospi_m16, &cospi_p48,
+ &cospi_m48, &cospi_m08, &cospi_p56, &cospi_m56, &cospi_m40,
+ &cospi_p24, &cospi_m24, &__rounding, cos_bit);
+ // stage 7
+ fdct64_stage7_avx2(x2, x1, &cospi_p08, &cospi_p56, &cospi_p40, &cospi_p24,
+ &cospi_m08, &cospi_m56, &cospi_m40, &cospi_m24,
+ &__rounding, cos_bit);
+ // stage 8
+ fdct64_stage8_avx2(x1, x2, cospi, &__rounding, cos_bit);
+ // stage 9
+ fdct64_stage9_avx2(x2, x1, cospi, &__rounding, cos_bit);
+ // stage 10
+ fdct64_stage10_avx2(x1, x2, cospi, &__rounding, cos_bit);
+
+ startidx = 0 * outstride;
+ endidx = 63 * outstride;
+
+ // stage 11
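+ // Coefficients are emitted in bit-reversed index order:
+ // output[r * outstride] = x2[bitrev6(r)], e.g. row 1 <- x2[32], row 2 <- x2[16].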
+ output[startidx] = x2[0];
+ output[endidx] = x2[63];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[32];
+ output[endidx] = x2[31];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[16];
+ output[endidx] = x2[47];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[48];
+ output[endidx] = x2[15];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[8];
+ output[endidx] = x2[55];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[40];
+ output[endidx] = x2[23];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[24];
+ output[endidx] = x2[39];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[56];
+ output[endidx] = x2[7];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[4];
+ output[endidx] = x2[59];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[36];
+ output[endidx] = x2[27];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[20];
+ output[endidx] = x2[43];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[52];
+ output[endidx] = x2[11];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[12];
+ output[endidx] = x2[51];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[44];
+ output[endidx] = x2[19];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[28];
+ output[endidx] = x2[35];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[60];
+ output[endidx] = x2[3];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[2];
+ output[endidx] = x2[61];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[34];
+ output[endidx] = x2[29];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[18];
+ output[endidx] = x2[45];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[50];
+ output[endidx] = x2[13];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[10];
+ output[endidx] = x2[53];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[42];
+ output[endidx] = x2[21];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[26];
+ output[endidx] = x2[37];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[58];
+ output[endidx] = x2[5];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[6];
+ output[endidx] = x2[57];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[38];
+ output[endidx] = x2[25];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[22];
+ output[endidx] = x2[41];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[54];
+ output[endidx] = x2[9];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[14];
+ output[endidx] = x2[49];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[46];
+ output[endidx] = x2[17];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[30];
+ output[endidx] = x2[33];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[62];
+ output[endidx] = x2[1];
+}
+void av1_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X64;
+ __m256i buf0[512], buf1[512];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct64_avx2;
+ const transform_1d_avx2 row_txfm = fdct64_avx2;
+ const int width_div16 = (width >> 4);
+ const int width_div8 = (width >> 3);
+ int r, c;
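+ // Column pass: each iteration covers one 16-column slab as two 8-lane
+ // register groups: load, pre-scale by shift[0], 64-point column DCT, then
+ // scale by shift[1].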
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16xn_avx2(input + (i << 4), &buf0[i << 1], stride, height,
+ width_div8, 0, 0);
+ round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[0], width_div8);
+ round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8);
+ col_txfm(&buf0[i << 1], &buf0[i << 1], cos_bit_col, width_div8, width_div8);
+ col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8,
+ width_div8);
+ round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[1], width_div8);
+ round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8);
+ }
+
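+ // Transpose in 8x8 tiles so the row pass can run the same 1-D kernel down
+ // the columns of buf1.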
+ for (r = 0; r < height; r += 8) {
+ for (c = 0; c < width_div8; c++) {
+ fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
+ &buf1[c * 8 * width_div8 + (r >> 3)],
+ width_div8, width_div8);
+ }
+ }
+
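+ // Row pass over the first 32 rows only (i < 2 covers 4 of the 8 register
+ // columns): AV1 keeps just the top-left 32x32 coefficients of a 64x64
+ // transform, and store_buffer_avx2() below writes exactly those 128
+ // registers.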
+ for (int i = 0; i < 2; i++) {
+ row_txfm(&buf1[i << 1], &buf0[i << 1], cos_bit_row, width_div8,
+ width_div16);
+ row_txfm(&buf1[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_row, width_div8,
+ width_div16);
+ round_shift_32_8xn_avx2(&buf0[i << 1], (height >> 1), shift[2],
+ width_div16);
+ round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], (height >> 1), shift[2],
+ width_div16);
+ }
+
+ store_buffer_avx2(buf0, output, 8, 128);
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
new file mode 100644
index 0000000000..158b4ae439
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -0,0 +1,2629 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+static INLINE void store_output_w4(int32_t *const out, const __m128i *const in,
+ const int stride, const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ _mm_store_si128((__m128i *)(out + i * stride), in[i]);
+ }
+}
+
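+// 4x4 Walsh-Hadamard transform, used for lossless blocks: two identical 1-D
+// passes with a transpose in between, then a left shift by UNIT_QUANT_SHIFT
+// to scale the result for the quantizer.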
+void av1_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output, int stride) {
+ __m128i in[4];
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+
+ // Convert to int32_t.
+ __m128i op[4];
+ op[0] = _mm_cvtepi16_epi32(in[0]);
+ op[1] = _mm_cvtepi16_epi32(in[1]);
+ op[2] = _mm_cvtepi16_epi32(in[2]);
+ op[3] = _mm_cvtepi16_epi32(in[3]);
+
+ for (int i = 0; i < 2; ++i) {
+ __m128i a1 = op[0];
+ __m128i b1 = op[1];
+ __m128i c1 = op[2];
+ __m128i d1 = op[3];
+ __m128i e1;
+
+ a1 = _mm_add_epi32(a1, b1); // a1 += b1
+ d1 = _mm_sub_epi32(d1, c1); // d1 = d1 - c1
+ e1 = _mm_sub_epi32(a1, d1); // e1 = (a1 - d1) >> 1
+ e1 = _mm_srai_epi32(e1, 1);
+ b1 = _mm_sub_epi32(e1, b1); // b1 = e1 - b1
+ c1 = _mm_sub_epi32(e1, c1); // c1 = e1 - c1
+ a1 = _mm_sub_epi32(a1, c1); // a1 -= c1
+ d1 = _mm_add_epi32(d1, b1); // d1 += b1
+
+ op[0] = a1;
+ op[1] = c1;
+ op[2] = d1;
+ op[3] = b1;
+
+ if (i == 0) {
+ transpose_32bit_4x4(op, op);
+ }
+ }
+
+ op[0] = _mm_slli_epi32(op[0], UNIT_QUANT_SHIFT);
+ op[1] = _mm_slli_epi32(op[1], UNIT_QUANT_SHIFT);
+ op[2] = _mm_slli_epi32(op[2], UNIT_QUANT_SHIFT);
+ op[3] = _mm_slli_epi32(op[3], UNIT_QUANT_SHIFT);
+
+ _mm_storeu_si128((__m128i *)(output + 0), op[0]);
+ _mm_storeu_si128((__m128i *)(output + 4), op[1]);
+ _mm_storeu_si128((__m128i *)(output + 8), op[2]);
+ _mm_storeu_si128((__m128i *)(output + 12), op[3]);
+}
+
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ if (!flipud) {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ } else {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+ in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+ }
+
+ in[0] = _mm_cvtepi16_epi32(in[0]);
+ in[1] = _mm_cvtepi16_epi32(in[1]);
+ in[2] = _mm_cvtepi16_epi32(in[2]);
+ in[3] = _mm_cvtepi16_epi32(in[3]);
+
+ in[0] = _mm_slli_epi32(in[0], shift);
+ in[1] = _mm_slli_epi32(in[1], shift);
+ in[2] = _mm_slli_epi32(in[2], shift);
+ in[3] = _mm_slli_epi32(in[3], shift);
+}
+
+// Only the stage-2 cos bit is consumed here; the shifts are applied
+// elsewhere:
+// shift[0] is applied in load_buffer_4x4(),
+// shift[1] is applied after the column transform,
+// shift[2] is applied after the row transform.
+static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int num_col) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i s0, s1, s2, s3;
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ int endidx = 3 * num_col;
+ s0 = _mm_add_epi32(in[0], in[endidx]);
+ s3 = _mm_sub_epi32(in[0], in[endidx]);
+ endidx -= num_col;
+ s1 = _mm_add_epi32(in[num_col], in[endidx]);
+ s2 = _mm_sub_epi32(in[num_col], in[endidx]);
+
+ // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
+ u0 = _mm_mullo_epi32(s0, cospi32);
+ u1 = _mm_mullo_epi32(s1, cospi32);
+ u2 = _mm_add_epi32(u0, u1);
+ v0 = _mm_sub_epi32(u0, u1);
+
+ u3 = _mm_add_epi32(u2, rnding);
+ v1 = _mm_add_epi32(v0, rnding);
+
+ u0 = _mm_srai_epi32(u3, bit);
+ u2 = _mm_srai_epi32(v1, bit);
+
+ // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
+ v0 = _mm_mullo_epi32(s2, cospi48);
+ v1 = _mm_mullo_epi32(s3, cospi16);
+ v2 = _mm_add_epi32(v0, v1);
+
+ v3 = _mm_add_epi32(v2, rnding);
+ u1 = _mm_srai_epi32(v3, bit);
+
+ v0 = _mm_mullo_epi32(s2, cospi16);
+ v1 = _mm_mullo_epi32(s3, cospi48);
+ v2 = _mm_sub_epi32(v1, v0);
+
+ v3 = _mm_add_epi32(v2, rnding);
+ u3 = _mm_srai_epi32(v3, bit);
+
+ // Note: shift[1] and shift[2] are zero for TX_4X4, so no extra rounding is
+ // needed here.
+
+ out[0] = u0;
+ out[1] = u1;
+ out[2] = u2;
+ out[3] = u3;
+}
+
+static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
+ _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
+ _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
+ _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
+ _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
+}
+
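+// 4-point forward ADST: computed directly from the sinpi_arr() constants
+// (the products s0..s7 are folded into x0..x3) rather than as butterfly
+// stages.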
+static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int num_col) {
+ const int32_t *sinpi = sinpi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
+ const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
+ const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
+ const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
+ __m128i t;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i x0, x1, x2, x3;
+ __m128i u0, u1, u2, u3;
+
+ int idx = 0 * num_col;
+ s0 = _mm_mullo_epi32(in[idx], sinpi1);
+ s1 = _mm_mullo_epi32(in[idx], sinpi4);
+ t = _mm_add_epi32(in[idx], in[idx + num_col]);
+ idx += num_col;
+ s2 = _mm_mullo_epi32(in[idx], sinpi2);
+ s3 = _mm_mullo_epi32(in[idx], sinpi1);
+ idx += num_col;
+ s4 = _mm_mullo_epi32(in[idx], sinpi3);
+ idx += num_col;
+ s5 = _mm_mullo_epi32(in[idx], sinpi4);
+ s6 = _mm_mullo_epi32(in[idx], sinpi2);
+ s7 = _mm_sub_epi32(t, in[idx]);
+
+ t = _mm_add_epi32(s0, s2);
+ x0 = _mm_add_epi32(t, s5);
+ x1 = _mm_mullo_epi32(s7, sinpi3);
+ t = _mm_sub_epi32(s1, s3);
+ x2 = _mm_add_epi32(t, s6);
+ x3 = s4;
+
+ s0 = _mm_add_epi32(x0, x3);
+ s1 = x1;
+ s2 = _mm_sub_epi32(x2, x3);
+ t = _mm_sub_epi32(x2, x0);
+ s3 = _mm_add_epi32(t, x3);
+
+ u0 = _mm_add_epi32(s0, rnding);
+ u0 = _mm_srai_epi32(u0, bit);
+
+ u1 = _mm_add_epi32(s1, rnding);
+ u1 = _mm_srai_epi32(u1, bit);
+
+ u2 = _mm_add_epi32(s2, rnding);
+ u2 = _mm_srai_epi32(u2, bit);
+
+ u3 = _mm_add_epi32(s3, rnding);
+ u3 = _mm_srai_epi32(u3, bit);
+
+ out[0] = u0;
+ out[1] = u1;
+ out[2] = u2;
+ out[3] = u3;
+}
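+
+// 4-point identity transform: scale by sqrt(2) in fixed point. Per lane this
+// is out = (in * NewSqrt2 + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits, with
+// the usual AOM constants NewSqrt2 = 5793 and NewSqrt2Bits = 12.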
+static void idtx4x4_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+ (void)bit;
+ __m128i fact = _mm_set1_epi32(NewSqrt2);
+ __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+ __m128i a_low;
+
+ for (int i = 0; i < 4; i++) {
+ a_low = _mm_mullo_epi32(in[i * col_num], fact);
+ a_low = _mm_add_epi32(a_low, offset);
+ out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits);
+ }
+}
+void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
+ int input_stride, TX_TYPE tx_type, int bd) {
+ __m128i in[4];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
+ const int txw_idx = get_txw_idx(TX_4X4);
+ const int txh_idx = get_txh_idx(TX_4X4);
+
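+ // All cases share one separable pattern: load (with vertical/horizontal
+ // flips for the FLIPADST variants), column transform, 4x4 transpose, row
+ // transform, store. Only the kernels and flip flags differ per case.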
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case ADST_DCT:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case DCT_ADST:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case ADST_ADST:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
+ fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case IDTX:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case V_DCT:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case H_DCT:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case V_ADST:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case H_ADST:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case V_FLIPADST:
+ load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ case H_FLIPADST:
+ load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
+ idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ transpose_32bit_4x4(in, in);
+ fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+ write_buffer_4x4(in, coeff);
+ break;
+ default: assert(0);
+ }
+ (void)bd;
+}
+
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ __m128i u;
+ if (!flipud) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ } else {
+ in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ in[4] = mm_reverse_epi16(in[4]);
+ in[5] = mm_reverse_epi16(in[5]);
+ in[6] = mm_reverse_epi16(in[6]);
+ in[7] = mm_reverse_epi16(in[7]);
+ }
+
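+ // Widen the 16-bit rows to 32 bits, two __m128i per row. Rows 4-7 are
+ // expanded first into in[8..15], then rows 3..0 in descending order, so no
+ // source register is overwritten before it has been read.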
+ u = _mm_unpackhi_epi64(in[4], in[4]);
+ in[8] = _mm_cvtepi16_epi32(in[4]);
+ in[9] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[5], in[5]);
+ in[10] = _mm_cvtepi16_epi32(in[5]);
+ in[11] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[6], in[6]);
+ in[12] = _mm_cvtepi16_epi32(in[6]);
+ in[13] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[7], in[7]);
+ in[14] = _mm_cvtepi16_epi32(in[7]);
+ in[15] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[3], in[3]);
+ in[6] = _mm_cvtepi16_epi32(in[3]);
+ in[7] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[2], in[2]);
+ in[4] = _mm_cvtepi16_epi32(in[2]);
+ in[5] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[1], in[1]);
+ in[2] = _mm_cvtepi16_epi32(in[1]);
+ in[3] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[0], in[0]);
+ in[0] = _mm_cvtepi16_epi32(in[0]);
+ in[1] = _mm_cvtepi16_epi32(u);
+
+ in[0] = _mm_slli_epi32(in[0], shift);
+ in[1] = _mm_slli_epi32(in[1], shift);
+ in[2] = _mm_slli_epi32(in[2], shift);
+ in[3] = _mm_slli_epi32(in[3], shift);
+ in[4] = _mm_slli_epi32(in[4], shift);
+ in[5] = _mm_slli_epi32(in[5], shift);
+ in[6] = _mm_slli_epi32(in[6], shift);
+ in[7] = _mm_slli_epi32(in[7], shift);
+
+ in[8] = _mm_slli_epi32(in[8], shift);
+ in[9] = _mm_slli_epi32(in[9], shift);
+ in[10] = _mm_slli_epi32(in[10], shift);
+ in[11] = _mm_slli_epi32(in[11], shift);
+ in[12] = _mm_slli_epi32(in[12], shift);
+ in[13] = _mm_slli_epi32(in[13], shift);
+ in[14] = _mm_slli_epi32(in[14], shift);
+ in[15] = _mm_slli_epi32(in[15], shift);
+}
+
+static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
+ const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+
+ in[0] = _mm_add_epi32(in[0], rounding);
+ in[1] = _mm_add_epi32(in[1], rounding);
+ in[2] = _mm_add_epi32(in[2], rounding);
+ in[3] = _mm_add_epi32(in[3], rounding);
+ in[4] = _mm_add_epi32(in[4], rounding);
+ in[5] = _mm_add_epi32(in[5], rounding);
+ in[6] = _mm_add_epi32(in[6], rounding);
+ in[7] = _mm_add_epi32(in[7], rounding);
+ in[8] = _mm_add_epi32(in[8], rounding);
+ in[9] = _mm_add_epi32(in[9], rounding);
+ in[10] = _mm_add_epi32(in[10], rounding);
+ in[11] = _mm_add_epi32(in[11], rounding);
+ in[12] = _mm_add_epi32(in[12], rounding);
+ in[13] = _mm_add_epi32(in[13], rounding);
+ in[14] = _mm_add_epi32(in[14], rounding);
+ in[15] = _mm_add_epi32(in[15], rounding);
+
+ in[0] = _mm_srai_epi32(in[0], shift);
+ in[1] = _mm_srai_epi32(in[1], shift);
+ in[2] = _mm_srai_epi32(in[2], shift);
+ in[3] = _mm_srai_epi32(in[3], shift);
+ in[4] = _mm_srai_epi32(in[4], shift);
+ in[5] = _mm_srai_epi32(in[5], shift);
+ in[6] = _mm_srai_epi32(in[6], shift);
+ in[7] = _mm_srai_epi32(in[7], shift);
+ in[8] = _mm_srai_epi32(in[8], shift);
+ in[9] = _mm_srai_epi32(in[9], shift);
+ in[10] = _mm_srai_epi32(in[10], shift);
+ in[11] = _mm_srai_epi32(in[11], shift);
+ in[12] = _mm_srai_epi32(in[12], shift);
+ in[13] = _mm_srai_epi32(in[13], shift);
+ in[14] = _mm_srai_epi32(in[14], shift);
+ in[15] = _mm_srai_epi32(in[15], shift);
+}
+
+static INLINE void col_txfm_4x8_rounding(__m128i *in, int shift) {
+ const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+
+ in[0] = _mm_add_epi32(in[0], rounding);
+ in[1] = _mm_add_epi32(in[1], rounding);
+ in[2] = _mm_add_epi32(in[2], rounding);
+ in[3] = _mm_add_epi32(in[3], rounding);
+ in[4] = _mm_add_epi32(in[4], rounding);
+ in[5] = _mm_add_epi32(in[5], rounding);
+ in[6] = _mm_add_epi32(in[6], rounding);
+ in[7] = _mm_add_epi32(in[7], rounding);
+
+ in[0] = _mm_srai_epi32(in[0], shift);
+ in[1] = _mm_srai_epi32(in[1], shift);
+ in[2] = _mm_srai_epi32(in[2], shift);
+ in[3] = _mm_srai_epi32(in[3], shift);
+ in[4] = _mm_srai_epi32(in[4], shift);
+ in[5] = _mm_srai_epi32(in[5], shift);
+ in[6] = _mm_srai_epi32(in[6], shift);
+ in[7] = _mm_srai_epi32(in[7], shift);
+}
+
+static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) {
+ _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
+ _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
+ _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
+ _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
+
+ _mm_store_si128((__m128i *)(output + 4 * 4), res[4]);
+ _mm_store_si128((__m128i *)(output + 5 * 4), res[5]);
+ _mm_store_si128((__m128i *)(output + 6 * 4), res[6]);
+ _mm_store_si128((__m128i *)(output + 7 * 4), res[7]);
+
+ _mm_store_si128((__m128i *)(output + 8 * 4), res[8]);
+ _mm_store_si128((__m128i *)(output + 9 * 4), res[9]);
+ _mm_store_si128((__m128i *)(output + 10 * 4), res[10]);
+ _mm_store_si128((__m128i *)(output + 11 * 4), res[11]);
+
+ _mm_store_si128((__m128i *)(output + 12 * 4), res[12]);
+ _mm_store_si128((__m128i *)(output + 13 * 4), res[13]);
+ _mm_store_si128((__m128i *)(output + 14 * 4), res[14]);
+ _mm_store_si128((__m128i *)(output + 15 * 4), res[15]);
+}
+
+static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output,
+ const int stride) {
+ _mm_storeu_si128((__m128i *)(output), res[0]);
+ _mm_storeu_si128((__m128i *)(output + 4), res[1]);
+ _mm_storeu_si128((__m128i *)(output + stride), res[2]);
+ _mm_storeu_si128((__m128i *)(output + stride + 4), res[3]);
+
+ _mm_storeu_si128((__m128i *)(output + (stride * 2)), res[4]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 2) + 4), res[5]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 3)), res[6]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 3) + 4), res[7]);
+
+ _mm_storeu_si128((__m128i *)(output + (stride * 4)), res[8]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 4) + 4), res[9]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 5)), res[10]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 5) + 4), res[11]);
+
+ _mm_storeu_si128((__m128i *)(output + (stride * 6)), res[12]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 6) + 4), res[13]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 7)), res[14]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 7) + 4), res[15]);
+}
+
+static void fdct4x8_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[8], v[8];
+
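+ // One 8-point DCT over four lanes; outputs land directly in bit-reversed
+ // positions, i.e. out[r * col_num] = buf0[bitrev3(r)] (see the buf0[]
+ // comments below).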
+ int startidx = 0 * col_num;
+ int endidx = 7 * col_num;
+ // Indices step by col_num; with col_num == 2 this visits the even
+ // registers 0, 2, ..., 14.
+ // stage 0
+ // stage 1
+ u[0] = _mm_add_epi32(in[startidx], in[endidx]);
+ v[7] = _mm_sub_epi32(in[startidx], in[endidx]); // v[7]
+ startidx += col_num;
+ endidx -= col_num;
+ u[1] = _mm_add_epi32(in[startidx], in[endidx]);
+ u[6] = _mm_sub_epi32(in[startidx], in[endidx]);
+ startidx += col_num;
+ endidx -= col_num;
+ u[2] = _mm_add_epi32(in[startidx], in[endidx]);
+ u[5] = _mm_sub_epi32(in[startidx], in[endidx]);
+ startidx += col_num;
+ endidx -= col_num;
+ u[3] = _mm_add_epi32(in[startidx], in[endidx]);
+ v[4] = _mm_sub_epi32(in[startidx], in[endidx]); // v[4]
+
+ // stage 2
+ v[0] = _mm_add_epi32(u[0], u[3]);
+ v[3] = _mm_sub_epi32(u[0], u[3]);
+ v[1] = _mm_add_epi32(u[1], u[2]);
+ v[2] = _mm_sub_epi32(u[1], u[2]);
+
+ v[5] = _mm_mullo_epi32(u[5], cospim32);
+ v[6] = _mm_mullo_epi32(u[6], cospi32);
+ v[5] = _mm_add_epi32(v[5], v[6]);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ u[0] = _mm_mullo_epi32(u[5], cospi32);
+ v[6] = _mm_mullo_epi32(u[6], cospim32);
+ v[6] = _mm_sub_epi32(u[0], v[6]);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ // stage 3
+ // type 0
+ v[0] = _mm_mullo_epi32(v[0], cospi32);
+ v[1] = _mm_mullo_epi32(v[1], cospi32);
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_sub_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // type 1
+ v[0] = _mm_mullo_epi32(v[2], cospi48);
+ v[1] = _mm_mullo_epi32(v[3], cospi16);
+ u[2] = _mm_add_epi32(v[0], v[1]);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ v[0] = _mm_mullo_epi32(v[2], cospi16);
+ v[1] = _mm_mullo_epi32(v[3], cospi48);
+ u[3] = _mm_sub_epi32(v[1], v[0]);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ u[4] = _mm_add_epi32(v[4], v[5]);
+ u[5] = _mm_sub_epi32(v[4], v[5]);
+ u[6] = _mm_sub_epi32(v[7], v[6]);
+ u[7] = _mm_add_epi32(v[7], v[6]);
+
+ // stage 4
+ // stage 5
+ v[0] = _mm_mullo_epi32(u[4], cospi56);
+ v[1] = _mm_mullo_epi32(u[7], cospi8);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[1 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[4]
+
+ v[0] = _mm_mullo_epi32(u[4], cospi8);
+ v[1] = _mm_mullo_epi32(u[7], cospi56);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[7 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[7]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi24);
+ v[1] = _mm_mullo_epi32(u[6], cospi40);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[5 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[5]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi40);
+ v[1] = _mm_mullo_epi32(u[6], cospi24);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[3 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[6]
+
+ out[0 * col_num] = u[0]; // buf0[0]
+ out[4 * col_num] = u[1]; // buf0[1]
+ out[2 * col_num] = u[2]; // buf0[2]
+ out[6 * col_num] = u[3]; // buf0[3]
+}
+
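+// fdct8x8 = two interleaved fdct4x8 passes: load_buffer_8x8() puts lanes 0-3
+// of each row in the even registers and lanes 4-7 in the odd ones.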
+static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ fdct4x8_sse4_1(in, out, bit, col_num);
+ fdct4x8_sse4_1(in + 1, out + 1, bit, col_num);
+}
+
+static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+ int col;
+
+ // Note:
+ // Even columns: 0, 2, ..., 14
+ // Odd columns: 1, 3, ..., 15
+ // One even column and one odd column together form one row (8 coeffs);
+ // in total there are 8 rows (8x8).
+ for (col = 0; col < col_num; ++col) {
+ // stage 0
+ // stage 1
+ u0 = in[col_num * 0 + col];
+ u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]);
+ u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]);
+ u3 = in[col_num * 4 + col];
+ u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]);
+ u5 = in[col_num * 6 + col];
+ u6 = in[col_num * 2 + col];
+ u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]);
+
+ // stage 2
+ v0 = u0;
+ v1 = u1;
+
+ x = _mm_mullo_epi32(u2, cospi32);
+ y = _mm_mullo_epi32(u3, cospi32);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ v3 = _mm_sub_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ v4 = u4;
+ v5 = u5;
+
+ x = _mm_mullo_epi32(u6, cospi32);
+ y = _mm_mullo_epi32(u7, cospi32);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ v7 = _mm_sub_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 3
+ u0 = _mm_add_epi32(v0, v2);
+ u1 = _mm_add_epi32(v1, v3);
+ u2 = _mm_sub_epi32(v0, v2);
+ u3 = _mm_sub_epi32(v1, v3);
+ u4 = _mm_add_epi32(v4, v6);
+ u5 = _mm_add_epi32(v5, v7);
+ u6 = _mm_sub_epi32(v4, v6);
+ u7 = _mm_sub_epi32(v5, v7);
+
+ // stage 4
+ v0 = u0;
+ v1 = u1;
+ v2 = u2;
+ v3 = u3;
+
+ x = _mm_mullo_epi32(u4, cospi16);
+ y = _mm_mullo_epi32(u5, cospi48);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi48);
+ y = _mm_mullo_epi32(u5, cospim16);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospim48);
+ y = _mm_mullo_epi32(u7, cospi16);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi16);
+ y = _mm_mullo_epi32(u7, cospi48);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 5
+ u0 = _mm_add_epi32(v0, v4);
+ u1 = _mm_add_epi32(v1, v5);
+ u2 = _mm_add_epi32(v2, v6);
+ u3 = _mm_add_epi32(v3, v7);
+ u4 = _mm_sub_epi32(v0, v4);
+ u5 = _mm_sub_epi32(v1, v5);
+ u6 = _mm_sub_epi32(v2, v6);
+ u7 = _mm_sub_epi32(v3, v7);
+
+ // stage 6
+ x = _mm_mullo_epi32(u0, cospi4);
+ y = _mm_mullo_epi32(u1, cospi60);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ x = _mm_mullo_epi32(u0, cospi60);
+ y = _mm_mullo_epi32(u1, cospim4);
+ v1 = _mm_add_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi20);
+ y = _mm_mullo_epi32(u3, cospi44);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi44);
+ y = _mm_mullo_epi32(u3, cospim20);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ x = _mm_mullo_epi32(u4, cospi36);
+ y = _mm_mullo_epi32(u5, cospi28);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi28);
+ y = _mm_mullo_epi32(u5, cospim36);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospi52);
+ y = _mm_mullo_epi32(u7, cospi12);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi12);
+ y = _mm_mullo_epi32(u7, cospim52);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 7
+ out[col_num * 0 + col] = v1;
+ out[col_num * 1 + col] = v6;
+ out[col_num * 2 + col] = v3;
+ out[col_num * 3 + col] = v4;
+ out[col_num * 4 + col] = v5;
+ out[col_num * 5 + col] = v2;
+ out[col_num * 6 + col] = v7;
+ out[col_num * 7 + col] = v0;
+ }
+}
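+
+// 8-point identity transform: the required gain at this size is exactly 2,
+// so each value is simply doubled (x + x) with no rounding needed.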
+static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+ (void)bit;
+
+ for (int i = 0; i < col_num; i += 1) {
+ out[0 + 8 * i] = _mm_add_epi32(in[0 + 8 * i], in[0 + 8 * i]);
+ out[1 + 8 * i] = _mm_add_epi32(in[1 + 8 * i], in[1 + 8 * i]);
+ out[2 + 8 * i] = _mm_add_epi32(in[2 + 8 * i], in[2 + 8 * i]);
+ out[3 + 8 * i] = _mm_add_epi32(in[3 + 8 * i], in[3 + 8 * i]);
+ out[4 + 8 * i] = _mm_add_epi32(in[4 + 8 * i], in[4 + 8 * i]);
+ out[5 + 8 * i] = _mm_add_epi32(in[5 + 8 * i], in[5 + 8 * i]);
+ out[6 + 8 * i] = _mm_add_epi32(in[6 + 8 * i], in[6 + 8 * i]);
+ out[7 + 8 * i] = _mm_add_epi32(in[7 + 8 * i], in[7 + 8 * i]);
+ }
+}
+#if !CONFIG_REALTIME_ONLY
+static void idtx32x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+ (void)bit;
+ (void)col_num;
+ for (int j = 0; j < 2; j++) {
+ out[j + 8 * 0] = _mm_add_epi32(in[j + 8 * 0], in[j + 8 * 0]);
+ out[j + 8 * 1] = _mm_add_epi32(in[j + 8 * 1], in[j + 8 * 1]);
+ out[j + 8 * 2] = _mm_add_epi32(in[j + 8 * 2], in[j + 8 * 2]);
+ out[j + 8 * 3] = _mm_add_epi32(in[j + 8 * 3], in[j + 8 * 3]);
+ out[j + 8 * 4] = _mm_add_epi32(in[j + 8 * 4], in[j + 8 * 4]);
+ out[j + 8 * 5] = _mm_add_epi32(in[j + 8 * 5], in[j + 8 * 5]);
+ out[j + 8 * 6] = _mm_add_epi32(in[j + 8 * 6], in[j + 8 * 6]);
+ out[j + 8 * 7] = _mm_add_epi32(in[j + 8 * 7], in[j + 8 * 7]);
+ }
+}
+#endif
+void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m128i in[16], out[16];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
+ const int txw_idx = get_txw_idx(TX_8X8);
+ const int txh_idx = get_txh_idx(TX_8X8);
+
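+ // Same separable pattern as the 4x4 version above, plus an explicit
+ // col_txfm_8x8_rounding() step: shift[1] is nonzero for TX_8X8, so the
+ // intermediate must be rounded between the two passes.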
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case ADST_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case DCT_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case ADST_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+ fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case IDTX:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case V_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case H_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case V_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case H_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case V_FLIPADST:
+ load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ case H_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+ idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2);
+ write_buffer_8x8(out, coeff);
+ break;
+ default: assert(0);
+ }
+ (void)bd;
+}
+
+// Hybrid Transform 16x16
+
+static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) {
+ int row_index = 0;
+ int dst_index = 0;
+ int src_index = 0;
+
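+ // Each 16-wide output row takes two registers from the left 8x8 block
+ // (in[src], in[src + 1]) and two from the right (in[src + 16],
+ // in[src + 17]). After row 7, src skips 16 to step over the top-right block
+ // into the bottom-left one.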
+ // row 0, 1, ..., 7
+ do {
+ out[dst_index] = in[src_index];
+ out[dst_index + 1] = in[src_index + 1];
+ out[dst_index + 2] = in[src_index + 16];
+ out[dst_index + 3] = in[src_index + 17];
+ dst_index += 4;
+ src_index += 2;
+ row_index += 1;
+ } while (row_index < 8);
+
+ // row 8, 9, ..., 15
+ src_index += 16;
+ do {
+ out[dst_index] = in[src_index];
+ out[dst_index + 1] = in[src_index + 1];
+ out[dst_index + 2] = in[src_index + 16];
+ out[dst_index + 3] = in[src_index + 17];
+ dst_index += 4;
+ src_index += 2;
+ row_index += 1;
+ } while (row_index < 16);
+}
+
+static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ __m128i in[64];
+ // Load 4 8x8 blocks
+ const int16_t *topL = input;
+ const int16_t *topR = input + 8;
+ const int16_t *botL = input + 8 * stride;
+ const int16_t *botR = input + 8 * stride + 8;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ // Swap left columns
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ // Swap right columns
+ tmp = topR;
+ topR = botR;
+ botR = tmp;
+ }
+
+ if (fliplr) {
+ // Swap top rows
+ tmp = topL;
+ topL = topR;
+ topR = tmp;
+ // Swap bottom rows
+ tmp = botL;
+ botL = botR;
+ botR = tmp;
+ }
+
+ // load first 8 columns
+ load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift);
+ load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift);
+
+ // load second 8 columns
+ load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift);
+ load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift);
+
+ convert_8x8_to_16x16(in, out);
+}
+
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ const int16_t *topL = input;
+ const int16_t *botL = input + 8 * stride;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ }
+
+ load_buffer_8x8(topL, out, stride, flipud, fliplr, shift);
+ load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift);
+}
+
+static INLINE void load_buffer_8x4(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ const int16_t *topL = input;
+ const int16_t *topR = input + 4;
+
+ const int16_t *tmp;
+
+ if (fliplr) {
+ tmp = topL;
+ topL = topR;
+ topR = tmp;
+ }
+
+ load_buffer_4x4(topL, out, stride, flipud, fliplr, shift);
+ load_buffer_4x4(topR, out + 4, stride, flipud, fliplr, shift);
+}
+
+static INLINE void load_buffer_16x4(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ const int16_t *topL = input;
+ const int16_t *topR = input + 8;
+
+ const int16_t *tmp;
+
+ if (fliplr) {
+ tmp = topL;
+ topL = topR;
+ topR = tmp;
+ }
+
+ load_buffer_8x4(topL, out, stride, flipud, fliplr, shift);
+ load_buffer_8x4(topR, out + 8, stride, flipud, fliplr, shift);
+}
+
+static INLINE void load_buffer_4x8(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ const int16_t *topL = input;
+ const int16_t *botL = input + 4 * stride;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ }
+
+ load_buffer_4x4(topL, out, stride, flipud, fliplr, shift);
+ load_buffer_4x4(botL, out + 4, stride, flipud, fliplr, shift);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void load_buffer_4x16(const int16_t *input, __m128i *out,
+ const int stride, const int flipud,
+ const int fliplr, const int shift) {
+ const int16_t *topL = input;
+ const int16_t *botL = input + 8 * stride;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ }
+ load_buffer_4x8(topL, out, stride, flipud, fliplr, shift);
+ load_buffer_4x8(botL, out + 8, stride, flipud, fliplr, shift);
+}
+#endif
+
+static INLINE void load_buffer_32x8n(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift, const int height) {
+ const int16_t *in = input;
+ __m128i *output = out;
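+ // Despite the loop variable's name, each iteration of `col` handles one
+ // 32-sample row: stride 4 makes each load_buffer_4x4() call read 16
+ // contiguous samples, so two calls fill the row's eight 4-lane registers.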
+ for (int col = 0; col < height; col++) {
+ in = input + col * stride;
+ output = out + col * 8;
+ load_buffer_4x4(in, output, 4, flipud, fliplr, shift);
+ load_buffer_4x4((in + 16), (output + 4), 4, flipud, fliplr, shift);
+ }
+}
+
+static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[16], v[16], x;
+ int col;
+
+ // Process columns 0, 1, 2, 3; each iteration transforms one 4-lane column
+ // group.
+ for (col = 0; col < col_num; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+ u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+
+ // stage 2
+ v[0] = _mm_add_epi32(u[0], u[7]);
+ v[7] = _mm_sub_epi32(u[0], u[7]);
+ v[1] = _mm_add_epi32(u[1], u[6]);
+ v[6] = _mm_sub_epi32(u[1], u[6]);
+ v[2] = _mm_add_epi32(u[2], u[5]);
+ v[5] = _mm_sub_epi32(u[2], u[5]);
+ v[3] = _mm_add_epi32(u[3], u[4]);
+ v[4] = _mm_sub_epi32(u[3], u[4]);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ v[10] = _mm_mullo_epi32(u[10], cospim32);
+ x = _mm_mullo_epi32(u[13], cospi32);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[13], cospim32);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = _mm_mullo_epi32(u[11], cospim32);
+ x = _mm_mullo_epi32(u[12], cospi32);
+ v[11] = _mm_add_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[11], cospi32);
+ x = _mm_mullo_epi32(u[12], cospim32);
+ v[12] = _mm_sub_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 3
+ u[0] = _mm_add_epi32(v[0], v[3]);
+ u[3] = _mm_sub_epi32(v[0], v[3]);
+ u[1] = _mm_add_epi32(v[1], v[2]);
+ u[2] = _mm_sub_epi32(v[1], v[2]);
+ u[4] = v[4];
+
+ u[5] = _mm_mullo_epi32(v[5], cospim32);
+ x = _mm_mullo_epi32(v[6], cospi32);
+ u[5] = _mm_add_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[5], cospi32);
+ x = _mm_mullo_epi32(v[6], cospim32);
+ u[6] = _mm_sub_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ u[8] = _mm_add_epi32(v[8], v[11]);
+ u[11] = _mm_sub_epi32(v[8], v[11]);
+ u[9] = _mm_add_epi32(v[9], v[10]);
+ u[10] = _mm_sub_epi32(v[9], v[10]);
+ u[12] = _mm_sub_epi32(v[15], v[12]);
+ u[15] = _mm_add_epi32(v[15], v[12]);
+ u[13] = _mm_sub_epi32(v[14], v[13]);
+ u[14] = _mm_add_epi32(v[14], v[13]);
+
+ // stage 4
+ u[0] = _mm_mullo_epi32(u[0], cospi32);
+ u[1] = _mm_mullo_epi32(u[1], cospi32);
+ v[0] = _mm_add_epi32(u[0], u[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_sub_epi32(u[0], u[1]);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = _mm_mullo_epi32(u[2], cospi48);
+ x = _mm_mullo_epi32(u[3], cospi16);
+ v[2] = _mm_add_epi32(v[2], x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_mullo_epi32(u[2], cospi16);
+ x = _mm_mullo_epi32(u[3], cospi48);
+ v[3] = _mm_sub_epi32(x, v[3]);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = _mm_add_epi32(u[4], u[5]);
+ v[5] = _mm_sub_epi32(u[4], u[5]);
+ v[6] = _mm_sub_epi32(u[7], u[6]);
+ v[7] = _mm_add_epi32(u[7], u[6]);
+ v[8] = u[8];
+
+ v[9] = _mm_mullo_epi32(u[9], cospim16);
+ x = _mm_mullo_epi32(u[14], cospi48);
+ v[9] = _mm_add_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[14] = _mm_mullo_epi32(u[9], cospi48);
+ x = _mm_mullo_epi32(u[14], cospim16);
+ v[14] = _mm_sub_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospim48);
+ x = _mm_mullo_epi32(u[13], cospim16);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospim16);
+ x = _mm_mullo_epi32(u[13], cospim48);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = u[11];
+ v[12] = u[12];
+ v[15] = u[15];
+
+ // stage 5
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi56);
+ x = _mm_mullo_epi32(v[7], cospi8);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[7] = _mm_mullo_epi32(v[4], cospi8);
+ x = _mm_mullo_epi32(v[7], cospi56);
+ u[7] = _mm_sub_epi32(x, u[7]);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ u[5] = _mm_mullo_epi32(v[5], cospi24);
+ x = _mm_mullo_epi32(v[6], cospi40);
+ u[5] = _mm_add_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[5], cospi40);
+ x = _mm_mullo_epi32(v[6], cospi24);
+ u[6] = _mm_sub_epi32(x, u[6]);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[8] = _mm_add_epi32(v[8], v[9]);
+ u[9] = _mm_sub_epi32(v[8], v[9]);
+ u[10] = _mm_sub_epi32(v[11], v[10]);
+ u[11] = _mm_add_epi32(v[11], v[10]);
+ u[12] = _mm_add_epi32(v[12], v[13]);
+ u[13] = _mm_sub_epi32(v[12], v[13]);
+ u[14] = _mm_sub_epi32(v[15], v[14]);
+ u[15] = _mm_add_epi32(v[15], v[14]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm_mullo_epi32(u[8], cospi60);
+ x = _mm_mullo_epi32(u[15], cospi4);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[15] = _mm_mullo_epi32(u[8], cospi4);
+ x = _mm_mullo_epi32(u[15], cospi60);
+ v[15] = _mm_sub_epi32(x, v[15]);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ v[9] = _mm_mullo_epi32(u[9], cospi28);
+ x = _mm_mullo_epi32(u[14], cospi36);
+ v[9] = _mm_add_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[14] = _mm_mullo_epi32(u[9], cospi36);
+ x = _mm_mullo_epi32(u[14], cospi28);
+ v[14] = _mm_sub_epi32(x, v[14]);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospi44);
+ x = _mm_mullo_epi32(u[13], cospi20);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospi20);
+ x = _mm_mullo_epi32(u[13], cospi44);
+ v[13] = _mm_sub_epi32(x, v[13]);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = _mm_mullo_epi32(u[11], cospi12);
+ x = _mm_mullo_epi32(u[12], cospi52);
+ v[11] = _mm_add_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[11], cospi52);
+ x = _mm_mullo_epi32(u[12], cospi12);
+ v[12] = _mm_sub_epi32(x, v[12]);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
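+ // Store the results in the bit-reversed index order produced by the
+ // DCT-16 butterfly network (0, 8, 4, 12, 2, 10, 6, 14, ...).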
+ out[0 * col_num + col] = v[0];
+ out[1 * col_num + col] = v[8];
+ out[2 * col_num + col] = v[4];
+ out[3 * col_num + col] = v[12];
+ out[4 * col_num + col] = v[2];
+ out[5 * col_num + col] = v[10];
+ out[6 * col_num + col] = v[6];
+ out[7 * col_num + col] = v[14];
+ out[8 * col_num + col] = v[1];
+ out[9 * col_num + col] = v[9];
+ out[10 * col_num + col] = v[5];
+ out[11 * col_num + col] = v[13];
+ out[12 * col_num + col] = v[3];
+ out[13 * col_num + col] = v[11];
+ out[14 * col_num + col] = v[7];
+ out[15 * col_num + col] = v[15];
+ }
+}
+
+static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int num_cols) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i u[16], v[16], x, y;
+ int col;
+
+ for (col = 0; col < num_cols; ++col) {
+ // stage 0
+ // stage 1
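+ // Stage 1 reorders the inputs and flips signs per the ADST-16 flow
+ // graph before the butterfly stages below.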
+ u[0] = in[0 * num_cols + col];
+ u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]);
+ u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]);
+ u[3] = in[8 * num_cols + col];
+ u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]);
+ u[5] = in[12 * num_cols + col];
+ u[6] = in[4 * num_cols + col];
+ u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]);
+ u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]);
+ u[9] = in[14 * num_cols + col];
+ u[10] = in[6 * num_cols + col];
+ u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]);
+ u[12] = in[2 * num_cols + col];
+ u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]);
+ u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]);
+ u[15] = in[10 * num_cols + col];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+
+ x = _mm_mullo_epi32(u[2], cospi32);
+ y = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(x, y);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(x, y);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ x = _mm_mullo_epi32(u[6], cospi32);
+ y = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(x, y);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(x, y);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(x, y);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(x, y);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ x = _mm_mullo_epi32(u[14], cospi32);
+ y = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(x, y);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(x, y);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 3
+ u[0] = _mm_add_epi32(v[0], v[2]);
+ u[1] = _mm_add_epi32(v[1], v[3]);
+ u[2] = _mm_sub_epi32(v[0], v[2]);
+ u[3] = _mm_sub_epi32(v[1], v[3]);
+ u[4] = _mm_add_epi32(v[4], v[6]);
+ u[5] = _mm_add_epi32(v[5], v[7]);
+ u[6] = _mm_sub_epi32(v[4], v[6]);
+ u[7] = _mm_sub_epi32(v[5], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[10]);
+ u[9] = _mm_add_epi32(v[9], v[11]);
+ u[10] = _mm_sub_epi32(v[8], v[10]);
+ u[11] = _mm_sub_epi32(v[9], v[11]);
+ u[12] = _mm_add_epi32(v[12], v[14]);
+ u[13] = _mm_add_epi32(v[13], v[15]);
+ u[14] = _mm_sub_epi32(v[12], v[14]);
+ u[15] = _mm_sub_epi32(v[13], v[15]);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
+ v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
+ v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
+ v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+ v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
+
+ // stage 5
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
+
+ // stage 7
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ // stage 8
+ v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
+ v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
+ v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
+ v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
+ v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
+ v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
+ v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
+ v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
+ v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
+
+ // stage 9
+ out[0 * num_cols + col] = v[1];
+ out[1 * num_cols + col] = v[14];
+ out[2 * num_cols + col] = v[3];
+ out[3 * num_cols + col] = v[12];
+ out[4 * num_cols + col] = v[5];
+ out[5 * num_cols + col] = v[10];
+ out[6 * num_cols + col] = v[7];
+ out[7 * num_cols + col] = v[8];
+ out[8 * num_cols + col] = v[9];
+ out[9 * num_cols + col] = v[6];
+ out[10 * num_cols + col] = v[11];
+ out[11 * num_cols + col] = v[4];
+ out[12 * num_cols + col] = v[13];
+ out[13 * num_cols + col] = v[2];
+ out[14 * num_cols + col] = v[15];
+ out[15 * num_cols + col] = v[0];
+ }
+}
+
+static void col_txfm_16x16_rounding(__m128i *in, int shift) {
+ // Note:
+ // Split the 16x16 rounding into four 8x8 sections rather than
+ // processing four columns.
+ col_txfm_8x8_rounding(&in[0], shift);
+ col_txfm_8x8_rounding(&in[16], shift);
+ col_txfm_8x8_rounding(&in[32], shift);
+ col_txfm_8x8_rounding(&in[48], shift);
+}
+
+static void col_txfm_8x16_rounding(__m128i *in, int shift) {
+ col_txfm_8x8_rounding(&in[0], shift);
+ col_txfm_8x8_rounding(&in[16], shift);
+}
+
+static void write_buffer_16x16(const __m128i *in, int32_t *output) {
+ const int size_8x8 = 16 * 4;
+ write_buffer_8x8(&in[0], output);
+ output += size_8x8;
+ write_buffer_8x8(&in[16], output);
+ output += size_8x8;
+ write_buffer_8x8(&in[32], output);
+ output += size_8x8;
+ write_buffer_8x8(&in[48], output);
+}
+static void idtx16x16_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+ (void)bit;
+ __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
+ __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+ __m128i a_low;
+
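+ // The 16-point identity transform scales each coefficient by 2*sqrt(2),
+ // applied in fixed point: NewSqrt2 is sqrt(2) in Q(NewSqrt2Bits) format.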
+ int num_iters = 16 * col_num;
+ for (int i = 0; i < num_iters; i++) {
+ a_low = _mm_mullo_epi32(in[i], fact);
+ a_low = _mm_add_epi32(a_low, offset);
+ out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits);
+ }
+}
+void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[64], out[64];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16];
+ const int txw_idx = get_txw_idx(TX_16X16);
+ const int txh_idx = get_txh_idx(TX_16X16);
+ const int col_num = 4;
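+ // Each __m128i holds four 32-bit coefficients, so one 16-wide row spans
+ // col_num = 4 vectors and the 16x16 block occupies 64 registers.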
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case ADST_DCT:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+ col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case DCT_ADST:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+ col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case ADST_ADST:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+ col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+ col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+ col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+ fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+ col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+ col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+ col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+ col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+ col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+ col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+ col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case IDTX:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case V_DCT:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case H_DCT:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case V_ADST:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+ col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case H_ADST:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+ col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case V_FLIPADST:
+ load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx],
+ col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ case H_FLIPADST:
+ load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+ idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx],
+ col_num);
+ write_buffer_16x16(out, coeff);
+ break;
+ default: assert(0);
+ }
+ (void)bd;
+}
+
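+// Reverses the row order of the transposed buffer for the LR-flipped path.
+// Note: the "30 - i" term assumes size == 32 (the 16x8 call site below),
+// with even- and odd-indexed registers reversed separately.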
+static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) {
+ for (int i = 0; i < size; i += 2) in[30 - i] = out[i];
+ for (int i = 1; i < size; i += 2) in[size - i] = out[i];
+}
+
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_sse4_1, // DCT_DCT
+ fadst8x8_sse4_1, // ADST_DCT
+ fdct8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fadst8x8_sse4_1, // FLIPADST_DCT
+ fdct8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ idtx8x8_sse4_1, // IDTX
+ fdct8x8_sse4_1, // V_DCT
+ idtx8x8_sse4_1, // H_DCT
+ fadst8x8_sse4_1, // V_ADST
+ idtx8x8_sse4_1, // H_ADST
+ fadst8x8_sse4_1, // V_FLIPADST
+ idtx8x8_sse4_1 // H_FLIPADST
+};
+#if !CONFIG_REALTIME_ONLY
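+// NULL entries are transform types that are never selected for this block
+// size, so no 1-D implementation is needed.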
+static const fwd_transform_1d_sse4_1 row_highbd_txfm32x8_arr[TX_TYPES] = {
+ fdct8x8_sse4_1, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ idtx32x8_sse4_1, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL, // H_FLIPADST
+};
+#endif
+static const fwd_transform_1d_sse4_1 col_highbd_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_sse4_1, // DCT_DCT
+ fadst8x8_sse4_1, // ADST_DCT
+ fdct4x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fadst8x8_sse4_1, // FLIPADST_DCT
+ fdct4x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ idtx8x8_sse4_1, // IDTX
+ fdct4x8_sse4_1, // V_DCT
+ idtx8x8_sse4_1, // H_DCT
+ fadst8x8_sse4_1, // V_ADST
+ idtx8x8_sse4_1, // H_ADST
+ fadst8x8_sse4_1, // V_FLIPADST
+ idtx8x8_sse4_1 // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ fdct16x16_sse4_1, // ADST_DCT
+ fadst16x16_sse4_1, // DCT_ADST
+ fadst16x16_sse4_1, // ADST_ADST
+ fdct16x16_sse4_1, // FLIPADST_DCT
+ fadst16x16_sse4_1, // DCT_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_FLIPADST
+ fadst16x16_sse4_1, // ADST_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_ADST
+ idtx16x16_sse4_1, // IDTX
+ idtx16x16_sse4_1, // V_DCT
+ fdct16x16_sse4_1, // H_DCT
+ idtx16x16_sse4_1, // V_ADST
+ fadst16x16_sse4_1, // H_ADST
+ idtx16x16_sse4_1, // V_FLIPADST
+ fadst16x16_sse4_1 // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ fadst16x16_sse4_1, // ADST_DCT
+ fdct16x16_sse4_1, // DCT_ADST
+ fadst16x16_sse4_1, // ADST_ADST
+ fadst16x16_sse4_1, // FLIPADST_DCT
+ fdct16x16_sse4_1, // DCT_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_FLIPADST
+ fadst16x16_sse4_1, // ADST_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_ADST
+ idtx16x16_sse4_1, // IDTX
+ fdct16x16_sse4_1, // V_DCT
+ idtx16x16_sse4_1, // H_DCT
+ fadst16x16_sse4_1, // V_ADST
+ idtx16x16_sse4_1, // H_ADST
+ fadst16x16_sse4_1, // V_FLIPADST
+ idtx16x16_sse4_1 // H_FLIPADST
+};
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_sse4_1, // DCT_DCT
+ fdct8x8_sse4_1, // ADST_DCT
+ fadst8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fdct8x8_sse4_1, // FLIPADST_DCT
+ fadst8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ idtx8x8_sse4_1, // IDTX
+ idtx8x8_sse4_1, // V_DCT
+ fdct8x8_sse4_1, // H_DCT
+ idtx8x8_sse4_1, // V_ADST
+ fadst8x8_sse4_1, // H_ADST
+ idtx8x8_sse4_1, // V_FLIPADST
+ fadst8x8_sse4_1 // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 row_highbd_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_sse4_1, // DCT_DCT
+ fdct4x8_sse4_1, // ADST_DCT
+ fadst8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fdct4x8_sse4_1, // FLIPADST_DCT
+ fadst8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ idtx8x8_sse4_1, // IDTX
+ idtx8x8_sse4_1, // V_DCT
+ fdct4x8_sse4_1, // H_DCT
+ idtx8x8_sse4_1, // V_ADST
+ fadst8x8_sse4_1, // H_ADST
+ idtx8x8_sse4_1, // V_FLIPADST
+ fadst8x8_sse4_1 // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 row_highbd_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_sse4_1, // DCT_DCT
+ fdct4x4_sse4_1, // ADST_DCT
+ fadst4x4_sse4_1, // DCT_ADST
+ fadst4x4_sse4_1, // ADST_ADST
+ fdct4x4_sse4_1, // FLIPADST_DCT
+ fadst4x4_sse4_1, // DCT_FLIPADST
+ fadst4x4_sse4_1, // FLIPADST_FLIPADST
+ fadst4x4_sse4_1, // ADST_FLIPADST
+ fadst4x4_sse4_1, // FLIPADST_ADST
+ idtx4x4_sse4_1, // IDTX
+ idtx4x4_sse4_1, // V_DCT
+ fdct4x4_sse4_1, // H_DCT
+ idtx4x4_sse4_1, // V_ADST
+ fadst4x4_sse4_1, // H_ADST
+ idtx4x4_sse4_1, // V_FLIPADST
+ fadst4x4_sse4_1 // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 col_highbd_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_sse4_1, // DCT_DCT
+ fadst4x4_sse4_1, // ADST_DCT
+ fdct4x4_sse4_1, // DCT_ADST
+ fadst4x4_sse4_1, // ADST_ADST
+ fadst4x4_sse4_1, // FLIPADST_DCT
+ fdct4x4_sse4_1, // DCT_FLIPADST
+ fadst4x4_sse4_1, // FLIPADST_FLIPADST
+ fadst4x4_sse4_1, // ADST_FLIPADST
+ fadst4x4_sse4_1, // FLIPADST_ADST
+ idtx4x4_sse4_1, // IDTX
+ fdct4x4_sse4_1, // V_DCT
+ idtx4x4_sse4_1, // H_DCT
+ fadst4x4_sse4_1, // V_ADST
+ idtx4x4_sse4_1, // H_ADST
+ fadst4x4_sse4_1, // V_FLIPADST
+ idtx4x4_sse4_1 // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x32_arr[TX_TYPES] = {
+ av1_fdct32_sse4_1, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ av1_idtx32_sse4_1, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x32_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ idtx16x16_sse4_1, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[32], out[32];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
+ const int txw_idx = get_txw_idx(TX_16X8);
+ const int txh_idx = get_txh_idx(TX_16X8);
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
+ int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
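+ // Process the 16x8 block as two 8x8 column halves: transform, round and
+ // transpose each half into out[] before the shared row pass.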
+ for (int i = 0; i < 2; i++) {
+ load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
+ col_txfm(in, in, bit, 2);
+ col_txfm_8x8_rounding(in, -shift[1]);
+ transpose_8x8(in, out + i * 16);
+ }
+
+ if (lr_flip) {
+ flip_buf_sse4_1(in, out, 32);
+ row_txfm(in, out, bit, 2);
+ } else {
+ row_txfm(out, out, bit, 2);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ av1_round_shift_rect_array_32_sse4_1(out + i * 16, in, 16, -shift[2],
+ NewSqrt2);
+ write_buffer_8x8(in, coeff + i * 64);
+ }
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[32], out[32];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
+ const int txw_idx = get_txw_idx(TX_8X16);
+ const int txh_idx = get_txh_idx(TX_8X16);
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type];
+ int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
+ col_txfm(in, in, bit, 2);
+ col_txfm_8x16_rounding(in, -shift[1]);
+ transpose_8x8(in, out);
+ transpose_8x8(in + 16, out + 16);
+
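+ // Rectangular (2:1) transforms need an extra sqrt(2)-based rescale to
+ // preserve the overall norm; av1_round_shift_rect_array_32_sse4_1 applies
+ // it (multiply by NewSqrt2, shift by NewSqrt2Bits) along with -shift[2].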
+ for (int i = 0; i < 2; i++) {
+ row_txfm(out + i * 16, out, bit, 2);
+ av1_round_shift_rect_array_32_sse4_1(out, out, 16, -shift[2], NewSqrt2);
+ write_buffer_16x8(out, coeff + i * 8, 16);
+ }
+ (void)bd;
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fwd_txfm2d_4x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[16];
+ __m128i *outcoeff128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16];
+ const int txw_idx = get_txw_idx(TX_4X16);
+ const int txh_idx = get_txh_idx(TX_4X16);
+ const int txfm_size_col = tx_size_wide[TX_4X16];
+ const int txfm_size_row = tx_size_high[TX_4X16];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // col transform
+ load_buffer_4x16(input, in, stride, ud_flip, lr_flip, shift[0]);
+ col_txfm(in, outcoeff128, bitcol, 1);
+ col_txfm_8x8_rounding(outcoeff128, -shift[1]);
+ transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
+
+ // row transform
+ for (int i = 0; i < 4; i++) {
+ __m128i tmp[4];
+ row_txfm(in + i, tmp, bitrow, txfm_size_row >> 2);
+ store_output_w4(coeff + i * 4, tmp, txfm_size_row, txfm_size_col);
+ }
+ (void)bd;
+}
+#endif
+
+void av1_fwd_txfm2d_16x4_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[16];
+ __m128i *outcoeff128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4];
+ const int txw_idx = get_txw_idx(TX_16X4);
+ const int txh_idx = get_txh_idx(TX_16X4);
+ const int txfm_size_col = tx_size_wide[TX_16X4];
+ const int txfm_size_row = tx_size_high[TX_16X4];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // col transform
+ load_buffer_16x4(input, in, stride, ud_flip, lr_flip, shift[0]);
+
+ for (int i = 0; i < (txfm_size_col >> 2); i++) {
+ __m128i *cur_in = &in[i * txfm_size_row];
+ col_txfm(cur_in, cur_in, bitcol, 1);
+ transpose_32bit_4x4(cur_in, cur_in);
+ }
+ col_txfm_8x8_rounding(in, -shift[1]);
+
+ // row transform
+ row_txfm(in, outcoeff128, bitrow, 1);
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_16x32_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[128];
+ __m128i *outcoef128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32];
+ const int txw_idx = get_txw_idx(TX_16X32);
+ const int txh_idx = get_txh_idx(TX_16X32);
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x32_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+ // column transform
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ load_buffer_16x16(input + 16 * stride, in + 64, stride, 0, 0, shift[0]);
+
+ for (int i = 0; i < 4; i++) {
+ col_txfm((in + i), (in + i), bitcol, 4);
+ }
+ col_txfm_16x16_rounding(&in[0], -shift[1]);
+ col_txfm_16x16_rounding(&in[64], -shift[1]);
+ transpose_8nx8n(in, outcoef128, 16, 32);
+
+ // row transform
+ row_txfm(outcoef128, in, bitrow, 8);
+ av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 128, -shift[2],
+ NewSqrt2);
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)tx_type;
+ __m128i in[512];
+ __m128i *outcoef128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64];
+ const int txw_idx = get_txw_idx(TX_32X64);
+ const int txh_idx = get_txh_idx(TX_32X64);
+ const int txfm_size_col = tx_size_wide[TX_32X64];
+ const int txfm_size_row = tx_size_high[TX_32X64];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int num_row = txfm_size_row >> 2;
+ const int num_col = txfm_size_col >> 2;
+
+ // column transform
+ load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row);
+ for (int i = 0; i < num_col; i++) {
+ av1_fdct64_sse4_1((in + i), (in + i), bitcol, num_col, num_col);
+ }
+ for (int i = 0; i < num_col; i++) {
+ col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]);
+ }
+ transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+ // row transform
+ for (int i = 0; i < num_row; i++) {
+ av1_fdct32_sse4_1((outcoef128 + i), (in + i), bitrow, num_row);
+ }
+ for (int i = 0; i < txfm_size_col; i++) {
+ av1_round_shift_rect_array_32_sse4_1(in + i * 16, outcoef128 + i * 8, 8,
+ -shift[2], NewSqrt2);
+ }
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)tx_type;
+ __m128i in[512];
+ __m128i *outcoef128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32];
+ const int txw_idx = get_txw_idx(TX_64X32);
+ const int txh_idx = get_txh_idx(TX_64X32);
+ const int txfm_size_col = tx_size_wide[TX_64X32];
+ const int txfm_size_row = tx_size_high[TX_64X32];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const int num_row = txfm_size_row >> 2;
+ const int num_col = txfm_size_col >> 2;
+
+ // column transform
+ for (int i = 0; i < 32; i++) {
+ load_buffer_4x4(input + 0 + i * stride, in + 0 + i * 16, 4, 0, 0, shift[0]);
+ load_buffer_4x4(input + 16 + i * stride, in + 4 + i * 16, 4, 0, 0,
+ shift[0]);
+ load_buffer_4x4(input + 32 + i * stride, in + 8 + i * 16, 4, 0, 0,
+ shift[0]);
+ load_buffer_4x4(input + 48 + i * stride, in + 12 + i * 16, 4, 0, 0,
+ shift[0]);
+ }
+
+ for (int i = 0; i < num_col; i++) {
+ av1_fdct32_sse4_1((in + i), (in + i), bitcol, num_col);
+ }
+
+ for (int i = 0; i < num_row; i++) {
+ col_txfm_16x16_rounding((in + i * txfm_size_col), -shift[1]);
+ }
+ transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+ // row transform
+ for (int i = 0; i < num_row; i++) {
+ av1_fdct64_sse4_1((outcoef128 + i), (in + i), bitrow, num_row, num_row);
+ }
+ av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 512, -shift[2],
+ NewSqrt2);
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_32x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[128];
+ __m128i *outcoef128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
+ const int txw_idx = get_txw_idx(TX_32X16);
+ const int txh_idx = get_txh_idx(TX_32X16);
+ const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm8x32_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+ // column transform
+ load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 16);
+ col_txfm(in, in, bitcol, 8);
+ col_txfm_16x16_rounding(&in[0], -shift[1]);
+ col_txfm_16x16_rounding(&in[64], -shift[1]);
+ transpose_8nx8n(in, outcoef128, 32, 16);
+
+ // row transform
+ for (int i = 0; i < 4; i++) {
+ row_txfm((outcoef128 + i), (in + i), bitrow, 4);
+ }
+ av1_round_shift_rect_array_32_sse4_1(in, outcoef128, 128, -shift[2],
+ NewSqrt2);
+ (void)bd;
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fwd_txfm2d_8x32_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[64];
+ __m128i *outcoef128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32];
+ const int txw_idx = get_txw_idx(TX_8X32);
+ const int txh_idx = get_txh_idx(TX_8X32);
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm32x8_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+ const int txfm_size_col = tx_size_wide[TX_8X32];
+ const int txfm_size_row = tx_size_high[TX_8X32];
+ const int num_col = txfm_size_col >> 2;
+
+ // column transform
+ load_buffer_8x16(input, in, stride, 0, 0, shift[0]);
+ load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row,
+ stride, 0, 0, shift[0]);
+
+ for (int i = 0; i < num_col; i++) {
+ col_txfm((in + i), (in + i), bitcol, num_col);
+ }
+ col_txfm_16x16_rounding(in, -shift[1]);
+ transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+ // row transform
+ for (int i = 0; i < txfm_size_col; i += 2) {
+ row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, txfm_size_col);
+ }
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_32x8_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[64];
+ __m128i *outcoef128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8];
+ const int txw_idx = get_txw_idx(TX_32X8);
+ const int txh_idx = get_txh_idx(TX_32X8);
+ const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm32x8_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+
+ const int txfm_size_col = tx_size_wide[TX_32X8];
+ const int txfm_size_row = tx_size_high[TX_32X8];
+ const int num_col = txfm_size_row >> 2;
+
+ // column transform
+ load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8);
+ for (int i = 0; i < txfm_size_row; i += 2) {
+ col_txfm((in + i), (in + i), bitcol, txfm_size_row);
+ }
+
+ col_txfm_16x16_rounding(&in[0], -shift[1]);
+ transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row);
+
+ // row transform
+ for (int i = 0; i < num_col; i++) {
+ row_txfm((outcoef128 + i), (outcoef128 + i), bitrow, num_col);
+ }
+ (void)bd;
+}
+#endif
+
+void av1_fwd_txfm2d_4x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m128i in[8];
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8];
+ const int txw_idx = get_txw_idx(TX_4X8);
+ const int txh_idx = get_txh_idx(TX_4X8);
+ const int txfm_size_col = tx_size_wide[TX_4X8];
+ const int txfm_size_row = tx_size_high[TX_4X8];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x8_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ load_buffer_4x8(input, in, stride, ud_flip, lr_flip, shift[0]);
+ col_txfm(in, in, bitcol, 1);
+ col_txfm_4x8_rounding(in, -shift[1]);
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *cur_in = &in[i * 4];
+ transpose_32bit_4x4(cur_in, cur_in);
+ row_txfm(cur_in, cur_in, bitrow, 1);
+ av1_round_shift_rect_array_32_sse4_1(cur_in, cur_in, txfm_size_col,
+ -shift[2], NewSqrt2);
+ store_output_w4(coeff + i * 4, cur_in, txfm_size_row, 4);
+ }
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_8x4_sse4_1(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m128i in[8];
+ __m128i *outcoeff128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4];
+ const int txw_idx = get_txw_idx(TX_8X4);
+ const int txh_idx = get_txh_idx(TX_8X4);
+ const int txfm_size_col = tx_size_wide[TX_8X4];
+ const int txfm_size_row = tx_size_high[TX_8X4];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x8_arr[tx_type];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // col transform
+ load_buffer_8x4(input, in, stride, ud_flip, lr_flip, shift[0]);
+ for (int i = 0; i < 2; i++) {
+ __m128i *cur_in = &in[i * txfm_size_row];
+ col_txfm(cur_in, cur_in, bitcol, 1);
+ transpose_32bit_4x4(cur_in, cur_in);
+ }
+ col_txfm_4x8_rounding(in, -shift[1]);
+
+ // row transform
+ row_txfm(in, outcoeff128, bitrow, 1);
+ av1_round_shift_rect_array_32_sse4_1(outcoeff128, outcoeff128, txfm_size_col,
+ -shift[2], NewSqrt2);
+ (void)bd;
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_fwd_txfm2d_16x64_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[256];
+ __m128i *outcoeff128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64];
+ const int txw_idx = get_txw_idx(TX_16X64);
+ const int txh_idx = get_txh_idx(TX_16X64);
+ const int txfm_size_col = tx_size_wide[TX_16X64];
+ const int txfm_size_row = tx_size_high[TX_16X64];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ const int num_col = txfm_size_col >> 2;
+ // col transform
+ for (int i = 0; i < txfm_size_row; i += num_col) {
+ load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col,
+ ud_flip, lr_flip, shift[0]);
+ }
+
+ for (int i = 0; i < num_col; i++) {
+ av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitcol, num_col, num_col);
+ }
+
+ col_txfm_16x16_rounding(outcoeff128, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);
+
+ transpose_8nx8n(outcoeff128, in, txfm_size_col, 32);
+ fdct16x16_sse4_1(in, outcoeff128, bitrow, 8);
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_64x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[256];
+ __m128i *outcoeff128 = (__m128i *)coeff;
+ const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16];
+ const int txw_idx = get_txw_idx(TX_64X16);
+ const int txh_idx = get_txh_idx(TX_64X16);
+ const int txfm_size_col = tx_size_wide[TX_64X16];
+ const int txfm_size_row = tx_size_high[TX_64X16];
+ int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx];
+ int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // col transform
+ for (int i = 0; i < txfm_size_row; i++) {
+ load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4,
+ ud_flip, lr_flip, shift[0]);
+ load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4,
+ ud_flip, lr_flip, shift[0]);
+ }
+
+ fdct16x16_sse4_1(in, outcoeff128, bitcol, txfm_size_row);
+ col_txfm_16x16_rounding(outcoeff128, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]);
+ col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]);
+
+ transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row);
+ for (int i = 0; i < 4; i++) {
+ av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitrow, 4, 4);
+ }
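+ // Only the first 32 columns of the 64-point row output are kept; zero the
+ // remainder of the coefficient buffer.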
+ memset(coeff + txfm_size_row * 32, 0, txfm_size_row * 32 * sizeof(*coeff));
+ (void)bd;
+}
+#endif
diff --git a/third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c
new file mode 100644
index 0000000000..ca448ca37b
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_avx2.c
@@ -0,0 +1,466 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/mathutils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+#define SSE_STRIDE (BW + 4)
+
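+// Each mask row selects the five consecutive 32-bit lanes of the 5-tap
+// horizontal window, at lane offsets 0..3 within an 8-lane __m256i.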
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = {
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 },
+ { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 },
+ { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 },
+ { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }
+};
+
+static AOM_FORCE_INLINE void get_squared_error_16x16_avx2(
+ const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ uint32_t *frame_sse, const unsigned int sse_stride) {
+ (void)block_width;
+ const uint16_t *src1 = frame1;
+ const uint16_t *src2 = frame2;
+ uint32_t *dst = frame_sse + 2;
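+ // Offset the destination by 2 so the 5x5 window later has two padding
+ // columns available on each side (SSE_STRIDE == BW + 4).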
+ for (int i = 0; i < block_height; i++) {
+ __m256i v_src1 = _mm256_loadu_si256((__m256i *)src1);
+ __m256i v_src2 = _mm256_loadu_si256((__m256i *)src2);
+ __m256i v_diff = _mm256_sub_epi16(v_src1, v_src2);
+ __m256i v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+ __m256i v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
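+ // unpacklo/unpackhi interleave within each 128-bit lane, so swap the
+ // middle 128-bit halves to restore sequential column order before storing.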
+ __m256i v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+ __m256i v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+ __m256i diff_lo =
+ _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+ __m256i diff_hi =
+ _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+ _mm256_storeu_si256((__m256i *)dst, diff_lo);
+ dst += 8;
+ _mm256_storeu_si256((__m256i *)dst, diff_hi);
+
+ src1 += stride;
+ src2 += stride2;
+ dst += sse_stride - 8;
+ }
+}
+
+static AOM_FORCE_INLINE void get_squared_error_32x32_avx2(
+ const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ uint32_t *frame_sse, const unsigned int sse_stride) {
+ (void)block_width;
+ const uint16_t *src1 = frame1;
+ const uint16_t *src2 = frame2;
+ uint32_t *dst = frame_sse + 2;
+ for (int i = 0; i < block_height; i++) {
+ __m256i v_src1 = _mm256_loadu_si256((__m256i *)src1);
+ __m256i v_src2 = _mm256_loadu_si256((__m256i *)src2);
+ __m256i v_diff = _mm256_sub_epi16(v_src1, v_src2);
+ __m256i v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+ __m256i v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
+ __m256i v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+ __m256i v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+ __m256i diff_lo =
+ _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+ __m256i diff_hi =
+ _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+ _mm256_storeu_si256((__m256i *)dst, diff_lo);
+ _mm256_storeu_si256((__m256i *)(dst + 8), diff_hi);
+
+ v_src1 = _mm256_loadu_si256((__m256i *)(src1 + 16));
+ v_src2 = _mm256_loadu_si256((__m256i *)(src2 + 16));
+ v_diff = _mm256_sub_epi16(v_src1, v_src2);
+ v_mullo = _mm256_mullo_epi16(v_diff, v_diff);
+ v_mulhi = _mm256_mulhi_epi16(v_diff, v_diff);
+
+ v_lo = _mm256_unpacklo_epi16(v_mullo, v_mulhi);
+ v_hi = _mm256_unpackhi_epi16(v_mullo, v_mulhi);
+ diff_lo =
+ _mm256_inserti128_si256(v_lo, _mm256_extracti128_si256(v_hi, 0), 1);
+ diff_hi =
+ _mm256_inserti128_si256(v_hi, _mm256_extracti128_si256(v_lo, 1), 0);
+
+ _mm256_storeu_si256((__m256i *)(dst + 16), diff_lo);
+ _mm256_storeu_si256((__m256i *)(dst + 24), diff_hi);
+
+ src1 += stride;
+ src2 += stride2;
+ dst += sse_stride;
+ }
+}
+
+static AOM_FORCE_INLINE void xx_load_and_pad_left(uint32_t *src,
+ __m256i *v256tmp) {
+ *v256tmp = _mm256_loadu_si256((__m256i *)src);
+ // For the first column, replicate the first element twice to the left
+ __m256i v256tmp1 = _mm256_shuffle_epi32(*v256tmp, 0xEA);
+ *v256tmp = _mm256_inserti128_si256(*v256tmp,
+ _mm256_extracti128_si256(v256tmp1, 0), 0);
+}
+
+static AOM_FORCE_INLINE void xx_load_and_pad_right(uint32_t *src,
+ __m256i *v256tmp) {
+ *v256tmp = _mm256_loadu_si256((__m256i *)src);
+ // For the last column, replicate the last element twice to the right
+ __m256i v256tmp1 = _mm256_shuffle_epi32(*v256tmp, 0x54);
+ *v256tmp = _mm256_inserti128_si256(*v256tmp,
+ _mm256_extracti128_si256(v256tmp1, 1), 1);
+}
+
+static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) {
+ // Mask the required 5 values inside the vector
+ __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]);
+ __m128i v128a, v128b;
+ // Extract 256b as two 128b registers A and B
+ v128a = _mm256_castsi256_si128(vtmp);
+ v128b = _mm256_extracti128_si256(vtmp, 1);
+ // A = [A0+B0, A1+B1, A2+B2, A3+B3]
+ v128a = _mm_add_epi32(v128a, v128b);
+ // B = [A2+B2, A3+B3, 0, 0]
+ v128b = _mm_srli_si128(v128a, 8);
+ // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
+ v128a = _mm_add_epi32(v128a, v128b);
+ // B = [A1+B1+A3+B3, 0, 0, 0]
+ v128b = _mm_srli_si128(v128a, 4);
+ // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
+ v128a = _mm_add_epi32(v128a, v128b);
+ return _mm_extract_epi32(v128a, 0);
+}
+
+static void highbd_apply_temporal_filter(
+ const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+ uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd,
+ const double inv_num_ref_pixels, const double decay_factor,
+ const double inv_factor, const double weight_factor, double *d_factor,
+ int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_sse[BH][BW];
+
+ if (block_width == 32) {
+ get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width,
+ block_height, frame_sse, SSE_STRIDE);
+ } else {
+ get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width,
+ block_height, frame_sse, SSE_STRIDE);
+ }
+
+ __m256i vsrc[5];
+
+ // Traverse 4 columns at a time
+ // First and last columns will require padding
+ int col;
+ uint32_t *src = frame_sse;
+ for (int i = 2; i < 5; i++) {
+ xx_load_and_pad_left(src, &vsrc[i]);
+ src += SSE_STRIDE;
+ }
+
+ // Copy first row to first 2 vectors
+ vsrc[0] = vsrc[2];
+ vsrc[1] = vsrc[2];
+
+ for (int row = 0; row < block_height - 3; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ xx_load_and_pad_left(src, &vsrc[4]);
+ src += SSE_STRIDE;
+
+ acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3);
+ }
+ for (int row = block_height - 3; row < block_height; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ acc_5x5_sse[row][0] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][3] = xx_mask_and_hadd(vsum, 3);
+ }
+ for (col = 4; col < block_width - 4; col += 4) {
+ src = frame_sse + col;
+
+ // Load 3 rows from the top (middle columns need no padding)
+ for (int i = 2; i < 5; i++) {
+ vsrc[i] = _mm256_loadu_si256((__m256i *)src);
+ src += SSE_STRIDE;
+ }
+
+ // Copy first row to first 2 vectors
+ vsrc[0] = vsrc[2];
+ vsrc[1] = vsrc[2];
+
+ for (int row = 0; row < block_height - 3; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ vsrc[4] = _mm256_loadu_si256((__m256i *)src);
+
+ src += SSE_STRIDE;
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+ }
+ for (int row = block_height - 3; row < block_height; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+ }
+ }
+
+ src = frame_sse + col;
+
+ // Load and pad (for the last column) 3 rows from the top
+ for (int i = 2; i < 5; i++) {
+ xx_load_and_pad_right(src, &vsrc[i]);
+ src += SSE_STRIDE;
+ }
+
+ // Copy first row to first 2 vectors
+ vsrc[0] = vsrc[2];
+ vsrc[1] = vsrc[2];
+
+ for (int row = 0; row < block_height - 3; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ xx_load_and_pad_right(src, &vsrc[4]);
+ src += SSE_STRIDE;
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+ }
+ for (int row = block_height - 3; row < block_height; row++) {
+ __m256i vsum1 = _mm256_add_epi32(vsrc[0], vsrc[1]);
+ __m256i vsum2 = _mm256_add_epi32(vsrc[2], vsrc[3]);
+ __m256i vsum3 = _mm256_add_epi32(vsum1, vsum2);
+ __m256i vsum = _mm256_add_epi32(vsum3, vsrc[4]);
+
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum, 3);
+ }
+
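+ // The per-pixel filter weight follows the non-local-means form
+ // w = exp(-min(e, 7)) * TF_WEIGHT_SCALE, where e combines the 5x5 window
+ // SSE, the subblock MSE, and the motion-distance decay factors.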
+ double subblock_mses_scaled[4];
+ double d_factor_decayed[4];
+ for (int idx = 0; idx < 4; idx++) {
+ subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+ d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+ }
+ if (tf_wgt_calc_lvl == 0) {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ } else {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ }
+}
+
+void av1_highbd_apply_temporal_filter_avx2(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(block_size == BLOCK_32X32 && "Only support 32x32 block with avx2!");
+ assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with avx2!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint32_t frame_sse[SSE_STRIDE * BH] = { 0 };
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+ uint16_t *pred1 = CONVERT_TO_SHORTPTR(pred);
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint16_t *ref =
+ CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+ // Filter the U and V planes using Y-plane information, since motion search
+ // is performed only on the Y plane and its results are therefore more
+ // accurate. The luma SSE sum is reused for both chroma planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++, k++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+ }
+ }
+ }
+ }
+ }
+
+ highbd_apply_temporal_filter(
+ ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h,
+ subblock_mses, accum + plane_offset, count + plane_offset, frame_sse,
+ luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl);
+ plane_offset += plane_h * plane_w;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c
new file mode 100644
index 0000000000..2032847083
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_temporal_filter_sse2.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/mathutils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+// For the squared error buffer, keep a padding of 4 samples: the 5x5 window
+// sums read up to 2 samples on either side of each column
+#define SSE_STRIDE (BW + 4)
+
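+// Byte masks used by xx_mask_and_hadd to select, for each of the 4 output
+// columns, the 5 consecutive 32-bit sums of a 5-wide window spanning the
+// two 128-bit vectors.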
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = {
+ { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } },
+ { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } },
+ { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } },
+ { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } }
+};
+
+static void get_squared_error(const uint16_t *frame1, const unsigned int stride,
+ const uint16_t *frame2,
+ const unsigned int stride2, const int block_width,
+ const int block_height, uint32_t *frame_sse,
+ const unsigned int dst_stride) {
+ const uint16_t *src1 = frame1;
+ const uint16_t *src2 = frame2;
+ uint32_t *dst = frame_sse;
+
+ for (int i = 0; i < block_height; i++) {
+ for (int j = 0; j < block_width; j += 8) {
+ __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j));
+ __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j));
+
+ __m128i vdiff = _mm_sub_epi16(vsrc1, vsrc2);
+ __m128i vmullo = _mm_mullo_epi16(vdiff, vdiff);
+ __m128i vmullh = _mm_mulhi_epi16(vdiff, vdiff);
+
+ __m128i vres1 = _mm_unpacklo_epi16(vmullo, vmullh);
+ __m128i vres2 = _mm_unpackhi_epi16(vmullo, vmullh);
+
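+      // Store 2 samples in from the start of the row so the data sits
+      // centered within the padded SSE_STRIDE-wide buffer.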
+ _mm_storeu_si128((__m128i *)(dst + j + 2), vres1);
+ _mm_storeu_si128((__m128i *)(dst + j + 6), vres2);
+ }
+
+ src1 += stride;
+ src2 += stride2;
+ dst += dst_stride;
+ }
+}
+
+static void xx_load_and_pad(uint32_t *src, __m128i *dstvec, int col,
+ int block_width) {
+ __m128i vtmp1 = _mm_loadu_si128((__m128i *)src);
+ __m128i vtmp2 = _mm_loadu_si128((__m128i *)(src + 4));
+ // For the first column, replicate the first element twice to the left
+ dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA);
+ // For the last column, replicate the last element twice to the right
+ dstvec[1] = (col < block_width - 4) ? vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54);
+}
+
+static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) {
+ __m128i veca, vecb;
+ // Mask and obtain the required 5 values inside the vector
+ veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]);
+ vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]);
+ // A = [A0+B0, A1+B1, A2+B2, A3+B3]
+ veca = _mm_add_epi32(veca, vecb);
+ // B = [A2+B2, A3+B3, 0, 0]
+ vecb = _mm_srli_si128(veca, 8);
+ // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
+ veca = _mm_add_epi32(veca, vecb);
+ // B = [A1+B1+A3+B3, 0, 0, 0]
+ vecb = _mm_srli_si128(veca, 4);
+ // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
+ veca = _mm_add_epi32(veca, vecb);
+ return _mm_cvtsi128_si32(veca);
+}
+
+static void highbd_apply_temporal_filter(
+ const uint16_t *frame1, const unsigned int stride, const uint16_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+ uint32_t *frame_sse, uint32_t *luma_sse_sum, int bd,
+ const double inv_num_ref_pixels, const double decay_factor,
+ const double inv_factor, const double weight_factor, double *d_factor,
+ int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_sse[BH][BW];
+
+ get_squared_error(frame1, stride, frame2, stride2, block_width, block_height,
+ frame_sse, SSE_STRIDE);
+
+ __m128i vsrc[5][2];
+
+ // Traverse 4 columns at a time
+ // First and last columns will require padding
+ for (int col = 0; col < block_width; col += 4) {
+ uint32_t *src = frame_sse + col;
+
+ // Load and pad (for first and last col) 3 rows from the top
+ for (int i = 2; i < 5; i++) {
+ xx_load_and_pad(src, vsrc[i], col, block_width);
+ src += SSE_STRIDE;
+ }
+
+ // Padding for top 2 rows
+ vsrc[0][0] = vsrc[2][0];
+ vsrc[0][1] = vsrc[2][1];
+ vsrc[1][0] = vsrc[2][0];
+ vsrc[1][1] = vsrc[2][1];
+
+ for (int row = 0; row < block_height - 3; row++) {
+ __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]);
+ __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]);
+ __m128i vsum13 = _mm_add_epi32(vsum11, vsum12);
+ __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]);
+
+ __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]);
+ __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]);
+ __m128i vsum23 = _mm_add_epi32(vsum21, vsum22);
+ __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]);
+
+ vsrc[0][0] = vsrc[1][0];
+ vsrc[0][1] = vsrc[1][1];
+ vsrc[1][0] = vsrc[2][0];
+ vsrc[1][1] = vsrc[2][1];
+ vsrc[2][0] = vsrc[3][0];
+ vsrc[2][1] = vsrc[3][1];
+ vsrc[3][0] = vsrc[4][0];
+ vsrc[3][1] = vsrc[4][1];
+
+ // Load next row
+ xx_load_and_pad(src, vsrc[4], col, block_width);
+ src += SSE_STRIDE;
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3);
+ }
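+ // Bottom 3 rows: no new rows are loaded, so vsrc[4] keeps the last row
+ // and acts as bottom padding.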
+ for (int row = block_height - 3; row < block_height; row++) {
+ __m128i vsum11 = _mm_add_epi32(vsrc[0][0], vsrc[1][0]);
+ __m128i vsum12 = _mm_add_epi32(vsrc[2][0], vsrc[3][0]);
+ __m128i vsum13 = _mm_add_epi32(vsum11, vsum12);
+ __m128i vsum1 = _mm_add_epi32(vsum13, vsrc[4][0]);
+
+ __m128i vsum21 = _mm_add_epi32(vsrc[0][1], vsrc[1][1]);
+ __m128i vsum22 = _mm_add_epi32(vsrc[2][1], vsrc[3][1]);
+ __m128i vsum23 = _mm_add_epi32(vsum21, vsum22);
+ __m128i vsum2 = _mm_add_epi32(vsum23, vsrc[4][1]);
+
+ vsrc[0][0] = vsrc[1][0];
+ vsrc[0][1] = vsrc[1][1];
+ vsrc[1][0] = vsrc[2][0];
+ vsrc[1][1] = vsrc[2][1];
+ vsrc[2][0] = vsrc[3][0];
+ vsrc[2][1] = vsrc[3][1];
+ vsrc[3][0] = vsrc[4][0];
+ vsrc[3][1] = vsrc[4][1];
+
+ acc_5x5_sse[row][col] = xx_mask_and_hadd(vsum1, vsum2, 0);
+ acc_5x5_sse[row][col + 1] = xx_mask_and_hadd(vsum1, vsum2, 1);
+ acc_5x5_sse[row][col + 2] = xx_mask_and_hadd(vsum1, vsum2, 2);
+ acc_5x5_sse[row][col + 3] = xx_mask_and_hadd(vsum1, vsum2, 3);
+ }
+ }
+
+ double subblock_mses_scaled[4];
+ double d_factor_decayed[4];
+ for (int idx = 0; idx < 4; idx++) {
+ subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+ d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+ }
+ if (tf_wgt_calc_lvl == 0) {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ } else {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ // Scale down the difference for high bit depth input.
+ diff_sse >>= ((bd - 8) * 2);
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ }
+}
+
+void av1_highbd_apply_temporal_filter_sse2(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+ assert(block_size == BLOCK_32X32 && "Only support 32x32 block with sse2!");
+ assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with sse2!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint32_t frame_sse[SSE_STRIDE * BH] = { 0 };
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+ uint16_t *pred1 = CONVERT_TO_SHORTPTR(pred);
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint16_t *ref =
+ CONVERT_TO_SHORTPTR(frame_to_filter->buffers[plane]) + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+ // Filter the U and V planes using Y-plane information, since motion search
+ // is performed only on the Y plane and its results are therefore more
+ // accurate. The luma SSE sum is reused for both chroma planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++, k++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+ }
+ }
+ }
+ }
+ }
+
+ highbd_apply_temporal_filter(
+ ref, frame_stride, pred1 + plane_offset, plane_w, plane_w, plane_h,
+ subblock_mses, accum + plane_offset, count + plane_offset, frame_sse,
+ luma_sse_sum, mbd->bd, inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl);
+ plane_offset += plane_h * plane_w;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/ml_avx2.c b/third_party/aom/av1/encoder/x86/ml_avx2.c
new file mode 100644
index 0000000000..6432708416
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/ml_avx2.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/x86/ml_sse3.h"
+
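+// Multiplies one 8-wide input vector with two consecutive weight rows and
+// horizontally adds adjacent products. Note that _mm256_hadd_ps operates
+// within 128-bit lanes, so the callers undo the resulting interleaving with
+// permute/extract before accumulating.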
+#define CALC_OUTPUT_FOR_2ROWS \
+ const int index = weight_idx + (2 * i * tot_num_inputs); \
+ const __m256 weight0 = _mm256_loadu_ps(&weights[index]); \
+ const __m256 weight1 = _mm256_loadu_ps(&weights[index + tot_num_inputs]); \
+ const __m256 mul0 = _mm256_mul_ps(inputs256, weight0); \
+ const __m256 mul1 = _mm256_mul_ps(inputs256, weight1); \
+ hadd[i] = _mm256_hadd_ps(mul0, mul1);
+
+static INLINE void nn_propagate_8to1(
+ const float *const inputs, const float *const weights,
+ const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+ int num_outputs, float *const output_nodes, int is_clip_required) {
+ // Process one output row at a time.
+ for (int out = 0; out < num_outputs; out++) {
+ __m256 in_result = _mm256_setzero_ps();
+ float bias_val = bias[out];
+ for (int in = 0; in < num_inputs_to_process; in += 8) {
+ const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]);
+ const int weight_idx = in + (out * tot_num_inputs);
+ const __m256 weight0 = _mm256_loadu_ps(&weights[weight_idx]);
+ const __m256 mul0 = _mm256_mul_ps(inputs256, weight0);
+ in_result = _mm256_add_ps(in_result, mul0);
+ }
+ const __m128 low_128 = _mm256_castps256_ps128(in_result);
+ const __m128 high_128 = _mm256_extractf128_ps(in_result, 1);
+ const __m128 sum_par_0 = _mm_add_ps(low_128, high_128);
+ const __m128 sum_par_1 = _mm_hadd_ps(sum_par_0, sum_par_0);
+ const __m128 sum_tot =
+ _mm_add_ps(_mm_shuffle_ps(sum_par_1, sum_par_1, 0x99), sum_par_1);
+
+ bias_val += _mm_cvtss_f32(sum_tot);
+ if (is_clip_required) bias_val = AOMMAX(bias_val, 0);
+ output_nodes[out] = bias_val;
+ }
+}
+
+static INLINE void nn_propagate_8to4(
+ const float *const inputs, const float *const weights,
+ const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+ int num_outputs, float *const output_nodes, int is_clip_required) {
+ __m256 hadd[2];
+ for (int out = 0; out < num_outputs; out += 4) {
+ __m128 bias_reg = _mm_loadu_ps(&bias[out]);
+ __m128 in_result = _mm_setzero_ps();
+ for (int in = 0; in < num_inputs_to_process; in += 8) {
+ const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]);
+ const int weight_idx = in + (out * tot_num_inputs);
+ // Process two output rows at a time.
+ for (int i = 0; i < 2; i++) {
+ CALC_OUTPUT_FOR_2ROWS
+ }
+
+ const __m256 sum_par = _mm256_hadd_ps(hadd[0], hadd[1]);
+ const __m128 low_128 = _mm256_castps256_ps128(sum_par);
+ const __m128 high_128 = _mm256_extractf128_ps(sum_par, 1);
+ const __m128 result = _mm_add_ps(low_128, high_128);
+
+ in_result = _mm_add_ps(in_result, result);
+ }
+
+ in_result = _mm_add_ps(in_result, bias_reg);
+ if (is_clip_required) in_result = _mm_max_ps(in_result, _mm_setzero_ps());
+ _mm_storeu_ps(&output_nodes[out], in_result);
+ }
+}
+
+static INLINE void nn_propagate_8to8(
+ const float *const inputs, const float *const weights,
+ const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+ int num_outputs, float *const output_nodes, int is_clip_required) {
+ __m256 hadd[4];
+ for (int out = 0; out < num_outputs; out += 8) {
+ __m256 bias_reg = _mm256_loadu_ps(&bias[out]);
+ __m256 in_result = _mm256_setzero_ps();
+ for (int in = 0; in < num_inputs_to_process; in += 8) {
+ const __m256 inputs256 = _mm256_loadu_ps(&inputs[in]);
+ const int weight_idx = in + (out * tot_num_inputs);
+ // Process two output rows at a time.
+ for (int i = 0; i < 4; i++) {
+ CALC_OUTPUT_FOR_2ROWS
+ }
+ const __m256 hh0 = _mm256_hadd_ps(hadd[0], hadd[1]);
+ const __m256 hh1 = _mm256_hadd_ps(hadd[2], hadd[3]);
+
+ __m256 ht_0 = _mm256_permute2f128_ps(hh0, hh1, 0x20);
+ __m256 ht_1 = _mm256_permute2f128_ps(hh0, hh1, 0x31);
+
+ __m256 result = _mm256_add_ps(ht_0, ht_1);
+ in_result = _mm256_add_ps(in_result, result);
+ }
+ in_result = _mm256_add_ps(in_result, bias_reg);
+ if (is_clip_required)
+ in_result = _mm256_max_ps(in_result, _mm256_setzero_ps());
+ _mm256_storeu_ps(&output_nodes[out], in_result);
+ }
+}
+
+static INLINE void nn_propagate_input_multiple_of_8(
+ const float *const inputs, const float *const weights,
+ const float *const bias, int num_inputs_to_process, int tot_num_inputs,
+ bool is_output_layer, int num_outputs, float *const output_nodes) {
+ // Clip the output (ReLU) only for hidden layers, and only once all of the
+ // layer's inputs have been accumulated (i.e. not during partial passes).
+ const int is_clip_required =
+ !is_output_layer && num_inputs_to_process == tot_num_inputs;
+ if (num_outputs % 8 == 0) {
+ nn_propagate_8to8(inputs, weights, bias, num_inputs_to_process,
+ tot_num_inputs, num_outputs, output_nodes,
+ is_clip_required);
+ } else if (num_outputs % 4 == 0) {
+ nn_propagate_8to4(inputs, weights, bias, num_inputs_to_process,
+ tot_num_inputs, num_outputs, output_nodes,
+ is_clip_required);
+ } else {
+ nn_propagate_8to1(inputs, weights, bias, num_inputs_to_process,
+ tot_num_inputs, num_outputs, output_nodes,
+ is_clip_required);
+ }
+}
+
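+// Forward pass through the fully connected network described by nn_config.
+// A minimal usage sketch (`features` and `cfg` are hypothetical names):
+//   float scores[NN_MAX_NODES_PER_LAYER];
+//   av1_nn_predict_avx2(features, &cfg, /*reduce_prec=*/1, scores);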
+void av1_nn_predict_avx2(const float *input_nodes,
+ const NN_CONFIG *const nn_config, int reduce_prec,
+ float *const output) {
+ float buf[2][NN_MAX_NODES_PER_LAYER];
+ int buf_index = 0;
+ int num_inputs = nn_config->num_inputs;
+ assert(num_inputs > 0 && num_inputs <= NN_MAX_NODES_PER_LAYER);
+
+ for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) {
+ const float *layer_weights = nn_config->weights[layer];
+ const float *layer_bias = nn_config->bias[layer];
+ bool is_output_layer = layer == nn_config->num_hidden_layers;
+ float *const output_nodes = is_output_layer ? output : &buf[buf_index][0];
+ const int num_outputs = is_output_layer
+ ? nn_config->num_outputs
+ : nn_config->num_hidden_nodes[layer];
+ assert(num_outputs > 0 && num_outputs <= NN_MAX_NODES_PER_LAYER);
+
+ // Process inputs in multiples of 8 using AVX2 intrinsics.
+ if (num_inputs % 8 == 0) {
+ nn_propagate_input_multiple_of_8(input_nodes, layer_weights, layer_bias,
+ num_inputs, num_inputs, is_output_layer,
+ num_outputs, output_nodes);
+ } else {
+ // When the number of inputs is not a multiple of 8, use a hybrid of the
+ // AVX2 and SSE3 paths as needed.
+ const int in_mul_8 = num_inputs / 8;
+ const int num_inputs_to_process = in_mul_8 * 8;
+ int bias_is_considered = 0;
+ if (in_mul_8) {
+ nn_propagate_input_multiple_of_8(
+ input_nodes, layer_weights, layer_bias, num_inputs_to_process,
+ num_inputs, is_output_layer, num_outputs, output_nodes);
+ bias_is_considered = 1;
+ }
+
+ const float *out_temp = bias_is_considered ? output_nodes : layer_bias;
+ const int input_remaining = num_inputs % 8;
+ if (input_remaining % 4 == 0 && num_outputs % 8 == 0) {
+ for (int out = 0; out < num_outputs; out += 8) {
+ __m128 out_h = _mm_loadu_ps(&out_temp[out + 4]);
+ __m128 out_l = _mm_loadu_ps(&out_temp[out]);
+ for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to8_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &out_h, &out_l, num_inputs);
+ }
+ if (!is_output_layer) {
+ const __m128 zero = _mm_setzero_ps();
+ out_h = _mm_max_ps(out_h, zero);
+ out_l = _mm_max_ps(out_l, zero);
+ }
+ _mm_storeu_ps(&output_nodes[out + 4], out_h);
+ _mm_storeu_ps(&output_nodes[out], out_l);
+ }
+ } else if (input_remaining % 4 == 0 && num_outputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out += 4) {
+ __m128 outputs = _mm_loadu_ps(&out_temp[out]);
+ for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to4_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &outputs, num_inputs);
+ }
+ if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+ _mm_storeu_ps(&output_nodes[out], outputs);
+ }
+ } else if (input_remaining % 4 == 0) {
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 outputs = _mm_load1_ps(&out_temp[out]);
+ for (int in = in_mul_8 * 8; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to1_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &outputs);
+ }
+ if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+ output_nodes[out] = _mm_cvtss_f32(outputs);
+ }
+ } else {
+ // Use SSE instructions for scalar operations to avoid the latency
+ // of swapping between SIMD and FPU modes.
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 outputs = _mm_load1_ps(&out_temp[out]);
+ for (int in_node = in_mul_8 * 8; in_node < num_inputs; in_node++) {
+ __m128 input = _mm_load1_ps(&input_nodes[in_node]);
+ __m128 weight =
+ _mm_load1_ps(&layer_weights[num_inputs * out + in_node]);
+ outputs = _mm_add_ps(outputs, _mm_mul_ps(input, weight));
+ }
+ if (!is_output_layer) outputs = _mm_max_ps(outputs, _mm_setzero_ps());
+ output_nodes[out] = _mm_cvtss_f32(outputs);
+ }
+ }
+ }
+ // Before processing the next layer, treat the output of current layer as
+ // input to next layer.
+ input_nodes = output_nodes;
+ num_inputs = num_outputs;
+ buf_index = 1 - buf_index;
+ }
+ if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
diff --git a/third_party/aom/av1/encoder/x86/ml_sse3.c b/third_party/aom/av1/encoder/x86/ml_sse3.c
new file mode 100644
index 0000000000..4748a68d38
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/ml_sse3.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdbool.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/x86/ml_sse3.h"
+
+// To avoid the high latency of swapping between FPU and SIMD operations, we
+// keep the result in a 128-bit register even though we only care about a
+// single value.
+static void nn_propagate_8to1(const float *const inputs,
+ const float *const weights,
+ __m128 *const output) {
+ const __m128 inputs_h = _mm_loadu_ps(&inputs[4]);
+ const __m128 inputs_l = _mm_loadu_ps(inputs);
+
+ const __m128 weights_h = _mm_loadu_ps(&weights[4]);
+ const __m128 weights_l = _mm_loadu_ps(weights);
+
+ const __m128 mul_h = _mm_mul_ps(inputs_h, weights_h);
+ const __m128 mul_l = _mm_mul_ps(inputs_l, weights_l);
+ // [7 6 5 4] [3 2 1 0] (weight and input indices)
+
+ const __m128 vadd = _mm_add_ps(mul_l, mul_h);
+ // [7+3 6+2 5+1 4+0]
+ const __m128 hadd1 = _mm_hadd_ps(vadd, vadd);
+ // [7+6+3+2 5+4+1+0 7+6+3+2 5+4+1+0]
+ const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1);
+ // [7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0]
+ *output = _mm_add_ps(*output, hadd2);
+}
+
+void av1_nn_propagate_4to1_sse3(const float *const inputs,
+ const float *const weights,
+ __m128 *const output) {
+ const __m128 inputs128 = _mm_loadu_ps(inputs);
+
+ const __m128 weights128 = _mm_loadu_ps(weights);
+
+ const __m128 mul = _mm_mul_ps(inputs128, weights128);
+ // [3 2 1 0] (weight and input indices)
+
+ const __m128 hadd1 = _mm_hadd_ps(mul, mul);
+ // [3+2 1+0 3+2 1+0]
+ const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1);
+ // [3+2+1+0 3+2+1+0 3+2+1+0 3+2+1+0]
+ *output = _mm_add_ps(*output, hadd2);
+}
+
+void av1_nn_propagate_4to4_sse3(const float *const inputs,
+ const float *const weights,
+ __m128 *const outputs, const int num_inputs) {
+ const __m128 inputs128 = _mm_loadu_ps(inputs);
+
+ __m128 hadd[2];
+ for (int i = 0; i < 2; i++) { // For each pair of outputs
+ const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]);
+ const __m128 mul0 = _mm_mul_ps(weight0, inputs128);
+ const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]);
+ const __m128 mul1 = _mm_mul_ps(weight1, inputs128);
+ hadd[i] = _mm_hadd_ps(mul0, mul1);
+ }
+ // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices)
+ // hadd[1] = [15+14 13+12 11+10 9+8]
+
+ const __m128 hh = _mm_hadd_ps(hadd[0], hadd[1]);
+ // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0]
+
+ *outputs = _mm_add_ps(*outputs, hh);
+}
+
+void av1_nn_propagate_4to8_sse3(const float *const inputs,
+ const float *const weights, __m128 *const out_h,
+ __m128 *const out_l, const int num_inputs) {
+ const __m128 inputs128 = _mm_loadu_ps(inputs);
+
+ __m128 hadd[4];
+ for (int i = 0; i < 4; i++) { // For each pair of outputs
+ const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]);
+ const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]);
+ const __m128 mul0 = _mm_mul_ps(inputs128, weight0);
+ const __m128 mul1 = _mm_mul_ps(inputs128, weight1);
+ hadd[i] = _mm_hadd_ps(mul0, mul1);
+ }
+ // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices)
+ // hadd[1] = [15+14 13+12 11+10 9+8]
+ // hadd[2] = [23+22 21+20 19+18 17+16]
+ // hadd[3] = [31+30 29+28 27+26 25+24]
+
+ const __m128 hh0 = _mm_hadd_ps(hadd[0], hadd[1]);
+ // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0]
+ const __m128 hh1 = _mm_hadd_ps(hadd[2], hadd[3]);
+ // [31+30+29+28 27+26+25+24 23+22+21+20 19+18+17+16]
+
+ *out_h = _mm_add_ps(*out_h, hh1);
+ *out_l = _mm_add_ps(*out_l, hh0);
+}
+
+static void nn_propagate_8to4(const float *const inputs,
+ const float *const weights, __m128 *const outputs,
+ const int num_inputs) {
+ const __m128 inputs_h = _mm_loadu_ps(inputs + 4);
+ const __m128 inputs_l = _mm_loadu_ps(inputs);
+ // [7 6 5 4] [3 2 1 0] (input indices)
+
+ __m128 add[4];
+ for (int i = 0; i < 4; i++) { // For each output:
+ const __m128 weight_h = _mm_loadu_ps(&weights[i * num_inputs + 4]);
+ const __m128 weight_l = _mm_loadu_ps(&weights[i * num_inputs]);
+ const __m128 mul_h = _mm_mul_ps(inputs_h, weight_h);
+ const __m128 mul_l = _mm_mul_ps(inputs_l, weight_l);
+ add[i] = _mm_add_ps(mul_l, mul_h);
+ }
+ // add[0] = [7+3 6+2 5+1 4+0]
+ // add[1] = [15+11 14+10 13+9 12+8]
+ // add[2] = [23+19 22+18 21+17 20+16]
+ // add[3] = [31+27 30+26 29+25 28+24]
+
+ const __m128 hadd_h = _mm_hadd_ps(add[2], add[3]);
+ // [31+30+27+26 29+28+25+24 23+22+19+18 21+20+17+16]
+ const __m128 hadd_l = _mm_hadd_ps(add[0], add[1]);
+ // [15+14+11+10 13+12+9+8 7+6+3+2 5+4+1+0]
+
+ const __m128 haddhadd = _mm_hadd_ps(hadd_l, hadd_h);
+ // [31+30+29+28+27+26+25+24 23+22+21+20+19+18+17+16
+ // 15+14+13+12+11+10+9+8 7+6+5+4+3+2+1+0]
+
+ *outputs = _mm_add_ps(*outputs, haddhadd);
+}
+
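+// ReLU activations: clamp negative node values to zero.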
+static void nn_activate8(__m128 *out_h, __m128 *out_l) {
+ const __m128 zero = _mm_setzero_ps();
+ *out_h = _mm_max_ps(*out_h, zero);
+ *out_l = _mm_max_ps(*out_l, zero);
+}
+
+static void nn_activate4(__m128 *x) { *x = _mm_max_ps(*x, _mm_setzero_ps()); }
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict_sse3(const float *input_nodes,
+ const NN_CONFIG *const nn_config, int reduce_prec,
+ float *const output) {
+ float buf[2][NN_MAX_NODES_PER_LAYER];
+ int buf_index = 0;
+ int num_inputs = nn_config->num_inputs;
+
+ // Iterate over the hidden layers; the final iteration is the output layer.
+ for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) {
+ const float *layer_weights = nn_config->weights[layer];
+ const float *layer_bias = nn_config->bias[layer];
+ bool output_layer = (layer == nn_config->num_hidden_layers);
+ float *const output_nodes = output_layer ? output : &buf[buf_index][0];
+ const int num_outputs = output_layer ? nn_config->num_outputs
+ : nn_config->num_hidden_nodes[layer];
+
+ if (num_inputs % 4 == 0 && num_outputs % 8 == 0) {
+ for (int out = 0; out < num_outputs; out += 8) {
+ __m128 out_h = _mm_loadu_ps(&layer_bias[out + 4]);
+ __m128 out_l = _mm_loadu_ps(&layer_bias[out]);
+ for (int in = 0; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to8_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &out_h, &out_l, num_inputs);
+ }
+ if (!output_layer) nn_activate8(&out_h, &out_l);
+ _mm_storeu_ps(&output_nodes[out + 4], out_h);
+ _mm_storeu_ps(&output_nodes[out], out_l);
+ }
+ } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out += 4) {
+ __m128 outputs = _mm_loadu_ps(&layer_bias[out]);
+ for (int in = 0; in < num_inputs; in += 8) {
+ nn_propagate_8to4(&input_nodes[in],
+ &layer_weights[out * num_inputs + in], &outputs,
+ num_inputs);
+ }
+ if (!output_layer) nn_activate4(&outputs);
+ _mm_storeu_ps(&output_nodes[out], outputs);
+ }
+ } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out += 4) {
+ __m128 outputs = _mm_loadu_ps(&layer_bias[out]);
+ for (int in = 0; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to4_sse3(&input_nodes[in],
+ &layer_weights[out * num_inputs + in],
+ &outputs, num_inputs);
+ }
+ if (!output_layer) nn_activate4(&outputs);
+ _mm_storeu_ps(&output_nodes[out], outputs);
+ }
+ } else if (num_inputs % 8 == 0) {
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 total = _mm_load1_ps(&layer_bias[out]);
+ for (int in = 0; in < num_inputs; in += 8) {
+ nn_propagate_8to1(&input_nodes[in],
+ &layer_weights[out * num_inputs + in], &total);
+ }
+ if (!output_layer) nn_activate4(&total);
+ output_nodes[out] = _mm_cvtss_f32(total);
+ }
+ } else if (num_inputs % 4 == 0) {
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 total = _mm_load1_ps(&layer_bias[out]);
+ for (int in = 0; in < num_inputs; in += 4) {
+ av1_nn_propagate_4to1_sse3(
+ &input_nodes[in], &layer_weights[out * num_inputs + in], &total);
+ }
+ if (!output_layer) nn_activate4(&total);
+ output_nodes[out] = _mm_cvtss_f32(total);
+ }
+ } else {
+ // Use SSE instructions for scalar operations to avoid the latency of
+ // swapping between SIMD and FPU modes.
+ for (int out = 0; out < num_outputs; out++) {
+ __m128 total = _mm_load1_ps(&layer_bias[out]);
+ for (int in_node = 0; in_node < num_inputs; in_node++) {
+ __m128 input = _mm_load1_ps(&input_nodes[in_node]);
+ __m128 weight =
+ _mm_load1_ps(&layer_weights[num_inputs * out + in_node]);
+ total = _mm_add_ps(total, _mm_mul_ps(input, weight));
+ }
+ if (!output_layer) nn_activate4(&total);
+ output_nodes[out] = _mm_cvtss_f32(total);
+ }
+ }
+ input_nodes = output_nodes;
+ num_inputs = num_outputs;
+ buf_index = 1 - buf_index;
+ }
+ if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
+
+// Based on N. N. Schraudolph. A Fast, Compact Approximation of the Exponential
+// Function. Neural Computation, 11(4):853–862, 1999.
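+// The trick: for an IEEE single-precision float, reinterpreting the integer
+// i = (2^23 / ln 2) * x + 127 * 2^23 - C as a float yields approximately
+// e^x, because the exponent field encodes powers of two and the mantissa
+// linearly interpolates between them; C tunes the approximation error.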
+static AOM_INLINE __m128 approx_exp(__m128 y) {
+#define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2)
+#define B \
+ 127 // Offset for the exponent according to IEEE floating point standard.
+#define C 60801  // Magic number that controls the accuracy of approximation
+ const __m128 multiplier = _mm_set1_ps(A);
+ const __m128i offset = _mm_set1_epi32(B * (1 << 23) - C);
+
+ y = _mm_mul_ps(y, multiplier);
+ y = _mm_castsi128_ps(_mm_add_epi32(_mm_cvtps_epi32(y), offset));
+ return y;
+#undef A
+#undef B
+#undef C
+}
+
+static AOM_INLINE __m128 reduce_max(__m128 reg) {
+ __m128 tmp_reg;
+
+ tmp_reg = _mm_shuffle_ps(reg, reg, 0x4e); // 01 00 11 10
+ reg = _mm_max_ps(reg, tmp_reg);
+
+ tmp_reg = _mm_shuffle_ps(reg, reg, 0xb1); // 10 11 00 01
+ reg = _mm_max_ps(reg, tmp_reg);
+
+ return reg;
+}
+
+static AOM_INLINE __m128 reduce_sum(__m128 reg) {
+ __m128 tmp_reg;
+
+ tmp_reg = _mm_shuffle_ps(reg, reg, 0x4e); // 01 00 11 10
+ reg = _mm_add_ps(reg, tmp_reg);
+
+ tmp_reg = _mm_shuffle_ps(reg, reg, 0xb1); // 10 11 00 01
+ reg = _mm_add_ps(reg, tmp_reg);
+
+ return reg;
+}
+
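+// Numerically stable softmax over 16 values:
+//   p[i] = exp(x[i] - m) / sum_j exp(x[j] - m), with m = max_j x[j],
+// using approx_exp for the exponentials.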
+void av1_nn_fast_softmax_16_sse3(const float *input, float *output) {
+ // Clips at -10 to avoid underflow
+ const __m128 clipper = _mm_set1_ps(-10.0f);
+
+ // Load in 16 values
+ __m128 in_0 = _mm_loadu_ps(&input[0]);
+ __m128 in_1 = _mm_loadu_ps(&input[4]);
+ __m128 in_2 = _mm_loadu_ps(&input[8]);
+ __m128 in_3 = _mm_loadu_ps(&input[12]);
+
+ // Get the max
+ __m128 max_0 = _mm_max_ps(in_0, in_1);
+ __m128 max_1 = _mm_max_ps(in_2, in_3);
+
+ max_0 = _mm_max_ps(max_0, max_1);
+ max_0 = reduce_max(max_0);
+
+ // Subtract the max off and clip
+ in_0 = _mm_sub_ps(in_0, max_0);
+ in_1 = _mm_sub_ps(in_1, max_0);
+ in_2 = _mm_sub_ps(in_2, max_0);
+ in_3 = _mm_sub_ps(in_3, max_0);
+
+ in_0 = _mm_max_ps(in_0, clipper);
+ in_1 = _mm_max_ps(in_1, clipper);
+ in_2 = _mm_max_ps(in_2, clipper);
+ in_3 = _mm_max_ps(in_3, clipper);
+
+ // Exponentiate and compute the denominator
+ __m128 sum = in_0 = approx_exp(in_0);
+ in_1 = approx_exp(in_1);
+ sum = _mm_add_ps(sum, in_1);
+ in_2 = approx_exp(in_2);
+ sum = _mm_add_ps(sum, in_2);
+ in_3 = approx_exp(in_3);
+ sum = _mm_add_ps(sum, in_3);
+ sum = reduce_sum(sum);
+
+ // Divide to get the probability
+ in_0 = _mm_div_ps(in_0, sum);
+ in_1 = _mm_div_ps(in_1, sum);
+ in_2 = _mm_div_ps(in_2, sum);
+ in_3 = _mm_div_ps(in_3, sum);
+
+ _mm_storeu_ps(&output[0], in_0);
+ _mm_storeu_ps(&output[4], in_1);
+ _mm_storeu_ps(&output[8], in_2);
+ _mm_storeu_ps(&output[12], in_3);
+}
diff --git a/third_party/aom/av1/encoder/x86/ml_sse3.h b/third_party/aom/av1/encoder/x86/ml_sse3.h
new file mode 100644
index 0000000000..f41a2474af
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/ml_sse3.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_ML_SSE3_H_
+#define AOM_AV1_ENCODER_X86_ML_SSE3_H_
+
+#include <pmmintrin.h>
+
+void av1_nn_propagate_4to1_sse3(const float *const inputs,
+ const float *const weights,
+ __m128 *const output);
+
+void av1_nn_propagate_4to4_sse3(const float *const inputs,
+ const float *const weights,
+ __m128 *const outputs, const int num_inputs);
+
+void av1_nn_propagate_4to8_sse3(const float *const inputs,
+ const float *const weights, __m128 *const out_h,
+ __m128 *const out_l, const int num_inputs);
+
+#endif // AOM_AV1_ENCODER_X86_ML_SSE3_H_
diff --git a/third_party/aom/av1/encoder/x86/pickrst_avx2.c b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
new file mode 100644
index 0000000000..6658ed39a8
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
@@ -0,0 +1,2348 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
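+// These routines accumulate the statistics used to pick Wiener restoration
+// filters: M collects the cross-correlation between the degraded frame (dgd)
+// and the source, and H the auto-correlation of the degraded frame; the
+// filter taps are later solved from these elsewhere in the pickrst code.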
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void acc_stat_highbd_avx2(int64_t *dst, const uint16_t *dgd,
+ const __m256i *shuffle,
+ const __m256i *dgd_ijkl) {
+ // Load two 128-bit chunks from dgd
+ const __m256i s0 = _mm256_inserti128_si256(
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)dgd)),
+ _mm_loadu_si128((__m128i *)(dgd + 4)), 1);
+ // s0 = [11 10 9 8 7 6 5 4] [7 6 5 4 3 2 1 0] as u16 (values are dgd indices)
+ // The weird order is so the shuffle stays within 128-bit lanes
+
+ // Shuffle 16x u16 values within lanes according to the mask:
+ // [0 1 1 2 2 3 3 4] [0 1 1 2 2 3 3 4]
+ // (Actually we shuffle u8 values as there's no 16-bit shuffle)
+ const __m256i s1 = _mm256_shuffle_epi8(s0, *shuffle);
+ // s1 = [8 7 7 6 6 5 5 4] [4 3 3 2 2 1 1 0] as u16 (values are dgd indices)
+
+ // Multiply 16x 16-bit integers in dgd_ijkl and s1, resulting in 16x 32-bit
+ // integers then horizontally add pairs of these integers resulting in 8x
+ // 32-bit integers
+ const __m256i d0 = _mm256_madd_epi16(*dgd_ijkl, s1);
+ // d0 = [a b c d] [e f g h] as u32
+
+ // Take the lower-half of d0, extend to u64, add it on to dst (H)
+ const __m256i d0l = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 0));
+ // d0l = [a b] [c d] as u64
+ const __m256i dst0 = yy_load_256(dst);
+ yy_store_256(dst, _mm256_add_epi64(d0l, dst0));
+
+ // Take the upper-half of d0, extend to u64, add it on to dst (H)
+ const __m256i d0h = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 1));
+ // d0h = [e f] [g h] as u64
+ const __m256i dst1 = yy_load_256(dst + 4);
+ yy_store_256(dst + 4, _mm256_add_epi64(d0h, dst1));
+}
+
+static INLINE void acc_stat_highbd_win7_one_line_avx2(
+ const uint16_t *dgd, const uint16_t *src, int h_start, int h_end,
+ int dgd_stride, const __m256i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN],
+ int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+ int j, k, l;
+ const int wiener_win = WIENER_WIN;
+ // Main loop handles two pixels at a time
+ // We can assume that h_start is even, since it will always be aligned to
+ // a tile edge + some number of restoration units, and both of those will
+ // be 64-pixel aligned.
+ // However, at the edge of the image, h_end may be odd, so we need to handle
+ // that case correctly.
+ assert(h_start % 2 == 0);
+ const int h_end_even = h_end & ~1;
+ const int has_odd_pixel = h_end & 1;
+ for (j = h_start; j < h_end_even; j += 2) {
+ const uint16_t X1 = src[j];
+ const uint16_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ const uint16_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijk[l];
+ const uint16_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ // Load two u16 values from dgd_ijkl combined as a u32,
+ // then broadcast to 8x u32 slots of a 256
+ const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l));
+ // dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16
+
+ acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ }
+ }
+ }
+ // If the width is odd, add in the final pixel
+ if (has_odd_pixel) {
+ const uint16_t X1 = src[j];
+ *sumX += X1;
+ const uint16_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijk[l];
+ sumY[k][l] += D1;
+ M_int[k][l] += D1 * X1;
+
+ // The `acc_stat_highbd_avx2` function wants its input to have
+ // interleaved copies of two pixels, but we only have one. However, the
+ // pixels are (effectively) used as inputs to a multiply-accumulate. So
+ // if we set the extra pixel slot to 0, then it is effectively ignored.
+ const __m256i dgd_ijkl = _mm256_set1_epi32((int)D1);
+
+ acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_highbd_win7_opt_avx2(
+ const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M,
+ int64_t *H, aom_bit_depth_t bit_depth) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ const uint16_t avg =
+ find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ DECLARE_ALIGNED(32, int64_t, H_int[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } };
+ int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t sumX = 0;
+ const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_highbd_win7_one_line_avx2(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int, H_int);
+ }
+ }
+
+ uint8_t bit_depth_divider = 1;
+ if (bit_depth == AOM_BITS_12)
+ bit_depth_divider = 16;
+ else if (bit_depth == AOM_BITS_10)
+ bit_depth_divider = 4;
+
+ const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = (M_int[k][l] +
+ (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) /
+ bit_depth_divider;
+ int64_t *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] =
+ (H_int_[n * 8 + m] +
+ (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) /
+ bit_depth_divider;
+ }
+ }
+ }
+ }
+}
+
+static INLINE void acc_stat_highbd_win5_one_line_avx2(
+ const uint16_t *dgd, const uint16_t *src, int h_start, int h_end,
+ int dgd_stride, const __m256i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+ int j, k, l;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ // Main loop handles two pixels at a time
+ // We can assume that h_start is even, since it will always be aligned to
+ // a tile edge + some number of restoration units, and both of those will
+ // be 64-pixel aligned.
+ // However, at the edge of the image, h_end may be odd, so we need to handle
+ // that case correctly.
+ assert(h_start % 2 == 0);
+ const int h_end_even = h_end & ~1;
+ const int has_odd_pixel = h_end & 1;
+ for (j = h_start; j < h_end_even; j += 2) {
+ const uint16_t X1 = src[j];
+ const uint16_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ const uint16_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijk[l];
+ const uint16_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ // Load two u16 values from dgd_ijkl combined as a u32,
+ // then broadcast to 8x u32 slots of a 256
+ const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l));
+ // dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16
+
+ acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ }
+ }
+ }
+ // If the width is odd, add in the final pixel
+ if (has_odd_pixel) {
+ const uint16_t X1 = src[j];
+ *sumX += X1;
+ const uint16_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijk[l];
+ sumY[k][l] += D1;
+ M_int[k][l] += D1 * X1;
+
+ // The `acc_stat_highbd_avx2` function wants its input to have
+ // interleaved copies of two pixels, but we only have one. However, the
+ // pixels are (effectively) used as inputs to a multiply-accumulate. So
+ // if we set the extra pixel slot to 0, then it is effectively ignored.
+ const __m256i dgd_ijkl = _mm256_set1_epi32((int)D1);
+
+ acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_highbd_win5_opt_avx2(
+ const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M,
+ int64_t *H, aom_bit_depth_t bit_depth) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ const uint16_t avg =
+ find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ DECLARE_ALIGNED(
+ 32, int64_t,
+ H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } };
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t sumX = 0;
+ const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_highbd_win5_one_line_avx2(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int64, H_int64);
+ }
+ }
+
+ uint8_t bit_depth_divider = 1;
+ if (bit_depth == AOM_BITS_12)
+ bit_depth_divider = 16;
+ else if (bit_depth == AOM_BITS_10)
+ bit_depth_divider = 4;
+
+ const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = (M_int64[k][l] +
+ (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) /
+ bit_depth_divider;
+ int64_t *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] =
+ (H_int_[n * 8 + m] +
+ (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) /
+ bit_depth_divider;
+ }
+ }
+ }
+ }
+}
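+
+// The normalization above is the usual mean-removal identity (a sketch of the
+// algebra, using the names from the function above and writing N for
+// pixel_count, before the bit-depth scaling):
+//   sum((D - avg) * (X - avg))
+//     = sum(D * X) - avg * (sum(D) + sum(X)) + N * avg * avg
+//     = M_int64[k][l] - avg * (sumY[k][l] + sumX) + avg_square_sum
+// Each H entry is normalized analogously, with two dgd taps in place of
+// (D, X).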
+
+void av1_compute_stats_highbd_avx2(int wiener_win, const uint8_t *dgd8,
+ const uint8_t *src8, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ aom_bit_depth_t bit_depth) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_highbd_win7_opt_avx2(dgd8, src8, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H,
+ bit_depth);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_highbd_win5_opt_avx2(dgd8, src8, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H,
+ bit_depth);
+ } else {
+ av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H, bit_depth);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE void madd_and_accum_avx2(__m256i src, __m256i dgd, __m256i *sum) {
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(src, dgd));
+}
+
+static INLINE __m256i convert_and_add_avx2(__m256i src) {
+ const __m256i s0 = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(src));
+ const __m256i s1 = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(src, 1));
+ return _mm256_add_epi64(s0, s1);
+}
+
+static INLINE __m256i hadd_four_32_to_64_avx2(__m256i src0, __m256i src1,
+ __m256i *src2, __m256i *src3) {
+ // 00 01 10 11 02 03 12 13
+ const __m256i s_0 = _mm256_hadd_epi32(src0, src1);
+ // 20 21 30 31 22 23 32 33
+ const __m256i s_1 = _mm256_hadd_epi32(*src2, *src3);
+ // 00+01 10+11 20+21 30+31 02+03 12+13 22+23 32+33
+ const __m256i s_2 = _mm256_hadd_epi32(s_0, s_1);
+ return convert_and_add_avx2(s_2);
+}
+
+static INLINE __m128i add_64bit_lvl_avx2(__m256i src0, __m256i src1) {
+ // 00 10 02 12
+ const __m256i t0 = _mm256_unpacklo_epi64(src0, src1);
+ // 01 11 03 13
+ const __m256i t1 = _mm256_unpackhi_epi64(src0, src1);
+ // 00+01 10+11 02+03 12+13
+ const __m256i sum = _mm256_add_epi64(t0, t1);
+ // 00+01 10+11
+ const __m128i sum0 = _mm256_castsi256_si128(sum);
+ // 02+03 12+13
+ const __m128i sum1 = _mm256_extracti128_si256(sum, 1);
+ // 00+01+02+03 10+11+12+13
+ return _mm_add_epi64(sum0, sum1);
+}
+
+static INLINE __m128i convert_32_to_64_add_avx2(__m256i src0, __m256i src1) {
+ // 00 01 02 03
+ const __m256i s0 = convert_and_add_avx2(src0);
+ // 10 11 12 13
+ const __m256i s1 = convert_and_add_avx2(src1);
+ return add_64bit_lvl_avx2(s0, s1);
+}
+
+static INLINE int32_t calc_sum_of_register(__m256i src) {
+ const __m128i src_l = _mm256_castsi256_si128(src);
+ const __m128i src_h = _mm256_extracti128_si256(src, 1);
+ const __m128i sum = _mm_add_epi32(src_l, src_h);
+ const __m128i dst0 = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ const __m128i dst1 = _mm_add_epi32(dst0, _mm_srli_si128(dst0, 4));
+ return _mm_cvtsi128_si32(dst1);
+}
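+
+// An illustrative trace (not library code): with eight 32-bit lanes
+// [a b c d | e f g h], adding the two 128-bit halves gives [a+e b+f c+g d+h];
+// the 8-byte shift-add folds this to [a+e+c+g b+f+d+h . .]; and the final
+// 4-byte shift-add leaves the full sum a+b+c+d+e+f+g+h in lane 0, which is
+// what _mm_cvtsi128_si32() extracts.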
+
+static INLINE void transpose_64bit_4x4_avx2(const __m256i *const src,
+ __m256i *const dst) {
+ // Unpack 64 bit elements. Goes from:
+ // src[0]: 00 01 02 03
+ // src[1]: 10 11 12 13
+ // src[2]: 20 21 22 23
+ // src[3]: 30 31 32 33
+ // to:
+ // reg0: 00 10 02 12
+ // reg1: 20 30 22 32
+ // reg2: 01 11 03 13
+ // reg3: 21 31 23 33
+ const __m256i reg0 = _mm256_unpacklo_epi64(src[0], src[1]);
+ const __m256i reg1 = _mm256_unpacklo_epi64(src[2], src[3]);
+ const __m256i reg2 = _mm256_unpackhi_epi64(src[0], src[1]);
+ const __m256i reg3 = _mm256_unpackhi_epi64(src[2], src[3]);
+
+ // Unpack 64 bit elements resulting in:
+ // dst[0]: 00 10 20 30
+ // dst[1]: 01 11 21 31
+ // dst[2]: 02 12 22 32
+ // dst[3]: 03 13 23 33
+ dst[0] = _mm256_inserti128_si256(reg0, _mm256_castsi256_si128(reg1), 1);
+ dst[1] = _mm256_inserti128_si256(reg2, _mm256_castsi256_si128(reg3), 1);
+ dst[2] = _mm256_inserti128_si256(reg1, _mm256_extracti128_si256(reg0, 1), 0);
+ dst[3] = _mm256_inserti128_si256(reg3, _mm256_extracti128_si256(reg2, 1), 0);
+}
+
+// When we load 32 values of int8_t type but need fewer than 32 values for
+// processing, the below mask is used to zero out the extra values.
+static const int8_t mask_8bit[32] = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 16 bytes
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16 bytes
+};
+
+// When we load 16 values of int16_t type but need fewer than 16 values for
+// processing, the below mask is used to zero out the extra values.
+static const int16_t mask_16bit[32] = {
+  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  // 16 elements
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,   // 16 elements
+};
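+
+// A worked example (illustrative): for width = 40 in calc_dgd_buf_avg_avx2()
+// below, wd_beyond_mul32 = 8, so mask_low is loaded from &mask_8bit[16 - 8]
+// and picks up 8 bytes of -1 followed by 8 bytes of 0 -- exactly the 8 valid
+// lanes enabled and the extra lanes zeroed.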
+
+static INLINE uint8_t calc_dgd_buf_avg_avx2(const uint8_t *src, int32_t h_start,
+ int32_t h_end, int32_t v_start,
+ int32_t v_end, int32_t stride) {
+ const uint8_t *src_temp = src + v_start * stride + h_start;
+ const __m256i zero = _mm256_setzero_si256();
+ const int32_t width = h_end - h_start;
+ const int32_t height = v_end - v_start;
+ const int32_t wd_beyond_mul32 = width & 31;
+ const int32_t wd_mul32 = width - wd_beyond_mul32;
+ __m128i mask_low, mask_high;
+ __m256i ss = zero;
+
+  // When the width is not a multiple of 32, the loads still fetch 32 bytes;
+  // the mask below zeroes out the extra (beyond-required) data.
+ if (wd_beyond_mul32 >= 16) {
+ mask_low = _mm_set1_epi8(-1);
+ mask_high = _mm_loadu_si128((__m128i *)(&mask_8bit[32 - wd_beyond_mul32]));
+ } else {
+ mask_low = _mm_loadu_si128((__m128i *)(&mask_8bit[16 - wd_beyond_mul32]));
+ mask_high = _mm_setzero_si128();
+ }
+ const __m256i mask =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(mask_low), mask_high, 1);
+
+ int32_t proc_ht = 0;
+ do {
+ // Process width in multiple of 32.
+ int32_t proc_wd = 0;
+ while (proc_wd < wd_mul32) {
+ const __m256i s_0 = _mm256_loadu_si256((__m256i *)(src_temp + proc_wd));
+ const __m256i sad_0 = _mm256_sad_epu8(s_0, zero);
+ ss = _mm256_add_epi32(ss, sad_0);
+ proc_wd += 32;
+ }
+
+ // Process the remaining width.
+ if (wd_beyond_mul32) {
+ const __m256i s_0 = _mm256_loadu_si256((__m256i *)(src_temp + proc_wd));
+ const __m256i s_m_0 = _mm256_and_si256(s_0, mask);
+ const __m256i sad_0 = _mm256_sad_epu8(s_m_0, zero);
+ ss = _mm256_add_epi32(ss, sad_0);
+ }
+ src_temp += stride;
+ proc_ht++;
+ } while (proc_ht < height);
+
+ const uint32_t sum = calc_sum_of_register(ss);
+ const uint8_t avg = sum / (width * height);
+ return avg;
+}
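+
+#if 0
+// Scalar reference for calc_dgd_buf_avg_avx2() above (an illustrative sketch,
+// deliberately disabled; it is not part of the library). The SIMD path
+// computes the same plain mean of the window, via _mm256_sad_epu8 partial
+// sums.
+static uint8_t calc_dgd_buf_avg_c_sketch(const uint8_t *src, int32_t h_start,
+                                         int32_t h_end, int32_t v_start,
+                                         int32_t v_end, int32_t stride) {
+  uint32_t sum = 0;
+  for (int32_t r = v_start; r < v_end; ++r)
+    for (int32_t c = h_start; c < h_end; ++c) sum += src[r * stride + c];
+  return (uint8_t)(sum / (uint32_t)((h_end - h_start) * (v_end - v_start)));
+}
+#endif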
+
+// Fill the (src-avg) or (dgd-avg) buffer. Note that when n = (width % 16) is
+// not 0, this writes (16 - n) more values than required.
+static INLINE void sub_avg_block_avx2(const uint8_t *src, int32_t src_stride,
+ uint8_t avg, int32_t width,
+ int32_t height, int16_t *dst,
+ int32_t dst_stride,
+ int use_downsampled_wiener_stats) {
+ const __m256i avg_reg = _mm256_set1_epi16(avg);
+
+ int32_t proc_ht = 0;
+ do {
+ int ds_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ if (use_downsampled_wiener_stats &&
+ (height - proc_ht < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+ ds_factor = height - proc_ht;
+ }
+
+ int32_t proc_wd = 0;
+ while (proc_wd < width) {
+ const __m128i s = _mm_loadu_si128((__m128i *)(src + proc_wd));
+ const __m256i ss = _mm256_cvtepu8_epi16(s);
+ const __m256i d = _mm256_sub_epi16(ss, avg_reg);
+ _mm256_storeu_si256((__m256i *)(dst + proc_wd), d);
+ proc_wd += 16;
+ }
+
+ src += ds_factor * src_stride;
+ dst += ds_factor * dst_stride;
+ proc_ht += ds_factor;
+ } while (proc_ht < height);
+}
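+
+#if 0
+// Scalar reference for sub_avg_block_avx2() above (an illustrative sketch,
+// deliberately disabled). With downsampling enabled, only one row in every
+// WIENER_STATS_DOWNSAMPLE_FACTOR rows is visited; the rows in between are
+// left untouched and are skipped again by the stats loops that follow.
+static void sub_avg_block_c_sketch(const uint8_t *src, int32_t src_stride,
+                                   uint8_t avg, int32_t width, int32_t height,
+                                   int16_t *dst, int32_t dst_stride,
+                                   int use_downsampled_wiener_stats) {
+  const int step =
+      use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+  for (int32_t r = 0; r < height; r += step)
+    for (int32_t c = 0; c < width; ++c)
+      dst[r * dst_stride + c] = (int16_t)(src[r * src_stride + c] - avg);
+}
+#endif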
+
+// Fills the lower-triangular elements of the H buffer from the
+// upper-triangular elements of the same buffer.
+static INLINE void fill_lower_triag_elements_avx2(const int32_t wiener_win2,
+ int64_t *const H) {
+ for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
+ __m256i in[4], out[4];
+
+ in[0] = _mm256_loadu_si256((__m256i *)(H + (i + 0) * wiener_win2 + i + 1));
+ in[1] = _mm256_loadu_si256((__m256i *)(H + (i + 1) * wiener_win2 + i + 1));
+ in[2] = _mm256_loadu_si256((__m256i *)(H + (i + 2) * wiener_win2 + i + 1));
+ in[3] = _mm256_loadu_si256((__m256i *)(H + (i + 3) * wiener_win2 + i + 1));
+
+ transpose_64bit_4x4_avx2(in, out);
+
+ _mm_storel_epi64((__m128i *)(H + (i + 1) * wiener_win2 + i),
+ _mm256_castsi256_si128(out[0]));
+ _mm_storeu_si128((__m128i *)(H + (i + 2) * wiener_win2 + i),
+ _mm256_castsi256_si128(out[1]));
+ _mm256_storeu_si256((__m256i *)(H + (i + 3) * wiener_win2 + i), out[2]);
+ _mm256_storeu_si256((__m256i *)(H + (i + 4) * wiener_win2 + i), out[3]);
+
+ for (int32_t j = i + 5; j < wiener_win2; j += 4) {
+ in[0] = _mm256_loadu_si256((__m256i *)(H + (i + 0) * wiener_win2 + j));
+ in[1] = _mm256_loadu_si256((__m256i *)(H + (i + 1) * wiener_win2 + j));
+ in[2] = _mm256_loadu_si256((__m256i *)(H + (i + 2) * wiener_win2 + j));
+ in[3] = _mm256_loadu_si256((__m256i *)(H + (i + 3) * wiener_win2 + j));
+
+ transpose_64bit_4x4_avx2(in, out);
+
+ _mm256_storeu_si256((__m256i *)(H + (j + 0) * wiener_win2 + i), out[0]);
+ _mm256_storeu_si256((__m256i *)(H + (j + 1) * wiener_win2 + i), out[1]);
+ _mm256_storeu_si256((__m256i *)(H + (j + 2) * wiener_win2 + i), out[2]);
+ _mm256_storeu_si256((__m256i *)(H + (j + 3) * wiener_win2 + i), out[3]);
+ }
+ }
+}
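+
+#if 0
+// Scalar reference for fill_lower_triag_elements_avx2() above (an
+// illustrative sketch, deliberately disabled): mirror the upper triangle of
+// the wiener_win2 x wiener_win2 matrix H across its diagonal.
+static void fill_lower_triag_c_sketch(int32_t wiener_win2, int64_t *H) {
+  for (int32_t i = 0; i < wiener_win2; ++i)
+    for (int32_t j = i + 1; j < wiener_win2; ++j)
+      H[j * wiener_win2 + i] = H[i * wiener_win2 + j];
+}
+#endif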
+
+// Fill H buffer based on loop_count.
+#define INIT_H_VALUES(d, loop_count) \
+ for (int g = 0; g < (loop_count); g++) { \
+ const __m256i dgd0 = \
+ _mm256_loadu_si256((__m256i *)((d) + (g * d_stride))); \
+ madd_and_accum_avx2(dgd_mul_df, dgd0, &sum_h[g]); \
+ }
+
+// Fill M & H buffer.
+#define INIT_MH_VALUES(d) \
+ for (int g = 0; g < wiener_win; g++) { \
+ const __m256i dgds_0 = \
+ _mm256_loadu_si256((__m256i *)((d) + (g * d_stride))); \
+ madd_and_accum_avx2(src_mul_df, dgds_0, &sum_m[g]); \
+ madd_and_accum_avx2(dgd_mul_df, dgds_0, &sum_h[g]); \
+ }
+
+// Derive the starting 'j' index, set up the dgd pointers and the initial
+// downsample factor, and zero the sum_h accumulators.
+#define INITIALIZATION(wiener_window_sz) \
+ j = i / (wiener_window_sz); \
+ const int16_t *d_window = d + j; \
+ const int16_t *d_current_row = \
+ d + j + ((i % (wiener_window_sz)) * d_stride); \
+ int proc_ht = v_start; \
+ downsample_factor = \
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \
+ __m256i sum_h[wiener_window_sz]; \
+ memset(sum_h, 0, sizeof(sum_h));
+
+// Update the downsample factor appropriately.
+#define UPDATE_DOWNSAMPLE_FACTOR \
+ int proc_wd = 0; \
+ if (use_downsampled_wiener_stats && \
+ ((v_end - proc_ht) < WIENER_STATS_DOWNSAMPLE_FACTOR)) { \
+ downsample_factor = v_end - proc_ht; \
+ } \
+ const __m256i df_reg = _mm256_set1_epi16(downsample_factor);
+
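+// Fill the remaining 'j' blocks (one block of wiener_win H values per
+// iteration) of row 'i' of the H buffer, for the 5-tap window.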
+#define CALCULATE_REMAINING_H_WIN5 \
+ while (j < wiener_win) { \
+ d_window = d; \
+ d_current_row = d + (i / wiener_win) + ((i % wiener_win) * d_stride); \
+ const __m256i zero = _mm256_setzero_si256(); \
+ sum_h[0] = zero; \
+ sum_h[1] = zero; \
+ sum_h[2] = zero; \
+ sum_h[3] = zero; \
+ sum_h[4] = zero; \
+ \
+ proc_ht = v_start; \
+ downsample_factor = \
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \
+ do { \
+ UPDATE_DOWNSAMPLE_FACTOR; \
+ \
+ /* Process the amount of width multiple of 16.*/ \
+ while (proc_wd < wd_mul16) { \
+ const __m256i dgd = \
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); \
+ INIT_H_VALUES(d_window + j + proc_wd, 5) \
+ \
+ proc_wd += 16; \
+ }; \
+ \
+ /* Process the remaining width here. */ \
+ if (wd_beyond_mul16) { \
+ const __m256i dgd = \
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask); \
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); \
+ INIT_H_VALUES(d_window + j + proc_wd, 5) \
+ } \
+ proc_ht += downsample_factor; \
+ d_window += downsample_factor * d_stride; \
+ d_current_row += downsample_factor * d_stride; \
+ } while (proc_ht < v_end); \
+ const __m256i s_h0 = \
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); \
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), \
+ s_h0); \
+ const __m256i s_m_h = convert_and_add_avx2(sum_h[4]); \
+ const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h); \
+ _mm_storel_epi64( \
+ (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_m_h0); \
+ j++; \
+ }
+
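+// As above, but fills the remaining 'j' blocks of row 'i' of the H buffer for
+// the 7-tap window.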
+#define CALCULATE_REMAINING_H_WIN7 \
+ while (j < wiener_win) { \
+ d_window = d; \
+ d_current_row = d + (i / wiener_win) + ((i % wiener_win) * d_stride); \
+ const __m256i zero = _mm256_setzero_si256(); \
+ sum_h[0] = zero; \
+ sum_h[1] = zero; \
+ sum_h[2] = zero; \
+ sum_h[3] = zero; \
+ sum_h[4] = zero; \
+ sum_h[5] = zero; \
+ sum_h[6] = zero; \
+ \
+ proc_ht = v_start; \
+ downsample_factor = \
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \
+ do { \
+ UPDATE_DOWNSAMPLE_FACTOR; \
+ \
+ /* Process the amount of width multiple of 16.*/ \
+ while (proc_wd < wd_mul16) { \
+ const __m256i dgd = \
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg); \
+ INIT_H_VALUES(d_window + j + proc_wd, 7) \
+ \
+ proc_wd += 16; \
+ }; \
+ \
+ /* Process the remaining width here. */ \
+ if (wd_beyond_mul16) { \
+ const __m256i dgd = \
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)); \
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask); \
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg); \
+ INIT_H_VALUES(d_window + j + proc_wd, 7) \
+ } \
+ proc_ht += downsample_factor; \
+ d_window += downsample_factor * d_stride; \
+ d_current_row += downsample_factor * d_stride; \
+ } while (proc_ht < v_end); \
+ const __m256i s_h1 = \
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]); \
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)), \
+ s_h1); \
+ const __m256i s_h2 = \
+ hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]); \
+ _mm256_storeu_si256( \
+ (__m256i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_h2); \
+ j++; \
+ }
+
+// The buffers H (auto-covariance) and M (cross-correlation) are used to
+// estimate the filter tap values required for wiener filtering. Here, the
+// buffer H is of size ((wiener_window_size^2) * (wiener_window_size^2)) and M
+// is of size (wiener_window_size * wiener_window_size). H is a symmetric
+// matrix where the values above the diagonal (upper triangle) are equal to
+// the values below the diagonal (lower triangle). The calculation of the
+// elements/stats of H (upper triangle) and M is done in steps as described
+// below, where each step fills specific values of H and M.
+// Once the upper-triangular elements of the H matrix are derived, they are
+// copied to the lower triangle using the function
+// fill_lower_triag_elements_avx2().
+// Example: Wiener window size = WIENER_WIN_CHROMA (5)
+// M buffer = [M0 M1 M2 ---- M23 M24]
+// H buffer = Hxy (x-row, y-column)
+// [H00 H01 H02 ---- H023 H024]
+// [H10 H11 H12 ---- H123 H124]
+// [H20 H21 H22 ---- H223 H224]
+// [H30 H31 H32 ---- H323 H324]
+// [H40 H41 H42 ---- H423 H424]
+// [H50 H51 H52 ---- H523 H524]
+// [H60 H61 H62 ---- H623 H624]
+// ||
+// ||
+// [H230 H231 H232 ---- H2323 H2324]
+// [H240 H241 H242 ---- H2423 H2424]
+// In Step 1, the whole M buffer (i.e., M0 to M24) and the first row of H
+// (i.e., H00 to H024) are filled. The remaining rows of the H buffer are
+// filled through steps 2 to 6.
+static void compute_stats_win5_avx2(const int16_t *const d, int32_t d_stride,
+ const int16_t *const s, int32_t s_stride,
+ int32_t width, int v_start, int v_end,
+ int64_t *const M, int64_t *const H,
+ int use_downsampled_wiener_stats) {
+ const int32_t wiener_win = WIENER_WIN_CHROMA;
+ const int32_t wiener_win2 = wiener_win * wiener_win;
+  // Amount of width which is beyond a multiple of 16. This remainder is
+  // handled separately so that only the required width is processed at the
+  // end of each row.
+ const int32_t wd_mul16 = width & ~15;
+ const int32_t wd_beyond_mul16 = width - wd_mul16;
+ const __m256i mask =
+ _mm256_loadu_si256((__m256i *)(&mask_16bit[16 - wd_beyond_mul16]));
+ int downsample_factor;
+
+  // Step 1: The full M (i.e., M0 to M24) and the first row of H (i.e., H00 to
+  // H024) are filled here. The loop over 'j' is executed for values 0 to 4
+  // (wiener_win - 1). Each iteration of 'j' fills 5 values of M and H as shown
+  // below:
+  // j=0: M0-M4 and H00-H04 are filled, j=1: M5-M9 and H05-H09 are filled, etc.
+ int j = 0;
+ do {
+ const int16_t *s_t = s;
+ const int16_t *d_t = d;
+ __m256i sum_m[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() };
+ __m256i sum_h[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() };
+ downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ int proc_ht = v_start;
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd));
+ const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd));
+ const __m256i src_mul_df = _mm256_mullo_epi16(src, df_reg);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_MH_VALUES(d_t + j + proc_wd)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd));
+ const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd));
+ const __m256i src_mask = _mm256_and_si256(src, mask);
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i src_mul_df = _mm256_mullo_epi16(src_mask, df_reg);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_MH_VALUES(d_t + j + proc_wd)
+ }
+ proc_ht += downsample_factor;
+ s_t += downsample_factor * s_stride;
+ d_t += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+
+ const __m256i s_m =
+ hadd_four_32_to_64_avx2(sum_m[0], sum_m[1], &sum_m[2], &sum_m[3]);
+ const __m128i s_m_h = convert_32_to_64_add_avx2(sum_m[4], sum_h[4]);
+ _mm256_storeu_si256((__m256i *)(M + wiener_win * j), s_m);
+ _mm_storel_epi64((__m128i *)&M[wiener_win * j + 4], s_m_h);
+
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + wiener_win * j), s_h);
+ _mm_storeh_epi64((__m128i *)&H[wiener_win * j + 4], s_m_h);
+ } while (++j < wiener_win);
+
+  // The below steps fill the remaining rows of the H buffer. The aim is to
+  // fill only the upper-triangle elements corresponding to each row; the
+  // lower-triangle elements are copied from the upper-triangle elements.
+  // Also, as mentioned in Step 1, the core function is designed to fill 5
+  // elements/stats/values of the H buffer per call.
+ //
+  // Step 2: Here, rows 1, 6, 11, 16 and 21 are filled. As only upper-triangle
+  // elements are needed, H10 from row1, H60-H65 from row6, etc. need not be
+  // filled. As the core function processes 5 values, the first iteration of
+  // 'j' fills only 4 values, i.e., H11-H14 from row1, H66-H69 from row6, etc.
+ for (int i = 1; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN_CHROMA)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 4)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 4)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN5
+ }
+
+  // Step 3: Here, rows 2, 7, 12, 17 and 22 are filled. As only upper-triangle
+  // elements are needed, H20-H21 from row2, H70-H76 from row7, etc. need not
+  // be filled. As the core function processes 5 values, the first iteration of
+  // 'j' fills only 3 values, i.e., H22-H24 from row2, H77-H79 from row7, etc.
+ for (int i = 2; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN_CHROMA)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 3)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 3)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN5
+ }
+
+  // Step 4: Here, rows 3, 8, 13, 18 and 23 are filled. As only upper-triangle
+  // elements are needed, H30-H32 from row3, H80-H87 from row8, etc. need not
+  // be filled. As the core function processes 5 values, the first iteration of
+  // 'j' fills only 2 values, i.e., H33-H34 from row3, H88-H89 from row8, etc.
+ for (int i = 3; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN_CHROMA)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 2)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 2)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m128i s_h = convert_32_to_64_add_avx2(sum_h[0], sum_h[1]);
+ _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN5
+ }
+
+  // Step 5: Here, rows 4, 9, 14, 19 and 24 are filled. As only upper-triangle
+  // elements are needed, H40-H43 from row4, H90-H98 from row9, etc. need not
+  // be filled. As the core function processes 5 values, the first iteration of
+  // 'j' fills only 1 value, i.e., H44 from row4, H99 from row9, etc.
+ for (int i = 4; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN_CHROMA)
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 1)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 1)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m128i s_h = convert_32_to_64_add_avx2(sum_h[0], sum_h[1]);
+ _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN5
+ }
+
+  // Step 6: Here, rows 5, 10, 15 and 20 are filled. As only upper-triangle
+  // elements are needed, H50-H54 from row5, H100-H109 from row10, etc. need
+  // not be filled. The first iteration of 'j' fills H55-H59 from row5,
+  // H1010-H1014 from row10, etc.
+ for (int i = 5; i < wiener_win2; i += wiener_win) {
+ // Derive j'th iteration from where the H buffer filling needs to be
+ // started.
+ j = i / wiener_win;
+ int shift = 0;
+ do {
+ // Update the dgd pointers appropriately.
+ int proc_ht = v_start;
+ const int16_t *d_window = d + (i / wiener_win);
+ const int16_t *d_current_row =
+ d + (i / wiener_win) + ((i % wiener_win) * d_stride);
+ downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ __m256i sum_h[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() };
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + shift + proc_wd, 5)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + shift + proc_wd, 5)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)),
+ s_h);
+ const __m256i s_m_h = convert_and_add_avx2(sum_h[4]);
+ const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h);
+ _mm_storel_epi64(
+ (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_m_h0);
+ shift++;
+ } while (++j < wiener_win);
+ }
+
+ fill_lower_triag_elements_avx2(wiener_win2, H);
+}
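+
+#if 0
+// Scalar reference (an illustrative sketch, deliberately disabled; the exact
+// tap ordering is simplified relative to the library layout) of the
+// statistics the steps above accumulate. With the mean-removed buffers
+// d = (dgd - avg) and s = (src - avg) prepared by sub_avg_block_avx2(), every
+// M entry is a cross-correlation of one window tap against the source and
+// every H entry is an auto-correlation of two window taps, which is why H is
+// symmetric and only its upper triangle is computed directly. M and H are
+// assumed to arrive zero-initialized.
+static void compute_stats_c_sketch(const int16_t *d, int32_t d_stride,
+                                   const int16_t *s, int32_t s_stride,
+                                   int32_t width, int32_t height,
+                                   int32_t wiener_win, int64_t *M,
+                                   int64_t *H) {
+  const int32_t wiener_win2 = wiener_win * wiener_win;
+  for (int32_t y = 0; y < height; ++y) {
+    for (int32_t x = 0; x < width; ++x) {
+      for (int32_t a = 0; a < wiener_win2; ++a) {
+        const int16_t da =
+            d[(y + a / wiener_win) * d_stride + x + a % wiener_win];
+        M[a] += (int64_t)da * s[y * s_stride + x];
+        for (int32_t b = 0; b < wiener_win2; ++b) {
+          const int16_t db =
+              d[(y + b / wiener_win) * d_stride + x + b % wiener_win];
+          H[a * wiener_win2 + b] += (int64_t)da * db;
+        }
+      }
+    }
+  }
+}
+#endif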
+
+// The buffers H (auto-covariance) and M (cross-correlation) are used to
+// estimate the filter tap values required for wiener filtering. Here, the
+// buffer H is of size ((wiener_window_size^2) * (wiener_window_size^2)) and M
+// is of size (wiener_window_size * wiener_window_size). H is a symmetric
+// matrix where the values above the diagonal (upper triangle) are equal to
+// the values below the diagonal (lower triangle). The calculation of the
+// elements/stats of H (upper triangle) and M is done in steps as described
+// below, where each step fills specific values of H and M.
+// Example:
+// Wiener window size = WIENER_WIN (7)
+// M buffer = [M0 M1 M2 ---- M47 M48]
+// H buffer = Hxy (x-row, y-column)
+// [H00 H01 H02 ---- H047 H048]
+// [H10 H11 H12 ---- H147 H148]
+// [H20 H21 H22 ---- H247 H248]
+// [H30 H31 H32 ---- H347 H348]
+// [H40 H41 H42 ---- H447 H448]
+// [H50 H51 H52 ---- H547 H548]
+// [H60 H61 H62 ---- H647 H648]
+// ||
+// ||
+// [H470 H471 H472 ---- H4747 H4748]
+// [H480 H481 H482 ---- H4847 H4848]
+// In Step 1, the whole M buffer (i.e., M0 to M48) and the first row of H
+// (i.e., H00 to H048) are filled. The remaining rows of the H buffer are
+// filled through steps 2 to 8.
+static void compute_stats_win7_avx2(const int16_t *const d, int32_t d_stride,
+ const int16_t *const s, int32_t s_stride,
+ int32_t width, int v_start, int v_end,
+ int64_t *const M, int64_t *const H,
+ int use_downsampled_wiener_stats) {
+ const int32_t wiener_win = WIENER_WIN;
+ const int32_t wiener_win2 = wiener_win * wiener_win;
+  // Amount of width which is beyond a multiple of 16. This remainder is
+  // handled separately so that only the required width is processed at the
+  // end of each row.
+ const int32_t wd_mul16 = width & ~15;
+ const int32_t wd_beyond_mul16 = width - wd_mul16;
+ const __m256i mask =
+ _mm256_loadu_si256((__m256i *)(&mask_16bit[16 - wd_beyond_mul16]));
+ int downsample_factor;
+
+  // Step 1: The full M (i.e., M0 to M48) and the first row of H (i.e., H00 to
+  // H048) are filled here. The loop over 'j' is executed for values 0 to 6
+  // (wiener_win - 1). Each iteration of 'j' fills 7 values of M and H as shown
+  // below:
+  // j=0: M0-M6 and H00-H06 are filled, j=1: M7-M13 and H07-H013 are filled,
+  // etc.
+ int j = 0;
+ do {
+ const int16_t *s_t = s;
+ const int16_t *d_t = d;
+ __m256i sum_m[WIENER_WIN] = { _mm256_setzero_si256() };
+ __m256i sum_h[WIENER_WIN] = { _mm256_setzero_si256() };
+ downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ int proc_ht = v_start;
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd));
+ const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd));
+ const __m256i src_mul_df = _mm256_mullo_epi16(src, df_reg);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_MH_VALUES(d_t + j + proc_wd)
+
+ proc_wd += 16;
+ }
+
+ if (wd_beyond_mul16) {
+ const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd));
+ const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd));
+ const __m256i src_mask = _mm256_and_si256(src, mask);
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i src_mul_df = _mm256_mullo_epi16(src_mask, df_reg);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_MH_VALUES(d_t + j + proc_wd)
+ }
+ proc_ht += downsample_factor;
+ s_t += downsample_factor * s_stride;
+ d_t += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+
+ const __m256i s_m0 =
+ hadd_four_32_to_64_avx2(sum_m[0], sum_m[1], &sum_m[2], &sum_m[3]);
+ const __m256i s_m1 =
+ hadd_four_32_to_64_avx2(sum_m[4], sum_m[5], &sum_m[6], &sum_m[6]);
+ _mm256_storeu_si256((__m256i *)(M + wiener_win * j + 0), s_m0);
+ _mm_storeu_si128((__m128i *)(M + wiener_win * j + 4),
+ _mm256_castsi256_si128(s_m1));
+ _mm_storel_epi64((__m128i *)&M[wiener_win * j + 6],
+ _mm256_extracti128_si256(s_m1, 1));
+
+ const __m256i sh_0 =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ const __m256i sh_1 =
+ hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]);
+ _mm256_storeu_si256((__m256i *)(H + wiener_win * j + 0), sh_0);
+ _mm_storeu_si128((__m128i *)(H + wiener_win * j + 4),
+ _mm256_castsi256_si128(sh_1));
+ _mm_storel_epi64((__m128i *)&H[wiener_win * j + 6],
+ _mm256_extracti128_si256(sh_1, 1));
+ } while (++j < wiener_win);
+
+  // The below steps fill the remaining rows of the H buffer. The aim is to
+  // fill only the upper-triangle elements corresponding to each row; the
+  // lower-triangle elements are copied from the upper-triangle elements.
+  // Also, as mentioned in Step 1, the core function is designed to fill 7
+  // elements/stats/values of the H buffer per call.
+ //
+  // Step 2: Here, rows 1, 8, 15, 22, 29, 36 and 43 are filled. As only
+  // upper-triangle elements are needed, H10 from row1, H80-H87 from row8,
+  // etc. need not be filled. As the core function processes 7 values, the
+  // first iteration of 'j' fills only 6 values, i.e., H11-H16 from row1,
+  // H88-H813 from row8, etc.
+ for (int i = 1; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 6)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 6)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+ const __m128i s_h0 = convert_32_to_64_add_avx2(sum_h[4], sum_h[5]);
+ _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i + 4), s_h0);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+  // Step 3: Here, rows 2, 9, 16, 23, 30, 37 and 44 are filled. As only
+  // upper-triangle elements are needed, H20-H21 from row2, H90-H98 from row9,
+  // etc. need not be filled. As the core function processes 7 values, the
+  // first iteration of 'j' fills only 5 values, i.e., H22-H26 from row2,
+  // H99-H913 from row9, etc.
+ for (int i = 2; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 5)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 5)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+ const __m256i s_m_h = convert_and_add_avx2(sum_h[4]);
+ const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h);
+ _mm_storel_epi64((__m128i *)(H + (i * wiener_win2) + i + 4), s_m_h0);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+  // Step 4: Here, rows 3, 10, 17, 24, 31, 38 and 45 are filled. As only
+  // upper-triangle elements are needed, H30-H32 from row3, H100-H109 from
+  // row10, etc. need not be filled. As the core function processes 7 values,
+  // the first iteration of 'j' fills only 4 values, i.e., H33-H36 from row3,
+  // H1010-H1013 from row10, etc.
+ for (int i = 3; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 4)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 4)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+  // Step 5: Here, rows 4, 11, 18, 25, 32, 39 and 46 are filled. As only
+  // upper-triangle elements are needed, H40-H43 from row4, H110-H1110 from
+  // row11, etc. need not be filled. As the core function processes 7 values,
+  // the first iteration of 'j' fills only 3 values, i.e., H44-H46 from row4,
+  // H1111-H1113 from row11, etc.
+ for (int i = 4; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 3)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 3)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+  // Step 6: Here, rows 5, 12, 19, 26, 33, 40 and 47 are filled. As only
+  // upper-triangle elements are needed, H50-H54 from row5, H120-H1211 from
+  // row12, etc. need not be filled. As the core function processes 7 values,
+  // the first iteration of 'j' fills only 2 values, i.e., H55-H56 from row5,
+  // H1212-H1213 from row12, etc.
+ for (int i = 5; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (5 * d_stride), 2)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (5 * d_stride), 2)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+  // Step 7: Here, rows 6, 13, 20, 27, 34, 41 and 48 are filled. As only
+  // upper-triangle elements are needed, H60-H65 from row6, H130-H1312 from
+  // row13, etc. need not be filled. As the core function processes 7 values,
+  // the first iteration of 'j' fills only 1 value, i.e., H66 from row6, H1313
+  // from row13, etc.
+ for (int i = 6; i < wiener_win2; i += wiener_win) {
+ // Update the dgd pointers appropriately and also derive the 'j'th iteration
+ // from where the H buffer filling needs to be started.
+ INITIALIZATION(WIENER_WIN)
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (6 * d_stride), 1)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + proc_wd + (6 * d_stride), 1)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+ const __m256i s_h =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ xx_storel_64(&H[(i * wiener_win2) + i], _mm256_castsi256_si128(s_h));
+
+ // process the remaining 'j' iterations.
+ j++;
+ CALCULATE_REMAINING_H_WIN7
+ }
+
+  // Step 8: Here, rows 7, 14, 21, 28, 35 and 42 are filled. As only
+  // upper-triangle elements are needed, H70-H76 from row7, H140-H1413 from
+  // row14, etc. need not be filled. The first iteration of 'j' fills H77-H713
+  // from row7, H1414-H1420 from row14, etc.
+ for (int i = 7; i < wiener_win2; i += wiener_win) {
+ // Derive j'th iteration from where the H buffer filling needs to be
+ // started.
+ j = i / wiener_win;
+ int shift = 0;
+ do {
+ // Update the dgd pointers appropriately.
+ int proc_ht = v_start;
+ const int16_t *d_window = d + (i / WIENER_WIN);
+ const int16_t *d_current_row =
+ d + (i / WIENER_WIN) + ((i % WIENER_WIN) * d_stride);
+ downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ __m256i sum_h[WIENER_WIN] = { _mm256_setzero_si256() };
+ do {
+ UPDATE_DOWNSAMPLE_FACTOR
+
+ // Process the amount of width multiple of 16.
+ while (proc_wd < wd_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
+ INIT_H_VALUES(d_window + shift + proc_wd, 7)
+
+ proc_wd += 16;
+ }
+
+ // Process the remaining width here.
+ if (wd_beyond_mul16) {
+ const __m256i dgd =
+ _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
+ const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
+ const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
+ INIT_H_VALUES(d_window + shift + proc_wd, 7)
+ }
+ proc_ht += downsample_factor;
+ d_window += downsample_factor * d_stride;
+ d_current_row += downsample_factor * d_stride;
+ } while (proc_ht < v_end);
+
+ const __m256i sh_0 =
+ hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
+ const __m256i sh_1 =
+ hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]);
+ _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)),
+ sh_0);
+ _mm_storeu_si128(
+ (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4),
+ _mm256_castsi256_si128(sh_1));
+ _mm_storel_epi64((__m128i *)&H[(i * wiener_win2) + (wiener_win * j) + 6],
+ _mm256_extracti128_si256(sh_1, 1));
+ shift++;
+ } while (++j < wiener_win);
+ }
+
+ fill_lower_triag_elements_avx2(wiener_win2, H);
+}
+
+void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int16_t *dgd_avg,
+ int16_t *src_avg, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ if (wiener_win != WIENER_WIN && wiener_win != WIENER_WIN_CHROMA) {
+    // Currently, libaom supports Wiener filter processing only for the window
+    // sizes WIENER_WIN_CHROMA (5) and WIENER_WIN (7). There is no SIMD support
+    // for any other window size, so fall back to the C function.
+ av1_compute_stats_c(wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end,
+ v_start, v_end, dgd_stride, src_stride, M, H,
+ use_downsampled_wiener_stats);
+ return;
+ }
+
+ const int32_t wiener_halfwin = wiener_win >> 1;
+ const uint8_t avg =
+ calc_dgd_buf_avg_avx2(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+ const int32_t width = h_end - h_start;
+ const int32_t height = v_end - v_start;
+ const int32_t d_stride = (width + 2 * wiener_halfwin + 15) & ~15;
+ const int32_t s_stride = (width + 15) & ~15;
+
+  // Compute the (src-avg) buffer here, which is used to fill the M buffer.
+  // Based on the speed feature 'use_downsampled_wiener_stats', it is computed
+  // either for one row in every WIENER_STATS_DOWNSAMPLE_FACTOR rows or for
+  // every row.
+ sub_avg_block_avx2(src + v_start * src_stride + h_start, src_stride, avg,
+ width, height, src_avg, s_stride,
+ use_downsampled_wiener_stats);
+
+  // Compute the (dgd-avg) buffer here, which is used to fill the H buffer.
+ sub_avg_block_avx2(
+ dgd + (v_start - wiener_halfwin) * dgd_stride + h_start - wiener_halfwin,
+ dgd_stride, avg, width + 2 * wiener_halfwin, height + 2 * wiener_halfwin,
+ dgd_avg, d_stride, 0);
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_avx2(dgd_avg, d_stride, src_avg, s_stride, width,
+ v_start, v_end, M, H, use_downsampled_wiener_stats);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_win5_avx2(dgd_avg, d_stride, src_avg, s_stride, width,
+ v_start, v_end, M, H, use_downsampled_wiener_stats);
+ }
+}
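+
+// A worked example of the stride rounding above (illustrative): with
+// width = 100 and wiener_win = 7 (wiener_halfwin = 3),
+//   d_stride = (100 + 6 + 15) & ~15 = 112
+//   s_stride = (100 + 15) & ~15 = 112
+// i.e. both scratch buffers are padded to a multiple of 16 int16_t values so
+// that the 16-wide loops may safely over-read and over-write past the valid
+// width.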
+
+static INLINE __m256i pair_set_epi16(int a, int b) {
+ return _mm256_set1_epi32(
+ (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
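+
+// For example (illustrative): pair_set_epi16(xq[0], xq[1]) places xq[0] in
+// the low and xq[1] in the high 16 bits of every 32-bit lane, so that
+// _mm256_madd_epi16 against interleaved data (a0, b0, a1, b1, ...) produces
+// xq[0] * a + xq[1] * b per 32-bit lane -- the projection dot product used
+// below.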
+
+int64_t av1_lowbd_pixel_proj_error_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+ __m256i sum64 = _mm256_setzero_si256();
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i flt0_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt0 + j),
+ yy_loadu_256(flt0 + j + 8)),
+ 0xd8);
+ const __m256i flt1_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt1 + j),
+ yy_loadu_256(flt1 + j + 8)),
+ 0xd8);
+ const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
+ const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0);
+ const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0);
+ const __m256i v0 = _mm256_madd_epi16(
+ xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m256i v1 = _mm256_madd_epi16(
+ xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m256i vr0 =
+ _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+ const __m256i vr1 =
+ _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+ const __m256i e0 = _mm256_sub_epi16(
+ _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum64_0);
+ sum64 = _mm256_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[0] > 0 || params->r[1] > 0) {
+ const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
+ const __m256i xq_coeff =
+ pair_set_epi16(xq_active, -xq_active * (1 << SGRPROJ_RST_BITS));
+ const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+ const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i flt_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt + j),
+ yy_loadu_256(flt + j + 8)),
+ 0xd8);
+ const __m256i v0 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt_16b, d0));
+ const __m256i v1 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt_16b, d0));
+ const __m256i vr0 =
+ _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+ const __m256i vr1 =
+ _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+ const __m256i e0 = _mm256_sub_epi16(
+ _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq_active * (flt[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt += flt_stride;
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum64_0);
+ sum64 = _mm256_add_epi64(sum64, sum64_1);
+ }
+ } else {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i diff0 = _mm256_sub_epi16(d0, s0);
+ const __m256i err0 = _mm256_madd_epi16(diff0, diff0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64_0, sum64_1);
+ }
+ int64_t sum[4];
+ yy_storeu_256(sum, sum64);
+ err += sum[0] + sum[1] + sum[2] + sum[3];
+ return err;
+}
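+
+// In scalar form (a sketch that mirrors the remainder loops above), the
+// per-pixel error accumulated by av1_lowbd_pixel_proj_error_avx2() is
+//   u = dat[k] << SGRPROJ_RST_BITS
+//   v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u)  // terms drop if r == 0
+//   e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]
+// and the return value is the sum of e * e over the width x height block.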
+
+// When params->r[0] > 0 and params->r[1] > 0. In this case all elements of
+// C and H need to be computed.
+static AOM_INLINE void calc_proj_params_r0_r1_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ __m256i h00, h01, h11, c0, c1;
+ const __m256i zero = _mm256_setzero_si256();
+ h01 = h11 = c0 = c1 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i u_load = _mm256_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+ const __m256i s_load = _mm256_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+ __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+ __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+ __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm256_sub_epi32(s, d);
+ f1 = _mm256_sub_epi32(f1, d);
+ f2 = _mm256_sub_epi32(f2, d);
+
+ const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+ const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+ _mm256_srli_epi64(f1, 32));
+ h00 = _mm256_add_epi64(h00, h00_even);
+ h00 = _mm256_add_epi64(h00, h00_odd);
+
+ const __m256i h01_even = _mm256_mul_epi32(f1, f2);
+ const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+ _mm256_srli_epi64(f2, 32));
+ h01 = _mm256_add_epi64(h01, h01_even);
+ h01 = _mm256_add_epi64(h01, h01_odd);
+
+ const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+ const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+ _mm256_srli_epi64(f2, 32));
+ h11 = _mm256_add_epi64(h11, h11_even);
+ h11 = _mm256_add_epi64(h11, h11_odd);
+
+ const __m256i c0_even = _mm256_mul_epi32(f1, s);
+ const __m256i c0_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+ c0 = _mm256_add_epi64(c0, c0_even);
+ c0 = _mm256_add_epi64(c0, c0_odd);
+
+ const __m256i c1_even = _mm256_mul_epi32(f2, s);
+ const __m256i c1_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+ c1 = _mm256_add_epi64(c1, c1_even);
+ c1 = _mm256_add_epi64(c1, c1_odd);
+ }
+ }
+
+ __m256i c_low = _mm256_unpacklo_epi64(c0, c1);
+ const __m256i c_high = _mm256_unpackhi_epi64(c0, c1);
+ c_low = _mm256_add_epi64(c_low, c_high);
+ const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1),
+ _mm256_castsi256_si128(c_low));
+
+ __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01);
+ const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01);
+ h0x_low = _mm256_add_epi64(h0x_low, h0x_high);
+ const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1),
+ _mm256_castsi256_si128(h0x_low));
+
+ // Using the symmetric properties of H, calculations of H[1][0] are not
+ // needed.
+ __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11);
+ const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11);
+ h1x_low = _mm256_add_epi64(h1x_low, h1x_high);
+ const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1),
+ _mm256_castsi256_si128(h1x_low));
+
+ xx_storeu_128(C, c_128bit);
+ xx_storeu_128(H[0], h0x_128bit);
+ xx_storeu_128(H[1], h1x_128bit);
+
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+
+ // Since H is a symmetric matrix
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+}
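+
+#if 0
+// Scalar reference for calc_proj_params_r0_r1_avx2() above (an illustrative
+// sketch, deliberately disabled). With u = dat << SGRPROJ_RST_BITS,
+// s = (src << SGRPROJ_RST_BITS) - u and f0/f1 = flt0/flt1 - u, the function
+// builds the normal equations H[i][j] = sum(f_i * f_j) / size and
+// C[i] = sum(f_i * s) / size. H and C are assumed to arrive zero-initialized.
+static void calc_proj_params_r0_r1_c_sketch(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+  const int size = width * height;
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      const int32_t u = (int32_t)(dat8[i * dat_stride + j] << SGRPROJ_RST_BITS);
+      const int32_t s =
+          (int32_t)(src8[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+      const int32_t f0 = flt0[i * flt0_stride + j] - u;
+      const int32_t f1 = flt1[i * flt1_stride + j] - u;
+      H[0][0] += (int64_t)f0 * f0;
+      H[0][1] += (int64_t)f0 * f1;
+      H[1][1] += (int64_t)f1 * f1;
+      C[0] += (int64_t)f0 * s;
+      C[1] += (int64_t)f1 * s;
+    }
+  }
+  H[0][0] /= size;
+  H[0][1] /= size;
+  H[1][1] /= size;
+  H[1][0] = H[0][1];  // H is symmetric.
+  C[0] /= size;
+  C[1] /= size;
+}
+#endif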
+
+// When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r0_avx2(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8,
+ int dat_stride, int32_t *flt0,
+ int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ __m256i h00, c0;
+ const __m256i zero = _mm256_setzero_si256();
+ c0 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i u_load = _mm256_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+ const __m256i s_load = _mm256_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+ __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+ __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm256_sub_epi32(s, d);
+ f1 = _mm256_sub_epi32(f1, d);
+
+ const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+ const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+ _mm256_srli_epi64(f1, 32));
+ h00 = _mm256_add_epi64(h00, h00_even);
+ h00 = _mm256_add_epi64(h00, h00_odd);
+
+ const __m256i c0_even = _mm256_mul_epi32(f1, s);
+ const __m256i c0_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+ c0 = _mm256_add_epi64(c0, c0_even);
+ c0 = _mm256_add_epi64(c0, c0_odd);
+ }
+ }
+ const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1),
+ _mm256_castsi256_si128(h00));
+ const __m128i h00_val =
+ _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8));
+
+ const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1),
+ _mm256_castsi256_si128(c0));
+ const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero));
+ const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero));
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[0], h0x);
+
+ H[0][0] /= size;
+ C[0] /= size;
+}
+
+// Handles the case where only params->r[1] > 0, so only H[1][1] and C[1]
+// are non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r1_avx2(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8,
+ int dat_stride, int32_t *flt1,
+ int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ __m256i h11, c1;
+ const __m256i zero = _mm256_setzero_si256();
+ c1 = h11 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i u_load = _mm256_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+ const __m256i s_load = _mm256_cvtepu8_epi32(
+ _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+ __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+ __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm256_sub_epi32(s, d);
+ f2 = _mm256_sub_epi32(f2, d);
+
+ const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+ const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+ _mm256_srli_epi64(f2, 32));
+ h11 = _mm256_add_epi64(h11, h11_even);
+ h11 = _mm256_add_epi64(h11, h11_odd);
+
+ const __m256i c1_even = _mm256_mul_epi32(f2, s);
+ const __m256i c1_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+ c1 = _mm256_add_epi64(c1, c1_even);
+ c1 = _mm256_add_epi64(c1, c1_odd);
+ }
+ }
+
+ const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1),
+ _mm256_castsi256_si128(h11));
+ const __m128i h11_val =
+ _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8));
+
+ const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1),
+ _mm256_castsi256_si128(c1));
+ const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val);
+ const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val);
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[1], h1x);
+
+ H[1][1] /= size;
+ C[1] /= size;
+}
+
+// AVX2 variant of av1_calc_proj_params_c.
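+// It fills H and C for the least-squares system H * xq = C from which the
+// projection coefficients xq are solved.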
+void av1_calc_proj_params_avx2(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2],
+ int64_t C[2], const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ calc_proj_params_r0_r1_avx2(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ calc_proj_params_r0_avx2(src8, width, height, src_stride, dat8, dat_stride,
+ flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ calc_proj_params_r1_avx2(src8, width, height, src_stride, dat8, dat_stride,
+ flt1, flt1_stride, H, C);
+ }
+}
+
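+// High bit depth analogue of calc_proj_params_r0_r1_avx2; only the source
+// and degraded-frame loads differ (16-bit instead of 8-bit samples).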
+static AOM_INLINE void calc_proj_params_r0_r1_high_bd_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ __m256i h00, h01, h11, c0, c1;
+ const __m256i zero = _mm256_setzero_si256();
+ h01 = h11 = c0 = c1 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i u_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
+ const __m256i s_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(src + i * src_stride + j)));
+ __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+ __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+ __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm256_sub_epi32(s, d);
+ f1 = _mm256_sub_epi32(f1, d);
+ f2 = _mm256_sub_epi32(f2, d);
+
+ const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+ const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+ _mm256_srli_epi64(f1, 32));
+ h00 = _mm256_add_epi64(h00, h00_even);
+ h00 = _mm256_add_epi64(h00, h00_odd);
+
+ const __m256i h01_even = _mm256_mul_epi32(f1, f2);
+ const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+ _mm256_srli_epi64(f2, 32));
+ h01 = _mm256_add_epi64(h01, h01_even);
+ h01 = _mm256_add_epi64(h01, h01_odd);
+
+ const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+ const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+ _mm256_srli_epi64(f2, 32));
+ h11 = _mm256_add_epi64(h11, h11_even);
+ h11 = _mm256_add_epi64(h11, h11_odd);
+
+ const __m256i c0_even = _mm256_mul_epi32(f1, s);
+ const __m256i c0_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+ c0 = _mm256_add_epi64(c0, c0_even);
+ c0 = _mm256_add_epi64(c0, c0_odd);
+
+ const __m256i c1_even = _mm256_mul_epi32(f2, s);
+ const __m256i c1_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+ c1 = _mm256_add_epi64(c1, c1_even);
+ c1 = _mm256_add_epi64(c1, c1_odd);
+ }
+ }
+
+ __m256i c_low = _mm256_unpacklo_epi64(c0, c1);
+ const __m256i c_high = _mm256_unpackhi_epi64(c0, c1);
+ c_low = _mm256_add_epi64(c_low, c_high);
+ const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1),
+ _mm256_castsi256_si128(c_low));
+
+ __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01);
+ const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01);
+ h0x_low = _mm256_add_epi64(h0x_low, h0x_high);
+ const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1),
+ _mm256_castsi256_si128(h0x_low));
+
+  // Because H is symmetric, H[1][0] does not need to be computed here.
+ __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11);
+ const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11);
+ h1x_low = _mm256_add_epi64(h1x_low, h1x_high);
+ const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1),
+ _mm256_castsi256_si128(h1x_low));
+
+ xx_storeu_128(C, c_128bit);
+ xx_storeu_128(H[0], h0x_128bit);
+ xx_storeu_128(H[1], h1x_128bit);
+
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+
+ // Since H is a symmetric matrix
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r0_high_bd_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ __m256i h00, c0;
+ const __m256i zero = _mm256_setzero_si256();
+ c0 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i u_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
+ const __m256i s_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(src + i * src_stride + j)));
+ __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
+ __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm256_sub_epi32(s, d);
+ f1 = _mm256_sub_epi32(f1, d);
+
+ const __m256i h00_even = _mm256_mul_epi32(f1, f1);
+ const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
+ _mm256_srli_epi64(f1, 32));
+ h00 = _mm256_add_epi64(h00, h00_even);
+ h00 = _mm256_add_epi64(h00, h00_odd);
+
+ const __m256i c0_even = _mm256_mul_epi32(f1, s);
+ const __m256i c0_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
+ c0 = _mm256_add_epi64(c0, c0_even);
+ c0 = _mm256_add_epi64(c0, c0_odd);
+ }
+ }
+ const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1),
+ _mm256_castsi256_si128(h00));
+ const __m128i h00_val =
+ _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8));
+
+ const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1),
+ _mm256_castsi256_si128(c0));
+ const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero));
+ const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero));
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[0], h0x);
+
+ H[0][0] /= size;
+ C[0] /= size;
+}
+
+static AOM_INLINE void calc_proj_params_r1_high_bd_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ __m256i h11, c1;
+ const __m256i zero = _mm256_setzero_si256();
+ c1 = h11 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i u_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
+ const __m256i s_load = _mm256_cvtepu16_epi32(
+ _mm_load_si128((__m128i *)(src + i * src_stride + j)));
+ __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
+ __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm256_sub_epi32(s, d);
+ f2 = _mm256_sub_epi32(f2, d);
+
+ const __m256i h11_even = _mm256_mul_epi32(f2, f2);
+ const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
+ _mm256_srli_epi64(f2, 32));
+ h11 = _mm256_add_epi64(h11, h11_even);
+ h11 = _mm256_add_epi64(h11, h11_odd);
+
+ const __m256i c1_even = _mm256_mul_epi32(f2, s);
+ const __m256i c1_odd =
+ _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
+ c1 = _mm256_add_epi64(c1, c1_even);
+ c1 = _mm256_add_epi64(c1, c1_odd);
+ }
+ }
+
+ const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1),
+ _mm256_castsi256_si128(h11));
+ const __m128i h11_val =
+ _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8));
+
+ const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1),
+ _mm256_castsi256_si128(c1));
+ const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val);
+ const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val);
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[1], h1x);
+
+ H[1][1] /= size;
+ C[1] /= size;
+}
+
+// AVX2 variant of av1_calc_proj_params_high_bd_c.
+void av1_calc_proj_params_high_bd_avx2(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2],
+ const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ calc_proj_params_r0_r1_high_bd_avx2(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ calc_proj_params_r0_high_bd_avx2(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ calc_proj_params_r1_high_bd_avx2(src8, width, height, src_stride, dat8,
+ dat_stride, flt1, flt1_stride, H, C);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
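+// AVX2 variant of av1_highbd_pixel_proj_error_c.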
+int64_t av1_highbd_pixel_proj_error_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+ __m256i sum64 = _mm256_setzero_si256();
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled
+ const __m256i xq0 = _mm256_set1_epi32(xq[0]);
+ const __m256i xq1 = _mm256_set1_epi32(xq[1]);
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) { // Process 16 pixels at a time
+ // Load 16 pixels each from source image and corrupted image
+ const __m256i s0 = yy_loadu_256(src + j);
+ const __m256i d0 = yy_loadu_256(dat + j);
+ // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 (indices)
+
+ // Shift-up each pixel to match filtered image scaling
+ const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
+
+ // Split u0 into two halves and pad each from u16 to i32
+ const __m256i u0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(u0));
+ const __m256i u0h =
+ _mm256_cvtepu16_epi32(_mm256_extracti128_si256(u0, 1));
+ // u0h, u0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32
+
+ // Load 16 pixels from each filtered image
+ const __m256i flt0l = yy_loadu_256(flt0 + j);
+ const __m256i flt0h = yy_loadu_256(flt0 + j + 8);
+ const __m256i flt1l = yy_loadu_256(flt1 + j);
+ const __m256i flt1h = yy_loadu_256(flt1 + j + 8);
+ // flt?l, flt?h = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32
+
+ // Subtract shifted corrupt image from each filtered image
+ const __m256i flt0l_subu = _mm256_sub_epi32(flt0l, u0l);
+ const __m256i flt0h_subu = _mm256_sub_epi32(flt0h, u0h);
+ const __m256i flt1l_subu = _mm256_sub_epi32(flt1l, u0l);
+ const __m256i flt1h_subu = _mm256_sub_epi32(flt1h, u0h);
+
+ // Multiply basis vectors by appropriate coefficients
+ const __m256i v0l = _mm256_mullo_epi32(flt0l_subu, xq0);
+ const __m256i v0h = _mm256_mullo_epi32(flt0h_subu, xq0);
+ const __m256i v1l = _mm256_mullo_epi32(flt1l_subu, xq1);
+ const __m256i v1h = _mm256_mullo_epi32(flt1h_subu, xq1);
+
+ // Add together the contributions from the two basis vectors
+ const __m256i vl = _mm256_add_epi32(v0l, v1l);
+ const __m256i vh = _mm256_add_epi32(v0h, v1h);
+
+ // Right-shift v with appropriate rounding
+ const __m256i vrl =
+ _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift);
+ const __m256i vrh =
+ _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift);
+ // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0]
+
+ // Saturate each i32 to an i16 then combine both halves
+        // The permute (control=[3 1 2 0]) restores linear pixel order after
+        // the in-lane pack interleaves the two 128-bit halves
+ const __m256i vr =
+ _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8);
+ // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0]
+ // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0]
+
+ // Add twin-subspace-sgr-filter to corrupt image then subtract source
+ const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0);
+
+ // Calculate squared error and add adjacent values
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+
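+      // Widen this row's i32 error sums to i64 before adding them to the
+      // running total so the accumulator cannot overflow.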
+ const __m256i sum32l =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
+ sum64 = _mm256_add_epi64(sum64, sum32l);
+ const __m256i sum32h =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum32h);
+
+ // Process remaining pixels in this row (modulo 16)
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ }
+ } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled
+ const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1];
+ const __m256i xq_active = _mm256_set1_epi32(xq_on);
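+    // The shift of dat is folded into its coefficient:
+    // xq * (flt - (dat << SGRPROJ_RST_BITS)) is computed as
+    // xq * flt + (-xq << SGRPROJ_RST_BITS) * dat.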
+ const __m256i xq_inactive =
+ _mm256_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS));
+ const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+ const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ // Load 16 pixels from source image
+ const __m256i s0 = yy_loadu_256(src + j);
+ // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16
+
+ // Load 16 pixels from corrupted image and pad each u16 to i32
+ const __m256i d0 = yy_loadu_256(dat + j);
+ const __m256i d0h =
+ _mm256_cvtepu16_epi32(_mm256_extracti128_si256(d0, 1));
+ const __m256i d0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(d0));
+ // d0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16
+ // d0h, d0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32
+
+ // Load 16 pixels from the filtered image
+ const __m256i flth = yy_loadu_256(flt + j + 8);
+ const __m256i fltl = yy_loadu_256(flt + j);
+ // flth, fltl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32
+
+ const __m256i flth_xq = _mm256_mullo_epi32(flth, xq_active);
+ const __m256i fltl_xq = _mm256_mullo_epi32(fltl, xq_active);
+ const __m256i d0h_xq = _mm256_mullo_epi32(d0h, xq_inactive);
+ const __m256i d0l_xq = _mm256_mullo_epi32(d0l, xq_inactive);
+
+ const __m256i vh = _mm256_add_epi32(flth_xq, d0h_xq);
+ const __m256i vl = _mm256_add_epi32(fltl_xq, d0l_xq);
+
+ // Shift this down with appropriate rounding
+ const __m256i vrh =
+ _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift);
+ const __m256i vrl =
+ _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift);
+ // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32
+
+ // Saturate each i32 to an i16 then combine both halves
+        // The permute (control=[3 1 2 0]) restores linear pixel order after
+        // the in-lane pack interleaves the two 128-bit halves
+ const __m256i vr =
+ _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8);
+ // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0] as u16
+ // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16
+
+        // Add twin-subspace-sgr filter output to the corrupt image, then
+        // subtract the source to get the error
+ const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0);
+
+ // Calculate squared error and add adjacent values
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+
+ const __m256i sum32l =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
+ sum64 = _mm256_add_epi64(sum64, sum32l);
+ const __m256i sum32h =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum32h);
+
+ // Process remaining pixels in this row (modulo 16)
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq_on * (flt[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt += flt_stride;
+ }
+ } else { // Neither filter is enabled
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 32; j += 32) {
+ // Load 2x16 u16 from source image
+ const __m256i s0l = yy_loadu_256(src + j);
+ const __m256i s0h = yy_loadu_256(src + j + 16);
+
+ // Load 2x16 u16 from corrupted image
+ const __m256i d0l = yy_loadu_256(dat + j);
+ const __m256i d0h = yy_loadu_256(dat + j + 16);
+
+        // Subtract source image from corrupted image
+ const __m256i diffl = _mm256_sub_epi16(d0l, s0l);
+ const __m256i diffh = _mm256_sub_epi16(d0h, s0h);
+
+ // Square error and add adjacent values
+ const __m256i err0l = _mm256_madd_epi16(diffl, diffl);
+ const __m256i err0h = _mm256_madd_epi16(diffh, diffh);
+
+ sum32 = _mm256_add_epi32(sum32, err0l);
+ sum32 = _mm256_add_epi32(sum32, err0h);
+ }
+
+ const __m256i sum32l =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
+ sum64 = _mm256_add_epi64(sum64, sum32l);
+ const __m256i sum32h =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum32h);
+
+      // Process remaining pixels in this row (modulo 32)
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+
+  // Sum the four 64-bit lanes of sum64 into err
+ int64_t sum[4];
+ yy_storeu_256(sum, sum64);
+ err += sum[0] + sum[1] + sum[2] + sum[3];
+ return err;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/x86/pickrst_sse4.c b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
new file mode 100644
index 0000000000..50db305802
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
@@ -0,0 +1,1483 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
+static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src,
+ const __m128i *shuffle, const __m128i *kl) {
+ const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
+ const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s));
+ const __m128i d1 =
+ _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)));
+ const __m128i dst0 = xx_loadu_128(dst);
+ const __m128i dst1 = xx_loadu_128(dst + 4);
+ const __m128i r0 = _mm_add_epi32(dst0, d0);
+ const __m128i r1 = _mm_add_epi32(dst1, d1);
+ xx_storeu_128(dst, r0);
+ xx_storeu_128(dst + 4, r1);
+}
+
+static INLINE void acc_stat_win7_one_line_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+ int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+ const int wiener_win = 7;
+ int j, k, l;
+ // Main loop handles two pixels at a time
+ // We can assume that h_start is even, since it will always be aligned to
+ // a tile edge + some number of restoration units, and both of those will
+ // be 64-pixel aligned.
+ // However, at the edge of the image, h_end may be odd, so we need to handle
+ // that case correctly.
+ assert(h_start % 2 == 0);
+ const int h_end_even = h_end & ~1;
+ const int has_odd_pixel = h_end & 1;
+ for (j = h_start; j < h_end_even; j += 2) {
+ const uint8_t *dgd_ij = dgd + j;
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m128i kl =
+ _mm_cvtepu8_epi16(_mm_set1_epi16(loadu_int16(dgd_ijk + l)));
+ acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+ // If the width is odd, add in the final pixel
+ if (has_odd_pixel) {
+ const uint8_t *dgd_ij = dgd + j;
+ const uint8_t X1 = src[j];
+ *sumX += X1;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ sumY[k][l] += D1;
+ M_int[k][l] += D1 * X1;
+
+ // The `acc_stat_sse41` function wants its input to have interleaved
+ // copies of two pixels, but we only have one. However, the pixels
+ // are (effectively) used as inputs to a multiply-accumulate.
+ // So if we set the extra pixel slot to 0, then it is effectively
+ // ignored.
+ const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1));
+ acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
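+// Computes the Wiener filter normal-equation statistics: M accumulates
+// cross-correlations of the degraded window with the source and H
+// autocorrelations of the degraded window, both taken about the mean.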
+static INLINE void compute_stats_win7_opt_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint8_t avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t M_int32_row[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int32_t H_int32_row[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+ int downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ int32_t sumX_row = 0;
+ int32_t sumY_row[WIENER_WIN][WIENER_WIN] = { { 0 } };
+
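+  // When downsampling is enabled, one row in every
+  // WIENER_STATS_DOWNSAMPLE_FACTOR is sampled and its statistics are scaled
+  // by the factor to approximate the full vertical sum.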
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i = i + downsample_factor) {
+ if (use_downsampled_wiener_stats &&
+ (vert_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+ downsample_factor = vert_end - i;
+ }
+ sumX_row = 0;
+ memset(sumY_row, 0, sizeof(int32_t) * WIENER_WIN * WIENER_WIN);
+ memset(M_int32_row, 0, sizeof(int32_t) * WIENER_WIN * WIENER_WIN);
+ memset(H_int32_row, 0, sizeof(int32_t) * WIENER_WIN2 * (WIENER_WIN * 8));
+ acc_stat_win7_one_line_sse4_1(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX_row, sumY_row, M_int32_row, H_int32_row);
+ sumX += sumX_row * downsample_factor;
+ // Scale M matrix based on the downsampling factor
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ sumY[k][l] += (sumY_row[k][l] * downsample_factor);
+ M_int32[k][l] += (M_int32_row[k][l] * downsample_factor);
+ }
+ }
+ // Scale H matrix based on the downsampling factor
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ for (l = 0; l < WIENER_WIN * 8; ++l) {
+ H_int32[k][l] += (H_int32_row[k][l] * downsample_factor);
+ }
+ }
+ }
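+    // Flush the 32-bit strip accumulators into the 64-bit totals after each
+    // 64-row strip so the i32 sums cannot overflow.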
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ for (l = 0; l < WIENER_WIN * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
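+  // Mean removal is folded into the final sums:
+  // sum((X - avg) * (Y - avg))
+  //     = sum(X * Y) - avg * (sum(X) + sum(Y)) + avg * avg * N.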
+ const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] =
+ M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]));
+ int64_t *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ (int64_t)avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void acc_stat_highbd_sse41(int64_t *dst, const uint16_t *dgd,
+ const __m128i *shuffle,
+ const __m128i *dgd_ijkl) {
+ // Load 256 bits from dgd in two chunks
+ const __m128i s0l = xx_loadu_128(dgd);
+ const __m128i s0h = xx_loadu_128(dgd + 4);
+ // s0l = [7 6 5 4 3 2 1 0] as u16 values (dgd indices)
+ // s0h = [11 10 9 8 7 6 5 4] as u16 values (dgd indices)
+  // (The two loads overlap by four values so the same shuffle mask can be
+  // applied to both halves)
+
+ // Shuffle the u16 values in each half (actually using 8-bit shuffle mask)
+ const __m128i s1l = _mm_shuffle_epi8(s0l, *shuffle);
+ const __m128i s1h = _mm_shuffle_epi8(s0h, *shuffle);
+ // s1l = [4 3 3 2 2 1 1 0] as u16 values (dgd indices)
+ // s1h = [8 7 7 6 6 5 5 4] as u16 values (dgd indices)
+
+ // Multiply s1 by dgd_ijkl resulting in 8x u32 values
+ // Horizontally add pairs of u32 resulting in 4x u32
+ const __m128i dl = _mm_madd_epi16(*dgd_ijkl, s1l);
+ const __m128i dh = _mm_madd_epi16(*dgd_ijkl, s1h);
+ // dl = [d c b a] as u32 values
+ // dh = [h g f e] as u32 values
+
+ // Add these 8x u32 results on to dst in four parts
+ const __m128i dll = _mm_cvtepu32_epi64(dl);
+ const __m128i dlh = _mm_cvtepu32_epi64(_mm_srli_si128(dl, 8));
+ const __m128i dhl = _mm_cvtepu32_epi64(dh);
+ const __m128i dhh = _mm_cvtepu32_epi64(_mm_srli_si128(dh, 8));
+ // dll = [b a] as u64 values, etc.
+
+ const __m128i rll = _mm_add_epi64(xx_loadu_128(dst), dll);
+ xx_storeu_128(dst, rll);
+ const __m128i rlh = _mm_add_epi64(xx_loadu_128(dst + 2), dlh);
+ xx_storeu_128(dst + 2, rlh);
+ const __m128i rhl = _mm_add_epi64(xx_loadu_128(dst + 4), dhl);
+ xx_storeu_128(dst + 4, rhl);
+ const __m128i rhh = _mm_add_epi64(xx_loadu_128(dst + 6), dhh);
+ xx_storeu_128(dst + 6, rhh);
+}
+
+static INLINE void acc_stat_highbd_win7_one_line_sse4_1(
+ const uint16_t *dgd, const uint16_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN],
+ int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+ int j, k, l;
+ const int wiener_win = WIENER_WIN;
+ // Main loop handles two pixels at a time
+ // We can assume that h_start is even, since it will always be aligned to
+ // a tile edge + some number of restoration units, and both of those will
+ // be 64-pixel aligned.
+ // However, at the edge of the image, h_end may be odd, so we need to handle
+ // that case correctly.
+ assert(h_start % 2 == 0);
+ const int h_end_even = h_end & ~1;
+ const int has_odd_pixel = h_end & 1;
+ for (j = h_start; j < h_end_even; j += 2) {
+ const uint16_t X1 = src[j];
+ const uint16_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ const uint16_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijk[l];
+ const uint16_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+        // Load two u16 values from dgd as a single u32 and broadcast it to
+        // all four u32 slots of a 128-bit register
+ const __m128i dgd_ijkl = _mm_set1_epi32(loadu_int32(dgd_ijk + l));
+ // dgd_ijkl = [y x y x y x y x] as u16
+
+ acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ }
+ }
+ }
+ // If the width is odd, add in the final pixel
+ if (has_odd_pixel) {
+ const uint16_t X1 = src[j];
+ *sumX += X1;
+ const uint16_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijk[l];
+ sumY[k][l] += D1;
+ M_int[k][l] += D1 * X1;
+
+ // The `acc_stat_highbd_sse41` function wants its input to have
+ // interleaved copies of two pixels, but we only have one. However, the
+ // pixels are (effectively) used as inputs to a multiply-accumulate. So
+ // if we set the extra pixel slot to 0, then it is effectively ignored.
+ const __m128i dgd_ijkl = _mm_set1_epi32((int)D1);
+
+ acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_highbd_win7_opt_sse4_1(
+ const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M,
+ int64_t *H, aom_bit_depth_t bit_depth) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ const uint16_t avg =
+ find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int64_t H_int[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t sumX = 0;
+ const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ // Load just half of the 256-bit shuffle control used for the AVX2 version
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_highbd_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_highbd_win7_one_line_sse4_1(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int, H_int);
+ }
+ }
+
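+  // Scale the statistics down for 10- and 12-bit input (by 4 and 16
+  // respectively) to reduce the risk of overflow further down the pipeline.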
+ uint8_t bit_depth_divider = 1;
+ if (bit_depth == AOM_BITS_12)
+ bit_depth_divider = 16;
+ else if (bit_depth == AOM_BITS_10)
+ bit_depth_divider = 4;
+
+ const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = (M_int[k][l] +
+ (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) /
+ bit_depth_divider;
+ int64_t *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] =
+ (H_int_[n * 8 + m] +
+ (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) /
+ bit_depth_divider;
+ }
+ }
+ }
+ }
+}
+
+static INLINE void acc_stat_highbd_win5_one_line_sse4_1(
+ const uint16_t *dgd, const uint16_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+ int j, k, l;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ // Main loop handles two pixels at a time
+ // We can assume that h_start is even, since it will always be aligned to
+ // a tile edge + some number of restoration units, and both of those will
+ // be 64-pixel aligned.
+ // However, at the edge of the image, h_end may be odd, so we need to handle
+ // that case correctly.
+ assert(h_start % 2 == 0);
+ const int h_end_even = h_end & ~1;
+ const int has_odd_pixel = h_end & 1;
+ for (j = h_start; j < h_end_even; j += 2) {
+ const uint16_t X1 = src[j];
+ const uint16_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ const uint16_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijk[l];
+ const uint16_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+        // Load two u16 values from dgd as a single u32 and broadcast it to
+        // all four u32 slots of a 128-bit register
+ const __m128i dgd_ijkl = _mm_set1_epi32(loadu_int32(dgd_ijk + l));
+ // dgd_ijkl = [y x y x y x y x] as u16
+
+ acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ }
+ }
+ }
+ // If the width is odd, add in the final pixel
+ if (has_odd_pixel) {
+ const uint16_t X1 = src[j];
+ *sumX += X1;
+ const uint16_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int64_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint16_t D1 = dgd_ijk[l];
+ sumY[k][l] += D1;
+ M_int[k][l] += D1 * X1;
+
+ // The `acc_stat_highbd_sse41` function wants its input to have
+ // interleaved copies of two pixels, but we only have one. However, the
+ // pixels are (effectively) used as inputs to a multiply-accumulate. So
+ // if we set the extra pixel slot to 0, then it is effectively ignored.
+ const __m128i dgd_ijkl = _mm_set1_epi32((int)D1);
+
+ acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
+ &dgd_ijkl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_highbd_win5_opt_sse4_1(
+ const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M,
+ int64_t *H, aom_bit_depth_t bit_depth) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ const uint16_t avg =
+ find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t sumX = 0;
+ const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ // Load just half of the 256-bit shuffle control used for the AVX2 version
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_highbd_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_highbd_win5_one_line_sse4_1(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int, H_int);
+ }
+ }
+
+ uint8_t bit_depth_divider = 1;
+ if (bit_depth == AOM_BITS_12)
+ bit_depth_divider = 16;
+ else if (bit_depth == AOM_BITS_10)
+ bit_depth_divider = 4;
+
+ const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = (M_int[k][l] +
+ (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) /
+ bit_depth_divider;
+ int64_t *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] =
+ (H_int_[n * 8 + m] +
+ (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) /
+ bit_depth_divider;
+ }
+ }
+ }
+ }
+}
+
+void av1_compute_stats_highbd_sse4_1(int wiener_win, const uint8_t *dgd8,
+ const uint8_t *src8, int h_start,
+ int h_end, int v_start, int v_end,
+ int dgd_stride, int src_stride, int64_t *M,
+ int64_t *H, aom_bit_depth_t bit_depth) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_highbd_win7_opt_sse4_1(dgd8, src8, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H,
+ bit_depth);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_highbd_win5_opt_sse4_1(dgd8, src8, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H,
+ bit_depth);
+ } else {
+ av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H, bit_depth);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE void acc_stat_win5_one_line_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+ const int wiener_win = WIENER_WIN_CHROMA;
+ int j, k, l;
+ // Main loop handles two pixels at a time
+ // We can assume that h_start is even, since it will always be aligned to
+ // a tile edge + some number of restoration units, and both of those will
+ // be 64-pixel aligned.
+ // However, at the edge of the image, h_end may be odd, so we need to handle
+ // that case correctly.
+ assert(h_start % 2 == 0);
+ const int h_end_even = h_end & ~1;
+ const int has_odd_pixel = h_end & 1;
+ for (j = h_start; j < h_end_even; j += 2) {
+ const uint8_t *dgd_ij = dgd + j;
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m128i kl =
+ _mm_cvtepu8_epi16(_mm_set1_epi16(loadu_int16(dgd_ijk + l)));
+ acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+ // If the width is odd, add in the final pixel
+ if (has_odd_pixel) {
+ const uint8_t *dgd_ij = dgd + j;
+ const uint8_t X1 = src[j];
+ *sumX += X1;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ sumY[k][l] += D1;
+ M_int[k][l] += D1 * X1;
+
+ // The `acc_stat_sse41` function wants its input to have interleaved
+ // copies of two pixels, but we only have one. However, the pixels
+ // are (effectively) used as inputs to a multiply-accumulate.
+ // So if we set the extra pixel slot to 0, then it is effectively
+ // ignored.
+ const __m128i kl = _mm_cvtepu8_epi16(_mm_set1_epi16((int16_t)D1));
+ acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_win5_opt_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint8_t avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t M_int32_row[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int32_t H_int32_row[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+ int downsample_factor =
+ use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
+ int32_t sumX_row = 0;
+ int32_t sumY_row[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i = i + downsample_factor) {
+ if (use_downsampled_wiener_stats &&
+ (vert_end - i < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
+ downsample_factor = vert_end - i;
+ }
+ sumX_row = 0;
+ memset(sumY_row, 0,
+ sizeof(int32_t) * WIENER_WIN_CHROMA * WIENER_WIN_CHROMA);
+ memset(M_int32_row, 0,
+ sizeof(int32_t) * WIENER_WIN_CHROMA * WIENER_WIN_CHROMA);
+ memset(H_int32_row, 0,
+ sizeof(int32_t) * WIENER_WIN2_CHROMA * (WIENER_WIN_CHROMA * 8));
+ acc_stat_win5_one_line_sse4_1(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX_row, sumY_row, M_int32_row, H_int32_row);
+ sumX += sumX_row * downsample_factor;
+ // Scale M matrix based on the downsampling factor
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ sumY[k][l] += (sumY_row[k][l] * downsample_factor);
+ M_int32[k][l] += (M_int32_row[k][l] * downsample_factor);
+ }
+ }
+ // Scale H matrix based on the downsampling factor
+ for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) {
+ for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+ H_int32[k][l] += (H_int32_row[k][l] * downsample_factor);
+ }
+ }
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) {
+ for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
+ const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] =
+ M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]));
+ int64_t *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ (int64_t)avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int16_t *dgd_avg,
+ int16_t *src_avg, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, int64_t *M, int64_t *H,
+ int use_downsampled_wiener_stats) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H,
+ use_downsampled_wiener_stats);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H,
+ use_downsampled_wiener_stats);
+ } else {
+ av1_compute_stats_c(wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end,
+ v_start, v_end, dgd_stride, src_stride, M, H,
+ use_downsampled_wiener_stats);
+ }
+}
+
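+// Packs a into the low and b into the high 16 bits of every 32-bit lane,
+// matching the interleaved operand layout expected by _mm_madd_epi16.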
+static INLINE __m128i pair_set_epi16(int a, int b) {
+ return _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
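+// SSE4.1 variant of av1_lowbd_pixel_proj_error_c: returns the sum of squared
+// differences between the source and the restored image implied by xq.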
+int64_t av1_lowbd_pixel_proj_error_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+ __m128i sum64 = _mm_setzero_si128();
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j <= width - 8; j += 8) {
+ const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+ const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+ const __m128i flt0_16b =
+ _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4));
+ const __m128i flt1_16b =
+ _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4));
+ const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS);
+ const __m128i flt0_0_sub_u = _mm_sub_epi16(flt0_16b, u0);
+ const __m128i flt1_0_sub_u = _mm_sub_epi16(flt1_16b, u0);
+ const __m128i v0 = _mm_madd_epi16(
+ xq_coeff, _mm_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m128i v1 = _mm_madd_epi16(
+ xq_coeff, _mm_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+ const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+ const __m128i e0 =
+ _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum64_0);
+ sum64 = _mm_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[0] > 0 || params->r[1] > 0) {
+ const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
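+    // The shift of dat is folded into the second madd coefficient:
+    // xq * (flt - (dat << SGRPROJ_RST_BITS)) =
+    // xq * flt + (-xq << SGRPROJ_RST_BITS) * dat.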
+ const __m128i xq_coeff =
+ pair_set_epi16(xq_active, -xq_active * (1 << SGRPROJ_RST_BITS));
+ const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+ const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j <= width - 8; j += 8) {
+ const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+ const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+ const __m128i flt_16b =
+ _mm_packs_epi32(xx_loadu_128(flt + j), xx_loadu_128(flt + j + 4));
+ const __m128i v0 =
+ _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt_16b, d0));
+ const __m128i v1 =
+ _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt_16b, d0));
+ const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+ const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+ const __m128i e0 =
+ _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq_active * (flt[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt += flt_stride;
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum64_0);
+ sum64 = _mm_add_epi64(sum64, sum64_1);
+ }
+ } else {
+ __m128i sum32 = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m128i d = xx_loadu_128(dat + j);
+ const __m128i s = xx_loadu_128(src + j);
+ const __m128i d0 = _mm_cvtepu8_epi16(d);
+ const __m128i d1 = _mm_cvtepu8_epi16(_mm_srli_si128(d, 8));
+ const __m128i s0 = _mm_cvtepu8_epi16(s);
+ const __m128i s1 = _mm_cvtepu8_epi16(_mm_srli_si128(s, 8));
+ const __m128i diff0 = _mm_sub_epi16(d0, s0);
+ const __m128i diff1 = _mm_sub_epi16(d1, s1);
+ const __m128i err0 = _mm_madd_epi16(diff0, diff0);
+ const __m128i err1 = _mm_madd_epi16(diff1, diff1);
+ sum32 = _mm_add_epi32(sum32, err0);
+ sum32 = _mm_add_epi32(sum32, err1);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64_0, sum64_1);
+ }
+ int64_t sum[2];
+ xx_storeu_128(sum, sum64);
+ err += sum[0] + sum[1];
+ return err;
+}
+
+// Handles the case where both params->r[0] > 0 and params->r[1] > 0, so all
+// elements of H and C need to be computed.
+static AOM_INLINE void calc_proj_params_r0_r1_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ __m128i h00, h01, h11, c0, c1;
+ const __m128i zero = _mm_setzero_si128();
+ h01 = h11 = c0 = c1 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 4) {
+ const __m128i u_load = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j))));
+ const __m128i s_load = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j))));
+ __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j));
+ __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j));
+ __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm_sub_epi32(s, d);
+ f1 = _mm_sub_epi32(f1, d);
+ f2 = _mm_sub_epi32(f2, d);
+
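+      // Same even/odd widening trick as the AVX2 path: _mm_mul_epi32 widens
+      // the even 32-bit lanes to i64, and a 32-bit right shift exposes the
+      // odd lanes for a second multiply.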
+ const __m128i h00_even = _mm_mul_epi32(f1, f1);
+ const __m128i h00_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+ h00 = _mm_add_epi64(h00, h00_even);
+ h00 = _mm_add_epi64(h00, h00_odd);
+
+ const __m128i h01_even = _mm_mul_epi32(f1, f2);
+ const __m128i h01_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f2, 32));
+ h01 = _mm_add_epi64(h01, h01_even);
+ h01 = _mm_add_epi64(h01, h01_odd);
+
+ const __m128i h11_even = _mm_mul_epi32(f2, f2);
+ const __m128i h11_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+ h11 = _mm_add_epi64(h11, h11_even);
+ h11 = _mm_add_epi64(h11, h11_odd);
+
+ const __m128i c0_even = _mm_mul_epi32(f1, s);
+ const __m128i c0_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+ c0 = _mm_add_epi64(c0, c0_even);
+ c0 = _mm_add_epi64(c0, c0_odd);
+
+ const __m128i c1_even = _mm_mul_epi32(f2, s);
+ const __m128i c1_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+ c1 = _mm_add_epi64(c1, c1_even);
+ c1 = _mm_add_epi64(c1, c1_odd);
+ }
+ }
+
+ __m128i c_low = _mm_unpacklo_epi64(c0, c1);
+ const __m128i c_high = _mm_unpackhi_epi64(c0, c1);
+ c_low = _mm_add_epi64(c_low, c_high);
+
+ __m128i h0x_low = _mm_unpacklo_epi64(h00, h01);
+ const __m128i h0x_high = _mm_unpackhi_epi64(h00, h01);
+ h0x_low = _mm_add_epi64(h0x_low, h0x_high);
+
+  // Since H is symmetric, H[1][0] does not need to be computed separately.
+ __m128i h1x_low = _mm_unpacklo_epi64(zero, h11);
+ const __m128i h1x_high = _mm_unpackhi_epi64(zero, h11);
+ h1x_low = _mm_add_epi64(h1x_low, h1x_high);
+
+ xx_storeu_128(C, c_low);
+ xx_storeu_128(H[0], h0x_low);
+ xx_storeu_128(H[1], h1x_low);
+
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+
+ // Since H is a symmetric matrix
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+}
+
+// When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r0_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ __m128i h00, c0;
+ const __m128i zero = _mm_setzero_si128();
+ c0 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 4) {
+ const __m128i u_load = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j))));
+ const __m128i s_load = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j))));
+ __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j));
+ __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm_sub_epi32(s, d);
+ f1 = _mm_sub_epi32(f1, d);
+
+ const __m128i h00_even = _mm_mul_epi32(f1, f1);
+ const __m128i h00_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+ h00 = _mm_add_epi64(h00, h00_even);
+ h00 = _mm_add_epi64(h00, h00_odd);
+
+ const __m128i c0_even = _mm_mul_epi32(f1, s);
+ const __m128i c0_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+ c0 = _mm_add_epi64(c0, c0_even);
+ c0 = _mm_add_epi64(c0, c0_odd);
+ }
+ }
+ const __m128i h00_val = _mm_add_epi64(h00, _mm_srli_si128(h00, 8));
+
+ const __m128i c0_val = _mm_add_epi64(c0, _mm_srli_si128(c0, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(c0_val, zero);
+ const __m128i h0x = _mm_unpacklo_epi64(h00_val, zero);
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[0], h0x);
+
+ H[0][0] /= size;
+ C[0] /= size;
+}
+
+// When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r1_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ __m128i h11, c1;
+ const __m128i zero = _mm_setzero_si128();
+ c1 = h11 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 4) {
+ const __m128i u_load = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)(dat + i * dat_stride + j))));
+ const __m128i s_load = _mm_cvtepu8_epi32(
+ _mm_cvtsi32_si128(*((int *)(src + i * src_stride + j))));
+ __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j));
+ __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm_sub_epi32(s, d);
+ f2 = _mm_sub_epi32(f2, d);
+
+ const __m128i h11_even = _mm_mul_epi32(f2, f2);
+ const __m128i h11_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+ h11 = _mm_add_epi64(h11, h11_even);
+ h11 = _mm_add_epi64(h11, h11_odd);
+
+ const __m128i c1_even = _mm_mul_epi32(f2, s);
+ const __m128i c1_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+ c1 = _mm_add_epi64(c1, c1_even);
+ c1 = _mm_add_epi64(c1, c1_odd);
+ }
+ }
+
+ const __m128i h11_val = _mm_add_epi64(h11, _mm_srli_si128(h11, 8));
+
+ const __m128i c1_val = _mm_add_epi64(c1, _mm_srli_si128(c1, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(zero, c1_val);
+ const __m128i h1x = _mm_unpacklo_epi64(zero, h11_val);
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[1], h1x);
+
+ H[1][1] /= size;
+ C[1] /= size;
+}
+
+// SSE4.1 variant of av1_calc_proj_params_c.
+void av1_calc_proj_params_sse4_1(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2],
+ const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ calc_proj_params_r0_r1_sse4_1(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ calc_proj_params_r0_sse4_1(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ calc_proj_params_r1_sse4_1(src8, width, height, src_stride, dat8,
+ dat_stride, flt1, flt1_stride, H, C);
+ }
+}
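+
+// For reference, a scalar sketch of what these calc_proj_params kernels
+// accumulate (illustrative only, not part of the dispatch). With
+// u = dat[k] << SGRPROJ_RST_BITS, basis samples f0 = flt0[k] - u and
+// f1 = flt1[k] - u, and target s = (src[k] << SGRPROJ_RST_BITS) - u, the
+// normal equations
+//   H[i][j] = sum(f_i * f_j) / (width * height)
+//   C[i]    = sum(f_i * s)   / (width * height)
+// are built so that solving H * xq = C yields the sgrproj coefficients.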
+
+static AOM_INLINE void calc_proj_params_r0_r1_high_bd_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ __m128i h00, h01, h11, c0, c1;
+ const __m128i zero = _mm_setzero_si128();
+ h01 = h11 = c0 = c1 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 4) {
+ const __m128i u_load = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+ const __m128i s_load = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+ __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j));
+ __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j));
+ __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm_sub_epi32(s, d);
+ f1 = _mm_sub_epi32(f1, d);
+ f2 = _mm_sub_epi32(f2, d);
+
+ const __m128i h00_even = _mm_mul_epi32(f1, f1);
+ const __m128i h00_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+ h00 = _mm_add_epi64(h00, h00_even);
+ h00 = _mm_add_epi64(h00, h00_odd);
+
+ const __m128i h01_even = _mm_mul_epi32(f1, f2);
+ const __m128i h01_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f2, 32));
+ h01 = _mm_add_epi64(h01, h01_even);
+ h01 = _mm_add_epi64(h01, h01_odd);
+
+ const __m128i h11_even = _mm_mul_epi32(f2, f2);
+ const __m128i h11_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+ h11 = _mm_add_epi64(h11, h11_even);
+ h11 = _mm_add_epi64(h11, h11_odd);
+
+ const __m128i c0_even = _mm_mul_epi32(f1, s);
+ const __m128i c0_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+ c0 = _mm_add_epi64(c0, c0_even);
+ c0 = _mm_add_epi64(c0, c0_odd);
+
+ const __m128i c1_even = _mm_mul_epi32(f2, s);
+ const __m128i c1_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+ c1 = _mm_add_epi64(c1, c1_even);
+ c1 = _mm_add_epi64(c1, c1_odd);
+ }
+ }
+
+ __m128i c_low = _mm_unpacklo_epi64(c0, c1);
+ const __m128i c_high = _mm_unpackhi_epi64(c0, c1);
+ c_low = _mm_add_epi64(c_low, c_high);
+
+ __m128i h0x_low = _mm_unpacklo_epi64(h00, h01);
+ const __m128i h0x_high = _mm_unpackhi_epi64(h00, h01);
+ h0x_low = _mm_add_epi64(h0x_low, h0x_high);
+
+  // Since H is symmetric, H[1][0] does not need to be computed separately.
+ __m128i h1x_low = _mm_unpacklo_epi64(zero, h11);
+ const __m128i h1x_high = _mm_unpackhi_epi64(zero, h11);
+ h1x_low = _mm_add_epi64(h1x_low, h1x_high);
+
+ xx_storeu_128(C, c_low);
+ xx_storeu_128(H[0], h0x_low);
+ xx_storeu_128(H[1], h1x_low);
+
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+
+ // Since H is a symmetric matrix
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+}
+
+// When only params->r[0] > 0. In this case only H[0][0] and C[0] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r0_high_bd_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ __m128i h00, c0;
+ const __m128i zero = _mm_setzero_si128();
+ c0 = h00 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 4) {
+ const __m128i u_load = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+ const __m128i s_load = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+ __m128i f1 = _mm_loadu_si128((__m128i *)(flt0 + i * flt0_stride + j));
+ __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm_sub_epi32(s, d);
+ f1 = _mm_sub_epi32(f1, d);
+
+ const __m128i h00_even = _mm_mul_epi32(f1, f1);
+ const __m128i h00_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(f1, 32));
+ h00 = _mm_add_epi64(h00, h00_even);
+ h00 = _mm_add_epi64(h00, h00_odd);
+
+ const __m128i c0_even = _mm_mul_epi32(f1, s);
+ const __m128i c0_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f1, 32), _mm_srli_epi64(s, 32));
+ c0 = _mm_add_epi64(c0, c0_even);
+ c0 = _mm_add_epi64(c0, c0_odd);
+ }
+ }
+ const __m128i h00_val = _mm_add_epi64(h00, _mm_srli_si128(h00, 8));
+
+ const __m128i c0_val = _mm_add_epi64(c0, _mm_srli_si128(c0, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(c0_val, zero);
+ const __m128i h0x = _mm_unpacklo_epi64(h00_val, zero);
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[0], h0x);
+
+ H[0][0] /= size;
+ C[0] /= size;
+}
+
+// When only params->r[1] > 0. In this case only H[1][1] and C[1] are
+// non-zero and need to be computed.
+static AOM_INLINE void calc_proj_params_r1_high_bd_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2]) {
+ const int size = width * height;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ __m128i h11, c1;
+ const __m128i zero = _mm_setzero_si128();
+ c1 = h11 = zero;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 4) {
+ const __m128i u_load = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
+ const __m128i s_load = _mm_cvtepu16_epi32(
+ _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
+ __m128i f2 = _mm_loadu_si128((__m128i *)(flt1 + i * flt1_stride + j));
+ __m128i d = _mm_slli_epi32(u_load, SGRPROJ_RST_BITS);
+ __m128i s = _mm_slli_epi32(s_load, SGRPROJ_RST_BITS);
+ s = _mm_sub_epi32(s, d);
+ f2 = _mm_sub_epi32(f2, d);
+
+ const __m128i h11_even = _mm_mul_epi32(f2, f2);
+ const __m128i h11_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(f2, 32));
+ h11 = _mm_add_epi64(h11, h11_even);
+ h11 = _mm_add_epi64(h11, h11_odd);
+
+ const __m128i c1_even = _mm_mul_epi32(f2, s);
+ const __m128i c1_odd =
+ _mm_mul_epi32(_mm_srli_epi64(f2, 32), _mm_srli_epi64(s, 32));
+ c1 = _mm_add_epi64(c1, c1_even);
+ c1 = _mm_add_epi64(c1, c1_odd);
+ }
+ }
+
+ const __m128i h11_val = _mm_add_epi64(h11, _mm_srli_si128(h11, 8));
+
+ const __m128i c1_val = _mm_add_epi64(c1, _mm_srli_si128(c1, 8));
+
+ const __m128i c = _mm_unpacklo_epi64(zero, c1_val);
+ const __m128i h1x = _mm_unpacklo_epi64(zero, h11_val);
+
+ xx_storeu_128(C, c);
+ xx_storeu_128(H[1], h1x);
+
+ H[1][1] /= size;
+ C[1] /= size;
+}
+
+// SSE4.1 variant of av1_calc_proj_params_high_bd_c.
+void av1_calc_proj_params_high_bd_sse4_1(const uint8_t *src8, int width,
+ int height, int src_stride,
+ const uint8_t *dat8, int dat_stride,
+ int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride,
+ int64_t H[2][2], int64_t C[2],
+ const sgr_params_type *params) {
+ if ((params->r[0] > 0) && (params->r[1] > 0)) {
+ calc_proj_params_r0_r1_high_bd_sse4_1(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, H, C);
+ } else if (params->r[0] > 0) {
+ calc_proj_params_r0_high_bd_sse4_1(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, H, C);
+ } else if (params->r[1] > 0) {
+ calc_proj_params_r1_high_bd_sse4_1(src8, width, height, src_stride, dat8,
+ dat_stride, flt1, flt1_stride, H, C);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+int64_t av1_highbd_pixel_proj_error_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+ __m128i sum64 = _mm_setzero_si128();
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled
+ const __m128i xq0 = _mm_set1_epi32(xq[0]);
+ const __m128i xq1 = _mm_set1_epi32(xq[1]);
+
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j <= width - 8; j += 8) {
+ // Load 8x pixels from source image
+ const __m128i s0 = xx_loadu_128(src + j);
+ // s0 = [7 6 5 4 3 2 1 0] as i16 (indices of src[])
+
+ // Load 8x pixels from corrupted image
+ const __m128i d0 = xx_loadu_128(dat + j);
+ // d0 = [7 6 5 4 3 2 1 0] as i16 (indices of dat[])
+
+ // Shift each pixel value up by SGRPROJ_RST_BITS
+ const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS);
+
+ // Split u0 into two halves and pad each from u16 to i32
+ const __m128i u0l = _mm_cvtepu16_epi32(u0);
+ const __m128i u0h = _mm_cvtepu16_epi32(_mm_srli_si128(u0, 8));
+ // u0h = [7 6 5 4] as i32, u0l = [3 2 1 0] as i32, all dat[] indices
+
+ // Load 8 pixels from first and second filtered images
+ const __m128i flt0l = xx_loadu_128(flt0 + j);
+ const __m128i flt0h = xx_loadu_128(flt0 + j + 4);
+ const __m128i flt1l = xx_loadu_128(flt1 + j);
+ const __m128i flt1h = xx_loadu_128(flt1 + j + 4);
+ // flt0 = [7 6 5 4] [3 2 1 0] as i32 (indices of flt0+j)
+ // flt1 = [7 6 5 4] [3 2 1 0] as i32 (indices of flt1+j)
+
+ // Subtract shifted corrupt image from each filtered image
+ // This gives our two basis vectors for the projection
+ const __m128i flt0l_subu = _mm_sub_epi32(flt0l, u0l);
+ const __m128i flt0h_subu = _mm_sub_epi32(flt0h, u0h);
+ const __m128i flt1l_subu = _mm_sub_epi32(flt1l, u0l);
+ const __m128i flt1h_subu = _mm_sub_epi32(flt1h, u0h);
+ // flt?h_subu = [ f[7]-u[7] f[6]-u[6] f[5]-u[5] f[4]-u[4] ] as i32
+ // flt?l_subu = [ f[3]-u[3] f[2]-u[2] f[1]-u[1] f[0]-u[0] ] as i32
+
+ // Multiply each basis vector by the corresponding coefficient
+ const __m128i v0l = _mm_mullo_epi32(flt0l_subu, xq0);
+ const __m128i v0h = _mm_mullo_epi32(flt0h_subu, xq0);
+ const __m128i v1l = _mm_mullo_epi32(flt1l_subu, xq1);
+ const __m128i v1h = _mm_mullo_epi32(flt1h_subu, xq1);
+
+ // Add together the contribution from each scaled basis vector
+ const __m128i vl = _mm_add_epi32(v0l, v1l);
+ const __m128i vh = _mm_add_epi32(v0h, v1h);
+
+ // Right-shift v with appropriate rounding
+ const __m128i vrl = _mm_srai_epi32(_mm_add_epi32(vl, rounding), shift);
+ const __m128i vrh = _mm_srai_epi32(_mm_add_epi32(vh, rounding), shift);
+
+ // Saturate each i32 value to i16 and combine lower and upper halves
+ const __m128i vr = _mm_packs_epi32(vrl, vrh);
+
+ // Add twin-subspace-sgr-filter to corrupt image then subtract source
+ const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(vr, d0), s0);
+
+ // Calculate squared error and add adjacent values
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+
+ const __m128i sum32l = _mm_cvtepu32_epi64(sum32);
+ sum64 = _mm_add_epi64(sum64, sum32l);
+ const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum32h);
+
+ // Process remaining pixels in this row (modulo 8)
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ }
+ } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled
+ const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1];
+ const __m128i xq_active = _mm_set1_epi32(xq_on);
+ const __m128i xq_inactive =
+ _mm_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS));
+ const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
+ const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j <= width - 8; j += 8) {
+ // Load 8x pixels from source image
+ const __m128i s0 = xx_loadu_128(src + j);
+ // s0 = [7 6 5 4 3 2 1 0] as u16 (indices of src[])
+
+ // Load 8x pixels from corrupted image and pad each u16 to i32
+ const __m128i d0 = xx_loadu_128(dat + j);
+ const __m128i d0h = _mm_cvtepu16_epi32(_mm_srli_si128(d0, 8));
+ const __m128i d0l = _mm_cvtepu16_epi32(d0);
+ // d0h, d0l = [7 6 5 4], [3 2 1 0] as u32 (indices of dat[])
+
+ // Load 8 pixels from the filtered image
+ const __m128i flth = xx_loadu_128(flt + j + 4);
+ const __m128i fltl = xx_loadu_128(flt + j);
+ // flth, fltl = [7 6 5 4], [3 2 1 0] as i32 (indices of flt+j)
+
+ const __m128i flth_xq = _mm_mullo_epi32(flth, xq_active);
+ const __m128i fltl_xq = _mm_mullo_epi32(fltl, xq_active);
+ const __m128i d0h_xq = _mm_mullo_epi32(d0h, xq_inactive);
+ const __m128i d0l_xq = _mm_mullo_epi32(d0l, xq_inactive);
+
+ const __m128i vh = _mm_add_epi32(flth_xq, d0h_xq);
+ const __m128i vl = _mm_add_epi32(fltl_xq, d0l_xq);
+ // vh = [ xq0(f[7]-d[7]) xq0(f[6]-d[6]) xq0(f[5]-d[5]) xq0(f[4]-d[4]) ]
+ // vl = [ xq0(f[3]-d[3]) xq0(f[2]-d[2]) xq0(f[1]-d[1]) xq0(f[0]-d[0]) ]
+
+ // Shift this down with appropriate rounding
+ const __m128i vrh = _mm_srai_epi32(_mm_add_epi32(vh, rounding), shift);
+ const __m128i vrl = _mm_srai_epi32(_mm_add_epi32(vl, rounding), shift);
+
+ // Saturate vr0 and vr1 from i32 to i16 then pack together
+ const __m128i vr = _mm_packs_epi32(vrl, vrh);
+
+ // Subtract twin-subspace-sgr filtered from source image to get error
+ const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(vr, d0), s0);
+
+ // Calculate squared error and add adjacent values
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+
+ const __m128i sum32l = _mm_cvtepu32_epi64(sum32);
+ sum64 = _mm_add_epi64(sum64, sum32l);
+ const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum32h);
+
+ // Process remaining pixels in this row (modulo 8)
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq_on * (flt[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt += flt_stride;
+ }
+ } else { // Neither filter is enabled
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j <= width - 16; j += 16) {
+ // Load 2x8 u16 from source image
+ const __m128i s0 = xx_loadu_128(src + j);
+ const __m128i s1 = xx_loadu_128(src + j + 8);
+ // Load 2x8 u16 from corrupted image
+ const __m128i d0 = xx_loadu_128(dat + j);
+ const __m128i d1 = xx_loadu_128(dat + j + 8);
+
+ // Subtract corrupted image from source image
+ const __m128i diff0 = _mm_sub_epi16(d0, s0);
+ const __m128i diff1 = _mm_sub_epi16(d1, s1);
+
+ // Square error and add adjacent values
+ const __m128i err0 = _mm_madd_epi16(diff0, diff0);
+ const __m128i err1 = _mm_madd_epi16(diff1, diff1);
+
+ sum32 = _mm_add_epi32(sum32, err0);
+ sum32 = _mm_add_epi32(sum32, err1);
+ }
+
+ const __m128i sum32l = _mm_cvtepu32_epi64(sum32);
+ sum64 = _mm_add_epi64(sum64, sum32l);
+ const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum32h);
+
+      // Process remaining pixels in this row (modulo 16)
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += ((int64_t)e * e);
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+
+  // Sum the two 64-bit lanes of sum64 into err
+ int64_t sum[2];
+ xx_storeu_128(sum, sum64);
+ err += sum[0] + sum[1];
+ return err;
+}
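+
+// Worked example of the scalar remainder math above, assuming the usual
+// SGRPROJ_RST_BITS = 4 and SGRPROJ_PRJ_BITS = 7 (so shift = 11): with
+// dat[k] = 100, one active filter with flt[k] = 1700 and xq = 32,
+//   u = 100 << 4 = 1600
+//   v = 32 * (1700 - 1600) = 3200
+//   ROUND_POWER_OF_TWO(3200, 11) = (3200 + 1024) >> 11 = 2
+// so e = 2 + 100 - src[k], and e * e is added to err.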
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/av1/encoder/x86/rdopt_avx2.c b/third_party/aom/av1/encoder/x86/rdopt_avx2.c
new file mode 100644
index 0000000000..a0ab3940c0
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/rdopt_avx2.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/rdopt.h"
+
+// Process horizontal and vertical correlations in a 4x4 block of pixels.
+// We actually use the 4x4 pixels to calculate correlations corresponding to
+// the top-left 3x3 pixels, so this function must be called with 1x1 overlap,
+// moving the window along/down by 3 pixels at a time.
+INLINE static void horver_correlation_4x4(const int16_t *diff, int stride,
+ __m256i *xy_sum_32,
+ __m256i *xz_sum_32, __m256i *x_sum_32,
+ __m256i *x2_sum_32) {
+ // Pixels in this 4x4 [ a b c d ]
+ // are referred to as: [ e f g h ]
+ // [ i j k l ]
+ // [ m n o p ]
+
+ const __m256i pixels = _mm256_set_epi64x(
+ loadu_int64(&diff[0 * stride]), loadu_int64(&diff[1 * stride]),
+ loadu_int64(&diff[2 * stride]), loadu_int64(&diff[3 * stride]));
+ // pixels = [d c b a h g f e] [l k j i p o n m] as i16
+
+ const __m256i slli = _mm256_slli_epi64(pixels, 16);
+ // slli = [c b a 0 g f e 0] [k j i 0 o n m 0] as i16
+
+ const __m256i madd_xy = _mm256_madd_epi16(pixels, slli);
+ // madd_xy = [bc+cd ab fg+gh ef] [jk+kl ij no+op mn] as i32
+ *xy_sum_32 = _mm256_add_epi32(*xy_sum_32, madd_xy);
+
+ // Permute control [3 2] [1 0] => [2 1] [0 0], 0b10010000 = 0x90
+ const __m256i perm = _mm256_permute4x64_epi64(slli, 0x90);
+ // perm = [g f e 0 k j i 0] [o n m 0 o n m 0] as i16
+
+ const __m256i madd_xz = _mm256_madd_epi16(slli, perm);
+ // madd_xz = [cg+bf ae gk+fj ei] [ko+jn im oo+nn mm] as i32
+ *xz_sum_32 = _mm256_add_epi32(*xz_sum_32, madd_xz);
+
+ // Sum every element in slli (and then also their squares)
+ const __m256i madd1_slli = _mm256_madd_epi16(slli, _mm256_set1_epi16(1));
+ // madd1_slli = [c+b a g+f e] [k+j i o+n m] as i32
+ *x_sum_32 = _mm256_add_epi32(*x_sum_32, madd1_slli);
+
+ const __m256i madd_slli = _mm256_madd_epi16(slli, slli);
+ // madd_slli = [cc+bb aa gg+ff ee] [kk+jj ii oo+nn mm] as i32
+ *x2_sum_32 = _mm256_add_epi32(*x2_sum_32, madd_slli);
+}
+
+void av1_get_horver_correlation_full_avx2(const int16_t *diff, int stride,
+ int width, int height, float *hcorr,
+ float *vcorr) {
+ // The following notation is used:
+ // x - current pixel
+ // y - right neighbour pixel
+ // z - below neighbour pixel
+ // w - down-right neighbour pixel
+ int64_t xy_sum = 0, xz_sum = 0;
+ int64_t x_sum = 0, x2_sum = 0;
+
+ // Process horizontal and vertical correlations through the body in 4x4
+ // blocks. This excludes the final row and column and possibly one extra
+  // column, depending on how 3 divides into width and height.
+ int32_t xy_xz_tmp[8] = { 0 }, x_x2_tmp[8] = { 0 };
+ __m256i xy_sum_32 = _mm256_setzero_si256();
+ __m256i xz_sum_32 = _mm256_setzero_si256();
+ __m256i x_sum_32 = _mm256_setzero_si256();
+ __m256i x2_sum_32 = _mm256_setzero_si256();
+ for (int i = 0; i <= height - 4; i += 3) {
+ for (int j = 0; j <= width - 4; j += 3) {
+ horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32,
+ &xz_sum_32, &x_sum_32, &x2_sum_32);
+ }
+ const __m256i hadd_xy_xz = _mm256_hadd_epi32(xy_sum_32, xz_sum_32);
+ // hadd_xy_xz = [ae+bf+cg ei+fj+gk ab+bc+cd ef+fg+gh]
+ // [im+jn+ko mm+nn+oo ij+jk+kl mn+no+op] as i32
+ yy_storeu_256(xy_xz_tmp, hadd_xy_xz);
+ xy_sum += (int64_t)xy_xz_tmp[5] + xy_xz_tmp[4] + xy_xz_tmp[1];
+ xz_sum += (int64_t)xy_xz_tmp[7] + xy_xz_tmp[6] + xy_xz_tmp[3];
+
+ const __m256i hadd_x_x2 = _mm256_hadd_epi32(x_sum_32, x2_sum_32);
+ // hadd_x_x2 = [aa+bb+cc ee+ff+gg a+b+c e+f+g]
+ // [ii+jj+kk mm+nn+oo i+j+k m+n+o] as i32
+ yy_storeu_256(x_x2_tmp, hadd_x_x2);
+ x_sum += (int64_t)x_x2_tmp[5] + x_x2_tmp[4] + x_x2_tmp[1];
+ x2_sum += (int64_t)x_x2_tmp[7] + x_x2_tmp[6] + x_x2_tmp[3];
+
+ xy_sum_32 = _mm256_setzero_si256();
+ xz_sum_32 = _mm256_setzero_si256();
+ x_sum_32 = _mm256_setzero_si256();
+ x2_sum_32 = _mm256_setzero_si256();
+ }
+
+ // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols
+ int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0;
+
+ // Do we have 2 rows remaining or just the one? Note that width and height
+ // are powers of 2, so each modulo 3 must be 1 or 2.
+ if (height % 3 == 1) { // Just horiz corrs on the final row
+ const int16_t x0 = diff[(height - 1) * stride];
+ x_sum += x0;
+ x_finalrow += x0;
+ x2_sum += x0 * x0;
+ x2_finalrow += x0 * x0;
+ for (int j = 0; j < width - 1; ++j) {
+ const int16_t x = diff[(height - 1) * stride + j];
+ const int16_t y = diff[(height - 1) * stride + j + 1];
+ xy_sum += x * y;
+ x_sum += y;
+ x2_sum += y * y;
+ x_finalrow += y;
+ x2_finalrow += y * y;
+ }
+ } else { // Two rows remaining to do
+ const int16_t x0 = diff[(height - 2) * stride];
+ const int16_t z0 = diff[(height - 1) * stride];
+ x_sum += x0 + z0;
+ x2_sum += x0 * x0 + z0 * z0;
+ x_finalrow += z0;
+ x2_finalrow += z0 * z0;
+ for (int j = 0; j < width - 1; ++j) {
+ const int16_t x = diff[(height - 2) * stride + j];
+ const int16_t y = diff[(height - 2) * stride + j + 1];
+ const int16_t z = diff[(height - 1) * stride + j];
+ const int16_t w = diff[(height - 1) * stride + j + 1];
+
+ // Horizontal and vertical correlations for the penultimate row:
+ xy_sum += x * y;
+ xz_sum += x * z;
+
+ // Now just horizontal correlations for the final row:
+ xy_sum += z * w;
+
+ x_sum += y + w;
+ x2_sum += y * y + w * w;
+ x_finalrow += w;
+ x2_finalrow += w * w;
+ }
+ }
+
+ // Do we have 2 columns remaining or just the one?
+ if (width % 3 == 1) { // Just vert corrs on the final col
+ const int16_t x0 = diff[width - 1];
+ x_sum += x0;
+ x_finalcol += x0;
+ x2_sum += x0 * x0;
+ x2_finalcol += x0 * x0;
+ for (int i = 0; i < height - 1; ++i) {
+ const int16_t x = diff[i * stride + width - 1];
+ const int16_t z = diff[(i + 1) * stride + width - 1];
+ xz_sum += x * z;
+ x_finalcol += z;
+ x2_finalcol += z * z;
+ // So the bottom-right elements don't get counted twice:
+ if (i < height - (height % 3 == 1 ? 2 : 3)) {
+ x_sum += z;
+ x2_sum += z * z;
+ }
+ }
+ } else { // Two cols remaining
+ const int16_t x0 = diff[width - 2];
+ const int16_t y0 = diff[width - 1];
+ x_sum += x0 + y0;
+ x2_sum += x0 * x0 + y0 * y0;
+ x_finalcol += y0;
+ x2_finalcol += y0 * y0;
+ for (int i = 0; i < height - 1; ++i) {
+ const int16_t x = diff[i * stride + width - 2];
+ const int16_t y = diff[i * stride + width - 1];
+ const int16_t z = diff[(i + 1) * stride + width - 2];
+ const int16_t w = diff[(i + 1) * stride + width - 1];
+
+ // Horizontal and vertical correlations for the penultimate col:
+ // Skip these on the last iteration of this loop if we also had two
+      // rows remaining; otherwise the final horizontal and vertical
+      // correlations get erroneously processed twice.
+ if (i < height - 2 || height % 3 == 1) {
+ xy_sum += x * y;
+ xz_sum += x * z;
+ }
+
+ x_finalcol += w;
+ x2_finalcol += w * w;
+ // So the bottom-right elements don't get counted twice:
+ if (i < height - (height % 3 == 1 ? 2 : 3)) {
+ x_sum += z + w;
+ x2_sum += z * z + w * w;
+ }
+
+ // Now just vertical correlations for the final column:
+ xz_sum += y * w;
+ }
+ }
+
+ // Calculate the simple sums and squared-sums
+ int64_t x_firstrow = 0, x_firstcol = 0;
+ int64_t x2_firstrow = 0, x2_firstcol = 0;
+
+ for (int j = 0; j < width; ++j) {
+ x_firstrow += diff[j];
+ x2_firstrow += diff[j] * diff[j];
+ }
+ for (int i = 0; i < height; ++i) {
+ x_firstcol += diff[i * stride];
+ x2_firstcol += diff[i * stride] * diff[i * stride];
+ }
+
+ int64_t xhor_sum = x_sum - x_finalcol;
+ int64_t xver_sum = x_sum - x_finalrow;
+ int64_t y_sum = x_sum - x_firstcol;
+ int64_t z_sum = x_sum - x_firstrow;
+ int64_t x2hor_sum = x2_sum - x2_finalcol;
+ int64_t x2ver_sum = x2_sum - x2_finalrow;
+ int64_t y2_sum = x2_sum - x2_firstcol;
+ int64_t z2_sum = x2_sum - x2_firstrow;
+
+ const float num_hor = (float)(height * (width - 1));
+ const float num_ver = (float)((height - 1) * width);
+
+ const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+ const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+
+ const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+ const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+
+ const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+ const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+ if (xhor_var_n > 0 && y_var_n > 0) {
+ *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+ *hcorr = *hcorr < 0 ? 0 : *hcorr;
+ } else {
+ *hcorr = 1.0;
+ }
+ if (xver_var_n > 0 && z_var_n > 0) {
+ *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+ *vcorr = *vcorr < 0 ? 0 : *vcorr;
+ } else {
+ *vcorr = 1.0;
+ }
+}
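+
+// For reference, the statistic computed above is the Pearson correlation
+// between each pixel x and its right neighbour y (hcorr), respectively its
+// below neighbour z (vcorr):
+//   corr = (sum(x*y) - sum(x)*sum(y)/n) /
+//          sqrt((sum(x^2) - sum(x)^2/n) * (sum(y^2) - sum(y)^2/n))
+// with n = height * (width - 1) horizontal pairs and n = (height - 1) *
+// width vertical pairs; negative results are clamped to 0, and degenerate
+// (zero-variance) cases fall back to 1.0.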
diff --git a/third_party/aom/av1/encoder/x86/rdopt_sse4.c b/third_party/aom/av1/encoder/x86/rdopt_sse4.c
new file mode 100644
index 0000000000..12ac146195
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/rdopt_sse4.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/rdopt.h"
+
+// Process horizontal and vertical correlations in a 4x4 block of pixels.
+// We actually use the 4x4 pixels to calculate correlations corresponding to
+// the top-left 3x3 pixels, so this function must be called with 1x1 overlap,
+// moving the window along/down by 3 pixels at a time.
+INLINE static void horver_correlation_4x4(const int16_t *diff, int stride,
+ __m128i *xy_sum_32,
+ __m128i *xz_sum_32, __m128i *x_sum_32,
+ __m128i *x2_sum_32) {
+ // Pixels in this 4x4 [ a b c d ]
+ // are referred to as: [ e f g h ]
+ // [ i j k l ]
+ // [ m n o p ]
+
+ const __m128i pixelsa = _mm_set_epi64x(*(int64_t *)&diff[0 * stride],
+ *(int64_t *)&diff[2 * stride]);
+ const __m128i pixelsb = _mm_set_epi64x(*(int64_t *)&diff[1 * stride],
+ *(int64_t *)&diff[3 * stride]);
+ // pixelsa = [d c b a l k j i] as i16
+ // pixelsb = [h g f e p o n m] as i16
+
+ const __m128i slli_a = _mm_slli_epi64(pixelsa, 16);
+ const __m128i slli_b = _mm_slli_epi64(pixelsb, 16);
+ // slli_a = [c b a 0 k j i 0] as i16
+ // slli_b = [g f e 0 o n m 0] as i16
+
+ const __m128i xy_madd_a = _mm_madd_epi16(pixelsa, slli_a);
+ const __m128i xy_madd_b = _mm_madd_epi16(pixelsb, slli_b);
+ // xy_madd_a = [bc+cd ab jk+kl ij] as i32
+ // xy_madd_b = [fg+gh ef no+op mn] as i32
+
+ const __m128i xy32 = _mm_hadd_epi32(xy_madd_b, xy_madd_a);
+ // xy32 = [ab+bc+cd ij+jk+kl ef+fg+gh mn+no+op] as i32
+ *xy_sum_32 = _mm_add_epi32(*xy_sum_32, xy32);
+
+ const __m128i xz_madd_a = _mm_madd_epi16(slli_a, slli_b);
+ // xz_madd_a = [bf+cg ae jn+ko im] i32
+
+ const __m128i swap_b = _mm_srli_si128(slli_b, 8);
+ // swap_b = [0 0 0 0 g f e 0] as i16
+ const __m128i xz_madd_b = _mm_madd_epi16(slli_a, swap_b);
+ // xz_madd_b = [0 0 gk+fj ei] i32
+
+ const __m128i xz32 = _mm_hadd_epi32(xz_madd_b, xz_madd_a);
+ // xz32 = [ae+bf+cg im+jn+ko 0 ei+fj+gk] i32
+ *xz_sum_32 = _mm_add_epi32(*xz_sum_32, xz32);
+
+ // Now calculate the straight sums, x_sum += a+b+c+e+f+g+i+j+k
+ // (sum up every element in slli_a and swap_b)
+ const __m128i sum_slli_a = _mm_hadd_epi16(slli_a, slli_a);
+ const __m128i sum_slli_a32 = _mm_cvtepi16_epi32(sum_slli_a);
+ // sum_slli_a32 = [c+b a k+j i] as i32
+ const __m128i swap_b32 = _mm_cvtepi16_epi32(swap_b);
+ // swap_b32 = [g f e 0] as i32
+ *x_sum_32 = _mm_add_epi32(*x_sum_32, sum_slli_a32);
+ *x_sum_32 = _mm_add_epi32(*x_sum_32, swap_b32);
+ // sum = [c+b+g a+f k+j+e i] as i32
+
+ // Also sum their squares
+ const __m128i slli_a_2 = _mm_madd_epi16(slli_a, slli_a);
+ const __m128i swap_b_2 = _mm_madd_epi16(swap_b, swap_b);
+ // slli_a_2 = [c2+b2 a2 k2+j2 i2]
+ // swap_b_2 = [0 0 g2+f2 e2]
+ const __m128i sum2 = _mm_hadd_epi32(slli_a_2, swap_b_2);
+ // sum2 = [0 g2+f2+e2 c2+b2+a2 k2+j2+i2]
+ *x2_sum_32 = _mm_add_epi32(*x2_sum_32, sum2);
+}
+
+void av1_get_horver_correlation_full_sse4_1(const int16_t *diff, int stride,
+ int width, int height, float *hcorr,
+ float *vcorr) {
+ // The following notation is used:
+ // x - current pixel
+ // y - right neighbour pixel
+ // z - below neighbour pixel
+ // w - down-right neighbour pixel
+ int64_t xy_sum = 0, xz_sum = 0;
+ int64_t x_sum = 0, x2_sum = 0;
+
+ // Process horizontal and vertical correlations through the body in 4x4
+ // blocks. This excludes the final row and column and possibly one extra
+  // column, depending on how 3 divides into width and height.
+ int32_t xy_tmp[4] = { 0 }, xz_tmp[4] = { 0 };
+ int32_t x_tmp[4] = { 0 }, x2_tmp[4] = { 0 };
+ __m128i xy_sum_32 = _mm_setzero_si128();
+ __m128i xz_sum_32 = _mm_setzero_si128();
+ __m128i x_sum_32 = _mm_setzero_si128();
+ __m128i x2_sum_32 = _mm_setzero_si128();
+ for (int i = 0; i <= height - 4; i += 3) {
+ for (int j = 0; j <= width - 4; j += 3) {
+ horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32,
+ &xz_sum_32, &x_sum_32, &x2_sum_32);
+ }
+ xx_storeu_128(xy_tmp, xy_sum_32);
+ xx_storeu_128(xz_tmp, xz_sum_32);
+ xx_storeu_128(x_tmp, x_sum_32);
+ xx_storeu_128(x2_tmp, x2_sum_32);
+ xy_sum += (int64_t)xy_tmp[3] + xy_tmp[2] + xy_tmp[1];
+ xz_sum += (int64_t)xz_tmp[3] + xz_tmp[2] + xz_tmp[0];
+ x_sum += (int64_t)x_tmp[3] + x_tmp[2] + x_tmp[1] + x_tmp[0];
+ x2_sum += (int64_t)x2_tmp[2] + x2_tmp[1] + x2_tmp[0];
+ xy_sum_32 = _mm_setzero_si128();
+ xz_sum_32 = _mm_setzero_si128();
+ x_sum_32 = _mm_setzero_si128();
+ x2_sum_32 = _mm_setzero_si128();
+ }
+
+ // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols
+ int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0;
+
+ // Do we have 2 rows remaining or just the one? Note that width and height
+ // are powers of 2, so each modulo 3 must be 1 or 2.
+ if (height % 3 == 1) { // Just horiz corrs on the final row
+ const int16_t x0 = diff[(height - 1) * stride];
+ x_sum += x0;
+ x_finalrow += x0;
+ x2_sum += x0 * x0;
+ x2_finalrow += x0 * x0;
+ for (int j = 0; j < width - 1; ++j) {
+ const int16_t x = diff[(height - 1) * stride + j];
+ const int16_t y = diff[(height - 1) * stride + j + 1];
+ xy_sum += x * y;
+ x_sum += y;
+ x2_sum += y * y;
+ x_finalrow += y;
+ x2_finalrow += y * y;
+ }
+ } else { // Two rows remaining to do
+ const int16_t x0 = diff[(height - 2) * stride];
+ const int16_t z0 = diff[(height - 1) * stride];
+ x_sum += x0 + z0;
+ x2_sum += x0 * x0 + z0 * z0;
+ x_finalrow += z0;
+ x2_finalrow += z0 * z0;
+ for (int j = 0; j < width - 1; ++j) {
+ const int16_t x = diff[(height - 2) * stride + j];
+ const int16_t y = diff[(height - 2) * stride + j + 1];
+ const int16_t z = diff[(height - 1) * stride + j];
+ const int16_t w = diff[(height - 1) * stride + j + 1];
+
+ // Horizontal and vertical correlations for the penultimate row:
+ xy_sum += x * y;
+ xz_sum += x * z;
+
+ // Now just horizontal correlations for the final row:
+ xy_sum += z * w;
+
+ x_sum += y + w;
+ x2_sum += y * y + w * w;
+ x_finalrow += w;
+ x2_finalrow += w * w;
+ }
+ }
+
+ // Do we have 2 columns remaining or just the one?
+ if (width % 3 == 1) { // Just vert corrs on the final col
+ const int16_t x0 = diff[width - 1];
+ x_sum += x0;
+ x_finalcol += x0;
+ x2_sum += x0 * x0;
+ x2_finalcol += x0 * x0;
+ for (int i = 0; i < height - 1; ++i) {
+ const int16_t x = diff[i * stride + width - 1];
+ const int16_t z = diff[(i + 1) * stride + width - 1];
+ xz_sum += x * z;
+ x_finalcol += z;
+ x2_finalcol += z * z;
+ // So the bottom-right elements don't get counted twice:
+ if (i < height - (height % 3 == 1 ? 2 : 3)) {
+ x_sum += z;
+ x2_sum += z * z;
+ }
+ }
+ } else { // Two cols remaining
+ const int16_t x0 = diff[width - 2];
+ const int16_t y0 = diff[width - 1];
+ x_sum += x0 + y0;
+ x2_sum += x0 * x0 + y0 * y0;
+ x_finalcol += y0;
+ x2_finalcol += y0 * y0;
+ for (int i = 0; i < height - 1; ++i) {
+ const int16_t x = diff[i * stride + width - 2];
+ const int16_t y = diff[i * stride + width - 1];
+ const int16_t z = diff[(i + 1) * stride + width - 2];
+ const int16_t w = diff[(i + 1) * stride + width - 1];
+
+ // Horizontal and vertical correlations for the penultimate col:
+ // Skip these on the last iteration of this loop if we also had two
+      // rows remaining; otherwise the final horizontal and vertical
+      // correlations get erroneously processed twice.
+ if (i < height - 2 || height % 3 == 1) {
+ xy_sum += x * y;
+ xz_sum += x * z;
+ }
+
+ x_finalcol += w;
+ x2_finalcol += w * w;
+ // So the bottom-right elements don't get counted twice:
+ if (i < height - (height % 3 == 1 ? 2 : 3)) {
+ x_sum += z + w;
+ x2_sum += z * z + w * w;
+ }
+
+ // Now just vertical correlations for the final column:
+ xz_sum += y * w;
+ }
+ }
+
+ // Calculate the simple sums and squared-sums
+ int64_t x_firstrow = 0, x_firstcol = 0;
+ int64_t x2_firstrow = 0, x2_firstcol = 0;
+
+ for (int j = 0; j < width; ++j) {
+ x_firstrow += diff[j];
+ x2_firstrow += diff[j] * diff[j];
+ }
+ for (int i = 0; i < height; ++i) {
+ x_firstcol += diff[i * stride];
+ x2_firstcol += diff[i * stride] * diff[i * stride];
+ }
+
+ int64_t xhor_sum = x_sum - x_finalcol;
+ int64_t xver_sum = x_sum - x_finalrow;
+ int64_t y_sum = x_sum - x_firstcol;
+ int64_t z_sum = x_sum - x_firstrow;
+ int64_t x2hor_sum = x2_sum - x2_finalcol;
+ int64_t x2ver_sum = x2_sum - x2_finalrow;
+ int64_t y2_sum = x2_sum - x2_firstcol;
+ int64_t z2_sum = x2_sum - x2_firstrow;
+
+ const float num_hor = (float)(height * (width - 1));
+ const float num_ver = (float)((height - 1) * width);
+
+ const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+ const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+
+ const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+ const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+
+ const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+ const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+ if (xhor_var_n > 0 && y_var_n > 0) {
+ *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+ *hcorr = *hcorr < 0 ? 0 : *hcorr;
+ } else {
+ *hcorr = 1.0;
+ }
+ if (xver_var_n > 0 && z_var_n > 0) {
+ *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+ *vcorr = *vcorr < 0 ? 0 : *vcorr;
+ } else {
+ *vcorr = 1.0;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c b/third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c
new file mode 100644
index 0000000000..a492483721
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/reconinter_enc_sse2.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/reconinter_enc.h"
+
+void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref, int ref_stride,
+ int subpel_search) {
+ // expect xd == NULL only in tests
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+ int plane = 0;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+ InterPredParams inter_pred_params;
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ const int_interpfilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ av1_init_inter_params(
+ &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+ av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+ &inter_pred_params);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter = av1_get_filter(subpel_search);
+ // (TODO:yunqing) 2-tap case uses 4-tap functions since there is no SIMD for
+ // 2-tap yet.
+ int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
+
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ if (width >= 16) {
+ int i;
+ assert(!(width & 15));
+ /*Read 16 pixels one row at a time.*/
+ for (i = 0; i < height; i++) {
+ int j;
+ for (j = 0; j < width; j += 16) {
+ xx_storeu_128(comp_pred, xx_loadu_128(ref));
+ comp_pred += 16;
+ ref += 16;
+ }
+ ref += ref_stride - width;
+ }
+ } else if (width >= 8) {
+ int i;
+ assert(!(width & 7));
+ assert(!(height & 1));
+ /*Read 8 pixels two rows at a time.*/
+ for (i = 0; i < height; i += 2) {
+ __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
+ __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
+ xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
+ comp_pred += 16;
+ ref += 2 * ref_stride;
+ }
+ } else {
+ int i;
+ assert(!(width & 3));
+ assert(!(height & 3));
+ /*Read 4 pixels four rows at a time.*/
+      for (i = 0; i < height; i += 4) {
+ const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
+ const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
+ const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
+ const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
+ const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
+ _mm_unpacklo_epi32(row2, row3));
+ xx_storeu_128(comp_pred, reg);
+ comp_pred += 16;
+ ref += 4 * ref_stride;
+ }
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
+ width, height);
+ } else if (!subpel_x_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
+ width, height);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t,
+ temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
+ uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
+ ? temp + (filter_taps >> 1) * MAX_SB_SIZE
+ : temp;
+ uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+ int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
+ assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
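+    // e.g. height = 32, subpel_y_q3 = 4 and filter_taps = 8 give
+    // ((31 * 8 + 4) >> 3) + 8 = 31 + 8 = 39 intermediate rows.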
+ aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
+ kernel_x, 16, NULL, -1, width, intermediate_height);
+ aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
+ kernel_y, 16, width, height);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1,
+ const __m128i *w0,
+ const __m128i *w1,
+ const __m128i *r,
+ void *const result) {
+ assert(DIST_PRECISION_BITS <= 4);
+ __m128i mult0 = _mm_mullo_epi16(*p0, *w0);
+ __m128i mult1 = _mm_mullo_epi16(*p1, *w1);
+ __m128i sum = _mm_adds_epu16(mult0, mult1);
+ __m128i round = _mm_adds_epu16(sum, *r);
+ __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS);
+
+ xx_storeu_128(result, shift);
+}
+
+void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
+ const struct AV1Common *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred8, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref8, int ref_stride, int bd,
+ int subpel_search) {
+ // expect xd == NULL only in tests
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+ int plane = 0;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+ InterPredParams inter_pred_params;
+ inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+ const int_interpfilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+ av1_init_inter_params(
+ &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+ mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+ xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+ av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+ &inter_pred_params);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter = av1_get_filter(subpel_search);
+ int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ if (width >= 8) {
+ int i;
+ assert(!(width & 7));
+ /*Read 8 pixels one row at a time.*/
+ for (i = 0; i < height; i++) {
+ int j;
+ for (j = 0; j < width; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+ _mm_storeu_si128((__m128i *)comp_pred, s0);
+ comp_pred += 8;
+ ref += 8;
+ }
+ ref += ref_stride - width;
+ }
+ } else {
+ int i;
+ assert(!(width & 3));
+ /*Read 4 pixels two rows at a time.*/
+ for (i = 0; i < height; i += 2) {
+ __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
+ __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
+ __m128i t0 = _mm_unpacklo_epi64(s0, s1);
+ _mm_storeu_si128((__m128i *)comp_pred, t0);
+ comp_pred += 8;
+ ref += 2 * ref_stride;
+ }
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
+ NULL, -1, width, height, bd);
+ } else if (!subpel_x_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
+ kernel, 16, width, height, bd);
+ } else {
+ DECLARE_ALIGNED(16, uint16_t,
+                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1);
+ uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
+ ? temp + (filter_taps >> 1) * MAX_SB_SIZE
+ : temp;
+ uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+ const int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
+ assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+ aom_highbd_convolve8_horiz(
+ ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz),
+ MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd);
+ aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE,
+ comp_pred8, width, NULL, -1, kernel_y, 16, width,
+ height, bd);
+ }
+}
+
+void aom_highbd_comp_avg_upsampled_pred_sse2(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, int subpel_search) {
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+ bd, subpel_search);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
+ /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
+ assert(!(width * height & 7));
+ int n = width * height >> 3;
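+  // _mm_avg_epu16 averages with rounding: (a + b + 1) >> 1 in each u16 lane.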
+ for (int i = 0; i < n; i++) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16);
+ __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+ _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0));
+ comp_pred16 += 8;
+ pred += 8;
+ }
+}
+
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+ int subpel_search) {
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ int n;
+ int i;
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+ bd, subpel_search);
+ assert(!(width * height & 7));
+ n = width * height >> 3;
+
+ const int16_t wt0 = (int16_t)jcp_param->fwd_offset;
+ const int16_t wt1 = (int16_t)jcp_param->bck_offset;
+ const __m128i w0 = _mm_set1_epi16(wt0);
+ const __m128i w1 = _mm_set1_epi16(wt1);
+ const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r = _mm_set1_epi16(round);
+
+ uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
+ for (i = 0; i < n; i++) {
+ __m128i p0 = xx_loadu_128(comp_pred16);
+ __m128i p1 = xx_loadu_128(pred);
+
+ highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
+
+ comp_pred16 += 8;
+ pred += 8;
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_comp_avg_upsampled_pred_sse2(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, int subpel_search) {
+ int n;
+ int i;
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
+ /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
+ assert(!(width * height & 15));
+ n = width * height >> 4;
+ for (i = 0; i < n; i++) {
+ __m128i s0 = xx_loadu_128(comp_pred);
+ __m128i p0 = xx_loadu_128(pred);
+ xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
+ comp_pred += 16;
+ pred += 16;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c b/third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c
new file mode 100644
index 0000000000..df7aa95855
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/reconinter_enc_ssse3.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1,
+ const __m128i *w, const __m128i *r,
+ void *const result) {
+ __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
+ __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
+ __m128i round_lo = _mm_add_epi16(mult_lo, *r);
+ __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS);
+
+ __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1);
+ __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w);
+ __m128i round_hi = _mm_add_epi16(mult_hi, *r);
+ __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS);
+
+ xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
+}
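+
+// Scalar form of the helper above, for reference: each output pixel is
+//   (p0 * fwd_offset + p1 * bck_offset + round) >> DIST_PRECISION_BITS
+// with round = (1 << DIST_PRECISION_BITS) >> 1; unpacking p0 and p1 into
+// interleaved bytes lets one maddubs apply both weights per pixel pair.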
+
+void aom_dist_wtd_comp_avg_upsampled_pred_ssse3(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
+ int n;
+ int i;
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
+ /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
+ assert(!(width * height & 15));
+ n = width * height >> 4;
+
+ const int8_t w0 = (int8_t)jcp_param->fwd_offset;
+ const int8_t w1 = (int8_t)jcp_param->bck_offset;
+ const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+ w1, w0, w1, w0);
+ const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r = _mm_set1_epi16(round);
+
+ for (i = 0; i < n; i++) {
+ __m128i p0 = xx_loadu_128(comp_pred);
+ __m128i p1 = xx_loadu_128(pred);
+
+ compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
+
+ comp_pred += 16;
+ pred += 16;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_avx2.c b/third_party/aom/av1/encoder/x86/temporal_filter_avx2.c
new file mode 100644
index 0000000000..752d6f3f0b
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/temporal_filter_avx2.c
@@ -0,0 +1,647 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
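+// Keep a padding of 2 samples at the end of each squared error row.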
+#define SSE_STRIDE (BW + 2)
+
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = {
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 },
+ { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 },
+ { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 },
+ { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }
+};
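+// Each mask row above keeps 5 consecutive 32-bit lanes out of 8, one row per
+// column phase, matching the 5-tap window summed in xx_mask_and_hadd().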
+
+DECLARE_ALIGNED(32, static const uint8_t, shufflemask_16b[2][16]) = {
+ { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 11, 10, 11 }
+};
+
+#define CALC_X_GRADIENT(AC, GI, DF, out) \
+ out = _mm256_abs_epi16( \
+ _mm256_add_epi16(_mm256_add_epi16(AC, GI), _mm256_slli_epi16(DF, 1)));
+
+#define CALC_Y_GRADIENT(AC, GI, BH, out) \
+ out = _mm256_abs_epi16( \
+ _mm256_add_epi16(_mm256_sub_epi16(AC, GI), _mm256_slli_epi16(BH, 1)));
+
+double av1_estimate_noise_from_single_plane_avx2(const uint8_t *src, int height,
+ int width, int stride,
+ int edge_thresh) {
+ int count = 0;
+ int64_t accum = 0;
+  // w32 is (width - 1) rounded down to a multiple of 32.
+ const int w32 = (width - 1) & ~0x1f;
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i edge_threshold = _mm256_set1_epi16(edge_thresh);
+ __m256i num_accumulator = zero;
+ __m256i sum_accumulator = zero;
+
+ // A | B | C
+ // D | E | F
+ // G | H | I
+ // g_x = (A - C) + (G - I) + 2*(D - F)
+ // g_y = (A + C) - (G + I) + 2*(B - H)
+ // v = 4*E - 2*(D+F+B+H) + (A+C+G+I)
+
+  // Process the multiple-of-32 portion of the width here.
+ for (int w = 1; w < w32; w += 32) {
+ int h = 1;
+ const int start_idx = h * stride + w;
+ const int stride_0 = start_idx - stride;
+
+ __m256i num_accum_row_lvl = zero;
+ const __m256i A = _mm256_loadu_si256((__m256i *)(&src[stride_0 - 1]));
+ const __m256i C = _mm256_loadu_si256((__m256i *)(&src[stride_0 + 1]));
+ const __m256i D = _mm256_loadu_si256((__m256i *)(&src[start_idx - 1]));
+ const __m256i F = _mm256_loadu_si256((__m256i *)(&src[start_idx + 1]));
+ __m256i B = _mm256_loadu_si256((__m256i *)(&src[stride_0]));
+ __m256i E = _mm256_loadu_si256((__m256i *)(&src[start_idx]));
+
+ const __m256i A_lo = _mm256_unpacklo_epi8(A, zero);
+ const __m256i A_hi = _mm256_unpackhi_epi8(A, zero);
+ const __m256i C_lo = _mm256_unpacklo_epi8(C, zero);
+ const __m256i C_hi = _mm256_unpackhi_epi8(C, zero);
+ const __m256i D_lo = _mm256_unpacklo_epi8(D, zero);
+ const __m256i D_hi = _mm256_unpackhi_epi8(D, zero);
+ const __m256i F_lo = _mm256_unpacklo_epi8(F, zero);
+ const __m256i F_hi = _mm256_unpackhi_epi8(F, zero);
+
+ __m256i sub_AC_lo = _mm256_sub_epi16(A_lo, C_lo);
+ __m256i sub_AC_hi = _mm256_sub_epi16(A_hi, C_hi);
+ __m256i sum_AC_lo = _mm256_add_epi16(A_lo, C_lo);
+ __m256i sum_AC_hi = _mm256_add_epi16(A_hi, C_hi);
+ __m256i sub_DF_lo = _mm256_sub_epi16(D_lo, F_lo);
+ __m256i sub_DF_hi = _mm256_sub_epi16(D_hi, F_hi);
+ __m256i sum_DF_lo = _mm256_add_epi16(D_lo, F_lo);
+ __m256i sum_DF_hi = _mm256_add_epi16(D_hi, F_hi);
+
+ for (; h < height - 1; h++) {
+ __m256i sum_GI_lo, sub_GI_lo, sum_GI_hi, sub_GI_hi, gx_lo, gy_lo, gx_hi,
+ gy_hi;
+ const int k = h * stride + w;
+ const __m256i G = _mm256_loadu_si256((__m256i *)(&src[k + stride - 1]));
+ const __m256i H = _mm256_loadu_si256((__m256i *)(&src[k + stride]));
+ const __m256i I = _mm256_loadu_si256((__m256i *)(&src[k + stride + 1]));
+
+ const __m256i B_lo = _mm256_unpacklo_epi8(B, zero);
+ const __m256i B_hi = _mm256_unpackhi_epi8(B, zero);
+ const __m256i G_lo = _mm256_unpacklo_epi8(G, zero);
+ const __m256i G_hi = _mm256_unpackhi_epi8(G, zero);
+ const __m256i I_lo = _mm256_unpacklo_epi8(I, zero);
+ const __m256i I_hi = _mm256_unpackhi_epi8(I, zero);
+ const __m256i H_lo = _mm256_unpacklo_epi8(H, zero);
+ const __m256i H_hi = _mm256_unpackhi_epi8(H, zero);
+
+ sub_GI_lo = _mm256_sub_epi16(G_lo, I_lo);
+ sub_GI_hi = _mm256_sub_epi16(G_hi, I_hi);
+ sum_GI_lo = _mm256_add_epi16(G_lo, I_lo);
+ sum_GI_hi = _mm256_add_epi16(G_hi, I_hi);
+ const __m256i sub_BH_lo = _mm256_sub_epi16(B_lo, H_lo);
+ const __m256i sub_BH_hi = _mm256_sub_epi16(B_hi, H_hi);
+
+ CALC_X_GRADIENT(sub_AC_lo, sub_GI_lo, sub_DF_lo, gx_lo)
+ CALC_Y_GRADIENT(sum_AC_lo, sum_GI_lo, sub_BH_lo, gy_lo)
+
+ const __m256i ga_lo = _mm256_add_epi16(gx_lo, gy_lo);
+
+ CALC_X_GRADIENT(sub_AC_hi, sub_GI_hi, sub_DF_hi, gx_hi)
+ CALC_Y_GRADIENT(sum_AC_hi, sum_GI_hi, sub_BH_hi, gy_hi)
+
+ const __m256i ga_hi = _mm256_add_epi16(gx_hi, gy_hi);
+
+ __m256i cmp_lo = _mm256_cmpgt_epi16(edge_threshold, ga_lo);
+ __m256i cmp_hi = _mm256_cmpgt_epi16(edge_threshold, ga_hi);
+ const __m256i comp_reg = _mm256_add_epi16(cmp_lo, cmp_hi);
+
+ // v = 4*E -2*(D+F+B+H) + (A+C+G+I)
+ if (_mm256_movemask_epi8(comp_reg) != 0) {
+ const __m256i sum_BH_lo = _mm256_add_epi16(B_lo, H_lo);
+ const __m256i sum_BH_hi = _mm256_add_epi16(B_hi, H_hi);
+
+ // 2*(D+F+B+H)
+ const __m256i sum_DFBH_lo =
+ _mm256_slli_epi16(_mm256_add_epi16(sum_DF_lo, sum_BH_lo), 1);
+ // (A+C+G+I)
+ const __m256i sum_ACGI_lo = _mm256_add_epi16(sum_AC_lo, sum_GI_lo);
+ const __m256i sum_DFBH_hi =
+ _mm256_slli_epi16(_mm256_add_epi16(sum_DF_hi, sum_BH_hi), 1);
+ const __m256i sum_ACGI_hi = _mm256_add_epi16(sum_AC_hi, sum_GI_hi);
+
+ // Convert E register values from 8bit to 16bit
+ const __m256i E_lo = _mm256_unpacklo_epi8(E, zero);
+ const __m256i E_hi = _mm256_unpackhi_epi8(E, zero);
+
+ // 4*E - 2*(D+F+B+H)+ (A+C+G+I)
+ const __m256i var_lo_0 = _mm256_abs_epi16(_mm256_add_epi16(
+ _mm256_sub_epi16(_mm256_slli_epi16(E_lo, 2), sum_DFBH_lo),
+ sum_ACGI_lo));
+ const __m256i var_hi_0 = _mm256_abs_epi16(_mm256_add_epi16(
+ _mm256_sub_epi16(_mm256_slli_epi16(E_hi, 2), sum_DFBH_hi),
+ sum_ACGI_hi));
+ cmp_lo = _mm256_srli_epi16(cmp_lo, 15);
+ cmp_hi = _mm256_srli_epi16(cmp_hi, 15);
+ const __m256i var_lo = _mm256_mullo_epi16(var_lo_0, cmp_lo);
+ const __m256i var_hi = _mm256_mullo_epi16(var_hi_0, cmp_hi);
+
+ num_accum_row_lvl = _mm256_add_epi16(num_accum_row_lvl, cmp_lo);
+ num_accum_row_lvl = _mm256_add_epi16(num_accum_row_lvl, cmp_hi);
+
+ sum_accumulator = _mm256_add_epi32(sum_accumulator,
+ _mm256_unpacklo_epi16(var_lo, zero));
+ sum_accumulator = _mm256_add_epi32(sum_accumulator,
+ _mm256_unpackhi_epi16(var_lo, zero));
+ sum_accumulator = _mm256_add_epi32(sum_accumulator,
+ _mm256_unpacklo_epi16(var_hi, zero));
+ sum_accumulator = _mm256_add_epi32(sum_accumulator,
+ _mm256_unpackhi_epi16(var_hi, zero));
+ }
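+      // Slide the 3x3 window down one row for the next iteration, reusing
+      // the row sums and differences computed above.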
+ sub_AC_lo = sub_DF_lo;
+ sub_AC_hi = sub_DF_hi;
+ sub_DF_lo = sub_GI_lo;
+ sub_DF_hi = sub_GI_hi;
+ sum_AC_lo = sum_DF_lo;
+ sum_AC_hi = sum_DF_hi;
+ sum_DF_lo = sum_GI_lo;
+ sum_DF_hi = sum_GI_hi;
+ B = E;
+ E = H;
+ }
+ const __m256i num_0 = _mm256_unpacklo_epi16(num_accum_row_lvl, zero);
+ const __m256i num_1 = _mm256_unpackhi_epi16(num_accum_row_lvl, zero);
+ num_accumulator =
+ _mm256_add_epi32(num_accumulator, _mm256_add_epi32(num_0, num_1));
+ }
+
+ // Process the remaining width here.
+ for (int h = 1; h < height - 1; ++h) {
+ for (int w = w32 + 1; w < width - 1; ++w) {
+ const int k = h * stride + w;
+
+ // Compute sobel gradients
+ const int g_x = (src[k - stride - 1] - src[k - stride + 1]) +
+ (src[k + stride - 1] - src[k + stride + 1]) +
+ 2 * (src[k - 1] - src[k + 1]);
+ const int g_y = (src[k - stride - 1] - src[k + stride - 1]) +
+ (src[k - stride + 1] - src[k + stride + 1]) +
+ 2 * (src[k - stride] - src[k + stride]);
+ const int ga = abs(g_x) + abs(g_y);
+
+ if (ga < edge_thresh) {
+ // Find Laplacian
+ const int v =
+ 4 * src[k] -
+ 2 * (src[k - 1] + src[k + 1] + src[k - stride] + src[k + stride]) +
+ (src[k - stride - 1] + src[k - stride + 1] + src[k + stride - 1] +
+ src[k + stride + 1]);
+ accum += abs(v);
+ ++count;
+ }
+ }
+ }
+
+ // s0 s1 n0 n1 s2 s3 n2 n3
+ __m256i sum_avx = _mm256_hadd_epi32(sum_accumulator, num_accumulator);
+ __m128i sum_avx_lo = _mm256_castsi256_si128(sum_avx);
+ __m128i sum_avx_hi = _mm256_extractf128_si256(sum_avx, 1);
+ // s0+s2 s1+s3 n0+n2 n1+n3
+ __m128i sum_avx_1 = _mm_add_epi32(sum_avx_lo, sum_avx_hi);
+ // s0+s2+s1+s3 n0+n2+n1+n3
+ __m128i result = _mm_add_epi32(_mm_srli_si128(sum_avx_1, 4), sum_avx_1);
+
+ accum += _mm_cvtsi128_si32(result);
+ count += _mm_extract_epi32(result, 2);
+
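+  // The normalization below follows Immerkaer's fast noise estimation for a
+  // 3x3 Laplacian mask: sigma ~= SQRT_PI_BY_2 * mean(|v|) / 6.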
+ // If very few smooth pels, return -1 since the estimate is unreliable.
+ return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
+}
+
+static AOM_FORCE_INLINE void get_squared_error_16x16_avx2(
+ const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ uint16_t *frame_sse, const unsigned int sse_stride) {
+ (void)block_width;
+ const uint8_t *src1 = frame1;
+ const uint8_t *src2 = frame2;
+ uint16_t *dst = frame_sse;
+ for (int i = 0; i < block_height; i++) {
+ __m128i vf1_128, vf2_128;
+ __m256i vf1, vf2, vdiff1, vsqdiff1;
+
+ vf1_128 = _mm_loadu_si128((__m128i *)(src1));
+ vf2_128 = _mm_loadu_si128((__m128i *)(src2));
+ vf1 = _mm256_cvtepu8_epi16(vf1_128);
+ vf2 = _mm256_cvtepu8_epi16(vf2_128);
+ vdiff1 = _mm256_sub_epi16(vf1, vf2);
+ vsqdiff1 = _mm256_mullo_epi16(vdiff1, vdiff1);
+
+ _mm256_storeu_si256((__m256i *)(dst), vsqdiff1);
+    // Zero the padding so later loads never read uninitialized memory.
+ *(int *)(dst + 16) = _mm_cvtsi128_si32(_mm_setzero_si128());
+
+    src1 += stride;
+    src2 += stride2;
+ dst += sse_stride;
+ }
+}
+
+static AOM_FORCE_INLINE void get_squared_error_32x32_avx2(
+ const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ uint16_t *frame_sse, const unsigned int sse_stride) {
+ (void)block_width;
+ const uint8_t *src1 = frame1;
+ const uint8_t *src2 = frame2;
+ uint16_t *dst = frame_sse;
+ for (int i = 0; i < block_height; i++) {
+ __m256i vsrc1, vsrc2, vmin, vmax, vdiff, vdiff1, vdiff2, vres1, vres2;
+
+ vsrc1 = _mm256_loadu_si256((__m256i *)src1);
+ vsrc2 = _mm256_loadu_si256((__m256i *)src2);
+ vmax = _mm256_max_epu8(vsrc1, vsrc2);
+ vmin = _mm256_min_epu8(vsrc1, vsrc2);
+ vdiff = _mm256_subs_epu8(vmax, vmin);
+
+ __m128i vtmp1 = _mm256_castsi256_si128(vdiff);
+ __m128i vtmp2 = _mm256_extracti128_si256(vdiff, 1);
+ vdiff1 = _mm256_cvtepu8_epi16(vtmp1);
+ vdiff2 = _mm256_cvtepu8_epi16(vtmp2);
+
+ vres1 = _mm256_mullo_epi16(vdiff1, vdiff1);
+ vres2 = _mm256_mullo_epi16(vdiff2, vdiff2);
+ _mm256_storeu_si256((__m256i *)(dst), vres1);
+ _mm256_storeu_si256((__m256i *)(dst + 16), vres2);
+    // Zero the padding so later loads never read uninitialized memory.
+ *(int *)(dst + 32) = _mm_cvtsi128_si32(_mm_setzero_si128());
+
+ src1 += stride;
+ src2 += stride2;
+ dst += sse_stride;
+ }
+}
+
+static AOM_FORCE_INLINE __m256i xx_load_and_pad(uint16_t *src, int col,
+ int block_width) {
+ __m128i v128tmp = _mm_loadu_si128((__m128i *)(src));
+ if (col == 0) {
+ // For the first column, replicate the first element twice to the left
+ v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[0]);
+ }
+ if (col == block_width - 4) {
+ // For the last column, replicate the last element twice to the right
+ v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[1]);
+ }
+ return _mm256_cvtepu16_epi32(v128tmp);
+}
+
+static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) {
+ // Mask the required 5 values inside the vector
+ __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]);
+ __m128i v128a, v128b;
+ // Extract 256b as two 128b registers A and B
+ v128a = _mm256_castsi256_si128(vtmp);
+ v128b = _mm256_extracti128_si256(vtmp, 1);
+ // A = [A0+B0, A1+B1, A2+B2, A3+B3]
+ v128a = _mm_add_epi32(v128a, v128b);
+ // B = [A2+B2, A3+B3, 0, 0]
+ v128b = _mm_srli_si128(v128a, 8);
+ // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
+ v128a = _mm_add_epi32(v128a, v128b);
+ // B = [A1+B1+A3+B3, 0, 0, 0]
+ v128b = _mm_srli_si128(v128a, 4);
+ // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
+ v128a = _mm_add_epi32(v128a, v128b);
+ return _mm_extract_epi32(v128a, 0);
+}
+
+// AVX2 implementation of approx_exp()
+static AOM_INLINE __m256 approx_exp_avx2(__m256 y) {
+#define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2)
+// Offset for the exponent according to the IEEE floating-point standard.
+#define B 127
+// Magic number that controls the accuracy of the approximation.
+#define C 60801
+ const __m256 multiplier = _mm256_set1_ps(A);
+ const __m256i offset = _mm256_set1_epi32(B * (1 << 23) - C);
+
+ y = _mm256_mul_ps(y, multiplier);
+ y = _mm256_castsi256_ps(_mm256_add_epi32(_mm256_cvttps_epi32(y), offset));
+ return y;
+#undef A
+#undef B
+#undef C
+}
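+
+// For reference, a scalar sketch of the same bit trick (assuming IEEE-754
+// single-precision floats; approx_exp(), used by the SSE2 version of this
+// filter, is the canonical scalar form):
+//   float approx_exp_scalar(float y) {
+//     union { float f; int32_t i; } u;
+//     u.i = (int32_t)(y * ((1 << 23) / 0.69314718056f)) +
+//           (127 * (1 << 23) - 60801);
+//     return u.f;  // Reinterpreting the bits yields approximately exp(y).
+//   }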
+
+static void apply_temporal_filter(
+ const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+ uint16_t *frame_sse, uint32_t *luma_sse_sum,
+ const double inv_num_ref_pixels, const double decay_factor,
+ const double inv_factor, const double weight_factor, double *d_factor,
+ int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_sse[BH][BW];
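+  // acc_5x5_sse[r][c] will hold the sum of squared errors over the 5x5
+  // window centered at (r, c), with border rows and columns replicated.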
+
+ if (block_width == 32) {
+ get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width,
+ block_height, frame_sse, SSE_STRIDE);
+ } else {
+ get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width,
+ block_height, frame_sse, SSE_STRIDE);
+ }
+
+ __m256i vsrc[5];
+
+ // Traverse 4 columns at a time
+ // First and last columns will require padding
+ for (int col = 0; col < block_width; col += 4) {
+ uint16_t *src = (col) ? frame_sse + col - 2 : frame_sse;
+
+ // Load and pad(for first and last col) 3 rows from the top
+ for (int i = 2; i < 5; i++) {
+ vsrc[i] = xx_load_and_pad(src, col, block_width);
+ src += SSE_STRIDE;
+ }
+
+ // Copy first row to first 2 vectors
+ vsrc[0] = vsrc[2];
+ vsrc[1] = vsrc[2];
+
+ for (int row = 0; row < block_height; row++) {
+ __m256i vsum = _mm256_setzero_si256();
+
+ // Add 5 consecutive rows
+ for (int i = 0; i < 5; i++) {
+ vsum = _mm256_add_epi32(vsum, vsrc[i]);
+ }
+
+ // Push all elements by one element to the top
+ for (int i = 0; i < 4; i++) {
+ vsrc[i] = vsrc[i + 1];
+ }
+
+ // Load next row to the last element
+ if (row <= block_height - 4) {
+ vsrc[4] = xx_load_and_pad(src, col, block_width);
+ src += SSE_STRIDE;
+ } else {
+ vsrc[4] = vsrc[3];
+ }
+
+ // Accumulate the sum horizontally
+ for (int i = 0; i < 4; i++) {
+ acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum, i);
+ }
+ }
+ }
+
+ double subblock_mses_scaled[4];
+ double d_factor_decayed[4];
+ for (int idx = 0; idx < 4; idx++) {
+ subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+ d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+ }
+ if (tf_wgt_calc_lvl == 0) {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ } else {
+ __m256d subblock_mses_reg[4];
+ __m256d d_factor_mul_n_decay_qr_invs[4];
+ const __m256 zero = _mm256_set1_ps(0.0f);
+ const __m256 point_five = _mm256_set1_ps(0.5f);
+ const __m256 seven = _mm256_set1_ps(7.0f);
+ const __m256d inv_num_ref_pixel_256bit = _mm256_set1_pd(inv_num_ref_pixels);
+ const __m256d weight_factor_256bit = _mm256_set1_pd(weight_factor);
+ const __m256 tf_weight_scale = _mm256_set1_ps((float)TF_WEIGHT_SCALE);
+ // Maintain registers to hold mse and d_factor at subblock level.
+ subblock_mses_reg[0] = _mm256_set1_pd(subblock_mses_scaled[0]);
+ subblock_mses_reg[1] = _mm256_set1_pd(subblock_mses_scaled[1]);
+ subblock_mses_reg[2] = _mm256_set1_pd(subblock_mses_scaled[2]);
+ subblock_mses_reg[3] = _mm256_set1_pd(subblock_mses_scaled[3]);
+ d_factor_mul_n_decay_qr_invs[0] = _mm256_set1_pd(d_factor_decayed[0]);
+ d_factor_mul_n_decay_qr_invs[1] = _mm256_set1_pd(d_factor_decayed[1]);
+ d_factor_mul_n_decay_qr_invs[2] = _mm256_set1_pd(d_factor_decayed[2]);
+ d_factor_mul_n_decay_qr_invs[3] = _mm256_set1_pd(d_factor_decayed[3]);
+
+ for (int i = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ uint32_t *luma_sse_sum_temp = luma_sse_sum + i * BW;
+ for (int j = 0; j < block_width; j += 8) {
+ const __m256i acc_sse =
+ _mm256_lddqu_si256((__m256i *)(acc_5x5_sse[i] + j));
+ const __m256i luma_sse =
+ _mm256_lddqu_si256((__m256i *)((luma_sse_sum_temp + j)));
+
+ // uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+ const __m256i diff_sse = _mm256_add_epi32(acc_sse, luma_sse);
+
+ const __m256d diff_sse_pd_1 =
+ _mm256_cvtepi32_pd(_mm256_castsi256_si128(diff_sse));
+ const __m256d diff_sse_pd_2 =
+ _mm256_cvtepi32_pd(_mm256_extracti128_si256(diff_sse, 1));
+
+ // const double window_error = diff_sse * inv_num_ref_pixels;
+ const __m256d window_error_1 =
+ _mm256_mul_pd(diff_sse_pd_1, inv_num_ref_pixel_256bit);
+ const __m256d window_error_2 =
+ _mm256_mul_pd(diff_sse_pd_2, inv_num_ref_pixel_256bit);
+
+        const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+ const __m256d blk_error = subblock_mses_reg[subblock_idx];
+
+ // const double combined_error =
+        //     weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+ const __m256d combined_error_1 = _mm256_add_pd(
+ _mm256_mul_pd(window_error_1, weight_factor_256bit), blk_error);
+
+ const __m256d combined_error_2 = _mm256_add_pd(
+ _mm256_mul_pd(window_error_2, weight_factor_256bit), blk_error);
+
+ // d_factor_decayed[subblock_idx]
+ const __m256d d_fact_mul_n_decay =
+ d_factor_mul_n_decay_qr_invs[subblock_idx];
+
+ // double scaled_error = combined_error *
+ // d_factor_decayed[subblock_idx];
+ const __m256d scaled_error_1 =
+ _mm256_mul_pd(combined_error_1, d_fact_mul_n_decay);
+ const __m256d scaled_error_2 =
+ _mm256_mul_pd(combined_error_2, d_fact_mul_n_decay);
+
+ const __m128 scaled_error_ps_1 = _mm256_cvtpd_ps(scaled_error_1);
+ const __m128 scaled_error_ps_2 = _mm256_cvtpd_ps(scaled_error_2);
+
+ const __m256 scaled_error_ps = _mm256_insertf128_ps(
+ _mm256_castps128_ps256(scaled_error_ps_1), scaled_error_ps_2, 0x1);
+
+ // scaled_error = AOMMIN(scaled_error, 7);
+ const __m256 scaled_diff_ps = _mm256_min_ps(scaled_error_ps, seven);
+ const __m256 minus_scaled_diff_ps = _mm256_sub_ps(zero, scaled_diff_ps);
+ // const int weight =
+        //     (int)(approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE + 0.5f);
+ const __m256 exp_result = approx_exp_avx2(minus_scaled_diff_ps);
+ const __m256 scale_weight_exp_result =
+ _mm256_mul_ps(exp_result, tf_weight_scale);
+ const __m256 round_result =
+ _mm256_add_ps(scale_weight_exp_result, point_five);
+ __m256i weights_in_32bit = _mm256_cvttps_epi32(round_result);
+
+ __m128i weights_in_16bit =
+ _mm_packus_epi32(_mm256_castsi256_si128(weights_in_32bit),
+ _mm256_extractf128_si256(weights_in_32bit, 0x1));
+
+ // count[k] += weight;
+ // accumulator[k] += weight * pixel_value;
+ const int stride_idx = i * stride2 + j;
+ const __m128i count_array =
+ _mm_loadu_si128((__m128i *)(count + stride_idx));
+ _mm_storeu_si128((__m128i *)(count + stride_idx),
+ _mm_add_epi16(count_array, weights_in_16bit));
+
+ const __m256i accumulator_array =
+ _mm256_loadu_si256((__m256i *)(accumulator + stride_idx));
+ const __m128i pred_values =
+ _mm_loadl_epi64((__m128i *)(frame2 + stride_idx));
+
+ const __m256i pred_values_u32 = _mm256_cvtepu8_epi32(pred_values);
+ const __m256i mull_frame2_weight_u32 =
+ _mm256_mullo_epi32(pred_values_u32, weights_in_32bit);
+ _mm256_storeu_si256(
+ (__m256i *)(accumulator + stride_idx),
+ _mm256_add_epi32(accumulator_array, mull_frame2_weight_u32));
+ }
+ }
+ }
+}
+
+void av1_apply_temporal_filter_avx2(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_32X32 &&
+         "Only 32x32 blocks are supported with avx2!");
+  assert(TF_WINDOW_LENGTH == 5 &&
+         "Only a window length of 5 is supported with avx2!");
+  assert(!is_high_bitdepth && "Only low bit-depth is supported with avx2!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // Filter the U and V planes using Y-plane information: motion search is
+    // performed only on the Y plane, so its statistics are more accurate.
+    // The luma SSE sum is reused for both chroma planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++, k++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx];
+ }
+ }
+ }
+ }
+ }
+
+ apply_temporal_filter(ref, frame_stride, pred + plane_offset, plane_w,
+ plane_w, plane_h, subblock_mses, accum + plane_offset,
+ count + plane_offset, frame_sse, luma_sse_sum,
+ inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl);
+ plane_offset += plane_h * plane_w;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_sse2.c b/third_party/aom/av1/encoder/x86/temporal_filter_sse2.c
new file mode 100644
index 0000000000..842d3b13c8
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/temporal_filter_sse2.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/mathutils.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+
+// For the squared error buffer, keep a padding of 4 samples
+#define SSE_STRIDE (BW + 4)
+
+DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = {
+ { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } },
+ { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } },
+ { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } },
+ { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } }
+};
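+// The squared error rows are stored with 2 columns of zero padding on each
+// side; each 5-tap window therefore spans two 128-bit registers, and the
+// mask pairs above select the required 5 values at each of the 4 phases.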
+
+static void get_squared_error(const uint8_t *frame1, const unsigned int stride,
+ const uint8_t *frame2, const unsigned int stride2,
+ const int block_width, const int block_height,
+ uint16_t *frame_sse,
+ const unsigned int dst_stride) {
+ const uint8_t *src1 = frame1;
+ const uint8_t *src2 = frame2;
+ uint16_t *dst = frame_sse;
+
+ for (int i = 0; i < block_height; i++) {
+ for (int j = 0; j < block_width; j += 16) {
+      // Zero the padding so later loads never read uninitialized memory.
+ *(int *)(dst) = _mm_cvtsi128_si32(_mm_setzero_si128());
+
+ __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j));
+ __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j));
+
+ __m128i vmax = _mm_max_epu8(vsrc1, vsrc2);
+ __m128i vmin = _mm_min_epu8(vsrc1, vsrc2);
+ __m128i vdiff = _mm_subs_epu8(vmax, vmin);
+
+ __m128i vzero = _mm_setzero_si128();
+ __m128i vdiff1 = _mm_unpacklo_epi8(vdiff, vzero);
+ __m128i vdiff2 = _mm_unpackhi_epi8(vdiff, vzero);
+
+ __m128i vres1 = _mm_mullo_epi16(vdiff1, vdiff1);
+ __m128i vres2 = _mm_mullo_epi16(vdiff2, vdiff2);
+
+ _mm_storeu_si128((__m128i *)(dst + j + 2), vres1);
+ _mm_storeu_si128((__m128i *)(dst + j + 10), vres2);
+ }
+
+    // Zero the padding so later loads never read uninitialized memory.
+ *(int *)(dst + block_width + 2) = _mm_cvtsi128_si32(_mm_setzero_si128());
+
+ src1 += stride;
+ src2 += stride2;
+ dst += dst_stride;
+ }
+}
+
+static void xx_load_and_pad(uint16_t *src, __m128i *dstvec, int col,
+ int block_width) {
+ __m128i vtmp = _mm_loadu_si128((__m128i *)src);
+ __m128i vzero = _mm_setzero_si128();
+ __m128i vtmp1 = _mm_unpacklo_epi16(vtmp, vzero);
+ __m128i vtmp2 = _mm_unpackhi_epi16(vtmp, vzero);
+ // For the first column, replicate the first element twice to the left
+ dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA);
+ // For the last column, replicate the last element twice to the right
+ dstvec[1] = (col < block_width - 4) ? vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54);
+}
+
+static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) {
+ __m128i veca, vecb;
+ // Mask and obtain the required 5 values inside the vector
+ veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]);
+ vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]);
+ // A = [A0+B0, A1+B1, A2+B2, A3+B3]
+ veca = _mm_add_epi32(veca, vecb);
+ // B = [A2+B2, A3+B3, 0, 0]
+ vecb = _mm_srli_si128(veca, 8);
+ // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X]
+ veca = _mm_add_epi32(veca, vecb);
+ // B = [A1+B1+A3+B3, 0, 0, 0]
+ vecb = _mm_srli_si128(veca, 4);
+ // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X]
+ veca = _mm_add_epi32(veca, vecb);
+ return _mm_cvtsi128_si32(veca);
+}
+
+static void apply_temporal_filter(
+ const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
+ const unsigned int stride2, const int block_width, const int block_height,
+ const int *subblock_mses, unsigned int *accumulator, uint16_t *count,
+ uint16_t *frame_sse, uint32_t *luma_sse_sum,
+ const double inv_num_ref_pixels, const double decay_factor,
+ const double inv_factor, const double weight_factor, double *d_factor,
+ int tf_wgt_calc_lvl) {
+ assert(((block_width == 16) || (block_width == 32)) &&
+ ((block_height == 16) || (block_height == 32)));
+
+ uint32_t acc_5x5_sse[BH][BW];
+
+ get_squared_error(frame1, stride, frame2, stride2, block_width, block_height,
+ frame_sse, SSE_STRIDE);
+
+ __m128i vsrc[5][2];
+
+ // Traverse 4 columns at a time
+ // First and last columns will require padding
+ for (int col = 0; col < block_width; col += 4) {
+ uint16_t *src = frame_sse + col;
+
+ // Load and pad(for first and last col) 3 rows from the top
+ for (int i = 2; i < 5; i++) {
+ xx_load_and_pad(src, vsrc[i], col, block_width);
+ src += SSE_STRIDE;
+ }
+
+ // Padding for top 2 rows
+ vsrc[0][0] = vsrc[2][0];
+ vsrc[0][1] = vsrc[2][1];
+ vsrc[1][0] = vsrc[2][0];
+ vsrc[1][1] = vsrc[2][1];
+
+ for (int row = 0; row < block_height; row++) {
+ __m128i vsum1 = _mm_setzero_si128();
+ __m128i vsum2 = _mm_setzero_si128();
+
+ // Add 5 consecutive rows
+ for (int i = 0; i < 5; i++) {
+ vsum1 = _mm_add_epi32(vsrc[i][0], vsum1);
+ vsum2 = _mm_add_epi32(vsrc[i][1], vsum2);
+ }
+
+ // Push all elements by one element to the top
+ for (int i = 0; i < 4; i++) {
+ vsrc[i][0] = vsrc[i + 1][0];
+ vsrc[i][1] = vsrc[i + 1][1];
+ }
+
+ if (row <= block_height - 4) {
+ // Load next row
+ xx_load_and_pad(src, vsrc[4], col, block_width);
+ src += SSE_STRIDE;
+ } else {
+ // Padding for bottom 2 rows
+ vsrc[4][0] = vsrc[3][0];
+ vsrc[4][1] = vsrc[3][1];
+ }
+
+ // Accumulate the sum horizontally
+ for (int i = 0; i < 4; i++) {
+ acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum1, vsum2, i);
+ }
+ }
+ }
+
+ double subblock_mses_scaled[4];
+ double d_factor_decayed[4];
+ for (int idx = 0; idx < 4; idx++) {
+ subblock_mses_scaled[idx] = subblock_mses[idx] * inv_factor;
+ d_factor_decayed[idx] = d_factor[idx] * decay_factor;
+ }
+ if (tf_wgt_calc_lvl == 0) {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
+
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ } else {
+ for (int i = 0, k = 0; i < block_height; i++) {
+ const int y_blk_raster_offset = (i >= block_height / 2) * 2;
+ for (int j = 0; j < block_width; j++, k++) {
+ const int pixel_value = frame2[i * stride2 + j];
+ uint32_t diff_sse = acc_5x5_sse[i][j] + luma_sse_sum[i * BW + j];
+
+ const double window_error = diff_sse * inv_num_ref_pixels;
+ const int subblock_idx = y_blk_raster_offset + (j >= block_width / 2);
+ const double combined_error =
+ weight_factor * window_error + subblock_mses_scaled[subblock_idx];
+
+ double scaled_error = combined_error * d_factor_decayed[subblock_idx];
+ scaled_error = AOMMIN(scaled_error, 7);
+ const float fweight =
+ approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
+ const int weight = iroundpf(fweight);
+ count[k] += weight;
+ accumulator[k] += weight * pixel_value;
+ }
+ }
+ }
+}
+
+void av1_apply_temporal_filter_sse2(
+ const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+ const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+ const int num_planes, const double *noise_levels, const MV *subblock_mvs,
+ const int *subblock_mses, const int q_factor, const int filter_strength,
+ int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
+ uint16_t *count) {
+ const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
+  assert(block_size == BLOCK_32X32 &&
+         "Only 32x32 blocks are supported with sse2!");
+  assert(TF_WINDOW_LENGTH == 5 &&
+         "Only a window length of 5 is supported with sse2!");
+  assert(!is_high_bitdepth && "Only low bit-depth is supported with sse2!");
+ assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+ (void)is_high_bitdepth;
+
+ const int mb_height = block_size_high[block_size];
+ const int mb_width = block_size_wide[block_size];
+ const int frame_height = frame_to_filter->y_crop_height;
+ const int frame_width = frame_to_filter->y_crop_width;
+ const int min_frame_size = AOMMIN(frame_height, frame_width);
+ // Variables to simplify combined error calculation.
+ const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
+ TF_SEARCH_ERROR_NORM_WEIGHT);
+ const double weight_factor =
+ (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
+ // Adjust filtering based on q.
+ // Larger q -> stronger filtering -> larger weight.
+ // Smaller q -> weaker filtering -> smaller weight.
+ double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
+ q_decay = CLIP(q_decay, 1e-5, 1);
+ if (q_factor >= TF_QINDEX_CUTOFF) {
+ // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+ // We do not need a clip here.
+ q_decay = 0.5 * pow((double)q_factor / 64, 2);
+ }
+ // Smaller strength -> smaller filtering weight.
+ double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
+ s_decay = CLIP(s_decay, 1e-5, 1);
+ double d_factor[4] = { 0 };
+ uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
+ uint32_t luma_sse_sum[BW * BH] = { 0 };
+
+ for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
+ // Larger motion vector -> smaller filtering weight.
+ const MV mv = subblock_mvs[subblock_idx];
+ const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+ double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
+ distance_threshold = AOMMAX(distance_threshold, 1);
+ d_factor[subblock_idx] = distance / distance_threshold;
+ d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
+ }
+
+ // Handle planes in sequence.
+ int plane_offset = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+ const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+ const uint32_t frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+ const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+
+ const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
+ const int ss_x_shift =
+ mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
+ const int ss_y_shift =
+ mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
+ const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
+ ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
+ const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
+ // Larger noise -> larger filtering weight.
+ const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+ // Decay factors for non-local mean approach.
+ const double decay_factor = 1 / (n_decay * q_decay * s_decay);
+
+    // Filter the U and V planes using Y-plane information: motion search is
+    // performed only on the Y plane, so its statistics are more accurate.
+    // The luma SSE sum is reused for both chroma planes.
+ if (plane == AOM_PLANE_U) {
+ for (unsigned int i = 0, k = 0; i < plane_h; i++) {
+ for (unsigned int j = 0; j < plane_w; j++, k++) {
+ for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+ for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+ const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane.
+ const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane.
+ luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
+ }
+ }
+ }
+ }
+ }
+
+ apply_temporal_filter(ref, frame_stride, pred + plane_offset, plane_w,
+ plane_w, plane_h, subblock_mses, accum + plane_offset,
+ count + plane_offset, frame_sse, luma_sse_sum,
+ inv_num_ref_pixels, decay_factor, inv_factor,
+ weight_factor, d_factor, tf_wgt_calc_lvl);
+ plane_offset += plane_h * plane_w;
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
new file mode 100644
index 0000000000..9cde860534
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c
+ */
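+// A scalar sketch of the per-pixel computation vectorized below (see the
+// reference function named above):
+//   int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
+//   t = clamp(t, INT16_MIN, INT16_MAX);  // packs_epi32 does this saturation.
+//   csse += (uint64_t)(t * t);
+// followed by ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS).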
+uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ int n = -N;
+
+ uint64_t csse;
+
+ const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE);
+ const __m256i v_zext_q = yy_set1_64_from_32i(~0);
+
+ __m256i v_acc0_q = _mm256_setzero_si256();
+
+ assert(N % 64 == 0);
+
+ r1 += N;
+ d += N;
+ m += N;
+
+ do {
+ const __m256i v_r0_w = _mm256_lddqu_si256((__m256i *)(r1 + n));
+ const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(d + n));
+ const __m128i v_m01_b = _mm_lddqu_si128((__m128i *)(m + n));
+
+ const __m256i v_rd0l_w = _mm256_unpacklo_epi16(v_d0_w, v_r0_w);
+ const __m256i v_rd0h_w = _mm256_unpackhi_epi16(v_d0_w, v_r0_w);
+ const __m256i v_m0_w = _mm256_cvtepu8_epi16(v_m01_b);
+
+ const __m256i v_m0l_w = _mm256_unpacklo_epi16(v_m0_w, v_mask_max_w);
+ const __m256i v_m0h_w = _mm256_unpackhi_epi16(v_m0_w, v_mask_max_w);
+
+ const __m256i v_t0l_d = _mm256_madd_epi16(v_rd0l_w, v_m0l_w);
+ const __m256i v_t0h_d = _mm256_madd_epi16(v_rd0h_w, v_m0h_w);
+
+ const __m256i v_t0_w = _mm256_packs_epi32(v_t0l_d, v_t0h_d);
+
+ const __m256i v_sq0_d = _mm256_madd_epi16(v_t0_w, v_t0_w);
+
+ const __m256i v_sum0_q = _mm256_add_epi64(
+ _mm256_and_si256(v_sq0_d, v_zext_q), _mm256_srli_epi64(v_sq0_d, 32));
+
+ v_acc0_q = _mm256_add_epi64(v_acc0_q, v_sum0_q);
+
+ n += 16;
+ } while (n);
+
+ v_acc0_q = _mm256_add_epi64(v_acc0_q, _mm256_srli_si256(v_acc0_q, 8));
+ __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc0_q);
+ __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc0_q, 1);
+ v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
+#if AOM_ARCH_X86_64
+ csse = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0);
+#else
+ xx_storel_64(&csse, v_acc_q_0);
+#endif
+
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * See av1_wedge_sign_from_residuals_c
+ */
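+// In scalar terms this computes acc = sum over i of m[i] * ds[i] and returns
+// whether acc exceeds limit.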
+int8_t av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int64_t acc;
+ __m256i v_acc0_d = _mm256_setzero_si256();
+
+  // The input size is limited to 8192 by the use of 32-bit accumulators and
+  // by m lying in [0, 64]. Overflow could occur at larger sizes, though it
+  // is practically impossible for real video input.
+ assert(N < 8192);
+ assert(N % 64 == 0);
+
+ do {
+ const __m256i v_m01_b = _mm256_lddqu_si256((__m256i *)(m));
+ const __m256i v_m23_b = _mm256_lddqu_si256((__m256i *)(m + 32));
+
+ const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(ds));
+ const __m256i v_d1_w = _mm256_lddqu_si256((__m256i *)(ds + 16));
+ const __m256i v_d2_w = _mm256_lddqu_si256((__m256i *)(ds + 32));
+ const __m256i v_d3_w = _mm256_lddqu_si256((__m256i *)(ds + 48));
+
+ const __m256i v_m0_w =
+ _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m01_b));
+ const __m256i v_m1_w =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m01_b, 1));
+ const __m256i v_m2_w =
+ _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m23_b));
+ const __m256i v_m3_w =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m23_b, 1));
+
+ const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w);
+ const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w);
+ const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w);
+ const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w);
+
+ const __m256i v_p01_d = _mm256_add_epi32(v_p0_d, v_p1_d);
+ const __m256i v_p23_d = _mm256_add_epi32(v_p2_d, v_p3_d);
+
+ const __m256i v_p0123_d = _mm256_add_epi32(v_p01_d, v_p23_d);
+
+ v_acc0_d = _mm256_add_epi32(v_acc0_d, v_p0123_d);
+
+ ds += 64;
+ m += 64;
+
+ N -= 64;
+ } while (N);
+
+ __m256i v_sign_d = _mm256_srai_epi32(v_acc0_d, 31);
+ v_acc0_d = _mm256_add_epi64(_mm256_unpacklo_epi32(v_acc0_d, v_sign_d),
+ _mm256_unpackhi_epi32(v_acc0_d, v_sign_d));
+
+ __m256i v_acc_q = _mm256_add_epi64(v_acc0_d, _mm256_srli_si256(v_acc0_d, 8));
+
+ __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc_q);
+ __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc_q, 1);
+ v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
+
+#if AOM_ARCH_X86_64
+ acc = _mm_extract_epi64(v_acc_q_0, 0);
+#else
+ xx_storel_64(&acc, v_acc_q_0);
+#endif
+
+ return acc > limit;
+}
+
+/**
+ * See av1_wedge_compute_delta_squares_c
+ */
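+// Computes d[i] = a[i] * a[i] - b[i] * b[i] (saturated to int16_t): each madd
+// below multiplies interleaved (a, b) pairs against (a, -b) pairs, folding
+// the difference of squares into a single instruction.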
+void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ const __m256i v_neg_w = _mm256_set1_epi32((int)0xffff0001);
+
+ assert(N % 64 == 0);
+
+ do {
+ const __m256i v_a0_w = _mm256_lddqu_si256((__m256i *)(a));
+ const __m256i v_b0_w = _mm256_lddqu_si256((__m256i *)(b));
+ const __m256i v_a1_w = _mm256_lddqu_si256((__m256i *)(a + 16));
+ const __m256i v_b1_w = _mm256_lddqu_si256((__m256i *)(b + 16));
+ const __m256i v_a2_w = _mm256_lddqu_si256((__m256i *)(a + 32));
+ const __m256i v_b2_w = _mm256_lddqu_si256((__m256i *)(b + 32));
+ const __m256i v_a3_w = _mm256_lddqu_si256((__m256i *)(a + 48));
+ const __m256i v_b3_w = _mm256_lddqu_si256((__m256i *)(b + 48));
+
+ const __m256i v_ab0l_w = _mm256_unpacklo_epi16(v_a0_w, v_b0_w);
+ const __m256i v_ab0h_w = _mm256_unpackhi_epi16(v_a0_w, v_b0_w);
+ const __m256i v_ab1l_w = _mm256_unpacklo_epi16(v_a1_w, v_b1_w);
+ const __m256i v_ab1h_w = _mm256_unpackhi_epi16(v_a1_w, v_b1_w);
+ const __m256i v_ab2l_w = _mm256_unpacklo_epi16(v_a2_w, v_b2_w);
+ const __m256i v_ab2h_w = _mm256_unpackhi_epi16(v_a2_w, v_b2_w);
+ const __m256i v_ab3l_w = _mm256_unpacklo_epi16(v_a3_w, v_b3_w);
+ const __m256i v_ab3h_w = _mm256_unpackhi_epi16(v_a3_w, v_b3_w);
+
+ // Negate top word of pairs
+ const __m256i v_abl0n_w = _mm256_sign_epi16(v_ab0l_w, v_neg_w);
+ const __m256i v_abh0n_w = _mm256_sign_epi16(v_ab0h_w, v_neg_w);
+ const __m256i v_abl1n_w = _mm256_sign_epi16(v_ab1l_w, v_neg_w);
+ const __m256i v_abh1n_w = _mm256_sign_epi16(v_ab1h_w, v_neg_w);
+ const __m256i v_abl2n_w = _mm256_sign_epi16(v_ab2l_w, v_neg_w);
+ const __m256i v_abh2n_w = _mm256_sign_epi16(v_ab2h_w, v_neg_w);
+ const __m256i v_abl3n_w = _mm256_sign_epi16(v_ab3l_w, v_neg_w);
+ const __m256i v_abh3n_w = _mm256_sign_epi16(v_ab3h_w, v_neg_w);
+
+ const __m256i v_r0l_w = _mm256_madd_epi16(v_ab0l_w, v_abl0n_w);
+ const __m256i v_r0h_w = _mm256_madd_epi16(v_ab0h_w, v_abh0n_w);
+ const __m256i v_r1l_w = _mm256_madd_epi16(v_ab1l_w, v_abl1n_w);
+ const __m256i v_r1h_w = _mm256_madd_epi16(v_ab1h_w, v_abh1n_w);
+ const __m256i v_r2l_w = _mm256_madd_epi16(v_ab2l_w, v_abl2n_w);
+ const __m256i v_r2h_w = _mm256_madd_epi16(v_ab2h_w, v_abh2n_w);
+ const __m256i v_r3l_w = _mm256_madd_epi16(v_ab3l_w, v_abl3n_w);
+ const __m256i v_r3h_w = _mm256_madd_epi16(v_ab3h_w, v_abh3n_w);
+
+ const __m256i v_r0_w = _mm256_packs_epi32(v_r0l_w, v_r0h_w);
+ const __m256i v_r1_w = _mm256_packs_epi32(v_r1l_w, v_r1h_w);
+ const __m256i v_r2_w = _mm256_packs_epi32(v_r2l_w, v_r2h_w);
+ const __m256i v_r3_w = _mm256_packs_epi32(v_r3l_w, v_r3h_w);
+
+ _mm256_store_si256((__m256i *)(d), v_r0_w);
+ _mm256_store_si256((__m256i *)(d + 16), v_r1_w);
+ _mm256_store_si256((__m256i *)(d + 32), v_r2_w);
+ _mm256_store_si256((__m256i *)(d + 48), v_r3_w);
+
+ a += 64;
+ b += 64;
+ d += 64;
+ N -= 64;
+ } while (N);
+}
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
new file mode 100644
index 0000000000..d7ac2223f2
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c
+ */
+uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ int n = -N;
+ int n8 = n + 8;
+
+ uint64_t csse;
+
+ const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
+ const __m128i v_zext_q = xx_set1_64_from_32i(~0);
+
+ __m128i v_acc0_q = _mm_setzero_si128();
+
+ assert(N % 64 == 0);
+
+ r1 += N;
+ d += N;
+ m += N;
+
+ do {
+ const __m128i v_r0_w = xx_load_128(r1 + n);
+ const __m128i v_r1_w = xx_load_128(r1 + n8);
+ const __m128i v_d0_w = xx_load_128(d + n);
+ const __m128i v_d1_w = xx_load_128(d + n8);
+ const __m128i v_m01_b = xx_load_128(m + n);
+
+ const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
+ const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
+ const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
+ const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+
+ const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
+ const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
+ const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
+ const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);
+
+ const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
+ const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
+ const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
+ const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);
+
+ const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
+ const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);
+
+ const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
+ const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);
+
+ const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
+ _mm_srli_epi64(v_sq0_d, 32));
+ const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
+ _mm_srli_epi64(v_sq1_d, 32));
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);
+
+ n8 += 16;
+ n += 16;
+ } while (n);
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
+
+#if AOM_ARCH_X86_64
+ csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
+#else
+ xx_storel_64(&csse, v_acc0_q);
+#endif
+
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * See av1_wedge_sign_from_residuals_c
+ */
+int8_t av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int64_t acc;
+
+ __m128i v_sign_d;
+ __m128i v_acc0_d = _mm_setzero_si128();
+ __m128i v_acc1_d = _mm_setzero_si128();
+ __m128i v_acc_q;
+
+  // The input size is limited to 8192 by the use of 32-bit accumulators and
+  // by m lying in [0, 64]. Overflow could occur at larger sizes, though it
+  // is practically impossible for real video input.
+ assert(N < 8192);
+ assert(N % 64 == 0);
+
+ do {
+ const __m128i v_m01_b = xx_load_128(m);
+ const __m128i v_m23_b = xx_load_128(m + 16);
+ const __m128i v_m45_b = xx_load_128(m + 32);
+ const __m128i v_m67_b = xx_load_128(m + 48);
+
+ const __m128i v_d0_w = xx_load_128(ds);
+ const __m128i v_d1_w = xx_load_128(ds + 8);
+ const __m128i v_d2_w = xx_load_128(ds + 16);
+ const __m128i v_d3_w = xx_load_128(ds + 24);
+ const __m128i v_d4_w = xx_load_128(ds + 32);
+ const __m128i v_d5_w = xx_load_128(ds + 40);
+ const __m128i v_d6_w = xx_load_128(ds + 48);
+ const __m128i v_d7_w = xx_load_128(ds + 56);
+
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
+ const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
+ const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
+ const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
+ const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
+ const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());
+
+ const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
+ const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
+ const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
+ const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
+ const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
+ const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
+ const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
+ const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);
+
+ const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
+ const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
+ const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
+ const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);
+
+ const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
+ const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);
+
+ v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
+ v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);
+
+ ds += 64;
+ m += 64;
+
+ N -= 64;
+ } while (N);
+
+ v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
+ v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
+ _mm_unpackhi_epi32(v_acc0_d, v_sign_d));
+
+ v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
+ v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
+ _mm_unpackhi_epi32(v_acc1_d, v_sign_d));
+
+ v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+
+#if AOM_ARCH_X86_64
+ acc = _mm_cvtsi128_si64(v_acc_q);
+#else
+ xx_storel_64(&acc, v_acc_q);
+#endif
+
+ return acc > limit;
+}
+
+// Negate under mask
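+// (v XOR mask) - mask negates lanes where mask is all ones (a two's
+// complement identity) and passes lanes through unchanged where mask is zero.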
+static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
+ return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w);
+}
+
+/**
+ * See av1_wedge_compute_delta_squares_c
+ */
+void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ const __m128i v_neg_w = _mm_set_epi16((short)0xffff, 0, (short)0xffff, 0,
+ (short)0xffff, 0, (short)0xffff, 0);
+
+ assert(N % 64 == 0);
+
+ do {
+ const __m128i v_a0_w = xx_load_128(a);
+ const __m128i v_b0_w = xx_load_128(b);
+ const __m128i v_a1_w = xx_load_128(a + 8);
+ const __m128i v_b1_w = xx_load_128(b + 8);
+ const __m128i v_a2_w = xx_load_128(a + 16);
+ const __m128i v_b2_w = xx_load_128(b + 16);
+ const __m128i v_a3_w = xx_load_128(a + 24);
+ const __m128i v_b3_w = xx_load_128(b + 24);
+
+ const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
+ const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
+ const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
+ const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
+ const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
+ const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
+ const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
+ const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);
+
+ // Negate top word of pairs
+ const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
+ const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
+ const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
+ const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
+ const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
+ const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
+ const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
+ const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);
+
+ const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
+ const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
+ const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
+ const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
+ const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
+ const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
+ const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
+ const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);
+
+ const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
+ const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
+ const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
+ const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);
+
+ xx_store_128(d, v_r0_w);
+ xx_store_128(d + 8, v_r1_w);
+ xx_store_128(d + 16, v_r2_w);
+ xx_store_128(d + 24, v_r3_w);
+
+ a += 32;
+ b += 32;
+ d += 32;
+ N -= 32;
+ } while (N);
+}